From 85d41487a996b1d0861f6344d112b739947b6fa2 Mon Sep 17 00:00:00 2001
From: Alec Miller
Date: Sun, 29 Sep 2024 18:13:49 -0700
Subject: [PATCH] kram - simd - more work on double234.h, needs long enabled

---
 libkram/vectormath/double234.h    | 161 ++++++++++++++++++------------
 libkram/vectormath/float234.h     |   2 -
 libkram/vectormath/vectormath++.h |  13 ++-
 3 files changed, 103 insertions(+), 73 deletions(-)

diff --git a/libkram/vectormath/double234.h b/libkram/vectormath/double234.h
index fa684ec..af6c7e7 100644
--- a/libkram/vectormath/double234.h
+++ b/libkram/vectormath/double234.h
@@ -35,46 +35,102 @@ namespace SIMD_NAMESPACE {
 
 macroVector8TypesStorageRenames(double, double)
 
+SIMD_CALL double2 double2m(double x) {
+    return x;
+}
+SIMD_CALL double2 double2m(double x, double y) {
+    return {x,y};
+}
+
+SIMD_CALL double3 double3m(double x) {
+    return x;
+}
+SIMD_CALL double3 double3m(double x, double y, double z) {
+    return {x,y,z};
+}
+SIMD_CALL double3 double3m(double2 v, double z) {
+    double3 r; r.xy = v; r.z = z; return r;
+}
+
+SIMD_CALL double4 double4m(double x) {
+    return x;
+}
+SIMD_CALL double4 double4m(double2 xy, double2 zw) {
+    double4 r; r.xy = xy; r.zw = zw; return r;
+}
+SIMD_CALL double4 double4m(double x, double y, double z, double w = 1.0) {
+    return {x,y,z,w};
+}
+SIMD_CALL double4 double4m(double3 v, double w = 1.0) {
+    double4 r; r.xyz = v; r.w = w; return r;
+}
+
 //-----------------------------------
 // start of implementation
 
 #if SIMD_NEON
 
 // TODO: expose double2 ops on Neon.
+// think I have to, so that 4 can call 2x2 with hi/lo
+
+SIMD_CALL double reduce_min(double2 x) {
+    return vminvq_f64(x);
+}
 SIMD_CALL double reduce_min(double4 x) {
-    return vminvq_f32(x); // TODO: fp64
+    return fmin(reduce_min(x.lo), reduce_min(x.hi));
+}
+
+SIMD_CALL double reduce_max(double2 x) {
+    return vmaxvq_f64(x);
 }
 SIMD_CALL double reduce_max(double4 x) {
-    return vmaxvq_f32(x); // TODO: fp64
+    return fmax(reduce_max(x.lo), reduce_max(x.hi));
 }
 
+SIMD_CALL double2 min(double2 x, double2 y) {
+    // precise returns x on Nan
+    return vminnmq_f64(x, y);
+}
 SIMD_CALL double4 min(double4 x, double4 y) {
     // precise returns x on Nan
-    return vminnmq_f32(x, y); // TODO: fp64
+    return double4m(min(x.lo,y.lo), min(x.hi,y.hi));
+}
+
+SIMD_CALL double2 max(double2 x, double2 y) {
+    // precise returns x on Nan
+    return vmaxnmq_f64(x, y);
 }
 SIMD_CALL double4 max(double4 x, double4 y) {
     // precise returns x on Nan
-    return vmaxnmq_f32(x, y); // TODO: fp64
+    return double4m(max(x.lo,y.lo), max(x.hi,y.hi));
 }
 
-SIMD_CALL double4 muladd(double4 x, double4 y, double4 t) {
+SIMD_CALL double2 muladd(double2 x, double2 y, double2 t) {
     // requires __ARM_VFPV4__
     // t passed first unlike sse
-    return vfmaq_f32(t, x,y); // TODO: fp64
+    return vfmaq_f64(t, x,y);
+}
+SIMD_CALL double4 muladd(double4 x, double4 y, double4 t) {
+    return double4m(muladd(x.lo,y.lo,t.lo), muladd(x.hi,y.hi,t.hi));
 }
 
+SIMD_CALL double2 sqrt(double2 x) {
+    return vsqrtq_f64(x);
+}
 SIMD_CALL double4 sqrt(double4 x) {
-    return vsqrtq_f32(x); // TODO: fp64
+    return double4m(sqrt(x.lo), sqrt(x.hi));
 }
 
 // use sse2neon to port this for now
 SIMD_CALL double4 reduce_addv(double4 x) {
     // 4:1 reduction
-    x = _mm_hadd_pd(x, x); // xy = x+y,z+w
-    x = _mm_hadd_pd(x, x); // x = x+y
+    // hadd the two halves, then hadd the pair
+    x.lo = _mm_hadd_pd(x.lo, x.hi); // x+y, z+w
+    x.lo = _mm_hadd_pd(x.lo, x.lo); // x+y+z+w
     return x.x; // repeat x to all values
 }
 
@@ -95,13 +151,13 @@ SIMD_CALL double reduce_max(double4 x) {
     return fmax(fmax(x.x,x.y), fmax(x.z,x.w));
 }
 
-// needs SIMD_INT
+// needs SIMD_LONG
 // needed for precise min/max calls below
-#if SIMD_INT
-SIMD_CALL double4 bitselect_forminmax(double4 x, double4 y, int4 mask) { // TODO: fp64
-    return (double4)(((int4)x & ~mask) | ((int4)y & mask));
+#if SIMD_LONG
+SIMD_CALL double4 bitselect_forminmax(double4 x, double4 y, long4 mask) {
+    return (double4)(((long4)x & ~mask) | ((long4)y & mask));
 }
-#endif
+#endif // SIMD_LONG
 
 SIMD_CALL double4 min(double4 x, double4 y) {
     // precise returns x on Nan
@@ -129,8 +185,9 @@ SIMD_CALL double4 sqrt(double4 x) {
 
 SIMD_CALL double4 reduce_addv(double4 x) {
     // 4:1 reduction
-    x = _mm_hadd_pd(x, x); // xy = x+y,z+w
-    x = _mm_hadd_pd(x, x); // x = x+y
+    // hadd the two halves, then hadd the pair
+    x.lo = _mm_hadd_pd(x.lo, x.hi); // x+y, z+w
+    x.lo = _mm_hadd_pd(x.lo, x.lo); // x+y+z+w
     return x.x; // repeat x to all values
 }
 
@@ -138,47 +195,49 @@ SIMD_CALL double reduce_add(double4 x) {
     return reduce_addv(x).x;
 }
 
-#endif // SIMD_INT && SIMD_SSE
+#endif // SIMD_LONG && SIMD_SSE
 
 // SSE4.1
 
+// single ops in AVX/2
+
 SIMD_CALL double4 round(double4 vv) {
-    return _mm_round_pd(vv, 0x8); // round to nearest | exc
+    return double4m(_mm_round_pd(vv.lo, 0x8), _mm_round_pd(vv.hi, 0x8)); // round to nearest | no exc
 }
 SIMD_CALL double4 ceil(double4 vv) {
-    return _mm_ceil_pd(vv);
+    return double4m(_mm_ceil_pd(vv.lo), _mm_ceil_pd(vv.hi));
 }
 SIMD_CALL double4 floor(double4 vv) {
-    return _mm_floor_pd(vv);
+    return double4m(_mm_floor_pd(vv.lo), _mm_floor_pd(vv.hi));
 }
 
 // end of implementation
 //-----------------------------------
 
-#if SIMD_INT
+#if SIMD_LONG
 
 // bitselect
-SIMD_CALL double2 bitselect(double2 x, double2 y, int2 mask) { // TODO: fp64
-    return (double2)bitselect((int2)x, (int2)y, mask);
+SIMD_CALL double2 bitselect(double2 x, double2 y, long2 mask) {
+    return (double2)bitselect((long2)x, (long2)y, mask);
 }
-SIMD_CALL double3 bitselect(double3 x, double3 y, int3 mask) {
-    return (double3)bitselect((int3)x, (int3)y, mask);
+SIMD_CALL double3 bitselect(double3 x, double3 y, long3 mask) {
+    return (double3)bitselect((long3)x, (long3)y, mask);
 }
-SIMD_CALL double4 bitselect(double4 x, double4 y, int4 mask) {
-    return (double4)bitselect((int4)x, (int4)y, mask);
+SIMD_CALL double4 bitselect(double4 x, double4 y, long4 mask) {
+    return (double4)bitselect((long4)x, (long4)y, mask);
 }
 
 // select
-SIMD_CALL double2 select(double2 x, double2 y, int2 mask) { // TODO: fp64
-    return bitselect(x, y, mask >> 31);
+SIMD_CALL double2 select(double2 x, double2 y, long2 mask) {
+    return bitselect(x, y, mask >> 63);
 }
-SIMD_CALL double3 select(double3 x, double3 y, int3 mask) {
-    return bitselect(x, y, mask >> 31);
+SIMD_CALL double3 select(double3 x, double3 y, long3 mask) {
+    return bitselect(x, y, mask >> 63);
 }
-SIMD_CALL double4 select(double4 x, double4 y, int4 mask) {
-    return bitselect(x, y, mask >> 31);
+SIMD_CALL double4 select(double4 x, double4 y, long4 mask) {
+    return bitselect(x, y, mask >> 63);
 }
-#endif // SIMD_INT
+#endif // SIMD_LONG
 
 // zeroext - internal helper
 SIMD_CALL double4 zeroext(double2 x) {
@@ -189,10 +248,10 @@ SIMD_CALL double4 zeroext(double3 x) {
 }
 
 // any
-SIMD_CALL bool any(int3 x) {
+SIMD_CALL bool any(long3 x) {
     return any(vec3to4(x));
 }
-SIMD_CALL bool all(int3 x) {
+SIMD_CALL bool all(long3 x) {
     return all(vec3to4(x));
 }
 
@@ -449,46 +508,16 @@ SIMD_CALL double4 smoothstep(double4 edge0, double4 edge1, double4 x) {
 SIMD_CALL double2 fract(double2 x) {
     return min(x - floor(x), 0x1.fffffep-1f); // TODO: fp64
 }
-
 SIMD_CALL double3 fract(double3 x) {
     return min(x - floor(x), 0x1.fffffep-1f);
 }
-
 SIMD_CALL double4 fract(double4 x) {
     return min(x - floor(x), 0x1.fffffep-1f);
 }
 
 //-------------------
 
-SIMD_CALL double2 double2m(double x) {
-    return x;
-}
-SIMD_CALL double2 double2m(double x, double y) {
-    return {x,y};
-}
-SIMD_CALL double3 double3m(double x) {
-    return x;
-}
-SIMD_CALL double3 double3m(double x, double y, double z) {
-    return {x,y,z};
-}
-SIMD_CALL double3 double3m(double2 v, double z) {
-    double3 r; r.xy = v; r.z = z; return r;
-}
-
-SIMD_CALL double4 double4m(double x) {
-    return x;
-}
-SIMD_CALL double4 double4m(double2 xy, double2 zw) {
-    double4 r; r.xy = xy; r.zw = zw; return r;
-}
-SIMD_CALL double4 double4m(double x, double y, double z, double w = 1.0) {
-    return {x,y,z,w};
-}
-SIMD_CALL double4 double4m(double3 v, double w = 1.0) {
-    double4 r; r.xyz = v; r.w = w; return r;
-}
 
 // power series
 macroVectorRepeatFnDecl(double, log)
diff --git a/libkram/vectormath/float234.h b/libkram/vectormath/float234.h
index 7568202..a70ba6a 100644
--- a/libkram/vectormath/float234.h
+++ b/libkram/vectormath/float234.h
@@ -447,11 +447,9 @@ SIMD_CALL float4 smoothstep(float4 edge0, float4 edge1, float4 x) {
 SIMD_CALL float2 fract(float2 x) {
     return min(x - floor(x), 0x1.fffffep-1f);
 }
-
 SIMD_CALL float3 fract(float3 x) {
     return min(x - floor(x), 0x1.fffffep-1f);
 }
-
 SIMD_CALL float4 fract(float4 x) {
     return min(x - floor(x), 0x1.fffffep-1f);
 }
diff --git a/libkram/vectormath/vectormath++.h b/libkram/vectormath/vectormath++.h
index 29d0dbd..b410c28 100644
--- a/libkram/vectormath/vectormath++.h
+++ b/libkram/vectormath/vectormath++.h
@@ -136,18 +136,21 @@
 // a define to override setings from prefix file
 #ifndef SIMD_CONFIG
 
+#define SIMD_INT 1
+#define SIMD_LONG 1
+
 // Vector and matrix types. Currently only matrix types for SIMD_FLOAT, SIMD_DOUBLE.
 // SIMD_INT must be kept on for conditional tests.
-#define SIMD_HALF 1
-#define SIMD_FLOAT 1
-#define SIMD_DOUBLE 0
+// SIMD_HALF for bitselect would need SIMD_SHORT or SIMD_INT?
+#define SIMD_HALF (1)
+#define SIMD_FLOAT (1 && SIMD_INT)
+#define SIMD_DOUBLE (0 && SIMD_LONG)
+
 
-#define SIMD_INT 1
 #define SIMD_CHAR 0
 //#define SIMD_UCHAR 0
 #define SIMD_SHORT 0
 //#define SIMD_USHORT 0
-#define SIMD_LONG 0
 //#define SIMD_ULONG 0
 
 // Whether to support > 4 length vecs with some ops
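
Notes (reviewer sketches, not part of the applied diff):

1) The 4-wide Neon ops above are assembled from 2-wide ops through clang
ext_vector_type's .lo/.hi swizzles. A minimal sketch of that pattern,
with local double2v/double4v typedefs standing in for the library's
generated double2/double4 types:

    #include <arm_neon.h>

    // local stand-ins for the library's double2/double4
    typedef double double2v __attribute__((ext_vector_type(2)));
    typedef double double4v __attribute__((ext_vector_type(4)));

    // 2-wide op: one AArch64 instruction. vminnmq_f64 is IEEE minNum,
    // returning the numeric operand when the other lane is NaN.
    inline double2v min2(double2v x, double2v y) {
        // explicit bitcasts avoid relying on -flax-vector-conversions
        return (double2v)vminnmq_f64((float64x2_t)x, (float64x2_t)y);
    }

    // 4-wide op: split into .lo/.hi halves, run the 2-wide op twice, rejoin
    inline double4v min4(double4v x, double4v y) {
        double4v r;
        r.lo = min2(x.lo, y.lo); // lanes 0,1
        r.hi = min2(x.hi, y.hi); // lanes 2,3
        return r;
    }

There is no 4-wide double min on AArch64 Neon, so min4 should compile to
roughly two fminnm (.2d) instructions; the split is unavoidable.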
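2) reduce_addv collapses four lanes with two horizontal adds:
_mm_hadd_pd(a, b) returns { a.x+a.y, b.x+b.y }. A standalone SSE3 sketch
of the same 4:1 reduction over explicit halves (reduce_add4 is an
illustrative name, not a library function):

    #include <immintrin.h>

    // 4:1 add reduction: lanes {x,y} in lo, {z,w} in hi
    inline double reduce_add4(__m128d lo, __m128d hi) {
        __m128d r = _mm_hadd_pd(lo, hi); // { x+y, z+w }
        r = _mm_hadd_pd(r, r);           // { x+y+z+w, x+y+z+w }
        return _mm_cvtsd_f64(r);         // extract lane 0
    }

On arm64 this routes through sse2neon for now, per the comment in the
patch; the native pairwise add underneath _mm_hadd_pd is vpaddq_f64.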
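3) select() shifts the mask right by 63 because comparisons on long lanes
yield 0 or -1 (all ones); the arithmetic shift smears the sign bit so any
negative lane becomes a full bitselect mask. A scalar model of one lane
(select_lane is illustrative, not from the library):

    #include <cstdint>
    #include <cstring>

    inline double select_lane(double x, double y, int64_t mask) {
        // smear the sign bit: 0 keeps x, all-ones takes y
        // (>> on negative values is arithmetic on mainstream compilers,
        // and guaranteed in C++20)
        uint64_t m = (uint64_t)(mask >> 63);
        uint64_t xb, yb;
        std::memcpy(&xb, &x, sizeof xb);
        std::memcpy(&yb, &y, sizeof yb);
        uint64_t rb = (xb & ~m) | (yb & m); // bitselect
        double r;
        std::memcpy(&r, &rb, sizeof r);
        return r;
    }

This is why the masks move from int to long throughout: double lanes are
64-bit, so a 32-bit mask lane cannot cover them, and the sign-smearing
shift count changes from 31 to 63.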