From f0590b7a97aa830d631db116a2969c1b0858b34a Mon Sep 17 00:00:00 2001
From: Alec Miller
Date: Fri, 27 Sep 2024 22:24:59 -0700
Subject: [PATCH] kram - simd - quatf

---
 libkram/vectormath/vectormath++.cpp | 216 +++++++++++++++++++++++++++-
 libkram/vectormath/vectormath++.h   |  53 ++++---
 2 files changed, 244 insertions(+), 25 deletions(-)

diff --git a/libkram/vectormath/vectormath++.cpp b/libkram/vectormath/vectormath++.cpp
index 5da136e..156b75b 100644
--- a/libkram/vectormath/vectormath++.cpp
+++ b/libkram/vectormath/vectormath++.cpp
@@ -5,14 +5,53 @@
 
 #if USE_SIMDLIB
 
+// These are large functions that can be buried and optimized in the .cpp.
+// It has an alternate cmath form that it uses for now.  Look into ISPC calls
+// to replace all of this.
+#include "sse_mathfun.h"
+
 // Tests with godbolt are here to show code comparsions with optimizations.
 // -Og can't unroll small loops for some reason.  -O2 and -O3 do.
 // https://godbolt.org/z/KMPa8bchb
+//
+// As optimal as it gets
+// https://godbolt.org/z/YxzobGM17
+//
+// optimized Quake rcp, rsqrt, sqrt
+// https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf
+//
+// TODO: Fabian on fp16 <-> fp32:
+// Not the right gist, you want the RTNE one (nm: that only matters for float->half,
+// this was the half->float one).  FWIW, the other dir is https://gist.github.com/rygorous/eb3a019b99fdaa9c3064.
+// These days I use a variant of the RTNE/RN version that also preserves NaN payload bits,
+// which is slightly more ops but matches hardware conversions exactly for every input, including all NaNs.
+//
+// TODO: bring over fast inverses (RTS, RTU, etc)
+//
+// TODO: saturating conversions would be useful too, and would prevent overflow.
+// See the conversion.h code, bit select to clamp values (rough sketch below).
+//
+// TODO: matrix_types.h has a type_traits with rows, cols, etc.
+// can call get_traits() on them.  See matrix.h for all the ops.
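+//
+// Rough sketch of that saturating convert, kept as a comment since the helper
+// name int4m_sat and the min/max clamp are assumptions, not existing API:
+//
+//   SIMD_CALL int4 int4m_sat(float4 v) {
+//       // clamp to the int32 range before converting so out-of-range floats
+//       // don't wrap; 2147483520.0f is the largest float below INT32_MAX
+//       v = min(max(v, float4m(-2147483648.0f)), float4m(2147483520.0f));
+//       return __builtin_convertvector(v, int4);
+//   }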
+// The storage of affine data in a column matrix is no different than in a
+// row matrix.  Translation is in (r30, r31, r32) or in (c30, c31, c32).
+//
+// r0: r00 r01 r02 r03
+// r1: r10
+// r2: r20
+// r3: r30
+//
+// c0  c1  c2  c3
+// c00 c10 c20 c30
+// c01 c11
+// c02
+// c03
+//
+
+//-----------------
 
-// these are large functions that can be buried and optimized in the .cpp
-#include "sse_mathfun.h"
 
 namespace SIMD_NAMESPACE {
 
@@ -205,8 +244,6 @@ float3x3 inverse(const float3x3& x) {
     return r;
 }
 
-// TODO: bring over fast inverses (RTS, RTU, etc)
-
 float4x4 inverse(const float4x4& x) {
     // This is a full gje inverse
 
@@ -601,6 +638,177 @@ half4 half4m(float4 vv)
 #endif // SIMD_SSE
 #endif // SIMD_HALF4_ONLY
 
+#if SIMD_FLOAT
+
+static quatf quatf_zero(0.0f, 0.0f, 0.0f, 0.0f);
+static quatf quatf_identity(0.0f, 0.0f, 0.0f, 1.0f);
+
+const quatf& quatf::zero() { return quatf_zero; }
+const quatf& quatf::identity() { return quatf_identity; }
+
+// can negate w, or xyz with -kCross
+static const float4 kCross = {1.0f, 1.0f, 1.0f, -1.0f};
+
+// can eliminate 4 shufs by using 4 constants, 2 q swizzles are dupes
+static const float4 kConvertToMatrix = {0, 1, 2, -2};
+
+quatf::quatf(float3 axis, float angleInRadians)
+{
+    float c = ::cosf(angleInRadians * 0.5f);
+    float s = ::sinf(angleInRadians * 0.5f);
+
+    v = float4m(s * axis, c);
+}
+
+quatf inverse(quatf q)
+{
+    return quatf(normalize(q.v * -kCross)); // vec *, but going to get quat mul below
+}
+
+quatf operator*(quatf q1, quatf q2)
+{
+    // This is the original
+    //q1.y * q2.z - q1.z * q2.y + q1.x * q2.w + q1.w * q2.x,
+    //q1.z * q2.x - q1.x * q2.z + q1.y * q2.w + q1.w * q2.y,
+    //q1.x * q2.y - q1.y * q2.x + q1.z * q2.w + q1.w * q2.z,
+    //q1.w * q2.w - q1.x * q2.x - q1.y * q2.y - q1.z * q2.z
+
+    // quaternion multiplication is similar to a four-space cross-product
+    float4 qv1 = q1.v;
+    float4 qv2 = q2.v;
+
+    return quatf
+    (
+        dot((qv1 * qv2.wzyx).xywz, kCross), // Vec4(1,1,-1,1)
+        dot((qv1 * qv2.zwxy).wyzx, kCross), // Vec4(-1,1,1,1)
+        dot((qv1 * qv2.yxwz).xwzy, kCross), // Vec4(1,-1,1,1)
+        dot(-qv1 * qv2, kCross)
+    );
+}
+
+float4x4 float4x4m(quatf qq)
+{
+    /* from Ken Shoemake's GGIV article
+    float xs = x*s, ys = y*s, zs = z*s;
+    float wx = w*xs, wy = w*ys, wz = w*zs;
+    float xx = x*xs, xy = x*ys, xz = x*zs;
+    float yy = y*ys, yz = y*zs;
+    float zz = z*zs;
+
+    // original formulation
+    return Mat44
+    (
+        1.0 - 2.0*(y*y + z*z), 2.0*(x*y - w*z), 2.0*(x*z + w*y), 0.0,
+        2.0*(x*y + w*z), 1.0 - 2.0*(x*x + z*z), 2.0*(y*z - w*x), 0.0,
+        2.0*(x*z - w*y), 2.0*(y*z + w*x), 1.0 - 2.0*(x*x + y*y), 0.0,
+        0.0, 0.0, 0.0, 1.0
+    );
+    */
+
+    float4 q = qq.v;
+
+    // should be able to transpose using inverse (rinv = rt)
+    q *= -kCross;
+
+    // not normalizing like the original does
+    //q = normalize(q);
+    float4 c = kConvertToMatrix;
+
+    float4x4 m;
+    float4 t;
+
+    t = q.wzyw * c.wzwx;
+    m[0] = q.yzxw * t.zxyw - q.zxyw * t.yzxw + c.yxxx;
+
+    t = q.zwxw * c.wwzx;
+    m[1] = q.yzxw * t.zxyw - q.zxyw * t.yzxw + c.xyxx;
+
+    // orthonormal basis, so use cross product for last term
+    m[2] = float4m(cross(m[0].xyz, m[1].xyz), 0.0f); // 2 instr hlsl, 7 ops SSE
+    m[3] = float4_posw();
+
+    return m;
+}
+
+// Should just use nlerp instead of slerp.  Note this lerp does not normalize.
+quatf lerp(quatf q0, quatf q1, float t)
+{
+    float4 v = lerp(q0.v, q1.v, t);
+    return quatf(v);
+}
+
+// Slow, but constant angular velocity.
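+// The closed form computed below is
+//   slerp(q0,q1,t) = (sin((1-t)*w)*q0 + sin(t*w)*q1) / sin(w),  cos(w) = dot(q0,q1)
+// falling back to lerp when the quats are nearly aligned (sin(w) -> 0).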
+quatf slerp(quatf q0, quatf q1, float t)
+{
+    // theta/2
+    float cosThetaHalf = dot(q0.v, q1.v);
+    if (cosThetaHalf >= 0.995f)
+    {
+        return lerp(q0, q1, t);
+    }
+    else
+    {
+        // expensive
+        float thetaHalf = acosf(cosThetaHalf);
+        float sinThetaHalf = sinf(thetaHalf);
+
+        float s0 = sinf(thetaHalf * (1 - t)) / sinThetaHalf; // at t=0, s0 = 1
+        float s1 = sinf(thetaHalf * t) / sinThetaHalf;       // at t=1, s1 = 1
+
+        return quatf(s0 * q0.v + s1 * q1.v);
+    }
+}
+
+// compute control points for a bezier spline segment
+inline void quat_bezier_cp_impl(quatf q0, quatf q1, quatf q2,
+                                quatf& a1, quatf& b1)
+{
+    // TODO: find out if these were quat or vec mul
+    a1.v = 2.0f * dot(q0.v, q1.v) * q1.v - q0.v; // Double(q0, q1);
+
+    // Bisect(a1, q2);
+    a1.v = (a1.v + q2.v);
+    a1.v = normalize(a1.v);
+
+    b1.v = 2.0f * dot(a1.v, q1.v) * q1.v - a1.v; // Double(a1, q1);
+}
+
+// compute control points for a bezier spline segment (quats must be smallest angle)
+void quat_bezier_cp(quatf q0, quatf q1, quatf q2, quatf q3,
+                    quatf& a1, quatf& b2)
+{
+    quatf b1, a2; // b1, a2 are unused calcs
+    quat_bezier_cp_impl(q0, q1, q2, a1, b1);
+    quat_bezier_cp_impl(q1, q2, q3, a2, b2);
+}
+
+// spherical bezier spline interpolation
+// takes q1, a1, b2, q2
+quatf quat_bezier_slerp(quatf a, quatf b, quatf c, quatf d, float t)
+{
+    quatf mid(slerp(b, c, t));
+
+    return slerp(
+        slerp(slerp(a, b, t), mid, t),
+        slerp(mid, slerp(c, d, t), t),
+        t);
+}
+
+quatf quat_bezier_lerp(quatf a, quatf b, quatf c, quatf d, float t)
+{
+    quatf mid(lerp(b, c, t));
+
+    return lerp(
+        lerp(lerp(a, b, t), mid, t),
+        lerp(mid, lerp(c, d, t), t),
+        t);
+}
+
+#endif // SIMD_FLOAT
+
 } // namespace SIMD_NAMESPACE
diff --git a/libkram/vectormath/vectormath++.h b/libkram/vectormath/vectormath++.h
index f8fe724..25096e8 100644
--- a/libkram/vectormath/vectormath++.h
+++ b/libkram/vectormath/vectormath++.h
@@ -263,7 +263,6 @@ typedef ::cname##8s cppname##8; \
 
 //-----------------------------------
 
-// TODO: can do +=, -= faster than calling sub/add, but this uses same impl that way
 #define macroMatrixOps(type) \
 SIMD_CALL_OP type& operator*=(type& x, const type& y) { x = mul(x, y); return x; } \
 SIMD_CALL_OP type& operator+=(type& x, const type& y) { x = add(x, y); return x; } \
@@ -1522,12 +1521,6 @@ SIMD_CALL int4 float4m(float4 __x) { return __builtin_convertvector(__x, int4);
 #endif
 
 #if SIMD_FLOAT && SIMD_HALF
-
-// TODO: ryg
-// Not the right gist, you want the RTNE one (nm: that only matters for float->half,
-// this was the half->float one. FWIW, other dir is https://gist.github.com/rygorous/eb3a019b99fdaa9c3064.
-// These days I use a variant of the RTNE/RN version that also preserves NaN payload bits,
-// which is slightly more ops but matches hardware conversions exactly for every input, including all NaNs.
 
 #if SIMD_HALF4_ONLY
 
@@ -1585,25 +1578,43 @@ struct vecf {
 };
 
-// TODO: quat, need a fast lerp and correct 2 quats
-// and rotate a vec, can convert to/from vector.
-#if USE_FLOAT
-struct quatf : float4 {
+#if SIMD_FLOAT
+//typedef float4s quatfs;
+
+// Only need a float quat.  double/half are pretty worthless.
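+//
+// Rough usage sketch (assumes the ops declared below; M_PI_2 is from <math.h>):
+//   float3 axis = {0.0f, 1.0f, 0.0f};
+//   quatf q(axis, M_PI_2);                       // 90 degrees about +Y
+//   float4x4 m = float4x4m(q);                   // quat -> rotation matrix
+//   quatf h = slerp(quatf::identity(), q, 0.5f); // halfway rotation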
+struct quatf {
+    quatf() : v{0.0f, 0.0f, 0.0f, 1.0f} {}
+    quatf(float x, float y, float z, float w) : v{x, y, z, w} {}
+    quatf(float3 axis, float angleInRadians);
+    explicit quatf(float4 vv) { v = vv; }
 
-};
-#endif
-
-#if USE_DOUBLE
-struct quatd : double4 {
+    static const quatf& zero();
+    static const quatf& identity();
 
+    float4 v;
 };
-#endif
 
-// TODO: saturating conversions would be useful to, and prevent overflow
-// see the conversion.h code, bit select to clamp values.
+// how many quatf ops are needed?
+quatf operator*(quatf q1, quatf q2);
+// need quat * v
+// quatf operator*(quatf q1, float3 q2);
+
+float4x4 float4x4m(quatf q);
+//quatf quatfm(float3 axis, float angleInRadians);
+// need matrix into quatf
+// need shortest-arc correction
+
+void quat_bezier_cp(quatf q0, quatf q1, quatf q2, quatf q3,
+                    quatf& a1, quatf& b2);
+quatf slerp(quatf q0, quatf q1, float t);
+quatf lerp(quatf q0, quatf q1, float t);
 
-// matrix_types.h has a type_traits with rows, cols, etc.
-// can call get_traits() on them.  See matrix.h for all the ops.
+quatf quat_bezier_lerp(quatf a, quatf b, quatf c, quatf d, float t);
+quatf quat_bezier_slerp(quatf a, quatf b, quatf c, quatf d, float t);
+
+quatf inverse(quatf q);
+
+#endif // SIMD_FLOAT
 
 } // namespace SIMD_NAMESPACE