kram - update simd

optimize sse2neon to arm64 specific header. Strip unneeded armv7. Switch to faster and more precise ops. No more approximate recip, rsqrt. Faster to call div and sqrt now.
alecazam · Sep 19, 2024 · b995d93 · b995d93
1 parent e83d3fd
commit b995d93
Show file tree

Hide file tree

Showing 5 changed files with 8,007 additions and 149 deletions.
diff --git a/build2/kram.xcodeproj/project.pbxproj b/build2/kram.xcodeproj/project.pbxproj
@@ -7,6 +7,8 @@
 	objects = {
 
 /* Begin PBXBuildFile section */
+		706706482C9B3BB30008F865 /* sse2neon-arm64.h in Headers */ = {isa = PBXBuildFile; fileRef = 706706472C9B3BB30008F865 /* sse2neon-arm64.h */; };
+		706706492C9B3BB30008F865 /* sse2neon-arm64.h in Headers */ = {isa = PBXBuildFile; fileRef = 706706472C9B3BB30008F865 /* sse2neon-arm64.h */; };
 		706EEF7F26D1595D001C950E /* EtcBlock4x4Encoding_RGB8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDAA26D1583E001C950E /* EtcBlock4x4Encoding_RGB8.cpp */; };
 		706EEF8026D1595D001C950E /* EtcImage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDAC26D1583E001C950E /* EtcImage.cpp */; };
 		706EEF8126D1595D001C950E /* EtcDifferentialTrys.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDAF26D1583E001C950E /* EtcDifferentialTrys.cpp */; };
@@ -409,6 +411,7 @@
 /* End PBXBuildFile section */
 
 /* Begin PBXFileReference section */
+		706706472C9B3BB30008F865 /* sse2neon-arm64.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "sse2neon-arm64.h"; sourceTree = "<group>"; };
 		706ECDDE26D1577A001C950E /* libkram.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libkram.a; sourceTree = BUILT_PRODUCTS_DIR; };
 		706EEDAA26D1583E001C950E /* EtcBlock4x4Encoding_RGB8.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = EtcBlock4x4Encoding_RGB8.cpp; sourceTree = "<group>"; };
 		706EEDAB26D1583E001C950E /* EtcErrorMetric.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = EtcErrorMetric.h; sourceTree = "<group>"; };
@@ -1021,6 +1024,7 @@
 				706EEE1C26D1583F001C950E /* KramMipper.cpp */,
 				706EEE2D26D1583F001C950E /* win_mmap.h */,
 				706EEE2226D1583F001C950E /* sse2neon.h */,
+				706706472C9B3BB30008F865 /* sse2neon-arm64.h */,
 				706EEE3426D1583F001C950E /* float4a.h */,
 				706EEE2F26D1583F001C950E /* float4a.cpp */,
 				70D222E32AD22BED00B9EA23 /* BlockedLinearAllocator.h */,
@@ -1387,6 +1391,7 @@
 				70871DC927DDDBCD00D0B9E1 /* astcenc_vecmathlib_common_4.h in Headers */,
 				706EEFD626D15984001C950E /* EtcBlock4x4Encoding_R11.h in Headers */,
 				706EEFD726D15984001C950E /* EtcBlock4x4Encoding_RG11.h in Headers */,
+				706706492C9B3BB30008F865 /* sse2neon-arm64.h in Headers */,
 				706EEFD826D15984001C950E /* EtcMath.h in Headers */,
 				706EEFD926D15984001C950E /* EtcIndividualTrys.h in Headers */,
 				706EEFDA26D15984001C950E /* EtcBlock4x4EncodingBits.h in Headers */,
@@ -1500,6 +1505,7 @@
 				70871DCA27DDDBCD00D0B9E1 /* astcenc_vecmathlib_common_4.h in Headers */,
 				706EF15026D166C5001C950E /* EtcBlock4x4Encoding_R11.h in Headers */,
 				706EF15126D166C5001C950E /* EtcBlock4x4Encoding_RG11.h in Headers */,
+				706706482C9B3BB30008F865 /* sse2neon-arm64.h in Headers */,
 				706EF15226D166C5001C950E /* EtcMath.h in Headers */,
 				706EF15326D166C5001C950E /* EtcIndividualTrys.h in Headers */,
 				706EF15426D166C5001C950E /* EtcBlock4x4EncodingBits.h in Headers */,

diff --git a/libkram/kram/KramConfig.h b/libkram/kram/KramConfig.h
@@ -304,7 +304,7 @@ import std.regex;
 // to keep astcenc compiling
 #include <immintrin.h>  // AVX1
 #elif USE_NEON
-#include "sse2neon.h"
+#include "sse2neon-arm64.h"
 #endif
 
 // TODO: move half4 to it's own file, but always include it

diff --git a/libkram/kram/float4a.h b/libkram/kram/float4a.h
@@ -15,7 +15,7 @@
 
 // this is also defined in KramConfig.h, but this keeps file independent
 #if USE_NEON
-#include "sse2neon.h"
+#include "sse2neon-arm64.h"
 #else
 //#include <smmintrin.h> // SSE4.1, and includes all before it
 #include <immintrin.h>  // AVX
@@ -27,32 +27,33 @@
 namespace simd {
 
 #if USE_NEON
-#define _mm_fixzero_ps(a, b) _mm_and_ps(a, _mm_cmpneq_ps(b, _mm_setzero_ps()))
+//#define _mm_fixzero_ps(a, b) _mm_and_ps(a, _mm_cmpneq_ps(b, _mm_setzero_ps()))
 
+// removing these
 // rqrt (high precision)
-inline float32x4_t _mm_rsqrthp_ps(const float32x4_t& a)
-{
-    float32x4_t est = vrsqrteq_f32(a);
-
-    est = _mm_fixzero_ps(est, a);
-
-    // newton raphson
-    float32x4_t stepA = vrsqrtsq_f32(a, vmulq_f32(est, est));  // xn+1 = xn(3-dxn*dxn)/2
-
-    return _mm_mul_ps(est, stepA);
-}
-
-// recip
-inline float32x4_t _mm_rcphp_ps(const float32x4_t& a)
-{
-    float32x4_t est = vrecpeq_f32(a);
-
-    est = _mm_fixzero_ps(est, a);
-
-    float32x4_t stepA = vrecpsq_f32(est, a);  // xn+1 = xn(2-dxn)
-
-    return _mm_mul_ps(est, stepA);
-}
+//inline float32x4_t _mm_rsqrthp_ps(const float32x4_t& a)
+//{
+//    float32x4_t est = vrsqrteq_f32(a);
+//
+//    est = _mm_fixzero_ps(est, a);
+//
+//    // newton raphson
+//    float32x4_t stepA = vrsqrtsq_f32(a, vmulq_f32(est, est));  // xn+1 = xn(3-dxn*dxn)/2
+//
+//    return _mm_mul_ps(est, stepA);
+//}
+//
+//// recip
+//inline float32x4_t _mm_rcphp_ps(const float32x4_t& a)
+//{
+//    float32x4_t est = vrecpeq_f32(a);
+//
+//    est = _mm_fixzero_ps(est, a);
+//
+//    float32x4_t stepA = vrecpsq_f32(est, a);  // xn+1 = xn(2-dxn)
+//
+//    return _mm_mul_ps(est, stepA);
+//}
 
 #else
 
@@ -61,6 +62,7 @@ inline float32x4_t _mm_rcphp_ps(const float32x4_t& a)
 
 #define _mm_fixzero_ps(a, b) _mm_and_ps(a, _mm_cmpneq_ps(b, _mm_setzero_ps()))
 
+/* eliminating these
 inline float32x4_t _mm_rsqrthp_ps(const float32x4_t& a)
 {
     static const float32x4_t kHalf = _mm_set1_ps(0.5f);
@@ -108,6 +110,7 @@ inline float32x4_t _mm_rcphp_ps(const float32x4_t& a)
 #define _mm_rsqrthp_ss(a) _mm_setx_ps(a, _mm_rsqrthp_ps(a))
 #define _mm_rcphp_ss(a) _mm_setx_ps(a, _mm_rcphp_ps(a))
 
+ */
 #endif
 
 //---------------------------------------------------------------------------------------
@@ -118,28 +121,18 @@ using tSwizzle = uint32_t;
 #define macroSwizzle(x, y, z, w) _MM_SHUFFLE(w, z, y, x)
 
 // replicate a lane into a new vector
+// This can already be done with clang vector types much better.  v.x or v.xxxx
 #define _mm_splatx_ps(v) _mm_shuffle_ps(v, v, macroSwizzle(0, 0, 0, 0))
 #define _mm_splaty_ps(v) _mm_shuffle_ps(v, v, macroSwizzle(1, 1, 1, 1))
 #define _mm_splatz_ps(v) _mm_shuffle_ps(v, v, macroSwizzle(2, 2, 2, 2))
 #define _mm_splatw_ps(v) _mm_shuffle_ps(v, v, macroSwizzle(3, 3, 3, 3))
 
-// dot product app with horizontal adds, without using _mm_hadd_ps()
+// dot product app with horizontal adds
 inline float32x4_t _mm_hadd4_ps(const float32x4_t& r)
 {
-#if 0  // SSE1
-//    // use for hpadd
-//    static const tSwizzle kSwizzleYYZW = macroSwizzle(1, 1, 2, 3);
-//    //static const tSwizzle kSwizzleZYZW = macroSwizzle(2,1,2,3);
-//    static const tSwizzle kSwizzleWZZW = macroSwizzle(3, 2, 2, 3);
-//
-//    float32x4_t t = _mm_add_ps(r, _mm_shuffle_ps(r, r, kSwizzleWZZW));  // xy + wz
-//    t = _mm_add_ss(t, _mm_shuffle_ps(t, t, kSwizzleYYZW));              // x + y
-//    return t;
-#else  // SSE3
     float32x4_t t = _mm_hadd_ps(r, r);  // xy + wz
     t = _mm_hadd_ps(t, t);              // x + y
     return t;
-#endif
 }
 
 static const uint32_t kSignBitsF32x4i = {0x80000000};
@@ -156,7 +149,7 @@ static const float32x4_t kOnesF32x4 = _mm_set1_ps(1.0f);
 
 //---------------------------------------------------------------------------------------
 
-// Note float3 should be it's own type, but it should be float4 in size.
+// Note float3 should be its own type, but it should be float4 in size.
 // float2 is harder since on Neon, it supports a float2 data structure.
 // Needs SSE4.1, but that's most of the processors these days.
 class float4 {
@@ -338,11 +331,11 @@ inline float4 max(const float4& lhs, const float4& rhs)
 // do 4 of these at once
 inline float4 recip(const float4& vv)
 {
-    return float4(_mm_rcphp_ps(vv.reg));
+    return floar4(1.0f/vv.reg); // _mm_rcphp_ps(vv.reg));
 }
 inline float4 rsqrt(const float4& vv)
 {
-    return float4(_mm_rsqrthp_ps(vv.reg));
+    return float4(1.0f/_mm_sqrt_ps(vv.reg)); // _mm_rsqrthp_ps(vv.reg));
 }
 inline float4 sqrt(const float4& vv)
 {