Skip to content

Commit

Permalink
kram - update simd
Browse files Browse the repository at this point in the history
optimize sse2neon to arm64 specific header.  Strip unneeded armv7.  Switch to faster and more precise ops.  No more approximate recip, rsqrt.  Faster to call div and sqrt now.
  • Loading branch information
alecazam committed Sep 19, 2024
1 parent e83d3fd commit b995d93
Show file tree
Hide file tree
Showing 5 changed files with 8,007 additions and 149 deletions.
6 changes: 6 additions & 0 deletions build2/kram.xcodeproj/project.pbxproj
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
objects = {

/* Begin PBXBuildFile section */
706706482C9B3BB30008F865 /* sse2neon-arm64.h in Headers */ = {isa = PBXBuildFile; fileRef = 706706472C9B3BB30008F865 /* sse2neon-arm64.h */; };
706706492C9B3BB30008F865 /* sse2neon-arm64.h in Headers */ = {isa = PBXBuildFile; fileRef = 706706472C9B3BB30008F865 /* sse2neon-arm64.h */; };
706EEF7F26D1595D001C950E /* EtcBlock4x4Encoding_RGB8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDAA26D1583E001C950E /* EtcBlock4x4Encoding_RGB8.cpp */; };
706EEF8026D1595D001C950E /* EtcImage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDAC26D1583E001C950E /* EtcImage.cpp */; };
706EEF8126D1595D001C950E /* EtcDifferentialTrys.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDAF26D1583E001C950E /* EtcDifferentialTrys.cpp */; };
Expand Down Expand Up @@ -409,6 +411,7 @@
/* End PBXBuildFile section */

/* Begin PBXFileReference section */
706706472C9B3BB30008F865 /* sse2neon-arm64.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "sse2neon-arm64.h"; sourceTree = "<group>"; };
706ECDDE26D1577A001C950E /* libkram.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libkram.a; sourceTree = BUILT_PRODUCTS_DIR; };
706EEDAA26D1583E001C950E /* EtcBlock4x4Encoding_RGB8.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = EtcBlock4x4Encoding_RGB8.cpp; sourceTree = "<group>"; };
706EEDAB26D1583E001C950E /* EtcErrorMetric.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = EtcErrorMetric.h; sourceTree = "<group>"; };
Expand Down Expand Up @@ -1021,6 +1024,7 @@
706EEE1C26D1583F001C950E /* KramMipper.cpp */,
706EEE2D26D1583F001C950E /* win_mmap.h */,
706EEE2226D1583F001C950E /* sse2neon.h */,
706706472C9B3BB30008F865 /* sse2neon-arm64.h */,
706EEE3426D1583F001C950E /* float4a.h */,
706EEE2F26D1583F001C950E /* float4a.cpp */,
70D222E32AD22BED00B9EA23 /* BlockedLinearAllocator.h */,
Expand Down Expand Up @@ -1387,6 +1391,7 @@
70871DC927DDDBCD00D0B9E1 /* astcenc_vecmathlib_common_4.h in Headers */,
706EEFD626D15984001C950E /* EtcBlock4x4Encoding_R11.h in Headers */,
706EEFD726D15984001C950E /* EtcBlock4x4Encoding_RG11.h in Headers */,
706706492C9B3BB30008F865 /* sse2neon-arm64.h in Headers */,
706EEFD826D15984001C950E /* EtcMath.h in Headers */,
706EEFD926D15984001C950E /* EtcIndividualTrys.h in Headers */,
706EEFDA26D15984001C950E /* EtcBlock4x4EncodingBits.h in Headers */,
Expand Down Expand Up @@ -1500,6 +1505,7 @@
70871DCA27DDDBCD00D0B9E1 /* astcenc_vecmathlib_common_4.h in Headers */,
706EF15026D166C5001C950E /* EtcBlock4x4Encoding_R11.h in Headers */,
706EF15126D166C5001C950E /* EtcBlock4x4Encoding_RG11.h in Headers */,
706706482C9B3BB30008F865 /* sse2neon-arm64.h in Headers */,
706EF15226D166C5001C950E /* EtcMath.h in Headers */,
706EF15326D166C5001C950E /* EtcIndividualTrys.h in Headers */,
706EF15426D166C5001C950E /* EtcBlock4x4EncodingBits.h in Headers */,
Expand Down
2 changes: 1 addition & 1 deletion libkram/kram/KramConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ import std.regex;
// to keep astcenc compiling
#include <immintrin.h> // AVX1
#elif USE_NEON
#include "sse2neon.h"
#include "sse2neon-arm64.h"
#endif

// TODO: move half4 to it's own file, but always include it
Expand Down
73 changes: 33 additions & 40 deletions libkram/kram/float4a.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

// this is also defined in KramConfig.h, but this keeps file independent
#if USE_NEON
#include "sse2neon.h"
#include "sse2neon-arm64.h"
#else
//#include <smmintrin.h> // SSE4.1, and includes all before it
#include <immintrin.h> // AVX
Expand All @@ -27,32 +27,33 @@
namespace simd {

#if USE_NEON
#define _mm_fixzero_ps(a, b) _mm_and_ps(a, _mm_cmpneq_ps(b, _mm_setzero_ps()))
//#define _mm_fixzero_ps(a, b) _mm_and_ps(a, _mm_cmpneq_ps(b, _mm_setzero_ps()))

// removing these
// rqrt (high precision)
inline float32x4_t _mm_rsqrthp_ps(const float32x4_t& a)
{
float32x4_t est = vrsqrteq_f32(a);

est = _mm_fixzero_ps(est, a);

// newton raphson
float32x4_t stepA = vrsqrtsq_f32(a, vmulq_f32(est, est)); // xn+1 = xn(3-dxn*dxn)/2

return _mm_mul_ps(est, stepA);
}

// recip
inline float32x4_t _mm_rcphp_ps(const float32x4_t& a)
{
float32x4_t est = vrecpeq_f32(a);

est = _mm_fixzero_ps(est, a);

float32x4_t stepA = vrecpsq_f32(est, a); // xn+1 = xn(2-dxn)

return _mm_mul_ps(est, stepA);
}
//inline float32x4_t _mm_rsqrthp_ps(const float32x4_t& a)
//{
// float32x4_t est = vrsqrteq_f32(a);
//
// est = _mm_fixzero_ps(est, a);
//
// // newton raphson
// float32x4_t stepA = vrsqrtsq_f32(a, vmulq_f32(est, est)); // xn+1 = xn(3-dxn*dxn)/2
//
// return _mm_mul_ps(est, stepA);
//}
//
//// recip
//inline float32x4_t _mm_rcphp_ps(const float32x4_t& a)
//{
// float32x4_t est = vrecpeq_f32(a);
//
// est = _mm_fixzero_ps(est, a);
//
// float32x4_t stepA = vrecpsq_f32(est, a); // xn+1 = xn(2-dxn)
//
// return _mm_mul_ps(est, stepA);
//}

#else

Expand All @@ -61,6 +62,7 @@ inline float32x4_t _mm_rcphp_ps(const float32x4_t& a)

#define _mm_fixzero_ps(a, b) _mm_and_ps(a, _mm_cmpneq_ps(b, _mm_setzero_ps()))

/* eliminating these
inline float32x4_t _mm_rsqrthp_ps(const float32x4_t& a)
{
static const float32x4_t kHalf = _mm_set1_ps(0.5f);
Expand Down Expand Up @@ -108,6 +110,7 @@ inline float32x4_t _mm_rcphp_ps(const float32x4_t& a)
#define _mm_rsqrthp_ss(a) _mm_setx_ps(a, _mm_rsqrthp_ps(a))
#define _mm_rcphp_ss(a) _mm_setx_ps(a, _mm_rcphp_ps(a))
*/
#endif

//---------------------------------------------------------------------------------------
Expand All @@ -118,28 +121,18 @@ using tSwizzle = uint32_t;
#define macroSwizzle(x, y, z, w) _MM_SHUFFLE(w, z, y, x)

// replicate a lane into a new vector
// This can already be done with clang vector types much better. v.x or v.xxxx
#define _mm_splatx_ps(v) _mm_shuffle_ps(v, v, macroSwizzle(0, 0, 0, 0))
#define _mm_splaty_ps(v) _mm_shuffle_ps(v, v, macroSwizzle(1, 1, 1, 1))
#define _mm_splatz_ps(v) _mm_shuffle_ps(v, v, macroSwizzle(2, 2, 2, 2))
#define _mm_splatw_ps(v) _mm_shuffle_ps(v, v, macroSwizzle(3, 3, 3, 3))

// dot product app with horizontal adds, without using _mm_hadd_ps()
// dot product app with horizontal adds
inline float32x4_t _mm_hadd4_ps(const float32x4_t& r)
{
#if 0 // SSE1
// // use for hpadd
// static const tSwizzle kSwizzleYYZW = macroSwizzle(1, 1, 2, 3);
// //static const tSwizzle kSwizzleZYZW = macroSwizzle(2,1,2,3);
// static const tSwizzle kSwizzleWZZW = macroSwizzle(3, 2, 2, 3);
//
// float32x4_t t = _mm_add_ps(r, _mm_shuffle_ps(r, r, kSwizzleWZZW)); // xy + wz
// t = _mm_add_ss(t, _mm_shuffle_ps(t, t, kSwizzleYYZW)); // x + y
// return t;
#else // SSE3
float32x4_t t = _mm_hadd_ps(r, r); // xy + wz
t = _mm_hadd_ps(t, t); // x + y
return t;
#endif
}

static const uint32_t kSignBitsF32x4i = {0x80000000};
Expand All @@ -156,7 +149,7 @@ static const float32x4_t kOnesF32x4 = _mm_set1_ps(1.0f);

//---------------------------------------------------------------------------------------

// Note float3 should be it's own type, but it should be float4 in size.
// Note float3 should be its own type, but it should be float4 in size.
// float2 is harder since on Neon, it supports a float2 data structure.
// Needs SSE4.1, but that's most of the processors these days.
class float4 {
Expand Down Expand Up @@ -338,11 +331,11 @@ inline float4 max(const float4& lhs, const float4& rhs)
// do 4 of these at once
inline float4 recip(const float4& vv)
{
return float4(_mm_rcphp_ps(vv.reg));
return floar4(1.0f/vv.reg); // _mm_rcphp_ps(vv.reg));
}
inline float4 rsqrt(const float4& vv)
{
return float4(_mm_rsqrthp_ps(vv.reg));
return float4(1.0f/_mm_sqrt_ps(vv.reg)); // _mm_rsqrthp_ps(vv.reg));
}
inline float4 sqrt(const float4& vv)
{
Expand Down
Loading

0 comments on commit b995d93

Please sign in to comment.