diff --git a/Arith/src/defs.hxx b/Arith/src/defs.hxx index cdecb4aaa..58bb55a6a 100644 --- a/Arith/src/defs.hxx +++ b/Arith/src/defs.hxx @@ -206,6 +206,12 @@ constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T flipsign(const T &x, return copysign(T(1), y) * x; } +// Return 1/x +template +constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T inv(const T &x) { + return T(1) / x; +} + // A max function that returns nan when any argument is nan template constexpr ARITH_INLINE ARITH_DEVICE ARITH_HOST T max1(const T &x, const T &y) { diff --git a/Arith/src/simd.cxx b/Arith/src/simd.cxx index 17dd8e09d..26b1c5448 100644 --- a/Arith/src/simd.cxx +++ b/Arith/src/simd.cxx @@ -3,6 +3,7 @@ #include #include +#include namespace Arith { @@ -25,17 +26,34 @@ std::size_t get_memop_count() { return count; } +template constexpr bool isequal(const simd x, const T a) { + return all(x == a); +} +template constexpr bool isequal(const simdl x, const bool a) { + return all(x == a); +} +template constexpr bool isapprox(const simd x, const T a) { + return all(fabs(x - a) < 1.0e-14); +} + +void check(const bool isgood) { + if (isgood) + return; + CCTK_ERROR("Test failure"); +} + void TestSIMD() { // nvcc V11.1.74 doesn't accept this as "constexpr" values #ifndef __CUDACC__ - typedef simd realv; + using real = CCTK_REAL; + using realv = simd; realv x; realv y = 0; realv z = zero(); - assert(all(y == 0)); - assert(all(y == z)); + check(all(y == 0)); + check(all(y == z)); realv a = 2; realv b = 3; @@ -45,10 +63,131 @@ void TestSIMD() { realv r1 = mulsub(a, b, c); realv r2 = negmuladd(a, b, c); realv r3 = negmulsub(a, b, c); - assert(all(r0 == muladd(2, 3, 4))); - assert(all(r1 == mulsub(2, 3, 4))); - assert(all(r2 == negmuladd(2, 3, 4))); - assert(all(r3 == negmulsub(2, 3, 4))); + check(all(r0 == muladd(2, 3, 4))); + check(all(r1 == mulsub(2, 3, 4))); + check(all(r2 == negmuladd(2, 3, 4))); + check(all(r3 == negmulsub(2, 3, 4))); + + real s = 2; + real t = 3; + real u = 4; + + check(isequal(+a, +s)); + 
check(isequal(-a, -s)); + + check(isequal(a + b, s + t)); + check(isequal(a - b, s - t)); + check(isequal(a * b, s * t)); + check(isequal(a / b, s / t)); + check(isequal(s + b, s + t)); + check(isequal(s - b, s - t)); + check(isequal(s * b, s * t)); + check(isequal(s / b, s / t)); + check(isequal(a + t, s + t)); + check(isequal(a - t, s - t)); + check(isequal(a * t, s * t)); + check(isequal(a / t, s / t)); + + check(isequal(a == b, s == t)); + check(isequal(a != b, s != t)); + check(isequal(a < b, s < t)); + check(isequal(a > b, s > t)); + check(isequal(a <= b, s <= t)); + check(isequal(a >= b, s >= t)); + check(isequal(s == b, s == t)); + check(isequal(s != b, s != t)); + check(isequal(s < b, s < t)); + check(isequal(s > b, s > t)); + check(isequal(s <= b, s <= t)); + check(isequal(s >= b, s >= t)); + check(isequal(a == t, s == t)); + check(isequal(a != t, s != t)); + check(isequal(a < t, s < t)); + check(isequal(a > t, s > t)); + check(isequal(a <= t, s <= t)); + check(isequal(a >= t, s >= t)); + + check(isapprox(abs(a), abs(s))); + check(isapprox(acos(a), acos(s))); + check(isapprox(acosh(a), acosh(s))); + check(allisfinite(a) == allisfinite(s)); + check(anyisnan(a) == anyisnan(s)); + check(isapprox(asin(a), asin(s))); + check(isapprox(asinh(a), asinh(s))); + check(isapprox(atan(a), atan(s))); + check(isapprox(atanh(a), atanh(s))); + check(isapprox(cbrt(a), cbrt(s))); + // check(isapprox(cis(a), cis(s))); + // check(isapprox(cispi(a), cispi(s))); + check(isapprox(copysign(a, b), copysign(s, t))); + check(isapprox(cos(a), cos(s))); + check(isapprox(cosh(a), cosh(s))); + check(isapprox(cospi(a), cos(CCTK_REAL(M_PI) * s))); + check(isapprox(exp(a), exp(s))); + check(isapprox(exp10(a), pow(CCTK_REAL(10), s))); + check(isapprox(exp2(a), exp2(s))); + check(isapprox(fabs(a), fabs(s))); + check(isapprox(flipsign(a, b), flipsign(s, t))); + check(isapprox(fmax(a, b), fmax(s, t))); + check(isapprox(fmax(a, t), fmax(s, t))); + check(isapprox(fmax(s, b), fmax(s, t))); + 
check(isapprox(fmin(a, b), fmin(s, t))); + check(isapprox(fmin(a, t), fmin(s, t))); + check(isapprox(fmin(s, b), fmin(s, t))); + check(isapprox(hypot(a, b), hypot(s, t))); + check(isapprox(inv(a), inv(s))); + check(isequal(isfinite(a), isfinite(s))); + check(isequal(isinf(a), isinf(s))); + check(isequal(isnan(a), isnan(s))); + check(isapprox(log(a), log(s))); + check(isapprox(log10(a), log10(s))); + check(isapprox(log2(a), log2(s))); + check(isapprox(max(a, b), max(s, t))); + check(isapprox(max(a, t), max(s, t))); + check(isapprox(max(s, b), max(s, t))); + check(isapprox(min(a, b), min(s, t))); + check(isapprox(min(a, t), min(s, t))); + check(isapprox(min(s, b), min(s, t))); + check(isapprox(muladd(a, b, c), muladd(s, t, u))); + check(isapprox(muladd(a, b, u), muladd(s, t, u))); + check(isapprox(muladd(a, t, c), muladd(s, t, u))); + check(isapprox(muladd(a, t, u), muladd(s, t, u))); + check(isapprox(muladd(s, b, c), muladd(s, t, u))); + check(isapprox(muladd(s, b, u), muladd(s, t, u))); + check(isapprox(muladd(s, t, c), muladd(s, t, u))); + check(isapprox(mulsub(a, b, c), mulsub(s, t, u))); + check(isapprox(mulsub(a, b, u), mulsub(s, t, u))); + check(isapprox(mulsub(a, t, c), mulsub(s, t, u))); + check(isapprox(mulsub(a, t, u), mulsub(s, t, u))); + check(isapprox(mulsub(s, b, c), mulsub(s, t, u))); + check(isapprox(mulsub(s, b, u), mulsub(s, t, u))); + check(isapprox(mulsub(s, t, c), mulsub(s, t, u))); + check(isapprox(negmuladd(a, b, c), negmuladd(s, t, u))); + check(isapprox(negmuladd(a, b, u), negmuladd(s, t, u))); + check(isapprox(negmuladd(a, t, c), negmuladd(s, t, u))); + check(isapprox(negmuladd(a, t, u), negmuladd(s, t, u))); + check(isapprox(negmuladd(s, b, c), negmuladd(s, t, u))); + check(isapprox(negmuladd(s, b, u), negmuladd(s, t, u))); + check(isapprox(negmuladd(s, t, c), negmuladd(s, t, u))); + check(isapprox(negmulsub(a, b, c), negmulsub(s, t, u))); + check(isapprox(negmulsub(a, b, u), negmulsub(s, t, u))); + check(isapprox(negmulsub(a, t, c), 
negmulsub(s, t, u))); + check(isapprox(negmulsub(a, t, u), negmulsub(s, t, u))); + check(isapprox(negmulsub(s, b, c), negmulsub(s, t, u))); + check(isapprox(negmulsub(s, b, u), negmulsub(s, t, u))); + check(isapprox(negmulsub(s, t, c), negmulsub(s, t, u))); + check(isapprox(pow(a, b), pow(s, t))); + check(isapprox(pow(a, t), pow(s, t))); + check(isapprox(pow(s, b), pow(s, t))); + check(isequal(signbit(a), signbit(s))); + check(isapprox(sin(a), sin(s))); + check(isapprox(sinh(a), sinh(s))); + check(isapprox(sinpi(a), sin(CCTK_REAL(M_PI) * s))); + check(isapprox(sqrt(a), sqrt(s))); + check(isapprox(tan(a), tan(s))); + check(isapprox(tanh(a), tanh(s))); + // check(isapprox(tanpi(a), tanpi(s))); + #endif } diff --git a/Arith/src/simd.hxx b/Arith/src/simd.hxx index 366060dfc..9f946b61e 100644 --- a/Arith/src/simd.hxx +++ b/Arith/src/simd.hxx @@ -7,14 +7,19 @@ // // Disable SIMD when the `NSIMD` library is not available // #ifndef HAVE_CAPABILITY_NSIMD -// #ifndef SIMD_CPU -// #define SIMD_CPU +// #ifndef SIMD_DISABLE +// #define SIMD_DISABLE // #endif // #endif +// Accept `SIMD_CPU` as well for backward compatibility +#ifdef SIMD_CPU +#define SIMD_DISABLE +#endif + #include "defs.hxx" -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE #include #undef vec // This should arguably not be defined in C++ #else @@ -36,7 +41,7 @@ using namespace std; template struct simd; template struct simdl; -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE namespace detail { struct reinterpret32 { typedef i32 int_type; @@ -86,7 +91,7 @@ std::size_t get_memop_count(); template struct simd { using value_type = T; -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE using storage_type = nsimd::pack; #else using storage_type = T; @@ -101,7 +106,7 @@ template struct simd { constexpr ARITH_DEVICE ARITH_HOST simd() {} constexpr ARITH_DEVICE ARITH_HOST simd(const T &a) : elts(a) {} -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE template , T> > * = nullptr> constexpr ARITH_DEVICE ARITH_HOST simd(const nsimd::pack &elts) @@ -122,7 +127,7 
@@ template struct simd { } constexpr ARITH_DEVICE ARITH_HOST std::size_t size() const { -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return sizeof(nsimd::pack) / sizeof(T); #else return 1; @@ -133,7 +138,7 @@ template struct simd { #ifdef CCTK_DEBUG assert(n >= 0 && n < int(storage_size)); #endif -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE T xarr[storage_size]; storeu(xarr, *this); return xarr[n]; @@ -160,7 +165,7 @@ template struct simd { friend constexpr ARITH_DEVICE ARITH_HOST simd operator+(const simd &x) { count_flop(); - return +x.elts; + return x.elts; } friend constexpr ARITH_DEVICE ARITH_HOST simd operator-(const simd &x) { count_flop(); @@ -439,13 +444,23 @@ template struct simd { return abs(x.elts); } - friend constexpr ARITH_DEVICE ARITH_HOST simd andnot(const simd &x, - const simd &y) { -#ifndef SIMD_CPU - count_flop(); - return andnotb(x.elts, y.elts); + friend constexpr ARITH_DEVICE ARITH_HOST simd acos(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return acos_u10(x.elts); #else - return x & ~y; + using std::acos; + return acos(x.elts); +#endif + } + + friend constexpr ARITH_DEVICE ARITH_HOST simd acosh(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return acosh_u10(x.elts); +#else + using std::acosh; + return acosh(x.elts); #endif } @@ -454,14 +469,74 @@ template struct simd { return all(isfinite(x)); } + friend constexpr ARITH_DEVICE ARITH_HOST simd andnot(const simd &x, + const simd &y) { +#ifndef SIMD_DISABLE + count_flop(); + return andnotb(x.elts, y.elts); +#else + return x & ~y; +#endif + } + friend constexpr ARITH_DEVICE ARITH_HOST bool anyisnan(const simd &x) { using std::isnan; return any(isnan(x)); } + friend constexpr ARITH_DEVICE ARITH_HOST simd asin(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return asin_u10(x.elts); +#else + using std::asin; + return asin(x.elts); +#endif + } + + friend constexpr ARITH_DEVICE ARITH_HOST simd asinh(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return 
asinh_u10(x.elts); +#else + using std::asinh; + return asinh(x.elts); +#endif + } + + friend constexpr ARITH_DEVICE ARITH_HOST simd atan(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return atan_u10(x.elts); +#else + using std::atan; + return atan(x.elts); +#endif + } + + friend constexpr ARITH_DEVICE ARITH_HOST simd atanh(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return atanh_u10(x.elts); +#else + using std::atanh; + return atanh(x.elts); +#endif + } + + friend constexpr ARITH_DEVICE ARITH_HOST simd cbrt(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return cbrt_u10(x.elts); +#else + using std::cbrt; + return cbrt(x.elts); +#endif + } + friend constexpr ARITH_DEVICE ARITH_HOST simd copysign(const simd &x, const simd &y) { -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE typedef detail::unsigned_type U; const T signmask = nsimd::scalar_reinterpret(T{}, U(1) << (8 * sizeof(U) - 1)); @@ -473,6 +548,68 @@ template struct simd { #endif } + friend constexpr ARITH_DEVICE ARITH_HOST simd cos(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return cos_u10(x.elts); +#else + using std::cos; + return cos(x.elts); +#endif + } + + friend constexpr ARITH_DEVICE ARITH_HOST simd cosh(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return cosh_u10(x.elts); +#else + using std::cosh; + return cosh(x.elts); +#endif + } + + friend constexpr ARITH_DEVICE ARITH_HOST simd cospi(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return cospi_u05(x.elts); +#else + using std::acos, std::cos; + const T pi = acos(T(-1)); + return cos(pi * x.elts); +#endif + } + + friend constexpr ARITH_DEVICE ARITH_HOST simd exp(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return exp_u10(x.elts); +#else + using std::exp; + return exp(x.elts); +#endif + } + + friend constexpr ARITH_DEVICE ARITH_HOST simd exp10(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return exp10_u10(x.elts); +#else + using std::exp2; + const T log2_10 
= log2(T(10)); + return exp2(log2_10 * x.elts); +#endif + } + + friend constexpr ARITH_DEVICE ARITH_HOST simd exp2(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return exp2_u10(x.elts); +#else + using std::exp2; + return exp2(x.elts); +#endif + } + friend constexpr ARITH_DEVICE ARITH_HOST simd fabs(const simd &x) { count_flop(); using std::abs; @@ -481,7 +618,7 @@ template struct simd { friend constexpr ARITH_DEVICE ARITH_HOST simd flipsign(const simd &x, const simd &y) { -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE typedef detail::unsigned_type U; const T signmask = nsimd::scalar_reinterpret(T{}, U(1) << (8 * sizeof(U) - 1)); @@ -531,6 +668,26 @@ template struct simd { return min(x, simd(b)); } + friend constexpr ARITH_DEVICE ARITH_HOST simd hypot(const simd &x, + const simd &y) { + count_flop(10); +#ifndef SIMD_DISABLE + return hypot_u05(x.elts, y.elts); +#else + using std::hypot; + return hypot(x.elts, y.elts); +#endif + } + + friend constexpr ARITH_DEVICE ARITH_HOST simd inv(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return rec(x.elts); +#else + return inv(x.elts); +#endif + } + friend constexpr ARITH_DEVICE ARITH_HOST simdl isfinite(const simd &x) { // using std::isfinite; // return isfinite(x.elts); @@ -549,6 +706,36 @@ template struct simd { return x != x; } + friend constexpr ARITH_DEVICE ARITH_HOST simd log(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return log_u10(x.elts); +#else + using std::log; + return log(x.elts); +#endif + } + + friend constexpr ARITH_DEVICE ARITH_HOST simd log10(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return log10_u10(x.elts); +#else + using std::log10; + return log10(x.elts); +#endif + } + + friend constexpr ARITH_DEVICE ARITH_HOST simd log2(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return log2_u10(x.elts); +#else + using std::log2; + return log2(x.elts); +#endif + } + friend constexpr ARITH_DEVICE ARITH_HOST simd max(const simd &x, const simd &y) { 
count_flop(); @@ -603,7 +790,7 @@ template struct simd { const simd &y, const simd &z) { count_flop(2); -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return nsimd::fma(x.elts, y.elts, z.elts); #else return muladd(x.elts, y.elts, z.elts); @@ -642,7 +829,7 @@ template struct simd { const simd &y, const simd &z) { count_flop(2); -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return nsimd::fms(x.elts, y.elts, z.elts); #else return mulsub(x.elts, y.elts, z.elts); @@ -681,7 +868,7 @@ template struct simd { const simd &y, const simd &z) { count_flop(2); -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return nsimd::fnma(x.elts, y.elts, z.elts); #else return negmuladd(x.elts, y.elts, z.elts); @@ -722,7 +909,7 @@ template struct simd { const simd &y, const simd &z) { count_flop(2); -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return nsimd::fnms(x.elts, y.elts, z.elts); #else return negmulsub(x.elts, y.elts, z.elts); @@ -759,8 +946,25 @@ template struct simd { return negmulsub(x, simd(b), simd(c)); } + friend constexpr ARITH_DEVICE ARITH_HOST simd pow(const simd &x, + const simd &y) { + count_flop(10); +#ifndef SIMD_DISABLE + return pow_u10(x.elts, y.elts); +#else + using std::pow; + return pow(x.elts, y.elts); +#endif + } + friend constexpr ARITH_DEVICE ARITH_HOST simd pow(const simd &x, const T &b) { + return pow(x, simd(b)); + } + friend constexpr ARITH_DEVICE ARITH_HOST simd pow(const T &a, const simd &y) { + return pow(simd(a), y); + } + friend constexpr ARITH_DEVICE ARITH_HOST simdl signbit(const simd &x) { -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE typedef detail::unsigned_type U; const T signmask = nsimd::scalar_reinterpret(T{}, U(1) << (8 * sizeof(U) - 1)); @@ -771,19 +975,70 @@ template struct simd { #endif } + friend constexpr ARITH_DEVICE ARITH_HOST simd sin(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return sin_u10(x.elts); +#else + using std::sin; + return sin(x.elts); +#endif + } + + friend constexpr ARITH_DEVICE ARITH_HOST simd sinh(const simd &x) { + count_flop(10); 
+#ifndef SIMD_DISABLE + return sinh_u10(x.elts); +#else + using std::sinh; + return sinh(x.elts); +#endif + } + + friend constexpr ARITH_DEVICE ARITH_HOST simd sinpi(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return sinpi_u05(x.elts); +#else + using std::acos, std::sin; + const T pi = acos(T(-1)); + return sin(pi * x.elts); +#endif + } + friend constexpr ARITH_DEVICE ARITH_HOST simd sqrt(const simd &x) { count_flop(10); using std::sqrt; return sqrt(x.elts); } + friend constexpr ARITH_DEVICE ARITH_HOST simd tan(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return tan_u10(x.elts); +#else + using std::tan; + return tan(x.elts); +#endif + } + + friend constexpr ARITH_DEVICE ARITH_HOST simd tanh(const simd &x) { + count_flop(10); +#ifndef SIMD_DISABLE + return tanh_u10(x.elts); +#else + using std::tanh; + return tanh(x.elts); +#endif + } + friend constexpr ARITH_DEVICE ARITH_HOST simdl to_logical(const simd &x) { return to_logical(x.elts); } friend ARITH_DEVICE ARITH_HOST void storea(T *ptr, const simd &x) { count_memop(); -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE storea(ptr, x.elts); #else *ptr = x.elts; @@ -791,7 +1046,7 @@ template struct simd { } friend ARITH_DEVICE ARITH_HOST void storeu(T *ptr, const simd &x) { count_memop(); -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE storeu(ptr, x.elts); #else *ptr = x.elts; @@ -800,7 +1055,7 @@ template struct simd { friend ARITH_DEVICE ARITH_HOST void mask_storea(const simdl &mask, T *ptr, const simd &x) { count_memop(); -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE mask_storea(mask.elts, ptr, x.elts); #else if (mask.elts) @@ -810,11 +1065,27 @@ template struct simd { friend ARITH_DEVICE ARITH_HOST void mask_storeu(const simdl &mask, T *ptr, const simd &x) { count_memop(); -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE mask_storeu(mask.elts, ptr, x.elts); #else - if (mask.elts) + if (mask.elts) { +#if 0 && defined __CUDACC__ + // __stwb: Cache write-back all coherent levels + // __stcg: Cache at global level (cache 
in L2 and below, not L1) + // __stcs: Cache streaming, likely to be accessed once + // __stwt: Cache write-through (to system memory) + __stwt(ptr, x.elts); +#elif 0 && defined __HIPCC__ + __builtin_nontemporal_store(x.elts, ptr); +#else + // CPU +#if 0 && defined __llvm__ + __builtin_nontemporal_store(x.elts, ptr); +#else *ptr = x.elts; +#endif +#endif + } #endif } @@ -873,7 +1144,7 @@ template struct nan > { template ARITH_DEVICE ARITH_HOST inline simd iota() { -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return nsimd::iota >(); #else return 0; @@ -884,7 +1155,7 @@ template ARITH_DEVICE ARITH_HOST inline simdl mask_for_loop_tail(const int i, const int n) { simd::count_flop(); -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return nsimd::mask_for_loop_tail >(i, n); #else return i < n; @@ -894,7 +1165,7 @@ ARITH_DEVICE ARITH_HOST inline simdl mask_for_loop_tail(const int i, template ARITH_DEVICE ARITH_HOST inline simd loada(const T *ptr) { simd::count_memop(); -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return nsimd::loada >(ptr); #else return *ptr; @@ -904,7 +1175,7 @@ ARITH_DEVICE ARITH_HOST inline simd loada(const T *ptr) { template ARITH_DEVICE ARITH_HOST inline simd loadu(const T *ptr) { simd::count_memop(); -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return nsimd::loadu >(ptr); #else return *ptr; @@ -915,7 +1186,7 @@ template ARITH_DEVICE ARITH_HOST inline simd maskz_loada(const simdl &mask, const T *ptr) { simd::count_memop(); -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return nsimd::maskz_loada(mask.elts, ptr); #else return mask.elts ? *ptr : 0; @@ -926,7 +1197,7 @@ template ARITH_DEVICE ARITH_HOST inline simd maskz_loadu(const simdl &mask, const T *ptr) { simd::count_memop(); -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return nsimd::maskz_loadu(mask.elts, ptr); #else return mask.elts ? 
*ptr : 0; @@ -937,7 +1208,7 @@ template ARITH_DEVICE ARITH_HOST inline simd masko_loada(const simdl &mask, const T *ptr, const simd &other) { simd::count_memop(); -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return masko_loada(mask.elts, ptr, other.elts); #else return mask.elts ? *ptr : other.elts; @@ -948,7 +1219,7 @@ template ARITH_DEVICE ARITH_HOST inline simd masko_loadu(const simdl &mask, const T *ptr, const simd &other) { simd::count_memop(); -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return masko_loadu(mask.elts, ptr, other.elts); #else return mask.elts ? *ptr : other.elts; @@ -960,7 +1231,7 @@ template masko_loada(const simdl &mask, const T *ptr, const U &other) { simd::count_memop(); -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return masko_loada(mask, ptr, simd(other)); #else return mask.elts ? *ptr : other; @@ -972,78 +1243,13 @@ template masko_loadu(const simdl &mask, const T *ptr, const U &other) { simd::count_memop(); -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return masko_loadu(mask, ptr, simd(other)); #else return mask.elts ? 
*ptr : other; #endif } -template -ARITH_DEVICE ARITH_HOST inline simd acos(const simd &x) { - simd::count_memop(10); - alignas(simd) T xarr[simd::storage_size]; - storea(xarr, x); - alignas(simd) T yarr[simd::storage_size]; - using std::acos; - for (std::size_t n = 0; n < x.size(); ++n) - yarr[n] = acos(xarr[n]); - const simd y = loada >(yarr); - return y; -} - -template -ARITH_DEVICE ARITH_HOST inline simd cbrt(const simd &x) { - simd::count_memop(10); - alignas(simd) T xarr[simd::storage_size]; - storea(xarr, x); - alignas(simd) T yarr[simd::storage_size]; - using std::cbrt; - for (std::size_t n = 0; n < x.size(); ++n) - yarr[n] = cbrt(xarr[n]); - const simd y = loada >(yarr); - return y; -} - -template -ARITH_DEVICE ARITH_HOST inline simd cos(const simd &x) { - simd::count_memop(10); - alignas(simd) T xarr[simd::storage_size]; - storea(xarr, x); - alignas(simd) T yarr[simd::storage_size]; - using std::cos; - for (std::size_t n = 0; n < x.size(); ++n) - yarr[n] = cos(xarr[n]); - const simd y = loada >(yarr); - return y; -} - -template -ARITH_DEVICE ARITH_HOST inline simd exp(const simd &x) { - simd::count_memop(10); - alignas(simd) T xarr[simd::storage_size]; - storea(xarr, x); - alignas(simd) T yarr[simd::storage_size]; - using std::exp; - for (std::size_t n = 0; n < x.size(); ++n) - yarr[n] = exp(xarr[n]); - const simd y = loada >(yarr); - return y; -} - -template -ARITH_DEVICE ARITH_HOST inline simd sin(const simd &x) { - simd::count_memop(10); - alignas(simd) T xarr[simd::storage_size]; - storea(xarr, x); - alignas(simd) T yarr[simd::storage_size]; - using std::sin; - for (std::size_t n = 0; n < x.size(); ++n) - yarr[n] = sin(xarr[n]); - const simd y = loada >(yarr); - return y; -} - //////////////////////////////////////////////////////////////////////////////// // A SIMD vector of booleans, usable with `simd`. 
@@ -1059,7 +1265,7 @@ ARITH_DEVICE ARITH_HOST inline simd sin(const simd &x) { template struct simdl { typedef T value_type; -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE using storage_type = nsimd::packl; #else using storage_type = bool; @@ -1074,13 +1280,13 @@ template struct simdl { constexpr ARITH_DEVICE ARITH_HOST simdl() {} constexpr ARITH_DEVICE ARITH_HOST simdl(bool a) : elts(a) {} -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE constexpr ARITH_DEVICE ARITH_HOST simdl(const nsimd::packl &elts) : elts(elts) {} #endif constexpr ARITH_DEVICE ARITH_HOST std::size_t size() const { -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return sizeof(nsimd::packl) / sizeof(T); #else return 1; @@ -1089,7 +1295,7 @@ template struct simdl { constexpr ARITH_DEVICE ARITH_HOST bool operator[](const std::ptrdiff_t n) const { -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE // TOOD: Introduce `to_mask` for simd/simdl return simd(nsimd::to_mask(elts))[n]; #else @@ -1127,7 +1333,11 @@ template struct simdl { } friend ARITH_DEVICE ARITH_HOST simdl operator^(const simdl &x, const simdl &y) { +#ifndef SIMD_DISABLE + return xorl(x.elts, y.elts); +#else return x.elts ^ y.elts; +#endif } friend ARITH_DEVICE ARITH_HOST simdl operator&(const bool a, const simdl &y) { @@ -1172,77 +1382,77 @@ template struct simdl { friend ARITH_DEVICE ARITH_HOST simdl operator==(const simdl &x, const simdl &y) { - return x.elts == y.elts; + return !(x != y); } friend ARITH_DEVICE ARITH_HOST simdl operator!=(const simdl &x, const simdl &y) { - return x.elts != y.elts; + return x ^ y; } friend ARITH_DEVICE ARITH_HOST simdl operator<(const simdl &x, const simdl &y) { - return x.elts < y.elts; + return !x & y; } friend ARITH_DEVICE ARITH_HOST simdl operator>(const simdl &x, const simdl &y) { - return x.elts > y.elts; + return y < x; } friend ARITH_DEVICE ARITH_HOST simdl operator<=(const simdl &x, const simdl &y) { - return x.elts <= y.elts; + return !(x > y); } friend ARITH_DEVICE ARITH_HOST simdl operator>=(const simdl &x, const simdl &y) 
{ - return x.elts >= y.elts; + return !(x < y); } friend ARITH_DEVICE ARITH_HOST simdl operator==(const bool a, const simdl &y) { - return a == y.elts; + return simdl(a) == y; } friend ARITH_DEVICE ARITH_HOST simdl operator!=(const bool a, const simdl &y) { - return a != y.elts; + return simdl(a) != y; } friend ARITH_DEVICE ARITH_HOST simdl operator<(const bool a, const simdl &y) { - return a < y.elts; + return simdl(a) < y; } friend ARITH_DEVICE ARITH_HOST simdl operator>(const bool a, const simdl &y) { - return a > y.elts; + return simdl(a) > y; } friend ARITH_DEVICE ARITH_HOST simdl operator<=(const bool a, const simdl &y) { - return a <= y.elts; + return simdl(a) <= y; } friend ARITH_DEVICE ARITH_HOST simdl operator>=(const bool a, const simdl &y) { - return a >= y.elts; + return simdl(a) >= y; } friend ARITH_DEVICE ARITH_HOST simdl operator==(const simdl &x, const bool b) { - return x.elts == b; + return x == simdl(b); } friend ARITH_DEVICE ARITH_HOST simdl operator!=(const simdl &x, const bool b) { - return x.elts != b; + return x != simdl(b); } friend ARITH_DEVICE ARITH_HOST simdl operator<(const simdl &x, const bool b) { - return x.elts < b; + return x < simdl(b); } friend ARITH_DEVICE ARITH_HOST simdl operator>(const simdl &x, const bool b) { - return x.elts > b; + return x > simdl(b); } friend ARITH_DEVICE ARITH_HOST simdl operator<=(const simdl &x, const bool b) { - return x.elts <= b; + return x <= simdl(b); } friend ARITH_DEVICE ARITH_HOST simdl operator>=(const simdl &x, const bool b) { - return x.elts >= b; + return x >= simdl(b); } friend ARITH_DEVICE ARITH_HOST bool all(const simdl &x) { -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return all(x.elts); #else return x.elts; @@ -1250,7 +1460,7 @@ template struct simdl { } friend ARITH_DEVICE ARITH_HOST simdl andnot(const simdl &x, const simdl &y) { -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return andnotl(x.elts, y.elts); #else return x && !y; @@ -1258,7 +1468,7 @@ template struct simdl { } friend 
ARITH_DEVICE ARITH_HOST bool any(const simdl &x) { -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return any(x.elts); #else return x.elts; @@ -1267,7 +1477,7 @@ template struct simdl { friend ARITH_DEVICE ARITH_HOST simd if_else(const simdl &cond, const simd &x, const simd &y) { -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return if_else1(cond.elts, x.elts, y.elts); #else return cond.elts ? x.elts : y.elts; @@ -1288,7 +1498,7 @@ template struct simdl { friend ARITH_DEVICE ARITH_HOST simdl if_else(const simdl &cond, const simdl &x, const simdl &y) { -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE return if_else1(cond.elts, x.elts, y.elts); #else return cond.elts ? x.elts : y.elts; @@ -1308,14 +1518,14 @@ template struct simdl { } friend ARITH_DEVICE ARITH_HOST void storela(T *ptr, const simdl &x) { -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE storela(ptr, x.elts); #else *ptr = x.elts; #endif } friend ARITH_DEVICE ARITH_HOST void storelu(T *ptr, const simdl &x) { -#ifndef SIMD_CPU +#ifndef SIMD_DISABLE storelu(ptr, x.elts); #else *ptr = x.elts; diff --git a/TestDerivs/src/test.cxx b/TestDerivs/src/test.cxx index 43e855a26..9a84b5bf4 100644 --- a/TestDerivs/src/test.cxx +++ b/TestDerivs/src/test.cxx @@ -50,7 +50,8 @@ extern "C" void TestDerivs_Set(CCTK_ARGUMENTS) { const CCTK_REAL y0 = p.y; const CCTK_REAL z0 = p.z; vreal u0; - poly(kxx, kxy, kyz, Arith::cos(x0), std::sin(y0), std::sin(z0), u0); + using std::sin, std::cos; + poly(kxx, kxy, kyz, cos(x0), sin(y0), sin(z0), u0); chi.store(mask, p.I, u0); }); @@ -133,12 +134,13 @@ extern "C" void TestDerivs_CalcDerivs(CCTK_ARGUMENTS) { #if CCTK_DEBUG grid.loop_int_device<0, 0, 0>( grid.nghostzones, [=] ARITH_DEVICE(const PointDesc &p) ARITH_INLINE { - const auto sinx = std::sin(p.x); - const auto siny = std::sin(p.y); - const auto sinz = std::sin(p.z); - const auto cosx = std::cos(p.x); - const auto cosy = std::cos(p.y); - const auto cosz = std::cos(p.z); + using std::cos, std::sin; + const auto sinx = sin(p.x); + const auto siny = sin(p.y); + 
const auto sinz = sin(p.z); + const auto cosx = cos(p.x); + const auto cosy = cos(p.y); + const auto cosz = cos(p.z); const auto dxxdchi = -2 * kxx * cosx * cosx + 2 * kxx * sinx * sinx - kxy * cosx * siny; const auto dxydchi = -kxy * cosy * sinx; diff --git a/scripts/actions-cuda-real32.cfg b/scripts/actions-cuda-real32.cfg index 46b94a970..f0c895eae 100644 --- a/scripts/actions-cuda-real32.cfg +++ b/scripts/actions-cuda-real32.cfg @@ -15,7 +15,7 @@ FC = gfortran F90 = gfortran LD = nvcc -CPPFLAGS = -DSIMD_CPU +CPPFLAGS = -DSIMD_DISABLE CFLAGS = -pipe -g -std=gnu11 # - We use "--relocatable-device-code=true" to allow building with # debug versions of AMReX diff --git a/scripts/actions-cuda-real64.cfg b/scripts/actions-cuda-real64.cfg index f81beb5dd..8f9f1bba6 100644 --- a/scripts/actions-cuda-real64.cfg +++ b/scripts/actions-cuda-real64.cfg @@ -15,7 +15,7 @@ FC = gfortran F90 = gfortran LD = nvcc -CPPFLAGS = -DSIMD_CPU +CPPFLAGS = -DSIMD_DISABLE CFLAGS = -pipe -g -std=gnu11 # - We use "--relocatable-device-code=true" to allow building with # debug versions of AMReX diff --git a/scripts/actions-oneapi-real64.cfg b/scripts/actions-oneapi-real64.cfg index 5e2d41ccf..01fbebb07 100644 --- a/scripts/actions-oneapi-real64.cfg +++ b/scripts/actions-oneapi-real64.cfg @@ -16,7 +16,7 @@ F90 = gfortran LD = /opt/intel/oneapi/compiler/2024.2/bin/icpx # -g # Debug information uses too much disk space on CI -CPPFLAGS = -DSIMD_DISABLE -DSIMD_CPU +CPPFLAGS = -DSIMD_DISABLE CFLAGS = -fp-model=precise -march=x86-64-v3 -pipe -std=gnu11 CXXFLAGS = -fp-model=precise -fsycl -march=x86-64-v3 -pipe -std=c++17 FPPFLAGS = -traditional diff --git a/scripts/actions-rocm-real64.cfg b/scripts/actions-rocm-real64.cfg index 1fe8a0ecf..f067f13ba 100644 --- a/scripts/actions-rocm-real64.cfg +++ b/scripts/actions-rocm-real64.cfg @@ -16,7 +16,7 @@ FC = gfortran F90 = gfortran LD = /opt/rocm/llvm/bin/clang++ -CPPFLAGS = -DSIMD_CPU +CPPFLAGS = -DSIMD_DISABLE CFLAGS = -pipe -g -std=gnu11 CXXFLAGS = 
-pipe -g -std=c++17 --offload-arch=gfx90a FPPFLAGS = -traditional