From 8292b1770fe6df5675b3cf183ca339c5371e946e Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 11 Oct 2024 02:23:18 -0700 Subject: [PATCH 01/29] Initial support for register tiling for staggered dslash kernel --- CMakeLists.txt | 3 + include/dslash.h | 12 ++- include/dslash_helper.cuh | 8 +- include/kernels/dslash_staggered.cuh | 111 ++++++++++++++++++--------- include/quda_define.h.in | 6 ++ 5 files changed, 99 insertions(+), 41 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 803f5dba41..b0cc69dacd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,6 +216,8 @@ if(QUDA_MAX_MULTI_BLAS_N GREATER 32) message(SEND_ERROR "Maximum QUDA_MAX_MULTI_BLAS_N is 32.") endif() +set(QUDA_MAX_MULTI_RHS_TILE "1" CACHE STRING "maximum tile size for MRHS kernels") + set(QUDA_PRECISION "14" CACHE STRING "which precisions to instantiate in QUDA (4-bit number - double, single, half, quarter)") @@ -275,6 +277,7 @@ mark_as_advanced(QUDA_ALTERNATIVE_I_TO_F) mark_as_advanced(QUDA_MAX_MULTI_BLAS_N) mark_as_advanced(QUDA_MAX_MULTI_RHS) +mark_as_advanced(QUDA_MAX_MULTI_RHS_TILE) mark_as_advanced(QUDA_PRECISION) mark_as_advanced(QUDA_RECONSTRUCT) mark_as_advanced(QUDA_CLOVER_CHOLESKY_PROMOTE) diff --git a/include/dslash.h b/include/dslash.h index 4b1aada52e..80638b494e 100644 --- a/include/dslash.h +++ b/include/dslash.h @@ -67,6 +67,10 @@ namespace quda if (arg.xpay) strcat(aux_base, ",xpay"); if (arg.dagger) strcat(aux_base, ",dagger"); setRHSstring(aux_base, in.size()); + strcat(aux_base, ",n_rhs_tile="); + char tile_str[16]; + i32toa(tile_str, Arg::n_src_tile); + strcat(aux_base, tile_str); } /** @@ -329,7 +333,13 @@ namespace quda Dslash(Arg &arg, cvector_ref &out, cvector_ref &in, const ColorSpinorField &halo, const std::string &app_base = "") : - TunableKernel3D(in[0], halo.X(4), arg.nParity), arg(arg), out(out), in(in), halo(halo), nDimComms(4), dslashParam(arg) + TunableKernel3D(in[0], (halo.X(4) + Arg::n_src_tile - 1) / Arg::n_src_tile, arg.nParity), + arg(arg), + out(out), + in(in), + halo(halo), + nDimComms(4), + dslashParam(arg) { if (checkLocation(out, in) == QUDA_CPU_FIELD_LOCATION) errorQuda("CPU Fields not supported in Dslash framework yet"); diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh index df176549fc..c39fd41f3f 100644 --- a/include/dslash_helper.cuh +++ b/include/dslash_helper.cuh @@ -241,11 +241,12 @@ namespace quda return true; } - template struct DslashArg { + template struct DslashArg { using Float = Float_; using real = typename mapper::type; static constexpr int nDim = nDim_; + static constexpr int n_src_tile = n_src_tile_; // how many RHS per thread const int parity; // only use this for single parity fields const int nParity; // number of parities we're working on @@ -650,8 +651,9 @@ namespace quda Arg arg; dslash_functor_arg(const Arg &arg, unsigned int threads_x) : - kernel_param(dim3(threads_x, arg.dc.Ls, arg.nParity)), - arg(arg) { } + kernel_param(dim3(threads_x, (arg.dc.Ls + Arg::n_src_tile - 1) / Arg::n_src_tile, arg.nParity)), arg(arg) + { + } }; /** diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh index 7eafa4a687..cfeaa553e7 100644 --- a/include/kernels/dslash_staggered.cuh +++ b/include/kernels/dslash_staggered.cuh @@ -14,9 +14,9 @@ namespace quda /** @brief Parameter structure for driving the Staggered Dslash operator */ - template - struct StaggeredArg : DslashArg { + template + struct StaggeredArg : DslashArg { typedef typename mapper::type real; static constexpr int nColor = nColor_; static 
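// staggered fermions carry a single spin degree of freedom per site, hence the unit nSpin that follows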
constexpr int nSpin = 1; @@ -57,8 +57,9 @@ namespace quda StaggeredArg(cvector_ref &out, cvector_ref &in, const ColorSpinorField &halo, const GaugeField &U, const GaugeField &L, double a, cvector_ref &x, int parity, bool dagger, const int *comm_override) : - DslashArg(out, in, halo, U, x, parity, dagger, a == 0.0 ? false : true, improved_ ? 3 : 1, - spin_project, comm_override), + DslashArg(out, in, halo, U, x, parity, dagger, a == 0.0 ? false : true, + improved_ ? 3 : 1, spin_project, comm_override), + n_src(out.size()), halo_pack(halo, improved_ ? 3 : 1), halo(halo, improved_ ? 3 : 1), U(U), @@ -87,9 +88,9 @@ namespace quda @param[in] parity The site parity @param[in] x_cb The checkerboarded site index */ - template - __device__ __host__ inline void applyStaggered(Vector &out, const Arg &arg, Coord &coord, int parity, int, - int thread_dim, bool &active, int src_idx) + template + __device__ __host__ inline void applyStaggered(array &out, const Arg &arg, Coord &coord, + int parity, int, int thread_dim, bool &active, int src_idx) { typedef typename mapper::type real; typedef Matrix, Arg::nColor> Link; @@ -104,13 +105,18 @@ namespace quda if (doHalo(d) && ghost) { const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dim, d, 1); const Link U = arg.improved ? arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)); - Vector in = arg.halo.Ghost(d, 1, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(U, in, out); + for (auto s = 0; s < n_src_tile; s++) { + Vector in + = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(U, in, out[s]); + } } else if (doBulk() && !ghost) { const int fwd_idx = linkIndexP1(coord, arg.dim, d); const Link U = arg.improved ? arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)); - Vector in = arg.in[src_idx](fwd_idx, their_spinor_parity); - out = mv_add(U, in, out); + for (auto s = 0; s < n_src_tile; s++) { + Vector in = arg.in[src_idx + s](fwd_idx, their_spinor_parity); + out[s] = mv_add(U, in, out[s]); + } } } @@ -120,14 +126,18 @@ namespace quda if (doHalo(d) && ghost) { const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dim, d, arg.nFace); const Link L = arg.L(d, coord.x_cb, parity); - const Vector in - = arg.halo.Ghost(d, 1, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(L, in, out); + for (auto s = 0; s < n_src_tile; s++) { + const Vector in + = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(L, in, out[s]); + } } else if (doBulk() && !ghost) { const int fwd3_idx = linkIndexP3(coord, arg.dim, d); const Link L = arg.L(d, coord.x_cb, parity); - const Vector in = arg.in[src_idx](fwd3_idx, their_spinor_parity); - out = mv_add(L, in, out); + for (auto s = 0; s < n_src_tile; s++) { + const Vector in = arg.in[src_idx + s](fwd3_idx, their_spinor_parity); + out[s] = mv_add(L, in, out[s]); + } } } @@ -140,15 +150,20 @@ namespace quda const int ghost_idx = arg.improved ? ghostFaceIndexStaggered<0>(coord, arg.dim, d, 3) : ghost_idx2; const Link U = arg.improved ? 
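// the improved (fat-link) gauge field has its staggered phases pre-applied, so only the naive path multiplies in StaggeredPhase when loading the link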
arg.U.Ghost(d, ghost_idx2, 1 - parity) : arg.U.Ghost(d, ghost_idx2, 1 - parity, StaggeredPhase(coord, d, -1, arg)); - Vector in = arg.halo.Ghost(d, 0, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(conj(U), -in, out); + for (auto s = 0; s < n_src_tile; s++) { + Vector in + = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(conj(U), -in, out[s]); + } } else if (doBulk() && !ghost) { const int back_idx = linkIndexM1(coord, arg.dim, d); const int gauge_idx = back_idx; const Link U = arg.improved ? arg.U(d, gauge_idx, 1 - parity) : arg.U(d, gauge_idx, 1 - parity, StaggeredPhase(coord, d, -1, arg)); - Vector in = arg.in[src_idx](back_idx, their_spinor_parity); - out = mv_add(conj(U), -in, out); + for (auto s = 0; s < n_src_tile; s++) { + Vector in = arg.in[src_idx + s](back_idx, their_spinor_parity); + out[s] = mv_add(conj(U), -in, out[s]); + } } } @@ -159,15 +174,19 @@ namespace quda // when updating replace arg.nFace with 1 here const int ghost_idx = ghostFaceIndexStaggered<0>(coord, arg.dim, d, 1); const Link L = arg.L.Ghost(d, ghost_idx, 1 - parity); - const Vector in - = arg.halo.Ghost(d, 0, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(conj(L), -in, out); + for (auto s = 0; s < n_src_tile; s++) { + const Vector in + = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(conj(L), -in, out[s]); + } } else if (doBulk() && !ghost) { const int back3_idx = linkIndexM3(coord, arg.dim, d); const int gauge_idx = back3_idx; const Link L = arg.L(d, gauge_idx, 1 - parity); - const Vector in = arg.in[src_idx](back3_idx, their_spinor_parity); - out = mv_add(conj(L), -in, out); + for (auto s = 0; s < n_src_tile; s++) { + const Vector in = arg.in[src_idx + s](back3_idx, their_spinor_parity); + out[s] = mv_add(conj(L), -in, out[s]); + } } } } // nDim @@ -181,8 +200,8 @@ namespace quda constexpr staggered(const Arg &arg) : arg(arg) {} static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation - template - __device__ __host__ __forceinline__ void operator()(int idx, int src_idx, int parity) + template + __device__ __host__ __forceinline__ void apply(int idx, int src_idx, int parity) { using real = typename mapper::type; using Vector = ColorSpinor; @@ -195,20 +214,38 @@ namespace quda const int my_spinor_parity = nParity == 2 ? parity : 0; - Vector out; + array out; + applyStaggered(out, arg, coord, parity, idx, thread_dim, active, src_idx); - applyStaggered(out, arg, coord, parity, idx, thread_dim, active, src_idx); - - out *= arg.dagger_scale; + for (auto s = 0; s < n_src_tile; s++) out[s] *= arg.dagger_scale; if (xpay && mykernel_type == INTERIOR_KERNEL) { - Vector x = arg.x[src_idx](coord.x_cb, my_spinor_parity); - out = arg.a * x - out; + array x; + for (auto s = 0; s < n_src_tile; s++) { + x[s] = arg.x[src_idx + s](coord.x_cb, my_spinor_parity); + out[s] = arg.a * x[s] - out[s]; + } } else if (mykernel_type != INTERIOR_KERNEL) { - Vector x = arg.out[src_idx](coord.x_cb, my_spinor_parity); - out = x + (xpay ? -out : out); + array x; + for (auto s = 0; s < n_src_tile; s++) { + x[s] = arg.x[src_idx + s](coord.x_cb, my_spinor_parity); + out[s] = x[s] + (xpay ? 
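+        // xpay sign convention: the accumulated hopping term enters negated, matching out = a*x - D*in in the interior branch above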
-out[s] : out[s]); + } + } + if (mykernel_type != EXTERIOR_KERNEL_ALL || active) { + for (auto s = 0; s < n_src_tile; s++) { arg.out[src_idx + s](coord.x_cb, my_spinor_parity) = out[s]; } + } + } + + template + __device__ __host__ __forceinline__ void operator()(int idx, int src_idx_block, int parity) + { + int src_idx = src_idx_block * Arg::n_src_tile; + if (src_idx + n_src_tile <= arg.n_src) { + apply(idx, src_idx, parity); + } else if constexpr (n_src_tile - 1 > 0) { + operator()(idx, src_idx_block, parity); } - if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](coord.x_cb, my_spinor_parity) = out; } }; diff --git a/include/quda_define.h.in b/include/quda_define.h.in index ea6657120f..90a6b24d1a 100644 --- a/include/quda_define.h.in +++ b/include/quda_define.h.in @@ -25,6 +25,12 @@ */ #define MAX_MULTI_RHS @QUDA_MAX_MULTI_RHS@ +/** + * @def MAX_MULTI_RHS_TILE + * @brief This macro sets the register tile size for MRHS kernels + */ +#define MAX_MULTI_RHS_TILE @QUDA_MAX_MULTI_RHS_TILE@ + #cmakedefine QUDA_HETEROGENEOUS_ATOMIC #ifdef QUDA_HETEROGENEOUS_ATOMIC /** From e816fb1cb2c6e1b9fa7228fdd0aecde51614a2a3 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 11 Oct 2024 02:47:12 -0700 Subject: [PATCH 02/29] Sanity check for QUDA_MAX_MULTI_RHS_TILE --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index b0cc69dacd..20cc44a36b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -217,6 +217,9 @@ if(QUDA_MAX_MULTI_BLAS_N GREATER 32) endif() set(QUDA_MAX_MULTI_RHS_TILE "1" CACHE STRING "maximum tile size for MRHS kernels") +if(QUDA_MAX_MULTI_RHS_TILE GREATER QUDA_MAX_MULTI_RHS) + message(SEND_ERROR "QUDA_MAX_MULTI_RHS_TILE is greater than QUDA_MAX_MULTI_RHS") +endif() set(QUDA_PRECISION "14" From eeea3b002d7e230e7b1b390c997a39798a2822d5 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 11 Oct 2024 02:47:41 -0700 Subject: [PATCH 03/29] Possible WAR for ROCm performance issue with MRHS staggered kernels --- include/kernels/dslash_staggered.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh index cfeaa553e7..07e8f333dd 100644 --- a/include/kernels/dslash_staggered.cuh +++ b/include/kernels/dslash_staggered.cuh @@ -238,8 +238,9 @@ namespace quda } template - __device__ __host__ __forceinline__ void operator()(int idx, int src_idx_block, int parity) + __device__ __host__ __forceinline__ void operator()(int idx, int src_idx_block_, int parity) { + int src_idx_block = MAX_MULTI_RHS == 1 ? 
0 : src_idx_block_; int src_idx = src_idx_block * Arg::n_src_tile; if (src_idx + n_src_tile <= arg.n_src) { apply(idx, src_idx, parity); From 467a493b11415170d84c93369e85717de6a1d0c4 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 11 Oct 2024 05:07:37 -0700 Subject: [PATCH 04/29] FieldTmp now supports creating temporaries using a T::param_type --- include/field_cache.h | 27 ++++++++++++++++++++++++++- lib/field_cache.cpp | 15 +++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/include/field_cache.h b/include/field_cache.h index 5e05f9acf6..f288844300 100644 --- a/include/field_cache.h +++ b/include/field_cache.h @@ -15,7 +15,7 @@ namespace quda { */ template struct FieldKey { - std::string volume; /** volume kstring */ + std::string volume; /** volume string */ std::string aux; /** auxiliary string */ FieldKey() = default; @@ -78,6 +78,18 @@ namespace quda { */ FieldTmp(const FieldKey &key, const typename T::param_type &param); + /** + @brief Create a field temporary that corresponds to the field + constructed from the param struct. If a matching field is + present in the cache, it will be popped from the cache. If no + such temporary exists, a temporary will be allocated. + @param[in] param Parameter structure used to allocate + the temporary + */ + FieldTmp(typename T::param_type param); + /** @brief Copy constructor is deleted to prevent accidental cache bloat @@ -111,6 +123,18 @@ namespace quda { */ template auto getFieldTmp(const T &a) { return FieldTmp(a); } + /** + @brief Get a field temporary that matches the given parameter + struct. If a matching field is present in the cache, + it will be popped from the cache. If no such temporary exists, a + temporary will be allocated. When the destructor for the + FieldTmp is called, e.g., when the returned object goes out of scope, + the temporary will be pushed onto the cache. + + @param[in] param Parameter struct describing the temporary we require + */ + template auto getFieldTmp(const typename T::param_type &param) { return FieldTmp(param); } + /** @brief Get a vector of field temporaries that are identical to the vector instance argument. 
If enough matching fields are present in the cache, they will be popped from the cache. If not enough such temporaries exist, temporaries will be allocated. When the destructor for the FieldTmp is called, e.g., when the returned object goes out of scope, the temporary will be pushed onto the cache. @param[in] a Vector of fields we wish to create matching temporaries for */ template auto getFieldTmp(cvector_ref &a) { std::vector> tmp; tmp.reserve(a.size()); for (auto i = 0u; i < a.size(); i++) tmp.push_back(std::move(getFieldTmp(a[i]))); return tmp; } + } diff --git a/lib/field_cache.cpp b/lib/field_cache.cpp index 5ea09e3059..b98f24c065 100644 --- a/lib/field_cache.cpp +++ b/lib/field_cache.cpp @@ -35,6 +35,21 @@ namespace quda { } } + template FieldTmp::FieldTmp(typename T::param_type param) + { + param.create = QUDA_REFERENCE_FIELD_CREATE; + key = FieldKey(T(param)); + + auto it = cache.find(key); + if (it != cache.end() && it->second.size()) { // found an entry + tmp = std::move(it->second.top()); + it->second.pop(); // pop the defunct object + } else { // no entry found, we must allocate a new field + param.create = QUDA_ZERO_FIELD_CREATE; + tmp = T(param); + } + } + template FieldTmp::~FieldTmp() { // don't cache the field if it's empty (e.g., has been moved) From 36fcf3dac220dfc2554539efd53fd4578b1dfc8e Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 11 Oct 2024 05:08:22 -0700 Subject: [PATCH 05/29] DslashCoarse now uses getFieldTmp for its MMA temporaries to avoid allocation overheads when memory pool is disabled --- lib/dslash_coarse.in.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/dslash_coarse.in.cpp b/lib/dslash_coarse.in.cpp index 747f2e9958..52298ceb14 100644 --- a/lib/dslash_coarse.in.cpp +++ b/lib/dslash_coarse.in.cpp @@ -88,7 +88,7 @@ namespace quda param.nVec = nVec; param.create = QUDA_NULL_FIELD_CREATE; param.fieldOrder = order; - return ColorSpinorField(param); + return getFieldTmp(param); } // Apply the coarse Dirac matrix to a coarse grid vector @@ -107,9 +107,9 @@ namespace quda IntList<@QUDA_MULTIGRID_NVEC_LIST@>()); } else { constexpr QudaFieldOrder csOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - ColorSpinorField v_inA = create_color_spinor_copy(inA, csOrder); - ColorSpinorField v_inB = create_color_spinor_copy(inB, csOrder); - ColorSpinorField v_out = create_color_spinor_copy(out, csOrder); + auto v_inA = create_color_spinor_copy(inA, csOrder); + auto v_inB = create_color_spinor_copy(inB, csOrder); + auto v_out = create_color_spinor_copy(out, csOrder); if (dslash) { BlockTransposeForward(v_inA, inA); } if (clover) { BlockTransposeForward(v_inB, inB); } From db865e495ec19762ca8d3cb14a011e9c2fb49457 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 14 Oct 2024 11:34:05 -0700 Subject: [PATCH 06/29] Small cleanup of staggered dslash --- include/kernels/dslash_staggered.cuh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh index 07e8f333dd..f1c5bf9210 100644 --- a/include/kernels/dslash_staggered.cuh +++ b/include/kernels/dslash_staggered.cuh @@ -220,16 +220,14 @@ namespace quda for (auto s = 0; s < n_src_tile; s++) out[s] *= arg.dagger_scale; if (xpay && mykernel_type == INTERIOR_KERNEL) { - array x; for (auto s = 0; s < n_src_tile; s++) { - x[s] = arg.x[src_idx + s](coord.x_cb, my_spinor_parity); - out[s] = arg.a * x[s] - out[s]; + Vector x = arg.x[src_idx + s](coord.x_cb, my_spinor_parity); + out[s] = arg.a * x - out[s]; } } else if (mykernel_type != INTERIOR_KERNEL) { - array x; for (auto s = 0; s < n_src_tile; s++) { - x[s] = arg.x[src_idx + s](coord.x_cb, my_spinor_parity); - out[s] = x[s] + (xpay ? 
-out[s] : out[s]); } } if (mykernel_type != EXTERIOR_KERNEL_ALL || active) { From 8f72560aaadee965af94e63738e6b1b8cc44cfb4 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 15 Oct 2024 11:31:23 -0700 Subject: [PATCH 07/29] Add default initialization to array::data --- include/array.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/array.h b/include/array.h index 3005087c85..e603f066c5 100644 --- a/include/array.h +++ b/include/array.h @@ -12,7 +12,7 @@ namespace quda template struct array { using value_type = T; static constexpr int N = n; - T data[n]; + T data[n] = {}; constexpr T &operator[](int i) { return data[i]; } constexpr const T &operator[](int i) const { return data[i]; } From b2cebb08ae2e9712ebaffa3a3536e455ff4f4940 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 15 Oct 2024 11:58:30 -0700 Subject: [PATCH 08/29] Fix multi-gpu bug --- include/kernels/dslash_staggered.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh index f1c5bf9210..ef01f43907 100644 --- a/include/kernels/dslash_staggered.cuh +++ b/include/kernels/dslash_staggered.cuh @@ -226,7 +226,7 @@ namespace quda } } else if (mykernel_type != INTERIOR_KERNEL) { for (auto s = 0; s < n_src_tile; s++) { - Vector x = arg.x[src_idx + s](coord.x_cb, my_spinor_parity); + Vector x = arg.out[src_idx + s](coord.x_cb, my_spinor_parity); out[s] = x + (xpay ? -out[s] : out[s]); } } From 704f6b2c5ef854ab6a3fcdc000db57f42b170a45 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Mon, 14 Oct 2024 16:12:40 -0700 Subject: [PATCH 09/29] Fixed a logic bug in the MR convergence check --- lib/inv_mr_quda.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/inv_mr_quda.cpp b/lib/inv_mr_quda.cpp index 1696cd4d42..0e86f80a17 100644 --- a/lib/inv_mr_quda.cpp +++ b/lib/inv_mr_quda.cpp @@ -148,13 +148,13 @@ namespace quda mat(r, x); r2 = blas::xmyNorm(b, r); for (auto i = 0u; i < b2.size(); i++) param.true_res[i] = sqrt(r2[i] / b2[i]); - converged = (step < param.Nsteps && r2 > stop) ? false : true; + converged = (step < param.Nsteps && r2 < stop) ? true : false; if (!converged) blas::copy(r_sloppy, r); PrintStats("MR (restart)", iter, r2, b2); } else { blas::ax(scale, r_sloppy); r2 = blas::norm2(r_sloppy); - converged = (step < param.Nsteps && r2 > stop) ? false : true; + converged = (step < param.Nsteps && r2 < stop) ? true : false; if (!converged) blas::copy(r, r_sloppy); } step++; From 57c631c9b8648b3b74e6054a5260fb3e4fce8062 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 15 Oct 2024 14:58:18 -0700 Subject: [PATCH 10/29] Revert "Add default initialization to array::data" This reverts commit 8f72560aaadee965af94e63738e6b1b8cc44cfb4. 
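For reference, a minimal sketch (plain C++, mirroring quda::array in include/array.h) of the change being undone; only the member initializer differs:

    template <typename T, int n> struct array {
      // reverted form: T data[n] = {};  // value-initializes every element on construction
      T data[n]; // restored form: trivial element types remain unwritten until first assigned
    };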
--- include/array.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/array.h b/include/array.h index e603f066c5..3005087c85 100644 --- a/include/array.h +++ b/include/array.h @@ -12,7 +12,7 @@ namespace quda template struct array { using value_type = T; static constexpr int N = n; - T data[n] = {}; + T data[n]; constexpr T &operator[](int i) { return data[i]; } constexpr const T &operator[](int i) const { return data[i]; } From 9ddd25b9f9d67067522cde09b38518356517e86e Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 16 Oct 2024 11:16:48 -0700 Subject: [PATCH 11/29] n_src should be a member of DslashArg --- include/dslash_helper.cuh | 2 ++ include/kernels/dslash_staggered.cuh | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh index c39fd41f3f..8a9ba2391c 100644 --- a/include/dslash_helper.cuh +++ b/include/dslash_helper.cuh @@ -270,6 +270,7 @@ namespace quda int threadDimMapLower[4]; int threadDimMapUpper[4]; + int_fastdiv n_src; int_fastdiv Ls; // these are set with symmetric preconditioned twisted-mass dagger @@ -328,6 +329,7 @@ namespace quda exterior_threads(0), threadDimMapLower {}, threadDimMapUpper {}, + n_src(in.size()), Ls(halo.X(4) / in.size()), twist_a(0.0), twist_b(0.0), diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh index ef01f43907..3dfaf01be0 100644 --- a/include/kernels/dslash_staggered.cuh +++ b/include/kernels/dslash_staggered.cuh @@ -37,7 +37,6 @@ namespace quda using GL = typename gauge_mapper::type; - const int_fastdiv n_src; F out[MAX_MULTI_RHS]; /** output vector field */ F in[MAX_MULTI_RHS]; /** input vector field */ const Ghost halo_pack; /** accessor for writing the halo */ @@ -59,7 +58,6 @@ namespace quda cvector_ref &x, int parity, bool dagger, const int *comm_override) : DslashArg(out, in, halo, U, x, parity, dagger, a == 0.0 ? false : true, improved_ ? 3 : 1, spin_project, comm_override), - n_src(out.size()), halo_pack(halo, improved_ ? 3 : 1), halo(halo, improved_ ? 
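// the improved operator includes three-hop (Naik) terms, so its halo must be three sites deep rather than one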
3 : 1), U(U), From 01725e6e84e26af54767496b080a0230645cdac6 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 16 Oct 2024 17:15:19 -0700 Subject: [PATCH 12/29] Fix fused pack + dslash kernels with n_src_tile > 1 --- include/kernels/dslash_pack.cuh | 201 ++++++++++++++++++------------ include/kernels/dslash_wilson.cuh | 1 - lib/dslash_pack2.cu | 4 + 3 files changed, 127 insertions(+), 79 deletions(-) diff --git a/include/kernels/dslash_pack.cuh b/include/kernels/dslash_pack.cuh index 94e0465238..ee34616bbd 100644 --- a/include/kernels/dslash_pack.cuh +++ b/include/kernels/dslash_pack.cuh @@ -11,8 +11,10 @@ namespace quda { int *getPackComms(); - template + constexpr int pack_tile_size = 1; + + template struct PackArg : kernel_param<> { typedef Float_ Float; @@ -24,6 +26,7 @@ namespace quda static constexpr bool dagger = dagger_; static constexpr int twist = twist_; // whether we are doing preconditioned twisted-mass or not (1 - singlet, 2 - doublet) static constexpr QudaPCType pc_type = pc_type_; // preconditioning type (4-d or 5-d) + static constexpr int n_src_tile = n_src_tile_; static constexpr bool spinor_direct_load = false; // false means texture load @@ -54,6 +57,7 @@ namespace quda int sites_per_block; + int_fastdiv n_src; int_fastdiv Ls; char *packBuffer[4 * QUDA_MAX_DIM]; @@ -96,6 +100,7 @@ namespace quda threadDimMapLower {}, threadDimMapUpper {}, sites_per_block((work_items + grid - 1) / grid), + n_src(in.size()), Ls(halo.X(4) / in.size()) #ifdef NVSHMEM_COMMS , @@ -131,7 +136,7 @@ namespace quda } }; - template + template __device__ __host__ inline void pack(const Arg &arg, int ghost_idx, int s, int parity, int src_idx) { typedef typename mapper::type real; @@ -160,46 +165,50 @@ namespace quda int idx = indexFromFaceIndex(ghost_idx, parity, arg); constexpr int proj_dir = dagger ? +1 : -1; - Vector f = arg.in[src_idx](idx + s * arg.dc.volume_4d_cb, spinor_parity); - if (twist == 1) { - f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); - } else if (twist == 2) { - Vector f1 = arg.in[src_idx](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor - if (s == 0) - f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); - else - f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); - } - if (arg.spin_project) { - arg.halo_pack.Ghost(dim, 0, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) - = f.project(dim, proj_dir); - } else { - arg.halo_pack.Ghost(dim, 0, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx + s * arg.dc.volume_4d_cb, spinor_parity); + if (twist == 1) { + f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); + } else if (twist == 2) { + Vector f1 = arg.in[src](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor + if (s == 0) + f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); + else + f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); + } + if (arg.spin_project) { + arg.halo_pack.Ghost(dim, 0, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) + = f.project(dim, proj_dir); + } else { + arg.halo_pack.Ghost(dim, 0, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } } else { // forwards int idx = indexFromFaceIndex(ghost_idx, parity, arg); constexpr int proj_dir = dagger ? 
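// forward-face projector direction: the mirror of the backward case above, flipping under dagger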
-1 : +1; - Vector f = arg.in[src_idx](idx + s * arg.dc.volume_4d_cb, spinor_parity); - if (twist == 1) { - f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); - } else if (twist == 2) { - Vector f1 = arg.in[src_idx](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor - if (s == 0) - f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); - else - f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); - } - if (arg.spin_project) { - arg.halo_pack.Ghost(dim, 1, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) - = f.project(dim, proj_dir); - } else { - arg.halo_pack.Ghost(dim, 1, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx + s * arg.dc.volume_4d_cb, spinor_parity); + if (twist == 1) { + f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); + } else if (twist == 2) { + Vector f1 = arg.in[src](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor + if (s == 0) + f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); + else + f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); + } + if (arg.spin_project) { + arg.halo_pack.Ghost(dim, 1, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) + = f.project(dim, proj_dir); + } else { + arg.halo_pack.Ghost(dim, 1, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } } } - template + template __device__ __host__ inline void packStaggered(const Arg &arg, int ghost_idx, int parity, int src_idx) { typedef typename mapper::type real; @@ -217,12 +226,16 @@ namespace quda if (face_num == 0) { // backwards int idx = indexFromFaceIndexStaggered<4, QUDA_4D_PC, dim, nFace, 0>(ghost_idx, parity, arg); - Vector f = arg.in[src_idx](idx, spinor_parity); - arg.halo_pack.Ghost(dim, 0, ghost_idx + src_idx * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + for (auto s = 0; s < n_src_tile; s++) { + Vector f = arg.in[src_idx + s](idx, spinor_parity); + arg.halo_pack.Ghost(dim, 0, ghost_idx + (src_idx + s) * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } else { // forwards int idx = indexFromFaceIndexStaggered<4, QUDA_4D_PC, dim, nFace, 1>(ghost_idx, parity, arg); - Vector f = arg.in[src_idx](idx, spinor_parity); - arg.halo_pack.Ghost(dim, 1, ghost_idx + src_idx * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + for (auto s = 0; s < n_src_tile; s++) { + Vector f = arg.in[src_idx + s](idx, spinor_parity); + arg.halo_pack.Ghost(dim, 1, ghost_idx + (src_idx + s) * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } } @@ -250,24 +263,36 @@ namespace quda if (Arg::pc_type == QUDA_5D_PC) { // 5-d checkerboarded, include s (not ghostFaceCB since both faces) switch (dim) { case 0: - pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, + parity, src_idx); break; case 1: - pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, + parity, src_idx); break; case 2: - pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, + parity, src_idx); break; case 3: - pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, + parity, src_idx); break; } } else { // 4-d checkerboarding, keeping s separate (if 
it exists) switch (dim) { - case 0: pack(arg, ghost_idx, s, parity, src_idx); break; - case 1: pack(arg, ghost_idx, s, parity, src_idx); break; - case 2: pack(arg, ghost_idx, s, parity, src_idx); break; - case 3: pack(arg, ghost_idx, s, parity, src_idx); break; + case 0: + pack(arg, ghost_idx, s, parity, src_idx); + break; + case 1: + pack(arg, ghost_idx, s, parity, src_idx); + break; + case 2: + pack(arg, ghost_idx, s, parity, src_idx); + break; + case 3: + pack(arg, ghost_idx, s, parity, src_idx); + break; } } @@ -288,7 +313,8 @@ namespace quda // 64 - use uber kernel (merge exterior) template struct packShmem { - template __device__ __forceinline__ void operator()(const Arg &arg, int src_s, int parity) + template + __device__ __forceinline__ void operator()(const Arg &arg, int src_s, int parity) { // (active_dims * 2 + dir) * blocks_per_dir + local_block_idx int local_block_idx = target::block_idx().x % arg.blocks_per_dir; @@ -315,9 +341,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[0]) { int ghost_idx = dir * arg.dc.ghostFaceCB[0] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -325,9 +351,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[1]) { int ghost_idx = dir * arg.dc.ghostFaceCB[1] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -335,9 +361,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[2]) { int ghost_idx = dir * arg.dc.ghostFaceCB[2] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -345,9 +371,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[3]) { int ghost_idx = dir * arg.dc.ghostFaceCB[3] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -358,12 +384,19 @@ namespace quda #endif } - __device__ __forceinline__ void operator()(const Arg &arg, int s, int parity, int twist_pack) + template + __device__ __forceinline__ void operator()(const Arg &arg, int src_s_block_, int parity, int twist_pack) { - switch (twist_pack) { - case 0: this->operator()<0>(arg, s, parity); break; - case 1: this->operator()<1>(arg, s, parity); break; - case 2: this->operator()<2>(arg, s, parity); break; + int src_s_block = MAX_MULTI_RHS == 1 ? 
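+      // pin the block index to a literal zero when only one RHS is compiled in, letting the
+      // compiler fold the tile indexing away (same possible ROCm WAR as in PATCH 03/29)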
0 : src_s_block_; + int src_s_idx = src_s_block * Arg::n_src_tile; + if (src_s_idx + n_src_tile * arg.Ls <= arg.n_src * arg.Ls) { + switch (twist_pack) { + case 0: this->operator()<0, n_src_tile>(arg, src_s_idx, parity); break; + case 1: this->operator()<1, n_src_tile>(arg, src_s_idx, parity); break; + case 2: this->operator()<2, n_src_tile>(arg, src_s_idx, parity); break; + } + } else if constexpr (n_src_tile - 1 > 0) { + operator()(arg, src_s_block, parity); } } }; @@ -377,7 +410,7 @@ namespace quda { if (arg.nParity == 1) parity = arg.parity; packShmem pack; - pack.operator()(arg, s, parity); + pack.operator()(arg, s, parity, Arg::twist); } }; @@ -400,17 +433,17 @@ namespace quda if (arg.nFace == 1) { switch (dim) { - case 0: packStaggered<0, 1>(arg, ghost_idx, parity, src_idx); break; - case 1: packStaggered<1, 1>(arg, ghost_idx, parity, src_idx); break; - case 2: packStaggered<2, 1>(arg, ghost_idx, parity, src_idx); break; - case 3: packStaggered<3, 1>(arg, ghost_idx, parity, src_idx); break; + case 0: packStaggered<0, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 1: packStaggered<1, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 2: packStaggered<2, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 3: packStaggered<3, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; } } else if (arg.nFace == 3) { switch (dim) { - case 0: packStaggered<0, 3>(arg, ghost_idx, parity, src_idx); break; - case 1: packStaggered<1, 3>(arg, ghost_idx, parity, src_idx); break; - case 2: packStaggered<2, 3>(arg, ghost_idx, parity, src_idx); break; - case 3: packStaggered<3, 3>(arg, ghost_idx, parity, src_idx); break; + case 0: packStaggered<0, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 1: packStaggered<1, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 2: packStaggered<2, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 3: packStaggered<3, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; } } @@ -422,7 +455,7 @@ namespace quda template struct packStaggeredShmem { - __device__ __forceinline__ void operator()(const Arg &arg, int src_idx, int parity, int = 0) + template __device__ __forceinline__ void apply(const Arg &arg, int src_idx, int parity) { // (active_dims * 2 + dir) * blocks_per_dir + local_block_idx int local_block_idx = target::block_idx().x % arg.blocks_per_dir; @@ -446,9 +479,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[0]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[0] + local_tid; if (arg.nFace == 1) - packStaggered<0, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<0, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<0, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<0, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -456,9 +489,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[1]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[1] + local_tid; if (arg.nFace == 1) - packStaggered<1, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<1, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<1, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<1, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -466,9 +499,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[2]) { int ghost_idx = dir * 
arg.nFace * arg.dc.ghostFaceCB[2] + local_tid; if (arg.nFace == 1) - packStaggered<2, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<2, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<2, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<2, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -476,9 +509,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[3]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[3] + local_tid; if (arg.nFace == 1) - packStaggered<3, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<3, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<3, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<3, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -488,6 +521,18 @@ namespace quda if (arg.shmem) shmem_signal(dim, dir, arg); #endif } + + template + __device__ __forceinline__ void operator()(const Arg &arg, int src_idx_block_, int parity, int = 0) + { + int src_idx_block = MAX_MULTI_RHS == 1 ? 0 : src_idx_block_; + int src_idx = src_idx_block * Arg::n_src_tile; + if (src_idx + n_src_tile <= arg.n_src) { + apply(arg, src_idx, parity); + } else if constexpr (n_src_tile - 1 > 0) { + operator()(arg, src_idx_block, parity); + } + } }; template struct pack_staggered_shmem { diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh index 514c4df957..584999d52f 100644 --- a/include/kernels/dslash_wilson.cuh +++ b/include/kernels/dslash_wilson.cuh @@ -34,7 +34,6 @@ namespace quda typedef typename mapper::type real; - const int_fastdiv n_src; F out[MAX_MULTI_RHS]; /** output vector field set */ F in[MAX_MULTI_RHS]; /** input vector field set */ F x[MAX_MULTI_RHS]; /** input vector set when doing xpay */ diff --git a/lib/dslash_pack2.cu b/lib/dslash_pack2.cu index 043ea65abc..01b70f3284 100644 --- a/lib/dslash_pack2.cu +++ b/lib/dslash_pack2.cu @@ -139,6 +139,10 @@ namespace quda strcpy(aux, "policy_kernel,"); strcat(aux, in.AuxString().c_str()); setRHSstring(aux, in.size()); + strcat(aux, ",n_rhs_tile="); + char tile_str[16]; + i32toa(tile_str, pack_tile_size); + strcat(aux, tile_str); char comm[5]; for (int i = 0; i < 4; i++) comm[i] = (comm_dim_pack[i] ? 
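// record which dimensions are partitioned for communication in the tuning key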
'1' : '0'); comm[4] = '\0'; From 77b6b594c755665e97c1cc3ae94a2822b18bb18d Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 18 Nov 2024 14:11:43 -0800 Subject: [PATCH 13/29] Fix bug in dslash_test_utils.h --- tests/dslash_test_utils.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/dslash_test_utils.h b/tests/dslash_test_utils.h index ec4ca5b13a..5512c3d0a2 100644 --- a/tests/dslash_test_utils.h +++ b/tests/dslash_test_utils.h @@ -428,9 +428,8 @@ struct DslashTestWrapper { switch (dtest_type) { case dslash_test_type::Dslash: // My dslash should be the same as the clover dslash - for (int i = 0; i < Nsrc; i++) - clover_dslash(spinorRef[i].data(), hostGauge, hostCloverInv, spinor[i].data(), parity, inv_param.dagger, - inv_param.cpu_prec, gauge_param); + clover_dslash(spinorRef[i].data(), hostGauge, hostCloverInv, spinor[i].data(), parity, inv_param.dagger, + inv_param.cpu_prec, gauge_param); break; case dslash_test_type::MatPC: // my matpc op From 27339e4afdb752d5119788e1e26585be279558d9 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 18 Nov 2024 14:21:48 -0800 Subject: [PATCH 14/29] nvc++ no longer needs to use constant memory args for dslash --- include/dslash_helper.cuh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh index 8a9ba2391c..0e49e64b68 100644 --- a/include/dslash_helper.cuh +++ b/include/dslash_helper.cuh @@ -12,12 +12,7 @@ #include #include -#if defined(_NVHPC_CUDA) -#include -constexpr quda::use_kernel_arg_p use_kernel_arg = quda::use_kernel_arg_p::FALSE; -#else constexpr quda::use_kernel_arg_p use_kernel_arg = quda::use_kernel_arg_p::TRUE; -#endif #include From 936c2c4b21ae2e816c997268d499cefa00674eab Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 18 Nov 2024 14:23:49 -0800 Subject: [PATCH 15/29] Fix for nvc++ and remove unneeded target specific thread_array.h files --- include/targets/cuda/thread_array.h | 18 ------------------ include/targets/hip/thread_array.h | 1 - 2 files changed, 19 deletions(-) delete mode 100644 include/targets/cuda/thread_array.h delete mode 100644 include/targets/hip/thread_array.h diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h deleted file mode 100644 index 1c4d7f3244..0000000000 --- a/include/targets/cuda/thread_array.h +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#ifndef _NVHPC_CUDA - -#include "../generic/thread_array.h" - -#else - -#include - -namespace quda -{ - template struct thread_array : array { - static constexpr unsigned int shared_mem_size(dim3 block) { return 0; } - }; -} // namespace quda - -#endif diff --git a/include/targets/hip/thread_array.h b/include/targets/hip/thread_array.h deleted file mode 100644 index 24f986cc26..0000000000 --- a/include/targets/hip/thread_array.h +++ /dev/null @@ -1 +0,0 @@ -#include "../generic/thread_array.h" From fd00d0888e89db6ce38615a8eee9a8853091c901 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 19 Nov 2024 07:50:43 -0800 Subject: [PATCH 16/29] Remove WAR for nvc++ in reduce_helper which is no longer needed --- include/targets/cuda/reduce_helper.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/targets/cuda/reduce_helper.h b/include/targets/cuda/reduce_helper.h index c8c6101892..7b8e3e4b57 100644 --- a/include/targets/cuda/reduce_helper.h +++ b/include/targets/cuda/reduce_helper.h @@ -169,11 +169,7 @@ namespace quda auto tid = target::thread_idx_linear<2>(); if (arg.result_d) { // write to host mapped memory -#ifdef 
_NVHPC_CUDA // WAR for nvc++ - constexpr bool coalesced_write = false; -#else constexpr bool coalesced_write = true; -#endif if constexpr (coalesced_write) { static_assert(n <= device::warp_size(), "reduction array is greater than warp size"); auto mask = __ballot_sync(0xffffffff, tid < n); From fc5ac97cd4d34e4b57a3ab01a3b6bee58bb4cc7c Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 20 Nov 2024 07:10:09 -0800 Subject: [PATCH 17/29] Add versioned CPM files to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index a60ac7da48..8e122f0a4e 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ include/jitify_options.hpp .tags* autom4te.cache/* .vscode +cmake/CPM_*.cmake From a10d6f301d7a0f5aefffce81d92f0163e1ff832b Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 20 Nov 2024 07:47:19 -0800 Subject: [PATCH 18/29] Fix complex_quda.h to be C++20 compliant --- include/complex_quda.h | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/include/complex_quda.h b/include/complex_quda.h index 18da63def5..1481dec40f 100644 --- a/include/complex_quda.h +++ b/include/complex_quda.h @@ -360,23 +360,19 @@ struct complex typedef ValueType value_type; // Constructors - __host__ __device__ inline complex(const ValueType &re = ValueType(), const ValueType &im = ValueType()) + __host__ __device__ inline complex(const ValueType &re = ValueType(), const ValueType &im = ValueType()) { real(re); imag(im); } - template - __host__ __device__ - inline complex(const complex & z) + template __host__ __device__ inline complex(const complex &z) { real(z.real()); imag(z.imag()); } - template - __host__ __device__ - inline complex(const std::complex & z) + template __host__ __device__ inline complex(const std::complex &z) { real(z.real()); imag(z.imag()); @@ -436,12 +432,11 @@ struct complex template <> struct complex : public float2 { public: typedef float value_type; - complex() = default; - constexpr complex(const float &re, const float &im = float()) : float2 {re, im} { } + complex() = default; + constexpr complex(const float &re, const float &im = float()) : float2 {re, im} { } template - constexpr complex(const std::complex &z) : - float2 {static_cast(z.real()), static_cast(z.imag())} + constexpr complex(const std::complex &z) : float2 {static_cast(z.real()), static_cast(z.imag())} { } @@ -500,16 +495,15 @@ template <> struct complex : public float2 { template <> struct complex : public double2 { public: typedef double value_type; - complex() = default; - constexpr complex(const double &re, const double &im = double()) : double2 {re, im} { } + complex() = default; + constexpr complex(const double &re, const double &im = double()) : double2 {re, im} { } template - constexpr complex(const std::complex &z) : - double2 {static_cast(z.real()), static_cast(z.imag())} + constexpr complex(const std::complex &z) : double2 {static_cast(z.real()), static_cast(z.imag())} { } - template __host__ __device__ inline complex &operator=(const complex &z) + template __host__ __device__ inline complex &operator=(const complex &z) { real(z.real()); imag(z.imag()); @@ -572,9 +566,9 @@ template <> struct complex : public char2 { public: typedef int8_t value_type; - complex() = default; + complex() = default; - constexpr complex(const int8_t &re, const int8_t &im = int8_t()) : char2 {re, im} { } + constexpr complex(const int8_t &re, const int8_t &im = int8_t()) : char2 {re, im} { } __host__ __device__ inline complex 
&operator+=(const complex &z) { @@ -608,9 +602,9 @@ struct complex : public short2 public: typedef short value_type; - complex() = default; + complex() = default; - constexpr complex(const short &re, const short &im = short()) : short2 {re, im} { } + constexpr complex(const short &re, const short &im = short()) : short2 {re, im} { } __host__ __device__ inline complex &operator+=(const complex &z) { @@ -644,9 +638,9 @@ struct complex : public int2 public: typedef int value_type; - complex() = default; + complex() = default; - constexpr complex(const int &re, const int &im = int()) : int2 {re, im} { } + constexpr complex(const int &re, const int &im = int()) : int2 {re, im} { } __host__ __device__ inline complex &operator+=(const complex &z) { From c47e1f00a798be1f25fb4fa6c888b5d7435ef924 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 20 Nov 2024 10:20:35 -0800 Subject: [PATCH 19/29] Add new variant of heterogeneous reductions: with some compilers, optimization can optimize out infinities, rendering the use of heterogeneous atomics with an infinity sentinel unreliable. As an alternative we can use negative zero as the sentinel. The default remains infinity; disable with QUDA_HETEROGENEOUS_ATOMIC_INF_INIT=OFF --- include/quda_define.h.in | 10 ++++++++++ include/targets/cuda/reduce_helper.h | 29 +++++++++++++++++++++++++++- lib/targets/cuda/target_cuda.cmake | 3 +++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/include/quda_define.h.in b/include/quda_define.h.in index 90a6b24d1a..86957d9aba 100644 --- a/include/quda_define.h.in +++ b/include/quda_define.h.in @@ -42,6 +42,16 @@ #undef QUDA_HETEROGENEOUS_ATOMIC #endif +#cmakedefine QUDA_HETEROGENEOUS_ATOMIC_INF_INIT +#ifdef QUDA_HETEROGENEOUS_ATOMIC_INF_INIT +/** + * @def HETEROGENEOUS_ATOMIC_INF_INIT + * @brief This macro sets whether we are using infinity for the signaling sentinel + */ +#define HETEROGENEOUS_ATOMIC_INF_INIT +#undef QUDA_HETEROGENEOUS_ATOMIC_INF_INIT +#endif + #cmakedefine QUDA_LARGE_KERNEL_ARG #cmakedefine QUDA_DIRAC_CLOVER_HASENBUSCH diff --git a/include/targets/cuda/reduce_helper.h b/include/targets/cuda/reduce_helper.h index 7b8e3e4b57..58aca5b9be 100644 --- a/include/targets/cuda/reduce_helper.h +++ b/include/targets/cuda/reduce_helper.h @@ -17,6 +17,11 @@ using count_t = cuda::atomic; namespace quda { +// By default we use negative infinity as the sentinel for testing for +// reduction completion. On some compilers we may need to use finite +// numbers, so the alternative approach uses negative zero (disable the CMake +// option QUDA_HETEROGENEOUS_ATOMIC_INF_INIT to select this). 
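+// Negative zero can act as a sentinel because IEEE 754 compares -0.0 == 0.0, so it is
+// only observable through the explicit sign-bit test in is_complete below, while a
+// genuine result equal to the sentinel is written as terminate_value() to avoid a hang.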
+#ifdef HETEROGENEOUS_ATOMIC_INF_INIT /** @brief The initialization value we use to check for completion */ @@ -28,6 +33,28 @@ namespace quda */ template constexpr T terminate_value() { return cuda::std::numeric_limits::infinity(); } + /** + @brief Test if the result is complete (i.e., it is not equal to the sentinel) + */ + template bool is_complete(const T &result) { return result != init_value(); } +#else + /** + @brief The initialization value we use to check for completion + */ + template constexpr T init_value() { return -static_cast(0.0); } + + /** + @brief The termination value we use to prevent a possible hang in + case the computed reduction is equal to the initialization + */ + template constexpr T terminate_value() { return static_cast(0.0); } + + /** + @brief Test if the result is complete (i.e., it is not equal to the sentinel) + */ + template bool is_complete(const T &result) { return !(result == static_cast(0.0) && std::signbit(result)); } +#endif + // declaration of reduce function template __device__ inline void reduce(Arg &arg, const Reducer &r, const T &in, const int idx = 0); @@ -129,7 +156,7 @@ namespace quda if (consumed) errorQuda("Cannot call complete more than once for each construction"); for (int i = 0; i < n_reduce * n_item; i++) { - while (result_h[i].load(cuda::std::memory_order_relaxed) == init_value()) { } + while (!is_complete(result_h[i].load(cuda::std::memory_order_relaxed))) { } } // copy back result element by element and convert if necessary to host reduce type diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake index eb71bb9d53..2670f0a147 100644 --- a/lib/targets/cuda/target_cuda.cmake +++ b/lib/targets/cuda/target_cuda.cmake @@ -103,8 +103,11 @@ if(CMAKE_CUDA_COMPILER_ID MATCHES "NVIDIA" OR CMAKE_CUDA_COMPILER_ID MATCHES "NV endif() cmake_dependent_option(QUDA_HETEROGENEOUS_ATOMIC "enable heterogeneous atomic support ?" ON "QUDA_HETEROGENEOUS_ATOMIC_SUPPORT" OFF) +cmake_dependent_option(QUDA_HETEROGENEOUS_ATOMIC_INF_INIT "use infinity as the sentinel for heterogeneous atomic support?" 
ON + "QUDA_HETEROGENEOUS_ATOMIC_SUPPORT" OFF) mark_as_advanced(QUDA_HETEROGENEOUS_ATOMIC) +mark_as_advanced(QUDA_HETEROGENEOUS_ATOMIC_INF_INIT) mark_as_advanced(QUDA_JITIFY) mark_as_advanced(QUDA_DOWNLOAD_NVSHMEM) mark_as_advanced(QUDA_DOWNLOAD_NVSHMEM_TAR) From 8b61db2c3eb75f258e7d73f402bc98000c443e75 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 20 Nov 2024 11:45:34 -0800 Subject: [PATCH 20/29] Update to use latest release of Eigen 3.4: this fixes some bugs with nvc++ allowing for the removal of the WARs deployed previously --- CMakeLists.txt | 13 ++----------- cmake/eigen34_neon.diff | 8 -------- include/eigen_helper.h | 4 ---- 3 files changed, 2 insertions(+), 23 deletions(-) delete mode 100644 cmake/eigen34_neon.diff diff --git a/CMakeLists.txt b/CMakeLists.txt index 45219482b2..e9fe94cab2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -426,21 +426,12 @@ if(QUDA_DOWNLOAD_EIGEN) CPMAddPackage( NAME Eigen VERSION ${QUDA_EIGEN_VERSION} - URL https://gitlab.com/libeigen/eigen/-/archive/${QUDA_EIGEN_VERSION}/eigen-${QUDA_EIGEN_VERSION}.tar.bz2 - URL_HASH SHA256=B4C198460EBA6F28D34894E3A5710998818515104D6E74E5CC331CE31E46E626 + URL https://gitlab.com/libeigen/eigen/-/archive/e67c494cba7180066e73b9f6234d0b2129f1cdf5.tar.bz2 + URL_HASH SHA256=98d244932291506b75c4ae7459af29b1112ea3d2f04660686a925d9ef6634583 DOWNLOAD_ONLY YES SYSTEM YES) target_include_directories(Eigen SYSTEM INTERFACE ${Eigen_SOURCE_DIR}) install(DIRECTORY ${Eigen_SOURCE_DIR}/Eigen TYPE INCLUDE) - - # Eigen 3.4 needs to be patched on Neon with nvc++ - if (${CMAKE_CXX_COMPILER_ID} MATCHES "NVHPC") - set(CMAKE_PATCH_EIGEN OFF CACHE BOOL "Internal use only; do not modify") - if (NOT CMAKE_PATCH_EIGEN) - execute_process(COMMAND patch -N "${Eigen_SOURCE_DIR}/Eigen/src/Core/arch/NEON/Complex.h" "${CMAKE_SOURCE_DIR}/cmake/eigen34_neon.diff") - set(CMAKE_PATCH_EIGEN ON CACHE BOOL "Internal use only; do not modify" FORCE) - endif() - endif() else() # fall back to using find_package find_package(Eigen QUIET) diff --git a/cmake/eigen34_neon.diff b/cmake/eigen34_neon.diff deleted file mode 100644 index 7190ca7fa4..0000000000 --- a/cmake/eigen34_neon.diff +++ /dev/null @@ -1,8 +0,0 @@ -21c21 -< #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML ---- -> #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || __NVCOMPILER_LLVM__ -393c393 -< #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML ---- -> #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || __NVCOMPILER_LLVM__ diff --git a/include/eigen_helper.h b/include/eigen_helper.h index 8c1bf012a8..373f9ed865 100644 --- a/include/eigen_helper.h +++ b/include/eigen_helper.h @@ -5,10 +5,6 @@ #define EIGEN_USE_BLAS #endif -#if defined(__NVCOMPILER) // WAR for nvc++ until we update to latest Eigen -#define EIGEN_DONT_VECTORIZE -#endif - #include // hide annoying warning From 490930b680edca2c0a8d8a816852cdb87dd3f02b Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 20 Nov 2024 12:36:24 -0800 Subject: [PATCH 21/29] Apply ROCm perf WAR to Laplace operator --- include/kernels/laplace.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/kernels/laplace.cuh b/include/kernels/laplace.cuh index b8797e722e..a3c2b1b377 100644 --- a/include/kernels/laplace.cuh +++ b/include/kernels/laplace.cuh @@ -143,8 +143,9 @@ namespace quda static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation template - __device__ __host__ __forceinline__ void operator()(int idx, int src_idx, int parity) + __device__ __host__ __forceinline__ void 
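+  // as in the staggered kernel: launder the source index through a local so that a
+  // unit MAX_MULTI_RHS build collapses it to a compile-time zero (possible ROCm perf WAR)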
operator()(int idx, int src_idx_, int parity) { + int src_idx = MAX_MULTI_RHS == 1 ? 0 : src_idx_; using real = typename mapper::type; using Vector = ColorSpinor; From 26c3e5999bb8b0cd5376e9b1529ba1444b426307 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 20 Nov 2024 12:36:39 -0800 Subject: [PATCH 22/29] Fix nvc++ compiler warning --- include/kernels/staggered_quark_smearing.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/kernels/staggered_quark_smearing.cuh b/include/kernels/staggered_quark_smearing.cuh index 22323e66a2..85e1790318 100644 --- a/include/kernels/staggered_quark_smearing.cuh +++ b/include/kernels/staggered_quark_smearing.cuh @@ -179,9 +179,9 @@ namespace quda if (arg.is_t0_kernel) { if (arg.t0 < 0) return; - if (mykernel_type == INTERIOR_KERNEL) { + if constexpr (mykernel_type == INTERIOR_KERNEL) { idx += arg.t0 * arg.t0_offset; - } else if (mykernel_type != EXTERIOR_KERNEL_ALL) { + } else if constexpr (mykernel_type != EXTERIOR_KERNEL_ALL) { if (idx >= arg.t0_face_size[mykernel_type]) idx += arg.face_size[mykernel_type] - arg.t0_face_size[mykernel_type]; idx += arg.t0 * arg.t0_face_offset[mykernel_type]; From 2b81309b3521f5aaad727142411270b3e8530466 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 27 Nov 2024 09:38:30 -0800 Subject: [PATCH 23/29] Fix compiler warning introduced with 3debb29 --- tests/host_reference/hisq_force_reference.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp index 33d773ed95..5da6dc1a7a 100644 --- a/tests/host_reference/hisq_force_reference.cpp +++ b/tests/host_reference/hisq_force_reference.cpp @@ -837,13 +837,13 @@ void computeMiddleLinkSite(int half_lattice_index, // half_lattice_index to bett if (Qprev == NULL) { if (sig_positive) { - ls.loadMatrixFromField(static_cast(oprod), 1 - oddBit, sig, point_d, &colorMatY); + ls.loadMatrixFromField(static_cast(oprod), 1 - oddBit, sig, point_d, &colorMatY); } else { - ls.loadMatrixFromField(static_cast(oprod), oddBit, OPP_DIR(sig), point_c, &colorMatY); + ls.loadMatrixFromField(static_cast(oprod), oddBit, OPP_DIR(sig), point_c, &colorMatY); colorMatY = conj(colorMatY); } } else { // Qprev != NULL - ls.loadMatrixFromField(static_cast(oprod), oddBit, point_c, &colorMatY); + ls.loadMatrixFromField(static_cast(oprod), oddBit, point_c, &colorMatY); } colorMatW = (!mu_positive) ? bc_link * colorMatY : conj(bc_link) * colorMatY; From 0a3f608a9fcd1a69cf8563488151058c6726bb1d Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 27 Nov 2024 10:03:44 -0800 Subject: [PATCH 24/29] Fix process divergence issues (could hang when autotuning) in genericPrintMatrix routine. Apply same patch to genericPrintVector for future proofing --- lib/color_spinor_util.in.cu | 4 ++-- lib/gauge_norm.in.cu | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/color_spinor_util.in.cu b/lib/color_spinor_util.in.cu index a268a4b16a..71664614cf 100644 --- a/lib/color_spinor_util.in.cu +++ b/lib/color_spinor_util.in.cu @@ -378,8 +378,6 @@ namespace quda { void genericPrintVector(const ColorSpinorField &a, int parity, unsigned int x_cb, int rank) { - if (rank != comm_rank()) return; - ColorSpinorParam param(a); param.location = QUDA_CPU_FIELD_LOCATION; param.create = QUDA_COPY_FIELD_CREATE; @@ -388,6 +386,8 @@ namespace quda { std::unique_ptr clone_a = !host_clone ? nullptr : std::make_unique(param); const ColorSpinorField &a_ = !host_clone ? 
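// every rank must reach the host copy above, since it can trigger autotuning, which is
// collective; the non-matching ranks therefore return only after this point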
a : *clone_a.get(); + if (rank != comm_rank()) return; // rank returns after potential copy to host to prevent tuning hang + switch (a.Precision()) { case QUDA_DOUBLE_PRECISION: genericPrintVector(a_, parity, x_cb); break; case QUDA_SINGLE_PRECISION: genericPrintVector(a_, parity, x_cb); break; diff --git a/lib/gauge_norm.in.cu b/lib/gauge_norm.in.cu index 08cb5bf2ca..ddf59287f4 100644 --- a/lib/gauge_norm.in.cu +++ b/lib/gauge_norm.in.cu @@ -159,8 +159,6 @@ namespace quda { void genericPrintMatrix(const GaugeField &a, int d, int parity, unsigned int x_cb, int rank) { - if (rank != comm_rank()) return; - GaugeFieldParam param(a); param.field = const_cast(&a); param.location = QUDA_CPU_FIELD_LOCATION; @@ -172,6 +170,8 @@ namespace quda { std::unique_ptr clone_a = !host_clone ? nullptr : std::make_unique(param); const GaugeField &a_ = !host_clone ? a : *clone_a.get(); + if (rank != comm_rank()) return; // rank returns after potential copy to host to prevent tuning hang + switch (a.Precision()) { case QUDA_DOUBLE_PRECISION: genericPrintMatrix(a_, d, parity, x_cb); break; case QUDA_SINGLE_PRECISION: genericPrintMatrix(a_, d, parity, x_cb); break; From 22393c5a893b20a99b85b1f92dc86e32ef1c1d40 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 3 Dec 2024 13:26:40 -0800 Subject: [PATCH 25/29] Add pragma unroll --- include/kernels/dslash_staggered.cuh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh index 3dfaf01be0..ec09ab1622 100644 --- a/include/kernels/dslash_staggered.cuh +++ b/include/kernels/dslash_staggered.cuh @@ -103,6 +103,7 @@ namespace quda if (doHalo(d) && ghost) { const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dim, d, 1); const Link U = arg.improved ? arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)); +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { Vector in = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); @@ -111,6 +112,7 @@ namespace quda } else if (doBulk() && !ghost) { const int fwd_idx = linkIndexP1(coord, arg.dim, d); const Link U = arg.improved ? arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)); +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { Vector in = arg.in[src_idx + s](fwd_idx, their_spinor_parity); out[s] = mv_add(U, in, out[s]); @@ -124,6 +126,7 @@ namespace quda if (doHalo(d) && ghost) { const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dim, d, arg.nFace); const Link L = arg.L(d, coord.x_cb, parity); +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { const Vector in = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); @@ -132,6 +135,7 @@ namespace quda } else if (doBulk() && !ghost) { const int fwd3_idx = linkIndexP3(coord, arg.dim, d); const Link L = arg.L(d, coord.x_cb, parity); +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { const Vector in = arg.in[src_idx + s](fwd3_idx, their_spinor_parity); out[s] = mv_add(L, in, out[s]); @@ -148,6 +152,7 @@ namespace quda const int ghost_idx = arg.improved ? ghostFaceIndexStaggered<0>(coord, arg.dim, d, 3) : ghost_idx2; const Link U = arg.improved ? 
arg.U.Ghost(d, ghost_idx2, 1 - parity) : arg.U.Ghost(d, ghost_idx2, 1 - parity, StaggeredPhase(coord, d, -1, arg)); +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { Vector in = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); @@ -158,6 +163,7 @@ namespace quda const int gauge_idx = back_idx; const Link U = arg.improved ? arg.U(d, gauge_idx, 1 - parity) : arg.U(d, gauge_idx, 1 - parity, StaggeredPhase(coord, d, -1, arg)); +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { Vector in = arg.in[src_idx + s](back_idx, their_spinor_parity); out[s] = mv_add(conj(U), -in, out[s]); @@ -172,6 +178,7 @@ namespace quda // when updating replace arg.nFace with 1 here const int ghost_idx = ghostFaceIndexStaggered<0>(coord, arg.dim, d, 1); const Link L = arg.L.Ghost(d, ghost_idx, 1 - parity); +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { const Vector in = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); @@ -181,6 +188,7 @@ namespace quda const int back3_idx = linkIndexM3(coord, arg.dim, d); const int gauge_idx = back3_idx; const Link L = arg.L(d, gauge_idx, 1 - parity); +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { const Vector in = arg.in[src_idx + s](back3_idx, their_spinor_parity); out[s] = mv_add(conj(L), -in, out[s]); @@ -215,20 +223,24 @@ namespace quda array out; applyStaggered(out, arg, coord, parity, idx, thread_dim, active, src_idx); +#pragma unroll for (auto s = 0; s < n_src_tile; s++) out[s] *= arg.dagger_scale; if (xpay && mykernel_type == INTERIOR_KERNEL) { +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { Vector x = arg.x[src_idx + s](coord.x_cb, my_spinor_parity); out[s] = arg.a * x - out[s]; } } else if (mykernel_type != INTERIOR_KERNEL) { +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { Vector x = arg.out[src_idx + s](coord.x_cb, my_spinor_parity); out[s] = x + (xpay ? 
-out[s] : out[s]); } } if (mykernel_type != EXTERIOR_KERNEL_ALL || active) { +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { arg.out[src_idx + s](coord.x_cb, my_spinor_parity) = out[s]; } } } From 8ffcdd5f1607aaaeb01a3ad06e73cd294e398349 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 4 Dec 2024 09:38:48 -0800 Subject: [PATCH 26/29] Do not run BiCGStab(l) with staggered half precision since it is unstable --- tests/staggered_invert_test_gtest.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp index 7ef2c270b3..1b659eec59 100644 --- a/tests/staggered_invert_test_gtest.hpp +++ b/tests/staggered_invert_test_gtest.hpp @@ -53,6 +53,10 @@ bool skip_test(test_t param) // MR struggles with the staggered and asqtad spectrum, it's not MR's fault if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_MR_INVERTER) return true; + + // BiCGStab-L is unstable with staggered operator with low precision + // we could likely restore this when 20-bit quark fields are merged in + if (inverter_type == QUDA_BICGSTABL_INVERTER && prec_sloppy < QUDA_SINGLE_PRECISION) return true; } // CG3 is rather unstable with low precision From a860b1cfe86ed079cdaef3352f867d35b8dcc86f Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 4 Dec 2024 17:17:45 -0800 Subject: [PATCH 27/29] sentinal -> sentinel --- include/quda_define.h.in | 2 +- include/targets/cuda/reduce_helper.h | 6 +++--- lib/targets/cuda/target_cuda.cmake | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/quda_define.h.in b/include/quda_define.h.in index 86957d9aba..3e572b70bd 100644 --- a/include/quda_define.h.in +++ b/include/quda_define.h.in @@ -46,7 +46,7 @@ #ifdef QUDA_HETEROGENEOUS_ATOMIC_INF_INIT /** * @def HETEROGENEOUS_ATOMIC_INF_INIT - * @brief This macro sets whether we are using infinity for the signaling sentinal + * @brief This macro sets whether we are using infinity for the signaling sentinel */ #define HETEROGENEOUS_ATOMIC_INF_INIT #undef QUDA_HETEROGENEOUS_ATOMIC_INF_INIT diff --git a/include/targets/cuda/reduce_helper.h b/include/targets/cuda/reduce_helper.h index 58aca5b9be..a604b66900 100644 --- a/include/targets/cuda/reduce_helper.h +++ b/include/targets/cuda/reduce_helper.h @@ -17,7 +17,7 @@ using count_t = cuda::atomic; namespace quda { -// By default we use negative infinity as the sentinal for testing for +// By default we use negative infinity as the sentinel for testing for // reduction completion. On some compilers we may need to use finite // numbers, so the alternative approach uses negative zero (set with CMake // option QUDA_HETEROGENEOUS_ATOMIC_INF_INIT). 
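// Editor's sketch (illustration only, not part of the patch) of why the two
// variants below test for completion differently: with -inf as the sentinel a
// plain inequality suffices, but IEEE 754 defines -0.0 == 0.0, so the
// negative-zero fallback must consult the sign bit. The names here (init_inf,
// complete_zero, ...) are hypothetical stand-ins, not QUDA's.

#include <cmath>
#include <limits>

template <typename T> constexpr T init_inf() { return -std::numeric_limits<T>::infinity(); }
template <typename T> bool complete_inf(const T &r) { return r != init_inf<T>(); }

template <typename T> constexpr T init_zero() { return static_cast<T>(-0.0); }
template <typename T> bool complete_zero(const T &r)
{
  return !(r == static_cast<T>(0.0) && std::signbit(r)); // only the sentinel -0.0 is reported incomplete
}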
@@ -34,7 +34,7 @@ namespace quda template constexpr T terminate_value() { return cuda::std::numeric_limits::infinity(); } /** - @brief Test if the result is complete (e.g., is not equal to the sentinal) + @brief Test if the result is complete (e.g., is not equal to the sentinel) */ template bool is_complete(const T &result) { return result != init_value(); } #else @@ -50,7 +50,7 @@ namespace quda template constexpr T terminate_value() { return static_cast(0.0); } /** - @brief Test if the result is complete (e.g., is not equal to the sentinal) + @brief Test if the result is complete (e.g., is not equal to the sentinel) */ template bool is_complete(const T &result) { return !(result == static_cast(0.0) && std::signbit(result)); } #endif diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake index 2670f0a147..721edc3d3b 100644 --- a/lib/targets/cuda/target_cuda.cmake +++ b/lib/targets/cuda/target_cuda.cmake @@ -103,7 +103,7 @@ if(CMAKE_CUDA_COMPILER_ID MATCHES "NVIDIA" OR CMAKE_CUDA_COMPILER_ID MATCHES "NV endif() cmake_dependent_option(QUDA_HETEROGENEOUS_ATOMIC "enable heterogeneous atomic support ?" ON "QUDA_HETEROGENEOUS_ATOMIC_SUPPORT" OFF) -cmake_dependent_option(QUDA_HETEROGENEOUS_ATOMIC_INF_INIT "use infinity as the sentinal for heterogeneous atomic support?" ON +cmake_dependent_option(QUDA_HETEROGENEOUS_ATOMIC_INF_INIT "use infinity as the sentinel for heterogeneous atomic support?" ON "QUDA_HETEROGENEOUS_ATOMIC_SUPPORT" OFF) mark_as_advanced(QUDA_HETEROGENEOUS_ATOMIC) From c3fbf502e6f9fbcbd52d86da68e8497ada169839 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 4 Dec 2024 18:01:24 -0800 Subject: [PATCH 28/29] Apply review comments --- CMakeLists.txt | 3 ++- include/kernels/dslash_pack.cuh | 16 ++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e9fe94cab2..512a7b9baa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,7 +216,8 @@ if(QUDA_MAX_MULTI_BLAS_N GREATER 32) message(SEND_ERROR "Maximum QUDA_MAX_MULTI_BLAS_N is 32.") endif() -set(QUDA_MAX_MULTI_RHS_TILE "1" CACHE STRING "maximum tile size for MRHS kernels") +# For now we only support register tiles for the staggered dslash operators +set(QUDA_MAX_MULTI_RHS_TILE "1" CACHE STRING "maximum tile size for MRHS kernels (staggered only)") if(QUDA_MAX_MULTI_RHS_TILE GREATER QUDA_MAX_MULTI_RHS) message(SEND_ERROR "QUDA_MAX_MULTI_RHS_TILE is greater than QUDA_MAX_MULTI_RHS") endif() diff --git a/include/kernels/dslash_pack.cuh b/include/kernels/dslash_pack.cuh index ee34616bbd..a6a5b3c1d5 100644 --- a/include/kernels/dslash_pack.cuh +++ b/include/kernels/dslash_pack.cuh @@ -165,6 +165,7 @@ namespace quda int idx = indexFromFaceIndex(ghost_idx, parity, arg); constexpr int proj_dir = dagger ? +1 : -1; +#pragma unroll for (auto src = src_idx; src < src_idx + n_src_tile; src++) { Vector f = arg.in[src](idx + s * arg.dc.volume_4d_cb, spinor_parity); if (twist == 1) { @@ -187,6 +188,7 @@ namespace quda int idx = indexFromFaceIndex(ghost_idx, parity, arg); constexpr int proj_dir = dagger ?
-1 : +1; +#pragma unroll for (auto src = src_idx; src < src_idx + n_src_tile; src++) { Vector f = arg.in[src](idx + s * arg.dc.volume_4d_cb, spinor_parity); if (twist == 1) { @@ -226,15 +228,17 @@ namespace quda if (face_num == 0) { // backwards int idx = indexFromFaceIndexStaggered<4, QUDA_4D_PC, dim, nFace, 0>(ghost_idx, parity, arg); - for (auto s = 0; s < n_src_tile; s++) { - Vector f = arg.in[src_idx + s](idx, spinor_parity); - arg.halo_pack.Ghost(dim, 0, ghost_idx + (src_idx + s) * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx, spinor_parity); + arg.halo_pack.Ghost(dim, 0, ghost_idx + src * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; } } else { // forwards int idx = indexFromFaceIndexStaggered<4, QUDA_4D_PC, dim, nFace, 1>(ghost_idx, parity, arg); - for (auto s = 0; s < n_src_tile; s++) { - Vector f = arg.in[src_idx + s](idx, spinor_parity); - arg.halo_pack.Ghost(dim, 1, ghost_idx + (src_idx + s) * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx, spinor_parity); + arg.halo_pack.Ghost(dim, 1, ghost_idx + src * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; } } } From b6f5edbb73e08661bdc67a666af784fba8b66cb4 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 4 Dec 2024 18:02:46 -0800 Subject: [PATCH 29/29] Apply clang format --- include/field_cache.h | 1 - include/targets/cuda/reduce_helper.h | 5 ++++- lib/field_cache.cpp | 2 +- lib/gauge_norm.in.cu | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/field_cache.h b/include/field_cache.h index f288844300..4def6faa3a 100644 --- a/include/field_cache.h +++ b/include/field_cache.h @@ -154,5 +154,4 @@ namespace quda { for (auto i = 0u; i < a.size(); i++) tmp.push_back(std::move(getFieldTmp(a[i]))); return tmp; } - } diff --git a/include/targets/cuda/reduce_helper.h b/include/targets/cuda/reduce_helper.h index a604b66900..aceb88c83d 100644 --- a/include/targets/cuda/reduce_helper.h +++ b/include/targets/cuda/reduce_helper.h @@ -52,7 +52,10 @@ namespace quda /** @brief Test if the result is complete (e.g., is not equal to the sentinel) */ - template bool is_complete(const T &result) { return !(result == static_cast(0.0) && std::signbit(result)); } + template bool is_complete(const T &result) + { + return !(result == static_cast(0.0) && std::signbit(result)); + } #endif // declaration of reduce function diff --git a/lib/field_cache.cpp b/lib/field_cache.cpp index b98f24c065..7283bf39d0 100644 --- a/lib/field_cache.cpp +++ b/lib/field_cache.cpp @@ -44,7 +44,7 @@ namespace quda { if (it != cache.end() && it->second.size()) { // found an entry tmp = std::move(it->second.top()); it->second.pop(); // pop the defunct object - } else { // no entry found, we must allocate a new field + } else { // no entry found, we must allocate a new field param.create = QUDA_ZERO_FIELD_CREATE; tmp = T(param); } diff --git a/lib/gauge_norm.in.cu b/lib/gauge_norm.in.cu index ddf59287f4..2cc755bc81 100644 --- a/lib/gauge_norm.in.cu +++ b/lib/gauge_norm.in.cu @@ -170,7 +170,7 @@ namespace quda { std::unique_ptr clone_a = !host_clone ? nullptr : std::make_unique(param); const GaugeField &a_ = !host_clone ? 
a : *clone_a.get(); - if (rank != comm_rank()) return; // rank returns after potential copy to host to prevent tuning hang + if (rank != comm_rank()) return; // rank returns after potential copy to host to prevent tuning hang switch (a.Precision()) { case QUDA_DOUBLE_PRECISION: genericPrintMatrix(a_, d, parity, x_cb); break;
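A note for readers following the series: the pattern running through patches 25 and 28 is register tiling over right-hand sides. Each thread handles a compile-time tile of n_src_tile sources, so one gauge-link load is amortized over the whole tile, and because the trip count is a template constant the #pragma unroll loops allow the per-source accumulators to stay in registers. Below is a self-contained sketch of that idea under stated assumptions: Mat3, Vec3, mv_add and apply_tile are illustrative stand-ins, not QUDA's Link/Vector types or its actual kernels.

#include <array>
#include <complex>

using cplx = std::complex<float>;
struct Vec3 { std::array<cplx, 3> c {}; }; // stand-in for a color vector
struct Mat3 { std::array<cplx, 9> m {}; }; // stand-in for a gauge link

// y += U * x, the analogue of the mv_add calls in the kernels above
inline Vec3 mv_add(const Mat3 &U, const Vec3 &x, Vec3 y)
{
  for (int i = 0; i < 3; i++)
    for (int j = 0; j < 3; j++) y.c[i] += U.m[3 * i + j] * x.c[j];
  return y;
}

// one site, one direction, a compile-time tile of right-hand sides: the link
// is loaded once, and the fixed trip count keeps out[] in registers
template <int n_src_tile> void apply_tile(const Mat3 &U, const Vec3 *in, Vec3 *out)
{
#pragma unroll // honored by nvcc/clang; a fixed, known trip count
  for (int s = 0; s < n_src_tile; s++) out[s] = mv_add(U, in[s], out[s]);
}

The tile size is bounded at configure time: as the CMake hunk in patch 28 shows, the build errors out if QUDA_MAX_MULTI_RHS_TILE exceeds QUDA_MAX_MULTI_RHS, making the trade-off between register pressure and data reuse an explicit build knob.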