From 8292b1770fe6df5675b3cf183ca339c5371e946e Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 11 Oct 2024 02:23:18 -0700 Subject: [PATCH 01/29] Initial support for register tiling for staggered dslash kernel --- CMakeLists.txt | 3 + include/dslash.h | 12 ++- include/dslash_helper.cuh | 8 +- include/kernels/dslash_staggered.cuh | 111 ++++++++++++++++++--------- include/quda_define.h.in | 6 ++ 5 files changed, 99 insertions(+), 41 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 803f5dba41..b0cc69dacd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,6 +216,8 @@ if(QUDA_MAX_MULTI_BLAS_N GREATER 32) message(SEND_ERROR "Maximum QUDA_MAX_MULTI_BLAS_N is 32.") endif() +set(QUDA_MAX_MULTI_RHS_TILE "1" CACHE STRING "maximum tile size for MRHS kernels") + set(QUDA_PRECISION "14" CACHE STRING "which precisions to instantiate in QUDA (4-bit number - double, single, half, quarter)") @@ -275,6 +277,7 @@ mark_as_advanced(QUDA_ALTERNATIVE_I_TO_F) mark_as_advanced(QUDA_MAX_MULTI_BLAS_N) mark_as_advanced(QUDA_MAX_MULTI_RHS) +mark_as_advanced(QUDA_MAX_MULTI_RHS_TILE) mark_as_advanced(QUDA_PRECISION) mark_as_advanced(QUDA_RECONSTRUCT) mark_as_advanced(QUDA_CLOVER_CHOLESKY_PROMOTE) diff --git a/include/dslash.h b/include/dslash.h index 4b1aada52e..80638b494e 100644 --- a/include/dslash.h +++ b/include/dslash.h @@ -67,6 +67,10 @@ namespace quda if (arg.xpay) strcat(aux_base, ",xpay"); if (arg.dagger) strcat(aux_base, ",dagger"); setRHSstring(aux_base, in.size()); + strcat(aux_base, ",n_rhs_tile="); + char tile_str[16]; + i32toa(tile_str, Arg::n_src_tile); + strcat(aux_base, tile_str); } /** @@ -329,7 +333,13 @@ namespace quda Dslash(Arg &arg, cvector_ref &out, cvector_ref &in, const ColorSpinorField &halo, const std::string &app_base = "") : - TunableKernel3D(in[0], halo.X(4), arg.nParity), arg(arg), out(out), in(in), halo(halo), nDimComms(4), dslashParam(arg) + TunableKernel3D(in[0], (halo.X(4) + Arg::n_src_tile - 1) / Arg::n_src_tile, arg.nParity), + arg(arg), + out(out), + in(in), + halo(halo), + nDimComms(4), + dslashParam(arg) { if (checkLocation(out, in) == QUDA_CPU_FIELD_LOCATION) errorQuda("CPU Fields not supported in Dslash framework yet"); diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh index df176549fc..c39fd41f3f 100644 --- a/include/dslash_helper.cuh +++ b/include/dslash_helper.cuh @@ -241,11 +241,12 @@ namespace quda return true; } - template struct DslashArg { + template struct DslashArg { using Float = Float_; using real = typename mapper::type; static constexpr int nDim = nDim_; + static constexpr int n_src_tile = n_src_tile_; // how many RHS per thread const int parity; // only use this for single parity fields const int nParity; // number of parities we're working on @@ -650,8 +651,9 @@ namespace quda Arg arg; dslash_functor_arg(const Arg &arg, unsigned int threads_x) : - kernel_param(dim3(threads_x, arg.dc.Ls, arg.nParity)), - arg(arg) { } + kernel_param(dim3(threads_x, (arg.dc.Ls + Arg::n_src_tile - 1) / Arg::n_src_tile, arg.nParity)), arg(arg) + { + } }; /** diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh index 7eafa4a687..cfeaa553e7 100644 --- a/include/kernels/dslash_staggered.cuh +++ b/include/kernels/dslash_staggered.cuh @@ -14,9 +14,9 @@ namespace quda /** @brief Parameter structure for driving the Staggered Dslash operator */ - template - struct StaggeredArg : DslashArg { + template + struct StaggeredArg : DslashArg { typedef typename mapper::type real; static constexpr int nColor = nColor_; static 
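// staggered fermions carry a single spin degree of freedom per site, hence the unit nSpin that follows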
constexpr int nSpin = 1; @@ -57,8 +57,9 @@ namespace quda StaggeredArg(cvector_ref &out, cvector_ref &in, const ColorSpinorField &halo, const GaugeField &U, const GaugeField &L, double a, cvector_ref &x, int parity, bool dagger, const int *comm_override) : - DslashArg(out, in, halo, U, x, parity, dagger, a == 0.0 ? false : true, improved_ ? 3 : 1, - spin_project, comm_override), + DslashArg(out, in, halo, U, x, parity, dagger, a == 0.0 ? false : true, + improved_ ? 3 : 1, spin_project, comm_override), + n_src(out.size()), halo_pack(halo, improved_ ? 3 : 1), halo(halo, improved_ ? 3 : 1), U(U), @@ -87,9 +88,9 @@ namespace quda @param[in] parity The site parity @param[in] x_cb The checkerboarded site index */ - template - __device__ __host__ inline void applyStaggered(Vector &out, const Arg &arg, Coord &coord, int parity, int, - int thread_dim, bool &active, int src_idx) + template + __device__ __host__ inline void applyStaggered(array &out, const Arg &arg, Coord &coord, + int parity, int, int thread_dim, bool &active, int src_idx) { typedef typename mapper::type real; typedef Matrix, Arg::nColor> Link; @@ -104,13 +105,18 @@ namespace quda if (doHalo(d) && ghost) { const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dim, d, 1); const Link U = arg.improved ? arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)); - Vector in = arg.halo.Ghost(d, 1, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(U, in, out); + for (auto s = 0; s < n_src_tile; s++) { + Vector in + = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(U, in, out[s]); + } } else if (doBulk() && !ghost) { const int fwd_idx = linkIndexP1(coord, arg.dim, d); const Link U = arg.improved ? arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)); - Vector in = arg.in[src_idx](fwd_idx, their_spinor_parity); - out = mv_add(U, in, out); + for (auto s = 0; s < n_src_tile; s++) { + Vector in = arg.in[src_idx + s](fwd_idx, their_spinor_parity); + out[s] = mv_add(U, in, out[s]); + } } } @@ -120,14 +126,18 @@ namespace quda if (doHalo(d) && ghost) { const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dim, d, arg.nFace); const Link L = arg.L(d, coord.x_cb, parity); - const Vector in - = arg.halo.Ghost(d, 1, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(L, in, out); + for (auto s = 0; s < n_src_tile; s++) { + const Vector in + = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(L, in, out[s]); + } } else if (doBulk() && !ghost) { const int fwd3_idx = linkIndexP3(coord, arg.dim, d); const Link L = arg.L(d, coord.x_cb, parity); - const Vector in = arg.in[src_idx](fwd3_idx, their_spinor_parity); - out = mv_add(L, in, out); + for (auto s = 0; s < n_src_tile; s++) { + const Vector in = arg.in[src_idx + s](fwd3_idx, their_spinor_parity); + out[s] = mv_add(L, in, out[s]); + } } } @@ -140,15 +150,20 @@ namespace quda const int ghost_idx = arg.improved ? ghostFaceIndexStaggered<0>(coord, arg.dim, d, 3) : ghost_idx2; const Link U = arg.improved ? 
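// the improved (fat-link) gauge field has its staggered phases pre-applied, so only the naive path multiplies in StaggeredPhase when loading the link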
arg.U.Ghost(d, ghost_idx2, 1 - parity) : arg.U.Ghost(d, ghost_idx2, 1 - parity, StaggeredPhase(coord, d, -1, arg)); - Vector in = arg.halo.Ghost(d, 0, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(conj(U), -in, out); + for (auto s = 0; s < n_src_tile; s++) { + Vector in + = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(conj(U), -in, out[s]); + } } else if (doBulk() && !ghost) { const int back_idx = linkIndexM1(coord, arg.dim, d); const int gauge_idx = back_idx; const Link U = arg.improved ? arg.U(d, gauge_idx, 1 - parity) : arg.U(d, gauge_idx, 1 - parity, StaggeredPhase(coord, d, -1, arg)); - Vector in = arg.in[src_idx](back_idx, their_spinor_parity); - out = mv_add(conj(U), -in, out); + for (auto s = 0; s < n_src_tile; s++) { + Vector in = arg.in[src_idx + s](back_idx, their_spinor_parity); + out[s] = mv_add(conj(U), -in, out[s]); + } } } @@ -159,15 +174,19 @@ namespace quda // when updating replace arg.nFace with 1 here const int ghost_idx = ghostFaceIndexStaggered<0>(coord, arg.dim, d, 1); const Link L = arg.L.Ghost(d, ghost_idx, 1 - parity); - const Vector in - = arg.halo.Ghost(d, 0, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(conj(L), -in, out); + for (auto s = 0; s < n_src_tile; s++) { + const Vector in + = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(conj(L), -in, out[s]); + } } else if (doBulk() && !ghost) { const int back3_idx = linkIndexM3(coord, arg.dim, d); const int gauge_idx = back3_idx; const Link L = arg.L(d, gauge_idx, 1 - parity); - const Vector in = arg.in[src_idx](back3_idx, their_spinor_parity); - out = mv_add(conj(L), -in, out); + for (auto s = 0; s < n_src_tile; s++) { + const Vector in = arg.in[src_idx + s](back3_idx, their_spinor_parity); + out[s] = mv_add(conj(L), -in, out[s]); + } } } } // nDim @@ -181,8 +200,8 @@ namespace quda constexpr staggered(const Arg &arg) : arg(arg) {} static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation - template - __device__ __host__ __forceinline__ void operator()(int idx, int src_idx, int parity) + template + __device__ __host__ __forceinline__ void apply(int idx, int src_idx, int parity) { using real = typename mapper::type; using Vector = ColorSpinor; @@ -195,20 +214,38 @@ namespace quda const int my_spinor_parity = nParity == 2 ? parity : 0; - Vector out; + array out; + applyStaggered(out, arg, coord, parity, idx, thread_dim, active, src_idx); - applyStaggered(out, arg, coord, parity, idx, thread_dim, active, src_idx); - - out *= arg.dagger_scale; + for (auto s = 0; s < n_src_tile; s++) out[s] *= arg.dagger_scale; if (xpay && mykernel_type == INTERIOR_KERNEL) { - Vector x = arg.x[src_idx](coord.x_cb, my_spinor_parity); - out = arg.a * x - out; + array x; + for (auto s = 0; s < n_src_tile; s++) { + x[s] = arg.x[src_idx + s](coord.x_cb, my_spinor_parity); + out[s] = arg.a * x[s] - out[s]; + } } else if (mykernel_type != INTERIOR_KERNEL) { - Vector x = arg.out[src_idx](coord.x_cb, my_spinor_parity); - out = x + (xpay ? -out : out); + array x; + for (auto s = 0; s < n_src_tile; s++) { + x[s] = arg.x[src_idx + s](coord.x_cb, my_spinor_parity); + out[s] = x[s] + (xpay ? 
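+        // xpay sign convention: the accumulated hopping term enters negated, matching out = a*x - D*in in the interior branch above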
-out[s] : out[s]); + } + } + if (mykernel_type != EXTERIOR_KERNEL_ALL || active) { + for (auto s = 0; s < n_src_tile; s++) { arg.out[src_idx + s](coord.x_cb, my_spinor_parity) = out[s]; } + } + } + + template + __device__ __host__ __forceinline__ void operator()(int idx, int src_idx_block, int parity) + { + int src_idx = src_idx_block * Arg::n_src_tile; + if (src_idx + n_src_tile <= arg.n_src) { + apply(idx, src_idx, parity); + } else if constexpr (n_src_tile - 1 > 0) { + operator()(idx, src_idx_block, parity); } - if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](coord.x_cb, my_spinor_parity) = out; } }; diff --git a/include/quda_define.h.in b/include/quda_define.h.in index ea6657120f..90a6b24d1a 100644 --- a/include/quda_define.h.in +++ b/include/quda_define.h.in @@ -25,6 +25,12 @@ */ #define MAX_MULTI_RHS @QUDA_MAX_MULTI_RHS@ +/** + * @def MAX_MULTI_RHS_TILE + * @brief This macro sets the register tile size for MRHS kernels + */ +#define MAX_MULTI_RHS_TILE @QUDA_MAX_MULTI_RHS_TILE@ + #cmakedefine QUDA_HETEROGENEOUS_ATOMIC #ifdef QUDA_HETEROGENEOUS_ATOMIC /** From e816fb1cb2c6e1b9fa7228fdd0aecde51614a2a3 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 11 Oct 2024 02:47:12 -0700 Subject: [PATCH 02/29] Sanity check for QUDA_MAX_MULTI_RHS_TILE --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index b0cc69dacd..20cc44a36b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -217,6 +217,9 @@ if(QUDA_MAX_MULTI_BLAS_N GREATER 32) endif() set(QUDA_MAX_MULTI_RHS_TILE "1" CACHE STRING "maximum tile size for MRHS kernels") +if(QUDA_MAX_MULTI_RHS_TILE GREATER QUDA_MAX_MULTI_RHS) + message(SEND_ERROR "QUDA_MAX_MULTI_RHS_TILE is greater than QUDA_MAX_MULTI_RHS") +endif() set(QUDA_PRECISION "14" From eeea3b002d7e230e7b1b390c997a39798a2822d5 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 11 Oct 2024 02:47:41 -0700 Subject: [PATCH 03/29] Possible WAR for ROCm performance issue with MRHS staggered kernels --- include/kernels/dslash_staggered.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh index cfeaa553e7..07e8f333dd 100644 --- a/include/kernels/dslash_staggered.cuh +++ b/include/kernels/dslash_staggered.cuh @@ -238,8 +238,9 @@ namespace quda } template - __device__ __host__ __forceinline__ void operator()(int idx, int src_idx_block, int parity) + __device__ __host__ __forceinline__ void operator()(int idx, int src_idx_block_, int parity) { + int src_idx_block = MAX_MULTI_RHS == 1 ? 
0 : src_idx_block_; int src_idx = src_idx_block * Arg::n_src_tile; if (src_idx + n_src_tile <= arg.n_src) { apply(idx, src_idx, parity); From 467a493b11415170d84c93369e85717de6a1d0c4 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 11 Oct 2024 05:07:37 -0700 Subject: [PATCH 04/29] FieldTmp now supports creating temporaries using a T::param_type --- include/field_cache.h | 27 ++++++++++++++++++++++++++- lib/field_cache.cpp | 15 +++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/include/field_cache.h b/include/field_cache.h index 5e05f9acf6..f288844300 100644 --- a/include/field_cache.h +++ b/include/field_cache.h @@ -15,7 +15,7 @@ namespace quda { */ template struct FieldKey { - std::string volume; /** volume kstring */ + std::string volume; /** volume string */ std::string aux; /** auxiliary string */ FieldKey() = default; @@ -78,6 +78,18 @@ namespace quda { */ FieldTmp(const FieldKey &key, const typename T::param_type &param); + /** + @brief Create a field temporary that corresponds to the field + constructed from the param struct. If a matching field is + present in the cache, it will be popped from the cache. If no + such temporary exists, a temporary will be allocated. + @param[in] param Parameter structure used to allocate + the temporary + */ + FieldTmp(typename T::param_type param); + /** @brief Copy constructor is deleted to prevent accidental cache bloat @@ -111,6 +123,18 @@ namespace quda { */ template auto getFieldTmp(const T &a) { return FieldTmp(a); } + /** + @brief Get a field temporary that matches the given parameter + struct. If a matching field is present in the cache, + it will be popped from the cache. If no such temporary exists, a + temporary will be allocated. When the destructor for the + FieldTmp is called, e.g., when the returned object goes out of scope, + the temporary will be pushed onto the cache. + + @param[in] param Parameter struct describing the temporary we require + */ + template auto getFieldTmp(const typename T::param_type &param) { return FieldTmp(param); } + /** @brief Get a vector of field temporaries that are identical to the vector instance argument. 
If enough matching fields are present in the cache, they will be popped from the cache. If not enough such temporaries exist, temporaries will be allocated. When the destructor for the FieldTmp is called, e.g., when the returned object goes out of scope, the temporary will be pushed onto the cache. @param[in] a Vector of fields we wish to create matching temporaries for */ template auto getFieldTmp(cvector_ref &a) { std::vector> tmp; tmp.reserve(a.size()); for (auto i = 0u; i < a.size(); i++) tmp.push_back(std::move(getFieldTmp(a[i]))); return tmp; } + } diff --git a/lib/field_cache.cpp b/lib/field_cache.cpp index 5ea09e3059..b98f24c065 100644 --- a/lib/field_cache.cpp +++ b/lib/field_cache.cpp @@ -35,6 +35,21 @@ namespace quda { } } + template FieldTmp::FieldTmp(typename T::param_type param) + { + param.create = QUDA_REFERENCE_FIELD_CREATE; + key = FieldKey(T(param)); + + auto it = cache.find(key); + if (it != cache.end() && it->second.size()) { // found an entry + tmp = std::move(it->second.top()); + it->second.pop(); // pop the defunct object + } else { // no entry found, we must allocate a new field + param.create = QUDA_ZERO_FIELD_CREATE; + tmp = T(param); + } + } + template FieldTmp::~FieldTmp() { // don't cache the field if it's empty (e.g., has been moved) From 36fcf3dac220dfc2554539efd53fd4578b1dfc8e Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 11 Oct 2024 05:08:22 -0700 Subject: [PATCH 05/29] DslashCoarse now uses getFieldTmp for its MMA temporaries to avoid allocation overheads when memory pool is disabled --- lib/dslash_coarse.in.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/dslash_coarse.in.cpp b/lib/dslash_coarse.in.cpp index 747f2e9958..52298ceb14 100644 --- a/lib/dslash_coarse.in.cpp +++ b/lib/dslash_coarse.in.cpp @@ -88,7 +88,7 @@ namespace quda param.nVec = nVec; param.create = QUDA_NULL_FIELD_CREATE; param.fieldOrder = order; - return ColorSpinorField(param); + return getFieldTmp(param); } // Apply the coarse Dirac matrix to a coarse grid vector @@ -107,9 +107,9 @@ namespace quda IntList<@QUDA_MULTIGRID_NVEC_LIST@>()); } else { constexpr QudaFieldOrder csOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - ColorSpinorField v_inA = create_color_spinor_copy(inA, csOrder); - ColorSpinorField v_inB = create_color_spinor_copy(inB, csOrder); - ColorSpinorField v_out = create_color_spinor_copy(out, csOrder); + auto v_inA = create_color_spinor_copy(inA, csOrder); + auto v_inB = create_color_spinor_copy(inB, csOrder); + auto v_out = create_color_spinor_copy(out, csOrder); if (dslash) { BlockTransposeForward(v_inA, inA); } if (clover) { BlockTransposeForward(v_inB, inB); } From db865e495ec19762ca8d3cb14a011e9c2fb49457 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 14 Oct 2024 11:34:05 -0700 Subject: [PATCH 06/29] Small cleanup of staggered dslash --- include/kernels/dslash_staggered.cuh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh index 07e8f333dd..f1c5bf9210 100644 --- a/include/kernels/dslash_staggered.cuh +++ b/include/kernels/dslash_staggered.cuh @@ -220,16 +220,14 @@ namespace quda for (auto s = 0; s < n_src_tile; s++) out[s] *= arg.dagger_scale; if (xpay && mykernel_type == INTERIOR_KERNEL) { - array x; for (auto s = 0; s < n_src_tile; s++) { - x[s] = arg.x[src_idx + s](coord.x_cb, my_spinor_parity); - out[s] = arg.a * x[s] - out[s]; + Vector x = arg.x[src_idx + s](coord.x_cb, my_spinor_parity); + out[s] = arg.a * x - out[s]; } } else if (mykernel_type != INTERIOR_KERNEL) { - array x; for (auto s = 0; s < n_src_tile; s++) { - x[s] = arg.x[src_idx + s](coord.x_cb, my_spinor_parity); - out[s] = x[s] + (xpay ? 
-out[s] : out[s]); } } if (mykernel_type != EXTERIOR_KERNEL_ALL || active) { From 8f72560aaadee965af94e63738e6b1b8cc44cfb4 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 15 Oct 2024 11:31:23 -0700 Subject: [PATCH 07/29] Add default initialization to array::data --- include/array.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/array.h b/include/array.h index 3005087c85..e603f066c5 100644 --- a/include/array.h +++ b/include/array.h @@ -12,7 +12,7 @@ namespace quda template struct array { using value_type = T; static constexpr int N = n; - T data[n]; + T data[n] = {}; constexpr T &operator[](int i) { return data[i]; } constexpr const T &operator[](int i) const { return data[i]; } From b2cebb08ae2e9712ebaffa3a3536e455ff4f4940 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 15 Oct 2024 11:58:30 -0700 Subject: [PATCH 08/29] Fix multi-gpu bug --- include/kernels/dslash_staggered.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh index f1c5bf9210..ef01f43907 100644 --- a/include/kernels/dslash_staggered.cuh +++ b/include/kernels/dslash_staggered.cuh @@ -226,7 +226,7 @@ namespace quda } } else if (mykernel_type != INTERIOR_KERNEL) { for (auto s = 0; s < n_src_tile; s++) { - Vector x = arg.x[src_idx + s](coord.x_cb, my_spinor_parity); + Vector x = arg.out[src_idx + s](coord.x_cb, my_spinor_parity); out[s] = x + (xpay ? -out[s] : out[s]); } } From 704f6b2c5ef854ab6a3fcdc000db57f42b170a45 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Mon, 14 Oct 2024 16:12:40 -0700 Subject: [PATCH 09/29] Fixed a logic bug in the MR convergence check --- lib/inv_mr_quda.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/inv_mr_quda.cpp b/lib/inv_mr_quda.cpp index 1696cd4d42..0e86f80a17 100644 --- a/lib/inv_mr_quda.cpp +++ b/lib/inv_mr_quda.cpp @@ -148,13 +148,13 @@ namespace quda mat(r, x); r2 = blas::xmyNorm(b, r); for (auto i = 0u; i < b2.size(); i++) param.true_res[i] = sqrt(r2[i] / b2[i]); - converged = (step < param.Nsteps && r2 > stop) ? false : true; + converged = (step < param.Nsteps && r2 < stop) ? true : false; if (!converged) blas::copy(r_sloppy, r); PrintStats("MR (restart)", iter, r2, b2); } else { blas::ax(scale, r_sloppy); r2 = blas::norm2(r_sloppy); - converged = (step < param.Nsteps && r2 > stop) ? false : true; + converged = (step < param.Nsteps && r2 < stop) ? true : false; if (!converged) blas::copy(r, r_sloppy); } step++; From 57c631c9b8648b3b74e6054a5260fb3e4fce8062 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 15 Oct 2024 14:58:18 -0700 Subject: [PATCH 10/29] Revert "Add default initialization to array::data" This reverts commit 8f72560aaadee965af94e63738e6b1b8cc44cfb4. 
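For reference, a minimal sketch (plain C++, mirroring quda::array in include/array.h) of the change being undone; only the member initializer differs:

    template <typename T, int n> struct array {
      // reverted form: T data[n] = {};  // value-initializes every element on construction
      T data[n]; // restored form: trivial element types remain unwritten until first assigned
    };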
--- include/array.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/array.h b/include/array.h index e603f066c5..3005087c85 100644 --- a/include/array.h +++ b/include/array.h @@ -12,7 +12,7 @@ namespace quda template struct array { using value_type = T; static constexpr int N = n; - T data[n] = {}; + T data[n]; constexpr T &operator[](int i) { return data[i]; } constexpr const T &operator[](int i) const { return data[i]; } From 9ddd25b9f9d67067522cde09b38518356517e86e Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 16 Oct 2024 11:16:48 -0700 Subject: [PATCH 11/29] n_src should be a member of DslashArg --- include/dslash_helper.cuh | 2 ++ include/kernels/dslash_staggered.cuh | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh index c39fd41f3f..8a9ba2391c 100644 --- a/include/dslash_helper.cuh +++ b/include/dslash_helper.cuh @@ -270,6 +270,7 @@ namespace quda int threadDimMapLower[4]; int threadDimMapUpper[4]; + int_fastdiv n_src; int_fastdiv Ls; // these are set with symmetric preconditioned twisted-mass dagger @@ -328,6 +329,7 @@ namespace quda exterior_threads(0), threadDimMapLower {}, threadDimMapUpper {}, + n_src(in.size()), Ls(halo.X(4) / in.size()), twist_a(0.0), twist_b(0.0), diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh index ef01f43907..3dfaf01be0 100644 --- a/include/kernels/dslash_staggered.cuh +++ b/include/kernels/dslash_staggered.cuh @@ -37,7 +37,6 @@ namespace quda using GL = typename gauge_mapper::type; - const int_fastdiv n_src; F out[MAX_MULTI_RHS]; /** output vector field */ F in[MAX_MULTI_RHS]; /** input vector field */ const Ghost halo_pack; /** accessor for writing the halo */ @@ -59,7 +58,6 @@ namespace quda cvector_ref &x, int parity, bool dagger, const int *comm_override) : DslashArg(out, in, halo, U, x, parity, dagger, a == 0.0 ? false : true, improved_ ? 3 : 1, spin_project, comm_override), - n_src(out.size()), halo_pack(halo, improved_ ? 3 : 1), halo(halo, improved_ ? 
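// the improved operator includes three-hop (Naik) terms, so its halo must be three sites deep rather than one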
3 : 1), U(U), From 01725e6e84e26af54767496b080a0230645cdac6 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 16 Oct 2024 17:15:19 -0700 Subject: [PATCH 12/29] Fix fused pack + dslash kernels with n_src_tile > 1 --- include/kernels/dslash_pack.cuh | 201 ++++++++++++++++++------------ include/kernels/dslash_wilson.cuh | 1 - lib/dslash_pack2.cu | 4 + 3 files changed, 127 insertions(+), 79 deletions(-) diff --git a/include/kernels/dslash_pack.cuh b/include/kernels/dslash_pack.cuh index 94e0465238..ee34616bbd 100644 --- a/include/kernels/dslash_pack.cuh +++ b/include/kernels/dslash_pack.cuh @@ -11,8 +11,10 @@ namespace quda { int *getPackComms(); - template + constexpr int pack_tile_size = 1; + + template struct PackArg : kernel_param<> { typedef Float_ Float; @@ -24,6 +26,7 @@ namespace quda static constexpr bool dagger = dagger_; static constexpr int twist = twist_; // whether we are doing preconditioned twisted-mass or not (1 - singlet, 2 - doublet) static constexpr QudaPCType pc_type = pc_type_; // preconditioning type (4-d or 5-d) + static constexpr int n_src_tile = n_src_tile_; static constexpr bool spinor_direct_load = false; // false means texture load @@ -54,6 +57,7 @@ namespace quda int sites_per_block; + int_fastdiv n_src; int_fastdiv Ls; char *packBuffer[4 * QUDA_MAX_DIM]; @@ -96,6 +100,7 @@ namespace quda threadDimMapLower {}, threadDimMapUpper {}, sites_per_block((work_items + grid - 1) / grid), + n_src(in.size()), Ls(halo.X(4) / in.size()) #ifdef NVSHMEM_COMMS , @@ -131,7 +136,7 @@ namespace quda } }; - template + template __device__ __host__ inline void pack(const Arg &arg, int ghost_idx, int s, int parity, int src_idx) { typedef typename mapper::type real; @@ -160,46 +165,50 @@ namespace quda int idx = indexFromFaceIndex(ghost_idx, parity, arg); constexpr int proj_dir = dagger ? +1 : -1; - Vector f = arg.in[src_idx](idx + s * arg.dc.volume_4d_cb, spinor_parity); - if (twist == 1) { - f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); - } else if (twist == 2) { - Vector f1 = arg.in[src_idx](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor - if (s == 0) - f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); - else - f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); - } - if (arg.spin_project) { - arg.halo_pack.Ghost(dim, 0, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) - = f.project(dim, proj_dir); - } else { - arg.halo_pack.Ghost(dim, 0, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx + s * arg.dc.volume_4d_cb, spinor_parity); + if (twist == 1) { + f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); + } else if (twist == 2) { + Vector f1 = arg.in[src](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor + if (s == 0) + f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); + else + f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); + } + if (arg.spin_project) { + arg.halo_pack.Ghost(dim, 0, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) + = f.project(dim, proj_dir); + } else { + arg.halo_pack.Ghost(dim, 0, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } } else { // forwards int idx = indexFromFaceIndex(ghost_idx, parity, arg); constexpr int proj_dir = dagger ? 
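// forward-face projector direction: the mirror of the backward case above, flipping under dagger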
-1 : +1; - Vector f = arg.in[src_idx](idx + s * arg.dc.volume_4d_cb, spinor_parity); - if (twist == 1) { - f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); - } else if (twist == 2) { - Vector f1 = arg.in[src_idx](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor - if (s == 0) - f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); - else - f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); - } - if (arg.spin_project) { - arg.halo_pack.Ghost(dim, 1, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) - = f.project(dim, proj_dir); - } else { - arg.halo_pack.Ghost(dim, 1, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx + s * arg.dc.volume_4d_cb, spinor_parity); + if (twist == 1) { + f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); + } else if (twist == 2) { + Vector f1 = arg.in[src](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor + if (s == 0) + f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); + else + f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); + } + if (arg.spin_project) { + arg.halo_pack.Ghost(dim, 1, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) + = f.project(dim, proj_dir); + } else { + arg.halo_pack.Ghost(dim, 1, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } } } - template + template __device__ __host__ inline void packStaggered(const Arg &arg, int ghost_idx, int parity, int src_idx) { typedef typename mapper::type real; @@ -217,12 +226,16 @@ namespace quda if (face_num == 0) { // backwards int idx = indexFromFaceIndexStaggered<4, QUDA_4D_PC, dim, nFace, 0>(ghost_idx, parity, arg); - Vector f = arg.in[src_idx](idx, spinor_parity); - arg.halo_pack.Ghost(dim, 0, ghost_idx + src_idx * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + for (auto s = 0; s < n_src_tile; s++) { + Vector f = arg.in[src_idx + s](idx, spinor_parity); + arg.halo_pack.Ghost(dim, 0, ghost_idx + (src_idx + s) * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } else { // forwards int idx = indexFromFaceIndexStaggered<4, QUDA_4D_PC, dim, nFace, 1>(ghost_idx, parity, arg); - Vector f = arg.in[src_idx](idx, spinor_parity); - arg.halo_pack.Ghost(dim, 1, ghost_idx + src_idx * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + for (auto s = 0; s < n_src_tile; s++) { + Vector f = arg.in[src_idx + s](idx, spinor_parity); + arg.halo_pack.Ghost(dim, 1, ghost_idx + (src_idx + s) * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } } @@ -250,24 +263,36 @@ namespace quda if (Arg::pc_type == QUDA_5D_PC) { // 5-d checkerboarded, include s (not ghostFaceCB since both faces) switch (dim) { case 0: - pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, + parity, src_idx); break; case 1: - pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, + parity, src_idx); break; case 2: - pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, + parity, src_idx); break; case 3: - pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, + parity, src_idx); break; } } else { // 4-d checkerboarding, keeping s separate (if 
it exists) switch (dim) { - case 0: pack(arg, ghost_idx, s, parity, src_idx); break; - case 1: pack(arg, ghost_idx, s, parity, src_idx); break; - case 2: pack(arg, ghost_idx, s, parity, src_idx); break; - case 3: pack(arg, ghost_idx, s, parity, src_idx); break; + case 0: + pack(arg, ghost_idx, s, parity, src_idx); + break; + case 1: + pack(arg, ghost_idx, s, parity, src_idx); + break; + case 2: + pack(arg, ghost_idx, s, parity, src_idx); + break; + case 3: + pack(arg, ghost_idx, s, parity, src_idx); + break; } } @@ -288,7 +313,8 @@ namespace quda // 64 - use uber kernel (merge exterior) template struct packShmem { - template __device__ __forceinline__ void operator()(const Arg &arg, int src_s, int parity) + template + __device__ __forceinline__ void operator()(const Arg &arg, int src_s, int parity) { // (active_dims * 2 + dir) * blocks_per_dir + local_block_idx int local_block_idx = target::block_idx().x % arg.blocks_per_dir; @@ -315,9 +341,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[0]) { int ghost_idx = dir * arg.dc.ghostFaceCB[0] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -325,9 +351,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[1]) { int ghost_idx = dir * arg.dc.ghostFaceCB[1] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -335,9 +361,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[2]) { int ghost_idx = dir * arg.dc.ghostFaceCB[2] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -345,9 +371,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[3]) { int ghost_idx = dir * arg.dc.ghostFaceCB[3] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -358,12 +384,19 @@ namespace quda #endif } - __device__ __forceinline__ void operator()(const Arg &arg, int s, int parity, int twist_pack) + template + __device__ __forceinline__ void operator()(const Arg &arg, int src_s_block_, int parity, int twist_pack) { - switch (twist_pack) { - case 0: this->operator()<0>(arg, s, parity); break; - case 1: this->operator()<1>(arg, s, parity); break; - case 2: this->operator()<2>(arg, s, parity); break; + int src_s_block = MAX_MULTI_RHS == 1 ? 
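+      // pin the block index to a literal zero when only one RHS is compiled in, letting the
+      // compiler fold the tile indexing away (same possible ROCm WAR as in PATCH 03/29)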
0 : src_s_block_; + int src_s_idx = src_s_block * Arg::n_src_tile; + if (src_s_idx + n_src_tile * arg.Ls <= arg.n_src * arg.Ls) { + switch (twist_pack) { + case 0: this->operator()<0, n_src_tile>(arg, src_s_idx, parity); break; + case 1: this->operator()<1, n_src_tile>(arg, src_s_idx, parity); break; + case 2: this->operator()<2, n_src_tile>(arg, src_s_idx, parity); break; + } + } else if constexpr (n_src_tile - 1 > 0) { + operator()(arg, src_s_block, parity); } } }; @@ -377,7 +410,7 @@ namespace quda { if (arg.nParity == 1) parity = arg.parity; packShmem pack; - pack.operator()(arg, s, parity); + pack.operator()(arg, s, parity, Arg::twist); } }; @@ -400,17 +433,17 @@ namespace quda if (arg.nFace == 1) { switch (dim) { - case 0: packStaggered<0, 1>(arg, ghost_idx, parity, src_idx); break; - case 1: packStaggered<1, 1>(arg, ghost_idx, parity, src_idx); break; - case 2: packStaggered<2, 1>(arg, ghost_idx, parity, src_idx); break; - case 3: packStaggered<3, 1>(arg, ghost_idx, parity, src_idx); break; + case 0: packStaggered<0, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 1: packStaggered<1, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 2: packStaggered<2, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 3: packStaggered<3, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; } } else if (arg.nFace == 3) { switch (dim) { - case 0: packStaggered<0, 3>(arg, ghost_idx, parity, src_idx); break; - case 1: packStaggered<1, 3>(arg, ghost_idx, parity, src_idx); break; - case 2: packStaggered<2, 3>(arg, ghost_idx, parity, src_idx); break; - case 3: packStaggered<3, 3>(arg, ghost_idx, parity, src_idx); break; + case 0: packStaggered<0, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 1: packStaggered<1, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 2: packStaggered<2, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 3: packStaggered<3, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; } } @@ -422,7 +455,7 @@ namespace quda template struct packStaggeredShmem { - __device__ __forceinline__ void operator()(const Arg &arg, int src_idx, int parity, int = 0) + template __device__ __forceinline__ void apply(const Arg &arg, int src_idx, int parity) { // (active_dims * 2 + dir) * blocks_per_dir + local_block_idx int local_block_idx = target::block_idx().x % arg.blocks_per_dir; @@ -446,9 +479,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[0]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[0] + local_tid; if (arg.nFace == 1) - packStaggered<0, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<0, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<0, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<0, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -456,9 +489,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[1]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[1] + local_tid; if (arg.nFace == 1) - packStaggered<1, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<1, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<1, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<1, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -466,9 +499,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[2]) { int ghost_idx = dir * 
arg.nFace * arg.dc.ghostFaceCB[2] + local_tid; if (arg.nFace == 1) - packStaggered<2, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<2, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<2, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<2, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -476,9 +509,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[3]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[3] + local_tid; if (arg.nFace == 1) - packStaggered<3, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<3, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<3, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<3, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -488,6 +521,18 @@ namespace quda if (arg.shmem) shmem_signal(dim, dir, arg); #endif } + + template + __device__ __forceinline__ void operator()(const Arg &arg, int src_idx_block_, int parity, int = 0) + { + int src_idx_block = MAX_MULTI_RHS == 1 ? 0 : src_idx_block_; + int src_idx = src_idx_block * Arg::n_src_tile; + if (src_idx + n_src_tile <= arg.n_src) { + apply(arg, src_idx, parity); + } else if constexpr (n_src_tile - 1 > 0) { + operator()(arg, src_idx_block, parity); + } + } }; template struct pack_staggered_shmem { diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh index 514c4df957..584999d52f 100644 --- a/include/kernels/dslash_wilson.cuh +++ b/include/kernels/dslash_wilson.cuh @@ -34,7 +34,6 @@ namespace quda typedef typename mapper::type real; - const int_fastdiv n_src; F out[MAX_MULTI_RHS]; /** output vector field set */ F in[MAX_MULTI_RHS]; /** input vector field set */ F x[MAX_MULTI_RHS]; /** input vector set when doing xpay */ diff --git a/lib/dslash_pack2.cu b/lib/dslash_pack2.cu index 043ea65abc..01b70f3284 100644 --- a/lib/dslash_pack2.cu +++ b/lib/dslash_pack2.cu @@ -139,6 +139,10 @@ namespace quda strcpy(aux, "policy_kernel,"); strcat(aux, in.AuxString().c_str()); setRHSstring(aux, in.size()); + strcat(aux, ",n_rhs_tile="); + char tile_str[16]; + i32toa(tile_str, pack_tile_size); + strcat(aux, tile_str); char comm[5]; for (int i = 0; i < 4; i++) comm[i] = (comm_dim_pack[i] ? 
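// record which dimensions are partitioned for communication in the tuning key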
'1' : '0'); comm[4] = '\0'; From 77b6b594c755665e97c1cc3ae94a2822b18bb18d Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 18 Nov 2024 14:11:43 -0800 Subject: [PATCH 13/29] Fix bug in dslash_test_utils.h --- tests/dslash_test_utils.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/dslash_test_utils.h b/tests/dslash_test_utils.h index ec4ca5b13a..5512c3d0a2 100644 --- a/tests/dslash_test_utils.h +++ b/tests/dslash_test_utils.h @@ -428,9 +428,8 @@ struct DslashTestWrapper { switch (dtest_type) { case dslash_test_type::Dslash: // My dslash should be the same as the clover dslash - for (int i = 0; i < Nsrc; i++) - clover_dslash(spinorRef[i].data(), hostGauge, hostCloverInv, spinor[i].data(), parity, inv_param.dagger, - inv_param.cpu_prec, gauge_param); + clover_dslash(spinorRef[i].data(), hostGauge, hostCloverInv, spinor[i].data(), parity, inv_param.dagger, + inv_param.cpu_prec, gauge_param); break; case dslash_test_type::MatPC: // my matpc op From 27339e4afdb752d5119788e1e26585be279558d9 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 18 Nov 2024 14:21:48 -0800 Subject: [PATCH 14/29] nvc++ no longer needs to use constant memory args for dslash --- include/dslash_helper.cuh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh index 8a9ba2391c..0e49e64b68 100644 --- a/include/dslash_helper.cuh +++ b/include/dslash_helper.cuh @@ -12,12 +12,7 @@ #include #include -#if defined(_NVHPC_CUDA) -#include -constexpr quda::use_kernel_arg_p use_kernel_arg = quda::use_kernel_arg_p::FALSE; -#else constexpr quda::use_kernel_arg_p use_kernel_arg = quda::use_kernel_arg_p::TRUE; -#endif #include From 936c2c4b21ae2e816c997268d499cefa00674eab Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 18 Nov 2024 14:23:49 -0800 Subject: [PATCH 15/29] Fix for nvc++ and remove unneeded target specific thread_array.h files --- include/targets/cuda/thread_array.h | 18 ------------------ include/targets/hip/thread_array.h | 1 - 2 files changed, 19 deletions(-) delete mode 100644 include/targets/cuda/thread_array.h delete mode 100644 include/targets/hip/thread_array.h diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h deleted file mode 100644 index 1c4d7f3244..0000000000 --- a/include/targets/cuda/thread_array.h +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#ifndef _NVHPC_CUDA - -#include "../generic/thread_array.h" - -#else - -#include - -namespace quda -{ - template struct thread_array : array { - static constexpr unsigned int shared_mem_size(dim3 block) { return 0; } - }; -} // namespace quda - -#endif diff --git a/include/targets/hip/thread_array.h b/include/targets/hip/thread_array.h deleted file mode 100644 index 24f986cc26..0000000000 --- a/include/targets/hip/thread_array.h +++ /dev/null @@ -1 +0,0 @@ -#include "../generic/thread_array.h" From fd00d0888e89db6ce38615a8eee9a8853091c901 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 19 Nov 2024 07:50:43 -0800 Subject: [PATCH 16/29] Remove WAR for nvc++ in reduce_helper which is no longer needed --- include/targets/cuda/reduce_helper.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/targets/cuda/reduce_helper.h b/include/targets/cuda/reduce_helper.h index c8c6101892..7b8e3e4b57 100644 --- a/include/targets/cuda/reduce_helper.h +++ b/include/targets/cuda/reduce_helper.h @@ -169,11 +169,7 @@ namespace quda auto tid = target::thread_idx_linear<2>(); if (arg.result_d) { // write to host mapped memory -#ifdef 
_NVHPC_CUDA // WAR for nvc++ - constexpr bool coalesced_write = false; -#else constexpr bool coalesced_write = true; -#endif if constexpr (coalesced_write) { static_assert(n <= device::warp_size(), "reduction array is greater than warp size"); auto mask = __ballot_sync(0xffffffff, tid < n); From fc5ac97cd4d34e4b57a3ab01a3b6bee58bb4cc7c Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 20 Nov 2024 07:10:09 -0800 Subject: [PATCH 17/29] Add versioned CPM files to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index a60ac7da48..8e122f0a4e 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ include/jitify_options.hpp .tags* autom4te.cache/* .vscode +cmake/CPM_*.cmake From a10d6f301d7a0f5aefffce81d92f0163e1ff832b Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 20 Nov 2024 07:47:19 -0800 Subject: [PATCH 18/29] Fix complex_quda.h to be C++20 compliant --- include/complex_quda.h | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/include/complex_quda.h b/include/complex_quda.h index 18da63def5..1481dec40f 100644 --- a/include/complex_quda.h +++ b/include/complex_quda.h @@ -360,23 +360,19 @@ struct complex typedef ValueType value_type; // Constructors - __host__ __device__ inline complex(const ValueType &re = ValueType(), const ValueType &im = ValueType()) + __host__ __device__ inline complex(const ValueType &re = ValueType(), const ValueType &im = ValueType()) { real(re); imag(im); } - template - __host__ __device__ - inline complex(const complex & z) + template __host__ __device__ inline complex(const complex &z) { real(z.real()); imag(z.imag()); } - template - __host__ __device__ - inline complex(const std::complex & z) + template __host__ __device__ inline complex(const std::complex &z) { real(z.real()); imag(z.imag()); @@ -436,12 +432,11 @@ struct complex template <> struct complex : public float2 { public: typedef float value_type; - complex() = default; - constexpr complex(const float &re, const float &im = float()) : float2 {re, im} { } + complex() = default; + constexpr complex(const float &re, const float &im = float()) : float2 {re, im} { } template - constexpr complex(const std::complex &z) : - float2 {static_cast(z.real()), static_cast(z.imag())} + constexpr complex(const std::complex &z) : float2 {static_cast(z.real()), static_cast(z.imag())} { } @@ -500,16 +495,15 @@ template <> struct complex : public float2 { template <> struct complex : public double2 { public: typedef double value_type; - complex() = default; - constexpr complex(const double &re, const double &im = double()) : double2 {re, im} { } + complex() = default; + constexpr complex(const double &re, const double &im = double()) : double2 {re, im} { } template - constexpr complex(const std::complex &z) : - double2 {static_cast(z.real()), static_cast(z.imag())} + constexpr complex(const std::complex &z) : double2 {static_cast(z.real()), static_cast(z.imag())} { } - template __host__ __device__ inline complex &operator=(const complex &z) + template __host__ __device__ inline complex &operator=(const complex &z) { real(z.real()); imag(z.imag()); @@ -572,9 +566,9 @@ template <> struct complex : public char2 { public: typedef int8_t value_type; - complex() = default; + complex() = default; - constexpr complex(const int8_t &re, const int8_t &im = int8_t()) : char2 {re, im} { } + constexpr complex(const int8_t &re, const int8_t &im = int8_t()) : char2 {re, im} { } __host__ __device__ inline complex 
&operator+=(const complex &z) { @@ -608,9 +602,9 @@ struct complex : public short2 public: typedef short value_type; - complex() = default; + complex() = default; - constexpr complex(const short &re, const short &im = short()) : short2 {re, im} { } + constexpr complex(const short &re, const short &im = short()) : short2 {re, im} { } __host__ __device__ inline complex &operator+=(const complex &z) { @@ -644,9 +638,9 @@ struct complex : public int2 public: typedef int value_type; - complex() = default; + complex() = default; - constexpr complex(const int &re, const int &im = int()) : int2 {re, im} { } + constexpr complex(const int &re, const int &im = int()) : int2 {re, im} { } __host__ __device__ inline complex &operator+=(const complex &z) { From c47e1f00a798be1f25fb4fa6c888b5d7435ef924 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 20 Nov 2024 10:20:35 -0800 Subject: [PATCH 19/29] Add new variant of heterogeneous reductions: with some compilers, optimization can optimize out infinities, rendering the use of heterogeneous atomics with an infinity sentinel unreliable. As an alternative we can use negative zero as the sentinel. The default remains infinity; disable with QUDA_HETEROGENEOUS_ATOMIC_INF_INIT=OFF --- include/quda_define.h.in | 10 ++++++++++ include/targets/cuda/reduce_helper.h | 29 +++++++++++++++++++++++++++- lib/targets/cuda/target_cuda.cmake | 3 +++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/include/quda_define.h.in b/include/quda_define.h.in index 90a6b24d1a..86957d9aba 100644 --- a/include/quda_define.h.in +++ b/include/quda_define.h.in @@ -42,6 +42,16 @@ #undef QUDA_HETEROGENEOUS_ATOMIC #endif +#cmakedefine QUDA_HETEROGENEOUS_ATOMIC_INF_INIT +#ifdef QUDA_HETEROGENEOUS_ATOMIC_INF_INIT +/** + * @def HETEROGENEOUS_ATOMIC_INF_INIT + * @brief This macro sets whether we are using infinity for the signaling sentinel + */ +#define HETEROGENEOUS_ATOMIC_INF_INIT +#undef QUDA_HETEROGENEOUS_ATOMIC_INF_INIT +#endif + #cmakedefine QUDA_LARGE_KERNEL_ARG #cmakedefine QUDA_DIRAC_CLOVER_HASENBUSCH diff --git a/include/targets/cuda/reduce_helper.h b/include/targets/cuda/reduce_helper.h index 7b8e3e4b57..58aca5b9be 100644 --- a/include/targets/cuda/reduce_helper.h +++ b/include/targets/cuda/reduce_helper.h @@ -17,6 +17,11 @@ using count_t = cuda::atomic; namespace quda { +// By default we use negative infinity as the sentinel for testing for +// reduction completion. On some compilers we may need to use finite +// numbers, so the alternative approach uses negative zero (disable the CMake +// option QUDA_HETEROGENEOUS_ATOMIC_INF_INIT to select this). 
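+// Negative zero can act as a sentinel because IEEE 754 compares -0.0 == 0.0, so it is
+// only observable through the explicit sign-bit test in is_complete below, while a
+// genuine result equal to the sentinel is written as terminate_value() to avoid a hang.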
+#ifdef HETEROGENEOUS_ATOMIC_INF_INIT /** @brief The initialization value we use to check for completion */ @@ -28,6 +33,28 @@ namespace quda */ template constexpr T terminate_value() { return cuda::std::numeric_limits::infinity(); } + /** + @brief Test if the result is complete (i.e., it is not equal to the sentinel) + */ + template bool is_complete(const T &result) { return result != init_value(); } +#else + /** + @brief The initialization value we use to check for completion + */ + template constexpr T init_value() { return -static_cast(0.0); } + + /** + @brief The termination value we use to prevent a possible hang in + case the computed reduction is equal to the initialization + */ + template constexpr T terminate_value() { return static_cast(0.0); } + + /** + @brief Test if the result is complete (i.e., it is not equal to the sentinel) + */ + template bool is_complete(const T &result) { return !(result == static_cast(0.0) && std::signbit(result)); } +#endif + // declaration of reduce function template __device__ inline void reduce(Arg &arg, const Reducer &r, const T &in, const int idx = 0); @@ -129,7 +156,7 @@ namespace quda if (consumed) errorQuda("Cannot call complete more than once for each construction"); for (int i = 0; i < n_reduce * n_item; i++) { - while (result_h[i].load(cuda::std::memory_order_relaxed) == init_value()) { } + while (!is_complete(result_h[i].load(cuda::std::memory_order_relaxed))) { } } // copy back result element by element and convert if necessary to host reduce type diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake index eb71bb9d53..2670f0a147 100644 --- a/lib/targets/cuda/target_cuda.cmake +++ b/lib/targets/cuda/target_cuda.cmake @@ -103,8 +103,11 @@ if(CMAKE_CUDA_COMPILER_ID MATCHES "NVIDIA" OR CMAKE_CUDA_COMPILER_ID MATCHES "NV endif() cmake_dependent_option(QUDA_HETEROGENEOUS_ATOMIC "enable heterogeneous atomic support ?" ON "QUDA_HETEROGENEOUS_ATOMIC_SUPPORT" OFF) +cmake_dependent_option(QUDA_HETEROGENEOUS_ATOMIC_INF_INIT "use infinity as the sentinel for heterogeneous atomic support?" 
ON + "QUDA_HETEROGENEOUS_ATOMIC_SUPPORT" OFF) mark_as_advanced(QUDA_HETEROGENEOUS_ATOMIC) +mark_as_advanced(QUDA_HETEROGENEOUS_ATOMIC_INF_INIT) mark_as_advanced(QUDA_JITIFY) mark_as_advanced(QUDA_DOWNLOAD_NVSHMEM) mark_as_advanced(QUDA_DOWNLOAD_NVSHMEM_TAR) From 8b61db2c3eb75f258e7d73f402bc98000c443e75 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 20 Nov 2024 11:45:34 -0800 Subject: [PATCH 20/29] Update to use latest release of Eigen 3.4: this fixes some bugs with nvc++ allowing for the removal of the WARs deployed previously --- CMakeLists.txt | 13 ++----------- cmake/eigen34_neon.diff | 8 -------- include/eigen_helper.h | 4 ---- 3 files changed, 2 insertions(+), 23 deletions(-) delete mode 100644 cmake/eigen34_neon.diff diff --git a/CMakeLists.txt b/CMakeLists.txt index 45219482b2..e9fe94cab2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -426,21 +426,12 @@ if(QUDA_DOWNLOAD_EIGEN) CPMAddPackage( NAME Eigen VERSION ${QUDA_EIGEN_VERSION} - URL https://gitlab.com/libeigen/eigen/-/archive/${QUDA_EIGEN_VERSION}/eigen-${QUDA_EIGEN_VERSION}.tar.bz2 - URL_HASH SHA256=B4C198460EBA6F28D34894E3A5710998818515104D6E74E5CC331CE31E46E626 + URL https://gitlab.com/libeigen/eigen/-/archive/e67c494cba7180066e73b9f6234d0b2129f1cdf5.tar.bz2 + URL_HASH SHA256=98d244932291506b75c4ae7459af29b1112ea3d2f04660686a925d9ef6634583 DOWNLOAD_ONLY YES SYSTEM YES) target_include_directories(Eigen SYSTEM INTERFACE ${Eigen_SOURCE_DIR}) install(DIRECTORY ${Eigen_SOURCE_DIR}/Eigen TYPE INCLUDE) - - # Eigen 3.4 needs to be patched on Neon with nvc++ - if (${CMAKE_CXX_COMPILER_ID} MATCHES "NVHPC") - set(CMAKE_PATCH_EIGEN OFF CACHE BOOL "Internal use only; do not modify") - if (NOT CMAKE_PATCH_EIGEN) - execute_process(COMMAND patch -N "${Eigen_SOURCE_DIR}/Eigen/src/Core/arch/NEON/Complex.h" "${CMAKE_SOURCE_DIR}/cmake/eigen34_neon.diff") - set(CMAKE_PATCH_EIGEN ON CACHE BOOL "Internal use only; do not modify" FORCE) - endif() - endif() else() # fall back to using find_package find_package(Eigen QUIET) diff --git a/cmake/eigen34_neon.diff b/cmake/eigen34_neon.diff deleted file mode 100644 index 7190ca7fa4..0000000000 --- a/cmake/eigen34_neon.diff +++ /dev/null @@ -1,8 +0,0 @@ -21c21 -< #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML ---- -> #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || __NVCOMPILER_LLVM__ -393c393 -< #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML ---- -> #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || __NVCOMPILER_LLVM__ diff --git a/include/eigen_helper.h b/include/eigen_helper.h index 8c1bf012a8..373f9ed865 100644 --- a/include/eigen_helper.h +++ b/include/eigen_helper.h @@ -5,10 +5,6 @@ #define EIGEN_USE_BLAS #endif -#if defined(__NVCOMPILER) // WAR for nvc++ until we update to latest Eigen -#define EIGEN_DONT_VECTORIZE -#endif - #include // hide annoying warning From 490930b680edca2c0a8d8a816852cdb87dd3f02b Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 20 Nov 2024 12:36:24 -0800 Subject: [PATCH 21/29] Apply ROCm perf WAR to Laplace operator --- include/kernels/laplace.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/kernels/laplace.cuh b/include/kernels/laplace.cuh index b8797e722e..a3c2b1b377 100644 --- a/include/kernels/laplace.cuh +++ b/include/kernels/laplace.cuh @@ -143,8 +143,9 @@ namespace quda static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation template - __device__ __host__ __forceinline__ void operator()(int idx, int src_idx, int parity) + __device__ __host__ __forceinline__ void 
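+  // as in the staggered kernel: launder the source index through a local so that a
+  // unit MAX_MULTI_RHS build collapses it to a compile-time zero (possible ROCm perf WAR)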
operator()(int idx, int src_idx_, int parity) { + int src_idx = MAX_MULTI_RHS == 1 ? 0 : src_idx_; using real = typename mapper::type; using Vector = ColorSpinor; From 26c3e5999bb8b0cd5376e9b1529ba1444b426307 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 20 Nov 2024 12:36:39 -0800 Subject: [PATCH 22/29] Fix nvc++ compiler warning --- include/kernels/staggered_quark_smearing.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/kernels/staggered_quark_smearing.cuh b/include/kernels/staggered_quark_smearing.cuh index 22323e66a2..85e1790318 100644 --- a/include/kernels/staggered_quark_smearing.cuh +++ b/include/kernels/staggered_quark_smearing.cuh @@ -179,9 +179,9 @@ namespace quda if (arg.is_t0_kernel) { if (arg.t0 < 0) return; - if (mykernel_type == INTERIOR_KERNEL) { + if constexpr (mykernel_type == INTERIOR_KERNEL) { idx += arg.t0 * arg.t0_offset; - } else if (mykernel_type != EXTERIOR_KERNEL_ALL) { + } else if constexpr (mykernel_type != EXTERIOR_KERNEL_ALL) { if (idx >= arg.t0_face_size[mykernel_type]) idx += arg.face_size[mykernel_type] - arg.t0_face_size[mykernel_type]; idx += arg.t0 * arg.t0_face_offset[mykernel_type]; From 2b81309b3521f5aaad727142411270b3e8530466 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 27 Nov 2024 09:38:30 -0800 Subject: [PATCH 23/29] Fix compiler warning introduced with 3debb29 --- tests/host_reference/hisq_force_reference.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp index 33d773ed95..5da6dc1a7a 100644 --- a/tests/host_reference/hisq_force_reference.cpp +++ b/tests/host_reference/hisq_force_reference.cpp @@ -837,13 +837,13 @@ void computeMiddleLinkSite(int half_lattice_index, // half_lattice_index to bett if (Qprev == NULL) { if (sig_positive) { - ls.loadMatrixFromField(static_cast(oprod), 1 - oddBit, sig, point_d, &colorMatY); + ls.loadMatrixFromField(static_cast(oprod), 1 - oddBit, sig, point_d, &colorMatY); } else { - ls.loadMatrixFromField(static_cast(oprod), oddBit, OPP_DIR(sig), point_c, &colorMatY); + ls.loadMatrixFromField(static_cast(oprod), oddBit, OPP_DIR(sig), point_c, &colorMatY); colorMatY = conj(colorMatY); } } else { // Qprev != NULL - ls.loadMatrixFromField(static_cast(oprod), oddBit, point_c, &colorMatY); + ls.loadMatrixFromField(static_cast(oprod), oddBit, point_c, &colorMatY); } colorMatW = (!mu_positive) ? bc_link * colorMatY : conj(bc_link) * colorMatY; From 0a3f608a9fcd1a69cf8563488151058c6726bb1d Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 27 Nov 2024 10:03:44 -0800 Subject: [PATCH 24/29] Fix process divergence issues (could hang when autotuning) in genericPrintMatrix routine. Apply same patch to genericPrintVector for future proofing --- lib/color_spinor_util.in.cu | 4 ++-- lib/gauge_norm.in.cu | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/color_spinor_util.in.cu b/lib/color_spinor_util.in.cu index a268a4b16a..71664614cf 100644 --- a/lib/color_spinor_util.in.cu +++ b/lib/color_spinor_util.in.cu @@ -378,8 +378,6 @@ namespace quda { void genericPrintVector(const ColorSpinorField &a, int parity, unsigned int x_cb, int rank) { - if (rank != comm_rank()) return; - ColorSpinorParam param(a); param.location = QUDA_CPU_FIELD_LOCATION; param.create = QUDA_COPY_FIELD_CREATE; @@ -388,6 +386,8 @@ namespace quda { std::unique_ptr clone_a = !host_clone ? nullptr : std::make_unique(param); const ColorSpinorField &a_ = !host_clone ? 
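// every rank must reach the host copy above, since it can trigger autotuning, which is
// collective; the non-matching ranks therefore return only after this point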
a : *clone_a.get(); + if (rank != comm_rank()) return; // rank returns after potential copy to host to prevent tuning hang + switch (a.Precision()) { case QUDA_DOUBLE_PRECISION: genericPrintVector(a_, parity, x_cb); break; case QUDA_SINGLE_PRECISION: genericPrintVector(a_, parity, x_cb); break; diff --git a/lib/gauge_norm.in.cu b/lib/gauge_norm.in.cu index 08cb5bf2ca..ddf59287f4 100644 --- a/lib/gauge_norm.in.cu +++ b/lib/gauge_norm.in.cu @@ -159,8 +159,6 @@ namespace quda { void genericPrintMatrix(const GaugeField &a, int d, int parity, unsigned int x_cb, int rank) { - if (rank != comm_rank()) return; - GaugeFieldParam param(a); param.field = const_cast(&a); param.location = QUDA_CPU_FIELD_LOCATION; @@ -172,6 +170,8 @@ namespace quda { std::unique_ptr clone_a = !host_clone ? nullptr : std::make_unique(param); const GaugeField &a_ = !host_clone ? a : *clone_a.get(); + if (rank != comm_rank()) return; // rank returns after potential copy to host to prevent tuning hang + switch (a.Precision()) { case QUDA_DOUBLE_PRECISION: genericPrintMatrix(a_, d, parity, x_cb); break; case QUDA_SINGLE_PRECISION: genericPrintMatrix(a_, d, parity, x_cb); break; From 22393c5a893b20a99b85b1f92dc86e32ef1c1d40 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 3 Dec 2024 13:26:40 -0800 Subject: [PATCH 25/29] Add pragma unroll --- include/kernels/dslash_staggered.cuh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh index 3dfaf01be0..ec09ab1622 100644 --- a/include/kernels/dslash_staggered.cuh +++ b/include/kernels/dslash_staggered.cuh @@ -103,6 +103,7 @@ namespace quda if (doHalo(d) && ghost) { const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dim, d, 1); const Link U = arg.improved ? arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)); +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { Vector in = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); @@ -111,6 +112,7 @@ namespace quda } else if (doBulk() && !ghost) { const int fwd_idx = linkIndexP1(coord, arg.dim, d); const Link U = arg.improved ? arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)); +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { Vector in = arg.in[src_idx + s](fwd_idx, their_spinor_parity); out[s] = mv_add(U, in, out[s]); @@ -124,6 +126,7 @@ namespace quda if (doHalo(d) && ghost) { const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dim, d, arg.nFace); const Link L = arg.L(d, coord.x_cb, parity); +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { const Vector in = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); @@ -132,6 +135,7 @@ namespace quda } else if (doBulk() && !ghost) { const int fwd3_idx = linkIndexP3(coord, arg.dim, d); const Link L = arg.L(d, coord.x_cb, parity); +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { const Vector in = arg.in[src_idx + s](fwd3_idx, their_spinor_parity); out[s] = mv_add(L, in, out[s]); @@ -148,6 +152,7 @@ namespace quda const int ghost_idx = arg.improved ? ghostFaceIndexStaggered<0>(coord, arg.dim, d, 3) : ghost_idx2; const Link U = arg.improved ? 
arg.U.Ghost(d, ghost_idx2, 1 - parity) : arg.U.Ghost(d, ghost_idx2, 1 - parity, StaggeredPhase(coord, d, -1, arg)); +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { Vector in = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); @@ -158,6 +163,7 @@ namespace quda const int gauge_idx = back_idx; const Link U = arg.improved ? arg.U(d, gauge_idx, 1 - parity) : arg.U(d, gauge_idx, 1 - parity, StaggeredPhase(coord, d, -1, arg)); +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { Vector in = arg.in[src_idx + s](back_idx, their_spinor_parity); out[s] = mv_add(conj(U), -in, out[s]); @@ -172,6 +178,7 @@ namespace quda // when updating replace arg.nFace with 1 here const int ghost_idx = ghostFaceIndexStaggered<0>(coord, arg.dim, d, 1); const Link L = arg.L.Ghost(d, ghost_idx, 1 - parity); +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { const Vector in = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); @@ -181,6 +188,7 @@ namespace quda const int back3_idx = linkIndexM3(coord, arg.dim, d); const int gauge_idx = back3_idx; const Link L = arg.L(d, gauge_idx, 1 - parity); +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { const Vector in = arg.in[src_idx + s](back3_idx, their_spinor_parity); out[s] = mv_add(conj(L), -in, out[s]); @@ -215,20 +223,24 @@ namespace quda array out; applyStaggered(out, arg, coord, parity, idx, thread_dim, active, src_idx); +#pragma unroll for (auto s = 0; s < n_src_tile; s++) out[s] *= arg.dagger_scale; if (xpay && mykernel_type == INTERIOR_KERNEL) { +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { Vector x = arg.x[src_idx + s](coord.x_cb, my_spinor_parity); out[s] = arg.a * x - out[s]; } } else if (mykernel_type != INTERIOR_KERNEL) { +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { Vector x = arg.out[src_idx + s](coord.x_cb, my_spinor_parity); out[s] = x + (xpay ? 
-out[s] : out[s]); } } if (mykernel_type != EXTERIOR_KERNEL_ALL || active) { +#pragma unroll for (auto s = 0; s < n_src_tile; s++) { arg.out[src_idx + s](coord.x_cb, my_spinor_parity) = out[s]; } } } From 8ffcdd5f1607aaaeb01a3ad06e73cd294e398349 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 4 Dec 2024 09:38:48 -0800 Subject: [PATCH 26/29] Do not run BiCGStab(l) with staggered half precision since it is unstable --- tests/staggered_invert_test_gtest.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp index 7ef2c270b3..1b659eec59 100644 --- a/tests/staggered_invert_test_gtest.hpp +++ b/tests/staggered_invert_test_gtest.hpp @@ -53,6 +53,10 @@ bool skip_test(test_t param) // MR struggles with the staggered and asqtad spectrum, it's not MR's fault if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_MR_INVERTER) return true; + + // BiCGStab-L is unstable with staggered operator with low precision + // we could likely restore this when 20-bit quark fields are merged in + if (inverter_type == QUDA_BICGSTABL_INVERTER && prec_sloppy < QUDA_SINGLE_PRECISION) return true; } // CG3 is rather unstable with low precision From a860b1cfe86ed079cdaef3352f867d35b8dcc86f Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 4 Dec 2024 17:17:45 -0800 Subject: [PATCH 27/29] sentinal -> sentinel --- include/quda_define.h.in | 2 +- include/targets/cuda/reduce_helper.h | 6 +++--- lib/targets/cuda/target_cuda.cmake | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/quda_define.h.in b/include/quda_define.h.in index 86957d9aba..3e572b70bd 100644 --- a/include/quda_define.h.in +++ b/include/quda_define.h.in @@ -46,7 +46,7 @@ #ifdef QUDA_HETEROGENEOUS_ATOMIC_INF_INIT /** * @def HETEROGENEOUS_ATOMIC_INF_INIT - * @brief This macro sets whether we are using infinity for the signaling sentinal + * @brief This macro sets whether we are using infinity for the signaling sentinel */ #define HETEROGENEOUS_ATOMIC_INF_INIT #undef QUDA_HETEROGENEOUS_ATOMIC_INF_INIT diff --git a/include/targets/cuda/reduce_helper.h b/include/targets/cuda/reduce_helper.h index 58aca5b9be..a604b66900 100644 --- a/include/targets/cuda/reduce_helper.h +++ b/include/targets/cuda/reduce_helper.h @@ -17,7 +17,7 @@ using count_t = cuda::atomic; namespace quda { -// By default we use negative infinity as the sentinal for testing for +// By default we use negative infinity as the sentinel for testing for // reduction completion. On some compilers we may need to use finite // numbers, so the alternative approach uses negative zero (set with CMake // option QUDA_HETEROGENEOUS_ATOMIC_INF_INIT). 
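// Editor's sketch (illustration only, not part of the patch) of why the two
// variants below test for completion differently: with -inf as the sentinel a
// plain inequality suffices, but IEEE 754 defines -0.0 == 0.0, so the
// negative-zero fallback must consult the sign bit. The names here (init_inf,
// complete_zero, ...) are hypothetical stand-ins, not QUDA's.

#include <cmath>
#include <limits>

template <typename T> constexpr T init_inf() { return -std::numeric_limits<T>::infinity(); }
template <typename T> bool complete_inf(const T &r) { return r != init_inf<T>(); }

template <typename T> constexpr T init_zero() { return static_cast<T>(-0.0); }
template <typename T> bool complete_zero(const T &r)
{
  return !(r == static_cast<T>(0.0) && std::signbit(r)); // only the sentinel -0.0 is reported incomplete
}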
@@ -34,7 +34,7 @@ namespace quda template constexpr T terminate_value() { return cuda::std::numeric_limits::infinity(); } /** - @brief Test if the result is complete (e.g., is not equal to the sentinal) + @brief Test if the result is complete (e.g., is not equal to the sentinel) */ template bool is_complete(const T &result) { return result != init_value(); } #else @@ -50,7 +50,7 @@ namespace quda template constexpr T terminate_value() { return static_cast(0.0); } /** - @brief Test if the result is complete (e.g., is not equal to the sentinal) + @brief Test if the result is complete (e.g., is not equal to the sentinel) */ template bool is_complete(const T &result) { return !(result == static_cast(0.0) && std::signbit(result)); } #endif diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake index 2670f0a147..721edc3d3b 100644 --- a/lib/targets/cuda/target_cuda.cmake +++ b/lib/targets/cuda/target_cuda.cmake @@ -103,7 +103,7 @@ if(CMAKE_CUDA_COMPILER_ID MATCHES "NVIDIA" OR CMAKE_CUDA_COMPILER_ID MATCHES "NV endif() cmake_dependent_option(QUDA_HETEROGENEOUS_ATOMIC "enable heterogeneous atomic support ?" ON "QUDA_HETEROGENEOUS_ATOMIC_SUPPORT" OFF) -cmake_dependent_option(QUDA_HETEROGENEOUS_ATOMIC_INF_INIT "use infinity as the sentinal for heterogeneous atomic support?" ON +cmake_dependent_option(QUDA_HETEROGENEOUS_ATOMIC_INF_INIT "use infinity as the sentinel for heterogeneous atomic support?" ON "QUDA_HETEROGENEOUS_ATOMIC_SUPPORT" OFF) mark_as_advanced(QUDA_HETEROGENEOUS_ATOMIC) From c3fbf502e6f9fbcbd52d86da68e8497ada169839 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 4 Dec 2024 18:01:24 -0800 Subject: [PATCH 28/29] Apply review comments --- CMakeLists.txt | 3 ++- include/kernels/dslash_pack.cuh | 16 ++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e9fe94cab2..512a7b9baa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,7 +216,8 @@ if(QUDA_MAX_MULTI_BLAS_N GREATER 32) message(SEND_ERROR "Maximum QUDA_MAX_MULTI_BLAS_N is 32.") endif() -set(QUDA_MAX_MULTI_RHS_TILE "1" CACHE STRING "maximum tile size for MRHS kernels") +# For now we only support register tiles for the staggered dslash operators +set(QUDA_MAX_MULTI_RHS_TILE "1" CACHE STRING "maximum tile size for MRHS kernels (staggered only)") if(QUDA_MAX_MULTI_RHS_TILE GREATER QUDA_MAX_MULTI_RHS) message(SEND_ERROR "QUDA_MAX_MULTI_RHS_TILE is greater than QUDA_MAX_MULTI_RHS") endif() diff --git a/include/kernels/dslash_pack.cuh b/include/kernels/dslash_pack.cuh index ee34616bbd..a6a5b3c1d5 100644 --- a/include/kernels/dslash_pack.cuh +++ b/include/kernels/dslash_pack.cuh @@ -165,6 +165,7 @@ namespace quda int idx = indexFromFaceIndex(ghost_idx, parity, arg); constexpr int proj_dir = dagger ? +1 : -1; +#pragma unroll for (auto src = src_idx; src < src_idx + n_src_tile; src++) { Vector f = arg.in[src](idx + s * arg.dc.volume_4d_cb, spinor_parity); if (twist == 1) { @@ -187,6 +188,7 @@ namespace quda int idx = indexFromFaceIndex(ghost_idx, parity, arg); constexpr int proj_dir = dagger ?
-1 : +1; +#pragma unroll for (auto src = src_idx; src < src_idx + n_src_tile; src++) { Vector f = arg.in[src](idx + s * arg.dc.volume_4d_cb, spinor_parity); if (twist == 1) { @@ -226,15 +228,17 @@ namespace quda if (face_num == 0) { // backwards int idx = indexFromFaceIndexStaggered<4, QUDA_4D_PC, dim, nFace, 0>(ghost_idx, parity, arg); - for (auto s = 0; s < n_src_tile; s++) { - Vector f = arg.in[src_idx + s](idx, spinor_parity); - arg.halo_pack.Ghost(dim, 0, ghost_idx + (src_idx + s) * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx, spinor_parity); + arg.halo_pack.Ghost(dim, 0, ghost_idx + src * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; } } else { // forwards int idx = indexFromFaceIndexStaggered<4, QUDA_4D_PC, dim, nFace, 1>(ghost_idx, parity, arg); - for (auto s = 0; s < n_src_tile; s++) { - Vector f = arg.in[src_idx + s](idx, spinor_parity); - arg.halo_pack.Ghost(dim, 1, ghost_idx + (src_idx + s) * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx, spinor_parity); + arg.halo_pack.Ghost(dim, 1, ghost_idx + src * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; } } } From b6f5edbb73e08661bdc67a666af784fba8b66cb4 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 4 Dec 2024 18:02:46 -0800 Subject: [PATCH 29/29] Apply clang format --- include/field_cache.h | 1 - include/targets/cuda/reduce_helper.h | 5 ++++- lib/field_cache.cpp | 2 +- lib/gauge_norm.in.cu | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/field_cache.h b/include/field_cache.h index f288844300..4def6faa3a 100644 --- a/include/field_cache.h +++ b/include/field_cache.h @@ -154,5 +154,4 @@ namespace quda { for (auto i = 0u; i < a.size(); i++) tmp.push_back(std::move(getFieldTmp(a[i]))); return tmp; } - } diff --git a/include/targets/cuda/reduce_helper.h b/include/targets/cuda/reduce_helper.h index a604b66900..aceb88c83d 100644 --- a/include/targets/cuda/reduce_helper.h +++ b/include/targets/cuda/reduce_helper.h @@ -52,7 +52,10 @@ namespace quda /** @brief Test if the result is complete (e.g., is not equal to the sentinel) */ - template bool is_complete(const T &result) { return !(result == static_cast(0.0) && std::signbit(result)); } + template bool is_complete(const T &result) + { + return !(result == static_cast(0.0) && std::signbit(result)); + } #endif // declaration of reduce function diff --git a/lib/field_cache.cpp b/lib/field_cache.cpp index b98f24c065..7283bf39d0 100644 --- a/lib/field_cache.cpp +++ b/lib/field_cache.cpp @@ -44,7 +44,7 @@ namespace quda { if (it != cache.end() && it->second.size()) { // found an entry tmp = std::move(it->second.top()); it->second.pop(); // pop the defunct object - } else { // no entry found, we must allocate a new field + } else { // no entry found, we must allocate a new field param.create = QUDA_ZERO_FIELD_CREATE; tmp = T(param); } diff --git a/lib/gauge_norm.in.cu b/lib/gauge_norm.in.cu index ddf59287f4..2cc755bc81 100644 --- a/lib/gauge_norm.in.cu +++ b/lib/gauge_norm.in.cu @@ -170,7 +170,7 @@ namespace quda { std::unique_ptr clone_a = !host_clone ? nullptr : std::make_unique(param); const GaugeField &a_ = !host_clone ? 
a : *clone_a.get(); - if (rank != comm_rank()) return; // rank returns after potential copy to host to prevent tuning hang + if (rank != comm_rank()) return; // rank returns after potential copy to host to prevent tuning hang switch (a.Precision()) { case QUDA_DOUBLE_PRECISION: genericPrintMatrix(a_, d, parity, x_cb); break;
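A note for readers following the series: the pattern running through patches 25 and 28 is register tiling over right-hand sides. Each thread handles a compile-time tile of n_src_tile sources, so one gauge-link load is amortized over the whole tile, and because the trip count is a template constant the #pragma unroll loops allow the per-source accumulators to stay in registers. Below is a self-contained sketch of that idea under stated assumptions: Mat3, Vec3, mv_add and apply_tile are illustrative stand-ins, not QUDA's Link/Vector types or its actual kernels.

#include <array>
#include <complex>

using cplx = std::complex<float>;
struct Vec3 { std::array<cplx, 3> c {}; }; // stand-in for a color vector
struct Mat3 { std::array<cplx, 9> m {}; }; // stand-in for a gauge link

// y += U * x, the analogue of the mv_add calls in the kernels above
inline Vec3 mv_add(const Mat3 &U, const Vec3 &x, Vec3 y)
{
  for (int i = 0; i < 3; i++)
    for (int j = 0; j < 3; j++) y.c[i] += U.m[3 * i + j] * x.c[j];
  return y;
}

// one site, one direction, a compile-time tile of right-hand sides: the link
// is loaded once, and the fixed trip count keeps out[] in registers
template <int n_src_tile> void apply_tile(const Mat3 &U, const Vec3 *in, Vec3 *out)
{
#pragma unroll // honored by nvcc/clang; a fixed, known trip count
  for (int s = 0; s < n_src_tile; s++) out[s] = mv_add(U, in[s], out[s]);
}

The tile size is bounded at configure time: as the CMake hunk in patch 28 shows, the build errors out if QUDA_MAX_MULTI_RHS_TILE exceeds QUDA_MAX_MULTI_RHS, making the trade-off between register pressure and data reuse an explicit build knob.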