diff --git a/.gitignore b/.gitignore index a60ac7da48..8e122f0a4e 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ include/jitify_options.hpp .tags* autom4te.cache/* .vscode +cmake/CPM_*.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 88c83c5b8f..512a7b9baa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,6 +216,12 @@ if(QUDA_MAX_MULTI_BLAS_N GREATER 32) message(SEND_ERROR "Maximum QUDA_MAX_MULTI_BLAS_N is 32.") endif() +# For now we only support register tiles for the staggered dslash operators +set(QUDA_MAX_MULTI_RHS_TILE "1" CACHE STRING "maximum tile size for MRHS kernels (staggered only)") +if(QUDA_MAX_MULTI_RHS_TILE GREATER QUDA_MAX_MULTI_RHS) + message(SEND_ERROR "QUDA_MAX_MULTI_RHS_TILE is greater than QUDA_MAX_MULTI_RHS") +endif() + set(QUDA_PRECISION "14" CACHE STRING "which precisions to instantiate in QUDA (4-bit number - double, single, half, quarter)") @@ -275,6 +281,7 @@ mark_as_advanced(QUDA_ALTERNATIVE_I_TO_F) mark_as_advanced(QUDA_MAX_MULTI_BLAS_N) mark_as_advanced(QUDA_MAX_MULTI_RHS) +mark_as_advanced(QUDA_MAX_MULTI_RHS_TILE) mark_as_advanced(QUDA_PRECISION) mark_as_advanced(QUDA_RECONSTRUCT) mark_as_advanced(QUDA_CLOVER_CHOLESKY_PROMOTE) @@ -420,21 +427,12 @@ if(QUDA_DOWNLOAD_EIGEN) CPMAddPackage( NAME Eigen VERSION ${QUDA_EIGEN_VERSION} - URL https://gitlab.com/libeigen/eigen/-/archive/${QUDA_EIGEN_VERSION}/eigen-${QUDA_EIGEN_VERSION}.tar.bz2 - URL_HASH SHA256=B4C198460EBA6F28D34894E3A5710998818515104D6E74E5CC331CE31E46E626 + URL https://gitlab.com/libeigen/eigen/-/archive/e67c494cba7180066e73b9f6234d0b2129f1cdf5.tar.bz2 + URL_HASH SHA256=98d244932291506b75c4ae7459af29b1112ea3d2f04660686a925d9ef6634583 DOWNLOAD_ONLY YES SYSTEM YES) target_include_directories(Eigen SYSTEM INTERFACE ${Eigen_SOURCE_DIR}) install(DIRECTORY ${Eigen_SOURCE_DIR}/Eigen TYPE INCLUDE) - - # Eigen 3.4 needs to be patched on Neon with nvc++ - if (${CMAKE_CXX_COMPILER_ID} MATCHES "NVHPC") - set(CMAKE_PATCH_EIGEN OFF CACHE BOOL "Internal use only; do not modify") - if (NOT CMAKE_PATCH_EIGEN) - execute_process(COMMAND patch -N "${Eigen_SOURCE_DIR}/Eigen/src/Core/arch/NEON/Complex.h" "${CMAKE_SOURCE_DIR}/cmake/eigen34_neon.diff") - set(CMAKE_PATCH_EIGEN ON CACHE BOOL "Internal use only; do not modify" FORCE) - endif() - endif() else() # fall back to using find_package find_package(Eigen QUIET) diff --git a/cmake/eigen34_neon.diff b/cmake/eigen34_neon.diff deleted file mode 100644 index 7190ca7fa4..0000000000 --- a/cmake/eigen34_neon.diff +++ /dev/null @@ -1,8 +0,0 @@ -21c21 -< #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML ---- -> #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || __NVCOMPILER_LLVM__ -393c393 -< #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML ---- -> #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || __NVCOMPILER_LLVM__ diff --git a/include/complex_quda.h b/include/complex_quda.h index 18da63def5..1481dec40f 100644 --- a/include/complex_quda.h +++ b/include/complex_quda.h @@ -360,23 +360,19 @@ struct complex typedef ValueType value_type; // Constructors - __host__ __device__ inline complex(const ValueType &re = ValueType(), const ValueType &im = ValueType()) + __host__ __device__ inline complex(const ValueType &re = ValueType(), const ValueType &im = ValueType()) { real(re); imag(im); } - template - __host__ __device__ - inline complex(const complex & z) + template __host__ __device__ inline complex(const complex &z) { real(z.real()); imag(z.imag()); } - template - __host__ __device__ - inline complex(const std::complex & z) + template __host__ __device__ inline
complex(const std::complex &z) { real(z.real()); imag(z.imag()); @@ -436,12 +432,11 @@ struct complex template <> struct complex : public float2 { public: typedef float value_type; - complex() = default; - constexpr complex(const float &re, const float &im = float()) : float2 {re, im} { } + complex() = default; + constexpr complex(const float &re, const float &im = float()) : float2 {re, im} { } template - constexpr complex(const std::complex &z) : - float2 {static_cast(z.real()), static_cast(z.imag())} + constexpr complex(const std::complex &z) : float2 {static_cast(z.real()), static_cast(z.imag())} { } @@ -500,16 +495,15 @@ template <> struct complex : public float2 { template <> struct complex : public double2 { public: typedef double value_type; - complex() = default; - constexpr complex(const double &re, const double &im = double()) : double2 {re, im} { } + complex() = default; + constexpr complex(const double &re, const double &im = double()) : double2 {re, im} { } template - constexpr complex(const std::complex &z) : - double2 {static_cast(z.real()), static_cast(z.imag())} + constexpr complex(const std::complex &z) : double2 {static_cast(z.real()), static_cast(z.imag())} { } - template __host__ __device__ inline complex &operator=(const complex &z) + template __host__ __device__ inline complex &operator=(const complex &z) { real(z.real()); imag(z.imag()); @@ -572,9 +566,9 @@ template <> struct complex : public char2 { public: typedef int8_t value_type; - complex() = default; + complex() = default; - constexpr complex(const int8_t &re, const int8_t &im = int8_t()) : char2 {re, im} { } + constexpr complex(const int8_t &re, const int8_t &im = int8_t()) : char2 {re, im} { } __host__ __device__ inline complex &operator+=(const complex &z) { @@ -608,9 +602,9 @@ struct complex : public short2 public: typedef short value_type; - complex() = default; + complex() = default; - constexpr complex(const short &re, const short &im = short()) : short2 {re, im} { } + constexpr complex(const short &re, const short &im = short()) : short2 {re, im} { } __host__ __device__ inline complex &operator+=(const complex &z) { @@ -644,9 +638,9 @@ struct complex : public int2 public: typedef int value_type; - complex() = default; + complex() = default; - constexpr complex(const int &re, const int &im = int()) : int2 {re, im} { } + constexpr complex(const int &re, const int &im = int()) : int2 {re, im} { } __host__ __device__ inline complex &operator+=(const complex &z) { diff --git a/include/dslash.h b/include/dslash.h index 4b1aada52e..80638b494e 100644 --- a/include/dslash.h +++ b/include/dslash.h @@ -67,6 +67,10 @@ namespace quda if (arg.xpay) strcat(aux_base, ",xpay"); if (arg.dagger) strcat(aux_base, ",dagger"); setRHSstring(aux_base, in.size()); + strcat(aux_base, ",n_rhs_tile="); + char tile_str[16]; + i32toa(tile_str, Arg::n_src_tile); + strcat(aux_base, tile_str); } /** @@ -329,7 +333,13 @@ namespace quda Dslash(Arg &arg, cvector_ref &out, cvector_ref &in, const ColorSpinorField &halo, const std::string &app_base = "") : - TunableKernel3D(in[0], halo.X(4), arg.nParity), arg(arg), out(out), in(in), halo(halo), nDimComms(4), dslashParam(arg) + TunableKernel3D(in[0], (halo.X(4) + Arg::n_src_tile - 1) / Arg::n_src_tile, arg.nParity), + arg(arg), + out(out), + in(in), + halo(halo), + nDimComms(4), + dslashParam(arg) { if (checkLocation(out, in) == QUDA_CPU_FIELD_LOCATION) errorQuda("CPU Fields not supported in Dslash framework yet"); diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh index 
df176549fc..0e49e64b68 100644 --- a/include/dslash_helper.cuh +++ b/include/dslash_helper.cuh @@ -12,12 +12,7 @@ #include #include -#if defined(_NVHPC_CUDA) -#include -constexpr quda::use_kernel_arg_p use_kernel_arg = quda::use_kernel_arg_p::FALSE; -#else constexpr quda::use_kernel_arg_p use_kernel_arg = quda::use_kernel_arg_p::TRUE; -#endif #include @@ -241,11 +236,12 @@ namespace quda return true; } - template struct DslashArg { + template struct DslashArg { using Float = Float_; using real = typename mapper::type; static constexpr int nDim = nDim_; + static constexpr int n_src_tile = n_src_tile_; // how many RHS per thread const int parity; // only use this for single parity fields const int nParity; // number of parities we're working on @@ -269,6 +265,7 @@ namespace quda int threadDimMapLower[4]; int threadDimMapUpper[4]; + int_fastdiv n_src; int_fastdiv Ls; // these are set with symmetric preconditioned twisted-mass dagger @@ -327,6 +324,7 @@ namespace quda exterior_threads(0), threadDimMapLower {}, threadDimMapUpper {}, + n_src(in.size()), Ls(halo.X(4) / in.size()), twist_a(0.0), twist_b(0.0), @@ -650,8 +648,9 @@ namespace quda Arg arg; dslash_functor_arg(const Arg &arg, unsigned int threads_x) : - kernel_param(dim3(threads_x, arg.dc.Ls, arg.nParity)), - arg(arg) { } + kernel_param(dim3(threads_x, (arg.dc.Ls + Arg::n_src_tile - 1) / Arg::n_src_tile, arg.nParity)), arg(arg) + { + } }; /** diff --git a/include/eigen_helper.h b/include/eigen_helper.h index 8c1bf012a8..373f9ed865 100644 --- a/include/eigen_helper.h +++ b/include/eigen_helper.h @@ -5,10 +5,6 @@ #define EIGEN_USE_BLAS #endif -#if defined(__NVCOMPILER) // WAR for nvc++ until we update to latest Eigen -#define EIGEN_DONT_VECTORIZE -#endif - #include // hide annoying warning diff --git a/include/field_cache.h b/include/field_cache.h index 5e05f9acf6..4def6faa3a 100644 --- a/include/field_cache.h +++ b/include/field_cache.h @@ -15,7 +15,7 @@ namespace quda { */ template struct FieldKey { - std::string volume; /** volume kstring */ + std::string volume; /** volume string */ std::string aux; /** auxiliary string */ FieldKey() = default; @@ -78,6 +78,18 @@ namespace quda { */ FieldTmp(const FieldKey &key, const typename T::param_type &param); + /** + @brief Create a field temporary that corresponds to the field + constructed from the param struct. If a matching field is + present in the cache, it will be popped from the cache. If no + such temporary exists, a temporary will be allocated. + @param[in] param Parameter structure used to allocate + the temporary + */ + FieldTmp(typename T::param_type param); + /** + @brief Copy constructor is deleted to prevent accidental cache bloat @@ -111,6 +123,18 @@ namespace quda { */ template auto getFieldTmp(const T &a) { return FieldTmp(a); } + /** + @brief Get a field temporary that corresponds to the field + defined by the param struct argument. If a matching field is present in the cache, + it will be popped from the cache. If no such temporary exists, a + temporary will be allocated. When the destructor for the + FieldTmp is called, e.g., when the returned object goes out of scope, + the temporary will be pushed onto the cache. + + @param[in] param Parameter struct describing the temporary we require + */ + template auto getFieldTmp(const typename T::param_type &param) { return FieldTmp(param); } + /** @brief Get a vector of field temporaries that are identical to the vector instance argument.
If enough matching fields are diff --git a/include/kernels/dslash_pack.cuh b/include/kernels/dslash_pack.cuh index 94e0465238..a6a5b3c1d5 100644 --- a/include/kernels/dslash_pack.cuh +++ b/include/kernels/dslash_pack.cuh @@ -11,8 +11,10 @@ namespace quda { int *getPackComms(); - template + constexpr int pack_tile_size = 1; + + template struct PackArg : kernel_param<> { typedef Float_ Float; @@ -24,6 +26,7 @@ namespace quda static constexpr bool dagger = dagger_; static constexpr int twist = twist_; // whether we are doing preconditioned twisted-mass or not (1 - singlet, 2 - doublet) static constexpr QudaPCType pc_type = pc_type_; // preconditioning type (4-d or 5-d) + static constexpr int n_src_tile = n_src_tile_; static constexpr bool spinor_direct_load = false; // false means texture load @@ -54,6 +57,7 @@ namespace quda int sites_per_block; + int_fastdiv n_src; int_fastdiv Ls; char *packBuffer[4 * QUDA_MAX_DIM]; @@ -96,6 +100,7 @@ namespace quda threadDimMapLower {}, threadDimMapUpper {}, sites_per_block((work_items + grid - 1) / grid), + n_src(in.size()), Ls(halo.X(4) / in.size()) #ifdef NVSHMEM_COMMS , @@ -131,7 +136,7 @@ namespace quda } }; - template + template __device__ __host__ inline void pack(const Arg &arg, int ghost_idx, int s, int parity, int src_idx) { typedef typename mapper::type real; @@ -160,46 +165,52 @@ namespace quda int idx = indexFromFaceIndex(ghost_idx, parity, arg); constexpr int proj_dir = dagger ? +1 : -1; - Vector f = arg.in[src_idx](idx + s * arg.dc.volume_4d_cb, spinor_parity); - if (twist == 1) { - f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); - } else if (twist == 2) { - Vector f1 = arg.in[src_idx](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor - if (s == 0) - f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); - else - f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); - } - if (arg.spin_project) { - arg.halo_pack.Ghost(dim, 0, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) - = f.project(dim, proj_dir); - } else { - arg.halo_pack.Ghost(dim, 0, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx + s * arg.dc.volume_4d_cb, spinor_parity); + if (twist == 1) { + f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); + } else if (twist == 2) { + Vector f1 = arg.in[src](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor + if (s == 0) + f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); + else + f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); + } + if (arg.spin_project) { + arg.halo_pack.Ghost(dim, 0, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) + = f.project(dim, proj_dir); + } else { + arg.halo_pack.Ghost(dim, 0, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } } else { // forwards int idx = indexFromFaceIndex(ghost_idx, parity, arg); constexpr int proj_dir = dagger ? 
-1 : +1; - Vector f = arg.in[src_idx](idx + s * arg.dc.volume_4d_cb, spinor_parity); - if (twist == 1) { - f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); - } else if (twist == 2) { - Vector f1 = arg.in[src_idx](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor - if (s == 0) - f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); - else - f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); - } - if (arg.spin_project) { - arg.halo_pack.Ghost(dim, 1, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) - = f.project(dim, proj_dir); - } else { - arg.halo_pack.Ghost(dim, 1, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx + s * arg.dc.volume_4d_cb, spinor_parity); + if (twist == 1) { + f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); + } else if (twist == 2) { + Vector f1 = arg.in[src](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor + if (s == 0) + f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); + else + f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); + } + if (arg.spin_project) { + arg.halo_pack.Ghost(dim, 1, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) + = f.project(dim, proj_dir); + } else { + arg.halo_pack.Ghost(dim, 1, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } } } - template + template __device__ __host__ inline void packStaggered(const Arg &arg, int ghost_idx, int parity, int src_idx) { typedef typename mapper::type real; @@ -217,12 +228,18 @@ namespace quda if (face_num == 0) { // backwards int idx = indexFromFaceIndexStaggered<4, QUDA_4D_PC, dim, nFace, 0>(ghost_idx, parity, arg); - Vector f = arg.in[src_idx](idx, spinor_parity); - arg.halo_pack.Ghost(dim, 0, ghost_idx + src_idx * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx, spinor_parity); + arg.halo_pack.Ghost(dim, 0, ghost_idx + src * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } else { // forwards int idx = indexFromFaceIndexStaggered<4, QUDA_4D_PC, dim, nFace, 1>(ghost_idx, parity, arg); - Vector f = arg.in[src_idx](idx, spinor_parity); - arg.halo_pack.Ghost(dim, 1, ghost_idx + src_idx * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx, spinor_parity); + arg.halo_pack.Ghost(dim, 1, ghost_idx + src * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } } @@ -250,24 +267,36 @@ namespace quda if (Arg::pc_type == QUDA_5D_PC) { // 5-d checkerboarded, include s (not ghostFaceCB since both faces) switch (dim) { case 0: - pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, + parity, src_idx); break; case 1: - pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, + parity, src_idx); break; case 2: - pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, + parity, src_idx); break; case 3: - pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, + parity, src_idx); break; } } 
else { // 4-d checkerboarding, keeping s separate (if it exists) switch (dim) { - case 0: pack(arg, ghost_idx, s, parity, src_idx); break; - case 1: pack(arg, ghost_idx, s, parity, src_idx); break; - case 2: pack(arg, ghost_idx, s, parity, src_idx); break; - case 3: pack(arg, ghost_idx, s, parity, src_idx); break; + case 0: + pack(arg, ghost_idx, s, parity, src_idx); + break; + case 1: + pack(arg, ghost_idx, s, parity, src_idx); + break; + case 2: + pack(arg, ghost_idx, s, parity, src_idx); + break; + case 3: + pack(arg, ghost_idx, s, parity, src_idx); + break; } } @@ -288,7 +317,8 @@ namespace quda // 64 - use uber kernel (merge exterior) template struct packShmem { - template __device__ __forceinline__ void operator()(const Arg &arg, int src_s, int parity) + template + __device__ __forceinline__ void operator()(const Arg &arg, int src_s, int parity) { // (active_dims * 2 + dir) * blocks_per_dir + local_block_idx int local_block_idx = target::block_idx().x % arg.blocks_per_dir; @@ -315,9 +345,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[0]) { int ghost_idx = dir * arg.dc.ghostFaceCB[0] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -325,9 +355,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[1]) { int ghost_idx = dir * arg.dc.ghostFaceCB[1] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -335,9 +365,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[2]) { int ghost_idx = dir * arg.dc.ghostFaceCB[2] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -345,9 +375,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[3]) { int ghost_idx = dir * arg.dc.ghostFaceCB[3] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -358,12 +388,19 @@ namespace quda #endif } - __device__ __forceinline__ void operator()(const Arg &arg, int s, int parity, int twist_pack) + template + __device__ __forceinline__ void operator()(const Arg &arg, int src_s_block_, int parity, int twist_pack) { - switch (twist_pack) { - case 0: this->operator()<0>(arg, s, parity); break; - case 1: this->operator()<1>(arg, s, parity); break; - case 2: this->operator()<2>(arg, s, parity); break; + int src_s_block = MAX_MULTI_RHS == 1 ? 
0 : src_s_block_; + int src_s_idx = src_s_block * Arg::n_src_tile; + if (src_s_idx + n_src_tile * arg.Ls <= arg.n_src * arg.Ls) { + switch (twist_pack) { + case 0: this->operator()<0, n_src_tile>(arg, src_s_idx, parity); break; + case 1: this->operator()<1, n_src_tile>(arg, src_s_idx, parity); break; + case 2: this->operator()<2, n_src_tile>(arg, src_s_idx, parity); break; + } + } else if constexpr (n_src_tile - 1 > 0) { + operator()(arg, src_s_block, parity); } } }; @@ -377,7 +414,7 @@ namespace quda { if (arg.nParity == 1) parity = arg.parity; packShmem pack; - pack.operator()(arg, s, parity); + pack.operator()(arg, s, parity, Arg::twist); } }; @@ -400,17 +437,17 @@ namespace quda if (arg.nFace == 1) { switch (dim) { - case 0: packStaggered<0, 1>(arg, ghost_idx, parity, src_idx); break; - case 1: packStaggered<1, 1>(arg, ghost_idx, parity, src_idx); break; - case 2: packStaggered<2, 1>(arg, ghost_idx, parity, src_idx); break; - case 3: packStaggered<3, 1>(arg, ghost_idx, parity, src_idx); break; + case 0: packStaggered<0, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 1: packStaggered<1, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 2: packStaggered<2, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 3: packStaggered<3, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; } } else if (arg.nFace == 3) { switch (dim) { - case 0: packStaggered<0, 3>(arg, ghost_idx, parity, src_idx); break; - case 1: packStaggered<1, 3>(arg, ghost_idx, parity, src_idx); break; - case 2: packStaggered<2, 3>(arg, ghost_idx, parity, src_idx); break; - case 3: packStaggered<3, 3>(arg, ghost_idx, parity, src_idx); break; + case 0: packStaggered<0, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 1: packStaggered<1, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 2: packStaggered<2, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 3: packStaggered<3, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; } } @@ -422,7 +459,7 @@ namespace quda template struct packStaggeredShmem { - __device__ __forceinline__ void operator()(const Arg &arg, int src_idx, int parity, int = 0) + template __device__ __forceinline__ void apply(const Arg &arg, int src_idx, int parity) { // (active_dims * 2 + dir) * blocks_per_dir + local_block_idx int local_block_idx = target::block_idx().x % arg.blocks_per_dir; @@ -446,9 +483,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[0]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[0] + local_tid; if (arg.nFace == 1) - packStaggered<0, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<0, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<0, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<0, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -456,9 +493,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[1]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[1] + local_tid; if (arg.nFace == 1) - packStaggered<1, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<1, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<1, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<1, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -466,9 +503,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[2]) { int ghost_idx = dir * 
arg.nFace * arg.dc.ghostFaceCB[2] + local_tid; if (arg.nFace == 1) - packStaggered<2, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<2, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<2, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<2, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -476,9 +513,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[3]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[3] + local_tid; if (arg.nFace == 1) - packStaggered<3, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<3, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<3, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<3, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -488,6 +525,18 @@ namespace quda if (arg.shmem) shmem_signal(dim, dir, arg); #endif } + + template + __device__ __forceinline__ void operator()(const Arg &arg, int src_idx_block_, int parity, int = 0) + { + int src_idx_block = MAX_MULTI_RHS == 1 ? 0 : src_idx_block_; + int src_idx = src_idx_block * Arg::n_src_tile; + if (src_idx + n_src_tile <= arg.n_src) { + apply(arg, src_idx, parity); + } else if constexpr (n_src_tile - 1 > 0) { + operator()(arg, src_idx_block, parity); + } + } }; template struct pack_staggered_shmem { diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh index 7eafa4a687..ec09ab1622 100644 --- a/include/kernels/dslash_staggered.cuh +++ b/include/kernels/dslash_staggered.cuh @@ -14,9 +14,9 @@ namespace quda /** @brief Parameter structure for driving the Staggered Dslash operator */ - template - struct StaggeredArg : DslashArg { + template + struct StaggeredArg : DslashArg { typedef typename mapper::type real; static constexpr int nColor = nColor_; static constexpr int nSpin = 1; @@ -37,7 +37,6 @@ namespace quda using GL = typename gauge_mapper::type; - const int_fastdiv n_src; F out[MAX_MULTI_RHS]; /** output vector field */ F in[MAX_MULTI_RHS]; /** input vector field */ const Ghost halo_pack; /** accessor for writing the halo */ @@ -57,8 +56,8 @@ namespace quda StaggeredArg(cvector_ref &out, cvector_ref &in, const ColorSpinorField &halo, const GaugeField &U, const GaugeField &L, double a, cvector_ref &x, int parity, bool dagger, const int *comm_override) : - DslashArg(out, in, halo, U, x, parity, dagger, a == 0.0 ? false : true, improved_ ? 3 : 1, - spin_project, comm_override), + DslashArg(out, in, halo, U, x, parity, dagger, a == 0.0 ? false : true, + improved_ ? 3 : 1, spin_project, comm_override), halo_pack(halo, improved_ ? 3 : 1), halo(halo, improved_ ? 3 : 1), U(U), @@ -87,9 +86,9 @@ namespace quda @param[in] parity The site parity @param[in] x_cb The checkerboarded site index */ - template - __device__ __host__ inline void applyStaggered(Vector &out, const Arg &arg, Coord &coord, int parity, int, - int thread_dim, bool &active, int src_idx) + template + __device__ __host__ inline void applyStaggered(array &out, const Arg &arg, Coord &coord, + int parity, int, int thread_dim, bool &active, int src_idx) { typedef typename mapper::type real; typedef Matrix, Arg::nColor> Link; @@ -104,13 +103,20 @@ namespace quda if (doHalo(d) && ghost) { const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dim, d, 1); const Link U = arg.improved ? 
arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)); - Vector in = arg.halo.Ghost(d, 1, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(U, in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector in + = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(U, in, out[s]); + } } else if (doBulk() && !ghost) { const int fwd_idx = linkIndexP1(coord, arg.dim, d); const Link U = arg.improved ? arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)); - Vector in = arg.in[src_idx](fwd_idx, their_spinor_parity); - out = mv_add(U, in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector in = arg.in[src_idx + s](fwd_idx, their_spinor_parity); + out[s] = mv_add(U, in, out[s]); + } } } @@ -120,14 +126,20 @@ namespace quda if (doHalo(d) && ghost) { const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dim, d, arg.nFace); const Link L = arg.L(d, coord.x_cb, parity); - const Vector in - = arg.halo.Ghost(d, 1, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(L, in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + const Vector in + = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(L, in, out[s]); + } } else if (doBulk() && !ghost) { const int fwd3_idx = linkIndexP3(coord, arg.dim, d); const Link L = arg.L(d, coord.x_cb, parity); - const Vector in = arg.in[src_idx](fwd3_idx, their_spinor_parity); - out = mv_add(L, in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + const Vector in = arg.in[src_idx + s](fwd3_idx, their_spinor_parity); + out[s] = mv_add(L, in, out[s]); + } } } @@ -140,15 +152,22 @@ namespace quda const int ghost_idx = arg.improved ? ghostFaceIndexStaggered<0>(coord, arg.dim, d, 3) : ghost_idx2; const Link U = arg.improved ? arg.U.Ghost(d, ghost_idx2, 1 - parity) : arg.U.Ghost(d, ghost_idx2, 1 - parity, StaggeredPhase(coord, d, -1, arg)); - Vector in = arg.halo.Ghost(d, 0, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(conj(U), -in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector in + = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(conj(U), -in, out[s]); + } } else if (doBulk() && !ghost) { const int back_idx = linkIndexM1(coord, arg.dim, d); const int gauge_idx = back_idx; const Link U = arg.improved ? 
arg.U(d, gauge_idx, 1 - parity) : arg.U(d, gauge_idx, 1 - parity, StaggeredPhase(coord, d, -1, arg)); - Vector in = arg.in[src_idx](back_idx, their_spinor_parity); - out = mv_add(conj(U), -in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector in = arg.in[src_idx + s](back_idx, their_spinor_parity); + out[s] = mv_add(conj(U), -in, out[s]); + } } } @@ -159,15 +178,21 @@ namespace quda // when updating replace arg.nFace with 1 here const int ghost_idx = ghostFaceIndexStaggered<0>(coord, arg.dim, d, 1); const Link L = arg.L.Ghost(d, ghost_idx, 1 - parity); - const Vector in - = arg.halo.Ghost(d, 0, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(conj(L), -in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + const Vector in + = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(conj(L), -in, out[s]); + } } else if (doBulk() && !ghost) { const int back3_idx = linkIndexM3(coord, arg.dim, d); const int gauge_idx = back3_idx; const Link L = arg.L(d, gauge_idx, 1 - parity); - const Vector in = arg.in[src_idx](back3_idx, their_spinor_parity); - out = mv_add(conj(L), -in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + const Vector in = arg.in[src_idx + s](back3_idx, their_spinor_parity); + out[s] = mv_add(conj(L), -in, out[s]); + } } } } // nDim @@ -181,8 +206,8 @@ namespace quda constexpr staggered(const Arg &arg) : arg(arg) {} static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation - template - __device__ __host__ __forceinline__ void operator()(int idx, int src_idx, int parity) + template + __device__ __host__ __forceinline__ void apply(int idx, int src_idx, int parity) { using real = typename mapper::type; using Vector = ColorSpinor; @@ -195,20 +220,41 @@ namespace quda const int my_spinor_parity = nParity == 2 ? parity : 0; - Vector out; + array out; + applyStaggered(out, arg, coord, parity, idx, thread_dim, active, src_idx); - applyStaggered(out, arg, coord, parity, idx, thread_dim, active, src_idx); - - out *= arg.dagger_scale; +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) out[s] *= arg.dagger_scale; if (xpay && mykernel_type == INTERIOR_KERNEL) { - Vector x = arg.x[src_idx](coord.x_cb, my_spinor_parity); - out = arg.a * x - out; +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector x = arg.x[src_idx + s](coord.x_cb, my_spinor_parity); + out[s] = arg.a * x - out[s]; + } } else if (mykernel_type != INTERIOR_KERNEL) { - Vector x = arg.out[src_idx](coord.x_cb, my_spinor_parity); - out = x + (xpay ? -out : out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector x = arg.out[src_idx + s](coord.x_cb, my_spinor_parity); + out[s] = x + (xpay ? -out[s] : out[s]); + } + } + if (mykernel_type != EXTERIOR_KERNEL_ALL || active) { +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { arg.out[src_idx + s](coord.x_cb, my_spinor_parity) = out[s]; } + } + } + + template + __device__ __host__ __forceinline__ void operator()(int idx, int src_idx_block_, int parity) + { + int src_idx_block = MAX_MULTI_RHS == 1 ? 
0 : src_idx_block_; + int src_idx = src_idx_block * Arg::n_src_tile; + if (src_idx + n_src_tile <= arg.n_src) { + apply(idx, src_idx, parity); + } else if constexpr (n_src_tile - 1 > 0) { + operator()(idx, src_idx_block, parity); } - if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](coord.x_cb, my_spinor_parity) = out; } }; diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh index 514c4df957..584999d52f 100644 --- a/include/kernels/dslash_wilson.cuh +++ b/include/kernels/dslash_wilson.cuh @@ -34,7 +34,6 @@ namespace quda typedef typename mapper::type real; - const int_fastdiv n_src; F out[MAX_MULTI_RHS]; /** output vector field set */ F in[MAX_MULTI_RHS]; /** input vector field set */ F x[MAX_MULTI_RHS]; /** input vector set when doing xpay */ diff --git a/include/kernels/laplace.cuh b/include/kernels/laplace.cuh index b8797e722e..a3c2b1b377 100644 --- a/include/kernels/laplace.cuh +++ b/include/kernels/laplace.cuh @@ -143,8 +143,9 @@ namespace quda static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation template - __device__ __host__ __forceinline__ void operator()(int idx, int src_idx, int parity) + __device__ __host__ __forceinline__ void operator()(int idx, int src_idx_, int parity) { + int src_idx = MAX_MULTI_RHS == 1 ? 0 : src_idx_; using real = typename mapper::type; using Vector = ColorSpinor; diff --git a/include/kernels/staggered_quark_smearing.cuh b/include/kernels/staggered_quark_smearing.cuh index 22323e66a2..85e1790318 100644 --- a/include/kernels/staggered_quark_smearing.cuh +++ b/include/kernels/staggered_quark_smearing.cuh @@ -179,9 +179,9 @@ namespace quda if (arg.is_t0_kernel) { if (arg.t0 < 0) return; - if (mykernel_type == INTERIOR_KERNEL) { + if constexpr (mykernel_type == INTERIOR_KERNEL) { idx += arg.t0 * arg.t0_offset; - } else if (mykernel_type != EXTERIOR_KERNEL_ALL) { + } else if constexpr (mykernel_type != EXTERIOR_KERNEL_ALL) { if (idx >= arg.t0_face_size[mykernel_type]) idx += arg.face_size[mykernel_type] - arg.t0_face_size[mykernel_type]; idx += arg.t0 * arg.t0_face_offset[mykernel_type]; diff --git a/include/quda_define.h.in b/include/quda_define.h.in index ea6657120f..3e572b70bd 100644 --- a/include/quda_define.h.in +++ b/include/quda_define.h.in @@ -25,6 +25,12 @@ */ #define MAX_MULTI_RHS @QUDA_MAX_MULTI_RHS@ +/** + * @def MAX_MULTI_RHS_TILE + * @brief This macro sets the register tile size for MRHS kernels + */ +#define MAX_MULTI_RHS_TILE @QUDA_MAX_MULTI_RHS_TILE@ + #cmakedefine QUDA_HETEROGENEOUS_ATOMIC #ifdef QUDA_HETEROGENEOUS_ATOMIC /** @@ -36,6 +42,16 @@ */ #undef QUDA_HETEROGENEOUS_ATOMIC #endif +#cmakedefine QUDA_HETEROGENEOUS_ATOMIC_INF_INIT +#ifdef QUDA_HETEROGENEOUS_ATOMIC_INF_INIT +/** + * @def HETEROGENEOUS_ATOMIC_INF_INIT + * @brief This macro sets whether we are using infinity for the signaling sentinel + */ +#define HETEROGENEOUS_ATOMIC_INF_INIT +#undef QUDA_HETEROGENEOUS_ATOMIC_INF_INIT +#endif + #cmakedefine QUDA_LARGE_KERNEL_ARG #cmakedefine QUDA_DIRAC_CLOVER_HASENBUSCH diff --git a/include/targets/cuda/reduce_helper.h b/include/targets/cuda/reduce_helper.h index c8c6101892..aceb88c83d 100644 --- a/include/targets/cuda/reduce_helper.h +++ b/include/targets/cuda/reduce_helper.h @@ -17,6 +17,11 @@ using count_t = cuda::atomic; namespace quda { +// By default we use negative infinity as the sentinel for testing for
On some compilers we may need to use finite +// numbers, so the alternative approach uses negative zero (selected by disabling the CMake +// option QUDA_HETEROGENEOUS_ATOMIC_INF_INIT). +#ifdef HETEROGENEOUS_ATOMIC_INF_INIT /** @brief The initialization value we used to check for completion */ @@ -28,6 +33,31 @@ */ template constexpr T terminate_value() { return cuda::std::numeric_limits::infinity(); } + /** + @brief Test if the result is complete (i.e., is not equal to the sentinel) + */ + template bool is_complete(const T &result) { return result != init_value(); } +#else + /** + @brief The initialization value we used to check for completion + */ + template constexpr T init_value() { return -static_cast(0.0); } + + /** + @brief The termination value we use to prevent a possible hang in + case the computed reduction is equal to the initialization + */ + template constexpr T terminate_value() { return static_cast(0.0); } + + /** + @brief Test if the result is complete (i.e., is not equal to the sentinel) + */ + template bool is_complete(const T &result) + { + return !(result == static_cast(0.0) && std::signbit(result)); + } +#endif + // declaration of reduce function template __device__ inline void reduce(Arg &arg, const Reducer &r, const T &in, const int idx = 0); @@ -129,7 +159,7 @@ if (consumed) errorQuda("Cannot call complete more than once for each construction"); for (int i = 0; i < n_reduce * n_item; i++) { - while (result_h[i].load(cuda::std::memory_order_relaxed) == init_value()) { } + while (!is_complete(result_h[i].load(cuda::std::memory_order_relaxed))) { } } // copy back result element by element and convert if necessary to host reduce type @@ -169,11 +199,7 @@ auto tid = target::thread_idx_linear<2>(); if (arg.result_d) { // write to host mapped memory -#ifdef _NVHPC_CUDA // WAR for nvc++ - constexpr bool coalesced_write = false; -#else constexpr bool coalesced_write = true; -#endif if constexpr (coalesced_write) { static_assert(n <= device::warp_size(), "reduction array is greater than warp size"); auto mask = __ballot_sync(0xffffffff, tid < n); diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h deleted file mode 100644 index 1c4d7f3244..0000000000 --- a/include/targets/cuda/thread_array.h +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#ifndef _NVHPC_CUDA - -#include "../generic/thread_array.h" - -#else - -#include - -namespace quda -{ - template struct thread_array : array { - static constexpr unsigned int shared_mem_size(dim3 block) { return 0; } - }; -} // namespace quda - -#endif diff --git a/include/targets/hip/thread_array.h b/include/targets/hip/thread_array.h deleted file mode 100644 index 24f986cc26..0000000000 --- a/include/targets/hip/thread_array.h +++ /dev/null @@ -1 +0,0 @@ -#include "../generic/thread_array.h" diff --git a/lib/color_spinor_util.in.cu b/lib/color_spinor_util.in.cu index a268a4b16a..71664614cf 100644 --- a/lib/color_spinor_util.in.cu +++ b/lib/color_spinor_util.in.cu @@ -378,8 +378,6 @@ namespace quda { void genericPrintVector(const ColorSpinorField &a, int parity, unsigned int x_cb, int rank) { - if (rank != comm_rank()) return; - ColorSpinorParam param(a); param.location = QUDA_CPU_FIELD_LOCATION; param.create = QUDA_COPY_FIELD_CREATE; @@ -388,6 +386,8 @@ namespace quda { std::unique_ptr clone_a = !host_clone ? nullptr : std::make_unique(param); const ColorSpinorField &a_ = !host_clone ?
a : *clone_a.get(); + if (rank != comm_rank()) return; // non-matching ranks return only after the potential copy to host, to prevent a tuning hang + switch (a.Precision()) { case QUDA_DOUBLE_PRECISION: genericPrintVector(a_, parity, x_cb); break; case QUDA_SINGLE_PRECISION: genericPrintVector(a_, parity, x_cb); break; diff --git a/lib/dslash_coarse.in.cpp b/lib/dslash_coarse.in.cpp index 747f2e9958..52298ceb14 100644 --- a/lib/dslash_coarse.in.cpp +++ b/lib/dslash_coarse.in.cpp @@ -88,7 +88,7 @@ namespace quda param.nVec = nVec; param.create = QUDA_NULL_FIELD_CREATE; param.fieldOrder = order; - return ColorSpinorField(param); + return getFieldTmp(param); } // Apply the coarse Dirac matrix to a coarse grid vector @@ -107,9 +107,9 @@ IntList<@QUDA_MULTIGRID_NVEC_LIST@>()); } else { constexpr QudaFieldOrder csOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - ColorSpinorField v_inA = create_color_spinor_copy(inA, csOrder); - ColorSpinorField v_inB = create_color_spinor_copy(inB, csOrder); - ColorSpinorField v_out = create_color_spinor_copy(out, csOrder); + auto v_inA = create_color_spinor_copy(inA, csOrder); + auto v_inB = create_color_spinor_copy(inB, csOrder); + auto v_out = create_color_spinor_copy(out, csOrder); if (dslash) { BlockTransposeForward(v_inA, inA); } if (clover) { BlockTransposeForward(v_inB, inB); } diff --git a/lib/dslash_pack2.cu b/lib/dslash_pack2.cu index 043ea65abc..01b70f3284 100644 --- a/lib/dslash_pack2.cu +++ b/lib/dslash_pack2.cu @@ -139,6 +139,10 @@ namespace quda strcpy(aux, "policy_kernel,"); strcat(aux, in.AuxString().c_str()); setRHSstring(aux, in.size()); + strcat(aux, ",n_rhs_tile="); + char tile_str[16]; + i32toa(tile_str, pack_tile_size); + strcat(aux, tile_str); char comm[5]; for (int i = 0; i < 4; i++) comm[i] = (comm_dim_pack[i] ? '1' : '0'); comm[4] = '\0'; diff --git a/lib/field_cache.cpp b/lib/field_cache.cpp index 5ea09e3059..7283bf39d0 100644 --- a/lib/field_cache.cpp +++ b/lib/field_cache.cpp @@ -35,6 +35,21 @@ namespace quda { } } + template FieldTmp::FieldTmp(typename T::param_type param) + { + param.create = QUDA_REFERENCE_FIELD_CREATE; + key = FieldKey(T(param)); + + auto it = cache.find(key); + if (it != cache.end() && it->second.size()) { // found an entry + tmp = std::move(it->second.top()); + it->second.pop(); // pop the defunct object + } else { // no entry found, we must allocate a new field + param.create = QUDA_ZERO_FIELD_CREATE; + tmp = T(param); + } + } + template FieldTmp::~FieldTmp() { // don't cache the field if it's empty (e.g., has been moved) diff --git a/lib/gauge_norm.in.cu b/lib/gauge_norm.in.cu index 08cb5bf2ca..2cc755bc81 100644 --- a/lib/gauge_norm.in.cu +++ b/lib/gauge_norm.in.cu @@ -159,8 +159,6 @@ namespace quda { void genericPrintMatrix(const GaugeField &a, int d, int parity, unsigned int x_cb, int rank) { - if (rank != comm_rank()) return; - GaugeFieldParam param(a); param.field = const_cast(&a); param.location = QUDA_CPU_FIELD_LOCATION; @@ -172,6 +170,8 @@ namespace quda { std::unique_ptr clone_a = !host_clone ? nullptr : std::make_unique(param); const GaugeField &a_ = !host_clone ?
a : *clone_a.get(); + if (rank != comm_rank()) return; // non-matching ranks return only after the potential copy to host, to prevent a tuning hang + switch (a.Precision()) { case QUDA_DOUBLE_PRECISION: genericPrintMatrix(a_, d, parity, x_cb); break; case QUDA_SINGLE_PRECISION: genericPrintMatrix(a_, d, parity, x_cb); break; diff --git a/lib/inv_mr_quda.cpp b/lib/inv_mr_quda.cpp index 297c390c08..a4713a16b5 100644 --- a/lib/inv_mr_quda.cpp +++ b/lib/inv_mr_quda.cpp @@ -82,10 +82,10 @@ namespace quda int iter = 0; int step = 0; - bool converged = false; + bool is_done = false; PrintStats("MR", iter, r2, b2); - while (!converged) { + while (!is_done) { int k = 0; vector scale(b.size(), 1.0); @@ -142,22 +142,23 @@ commGlobalReductionPop(); // renable global reductions for outer solver } + step++; + // FIXME - add over/under relaxation in outer loop bool compute_true_res = param.compute_true_res || param.Nsteps > 1; if (compute_true_res) { mat(r, x); r2 = blas::xmyNorm(b, r); for (auto i = 0u; i < b2.size(); i++) param.true_res[i] = sqrt(r2[i] / b2[i]); - converged = (step > param.Nsteps || r2 < stop); - if (!converged) blas::copy(r_sloppy, r); + is_done = (step >= param.Nsteps || r2 < stop); + if (!is_done) blas::copy(r_sloppy, r); PrintStats("MR (restart)", iter, r2, b2); } else { blas::ax(scale, r_sloppy); r2 = blas::norm2(r_sloppy); - converged = (step > param.Nsteps || r2 < stop); - if (!converged) blas::copy(r, r_sloppy); + is_done = (step >= param.Nsteps || r2 < stop); + if (!is_done) blas::copy(r, r_sloppy); } - step++; } PrintSummary("MR", iter, r2, b2, stopping(param.tol, b2, param.residual_type)); diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake index eb71bb9d53..721edc3d3b 100644 --- a/lib/targets/cuda/target_cuda.cmake +++ b/lib/targets/cuda/target_cuda.cmake @@ -103,8 +103,11 @@ if(CMAKE_CUDA_COMPILER_ID MATCHES "NVIDIA" OR CMAKE_CUDA_COMPILER_ID MATCHES "NV endif() cmake_dependent_option(QUDA_HETEROGENEOUS_ATOMIC "enable heterogeneous atomic support ?" ON "QUDA_HETEROGENEOUS_ATOMIC_SUPPORT" OFF) +cmake_dependent_option(QUDA_HETEROGENEOUS_ATOMIC_INF_INIT "use infinity as the sentinel for heterogeneous atomic support?"
ON + "QUDA_HETEROGENEOUS_ATOMIC_SUPPORT" OFF) mark_as_advanced(QUDA_HETEROGENEOUS_ATOMIC) +mark_as_advanced(QUDA_HETEROGENEOUS_ATOMIC_INF_INIT) mark_as_advanced(QUDA_JITIFY) mark_as_advanced(QUDA_DOWNLOAD_NVSHMEM) mark_as_advanced(QUDA_DOWNLOAD_NVSHMEM_TAR) diff --git a/tests/dslash_test_utils.h b/tests/dslash_test_utils.h index ec4ca5b13a..5512c3d0a2 100644 --- a/tests/dslash_test_utils.h +++ b/tests/dslash_test_utils.h @@ -428,9 +428,8 @@ struct DslashTestWrapper { switch (dtest_type) { case dslash_test_type::Dslash: // My dslash should be the same as the clover dslash - for (int i = 0; i < Nsrc; i++) - clover_dslash(spinorRef[i].data(), hostGauge, hostCloverInv, spinor[i].data(), parity, inv_param.dagger, - inv_param.cpu_prec, gauge_param); + clover_dslash(spinorRef[i].data(), hostGauge, hostCloverInv, spinor[i].data(), parity, inv_param.dagger, + inv_param.cpu_prec, gauge_param); break; case dslash_test_type::MatPC: // my matpc op diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp index 33d773ed95..5da6dc1a7a 100644 --- a/tests/host_reference/hisq_force_reference.cpp +++ b/tests/host_reference/hisq_force_reference.cpp @@ -837,13 +837,13 @@ void computeMiddleLinkSite(int half_lattice_index, // half_lattice_index to bett if (Qprev == NULL) { if (sig_positive) { - ls.loadMatrixFromField(static_cast(oprod), 1 - oddBit, sig, point_d, &colorMatY); + ls.loadMatrixFromField(static_cast(oprod), 1 - oddBit, sig, point_d, &colorMatY); } else { - ls.loadMatrixFromField(static_cast(oprod), oddBit, OPP_DIR(sig), point_c, &colorMatY); + ls.loadMatrixFromField(static_cast(oprod), oddBit, OPP_DIR(sig), point_c, &colorMatY); colorMatY = conj(colorMatY); } } else { // Qprev != NULL - ls.loadMatrixFromField(static_cast(oprod), oddBit, point_c, &colorMatY); + ls.loadMatrixFromField(static_cast(oprod), oddBit, point_c, &colorMatY); } colorMatW = (!mu_positive) ? bc_link * colorMatY : conj(bc_link) * colorMatY; diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp index 7ef2c270b3..1b659eec59 100644 --- a/tests/staggered_invert_test_gtest.hpp +++ b/tests/staggered_invert_test_gtest.hpp @@ -53,6 +53,10 @@ bool skip_test(test_t param) // MR struggles with the staggered and asqtad spectrum, it's not MR's fault if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_MR_INVERTER) return true; + + // BiCGStab-L is unstable with the staggered operator at low precision + // we could likely restore this when 20-bit quark fields are merged in + if (inverter_type == QUDA_BICGSTABL_INVERTER && prec_sloppy < QUDA_SINGLE_PRECISION) return true; } // CG3 is rather unstable with low precision
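
Note on the MRHS register-tiling idiom introduced above (dslash_staggered.cuh, dslash_pack.cuh, dslash_helper.cuh): each thread now handles a compile-time tile of n_src_tile right-hand sides, the y-dimension of the launch is reduced by the tile factor, and the functor recurses to smaller tile sizes so a partial final tile is still covered. The following is a minimal, self-contained host-side C++ sketch of that dispatch; max_tile, process_tile, and run_block are illustrative stand-ins (for Arg::n_src_tile / MAX_MULTI_RHS_TILE, the kernel body such as applyStaggered, and the functor operator() respectively), not QUDA API.

#include <array>
#include <cstdio>

constexpr int max_tile = 4; // stand-in for Arg::n_src_tile (MAX_MULTI_RHS_TILE)

// Stand-in for the kernel body (e.g. applyStaggered): accumulate n_tile
// right-hand sides in a register-resident array in one pass over the stencil.
template <int n_tile> void process_tile(int src_idx)
{
  std::array<float, n_tile> out {};                // one accumulator per RHS in the tile
  for (int s = 0; s < n_tile; s++) out[s] += 1.0f; // placeholder for the mv_add updates
  std::printf("sources [%d, %d) handled with tile size %d\n", src_idx, src_idx + n_tile, n_tile);
}

// Stand-in for the functor operator(): blocks are spaced by the full tile
// size; if the final block holds fewer than n_tile sources, recurse at
// compile time with a smaller tile until the remainder fits.
template <int n_tile = max_tile> void run_block(int block, int n_src)
{
  int src_idx = block * max_tile; // always indexed by the full tile size
  if (src_idx + n_tile <= n_src) {
    process_tile<n_tile>(src_idx);
  } else if constexpr (n_tile - 1 > 0) {
    run_block<n_tile - 1>(block, n_src);
  }
}

int main()
{
  int n_src = 6;                                    // e.g. 6 RHS with max_tile = 4
  int n_blocks = (n_src + max_tile - 1) / max_tile; // mirrors the reduced grid y-dimension
  for (int b = 0; b < n_blocks; b++) run_block(b, n_src); // processes [0,4) then [4,6)
}

The tile size is chosen at build time (e.g. cmake -DQUDA_MAX_MULTI_RHS_TILE=4), and the CMakeLists.txt check above guarantees it never exceeds QUDA_MAX_MULTI_RHS.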