diff --git a/.gitignore b/.gitignore index a60ac7da48..8e122f0a4e 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ include/jitify_options.hpp .tags* autom4te.cache/* .vscode +cmake/CPM_*.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 88c83c5b8f..512a7b9baa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,6 +216,12 @@ if(QUDA_MAX_MULTI_BLAS_N GREATER 32) message(SEND_ERROR "Maximum QUDA_MAX_MULTI_BLAS_N is 32.") endif() +# For now we only support register tiles for the staggered dslash operators +set(QUDA_MAX_MULTI_RHS_TILE "1" CACHE STRING "maximum tile size for MRHS kernels (staggered only)") +if(QUDA_MAX_MULTI_RHS_TILE GREATER QUDA_MAX_MULTI_RHS) + message(SEND_ERROR "QUDA_MAX_MULTI_RHS_TILE is greater than QUDA_MAX_MULTI_RHS") +endif() + set(QUDA_PRECISION "14" CACHE STRING "which precisions to instantiate in QUDA (4-bit number - double, single, half, quarter)") @@ -275,6 +281,7 @@ mark_as_advanced(QUDA_ALTERNATIVE_I_TO_F) mark_as_advanced(QUDA_MAX_MULTI_BLAS_N) mark_as_advanced(QUDA_MAX_MULTI_RHS) +mark_as_advanced(QUDA_MAX_MULTI_RHS_TILE) mark_as_advanced(QUDA_PRECISION) mark_as_advanced(QUDA_RECONSTRUCT) mark_as_advanced(QUDA_CLOVER_CHOLESKY_PROMOTE) @@ -420,21 +427,12 @@ if(QUDA_DOWNLOAD_EIGEN) CPMAddPackage( NAME Eigen VERSION ${QUDA_EIGEN_VERSION} - URL https://gitlab.com/libeigen/eigen/-/archive/${QUDA_EIGEN_VERSION}/eigen-${QUDA_EIGEN_VERSION}.tar.bz2 - URL_HASH SHA256=B4C198460EBA6F28D34894E3A5710998818515104D6E74E5CC331CE31E46E626 + URL https://gitlab.com/libeigen/eigen/-/archive/e67c494cba7180066e73b9f6234d0b2129f1cdf5.tar.bz2 + URL_HASH SHA256=98d244932291506b75c4ae7459af29b1112ea3d2f04660686a925d9ef6634583 DOWNLOAD_ONLY YES SYSTEM YES) target_include_directories(Eigen SYSTEM INTERFACE ${Eigen_SOURCE_DIR}) install(DIRECTORY ${Eigen_SOURCE_DIR}/Eigen TYPE INCLUDE) - - # Eigen 3.4 needs to be patched on Neon with nvc++ - if (${CMAKE_CXX_COMPILER_ID} MATCHES "NVHPC") - set(CMAKE_PATCH_EIGEN OFF CACHE BOOL "Internal use only; do not modify") - if (NOT CMAKE_PATCH_EIGEN) - execute_process(COMMAND patch -N "${Eigen_SOURCE_DIR}/Eigen/src/Core/arch/NEON/Complex.h" "${CMAKE_SOURCE_DIR}/cmake/eigen34_neon.diff") - set(CMAKE_PATCH_EIGEN ON CACHE BOOL "Internal use only; do not modify" FORCE) - endif() - endif() else() # fall back to using find_package find_package(Eigen QUIET) diff --git a/cmake/eigen34_neon.diff b/cmake/eigen34_neon.diff deleted file mode 100644 index 7190ca7fa4..0000000000 --- a/cmake/eigen34_neon.diff +++ /dev/null @@ -1,8 +0,0 @@ -21c21 -< #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML ---- -> #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || __NVCOMPILER_LLVM__ -393c393 -< #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML ---- -> #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || __NVCOMPILER_LLVM__ diff --git a/include/complex_quda.h b/include/complex_quda.h index 18da63def5..1481dec40f 100644 --- a/include/complex_quda.h +++ b/include/complex_quda.h @@ -360,23 +360,19 @@ struct complex typedef ValueType value_type; // Constructors - __host__ __device__ inline complex(const ValueType &re = ValueType(), const ValueType &im = ValueType()) + __host__ __device__ inline complex(const ValueType &re = ValueType(), const ValueType &im = ValueType()) { real(re); imag(im); } - template - __host__ __device__ - inline complex(const complex & z) + template __host__ __device__ inline complex(const complex &z) { real(z.real()); imag(z.imag()); } - template - __host__ __device__ - inline complex(const std::complex & z) + template __host__ __device__ inline
complex(const std::complex &z) { real(z.real()); imag(z.imag()); @@ -436,12 +432,11 @@ struct complex template <> struct complex : public float2 { public: typedef float value_type; - complex() = default; - constexpr complex(const float &re, const float &im = float()) : float2 {re, im} { } + complex() = default; + constexpr complex(const float &re, const float &im = float()) : float2 {re, im} { } template - constexpr complex(const std::complex &z) : - float2 {static_cast(z.real()), static_cast(z.imag())} + constexpr complex(const std::complex &z) : float2 {static_cast(z.real()), static_cast(z.imag())} { } @@ -500,16 +495,15 @@ template <> struct complex : public float2 { template <> struct complex : public double2 { public: typedef double value_type; - complex() = default; - constexpr complex(const double &re, const double &im = double()) : double2 {re, im} { } + complex() = default; + constexpr complex(const double &re, const double &im = double()) : double2 {re, im} { } template - constexpr complex(const std::complex &z) : - double2 {static_cast(z.real()), static_cast(z.imag())} + constexpr complex(const std::complex &z) : double2 {static_cast(z.real()), static_cast(z.imag())} { } - template __host__ __device__ inline complex &operator=(const complex &z) + template __host__ __device__ inline complex &operator=(const complex &z) { real(z.real()); imag(z.imag()); @@ -572,9 +566,9 @@ template <> struct complex : public char2 { public: typedef int8_t value_type; - complex() = default; + complex() = default; - constexpr complex(const int8_t &re, const int8_t &im = int8_t()) : char2 {re, im} { } + constexpr complex(const int8_t &re, const int8_t &im = int8_t()) : char2 {re, im} { } __host__ __device__ inline complex &operator+=(const complex &z) { @@ -608,9 +602,9 @@ struct complex : public short2 public: typedef short value_type; - complex() = default; + complex() = default; - constexpr complex(const short &re, const short &im = short()) : short2 {re, im} { } + constexpr complex(const short &re, const short &im = short()) : short2 {re, im} { } __host__ __device__ inline complex &operator+=(const complex &z) { @@ -644,9 +638,9 @@ struct complex : public int2 public: typedef int value_type; - complex() = default; + complex() = default; - constexpr complex(const int &re, const int &im = int()) : int2 {re, im} { } + constexpr complex(const int &re, const int &im = int()) : int2 {re, im} { } __host__ __device__ inline complex &operator+=(const complex &z) { diff --git a/include/dslash.h b/include/dslash.h index 4b1aada52e..80638b494e 100644 --- a/include/dslash.h +++ b/include/dslash.h @@ -67,6 +67,10 @@ namespace quda if (arg.xpay) strcat(aux_base, ",xpay"); if (arg.dagger) strcat(aux_base, ",dagger"); setRHSstring(aux_base, in.size()); + strcat(aux_base, ",n_rhs_tile="); + char tile_str[16]; + i32toa(tile_str, Arg::n_src_tile); + strcat(aux_base, tile_str); } /** @@ -329,7 +333,13 @@ namespace quda Dslash(Arg &arg, cvector_ref &out, cvector_ref &in, const ColorSpinorField &halo, const std::string &app_base = "") : - TunableKernel3D(in[0], halo.X(4), arg.nParity), arg(arg), out(out), in(in), halo(halo), nDimComms(4), dslashParam(arg) + TunableKernel3D(in[0], (halo.X(4) + Arg::n_src_tile - 1) / Arg::n_src_tile, arg.nParity), + arg(arg), + out(out), + in(in), + halo(halo), + nDimComms(4), + dslashParam(arg) { if (checkLocation(out, in) == QUDA_CPU_FIELD_LOCATION) errorQuda("CPU Fields not supported in Dslash framework yet"); diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh index 
df176549fc..0e49e64b68 100644 --- a/include/dslash_helper.cuh +++ b/include/dslash_helper.cuh @@ -12,12 +12,7 @@ #include #include -#if defined(_NVHPC_CUDA) -#include -constexpr quda::use_kernel_arg_p use_kernel_arg = quda::use_kernel_arg_p::FALSE; -#else constexpr quda::use_kernel_arg_p use_kernel_arg = quda::use_kernel_arg_p::TRUE; -#endif #include @@ -241,11 +236,12 @@ namespace quda return true; } - template struct DslashArg { + template struct DslashArg { using Float = Float_; using real = typename mapper::type; static constexpr int nDim = nDim_; + static constexpr int n_src_tile = n_src_tile_; // how many RHS per thread const int parity; // only use this for single parity fields const int nParity; // number of parities we're working on @@ -269,6 +265,7 @@ namespace quda int threadDimMapLower[4]; int threadDimMapUpper[4]; + int_fastdiv n_src; int_fastdiv Ls; // these are set with symmetric preconditioned twisted-mass dagger @@ -327,6 +324,7 @@ namespace quda exterior_threads(0), threadDimMapLower {}, threadDimMapUpper {}, + n_src(in.size()), Ls(halo.X(4) / in.size()), twist_a(0.0), twist_b(0.0), @@ -650,8 +648,9 @@ namespace quda Arg arg; dslash_functor_arg(const Arg &arg, unsigned int threads_x) : - kernel_param(dim3(threads_x, arg.dc.Ls, arg.nParity)), - arg(arg) { } + kernel_param(dim3(threads_x, (arg.dc.Ls + Arg::n_src_tile - 1) / Arg::n_src_tile, arg.nParity)), arg(arg) + { + } }; /** diff --git a/include/eigen_helper.h b/include/eigen_helper.h index 8c1bf012a8..373f9ed865 100644 --- a/include/eigen_helper.h +++ b/include/eigen_helper.h @@ -5,10 +5,6 @@ #define EIGEN_USE_BLAS #endif -#if defined(__NVCOMPILER) // WAR for nvc++ until we update to latest Eigen -#define EIGEN_DONT_VECTORIZE -#endif - #include // hide annoying warning diff --git a/include/field_cache.h b/include/field_cache.h index 5e05f9acf6..4def6faa3a 100644 --- a/include/field_cache.h +++ b/include/field_cache.h @@ -15,7 +15,7 @@ namespace quda { */ template struct FieldKey { - std::string volume; /** volume kstring */ + std::string volume; /** volume string */ std::string aux; /** auxiliary string */ FieldKey() = default; @@ -78,6 +78,18 @@ namespace quda { */ FieldTmp(const FieldKey &key, const typename T::param_type &param); + /** + @brief Create a field temporary that corresponds to the field + constructed from the param struct. If a matching field is + present in the cache, it will be popped from the cache. If no + such temporary exists, a temporary will be allocated. + @param[in] param Parameter structure used to allocate + the temporary + */ + FieldTmp(typename T::param_type param); + /** + @brief Copy constructor is deleted to prevent accidental cache bloat @@ -111,6 +123,18 @@ namespace quda { */ template auto getFieldTmp(const T &a) { return FieldTmp(a); } + /** + @brief Get a field temporary that corresponds to the field + defined by the param struct argument. If a matching field is present in the cache, + it will be popped from the cache. If no such temporary exists, a + temporary will be allocated. When the destructor for the + FieldTmp is called, e.g., when the returned object goes out of scope, + the temporary will be pushed onto the cache. + + @param[in] param Parameter struct describing the temporary we require + */ + template auto getFieldTmp(const typename T::param_type &param) { return FieldTmp(param); } + /** @brief Get a vector of field temporaries that are identical to the vector instance argument.
If enough matching fields are diff --git a/include/kernels/dslash_pack.cuh b/include/kernels/dslash_pack.cuh index 94e0465238..a6a5b3c1d5 100644 --- a/include/kernels/dslash_pack.cuh +++ b/include/kernels/dslash_pack.cuh @@ -11,8 +11,10 @@ namespace quda { int *getPackComms(); - template + constexpr int pack_tile_size = 1; + + template struct PackArg : kernel_param<> { typedef Float_ Float; @@ -24,6 +26,7 @@ namespace quda static constexpr bool dagger = dagger_; static constexpr int twist = twist_; // whether we are doing preconditioned twisted-mass or not (1 - singlet, 2 - doublet) static constexpr QudaPCType pc_type = pc_type_; // preconditioning type (4-d or 5-d) + static constexpr int n_src_tile = n_src_tile_; static constexpr bool spinor_direct_load = false; // false means texture load @@ -54,6 +57,7 @@ namespace quda int sites_per_block; + int_fastdiv n_src; int_fastdiv Ls; char *packBuffer[4 * QUDA_MAX_DIM]; @@ -96,6 +100,7 @@ namespace quda threadDimMapLower {}, threadDimMapUpper {}, sites_per_block((work_items + grid - 1) / grid), + n_src(in.size()), Ls(halo.X(4) / in.size()) #ifdef NVSHMEM_COMMS , @@ -131,7 +136,7 @@ namespace quda } }; - template + template __device__ __host__ inline void pack(const Arg &arg, int ghost_idx, int s, int parity, int src_idx) { typedef typename mapper::type real; @@ -160,46 +165,52 @@ namespace quda int idx = indexFromFaceIndex(ghost_idx, parity, arg); constexpr int proj_dir = dagger ? +1 : -1; - Vector f = arg.in[src_idx](idx + s * arg.dc.volume_4d_cb, spinor_parity); - if (twist == 1) { - f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); - } else if (twist == 2) { - Vector f1 = arg.in[src_idx](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor - if (s == 0) - f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); - else - f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); - } - if (arg.spin_project) { - arg.halo_pack.Ghost(dim, 0, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) - = f.project(dim, proj_dir); - } else { - arg.halo_pack.Ghost(dim, 0, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx + s * arg.dc.volume_4d_cb, spinor_parity); + if (twist == 1) { + f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); + } else if (twist == 2) { + Vector f1 = arg.in[src](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor + if (s == 0) + f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); + else + f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); + } + if (arg.spin_project) { + arg.halo_pack.Ghost(dim, 0, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) + = f.project(dim, proj_dir); + } else { + arg.halo_pack.Ghost(dim, 0, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } } else { // forwards int idx = indexFromFaceIndex(ghost_idx, parity, arg); constexpr int proj_dir = dagger ? 
-1 : +1; - Vector f = arg.in[src_idx](idx + s * arg.dc.volume_4d_cb, spinor_parity); - if (twist == 1) { - f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); - } else if (twist == 2) { - Vector f1 = arg.in[src_idx](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor - if (s == 0) - f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); - else - f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); - } - if (arg.spin_project) { - arg.halo_pack.Ghost(dim, 1, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) - = f.project(dim, proj_dir); - } else { - arg.halo_pack.Ghost(dim, 1, ghost_idx + (src_idx * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx + s * arg.dc.volume_4d_cb, spinor_parity); + if (twist == 1) { + f = arg.twist_a * (f + arg.twist_b * f.igamma(4)); + } else if (twist == 2) { + Vector f1 = arg.in[src](idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor + if (s == 0) + f = arg.twist_a * (f + arg.twist_b * f.igamma(4) + arg.twist_c * f1); + else + f = arg.twist_a * (f - arg.twist_b * f.igamma(4) + arg.twist_c * f1); + } + if (arg.spin_project) { + arg.halo_pack.Ghost(dim, 1, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) + = f.project(dim, proj_dir); + } else { + arg.halo_pack.Ghost(dim, 1, ghost_idx + (src * arg.Ls + s) * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } } } - template + template __device__ __host__ inline void packStaggered(const Arg &arg, int ghost_idx, int parity, int src_idx) { typedef typename mapper::type real; @@ -217,12 +228,18 @@ namespace quda if (face_num == 0) { // backwards int idx = indexFromFaceIndexStaggered<4, QUDA_4D_PC, dim, nFace, 0>(ghost_idx, parity, arg); - Vector f = arg.in[src_idx](idx, spinor_parity); - arg.halo_pack.Ghost(dim, 0, ghost_idx + src_idx * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx, spinor_parity); + arg.halo_pack.Ghost(dim, 0, ghost_idx + src * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } else { // forwards int idx = indexFromFaceIndexStaggered<4, QUDA_4D_PC, dim, nFace, 1>(ghost_idx, parity, arg); - Vector f = arg.in[src_idx](idx, spinor_parity); - arg.halo_pack.Ghost(dim, 1, ghost_idx + src_idx * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; +#pragma unroll + for (auto src = src_idx; src < src_idx + n_src_tile; src++) { + Vector f = arg.in[src](idx, spinor_parity); + arg.halo_pack.Ghost(dim, 1, ghost_idx + src * nFace * arg.dc.ghostFaceCB[dim], spinor_parity) = f; + } } } @@ -250,24 +267,36 @@ namespace quda if (Arg::pc_type == QUDA_5D_PC) { // 5-d checkerboarded, include s (not ghostFaceCB since both faces) switch (dim) { case 0: - pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, + parity, src_idx); break; case 1: - pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, + parity, src_idx); break; case 2: - pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, + parity, src_idx); break; case 3: - pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, + parity, src_idx); break; } } 
else { // 4-d checkerboarding, keeping s separate (if it exists) switch (dim) { - case 0: pack(arg, ghost_idx, s, parity, src_idx); break; - case 1: pack(arg, ghost_idx, s, parity, src_idx); break; - case 2: pack(arg, ghost_idx, s, parity, src_idx); break; - case 3: pack(arg, ghost_idx, s, parity, src_idx); break; + case 0: + pack(arg, ghost_idx, s, parity, src_idx); + break; + case 1: + pack(arg, ghost_idx, s, parity, src_idx); + break; + case 2: + pack(arg, ghost_idx, s, parity, src_idx); + break; + case 3: + pack(arg, ghost_idx, s, parity, src_idx); + break; } } @@ -288,7 +317,8 @@ namespace quda // 64 - use uber kernel (merge exterior) template struct packShmem { - template __device__ __forceinline__ void operator()(const Arg &arg, int src_s, int parity) + template + __device__ __forceinline__ void operator()(const Arg &arg, int src_s, int parity) { // (active_dims * 2 + dir) * blocks_per_dir + local_block_idx int local_block_idx = target::block_idx().x % arg.blocks_per_dir; @@ -315,9 +345,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[0]) { int ghost_idx = dir * arg.dc.ghostFaceCB[0] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -325,9 +355,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[1]) { int ghost_idx = dir * arg.dc.ghostFaceCB[1] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -335,9 +365,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[2]) { int ghost_idx = dir * arg.dc.ghostFaceCB[2] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -345,9 +375,9 @@ namespace quda while (local_tid < arg.dc.ghostFaceCB[3]) { int ghost_idx = dir * arg.dc.ghostFaceCB[3] + local_tid; if (pc == QUDA_5D_PC) - pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity, src_idx); + pack(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity, src_idx); else - pack(arg, ghost_idx, s, parity, src_idx); + pack(arg, ghost_idx, s, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -358,12 +388,19 @@ namespace quda #endif } - __device__ __forceinline__ void operator()(const Arg &arg, int s, int parity, int twist_pack) + template + __device__ __forceinline__ void operator()(const Arg &arg, int src_s_block_, int parity, int twist_pack) { - switch (twist_pack) { - case 0: this->operator()<0>(arg, s, parity); break; - case 1: this->operator()<1>(arg, s, parity); break; - case 2: this->operator()<2>(arg, s, parity); break; + int src_s_block = MAX_MULTI_RHS == 1 ? 
0 : src_s_block_; + int src_s_idx = src_s_block * Arg::n_src_tile; + if (src_s_idx + n_src_tile * arg.Ls <= arg.n_src * arg.Ls) { + switch (twist_pack) { + case 0: this->operator()<0, n_src_tile>(arg, src_s_idx, parity); break; + case 1: this->operator()<1, n_src_tile>(arg, src_s_idx, parity); break; + case 2: this->operator()<2, n_src_tile>(arg, src_s_idx, parity); break; + } + } else if constexpr (n_src_tile - 1 > 0) { + operator()(arg, src_s_block, parity); } } }; @@ -377,7 +414,7 @@ namespace quda { if (arg.nParity == 1) parity = arg.parity; packShmem pack; - pack.operator()(arg, s, parity); + pack.operator()(arg, s, parity, Arg::twist); } }; @@ -400,17 +437,17 @@ namespace quda if (arg.nFace == 1) { switch (dim) { - case 0: packStaggered<0, 1>(arg, ghost_idx, parity, src_idx); break; - case 1: packStaggered<1, 1>(arg, ghost_idx, parity, src_idx); break; - case 2: packStaggered<2, 1>(arg, ghost_idx, parity, src_idx); break; - case 3: packStaggered<3, 1>(arg, ghost_idx, parity, src_idx); break; + case 0: packStaggered<0, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 1: packStaggered<1, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 2: packStaggered<2, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 3: packStaggered<3, 1, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; } } else if (arg.nFace == 3) { switch (dim) { - case 0: packStaggered<0, 3>(arg, ghost_idx, parity, src_idx); break; - case 1: packStaggered<1, 3>(arg, ghost_idx, parity, src_idx); break; - case 2: packStaggered<2, 3>(arg, ghost_idx, parity, src_idx); break; - case 3: packStaggered<3, 3>(arg, ghost_idx, parity, src_idx); break; + case 0: packStaggered<0, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 1: packStaggered<1, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 2: packStaggered<2, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; + case 3: packStaggered<3, 3, Arg::n_src_tile>(arg, ghost_idx, parity, src_idx); break; } } @@ -422,7 +459,7 @@ namespace quda template struct packStaggeredShmem { - __device__ __forceinline__ void operator()(const Arg &arg, int src_idx, int parity, int = 0) + template __device__ __forceinline__ void apply(const Arg &arg, int src_idx, int parity) { // (active_dims * 2 + dir) * blocks_per_dir + local_block_idx int local_block_idx = target::block_idx().x % arg.blocks_per_dir; @@ -446,9 +483,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[0]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[0] + local_tid; if (arg.nFace == 1) - packStaggered<0, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<0, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<0, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<0, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -456,9 +493,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[1]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[1] + local_tid; if (arg.nFace == 1) - packStaggered<1, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<1, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<1, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<1, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -466,9 +503,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[2]) { int ghost_idx = dir * 
arg.nFace * arg.dc.ghostFaceCB[2] + local_tid; if (arg.nFace == 1) - packStaggered<2, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<2, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<2, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<2, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -476,9 +513,9 @@ namespace quda while (local_tid < arg.nFace * arg.dc.ghostFaceCB[3]) { int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[3] + local_tid; if (arg.nFace == 1) - packStaggered<3, 1>(arg, ghost_idx, parity, src_idx); + packStaggered<3, 1, n_src_tile>(arg, ghost_idx, parity, src_idx); else - packStaggered<3, 3>(arg, ghost_idx, parity, src_idx); + packStaggered<3, 3, n_src_tile>(arg, ghost_idx, parity, src_idx); local_tid += arg.blocks_per_dir * target::block_dim().x; } break; @@ -488,6 +525,18 @@ namespace quda if (arg.shmem) shmem_signal(dim, dir, arg); #endif } + + template + __device__ __forceinline__ void operator()(const Arg &arg, int src_idx_block_, int parity, int = 0) + { + int src_idx_block = MAX_MULTI_RHS == 1 ? 0 : src_idx_block_; + int src_idx = src_idx_block * Arg::n_src_tile; + if (src_idx + n_src_tile <= arg.n_src) { + apply(arg, src_idx, parity); + } else if constexpr (n_src_tile - 1 > 0) { + operator()(arg, src_idx_block, parity); + } + } }; template struct pack_staggered_shmem { diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh index 7eafa4a687..ec09ab1622 100644 --- a/include/kernels/dslash_staggered.cuh +++ b/include/kernels/dslash_staggered.cuh @@ -14,9 +14,9 @@ namespace quda /** @brief Parameter structure for driving the Staggered Dslash operator */ - template - struct StaggeredArg : DslashArg { + template + struct StaggeredArg : DslashArg { typedef typename mapper::type real; static constexpr int nColor = nColor_; static constexpr int nSpin = 1; @@ -37,7 +37,6 @@ namespace quda using GL = typename gauge_mapper::type; - const int_fastdiv n_src; F out[MAX_MULTI_RHS]; /** output vector field */ F in[MAX_MULTI_RHS]; /** input vector field */ const Ghost halo_pack; /** accessor for writing the halo */ @@ -57,8 +56,8 @@ namespace quda StaggeredArg(cvector_ref &out, cvector_ref &in, const ColorSpinorField &halo, const GaugeField &U, const GaugeField &L, double a, cvector_ref &x, int parity, bool dagger, const int *comm_override) : - DslashArg(out, in, halo, U, x, parity, dagger, a == 0.0 ? false : true, improved_ ? 3 : 1, - spin_project, comm_override), + DslashArg(out, in, halo, U, x, parity, dagger, a == 0.0 ? false : true, + improved_ ? 3 : 1, spin_project, comm_override), halo_pack(halo, improved_ ? 3 : 1), halo(halo, improved_ ? 3 : 1), U(U), @@ -87,9 +86,9 @@ namespace quda @param[in] parity The site parity @param[in] x_cb The checkerboarded site index */ - template - __device__ __host__ inline void applyStaggered(Vector &out, const Arg &arg, Coord &coord, int parity, int, - int thread_dim, bool &active, int src_idx) + template + __device__ __host__ inline void applyStaggered(array &out, const Arg &arg, Coord &coord, + int parity, int, int thread_dim, bool &active, int src_idx) { typedef typename mapper::type real; typedef Matrix, Arg::nColor> Link; @@ -104,13 +103,20 @@ namespace quda if (doHalo(d) && ghost) { const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dim, d, 1); const Link U = arg.improved ? 
arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)); - Vector in = arg.halo.Ghost(d, 1, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(U, in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector in + = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(U, in, out[s]); + } } else if (doBulk() && !ghost) { const int fwd_idx = linkIndexP1(coord, arg.dim, d); const Link U = arg.improved ? arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)); - Vector in = arg.in[src_idx](fwd_idx, their_spinor_parity); - out = mv_add(U, in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector in = arg.in[src_idx + s](fwd_idx, their_spinor_parity); + out[s] = mv_add(U, in, out[s]); + } } } @@ -120,14 +126,20 @@ namespace quda if (doHalo(d) && ghost) { const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dim, d, arg.nFace); const Link L = arg.L(d, coord.x_cb, parity); - const Vector in - = arg.halo.Ghost(d, 1, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(L, in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + const Vector in + = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(L, in, out[s]); + } } else if (doBulk() && !ghost) { const int fwd3_idx = linkIndexP3(coord, arg.dim, d); const Link L = arg.L(d, coord.x_cb, parity); - const Vector in = arg.in[src_idx](fwd3_idx, their_spinor_parity); - out = mv_add(L, in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + const Vector in = arg.in[src_idx + s](fwd3_idx, their_spinor_parity); + out[s] = mv_add(L, in, out[s]); + } } } @@ -140,15 +152,22 @@ namespace quda const int ghost_idx = arg.improved ? ghostFaceIndexStaggered<0>(coord, arg.dim, d, 3) : ghost_idx2; const Link U = arg.improved ? arg.U.Ghost(d, ghost_idx2, 1 - parity) : arg.U.Ghost(d, ghost_idx2, 1 - parity, StaggeredPhase(coord, d, -1, arg)); - Vector in = arg.halo.Ghost(d, 0, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(conj(U), -in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector in + = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(conj(U), -in, out[s]); + } } else if (doBulk() && !ghost) { const int back_idx = linkIndexM1(coord, arg.dim, d); const int gauge_idx = back_idx; const Link U = arg.improved ? 
arg.U(d, gauge_idx, 1 - parity) : arg.U(d, gauge_idx, 1 - parity, StaggeredPhase(coord, d, -1, arg)); - Vector in = arg.in[src_idx](back_idx, their_spinor_parity); - out = mv_add(conj(U), -in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector in = arg.in[src_idx + s](back_idx, their_spinor_parity); + out[s] = mv_add(conj(U), -in, out[s]); + } } } @@ -159,15 +178,21 @@ namespace quda // when updating replace arg.nFace with 1 here const int ghost_idx = ghostFaceIndexStaggered<0>(coord, arg.dim, d, 1); const Link L = arg.L.Ghost(d, ghost_idx, 1 - parity); - const Vector in - = arg.halo.Ghost(d, 0, ghost_idx + src_idx * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); - out = mv_add(conj(L), -in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + const Vector in + = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.nFace * arg.dc.ghostFaceCB[d], their_spinor_parity); + out[s] = mv_add(conj(L), -in, out[s]); + } } else if (doBulk() && !ghost) { const int back3_idx = linkIndexM3(coord, arg.dim, d); const int gauge_idx = back3_idx; const Link L = arg.L(d, gauge_idx, 1 - parity); - const Vector in = arg.in[src_idx](back3_idx, their_spinor_parity); - out = mv_add(conj(L), -in, out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + const Vector in = arg.in[src_idx + s](back3_idx, their_spinor_parity); + out[s] = mv_add(conj(L), -in, out[s]); + } } } } // nDim @@ -181,8 +206,8 @@ namespace quda constexpr staggered(const Arg &arg) : arg(arg) {} static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation - template - __device__ __host__ __forceinline__ void operator()(int idx, int src_idx, int parity) + template + __device__ __host__ __forceinline__ void apply(int idx, int src_idx, int parity) { using real = typename mapper::type; using Vector = ColorSpinor; @@ -195,20 +220,41 @@ namespace quda const int my_spinor_parity = nParity == 2 ? parity : 0; - Vector out; + array out; + applyStaggered(out, arg, coord, parity, idx, thread_dim, active, src_idx); - applyStaggered(out, arg, coord, parity, idx, thread_dim, active, src_idx); - - out *= arg.dagger_scale; +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) out[s] *= arg.dagger_scale; if (xpay && mykernel_type == INTERIOR_KERNEL) { - Vector x = arg.x[src_idx](coord.x_cb, my_spinor_parity); - out = arg.a * x - out; +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector x = arg.x[src_idx + s](coord.x_cb, my_spinor_parity); + out[s] = arg.a * x - out[s]; + } } else if (mykernel_type != INTERIOR_KERNEL) { - Vector x = arg.out[src_idx](coord.x_cb, my_spinor_parity); - out = x + (xpay ? -out : out); +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { + Vector x = arg.out[src_idx + s](coord.x_cb, my_spinor_parity); + out[s] = x + (xpay ? -out[s] : out[s]); + } + } + if (mykernel_type != EXTERIOR_KERNEL_ALL || active) { +#pragma unroll + for (auto s = 0; s < n_src_tile; s++) { arg.out[src_idx + s](coord.x_cb, my_spinor_parity) = out[s]; } + } + } + + template + __device__ __host__ __forceinline__ void operator()(int idx, int src_idx_block_, int parity) + { + int src_idx_block = MAX_MULTI_RHS == 1 ? 
0 : src_idx_block_; + int src_idx = src_idx_block * Arg::n_src_tile; + if (src_idx + n_src_tile <= arg.n_src) { + apply(idx, src_idx, parity); + } else if constexpr (n_src_tile - 1 > 0) { + operator()(idx, src_idx_block, parity); } - if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](coord.x_cb, my_spinor_parity) = out; } }; diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh index 514c4df957..584999d52f 100644 --- a/include/kernels/dslash_wilson.cuh +++ b/include/kernels/dslash_wilson.cuh @@ -34,7 +34,6 @@ namespace quda typedef typename mapper::type real; - const int_fastdiv n_src; F out[MAX_MULTI_RHS]; /** output vector field set */ F in[MAX_MULTI_RHS]; /** input vector field set */ F x[MAX_MULTI_RHS]; /** input vector set when doing xpay */ diff --git a/include/kernels/laplace.cuh b/include/kernels/laplace.cuh index b8797e722e..a3c2b1b377 100644 --- a/include/kernels/laplace.cuh +++ b/include/kernels/laplace.cuh @@ -143,8 +143,9 @@ namespace quda static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation template - __device__ __host__ __forceinline__ void operator()(int idx, int src_idx, int parity) + __device__ __host__ __forceinline__ void operator()(int idx, int src_idx_, int parity) { + int src_idx = MAX_MULTI_RHS == 1 ? 0 : src_idx_; using real = typename mapper::type; using Vector = ColorSpinor; diff --git a/include/kernels/staggered_quark_smearing.cuh b/include/kernels/staggered_quark_smearing.cuh index 22323e66a2..85e1790318 100644 --- a/include/kernels/staggered_quark_smearing.cuh +++ b/include/kernels/staggered_quark_smearing.cuh @@ -179,9 +179,9 @@ namespace quda if (arg.is_t0_kernel) { if (arg.t0 < 0) return; - if (mykernel_type == INTERIOR_KERNEL) { + if constexpr (mykernel_type == INTERIOR_KERNEL) { idx += arg.t0 * arg.t0_offset; - } else if (mykernel_type != EXTERIOR_KERNEL_ALL) { + } else if constexpr (mykernel_type != EXTERIOR_KERNEL_ALL) { if (idx >= arg.t0_face_size[mykernel_type]) idx += arg.face_size[mykernel_type] - arg.t0_face_size[mykernel_type]; idx += arg.t0 * arg.t0_face_offset[mykernel_type]; diff --git a/include/quda_define.h.in b/include/quda_define.h.in index ea6657120f..3e572b70bd 100644 --- a/include/quda_define.h.in +++ b/include/quda_define.h.in @@ -25,6 +25,12 @@ */ #define MAX_MULTI_RHS @QUDA_MAX_MULTI_RHS@ +/** + * @def MAX_MULTI_RHS_TILE + * @brief This macro sets the register tile size for MRHS kernels + */ +#define MAX_MULTI_RHS_TILE @QUDA_MAX_MULTI_RHS_TILE@ + #cmakedefine QUDA_HETEROGENEOUS_ATOMIC #ifdef QUDA_HETEROGENEOUS_ATOMIC /** @@ -36,6 +42,16 @@ */ #undef QUDA_HETEROGENEOUS_ATOMIC #endif +#cmakedefine QUDA_HETEROGENEOUS_ATOMIC_INF_INIT +#ifdef QUDA_HETEROGENEOUS_ATOMIC_INF_INIT +/** + * @def HETEROGENEOUS_ATOMIC_INF_INIT + * @brief This macro sets whether we are using infinity for the signaling sentinel + */ +#define HETEROGENEOUS_ATOMIC_INF_INIT +#undef QUDA_HETEROGENEOUS_ATOMIC_INF_INIT +#endif + #cmakedefine QUDA_LARGE_KERNEL_ARG #cmakedefine QUDA_DIRAC_CLOVER_HASENBUSCH diff --git a/include/targets/cuda/reduce_helper.h b/include/targets/cuda/reduce_helper.h index c8c6101892..aceb88c83d 100644 --- a/include/targets/cuda/reduce_helper.h +++ b/include/targets/cuda/reduce_helper.h @@ -17,6 +17,11 @@ using count_t = cuda::atomic; namespace quda { +// By default we use negative infinity as the sentinel for testing for
On some compilers we may need to use finite +// numbers, so the alternative approach uses negative zero (selected by disabling the CMake +// option QUDA_HETEROGENEOUS_ATOMIC_INF_INIT). +#ifdef HETEROGENEOUS_ATOMIC_INF_INIT /** @brief The initialization value we used to check for completion */ @@ -28,6 +33,31 @@ */ template constexpr T terminate_value() { return cuda::std::numeric_limits::infinity(); } + /** + @brief Test if the result is complete (i.e., is not equal to the sentinel) + */ + template bool is_complete(const T &result) { return result != init_value(); } +#else + /** + @brief The initialization value we used to check for completion + */ + template constexpr T init_value() { return -static_cast(0.0); } + + /** + @brief The termination value we use to prevent a possible hang in + case the computed reduction is equal to the initialization + */ + template constexpr T terminate_value() { return static_cast(0.0); } + + /** + @brief Test if the result is complete (i.e., is not equal to the sentinel) + */ + template bool is_complete(const T &result) + { + return !(result == static_cast(0.0) && std::signbit(result)); + } +#endif + // declaration of reduce function template __device__ inline void reduce(Arg &arg, const Reducer &r, const T &in, const int idx = 0); @@ -129,7 +159,7 @@ if (consumed) errorQuda("Cannot call complete more than once for each construction"); for (int i = 0; i < n_reduce * n_item; i++) { - while (result_h[i].load(cuda::std::memory_order_relaxed) == init_value()) { } + while (!is_complete(result_h[i].load(cuda::std::memory_order_relaxed))) { } } // copy back result element by element and convert if necessary to host reduce type @@ -169,11 +199,7 @@ auto tid = target::thread_idx_linear<2>(); if (arg.result_d) { // write to host mapped memory -#ifdef _NVHPC_CUDA // WAR for nvc++ - constexpr bool coalesced_write = false; -#else constexpr bool coalesced_write = true; -#endif if constexpr (coalesced_write) { static_assert(n <= device::warp_size(), "reduction array is greater than warp size"); auto mask = __ballot_sync(0xffffffff, tid < n); diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h deleted file mode 100644 index 1c4d7f3244..0000000000 --- a/include/targets/cuda/thread_array.h +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#ifndef _NVHPC_CUDA - -#include "../generic/thread_array.h" - -#else - -#include - -namespace quda -{ - template struct thread_array : array { - static constexpr unsigned int shared_mem_size(dim3 block) { return 0; } - }; -} // namespace quda - -#endif diff --git a/include/targets/hip/thread_array.h b/include/targets/hip/thread_array.h deleted file mode 100644 index 24f986cc26..0000000000 --- a/include/targets/hip/thread_array.h +++ /dev/null @@ -1 +0,0 @@ -#include "../generic/thread_array.h" diff --git a/lib/color_spinor_util.in.cu b/lib/color_spinor_util.in.cu index a268a4b16a..71664614cf 100644 --- a/lib/color_spinor_util.in.cu +++ b/lib/color_spinor_util.in.cu @@ -378,8 +378,6 @@ namespace quda { void genericPrintVector(const ColorSpinorField &a, int parity, unsigned int x_cb, int rank) { - if (rank != comm_rank()) return; - ColorSpinorParam param(a); param.location = QUDA_CPU_FIELD_LOCATION; param.create = QUDA_COPY_FIELD_CREATE; @@ -388,6 +386,8 @@ namespace quda { std::unique_ptr clone_a = !host_clone ? nullptr : std::make_unique(param); const ColorSpinorField &a_ = !host_clone ?
a : *clone_a.get(); + if (rank != comm_rank()) return; // non-matching ranks return only after the potential copy to host, to prevent a tuning hang + switch (a.Precision()) { case QUDA_DOUBLE_PRECISION: genericPrintVector(a_, parity, x_cb); break; case QUDA_SINGLE_PRECISION: genericPrintVector(a_, parity, x_cb); break; diff --git a/lib/dslash_coarse.in.cpp b/lib/dslash_coarse.in.cpp index 747f2e9958..52298ceb14 100644 --- a/lib/dslash_coarse.in.cpp +++ b/lib/dslash_coarse.in.cpp @@ -88,7 +88,7 @@ namespace quda param.nVec = nVec; param.create = QUDA_NULL_FIELD_CREATE; param.fieldOrder = order; - return ColorSpinorField(param); + return getFieldTmp(param); } // Apply the coarse Dirac matrix to a coarse grid vector @@ -107,9 +107,9 @@ IntList<@QUDA_MULTIGRID_NVEC_LIST@>()); } else { constexpr QudaFieldOrder csOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - ColorSpinorField v_inA = create_color_spinor_copy(inA, csOrder); - ColorSpinorField v_inB = create_color_spinor_copy(inB, csOrder); - ColorSpinorField v_out = create_color_spinor_copy(out, csOrder); + auto v_inA = create_color_spinor_copy(inA, csOrder); + auto v_inB = create_color_spinor_copy(inB, csOrder); + auto v_out = create_color_spinor_copy(out, csOrder); if (dslash) { BlockTransposeForward(v_inA, inA); } if (clover) { BlockTransposeForward(v_inB, inB); } diff --git a/lib/dslash_pack2.cu b/lib/dslash_pack2.cu index 043ea65abc..01b70f3284 100644 --- a/lib/dslash_pack2.cu +++ b/lib/dslash_pack2.cu @@ -139,6 +139,10 @@ namespace quda strcpy(aux, "policy_kernel,"); strcat(aux, in.AuxString().c_str()); setRHSstring(aux, in.size()); + strcat(aux, ",n_rhs_tile="); + char tile_str[16]; + i32toa(tile_str, pack_tile_size); + strcat(aux, tile_str); char comm[5]; for (int i = 0; i < 4; i++) comm[i] = (comm_dim_pack[i] ? '1' : '0'); comm[4] = '\0'; diff --git a/lib/field_cache.cpp b/lib/field_cache.cpp index 5ea09e3059..7283bf39d0 100644 --- a/lib/field_cache.cpp +++ b/lib/field_cache.cpp @@ -35,6 +35,21 @@ namespace quda { } } + template FieldTmp::FieldTmp(typename T::param_type param) + { + param.create = QUDA_REFERENCE_FIELD_CREATE; + key = FieldKey(T(param)); + + auto it = cache.find(key); + if (it != cache.end() && it->second.size()) { // found an entry + tmp = std::move(it->second.top()); + it->second.pop(); // pop the defunct object + } else { // no entry found, we must allocate a new field + param.create = QUDA_ZERO_FIELD_CREATE; + tmp = T(param); + } + } + template FieldTmp::~FieldTmp() { // don't cache the field if it's empty (e.g., has been moved) diff --git a/lib/gauge_norm.in.cu b/lib/gauge_norm.in.cu index 08cb5bf2ca..2cc755bc81 100644 --- a/lib/gauge_norm.in.cu +++ b/lib/gauge_norm.in.cu @@ -159,8 +159,6 @@ namespace quda { void genericPrintMatrix(const GaugeField &a, int d, int parity, unsigned int x_cb, int rank) { - if (rank != comm_rank()) return; - GaugeFieldParam param(a); param.field = const_cast(&a); param.location = QUDA_CPU_FIELD_LOCATION; @@ -172,6 +170,8 @@ namespace quda { std::unique_ptr clone_a = !host_clone ? nullptr : std::make_unique(param); const GaugeField &a_ = !host_clone ?
a : *clone_a.get(); + if (rank != comm_rank()) return; // non-matching ranks return only after the potential copy to host, to prevent a tuning hang + switch (a.Precision()) { case QUDA_DOUBLE_PRECISION: genericPrintMatrix(a_, d, parity, x_cb); break; case QUDA_SINGLE_PRECISION: genericPrintMatrix(a_, d, parity, x_cb); break; diff --git a/lib/inv_mr_quda.cpp b/lib/inv_mr_quda.cpp index 297c390c08..a4713a16b5 100644 --- a/lib/inv_mr_quda.cpp +++ b/lib/inv_mr_quda.cpp @@ -82,10 +82,10 @@ namespace quda int iter = 0; int step = 0; - bool converged = false; + bool is_done = false; PrintStats("MR", iter, r2, b2); - while (!converged) { + while (!is_done) { int k = 0; vector scale(b.size(), 1.0); @@ -142,22 +142,23 @@ commGlobalReductionPop(); // renable global reductions for outer solver } + step++; + // FIXME - add over/under relaxation in outer loop bool compute_true_res = param.compute_true_res || param.Nsteps > 1; if (compute_true_res) { mat(r, x); r2 = blas::xmyNorm(b, r); for (auto i = 0u; i < b2.size(); i++) param.true_res[i] = sqrt(r2[i] / b2[i]); - converged = (step > param.Nsteps || r2 < stop); - if (!converged) blas::copy(r_sloppy, r); + is_done = (step >= param.Nsteps || r2 < stop); + if (!is_done) blas::copy(r_sloppy, r); PrintStats("MR (restart)", iter, r2, b2); } else { blas::ax(scale, r_sloppy); r2 = blas::norm2(r_sloppy); - converged = (step > param.Nsteps || r2 < stop); - if (!converged) blas::copy(r, r_sloppy); + is_done = (step >= param.Nsteps || r2 < stop); + if (!is_done) blas::copy(r, r_sloppy); } - step++; } PrintSummary("MR", iter, r2, b2, stopping(param.tol, b2, param.residual_type)); diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake index eb71bb9d53..721edc3d3b 100644 --- a/lib/targets/cuda/target_cuda.cmake +++ b/lib/targets/cuda/target_cuda.cmake @@ -103,8 +103,11 @@ if(CMAKE_CUDA_COMPILER_ID MATCHES "NVIDIA" OR CMAKE_CUDA_COMPILER_ID MATCHES "NV endif() cmake_dependent_option(QUDA_HETEROGENEOUS_ATOMIC "enable heterogeneous atomic support ?" ON "QUDA_HETEROGENEOUS_ATOMIC_SUPPORT" OFF) +cmake_dependent_option(QUDA_HETEROGENEOUS_ATOMIC_INF_INIT "use infinity as the sentinel for heterogeneous atomic support?"
ON + "QUDA_HETEROGENEOUS_ATOMIC_SUPPORT" OFF) mark_as_advanced(QUDA_HETEROGENEOUS_ATOMIC) +mark_as_advanced(QUDA_HETEROGENEOUS_ATOMIC_INF_INIT) mark_as_advanced(QUDA_JITIFY) mark_as_advanced(QUDA_DOWNLOAD_NVSHMEM) mark_as_advanced(QUDA_DOWNLOAD_NVSHMEM_TAR) diff --git a/tests/dslash_test_utils.h b/tests/dslash_test_utils.h index ec4ca5b13a..5512c3d0a2 100644 --- a/tests/dslash_test_utils.h +++ b/tests/dslash_test_utils.h @@ -428,9 +428,8 @@ struct DslashTestWrapper { switch (dtest_type) { case dslash_test_type::Dslash: // My dslash should be the same as the clover dslash - for (int i = 0; i < Nsrc; i++) - clover_dslash(spinorRef[i].data(), hostGauge, hostCloverInv, spinor[i].data(), parity, inv_param.dagger, - inv_param.cpu_prec, gauge_param); + clover_dslash(spinorRef[i].data(), hostGauge, hostCloverInv, spinor[i].data(), parity, inv_param.dagger, + inv_param.cpu_prec, gauge_param); break; case dslash_test_type::MatPC: // my matpc op diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp index 33d773ed95..5da6dc1a7a 100644 --- a/tests/host_reference/hisq_force_reference.cpp +++ b/tests/host_reference/hisq_force_reference.cpp @@ -837,13 +837,13 @@ void computeMiddleLinkSite(int half_lattice_index, // half_lattice_index to bett if (Qprev == NULL) { if (sig_positive) { - ls.loadMatrixFromField(static_cast(oprod), 1 - oddBit, sig, point_d, &colorMatY); + ls.loadMatrixFromField(static_cast(oprod), 1 - oddBit, sig, point_d, &colorMatY); } else { - ls.loadMatrixFromField(static_cast(oprod), oddBit, OPP_DIR(sig), point_c, &colorMatY); + ls.loadMatrixFromField(static_cast(oprod), oddBit, OPP_DIR(sig), point_c, &colorMatY); colorMatY = conj(colorMatY); } } else { // Qprev != NULL - ls.loadMatrixFromField(static_cast(oprod), oddBit, point_c, &colorMatY); + ls.loadMatrixFromField(static_cast(oprod), oddBit, point_c, &colorMatY); } colorMatW = (!mu_positive) ? bc_link * colorMatY : conj(bc_link) * colorMatY; diff --git a/tests/staggered_invert_test_gtest.hpp b/tests/staggered_invert_test_gtest.hpp index 7ef2c270b3..1b659eec59 100644 --- a/tests/staggered_invert_test_gtest.hpp +++ b/tests/staggered_invert_test_gtest.hpp @@ -53,6 +53,10 @@ bool skip_test(test_t param) // MR struggles with the staggered and asqtad spectrum, it's not MR's fault if (solution_type == QUDA_MAT_SOLUTION && solve_type == QUDA_DIRECT_SOLVE && inverter_type == QUDA_MR_INVERTER) return true; + + // BiCGStab-L is unstable with the staggered operator at low precision + // we could likely restore this when 20-bit quark fields are merged in + if (inverter_type == QUDA_BICGSTABL_INVERTER && prec_sloppy < QUDA_SINGLE_PRECISION) return true; } // CG3 is rather unstable with low precision
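
Note on the MRHS register-tiling idiom introduced above (dslash_staggered.cuh, dslash_pack.cuh, dslash_helper.cuh): each thread now handles a compile-time tile of n_src_tile right-hand sides, the y-dimension of the launch is reduced by the tile factor, and the functor recurses to smaller tile sizes so a partial final tile is still covered. The following is a minimal, self-contained host-side C++ sketch of that dispatch; max_tile, process_tile, and run_block are illustrative stand-ins (for Arg::n_src_tile / MAX_MULTI_RHS_TILE, the kernel body such as applyStaggered, and the functor operator() respectively), not QUDA API.

#include <array>
#include <cstdio>

constexpr int max_tile = 4; // stand-in for Arg::n_src_tile (MAX_MULTI_RHS_TILE)

// Stand-in for the kernel body (e.g. applyStaggered): accumulate n_tile
// right-hand sides in a register-resident array in one pass over the stencil.
template <int n_tile> void process_tile(int src_idx)
{
  std::array<float, n_tile> out {};                // one accumulator per RHS in the tile
  for (int s = 0; s < n_tile; s++) out[s] += 1.0f; // placeholder for the mv_add updates
  std::printf("sources [%d, %d) handled with tile size %d\n", src_idx, src_idx + n_tile, n_tile);
}

// Stand-in for the functor operator(): blocks are spaced by the full tile
// size; if the final block holds fewer than n_tile sources, recurse at
// compile time with a smaller tile until the remainder fits.
template <int n_tile = max_tile> void run_block(int block, int n_src)
{
  int src_idx = block * max_tile; // always indexed by the full tile size
  if (src_idx + n_tile <= n_src) {
    process_tile<n_tile>(src_idx);
  } else if constexpr (n_tile - 1 > 0) {
    run_block<n_tile - 1>(block, n_src);
  }
}

int main()
{
  int n_src = 6;                                    // e.g. 6 RHS with max_tile = 4
  int n_blocks = (n_src + max_tile - 1) / max_tile; // mirrors the reduced grid y-dimension
  for (int b = 0; b < n_blocks; b++) run_block(b, n_src); // processes [0,4) then [4,6)
}

The tile size is chosen at build time (e.g. cmake -DQUDA_MAX_MULTI_RHS_TILE=4), and the CMakeLists.txt check above guarantees it never exceeds QUDA_MAX_MULTI_RHS.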