diff --git a/CMakeLists.txt b/CMakeLists.txt index abcf27de99..d99e888931 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -172,6 +172,7 @@ option(QUDA_OPENBLAS "enable OpenBLAS" OFF) # Interface options option(QUDA_INTERFACE_QDP "build qdp interface" ON) option(QUDA_INTERFACE_MILC "build milc interface" ON) +option(QUDA_INTERFACE_OPENQCD "build OpenQCD interface" OFF) option(QUDA_INTERFACE_CPS "build cps interface" OFF) option(QUDA_INTERFACE_QDPJIT "build qdpjit interface" OFF) option(QUDA_INTERFACE_BQCD "build bqcd interface" OFF) diff --git a/include/clover_field.h b/include/clover_field.h index 85933999cf..efefed5c37 100644 --- a/include/clover_field.h +++ b/include/clover_field.h @@ -159,6 +159,8 @@ namespace quda { inverse(param.inverse), clover(param.clover), cloverInv(param.cloverInv), + csw(param.csw), + coeff(param.coeff), twist_flavor(param.twist_flavor), mu2(param.mu2), epsilon2(param.epsilon2), diff --git a/include/clover_field_order.h b/include/clover_field_order.h index 65d5ef6cff..129227ccf1 100644 --- a/include/clover_field_order.h +++ b/include/clover_field_order.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -637,7 +638,11 @@ namespace quda { errorQuda("Accessor reconstruct = %d does not match field reconstruct %d", enable_reconstruct, clover.Reconstruct()); if (clover.max_element(is_inverse) == 0.0 && isFixed::value) +#ifdef BUILD_OPENQCD_INTERFACE + warningQuda("%p max_element(%d) appears unset", &clover, is_inverse); /* ignore if the SW-field is zero */ +#else errorQuda("%p max_element(%d) appears unset", &clover, is_inverse); +#endif if (clover.Diagonal() == 0.0 && clover.Reconstruct()) errorQuda("%p diagonal appears unset", &clover); this->clover = clover_ ? clover_ : clover.data(is_inverse); } @@ -1015,6 +1020,97 @@ namespace quda { size_t Bytes() const { return length*sizeof(Float); } }; + /** + * OpenQCD ordering for clover fields + */ + template struct OpenQCDOrder { + static constexpr bool enable_reconstruct = false; + typedef typename mapper::type RegType; + Float *clover; + const int volumeCB; + const QudaTwistFlavorType twist_flavor; + const Float mu2; + const Float epsilon2; + const double coeff; + const double csw; + const double kappa; + const int dim[4]; // xyzt convention + const int L[4]; // txyz convention + + OpenQCDOrder(const CloverField &clover, bool inverse, Float *clover_ = nullptr, void * = nullptr) : + volumeCB(clover.Stride()), + twist_flavor(clover.TwistFlavor()), + mu2(clover.Mu2()), + epsilon2(clover.Epsilon2()), + coeff(clover.Coeff()), + csw(clover.Csw()), + kappa(clover.Coeff() / clover.Csw()), + dim {clover.X()[0], clover.X()[1], clover.X()[2], clover.X()[3]}, // *local* lattice dimensions, xyzt + L {clover.X()[3], clover.X()[0], clover.X()[1], clover.X()[2]} // *local* lattice dimensions, txyz + { + if (clover.Order() != QUDA_OPENQCD_CLOVER_ORDER) { + errorQuda("Invalid clover order %d for this accessor", clover.Order()); + } + this->clover = clover_ ? clover_ : clover.data(inverse); + if (clover.Coeff() == 0.0 || clover.Csw() == 0.0) { errorQuda("Neither coeff nor csw may be zero!"); } + } + + QudaTwistFlavorType TwistFlavor() const { return twist_flavor; } + Float Mu2() const { return mu2; } + Float Epsilon2() const { return epsilon2; } + + /** + * @brief Gets the offset in Floats from the openQCD base pointer to + * the spinor field. + * + * @param[in] x_cb Checkerboard index coming from quda + * @param[in] parity The parity coming from quda + * + * @return The offset. + */ + __device__ __host__ inline int getCloverOffset(int x_cb, int parity) const + { + int x_quda[4], x[4]; + getCoords(x_quda, x_cb, dim, parity); // x_quda contains xyzt local Carthesian corrdinates + openqcd::rotate_coords(x_quda, x); // xyzt -> txyz, x = openQCD local Carthesian lattice coordinate + return openqcd::ipt(x, L) * length; + } + + /** + * @brief Load a clover field at lattice point x_cb + * + * @param v The output clover matrix in QUDA order + * @param x_cb The checkerboarded lattice site + * @param parity The parity of the lattice site + */ + __device__ __host__ inline void load(RegType v[length], int x_cb, int parity) const + { + int sign[36] = {-1, -1, -1, -1, -1, -1, // diagonals (idx 0-5) + -1, +1, -1, +1, -1, -1, -1, -1, -1, -1, // column 0 (idx 6-15) + -1, +1, -1, -1, -1, -1, -1, -1, // column 1 (idx 16-23) + -1, -1, -1, -1, -1, -1, // column 2 (idx 24-29) + -1, +1, -1, +1, // column 3 (idx 30-33) + -1, +1}; // column 4 (idx 34-35) + int map[36] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 18, 19, 24, 25, 16, 17, + 12, 13, 20, 21, 26, 27, 14, 15, 22, 23, 28, 29, 30, 31, 32, 33, 34, 35}; + const int M = length / 2; + int offset = getCloverOffset(x_cb, parity); + auto Ap = &clover[offset]; // A_+ + auto Am = &clover[offset + M]; // A_- + +#pragma unroll + for (int i = 0; i < M; i++) { + v[i] = sign[i] * (kappa * Am[map[i]] - (i < 6)); + v[M + i] = sign[i] * (kappa * Ap[map[i]] - (i < 6)); + } + } + + // FIXME implement the save routine for OpenQCD ordered fields + __device__ __host__ inline void save(RegType[length], int, int) const { } + + size_t Bytes() const { return length * sizeof(Float); } + }; + } // namespace clover // Use traits to reduce the template explosion diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index 9edc8fbdef..5ac9215668 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -231,6 +231,9 @@ namespace quda } else if (inv_param.dirac_order == QUDA_TIFR_PADDED_DIRAC_ORDER) { fieldOrder = QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER; siteOrder = QUDA_EVEN_ODD_SITE_ORDER; + } else if (inv_param.dirac_order == QUDA_OPENQCD_DIRAC_ORDER) { + fieldOrder = QUDA_OPENQCD_FIELD_ORDER; + siteOrder = QUDA_EVEN_ODD_SITE_ORDER; } else { errorQuda("Dirac order %d not supported", inv_param.dirac_order); } diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 0db9c7bb3c..176bd32397 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1836,6 +1836,85 @@ namespace quda size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } }; + /** + * struct to define order of spinor fields in OpenQCD + * + * @tparam Float Underlying type of data (precision) + * @tparam Ns Number of spin degrees of freedom + * @tparam Nc Number of color degrees of freedom + */ + template struct OpenQCDDiracOrder { + using Accessor = OpenQCDDiracOrder; + using real = typename mapper::type; + using complex = complex; + + static const int length = 2 * Ns * Nc; // 12 complex (2 floats) numbers per spinor color field + Float *field; + size_t offset; + Float *ghost[8]; + int volumeCB; + int faceVolumeCB[4]; + int nParity; + const int dim[4]; // xyzt convention + const int L[4]; // txyz convention + + OpenQCDDiracOrder(const ColorSpinorField &a, int = 1, Float *field_ = 0, float * = 0) : + field(field_ ? field_ : a.data()), + offset(a.Bytes() / (2 * sizeof(Float))), // TODO: What's this for?? + volumeCB(a.VolumeCB()), + nParity(a.SiteSubset()), + dim {a.X(0), a.X(1), a.X(2), a.X(3)}, // *local* lattice dimensions, xyzt + L {a.X(3), a.X(0), a.X(1), a.X(2)} // *local* lattice dimensions, txyz + { + if constexpr (length != 24) { errorQuda("Spinor field length %d not supported", length); } + } + + /** + * @brief Gets the offset in Floats from the openQCD base pointer to + * the spinor field. + * + * @param[in] x Checkerboard index coming from quda + * @param[in] parity The parity coming from quda + * + * @return The offset. + */ + __device__ __host__ inline int getSpinorOffset(int x_cb, int parity) const + { + int x_quda[4], x[4]; + getCoords(x_quda, x_cb, dim, parity); // x_quda contains xyzt local Carthesian corrdinates + openqcd::rotate_coords(x_quda, x); // xyzt -> txyz, x = openQCD local Carthesian lattice coordinate + return openqcd::ipt(x, L) * length; + } + + __device__ __host__ inline void load(complex v[length / 2], int x_cb, int parity = 0) const + { + auto in = &field[getSpinorOffset(x_cb, parity)]; + block_load(v, reinterpret_cast(in)); + } + + __device__ __host__ inline void save(const complex v[length / 2], int x_cb, int parity = 0) const + { + auto out = &field[getSpinorOffset(x_cb, parity)]; + block_store(reinterpret_cast(out), v); + } + + /** + @brief This accessor routine returns a colorspinor_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a colorspinor_wrapper that curries in access to + this field at the above coordinates. + */ + __device__ __host__ inline auto operator()(int x_cb, int parity) const + { + return colorspinor_wrapper(*this, x_cb, parity); + } + + size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } + }; // openQCDDiracOrder + } // namespace colorspinor // Use traits to reduce the template explosion diff --git a/include/comm_quda.h b/include/comm_quda.h index 8496efab5a..d33e949bdf 100644 --- a/include/comm_quda.h +++ b/include/comm_quda.h @@ -49,6 +49,13 @@ namespace quda */ int comm_dim(int dim); + /** + Return whether the dimension dim is a C* dimension or not + @param dim Dimension which we are querying + @return C* dimension or nor + */ + bool comm_dim_cstar(int dim); + /** Return the global number of processes in the dimension dim @param dim Dimension which we are querying diff --git a/include/communicator_quda.h b/include/communicator_quda.h index bf0e9ffba5..c5ddbe1f75 100644 --- a/include/communicator_quda.h +++ b/include/communicator_quda.h @@ -41,6 +41,7 @@ namespace quda int (*coords)[QUDA_MAX_DIM]; int my_rank; int my_coords[QUDA_MAX_DIM]; + int cstar; // number of C* direction as per openQxD convention // It might be worth adding communicators to allow for efficient reductions: // #if defined(MPI_COMMS) // MPI_Comm comm; @@ -126,9 +127,26 @@ namespace quda inline int comm_rank_displaced(const Topology *topo, const int displacement[]) { int coords[QUDA_MAX_DIM]; - - for (int i = 0; i < QUDA_MAX_DIM; i++) { - coords[i] = (i < topo->ndim) ? mod(comm_coords(topo)[i] + displacement[i], comm_dims(topo)[i]) : 0; + int shift_integer; + + int Nx_displacement = 0; + for (int i = QUDA_MAX_DIM - 1; i >= 0; i--) { + // cstar shift[x] shift[y] shift[z] shift[t] + // 0 0 0 0 0 + // 1 0 0 0 0 + // 2 0 1 0 0 + // 3 0 1 1 0 + if (i < topo->ndim && ((i == 1 && topo->cstar >= 2) || (i == 2 && topo->cstar >= 3))) { + // if we go over the boundary and have a shifted boundary condition, + // we shift Nx/2 ranks in x-direction: + // shift_integer in { 0, 1, 2} + // (shift_integer - 1) in {-1, 0, 1} + shift_integer = (comm_coords(topo)[i] + displacement[i] + comm_dims(topo)[i]) / comm_dims(topo)[i]; + Nx_displacement += (shift_integer - 1) * (comm_dims(topo)[0] / 2); + } + coords[i] = (i < topo->ndim) ? + mod(comm_coords(topo)[i] + displacement[i] + (i == 0 ? Nx_displacement : 0), comm_dims(topo)[i]) : + 0; } return comm_rank_from_coords(topo, coords); @@ -390,6 +408,12 @@ namespace quda return comm_dims(topo)[dim]; } + bool comm_dim_cstar(int dim) + { + Topology *topo = comm_default_topology(); + return (topo->cstar >= 2 && dim == 1) || (topo->cstar >= 3 && dim == 2); + } + int comm_coord(int dim) { Topology *topo = comm_default_topology(); diff --git a/include/dirac_quda.h b/include/dirac_quda.h index 2ceb4f5b70..7c2130b5da 100644 --- a/include/dirac_quda.h +++ b/include/dirac_quda.h @@ -2589,6 +2589,8 @@ namespace quda { && (pc_type == QUDA_MATPC_EVEN_EVEN || pc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC)) return true; + if (dirac_type == QUDA_WILSON_DIRAC || dirac_type == QUDA_CLOVER_DIRAC) return true; + return false; } }; diff --git a/include/dslash_quda.h b/include/dslash_quda.h index 32111e92f9..b5b73ec11b 100644 --- a/include/dslash_quda.h +++ b/include/dslash_quda.h @@ -930,6 +930,12 @@ namespace quda */ void gamma5(ColorSpinorField &out, const ColorSpinorField &in); + /* RG: I have added these */ + void gamma0(ColorSpinorField &out, const ColorSpinorField &in); + void gamma1(ColorSpinorField &out, const ColorSpinorField &in); + void gamma2(ColorSpinorField &out, const ColorSpinorField &in); + void gamma3(ColorSpinorField &out, const ColorSpinorField &in); + /** @brief Applies a (1 \pm gamma5)/2 projection matrix to a spinor @param[out] out Output field diff --git a/include/eigensolve_quda.h b/include/eigensolve_quda.h index 4fcea4b3d3..5a46f1464c 100644 --- a/include/eigensolve_quda.h +++ b/include/eigensolve_quda.h @@ -254,10 +254,11 @@ namespace quda /** @brief Computes Left/Right SVD from pre computed Right/Left - @param[in] evecs Computed eigenvectors of NormOp - @param[in] evals Computed eigenvalues of NormOp + @param[in,out] evecs Computed eigenvectors of NormOp + @param[in,out] evals Computed eigenvalues of NormOp + @param[in] dagger Whether NormOp was MdagM (false) or MMdag (true) */ - void computeSVD(std::vector &evecs, std::vector &evals); + void computeSVD(std::vector &evecs, std::vector &evals, bool dagger = false); /** @brief Compute eigenvalues and their residiua diff --git a/include/enum_quda.h b/include/enum_quda.h index 462ce98cf0..7584816e0a 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -52,6 +52,7 @@ typedef enum QudaGaugeFieldOrder_s { QUDA_BQCD_GAUGE_ORDER, // expect *gauge, mu, even-odd, spacetime+halos, column-row order QUDA_TIFR_GAUGE_ORDER, // expect *gauge, mu, even-odd, spacetime, column-row order QUDA_TIFR_PADDED_GAUGE_ORDER, // expect *gauge, mu, parity, t, z+halo, y, x/2, column-row order + QUDA_OPENQCD_GAUGE_ORDER, // expect *gauge, spacetime, mu, parity row-column order -- links attached to odd points only QUDA_INVALID_GAUGE_ORDER = QUDA_INVALID_ENUM } QudaGaugeFieldOrder; @@ -256,6 +257,7 @@ typedef enum QudaDiracFieldOrder_s { QUDA_CPS_WILSON_DIRAC_ORDER, // odd-even, color inside spin QUDA_LEX_DIRAC_ORDER, // lexicographical order, color inside spin QUDA_TIFR_PADDED_DIRAC_ORDER, // padded z dimension for TIFR RHMC code + QUDA_OPENQCD_DIRAC_ORDER, // openqcd QUDA_INVALID_DIRAC_ORDER = QUDA_INVALID_ENUM } QudaDiracFieldOrder; @@ -267,6 +269,7 @@ typedef enum QudaCloverFieldOrder_s { QUDA_PACKED_CLOVER_ORDER, // even-odd, QDP packed QUDA_QDPJIT_CLOVER_ORDER, // (diagonal / off-diagonal)-chirality-spacetime QUDA_BQCD_CLOVER_ORDER, // even-odd, super-diagonal packed and reordered + QUDA_OPENQCD_CLOVER_ORDER, // openqcd QUDA_INVALID_CLOVER_ORDER = QUDA_INVALID_ENUM } QudaCloverFieldOrder; @@ -360,6 +363,7 @@ typedef enum QudaFieldOrder_s { QUDA_QDPJIT_FIELD_ORDER, // QDP field ordering (complex-color-spin-spacetime) QUDA_QOP_DOMAIN_WALL_FIELD_ORDER, // QOP domain-wall ordering QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER, // TIFR RHMC ordering + QUDA_OPENQCD_FIELD_ORDER, // OPENQCD ordering QUDA_INVALID_FIELD_ORDER = QUDA_INVALID_ENUM } QudaFieldOrder; @@ -372,16 +376,20 @@ typedef enum QudaFieldCreate_s { QUDA_INVALID_FIELD_CREATE = QUDA_INVALID_ENUM } QudaFieldCreate; -typedef enum QudaGammaBasis_s { // gamj=((top 2 rows)(bottom 2 rows)) s1,s2,s3 are Pauli spin matrices, 1 is 2x2 identity - QUDA_DEGRAND_ROSSI_GAMMA_BASIS, // gam1=((0,i*s1)(-i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,i*s3)(-i*s3,0)) gam4=((0,1)(1,0)) gam5=((-1,0)(0,1)) - QUDA_UKQCD_GAMMA_BASIS, // gam1=((0,i*s1)(-i*s1,0)) gam2=((0,i*s2)(-i*s2,0)) gam3=((0,i*s3)(-i*s3,0)) gam4=((1,0)(0,-1)) gam5=((0,-1)(-1,0)) - QUDA_CHIRAL_GAMMA_BASIS, // gam1=((0,-i*s1)(i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,-i*s3)(i*s3,0)) gam4=((0,-1)(-1,0))gam5=((1,0)(0,-1)) - QUDA_DIRAC_PAULI_GAMMA_BASIS, // gam1=((0,-i*s1)(i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,-i*s3)(i*s3,0)) gam4=((1,0)(0,-1)) gam5=((0,1)(1,0)) - QUDA_INVALID_GAMMA_BASIS = QUDA_INVALID_ENUM // gam5=gam4*gam1*gam2*gam3 +typedef enum QudaGammaBasis_s { // gamj=((top 2 rows)(bottom 2 rows)) s1,s2,s3 are Pauli spin matrices, 1 is 2x2 identity + QUDA_DEGRAND_ROSSI_GAMMA_BASIS, // gam1=((0,i*s1)(-i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,i*s3)(-i*s3,0)) + // gam4=((0,1)(1,0)) gam5=((-1,0)(0,1)) + QUDA_UKQCD_GAMMA_BASIS, // gam1=((0,i*s1)(-i*s1,0)) gam2=((0,i*s2)(-i*s2,0)) gam3=((0,i*s3)(-i*s3,0)) gam4=((1,0)(0,-1)) gam5=((0,-1)(-1,0)) + QUDA_CHIRAL_GAMMA_BASIS, // gam1=((0,-i*s1)(i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,-i*s3)(i*s3,0)) gam4=((0,-1)(-1,0))gam5=((1,0)(0,-1)) + QUDA_OPENQCD_GAMMA_BASIS, // gam1=((0,-i*s3)(i*s3,0)) gam2=((0,-1)(-1,0)) gam3=((0,-s1)(s1,0)) gam4=((0,-i*s2)(i*s2,0)) gam5=((1,0)(0,-1)) + QUDA_DIRAC_PAULI_GAMMA_BASIS, // gam1=((0,-i*s1)(i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,-i*s3)(i*s3,0)) + // gam4=((1,0)(0,-1)) gam5=((0,1)(1,0)) + QUDA_INVALID_GAMMA_BASIS = QUDA_INVALID_ENUM // gam5=gam4*gam1*gam2*gam3 } QudaGammaBasis; - // Dirac-Pauli -> DeGrand-Rossi T = i/sqrt(2)*((s2,-s2)(s2,s2)) field_DR = T * field_DP - // UKQCD -> DeGrand-Rossi T = i/sqrt(2)*((-s2,-s2)(-s2,s2)) field_DR = T * field_UK - // Chiral -> DeGrand-Rossi T = i*((0,-s2)(s2,0)) field_DR = T * field_chiral +// Dirac-Pauli -> DeGrand-Rossi T = i/sqrt(2)*((s2,-s2)(s2,s2)) field_DR = T * field_DP +// UKQCD -> DeGrand-Rossi T = i/sqrt(2)*((-s2,-s2)(-s2,s2)) field_DR = T * field_UK +// Chiral -> DeGrand-Rossi T = i*((0,-s2)(s2,0)) field_DR = T * field_chiral + typedef enum QudaSourceType_s { QUDA_POINT_SOURCE, QUDA_RANDOM_SOURCE, @@ -645,4 +653,3 @@ typedef enum QudaWFlowStepType_s { #ifdef __cplusplus } #endif - diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index 8874959d7b..c1dee15a3e 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -47,6 +47,7 @@ #define QUDA_BQCD_GAUGE_ORDER 15 // expect *gauge mu even-odd spacetime+halos row-column order #define QUDA_TIFR_GAUGE_ORDER 16 #define QUDA_TIFR_PADDED_GAUGE_ORDER 17 +#define QUDA_OPENQCD_GAUGE_ORDER 18 #define QUDA_INVALID_GAUGE_ORDER QUDA_INVALID_ENUM #define QudaTboundary integer(4) @@ -229,6 +230,7 @@ #define QUDA_CPS_WILSON_DIRAC_ORDER 4 // odd-even color inside spin #define QUDA_LEX_DIRAC_ORDER 5 // lexicographical order color inside spin #define QUDA_TIFR_PADDED_DIRAC_ORDER 6 +#define QUDA_OPENQCD_DIRAC_ORDER 7 // openqcd #define QUDA_INVALID_DIRAC_ORDER QUDA_INVALID_ENUM #define QudaCloverFieldOrder integer(4) @@ -239,6 +241,7 @@ #define QUDA_PACKED_CLOVER_ORDER 9 // even-odd packed #define QUDA_QDPJIT_CLOVER_ORDER 10 // lexicographical order packed #define QUDA_BQCD_CLOVER_ORDER 11 // BQCD order which is a packed super-diagonal form +#define QUDA_OPENQCD_CLOVER_ORDER 12 // openqcd #define QUDA_INVALID_CLOVER_ORDER QUDA_INVALID_ENUM #define QudaVerbosity integer(4) @@ -324,6 +327,7 @@ #define QUDA_QDPJIT_FIELD_ORDER 11 // QDP field ordering (complex-color-spin-spacetime) #define QUDA_QOP_DOMAIN_WALL_FIELD_ORDER 12 // QOP domain-wall ordering #define QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER 13 // TIFR RHMC ordering +#define QUDA_OPENQCD_FIELD_ORDER 14 // openQCD ordering #define QUDA_INVALID_FIELD_ORDER QUDA_INVALID_ENUM #define QudaFieldCreate integer(4) diff --git a/include/gamma.cuh b/include/gamma.cuh index ed5474711f..bad953f631 100644 --- a/include/gamma.cuh +++ b/include/gamma.cuh @@ -26,10 +26,10 @@ namespace quda { Gamma(const Gamma &g) = default; __device__ __host__ inline int getcol(int row) const { - if (basis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS) { - switch(dir) { - case 0: - case 1: + if (basis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS || basis == QUDA_OPENQCD_GAMMA_BASIS) { + switch (dir) { + case 0: + case 1: switch(row) { case 0: return 3; case 1: return 2; @@ -54,11 +54,11 @@ namespace quda { case 3: return 3; } break; - } + } } else { - switch(dir) { - case 0: - case 1: + switch (dir) { + case 0: + case 1: switch(row) { case 0: return 3; case 1: return 2; @@ -90,7 +90,7 @@ namespace quda { case 3: return 1; } break; - } + } } return 0; } @@ -201,7 +201,51 @@ namespace quda { } break; } + } else if (basis == QUDA_OPENQCD_GAMMA_BASIS) { + switch (dir) { + case 0: /* corresponds to gamma1 in OpenQCD convention */ + switch (row) { + case 0: + case 1: return -I; + case 2: + case 3: return I; + } + break; + case 1: /* gamma2 in openQCD */ + switch (row) { + case 0: + case 3: return -1; + case 1: + case 2: return 1; + } + break; + case 2: /* gamma3 in openQCD */ + switch (row) { + case 0: + case 3: return -I; + case 1: + case 2: return I; + } + break; + case 3: /* gamma0 in openQCD */ + switch (row) { + case 0: + case 1: + case 2: + case 3: return -1; + } + break; + case 4: /* gamma5 in openQCD */ + switch (row) { + case 0: + case 1: return 1; + case 2: + case 3: return -1; + } + break; + } } + return 0; } @@ -283,7 +327,51 @@ namespace quda { } break; } + } else if (basis == QUDA_OPENQCD_GAMMA_BASIS) { + switch (dir) { + case 0: /* gamma1 in openQCD convention */ + switch (row) { + case 0: + case 1: return complex(a.imag(), -a.real()); // I + case 2: + case 3: return complex(-a.imag(), a.real()); // -I + } + break; + case 1: /* gamma2 in openQCD */ + switch (row) { + case 0: + case 3: return -a; + case 1: + case 2: return a; + } + break; + case 2: /* gamma3 in openQCD */ + switch (row) { + case 0: + case 3: return complex(a.imag(), -a.real()); // I + case 1: + case 2: return complex(-a.imag(), a.real()); // -I + } + break; + case 3: /* gamma0 in openQCD */ + switch (row) { + case 0: + case 1: + case 2: + case 3: return -a; + } + break; + case 4: /* gamma5 in openQCD */ + switch (row) { + case 0: + case 1: return a; + case 2: + case 3: return -a; + } + break; + } } + return a; } diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 57992908c2..4c023e7903 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -2350,6 +2350,118 @@ namespace quda { size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); } }; + /** + * struct to define order of gauge fields in OpenQCD + */ + template struct OpenQCDOrder : LegacyOrder { + + using Accessor = OpenQCDOrder; + using real = typename mapper::type; + using complex = complex; + + Float *gauge; + const int volumeCB; + static constexpr int Nc = 3; + const int dim[4]; // xyzt convention + const int L[4]; // txyz convention + const int nproc[4]; + + OpenQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : + LegacyOrder(u, ghost_), + gauge(gauge_ ? gauge_ : (Float *)u.data()), // pointer to the gauge field on CPU + volumeCB(u.VolumeCB()), // Volume and VolumeCB refer to the global lattice, if VolumeLocal, then local lattice + dim {u.X()[0], u.X()[1], u.X()[2], u.X()[3]}, // *local* lattice dimensions, xyzt + L {u.X()[3], u.X()[0], u.X()[1], u.X()[2]}, // *local* lattice dimensions, txyz + nproc {comm_dim(3), comm_dim(0), comm_dim(1), comm_dim(2)} // txyz + { + if constexpr (length != 18) { errorQuda("Gauge field length %d not supported", length); } + } + + /** + * @brief Obtains the offset in Floats from the openQCD base pointer + * to the gauge fields. At this point, fields are already + * reordered with a xyzt-lexicographical spacetime index, so + * nothing special to do here. + * + * @param[in] x Checkerboard index coming from quda + * @param[in] dir The direction coming from quda + * @param[in] parity The parity coming from quda + * + * @return The offset. + */ + __device__ __host__ inline int getGaugeOffset_lexi(int x_cb, int dir, int parity) const + { + int x[4]; + getCoords(x, x_cb, dim, parity); + return (4 * openqcd::lexi(x, dim, 4) + dir) * length; + } + + /** + * @brief Obtains the offset in Floats from the openQCD base pointer + * to the gauge fields. + * + * @param[in] x_cb Checkerboard index coming from quda + * @param[in] dir The direction coming from quda + * @param[in] parity The parity coming from quda + * + * @return The offset. + */ + __device__ __host__ inline int getGaugeOffset(int x_cb, int dir, int parity) const + { + int quda_x[4], x[4]; + getCoords(quda_x, x_cb, dim, parity); // x_quda = quda local lattice coordinates + openqcd::rotate_coords(quda_x, x); // x = openQCD local lattice coordinates + + int mu = (dir + 1) % 4; // mu = openQCD direction + int ix = openqcd::ipt(x, L); + int iz = openqcd::iup(x, mu, L, nproc); + int ofs = 0; + int volume = openqcd::vol(L); + + if (ix < volume / 2) { // ix even -> iz odd + if (iz < volume) { // iz in interior + ofs = 8 * (iz - volume / 2) + 2 * mu + 1; + } else { + int ib = iz - volume - openqcd::ifc(L, nproc, mu) - openqcd::bndry(L, nproc) / 2; // iz in exterior + ofs = 4 * volume + openqcd::face_offset(L, nproc, mu) + ib; + } + } else if (volume / 2 <= ix && ix < volume) { // ix odd + ofs = 8 * (ix - volume / 2) + 2 * mu; + } + + return ofs * length; + } + + __device__ __host__ inline void load(complex v[length / 2], int x_cb, int dir, int parity, Float = 1.0) const + { + auto in = &gauge[getGaugeOffset(x_cb, dir, parity)]; + block_load(v, reinterpret_cast(in)); + } + + __device__ __host__ inline void save(const complex v[length / 2], int x_cb, int dir, int parity) const + { + auto out = &gauge[getGaugeOffset_lexi(x_cb, dir, parity)]; + block_store(reinterpret_cast(out), v); + } + + /** + @brief This accessor routine returns a gauge_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] dim Which dimension are we requesting + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a gauge_wrapper that curries in access to + this field at the above coordinates. + */ + __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const + { + return gauge_wrapper(const_cast(*this), dim, x_cb, parity); + } + + size_t Bytes() const { return 2 * Nc * Nc * sizeof(Float); } + }; // class OpenQCDOrder + } // namespace gauge template @@ -2494,5 +2606,8 @@ namespace quda { template struct gauge_order_mapper { typedef gauge::TIFROrder type; }; template struct gauge_order_mapper { typedef gauge::TIFRPaddedOrder type; }; template struct gauge_order_mapper { typedef gauge::FloatNOrder type; }; + template struct gauge_order_mapper { + typedef gauge::OpenQCDOrder type; + }; } // namespace quda diff --git a/include/index_helper.cuh b/include/index_helper.cuh index cf1faf72b0..93d01cc65d 100644 --- a/include/index_helper.cuh +++ b/include/index_helper.cuh @@ -1106,6 +1106,295 @@ namespace quda { return (((x[3]*X[2] + x[2])*X[1] + x[1])*X[0] + x[0]) >> 1; } + /** + * These are index helper functions used in the order classes of openQCD, i.e. + * + * - OpenQCDOrder in quda:include/gauge_field_order.h + * - OpenQCDDiracOrder in quda:include/color_spinor_field_order.h + * - OpenQCDOrder in quda:include/clover_field_order.h. + * + * The main helper functions are ipt() and iup(), giving pure function + * implementations of the ipt[] and iup[][] arrays (see + * openqcd:include/global.h) that are needed to calculate the correct offsets + * of the fields base pointers. + */ + namespace openqcd + { + + /** + * @brief Returns the surface in direction mu + * + * @param X Extent in all 4 directions + * @param[in] mu Direction + * + * @return Surface + */ + __device__ __host__ inline int surface(const int *X, const int mu) + { + if (mu == 0) { + return X[1] * X[2] * X[3]; + } else if (mu == 1) { + return X[0] * X[2] * X[3]; + } else if (mu == 2) { + return X[0] * X[1] * X[3]; + } + return X[0] * X[1] * X[2]; + } + + /** + * @brief Return BNDRY (see openqcd:include/global.h) + * + * @param[in] L Local lattice extent L0-L3 in txyz convention + * @param[in] nproc NPROC0-NPROC3 from openqcd + * + * @return BNDRY + */ + __device__ __host__ inline int bndry(const int *L, const int *nproc) + { + return 2 + * (((1 - (nproc[0] % 2)) * surface(L, 0)) + ((1 - (nproc[1] % 2)) * surface(L, 1)) + + ((1 - (nproc[2] % 2)) * surface(L, 2)) + ((1 - (nproc[3] % 2)) * surface(L, 3))); + } + + /** + * @brief Calculate the offset needed for boundary points in openQCD. + * + * @param[in] L Local lattice extent L0-L3 in txyz convention + * @param[in] nproc NPROC0-NPROC3 from openqcd + * @param[in] mu Direction in txyz + * + * @return The offset + */ + __device__ __host__ inline int ifc(const int *L, const int *nproc, const int mu) + { + if (mu == 0) { + return ((1 - (nproc[0] % 2)) * surface(L, 0)) / 2; + } else if (mu == 1) { + return ((1 - (nproc[0] % 2)) * surface(L, 0)) + (((1 - (nproc[1] % 2)) * surface(L, 1)) / 2); + } else if (mu == 2) { + return ((1 - (nproc[0] % 2)) * surface(L, 0)) + ((1 - (nproc[1] % 2)) * surface(L, 1)) + + (((1 - (nproc[2] % 2)) * surface(L, 2)) / 2); + } + return ((1 - (nproc[0] % 2)) * surface(L, 0)) + ((1 - (nproc[1] % 2)) * surface(L, 1)) + + ((1 - (nproc[2] % 2)) * surface(L, 2)) + (((1 - (nproc[3] % 2)) * surface(L, 3)) / 2); + } + + /** + * @brief Calculate the offset of the faces in openQCD. + * + * @param[in] L Local lattice extent L0-L3 in txyz convention + * @param[in] nproc NPROC0-NPROC3 from openqcd + * @param[in] mu Direction in txyz + * + * @return The offset + */ + __device__ __host__ inline int face_offset(const int *L, const int *nproc, const int mu) + { + if (mu == 0) { + return 0; + } else if (mu == 1) { + return ((1 - (nproc[0] % 2)) * surface(L, 0)) / 2; + } else if (mu == 2) { + return ((1 - (nproc[0] % 2)) * surface(L, 0)) / 2 + ((1 - (nproc[1] % 2)) * surface(L, 1)) / 2; + } + return ((1 - (nproc[0] % 2)) * surface(L, 0)) / 2 + ((1 - (nproc[1] % 2)) * surface(L, 1)) / 2 + + ((1 - (nproc[2] % 2)) * surface(L, 2)) / 2; + } + + /** + * @brief Rotate coordinates (xyzt -> txyz) + * + * @param[in] x_quda Cartesian local lattice coordinates in quda + * convention (xyzt) + * @param[out] x_openQCD Cartesian local lattice coordinates in openQCD + * convention (txyz) + */ + __device__ __host__ inline void rotate_coords(const int *x_quda, int *x_openQCD) + { + x_openQCD[1] = x_quda[0]; + x_openQCD[2] = x_quda[1]; + x_openQCD[3] = x_quda[2]; + x_openQCD[0] = x_quda[3]; + } + + /** + * @brief Generate a lexicographical index with x[Ndims-1] running + * fastest, for example if Ndims=4: + * ix = X3*X2*X1*x0 + X3*X2*x1 + X3*x2 + x3. + * + * @param[in] x Integer array of dimension Ndims with coordinates + * @param[in] X Integer array of dimension Ndims with extents + * @param[in] Ndims The number of dimensions + * + * @return Lexicographical index + */ + __device__ __host__ inline int lexi(const int *x, const int *X, const int Ndims) + { + int i, ix = x[0]; + +#pragma unroll + for (i = 1; i < Ndims; i++) { ix = (X[i] * ix + x[i]); } + return ix; + } + + /** + * @brief Return the volume + * + * @param[in] X Integer array of 4 dimensions + * + * @return Volume + */ + __device__ __host__ inline int vol(const int *X) { return X[0] * X[1] * X[2] * X[3]; } + + /** + * @brief Return cbs[]. This is the cache block size in openQCD, which + * the local lattice is divided into. + * + * @param[in] mu Direction + * @param[in] X Extents + * + * @return cbs + */ + __device__ __host__ inline int setup_cbs(const int mu, const int *X) + { + if (mu == 0) { + return X[0]; + } else if ((X[mu] % 4) == 0) { + return 4; + } else if ((X[mu] % 3) == 0) { + return 3; + } else if ((X[mu] % 2) == 0) { + return 2; + } else { + return 1; + } + } + + /** + * @brief Pure function to return ipt[iy], where + * iy=x3+L3*x2+L2*L3*x1+L1*L2*L3*x0 without accessing the + * ipt-array, but calculating the index on the fly. Notice that xi + * and Li are in openQCD (txyz) convention. If they come from + * QUDA, you have to rotate them first (see rotate_coords). + * + * @param[in] x Carthesian local lattice corrdinates, 0 <= x[i] < L[i] + * @param[in] L Local lattice extents + * + * @return ipt[x3+L3*x2+L2*L3*x1+L1*L2*L3*x0] = the local flat index of + * openQCD + */ + __device__ __host__ inline int ipt(const int *x, const int *L) + { + int xb[4], xn[4]; + + // openQCDs cache block size in txyz convention + int cbs[4] = {setup_cbs(0, L), setup_cbs(1, L), setup_cbs(2, L), setup_cbs(3, L)}; + + // openQCDs cache block grid in txyz convention + int cbn[4] = {L[0] / cbs[0], L[1] / cbs[1], L[2] / cbs[2], L[3] / cbs[3]}; + + xb[0] = x[0] % cbs[0]; + xb[1] = x[1] % cbs[1]; + xb[2] = x[2] % cbs[2]; + xb[3] = x[3] % cbs[3]; + + xn[0] = x[0] / cbs[0]; + xn[1] = x[1] / cbs[1]; + xn[2] = x[2] / cbs[2]; + xn[3] = x[3] / cbs[3]; + + return (lexi(xb, cbs, 4) / 2 + vol(cbs) * lexi(xn, cbn, 4) / 2 + + ((x[0] + x[1] + x[2] + x[3]) % 2 != 0) * (vol(L) / 2) // odd -> +VOLUME/2 + ); + } + + /** + * @brief Determines the number of boundary points in direction mu prior to + * the Carthesian index x with dimensions X + * + * @param[in] mu Direction + * @param[in] x Lattice point + * @param[in] X Dimensions of the lattice/block + * + * @return Number of prior boundary points + */ + __device__ __host__ inline int boundary_pts(const int mu, const int *x, const int *X) + { + int ret = 0; + + if (mu == 3) { + ret = lexi(x, X, 3); // lexi without x[3] + } else if (mu == 2) { + ret = X[3] * lexi(x, X, 2); + if (x[2] == (X[2] - 1)) { + ret += x[3]; // lexi without x[2] + } + } else if (mu == 1) { + if (x[1] == (X[1] - 1)) { + ret = X[2] * X[3] * x[0] + X[3] * x[2] + x[3]; // lexi without x[1] + } else { + ret = surface(X, 1); + } + } else if (mu == 0) { + if (x[0] == (X[0] - 1)) { + ret = lexi(x + 1, X + 1, 3); // lexi without x[0] + } else { + ret = surface(X, 0); + } + } + + return ret; + } + + /** + * @brief Pure implementation of iup[ix][mu]. Returns neighbouring + * point of ix in positive mu direction. + * + * @param[in] x Cartesian local lattice corrdinates, 0 <= x[i] < Li, + * length 4 in txyz convention + * @param[in] mu Direction in txyz convention + * @param[in] L Local lattice extents, length 4 in txyz convention + * @param[in] nproc NPROC0-NPROC3 from openqcd + * + * @return iup[ix][mu] + */ + __device__ __host__ inline int iup(const int *x, const int mu, const int *L, const int *nproc) + { + int i, ret, xb[4], xn[4]; + + if ((x[mu] == (L[mu] - 1)) && (nproc[mu] > 1)) { + + int cbs[4] = {setup_cbs(0, L), setup_cbs(1, L), setup_cbs(2, L), setup_cbs(3, L)}; + int cbn[4] = {L[0] / cbs[0], L[1] / cbs[1], L[2] / cbs[2], L[3] / cbs[3]}; + + xb[0] = x[0] % cbs[0]; + xb[1] = x[1] % cbs[1]; + xb[2] = x[2] % cbs[2]; + xb[3] = x[3] % cbs[3]; + + xn[0] = x[0] / cbs[0]; + xn[1] = x[1] / cbs[1]; + xn[2] = x[2] / cbs[2]; + xn[3] = x[3] / cbs[3]; + + ret = vol(L) + ifc(L, nproc, mu); + if ((x[0] + x[1] + x[2] + x[3]) % 2 == 0) { ret += bndry(L, nproc) / 2; } + + ret += surface(cbs, mu) * boundary_pts(mu, xn, cbn) / 2; + ret += boundary_pts(mu, xb, cbs) / 2; + return ret; + + } else { +#pragma unroll + for (i = 0; i < 4; i++) { xb[i] = x[i]; } + + xb[mu] = (xb[mu] + 1) % (L[mu] * nproc[mu]); + return ipt(xb, L); + } + } + + } // namespace openqcd + /** @brief Compute the flattened 4-d index from a separate T and flattened 3-d XYZ index. diff --git a/include/instantiate.h b/include/instantiate.h index 8eee6ad269..841b5cd0d4 100644 --- a/include/instantiate.h +++ b/include/instantiate.h @@ -68,6 +68,11 @@ namespace quda template <> constexpr bool is_enabled() { return false; } template <> constexpr bool is_enabled() { return false; } #endif +#ifdef BUILD_OPENQCD_INTERFACE + template <> constexpr bool is_enabled() { return true; } +#else + template <> constexpr bool is_enabled() { return false; } +#endif /** @brief Helper function for returning if a given precision is enabled @@ -108,13 +113,23 @@ namespace quda }; struct ReconstructWilson { +#ifdef BUILD_OPENQCD_INTERFACE + static constexpr std::array recon + = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8, QUDA_RECONSTRUCT_13, QUDA_RECONSTRUCT_9}; +#else static constexpr std::array recon = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8}; +#endif }; struct ReconstructStaggered { +#ifdef BUILD_OPENQCD_INTERFACE + static constexpr std::array recon + = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_13, QUDA_RECONSTRUCT_9, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8}; +#else static constexpr std::array recon = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_13, QUDA_RECONSTRUCT_9}; +#endif }; struct ReconstructNo12 { @@ -578,6 +593,26 @@ namespace quda template <> constexpr bool is_enabled() { return true; } #endif + struct WilsonReconstruct { +#ifdef BUILD_OPENQCD_INTERFACE + static constexpr std::array recon + = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8, QUDA_RECONSTRUCT_13, QUDA_RECONSTRUCT_9}; +#else + static constexpr std::array recon + = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8}; +#endif + }; + + struct StaggeredReconstruct { +#ifdef BUILD_OPENQCD_INTERFACE + static constexpr std::array recon + = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_13, QUDA_RECONSTRUCT_9, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8}; +#else + static constexpr std::array recon + = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_13, QUDA_RECONSTRUCT_9}; +#endif + }; + #ifdef GPU_DISTANCE_PRECONDITIONING constexpr bool is_enabled_distance_precondition() { return true; } #else diff --git a/include/instantiate_dslash.h b/include/instantiate_dslash.h index eab0ead243..03e5ca2e6d 100644 --- a/include/instantiate_dslash.h +++ b/include/instantiate_dslash.h @@ -36,6 +36,20 @@ namespace quda Apply(out, in, x, U, args...); else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8/9", QUDA_RECONSTRUCT); +#ifdef BUILD_OPENQCD_INTERFACE + } else if (Recon::recon.size() > 3 && U.Reconstruct() == Recon::recon[3]) { +#if QUDA_RECONSTRUCT & 2 + Apply(out, in, x, U, args...); +#else + errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-13", QUDA_RECONSTRUCT); +#endif + } else if (Recon::recon.size() > 4 && U.Reconstruct() == Recon::recon[4]) { +#if QUDA_RECONSTRUCT & 1 + Apply(out, in, x, U, args...); +#else + errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT); +#endif +#endif /* BUILD_OPENQCD_INTERFACE */ } else { errorQuda("Unsupported reconstruct type %d", U.Reconstruct()); } diff --git a/include/kernels/copy_color_spinor.cuh b/include/kernels/copy_color_spinor.cuh index 505163cc93..c0fadbb376 100644 --- a/include/kernels/copy_color_spinor.cuh +++ b/include/kernels/copy_color_spinor.cuh @@ -220,6 +220,60 @@ namespace quda } }; + /** Transform from openqcd into non-relativistic basis (a.k.a UKQCD basis): + * gamma_ukqcd = U gamma_openqcd U^dagger with + * U = [-1 0 1 0] + [ 0 -1 0 1] + [ 1 0 1 0] + [ 0 1 0 1] / sqrt(2), + * see https://github.com/JeffersonLab/chroma/blob/master/docs/notes/gamma_conventions.tex + for further notes. */ + template struct OpenqcdToNonRelBasis { + template + __device__ __host__ inline void operator()(complex out[Ns * Nc], const complex in[Ns * Nc]) const + { + int s1[4] = {0, 1, 0, 1}; + int s2[4] = {2, 3, 2, 3}; + FloatOut K1[4] + = {static_cast(-kP), static_cast(-kP), static_cast(kP), static_cast(kP)}; + FloatOut K2[4] + = {static_cast(kP), static_cast(kP), static_cast(kP), static_cast(kP)}; + for (int s = 0; s < Ns; s++) { + for (int c = 0; c < Nc; c++) { + out[s * Nc + c] = K1[s] * static_cast>(in[s1[s] * Nc + c]) + + K2[s] * static_cast>(in[s2[s] * Nc + c]); + } + } + } + }; + + /** Transform from non-relativistic (aka ukqcd) into openqcd basis: + * gamma_ukqcd = U gamma_openqcd U^dagger with + * U = [-1 0 1 0] + * [ 0 -1 0 1] + * [ 1 0 1 0] + * [ 0 1 0 1] / sqrt(2) + */ + template struct NonRelToOpenqcdBasis { + template + __device__ __host__ inline void operator()(complex out[Ns * Nc], const complex in[Ns * Nc]) const + { + int s1[4] = {0, 1, 0, 1}; + int s2[4] = {2, 3, 2, 3}; + + FloatOut K1[4] + = {static_cast(-kU), static_cast(-kU), static_cast(kU), static_cast(kU)}; + FloatOut K2[4] + = {static_cast(kU), static_cast(kU), static_cast(kU), static_cast(kU)}; + for (int s = 0; s < Ns; s++) { + for (int c = 0; c < Nc; c++) { + out[s * Nc + c] = K1[s] * static_cast>(in[s1[s] * Nc + c]) + + K2[s] * static_cast>(in[s2[s] * Nc + c]); + } + } + } + }; + template struct CopyColorSpinor_ { const Arg &arg; constexpr CopyColorSpinor_(const Arg &arg) : arg(arg) { } diff --git a/include/quda.h b/include/quda.h index e059442a5d..9abb9c416a 100644 --- a/include/quda.h +++ b/include/quda.h @@ -464,6 +464,9 @@ extern "C" { /** The t0 parameter for distance preconditioning, the timeslice where the source is located */ int distance_pc_t0; + /** Additional user-defined properties */ + void *additional_prop; + } QudaInvertParam; // Parameter set for solving eigenvalue problems. @@ -1005,7 +1008,7 @@ extern "C" { * initQuda. Calling initQudaMemory requires that the user has * previously called initQudaDevice. */ - void initQudaMemory(); + void initQudaMemory(void); /** * Initialize the library. This function is actually a wrapper @@ -1028,7 +1031,7 @@ extern "C" { * @details This should only be needed for automated testing when * different partitioning is applied within a single run. */ - void updateR(); + void updateR(void); /** * A new QudaGaugeParam should always be initialized immediately @@ -1822,7 +1825,7 @@ extern "C" { void flushPoolQuda(QudaMemoryType type); void setMPICommHandleQuda(void *mycomm); - + // Parameter set for quark smearing operations typedef struct QudaQuarkSmearParam_s { //------------------------------------------------- diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h new file mode 100644 index 0000000000..b2a24462fd --- /dev/null +++ b/include/quda_openqcd_interface.h @@ -0,0 +1,482 @@ +#pragma once + +#define OPENQCD_MAX_INVERTERS 32 +#define OPENQCD_MAX_EIGENSOLVERS 32 + +/** + * The macro battle below is to trick quda.h to think that double_complex is + * defined to be the struct below. For this we need to set the __CUDACC_RTC__, + * which makes double_complex to be defined as double2 (see quda.h), which we + * redefine below as openqcd_complex_dble. The original definitions of + * __CUDACC_RTC__ and double2 are recovered below. We do this to be able to + * include this header file into a openQxD program and compile with flags + * "-std=C89 -pedantic -Werror". Else the compiler trows an + * "ISO C90 does not support complex types" error because of the + * "double _Complex" data types exposed in quda.h. + */ + +typedef struct { + double re, im; +} openqcd_complex_dble; + +#ifdef __CUDACC_RTC__ +#define __CUDACC_RTC_ORIGINAL__ __CUDACC_RTC__ +#endif + +#ifdef double2 +#define double2_ORIGINAL double2 +#endif + +#define __CUDACC_RTC__ +#define double2 openqcd_complex_dble +#include +#undef double2 +#undef __CUDACC_RTC__ + +#ifdef double2_ORIGINAL +#define double2 double2_ORIGINAL +#undef double2_ORIGINAL +#endif + +#ifdef __CUDACC_RTC_ORIGINAL__ +#define __CUDACC_RTC__ __CUDACC_RTC_ORIGINAL__ +#undef __CUDACC_RTC_ORIGINAL__ +#endif + +/** + * @file quda_openqcd_interface.h + * + * @section Description + * + * The header file defines the interface to enable easy + * interfacing between QUDA and the OpenQCD software. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Copied from flags.h + * ############################################# + */ +#ifndef FLAGS_H +typedef struct { + int type; + int cstar; + double phi3[2][3]; + double phi1[2]; +} bc_parms_t; + +typedef struct { + int qhat; + double m0, su3csw, u1csw, cF[2], theta[3]; +} dirac_parms_t; + +typedef struct { + int gauge; + int nfl; +} flds_parms_t; +#endif +/** + * ############################################# + */ + +typedef enum OpenQCDGaugeGroup_s { + OPENQCD_GAUGE_SU3 = 1, + OPENQCD_GAUGE_U1 = 2, + OPENQCD_GAUGE_SU3xU1 = 3, + OPENQCD_GAUGE_INVALID = QUDA_INVALID_ENUM +} OpenQCDGaugeGroup; + +/** + * Parameters related to problem size and machine topology. They should hold the + * numbers in quda format, i.e. xyzt convention. For example L[0] = L1, L[1] = + * L2, ... + */ +typedef struct { + int L[4]; /** Local lattice dimensions L1, L2, L3, L0 */ + int nproc[4]; /** Machine grid size NPROC1, NPROC2, NPROC3, NPROC0*/ + int nproc_blk[4]; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK, + is assumed to be [1, 1, 1, 1] */ + int N[4]; /** Glocal lattice dimensions N1, N2, N3, N3 */ + int device; /** GPU device number */ + int cstar; /** number of cstar directions, equals bc_cstar() */ + int *data; /** rank topology, length 5 + NPROC1*NPROC2*NPROC3*NPROC0: + data[0] = cstar; + data[1+i] = nproc[i] for 0 <= i < 4 + data[5+lex(ix,iy,iz,it)] returns rank number in + openQCD, where lex stands for lexicographical + indexing (in QUDA order (xyzt)) */ + bc_parms_t (*bc_parms)(void); /** @see bc_parms() */ + flds_parms_t (*flds_parms)(void); /** @see flds_parms() */ + dirac_parms_t (*dirac_parms)(void); /** @see dirac_parms() */ + void *(*h_gauge)(void); /** function to return a pointer to the gauge field */ + void *(*h_sw)(void); /** function to return a pointer to the updated Clover field */ + void (*get_gfld_flags)(int *ud, int *ad); /** function pointer to gauge field revision query function */ +} openQCD_QudaLayout_t; + +/** + * Parameters used to create a QUDA context. + */ +typedef struct { + QudaVerbosity verbosity; /** How verbose QUDA should be (QUDA_SILENT, QUDA_VERBOSE or QUDA_SUMMARIZE) */ + FILE *logfile; /** log file handler */ + void *gauge; /** base pointer to the gauge fields */ + int volume; /** VOLUME */ + int bndry; /** BNDRY */ + void (*reorder_gauge_quda_to_openqcd)(void *in, void *out); +} openQCD_QudaInitArgs_t; + +typedef struct { + int initialized; /** Whether openQCD_qudaInit() was called or not */ + int ud_rev; /** Revision of ud field from openqxd */ + int ad_rev; /** Revision of ad field from openqxd */ + int swd_ud_rev; /** Revision of ud field used to calc/transfer the SW field from openqxd */ + int swd_ad_rev; /** Revision of ad field used to calc/transfer the SW field from openqxd */ + double swd_kappa; /** kappa corresponding to the current SW field in QUDA */ + double swd_su3csw; /** SU(3) csw coefficient corresponding to the current SW field in QUDA */ + double swd_u1csw; /** U(1) csw coefficient corresponding to the current SW field in QUDA */ + int swd_qhat; /** qhat coefficient corresponding to the current SW field in QUDA */ + openQCD_QudaInitArgs_t init; + openQCD_QudaLayout_t layout; + void *dirac_handle; /** void-pointer to QudaInvertParam struct for the Dirac operator. + * Notice that this void pointer HAS to be directly before + * handles[32], because it's possible to call + * openQCD_qudaSolverGetHandle with -1. */ + void *inv_handles[OPENQCD_MAX_INVERTERS]; /** Array of void-pointers to QudaInvertParam structs for the solver(s) */ + void *eig_handles[OPENQCD_MAX_EIGENSOLVERS]; /** Array of void-pointers to QudaInvertParam structs for the solver(s) */ + char infile[1024]; /** Path to the input file (if given to quda_init()) */ +} openQCD_QudaState_t; + +typedef struct openQCD_QudaSolver_s { + char infile[1024]; /** Path to the input file (if given to quda_init()) */ + int id; /** Solver section identifier in the input file */ + QudaMultigridParam *mg_param; /** Pointer to the multigrid param struct */ + double u1csw; /** u1csw property */ + int qhat; /** qhat property */ + int mg_ud_rev; /** Revision of ud field from openqxd */ + int mg_ad_rev; /** Revision of ad field from openqxd */ + double mg_kappa; /** kappa corresponding to the current mg-instance in QUDA */ + double mg_su3csw; /** SU(3) csw coefficient corresponding to the current mg-instance in QUDA */ + double mg_u1csw; /** U(1) csw coefficient corresponding to the current mg-instance in QUDA */ + int mg_qhat; /** qhat corresponding to the current mg-instance in QUDA */ +} openQCD_QudaSolver; + +typedef struct { + double kappa; /* kappa: hopping parameter */ + double mu; /* mu: twisted mass */ + double su3csw; /* su3csw: csw coefficient for SU(3) fields */ + double u1csw; /* u1csw: csw coefficient for U(1) fields, quda doesn't respect that parameter (yet) */ + int qhat; /* qhat: quda doesn't respect that parameter (yet) */ +} openQCD_QudaDiracParam_t; + +/** + * Initialize the QUDA context. + * + * @param[in] init Meta data for the QUDA context + * @param[in] layout Layout struct + * @param infile Input file + */ +void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout, char *infile); + +/** + * Destroy the QUDA context and deallocate all solvers. + */ +void openQCD_qudaFinalize(void); + +/** + * Copy a spinor to GPU and back to CPU. + * + * @param[in] h_in Spinor input field (from openQCD) + * @param[out] h_out Spinor output field + */ +void openQCD_back_and_forth(void *h_in, void *h_out); + +/** + * @brief Wrapper around openqcd::ipt + * + * @param[in] x Euclidean corrdinate in txyz convention + * + * @return ipt[x] + * + * @see openqcd::ipt() + */ +int openQCD_qudaIndexIpt(const int *x); + +/** + * @brief Wrapper around openqcd::iup + * + * @param[in] x Euclidean corrdinate in txyz convention + * @param[in] mu Direction + * + * @return iup[x][mu] + * + * @see openqcd::iup() + */ +int openQCD_qudaIndexIup(const int *x, const int mu); + +/** + * @brief Norm square in QUDA. + * + * @param[in] h_in Spinor input field (from openQCD) + * + * @return The norm + */ +double openQCD_qudaNorm(void *h_in); + +/** + * @brief Prototype function for the norm-square in QUDA without + * transfering the field. Should serve as an example. + * + * @param[in] d_in Spinor input field (device pointer) + * + * @return The norm + */ +double openQCD_qudaNorm_NoLoads(void *d_in); + +/** + * @brief Applies Dirac matrix to spinor. + * + * openQCD_out = gamma[dir] * openQCD_in + * + * @param[in] dir Dirac index, 0 <= dir <= 5, notice that dir is in + * openQCD convention, ie. (0: t, 1: x, 2: y, 3: z, 4: 5, 5: 5) + * @param[in] openQCD_in of type spinor_dble[NSPIN] + * @param[out] openQCD_out of type spinor_dble[NSPIN] + */ +void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out); + +/** + * @brief Explicit transfer of an openQCD field from host to device + * + * @param openQCD_field Input openQCD spinor (host pointer) + * + * @return Device pointer + */ +void *openQCD_qudaH2D(void *openQCD_field); + +/** + * @brief Explicit transfer of a QUDA field from device to host + * + * @param quda_field Input quda spinor field (device pointer) + * @param openQCD_field Output openQCD spinor (host pointer) + */ +void openQCD_qudaD2H(void *quda_field, void *openQCD_field); + +/** + * @brief Free a device field allocated by openQCD_qudaH2D() + * + * @param quda_field Pointer to device pointer + */ +void openQCD_qudaSpinorFree(void **quda_field); + +/** + * @brief Apply the Wilson-Clover Dirac operator to a field. All fields + * passed and returned are host (CPU) fields in openQCD order. + * + * @param[in] src Source spinor field + * @param[out] dst Destination spinor field + * @param[in] p Dirac parameter struct + * + * @deprecated Replaced by openQCD_qudaDw() + */ +void openQCD_qudaDw_deprecated(void *src, void *dst, openQCD_QudaDiracParam_t p); + +/** + * @brief Apply the Dirac operator that corresponds to the current openQxD + * setup to a field. All fields passed and returned are host (CPU) + * fields in openQCD order. + * + * @param[in] mu Twisted mass parameter + * @param[in] in Input spinor (host pointer) + * @param[out] out Output spinor (host pointer) + */ +void openQCD_qudaDw(double mu, void *in, void *out); + +/** + * @brief Apply the Dirac operator that corresponds to the current openQxD + * setup to a field. All fields passed and returned are devicde + * (GPU) fields returned by openQCD_qudaH2D(). + * + * @param[in] mu Twisted mass parameter + * @param[in] d_in Input spinor (device pointer) + * @param[out] d_out Output spinor (device pointer) + */ +void openQCD_qudaDw_NoLoads(double mu, void *d_in, void *d_out); + +/** + * Setup the solver interface to quda. This function parses the file given by + * [infile] as an openQCD ini file. The solver section given by the [id] + * parameter must have a key-value pair like solver = QUDA and may contain every + * member of the struct [QudaInvertParam]. If one sets inv_type_precondition = + * QUDA_MG_INVERTER, one can additionally use all the members from the struct + * [QudaMultigridParam] in a section called "Solver {id} Multigrid", where {id} + * is replaced by [id]. For every level given by n_level in the above section, + * one has to provide a subsection called "Solver {id} Multigrid Level {level}", + * where {level} runs from 0 to n_level-1. All these subsections may have keys + * given by all the array-valued members of QudaMultigridParam, for example + * smoother_tol may appear in all subsections. This function must be called on + * all ranks simulaneously. + * + * @param[in] id The identifier of the solver section, i.e. "Solver #". The + * input file is taken from the arguments of quda_init(). If + * id is -1, then the section called "Lattice parameters" is + * parsed in the same way. + * + * @return Pointer to the solver context + */ + +void *openQCD_qudaSolverGetHandle(int id); + +/** + * @brief Return a hash from a subset of the settings in the + * QudaInvertParam struct. Return 0 if the struct is not initialized + * yet. + * + * @param[in] id The solver identifier + * + * @return Hash value + */ +int openQCD_qudaSolverGetHash(int id); + +/** + * @brief Print solver information about the QUDA solver. Print + * "Solver is not initialized yet" is the solver struct is nul + * initialized yet. + * + * @param[in] id The solver identifier + */ +void openQCD_qudaSolverPrintSetup(int id); + +/** + * @brief Solve Ax=b for an Clover Wilson operator with a multigrid solver. + * All fields passed and returned are host (CPU) field in openQCD + * order. + * + * @param[in] id The solver identifier in the input file, i.e. + * "Solver #". The input file is the one given by + * quda_init + * @param[in] mu Twisted mass parameter + * @param[in] source The source + * @param[out] solution The solution + * @param[out] status If the function is able to solve the Dirac equation to + * the desired accuracy (invert_param->tol), status + * reports the total number of iteration steps. -1 + * indicates that the inversion failed. + * + * @return Residual + */ +double openQCD_qudaInvert(int id, double mu, void *source, void *solution, int *status); + +/** + * @brief Destroys an existing solver context and frees all involed + * structs. + * + * @param[in] id The solver identifier + */ +void openQCD_qudaSolverDestroy(int id); + +/** + * Setup the eigen-solver interface to quda. This function parses the file + * given by [infile] as an openQCD ini file. The solver section given by the + * [inv_section] parameter must have a key-value pair like solver = QUDA and may + * contain every member of the struct [QudaInvertParam]. See + * [openQCD_qudaSolverSetup] for more details about the solver. The eigen-solver + * section given by the [section] parameter may contain every member of the + * struct [QudaEigParam]. + * + * @param[in] id The section id of the eigensolver + * @param[in] solver_id The section id of the solver. If -1, the section is + * not read in. + * + * @return Pointer to the eigen-solver context + */ +void *openQCD_qudaEigensolverGetHandle(int id, int solver_id); + +/** + * @brief Print the eigensolver setup. + * + * @param[in] id The eigensolver identifier + * @param[in] solver_id The solver identifier + */ +void openQCD_qudaEigensolverPrintSetup(int id, int solver_id); + +/** + * @brief Solve Ax=b for an Clover Wilson operator with a multigrid + * solver. All fields are fields passed and returned are host + * (CPU) field in openQCD order. This function requires an + * existing solver context created with openQCD_qudaSolverSetup(). + * + * @param[in] id The eigensolver section identifier + * @param[in] solver_id The solver section identifier + * @param[inout] h_evecs Allocated array of void-pointers to param->n_conf + * fields + * @param[out] h_evals Allocated array of param->n_conf complex_dbles + */ +void openQCD_qudaEigensolve(int id, int solver_id, void **h_evecs, void *h_evals); + +/** + * @brief Destroys an existing eigen-solver context and frees all involed + * structs. + * + * @param[in] id The section id of the eigensolver + */ +void openQCD_qudaEigensolverDestroy(int id); + +/** + * @brief Wrapper for the plaquette. We could call plaqQuda() directly in + * openQCD, but we have to make sure manually that the gauge field + * is loaded + * + * @return Plaquette value + * @see https://github.com/lattice/quda/wiki/Gauge-Measurements#wilson-plaquette-action + */ +double openQCD_qudaPlaquette(void); + +/** + * @brief Load the gauge fields from host to quda. Notice that the boundary + * fields have to be up2date; i.e. call copy_bnd_hd(), copy_bnd_ud() + * before pass fields into this function. + * + * @param[in] gauge The gauge fields (in openqcd order) + * @param[in] prec Precision of the incoming gauge field + * @param[in] rec How the field should be stored internally in QUDA + * @param[in] t_boundary Time boundary condition + */ +void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary); + +/** + * @brief Save the gauge fields from quda to host. + * + * @param[out] gauge The gauge fields (will be stored in openqcd order) + * @param[in] prec Precision of the outgoing gauge field + * @param[in] rec How the field should be stored internally in QUDA + * @param[in] t_boundary Time boundary condition + */ +void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary); + +/** + * @brief Free the gauge field allocated in quda. + */ +void openQCD_qudaGaugeFree(void); + +/** + * @brief Load the clover fields from host to quda. + * + * @param[in] clover The clover fields (in openqcd order) + * @param[in] kappa The kappa (we need this, because quda has its clover + * field multiplied by kappa and we have to reverse this + * when loading ours) + * @param[in] csw The csw coefficient of the clover field + */ +void openQCD_qudaCloverLoad(void *clover, double kappa, double csw); + +/** + * @brief Free the clover field allocated in quda. + */ +void openQCD_qudaCloverFree(void); + +#ifdef __cplusplus +} +#endif diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index d88ba6a239..871a583a7a 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -79,7 +79,8 @@ set (QUDA_OBJS copy_gauge_offset.cu copy_color_spinor_offset.cu copy_clover_offset.cu staggered_oprod.cu clover_trace_quda.cu hisq_paths_force_quda.cu - unitarize_force_quda.cu unitarize_links_quda.cu milc_interface.cpp + unitarize_force_quda.cu unitarize_links_quda.cu + milc_interface.cpp openqcd_interface.cpp tune.cpp device_vector.cu inv_gmresdr_quda.cpp @@ -424,6 +425,10 @@ if(QUDA_INTERFACE_TIFR OR QUDA_INTERFACE_ALL) target_compile_definitions(quda PUBLIC BUILD_TIFR_INTERFACE) endif(QUDA_INTERFACE_TIFR OR QUDA_INTERFACE_ALL) +if(QUDA_INTERFACE_OPENQCD OR QUDA_INTERFACE_ALL) + target_compile_definitions(quda PUBLIC BUILD_OPENQCD_INTERFACE) +endif(QUDA_INTERFACE_OPENQCD OR QUDA_INTERFACE_ALL) + # MULTI GPU AND USQCD if(QUDA_MPI OR QUDA_QMP) target_compile_definitions(quda PUBLIC MULTI_GPU) diff --git a/lib/check_params.h b/lib/check_params.h index 02ce0e4621..1fb6acd4e3 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -212,6 +212,7 @@ void printQudaEigParam(QudaEigParam *param) { P(use_norm_op, QUDA_BOOLEAN_INVALID); P(compute_svd, QUDA_BOOLEAN_INVALID); P(require_convergence, QUDA_BOOLEAN_INVALID); + P(spectrum, QUDA_SPECTRUM_INVALID); P(n_ev, INVALID_INT); P(n_kr, INVALID_INT); P(n_conv, INVALID_INT); @@ -770,6 +771,7 @@ void printQudaInvertParam(QudaInvertParam *param) { #endif #ifdef INIT_PARAM + P(additional_prop, 0); P(distance_pc_alpha0, 0.0); P(distance_pc_t0, -1); #else diff --git a/lib/checksum.cu b/lib/checksum.cu index b759afde8d..af8374e723 100644 --- a/lib/checksum.cu +++ b/lib/checksum.cu @@ -51,9 +51,12 @@ namespace quda { } else if (u.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) { ChecksumArg arg(u,mini); checksum = ChecksumCPU(arg); + } else if (u.Order() == QUDA_OPENQCD_GAUGE_ORDER) { + ChecksumArg arg(u, mini); + checksum = ChecksumCPU(arg); } else { errorQuda("Checksum not implemented"); - } + } return checksum; } diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp index 2c581886da..11b55e6085 100644 --- a/lib/comm_common.cpp +++ b/lib/comm_common.cpp @@ -102,6 +102,12 @@ namespace quda Topology *topo = new Topology; +#ifdef BUILD_OPENQCD_INTERFACE + topo->cstar = (map_data == nullptr) ? 0 : static_cast(map_data)[0]; +#else + topo->cstar = 0; +#endif + topo->ndim = ndim; int nodes = 1; diff --git a/lib/communicator_stack.cpp b/lib/communicator_stack.cpp index ae36f5b2d0..cd34b9cefe 100644 --- a/lib/communicator_stack.cpp +++ b/lib/communicator_stack.cpp @@ -113,6 +113,8 @@ namespace quda int comm_dim(int dim) { return get_current_communicator().comm_dim(dim); } + bool comm_dim_cstar(int dim) { return get_current_communicator().comm_dim_cstar(dim); } + int comm_dim_global(int dim) { return get_default_communicator().comm_dim(dim); } int comm_coord(int dim) { return get_current_communicator().comm_coord(dim); } diff --git a/lib/copy_clover.cu b/lib/copy_clover.cu index 25d6b06ff5..126650f0e6 100644 --- a/lib/copy_clover.cu +++ b/lib/copy_clover.cu @@ -129,6 +129,12 @@ namespace quda { copyClover, FloatOut, FloatIn>(out, in, inverse, location, Out, In); #else errorQuda("BQCD interface has not been built\n"); +#endif + } else if (in.Order() == QUDA_OPENQCD_CLOVER_ORDER) { +#ifdef BUILD_OPENQCD_INTERFACE + copyClover, FloatOut, FloatIn>(out, in, inverse, location, Out, In); +#else + errorQuda("OpenQCD interface has not been built\n"); #endif } else { errorQuda("Clover field %d order not supported", in.Order()); diff --git a/lib/copy_clover_offset.cu b/lib/copy_clover_offset.cu index f29e663c14..7bec58484c 100644 --- a/lib/copy_clover_offset.cu +++ b/lib/copy_clover_offset.cu @@ -45,6 +45,16 @@ namespace quda CopyFieldOffset copier(arg, in); #else errorQuda("BQCD interface has not been built\n"); +#endif + } else if (in.Order() == QUDA_OPENQCD_CLOVER_ORDER) { +#ifdef BUILD_OPENQCD_INTERFACE + using C = OpenQCDOrder; + C out_accessor(out, inverse); + C in_accessor(in, inverse); + CopyFieldOffsetArg arg(out_accessor, out, in_accessor, in, offset); + CopyFieldOffset copier(arg, in); +#else + errorQuda("OpenQCD interface has not been built\n"); #endif } else { errorQuda("Clover field %d order not supported", in.Order()); diff --git a/lib/copy_color_spinor.cuh b/lib/copy_color_spinor.cuh index 02e6357a35..efe7558289 100644 --- a/lib/copy_color_spinor.cuh +++ b/lib/copy_color_spinor.cuh @@ -27,6 +27,7 @@ namespace quda case QUDA_DEGRAND_ROSSI_GAMMA_BASIS: return "degrand_rossi"; case QUDA_UKQCD_GAMMA_BASIS: return "ukqcd"; case QUDA_CHIRAL_GAMMA_BASIS: return "chiral"; + case QUDA_OPENQCD_GAMMA_BASIS: return "openqcd"; case QUDA_DIRAC_PAULI_GAMMA_BASIS: return "dirac_pauli"; default: errorQuda("Unknown gamma basis %d", basis); } @@ -67,6 +68,10 @@ namespace quda launch(tp, stream, Arg(out, in, Out_, In_)); } else if (out.GammaBasis() == QUDA_CHIRAL_GAMMA_BASIS && in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) { launch(tp, stream, Arg(out, in, Out_, In_)); + } else if (out.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_OPENQCD_GAMMA_BASIS) { + launch(tp, stream, Arg(out, in, Out_, In_)); + } else if (out.GammaBasis() == QUDA_OPENQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) { + launch(tp, stream, Arg(out, in, Out_, In_)); } else if (out.GammaBasis() == QUDA_DIRAC_PAULI_GAMMA_BASIS && in.GammaBasis() == QUDA_DEGRAND_ROSSI_GAMMA_BASIS) { launch(tp, stream, Arg(out, in, Out_, In_)); } else if (out.GammaBasis() == QUDA_DEGRAND_ROSSI_GAMMA_BASIS && in.GammaBasis() == QUDA_DIRAC_PAULI_GAMMA_BASIS) { @@ -122,6 +127,12 @@ namespace quda CopyColorSpinor(out, in, param); else errorQuda("QDPJIT interface has not been built"); + } else if (out.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { + using O = OpenQCDDiracOrder; + if constexpr (is_enabled()) + CopyColorSpinor(out, in, param); + else + errorQuda("OpenQCD interface has not been built"); } else { errorQuda("Order %d not defined (Ns = %d, Nc = %d, precision = %d)", out.FieldOrder(), Ns, Nc, out.Precision()); } @@ -153,6 +164,12 @@ namespace quda genericCopyColorSpinor(param); else errorQuda("QDPJIT interface has not been built"); + } else if (in.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { + using ColorSpinor = OpenQCDDiracOrder; + if constexpr (is_enabled()) + genericCopyColorSpinor(param); + else + errorQuda("OpenQCD interface has not been built"); } else { errorQuda("Order %d not defined (Ns=%d, Nc=%d, precision = %d)", in.FieldOrder(), Ns, Nc, in.Precision()); } diff --git a/lib/copy_gauge_extended.cu b/lib/copy_gauge_extended.cu index 620e9bbe85..dadcdac4f3 100644 --- a/lib/copy_gauge_extended.cu +++ b/lib/copy_gauge_extended.cu @@ -65,7 +65,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT); #endif -#ifdef GPU_STAGGERED_DIRAC +#if defined(GPU_STAGGERED_DIRAC) || defined(BUILD_OPENQCD_INTERFACE) } else if (out.Reconstruct() == QUDA_RECONSTRUCT_13) { #if QUDA_RECONSTRUCT & 2 typedef typename gauge_mapper::type G; @@ -80,7 +80,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT); #endif -#endif // GPU_STAGGERED_DIRAC +#endif // defined(GPU_STAGGERED_DIRAC) || defined(BUILD_OPENQCD_INTERFACE) } else { errorQuda("Reconstruction %d and order %d not supported", out.Reconstruct(), out.Order()); } @@ -111,10 +111,18 @@ namespace quda { errorQuda("TIFR interface has not been built\n"); #endif + } else if (out.Order() == QUDA_OPENQCD_GAUGE_ORDER) { + +#ifdef BUILD_OPENQCD_INTERFACE + using G = OpenQCDOrder; + CopyGaugeEx(out, in, location, Out, In); +#else + errorQuda("OPENQCD interface has not been built"); +#endif + } else { errorQuda("Gauge field %d order not supported", out.Order()); } - } template @@ -139,7 +147,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT); #endif -#ifdef GPU_STAGGERED_DIRAC +#if defined(GPU_STAGGERED_DIRAC) || defined(BUILD_OPENQCD_INTERFACE) } else if (in.Reconstruct() == QUDA_RECONSTRUCT_13) { #if QUDA_RECONSTRUCT & 2 typedef typename gauge_mapper::type G; @@ -154,7 +162,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT); #endif -#endif // GPU_STAGGERED_DIRAC +#endif // defined(GPU_STAGGERED_DIRAC) || defined(BUILD_OPENQCD_INTERFACE) } else { errorQuda("Reconstruction %d and order %d not supported", in.Reconstruct(), in.Order()); } @@ -185,10 +193,17 @@ namespace quda { errorQuda("TIFR interface has not been built\n"); #endif + } else if (in.Order() == QUDA_OPENQCD_GAUGE_ORDER) { +#ifdef BUILD_OPENQCD_INTERFACE + using G = OpenQCDOrder; + copyGaugeEx(out, in, location, Out, In); +#else + errorQuda("OpenQCD interface has not been built\n"); +#endif + } else { errorQuda("Gauge field %d order not supported", in.Order()); } - } template diff --git a/lib/copy_gauge_inc.cu b/lib/copy_gauge_inc.cu index d4f3f1a4fe..ef83b13f7e 100644 --- a/lib/copy_gauge_inc.cu +++ b/lib/copy_gauge_inc.cu @@ -27,7 +27,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT); #endif -#ifdef GPU_STAGGERED_DIRAC +#if defined(GPU_STAGGERED_DIRAC) || defined(BUILD_OPENQCD_INTERFACE) } else if (out.Reconstruct() == QUDA_RECONSTRUCT_13) { #if QUDA_RECONSTRUCT & 2 typedef typename gauge_mapper::type G; @@ -56,7 +56,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT); #endif -#endif // GPU_STAGGERED_DIRAC +#endif // defined(GPU_STAGGERED_DIRAC) || defined(BUILD_OPENQCD_INTERFACE) } else { errorQuda("Reconstruction %d and order %d not supported", out.Reconstruct(), out.Order()); } @@ -131,10 +131,18 @@ namespace quda { errorQuda("TIFR interface has not been built\n"); #endif + } else if (out.Order() == QUDA_OPENQCD_GAUGE_ORDER) { + +#ifdef BUILD_OPENQCD_INTERFACE + copyGauge(OpenQCDOrder(out, Out, outGhost), inOrder, + out, in, location, type); +#else + errorQuda("OPENQCD interface has not been built\n"); +#endif + } else { errorQuda("Gauge field %d order not supported", out.Order()); } - } template @@ -161,7 +169,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT); #endif -#ifdef GPU_STAGGERED_DIRAC +#if defined(GPU_STAGGERED_DIRAC) || defined(BUILD_OPENQCD_INTERFACE) } else if (in.Reconstruct() == QUDA_RECONSTRUCT_13) { #if QUDA_RECONSTRUCT & 2 typedef typename gauge_mapper::type G; @@ -183,7 +191,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT); #endif -#endif // GPU_STAGGERED_DIRAC +#endif // defined(GPU_STAGGERED_DIRAC) || defined(BUILD_OPENQCD_INTERFACE) } else { errorQuda("Reconstruction %d and order %d not supported", in.Reconstruct(), in.Order()); } @@ -259,6 +267,15 @@ namespace quda { errorQuda("TIFR interface has not been built\n"); #endif + } else if (in.Order() == QUDA_OPENQCD_GAUGE_ORDER) { + +#ifdef BUILD_OPENQCD_INTERFACE + copyGauge(OpenQCDOrder(in, In, inGhost), out, in, location, Out, + outGhost, type); +#else + errorQuda("OPENQCD interface has not been built\n"); +#endif + } else { errorQuda("Gauge field order %d not supported", in.Order()); } diff --git a/lib/copy_gauge_offset.cu b/lib/copy_gauge_offset.cu index 9a8494c3a2..6f8bc2cb01 100644 --- a/lib/copy_gauge_offset.cu +++ b/lib/copy_gauge_offset.cu @@ -100,6 +100,13 @@ namespace quda copy_gauge_offset(out, in, offset); #else errorQuda("TIFR interface has not been built\n"); +#endif + } else if (in.Order() == QUDA_OPENQCD_GAUGE_ORDER) { +#ifdef BUILD_OPENQCD_INTERFACE + using G = typename gauge_order_mapper::type; + copy_gauge_offset(out, in, offset); +#else + errorQuda("OpenQCD interface has not been built\n"); #endif } else { errorQuda("Gauge field %d order not supported", in.Order()); diff --git a/lib/dslash_gamma_helper.cu b/lib/dslash_gamma_helper.cu index 9d53ffbd32..fc948f733f 100644 --- a/lib/dslash_gamma_helper.cu +++ b/lib/dslash_gamma_helper.cu @@ -108,6 +108,13 @@ namespace quda { // Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma) void gamma5(cvector_ref &out, cvector_ref &in) { ApplyGamma(out, in, 4); } + /* RG: I have added these */ + void gamma5(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out, in, 4); } + void gamma0(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out, in, 0); } + void gamma1(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out, in, 1); } + void gamma2(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out, in, 2); } + void gamma3(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out, in, 3); } + template class TauApply : public TunableKernel3D { cvector_ref &out; diff --git a/lib/eig_block_trlm.cpp b/lib/eig_block_trlm.cpp index 6e73da086f..3f32d2feee 100644 --- a/lib/eig_block_trlm.cpp +++ b/lib/eig_block_trlm.cpp @@ -204,7 +204,7 @@ namespace quda // Compute eigenvalues computeEvals(kSpace, evals); - if (compute_svd) computeSVD(kSpace, evals); + if (compute_svd) computeSVD(kSpace, evals, eig_param->use_dagger); } // Local clean-up diff --git a/lib/eig_iram.cpp b/lib/eig_iram.cpp index 0dac4410f0..892992228f 100644 --- a/lib/eig_iram.cpp +++ b/lib/eig_iram.cpp @@ -513,7 +513,7 @@ namespace quda // Compute the eigen/singular values. getProfile().TPSTART(QUDA_PROFILE_COMPUTE); computeEvals(kSpace, evals); - if (compute_svd) computeSVD(kSpace, evals); + if (compute_svd) computeSVD(kSpace, evals, eig_param->use_dagger); converged = true; } else { diff --git a/lib/eig_trlm.cpp b/lib/eig_trlm.cpp index 9511810c7f..74920aa4e6 100644 --- a/lib/eig_trlm.cpp +++ b/lib/eig_trlm.cpp @@ -26,7 +26,8 @@ namespace quda beta.resize(n_kr, 0.0); // Thick restart specific checks - if (n_kr < n_ev + 6) errorQuda("n_kr=%d must be greater than n_ev+6=%d\n", n_kr, n_ev + 6); + if (n_kr < n_ev + 6) errorQuda("n_kr=%d must be greater than or equal to n_ev+6=%d\n", n_kr, n_ev + 6); + if (n_kr < n_conv + 12) errorQuda("n_kr=%d must be greater than or equal to n_conv+12=%d\n", n_kr, n_conv + 12); if (!(eig_param->spectrum == QUDA_SPECTRUM_LR_EIG || eig_param->spectrum == QUDA_SPECTRUM_SR_EIG)) { errorQuda("Only real spectrum type (LR or SR) can be passed to the TR Lanczos solver"); @@ -180,7 +181,7 @@ namespace quda // Compute eigenvalues/singular values computeEvals(kSpace, evals); - if (compute_svd) computeSVD(kSpace, evals); + if (compute_svd) computeSVD(kSpace, evals, eig_param->use_dagger); } // Local clean-up diff --git a/lib/eigensolve_quda.cpp b/lib/eigensolve_quda.cpp index 52f4fd1100..6cd36a684e 100644 --- a/lib/eigensolve_quda.cpp +++ b/lib/eigensolve_quda.cpp @@ -496,7 +496,7 @@ namespace quda } } - void EigenSolver::computeSVD(std::vector &evecs, std::vector &evals) + void EigenSolver::computeSVD(std::vector &evecs, std::vector &evals, bool dagger) { logQuda(QUDA_SUMMARIZE, "Computing SVD of M\n"); @@ -521,9 +521,15 @@ namespace quda // Lambda already contains the square root of the eigenvalue of the norm op. - // M*Rev_i = M*Rsv_i = sigma_i Lsv_i - mat.Expose()->M({evecs.begin() + n_conv + lower, evecs.begin() + n_conv + upper}, - {evecs.begin() + lower, evecs.begin() + upper}); + if (dagger) { + // Mdag*Lev_i = Mdag*Lsv_i = sigma_i Rsv_i + mat.Expose()->Mdag({evecs.begin() + n_conv + lower, evecs.begin() + n_conv + upper}, + {evecs.begin() + lower, evecs.begin() + upper}); + } else { + // M*Rev_i = M*Rsv_i = sigma_i Lsv_i + mat.Expose()->M({evecs.begin() + n_conv + lower, evecs.begin() + n_conv + upper}, + {evecs.begin() + lower, evecs.begin() + upper}); + } // sigma_i = sqrt(sigma_i (Lsv_i)^dag * sigma_i * Lsv_i ) auto sigma = blas::norm2({evecs.begin() + n_conv + lower, evecs.begin() + n_conv + upper}); diff --git a/lib/extract_gauge_ghost.in.cu b/lib/extract_gauge_ghost.in.cu index 878de6118b..0639d2eaae 100644 --- a/lib/extract_gauge_ghost.in.cu +++ b/lib/extract_gauge_ghost.in.cu @@ -111,10 +111,17 @@ namespace quda { errorQuda("TIFR interface has not been built"); } + } else if (u.Order() == QUDA_OPENQCD_GAUGE_ORDER) { + + if constexpr (is_enabled()) { + ExtractGhost>(u, Ghost, extract, offset); + } else { + errorQuda("OpenQCD interface has not been built"); + } + } else { errorQuda("Gauge field %d order not supported", u.Order()); } - } }; diff --git a/lib/extract_gauge_ghost_extended.cu b/lib/extract_gauge_ghost_extended.cu index 37c4f7235a..11b2661e6f 100644 --- a/lib/extract_gauge_ghost_extended.cu +++ b/lib/extract_gauge_ghost_extended.cu @@ -142,10 +142,17 @@ namespace quda { errorQuda("TIFR interface has not been built"); } + } else if (u.Order() == QUDA_OPENQCD_GAUGE_ORDER) { + + if constexpr (is_enabled()) { + ExtractGhostEx>(u, dim, R, ghost, extract); + } else { + errorQuda("OpenQCD interface has not been built"); + } + } else { errorQuda("Gauge field %d order not supported", u.Order()); } - } }; diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index 40a1351656..d2fcc56580 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -151,6 +151,25 @@ namespace quda { bytes = site_dim * (x[0] + 4) * (x[1] + 2) * (x[2] + 2) * (x[3] + 2) * nInternal * precision; } else if (order == QUDA_MILC_SITE_GAUGE_ORDER) { bytes = volume * site_size; + } else if (order == QUDA_OPENQCD_GAUGE_ORDER) { + /** + * With an openQCD gauge field, we need all the links of even lattice + * points in positive direction. These are links that lie in the buffer + * space that spans 7*BNDRY/4 gauge fields. These boundary fields are + * located at base_ptr + 4*VOLUME. Therefore, we need to transfer more + * than 4*VOLUME matrices. + */ + + /* analogue to BNDRY in openQCD:include/global.h */ + long int bndry = 0; + bndry += (1 - (comm_dim(0) % 2)) * x[1] * x[2] * x[3]; + bndry += (1 - (comm_dim(1) % 2)) * x[0] * x[2] * x[3]; + bndry += (1 - (comm_dim(2) % 2)) * x[0] * x[1] * x[3]; + bndry += (1 - (comm_dim(3) % 2)) * x[0] * x[1] * x[2]; + bndry *= 2; + + length += 18 * 7 * bndry / 4; + bytes = length * precision; } else { bytes = length * precision; } @@ -190,7 +209,7 @@ namespace quda { } else if (order == QUDA_CPS_WILSON_GAUGE_ORDER || order == QUDA_MILC_GAUGE_ORDER || order == QUDA_BQCD_GAUGE_ORDER || order == QUDA_TIFR_GAUGE_ORDER || order == QUDA_TIFR_PADDED_GAUGE_ORDER - || order == QUDA_MILC_SITE_GAUGE_ORDER) { + || order == QUDA_MILC_SITE_GAUGE_ORDER || order == QUDA_OPENQCD_GAUGE_ORDER) { // does not support device if (order == QUDA_MILC_SITE_GAUGE_ORDER && param.create != QUDA_REFERENCE_FIELD_CREATE) { @@ -1361,7 +1380,8 @@ namespace quda { } } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER - || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) { + || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER + || Order() == QUDA_OPENQCD_GAUGE_ORDER) { std::memcpy(buffer, data(), Bytes()); } else { errorQuda("Unsupported order = %d", Order()); @@ -1381,7 +1401,8 @@ namespace quda { } } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER - || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) { + || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER + || Order() == QUDA_OPENQCD_GAUGE_ORDER) { std::memcpy(data(), buffer, Bytes()); } else { errorQuda("Unsupported order = %d", Order()); diff --git a/lib/instantiate.cpp b/lib/instantiate.cpp index 15d0b85148..079e27628c 100644 --- a/lib/instantiate.cpp +++ b/lib/instantiate.cpp @@ -1,7 +1,7 @@ #include /** - This file contains deinitions required when compiling with C++14. + This file contains definitions required when compiling with C++14. Without these, we can end up with undefined references at link time. We can remove this file when we jump to C++17 and declare these are inline variables in instantiate.h. @@ -12,11 +12,25 @@ namespace quda // declared in instantiate.h constexpr std::array ReconstructFull::recon; +#ifdef BUILD_OPENQCD_INTERFACE + constexpr std::array ReconstructWilson::recon; + constexpr std::array ReconstructStaggered::recon; +#else constexpr std::array ReconstructWilson::recon; constexpr std::array ReconstructStaggered::recon; +#endif constexpr std::array ReconstructNo12::recon; constexpr std::array ReconstructNone::recon; constexpr std::array ReconstructMom::recon; constexpr std::array Reconstruct10::recon; + // declared in dslash.h +#ifdef BUILD_OPENQCD_INTERFACE + constexpr std::array WilsonReconstruct::recon; + constexpr std::array StaggeredReconstruct::recon; +#else + constexpr std::array WilsonReconstruct::recon; + constexpr std::array StaggeredReconstruct::recon; +#endif + } // namespace quda diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp new file mode 100644 index 0000000000..bb37d39f23 --- /dev/null +++ b/lib/openqcd_interface.cpp @@ -0,0 +1,1908 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +static openQCD_QudaState_t qudaState = {false, -1, -1, -1, -1, 0.0, 0.0, 0.0, 0, {}, {}, nullptr, {}, {}, ""}; + +using namespace quda; + +/** + * code for NVTX taken from Jiri Kraus' blog post: + * http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/ + */ + +#ifdef INTERFACE_NVTX + +#if QUDA_NVTX_VERSION == 3 +#include "nvtx3/nvToolsExt.h" +#else +#include "nvToolsExt.h" +#endif + +static const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff}; +static const int num_colors = sizeof(colors) / sizeof(uint32_t); + +#define PUSH_RANGE(name, cid) \ + { \ + int color_id = cid; \ + color_id = color_id % num_colors; \ + nvtxEventAttributes_t eventAttrib = {0}; \ + eventAttrib.version = NVTX_VERSION; \ + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ + eventAttrib.colorType = NVTX_COLOR_ARGB; \ + eventAttrib.color = colors[color_id]; \ + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ + eventAttrib.message.ascii = name; \ + nvtxRangePushEx(&eventAttrib); \ + } +#define POP_RANGE nvtxRangePop(); +#else +#define PUSH_RANGE(name, cid) +#define POP_RANGE +#endif + +#define QUDA_OPENQCD_VERBOSE 1 + +template void inline qudaopenqcd_called(const char *func, QudaVerbosity verb) +{ + /* add NVTX markup if enabled */ + if (start) { + PUSH_RANGE(func, 1); + } else { + POP_RANGE; + } + +#ifdef QUDA_OPENQCD_VERBOSE + if (verb >= QUDA_VERBOSE) { + if (start) { + printfQuda("QUDA_OPENQCD_INTERFACE: %s (called) \n", func); + } else { + printfQuda("QUDA_OPENQCD_INTERFACE: %s (return) \n", func); + } + } +#endif +} + +template void inline qudaopenqcd_called(const char *func) +{ + qudaopenqcd_called(func, getVerbosity()); +} + +/** + * Mapping of enums to their actual values. We have this mapping such that we + * can use the named parameters in our input files rather than the number. this + * makes reading and writing the configuration more understandable. + */ +std::unordered_map enum_map + = {{"QUDA_CG_INVERTER", std::to_string(QUDA_CG_INVERTER)}, + {"QUDA_BICGSTAB_INVERTER", std::to_string(QUDA_BICGSTAB_INVERTER)}, + {"QUDA_GCR_INVERTER", std::to_string(QUDA_GCR_INVERTER)}, + {"QUDA_MR_INVERTER", std::to_string(QUDA_MR_INVERTER)}, + {"QUDA_SD_INVERTER", std::to_string(QUDA_SD_INVERTER)}, + {"QUDA_PCG_INVERTER", std::to_string(QUDA_PCG_INVERTER)}, + {"QUDA_EIGCG_INVERTER", std::to_string(QUDA_EIGCG_INVERTER)}, + {"QUDA_INC_EIGCG_INVERTER", std::to_string(QUDA_INC_EIGCG_INVERTER)}, + {"QUDA_GMRESDR_INVERTER", std::to_string(QUDA_GMRESDR_INVERTER)}, + {"QUDA_GMRESDR_PROJ_INVERTER", std::to_string(QUDA_GMRESDR_PROJ_INVERTER)}, + {"QUDA_GMRESDR_SH_INVERTER", std::to_string(QUDA_GMRESDR_SH_INVERTER)}, + {"QUDA_FGMRESDR_INVERTER", std::to_string(QUDA_FGMRESDR_INVERTER)}, + {"QUDA_MG_INVERTER", std::to_string(QUDA_MG_INVERTER)}, + {"QUDA_BICGSTABL_INVERTER", std::to_string(QUDA_BICGSTABL_INVERTER)}, + {"QUDA_CGNE_INVERTER", std::to_string(QUDA_CGNE_INVERTER)}, + {"QUDA_CGNR_INVERTER", std::to_string(QUDA_CGNR_INVERTER)}, + {"QUDA_CG3_INVERTER", std::to_string(QUDA_CG3_INVERTER)}, + {"QUDA_CG3NE_INVERTER", std::to_string(QUDA_CG3NE_INVERTER)}, + {"QUDA_CG3NR_INVERTER", std::to_string(QUDA_CG3NR_INVERTER)}, + {"QUDA_CA_CG_INVERTER", std::to_string(QUDA_CA_CG_INVERTER)}, + {"QUDA_CA_CGNE_INVERTER", std::to_string(QUDA_CA_CGNE_INVERTER)}, + {"QUDA_CA_CGNR_INVERTER", std::to_string(QUDA_CA_CGNR_INVERTER)}, + {"QUDA_CA_GCR_INVERTER", std::to_string(QUDA_CA_GCR_INVERTER)}, + {"QUDA_INVALID_INVERTER", std::to_string(QUDA_INVALID_INVERTER)}, + {"QUDA_MAT_SOLUTION", std::to_string(QUDA_MAT_SOLUTION)}, + {"QUDA_MATDAG_MAT_SOLUTION", std::to_string(QUDA_MATDAG_MAT_SOLUTION)}, + {"QUDA_MATPC_SOLUTION", std::to_string(QUDA_MATPC_SOLUTION)}, + {"QUDA_MATPC_DAG_SOLUTION", std::to_string(QUDA_MATPC_DAG_SOLUTION)}, + {"QUDA_MATPCDAG_MATPC_SOLUTION", std::to_string(QUDA_MATPCDAG_MATPC_SOLUTION)}, + {"QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION", std::to_string(QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION)}, + {"QUDA_INVALID_SOLUTION", std::to_string(QUDA_INVALID_SOLUTION)}, + {"QUDA_DIRECT_SOLVE", std::to_string(QUDA_DIRECT_SOLVE)}, + {"QUDA_NORMOP_SOLVE", std::to_string(QUDA_NORMOP_SOLVE)}, + {"QUDA_DIRECT_PC_SOLVE", std::to_string(QUDA_DIRECT_PC_SOLVE)}, + {"QUDA_NORMOP_PC_SOLVE", std::to_string(QUDA_NORMOP_PC_SOLVE)}, + {"QUDA_NORMERR_SOLVE", std::to_string(QUDA_NORMERR_SOLVE)}, + {"QUDA_NORMERR_PC_SOLVE", std::to_string(QUDA_NORMERR_PC_SOLVE)}, + {"QUDA_NORMEQ_SOLVE", std::to_string(QUDA_NORMEQ_SOLVE)}, + {"QUDA_NORMEQ_PC_SOLVE", std::to_string(QUDA_NORMEQ_PC_SOLVE)}, + {"QUDA_INVALID_SOLVE", std::to_string(QUDA_INVALID_SOLVE)}, + {"QUDA_MATPC_EVEN_EVEN", std::to_string(QUDA_MATPC_EVEN_EVEN)}, + {"QUDA_MATPC_ODD_ODD", std::to_string(QUDA_MATPC_ODD_ODD)}, + {"QUDA_MATPC_EVEN_EVEN_ASYMMETRIC", std::to_string(QUDA_MATPC_EVEN_EVEN_ASYMMETRIC)}, + {"QUDA_MATPC_ODD_ODD_ASYMMETRIC", std::to_string(QUDA_MATPC_ODD_ODD_ASYMMETRIC)}, + {"QUDA_MATPC_INVALID", std::to_string(QUDA_MATPC_INVALID)}, + {"QUDA_DEFAULT_NORMALIZATION", std::to_string(QUDA_DEFAULT_NORMALIZATION)}, + {"QUDA_SOURCE_NORMALIZATION", std::to_string(QUDA_SOURCE_NORMALIZATION)}, + {"QUDA_QUARTER_PRECISION", std::to_string(QUDA_QUARTER_PRECISION)}, + {"QUDA_HALF_PRECISION", std::to_string(QUDA_HALF_PRECISION)}, + {"QUDA_SINGLE_PRECISION", std::to_string(QUDA_SINGLE_PRECISION)}, + {"QUDA_DOUBLE_PRECISION", std::to_string(QUDA_DOUBLE_PRECISION)}, + {"QUDA_INVALID_PRECISION", std::to_string(QUDA_INVALID_PRECISION)}, + {"QUDA_BOOLEAN_FALSE", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"false", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"FALSE", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"no", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"n", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"off", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"QUDA_BOOLEAN_TRUE", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"true", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"TRUE", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"yes", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"y", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"on", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"QUDA_BOOLEAN_INVALID", std::to_string(QUDA_BOOLEAN_INVALID)}, + {"QUDA_COMPUTE_NULL_VECTOR_NO", std::to_string(QUDA_COMPUTE_NULL_VECTOR_NO)}, + {"QUDA_COMPUTE_NULL_VECTOR_YES", std::to_string(QUDA_COMPUTE_NULL_VECTOR_YES)}, + {"QUDA_COMPUTE_NULL_VECTOR_INVALID", std::to_string(QUDA_COMPUTE_NULL_VECTOR_INVALID)}, + {"QUDA_MG_CYCLE_VCYCLE", std::to_string(QUDA_MG_CYCLE_VCYCLE)}, + {"QUDA_MG_CYCLE_FCYCLE", std::to_string(QUDA_MG_CYCLE_FCYCLE)}, + {"QUDA_MG_CYCLE_WCYCLE", std::to_string(QUDA_MG_CYCLE_WCYCLE)}, + {"QUDA_MG_CYCLE_RECURSIVE", std::to_string(QUDA_MG_CYCLE_RECURSIVE)}, + {"QUDA_MG_CYCLE_INVALID", std::to_string(QUDA_MG_CYCLE_INVALID)}, + {"QUDA_CPU_FIELD_LOCATION", std::to_string(QUDA_CPU_FIELD_LOCATION)}, + {"QUDA_CUDA_FIELD_LOCATION", std::to_string(QUDA_CUDA_FIELD_LOCATION)}, + {"QUDA_INVALID_FIELD_LOCATION", std::to_string(QUDA_INVALID_FIELD_LOCATION)}, + {"QUDA_TWIST_SINGLET", std::to_string(QUDA_TWIST_SINGLET)}, + {"QUDA_TWIST_NONDEG_DOUBLET", std::to_string(QUDA_TWIST_NONDEG_DOUBLET)}, + {"QUDA_TWIST_NO", std::to_string(QUDA_TWIST_NO)}, + {"QUDA_TWIST_INVALID", std::to_string(QUDA_TWIST_INVALID)}, + {"QUDA_DAG_NO", std::to_string(QUDA_DAG_NO)}, + {"QUDA_DAG_YES", std::to_string(QUDA_DAG_YES)}, + {"QUDA_DAG_INVALID", std::to_string(QUDA_DAG_INVALID)}, + {"QUDA_KAPPA_NORMALIZATION", std::to_string(QUDA_KAPPA_NORMALIZATION)}, + {"QUDA_MASS_NORMALIZATION", std::to_string(QUDA_MASS_NORMALIZATION)}, + {"QUDA_ASYMMETRIC_MASS_NORMALIZATION", std::to_string(QUDA_ASYMMETRIC_MASS_NORMALIZATION)}, + {"QUDA_INVALID_NORMALIZATION", std::to_string(QUDA_INVALID_NORMALIZATION)}, + {"QUDA_PRESERVE_SOURCE_NO", std::to_string(QUDA_PRESERVE_SOURCE_NO)}, + {"QUDA_PRESERVE_SOURCE_YES", std::to_string(QUDA_PRESERVE_SOURCE_YES)}, + {"QUDA_PRESERVE_SOURCE_INVALID", std::to_string(QUDA_PRESERVE_SOURCE_INVALID)}, + {"QUDA_USE_INIT_GUESS_NO", std::to_string(QUDA_USE_INIT_GUESS_NO)}, + {"QUDA_USE_INIT_GUESS_YES", std::to_string(QUDA_USE_INIT_GUESS_YES)}, + {"QUDA_USE_INIT_GUESS_INVALID", std::to_string(QUDA_USE_INIT_GUESS_INVALID)}, + {"QUDA_SILENT", std::to_string(QUDA_SILENT)}, + {"QUDA_SUMMARIZE", std::to_string(QUDA_SUMMARIZE)}, + {"QUDA_VERBOSE", std::to_string(QUDA_VERBOSE)}, + {"QUDA_DEBUG_VERBOSE", std::to_string(QUDA_DEBUG_VERBOSE)}, + {"QUDA_INVALID_VERBOSITY", std::to_string(QUDA_INVALID_VERBOSITY)}, + {"QUDA_POWER_BASIS", std::to_string(QUDA_POWER_BASIS)}, + {"QUDA_CHEBYSHEV_BASIS", std::to_string(QUDA_CHEBYSHEV_BASIS)}, + {"QUDA_INVALID_BASIS", std::to_string(QUDA_INVALID_BASIS)}, + {"QUDA_ADDITIVE_SCHWARZ", std::to_string(QUDA_ADDITIVE_SCHWARZ)}, + {"QUDA_MULTIPLICATIVE_SCHWARZ", std::to_string(QUDA_MULTIPLICATIVE_SCHWARZ)}, + {"QUDA_INVALID_SCHWARZ", std::to_string(QUDA_INVALID_SCHWARZ)}, + {"QUDA_MADWF_ACCELERATOR", std::to_string(QUDA_MADWF_ACCELERATOR)}, + {"QUDA_INVALID_ACCELERATOR", std::to_string(QUDA_INVALID_ACCELERATOR)}, + {"QUDA_L2_RELATIVE_RESIDUAL", std::to_string(QUDA_L2_RELATIVE_RESIDUAL)}, + {"QUDA_L2_ABSOLUTE_RESIDUAL", std::to_string(QUDA_L2_ABSOLUTE_RESIDUAL)}, + {"QUDA_HEAVY_QUARK_RESIDUAL", std::to_string(QUDA_HEAVY_QUARK_RESIDUAL)}, + {"QUDA_INVALID_RESIDUAL", std::to_string(QUDA_INVALID_RESIDUAL)}, + {"QUDA_NULL_VECTOR_SETUP", std::to_string(QUDA_NULL_VECTOR_SETUP)}, + {"QUDA_TEST_VECTOR_SETUP", std::to_string(QUDA_TEST_VECTOR_SETUP)}, + {"QUDA_INVALID_SETUP_TYPE", std::to_string(QUDA_INVALID_SETUP_TYPE)}, + {"QUDA_TRANSFER_AGGREGATE", std::to_string(QUDA_TRANSFER_AGGREGATE)}, + {"QUDA_TRANSFER_COARSE_KD", std::to_string(QUDA_TRANSFER_COARSE_KD)}, + {"QUDA_TRANSFER_OPTIMIZED_KD", std::to_string(QUDA_TRANSFER_OPTIMIZED_KD)}, + {"QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG", std::to_string(QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG)}, + {"QUDA_TRANSFER_INVALID", std::to_string(QUDA_TRANSFER_INVALID)}, + {"QUDA_EIG_TR_LANCZOS", std::to_string(QUDA_EIG_TR_LANCZOS)}, + {"QUDA_EIG_BLK_TR_LANCZOS", std::to_string(QUDA_EIG_BLK_TR_LANCZOS)}, + {"QUDA_EIG_IR_ARNOLDI", std::to_string(QUDA_EIG_IR_ARNOLDI)}, + {"QUDA_EIG_BLK_IR_ARNOLDI", std::to_string(QUDA_EIG_BLK_IR_ARNOLDI)}, + {"QUDA_EIG_INVALID", std::to_string(QUDA_EIG_INVALID)}, + {"QUDA_SPECTRUM_LM_EIG", std::to_string(QUDA_SPECTRUM_LM_EIG)}, + {"QUDA_SPECTRUM_SM_EIG", std::to_string(QUDA_SPECTRUM_SM_EIG)}, + {"QUDA_SPECTRUM_LR_EIG", std::to_string(QUDA_SPECTRUM_LR_EIG)}, + {"QUDA_SPECTRUM_SR_EIG", std::to_string(QUDA_SPECTRUM_SR_EIG)}, + {"QUDA_SPECTRUM_LI_EIG", std::to_string(QUDA_SPECTRUM_LI_EIG)}, + {"QUDA_SPECTRUM_SI_EIG", std::to_string(QUDA_SPECTRUM_SI_EIG)}, + {"QUDA_SPECTRUM_INVALID", std::to_string(QUDA_SPECTRUM_INVALID)}, + {"QUDA_MEMORY_DEVICE", std::to_string(QUDA_MEMORY_DEVICE)}, + {"QUDA_MEMORY_DEVICE_PINNED", std::to_string(QUDA_MEMORY_DEVICE_PINNED)}, + {"QUDA_MEMORY_HOST", std::to_string(QUDA_MEMORY_HOST)}, + {"QUDA_MEMORY_HOST_PINNED", std::to_string(QUDA_MEMORY_HOST_PINNED)}, + {"QUDA_MEMORY_MAPPED", std::to_string(QUDA_MEMORY_MAPPED)}, + {"QUDA_MEMORY_MANAGED", std::to_string(QUDA_MEMORY_MANAGED)}, + {"QUDA_MEMORY_INVALID", std::to_string(QUDA_MEMORY_INVALID)}, + {"QUDA_CUSOLVE_EXTLIB", std::to_string(QUDA_CUSOLVE_EXTLIB)}, + {"QUDA_EIGEN_EXTLIB", std::to_string(QUDA_EIGEN_EXTLIB)}, + {"QUDA_EXTLIB_INVALID", std::to_string(QUDA_EXTLIB_INVALID)}}; + +/** + * @brief Just a simple key-value store + */ +class KeyValueStore +{ +private: + std::unordered_map>> store; + std::unordered_map *map = nullptr; + std::string filename = ""; + +public: + /** + * @brief Sets a key value pair + * + * @param[in] section The section + * @param[in] key The key + * @param[in] value The value + */ + void set(const std::string §ion, const std::string &key, const std::string &value) + { + if (map != nullptr) { + auto mvalue = map->find(value); + if (mvalue != map->end()) { + std::get<0>(store[section][key]) = mvalue->second; + std::get<1>(store[section][key]) = value; + return; + } + } + std::get<0>(store[section][key]) = value; + std::get<1>(store[section][key]) = value; + } + + void set_map(std::unordered_map *_map) { map = _map; } + + bool section_exists(const std::string §ion) { return store.find(section) != store.end(); } + + /** + * @brief Gets the specified key. + * + * @param[in] section The section + * @param[in] key The key + * @param[in] default_value The default value if section/key is absent + * + * @tparam T Desired return type + * + * @return The corresponding value + */ + template + T get(const std::string §ion, const std::string &key, T default_value = T(), bool fail = false) + { + int idx; + std::string rkey; + std::smatch match; + std::regex p_key("([^\\[]+)\\[(\\d+)\\]"); /* key[idx] */ + auto sec = store.find(section); + + if (sec != store.end()) { + if (std::regex_search(key, match, p_key)) { + rkey = match[1]; + idx = std::stoi(match[2]); + } else { + rkey = key; + idx = 0; + } + + auto item = sec->second.find(rkey); + if (item != sec->second.end()) { + std::stringstream ss(std::get<0>(item->second)); + if constexpr (std::is_enum_v) { + typename std::underlying_type::type result, dummy; + for (int i = 0; i < idx; i++) { ss >> dummy; } + if (ss >> result) { return static_cast(result); } + } else { + T result, dummy; + for (int i = 0; i < idx; i++) { ss >> dummy; } + if (ss >> result) { return result; } + } + } + } + if (fail) { + errorQuda("Key \"%s\" in section \"%s\" in file %s does not exist.", key.c_str(), section.c_str(), + filename.c_str()); + } + return default_value; /* Return default value for non-existent keys */ + } + + /** + * @brief Fill the store with entries from an ini-file + * + * @param[in] fname The fname + */ + void load(const std::string &fname) + { + std::string line, section; + std::smatch match; + filename = fname; + std::ifstream file(filename.c_str()); + + std::regex p_section("^\\s*\\[([\\w\\ ]+)\\].*$"); /* [section] */ + std::regex p_comment("^[^#]*(\\s*#.*)$"); /* line # comment */ + std::regex p_key_val("^([^\\s]+)\\s+(.*[^\\s]+)\\s*$"); /* key value */ + + if (file.is_open()) { + + while (std::getline(file, line)) { + + /* remove all comments */ + if (std::regex_search(line, match, p_comment)) { line.erase(match.position(1)); } + + if (std::regex_search(line, match, p_section)) { + section = match[1]; + } else if (std::regex_search(line, match, p_key_val)) { + std::string key = match[1]; + std::string val = match[2]; + this->set(section, key, val); + } + } + + file.close(); + } else { + std::cerr << "Error opening file: " << filename << std::endl; + } + } + + /** + * @brief Dumps all entries in the store. + */ + void dump(std::string _section = "") + { + for (const auto §ion : store) { + if (_section == "" || _section == section.first) { + std::cout << "[" << section.first << "]" << std::endl; + for (const auto &pair : section.second) { + std::cout << " " << pair.first << " = " << std::get<1>(pair.second); + if (std::get<0>(pair.second) != std::get<1>(pair.second)) { std::cout << " # " << std::get<0>(pair.second); } + std::cout << std::endl; + } + } + } + } +}; + +/** + * @brief Returns the local lattice dimensions as lat_dim_t + * + * @return The local dimensions. + */ +static lat_dim_t get_local_dims(int *fill = nullptr) +{ + lat_dim_t X; + + for (int i = 0; i < 4; i++) { + if (fill) { + fill[i] = qudaState.layout.L[i]; + } else { + X[i] = qudaState.layout.L[i]; + } + } + + return X; +} + +/** + * @brief Calculate the rank from coordinates. + * + * @param[in] coords coords is the 4D cartesian coordinate of a rank + * @param[in] fdata should point to an instance of qudaLayout.ranks, + * @see struct openQCD_QudaLayout_t in + * @file include/quda_openqcd_interface.h + * + * @return rank + */ +int rankFromCoords(const int *coords, void *fdata) +{ + int *base = static_cast(fdata); + int *NPROC = base + 1; + int *ranks = base + 5; + int i; + + i = coords[3] + NPROC[3] * (coords[2] + NPROC[2] * (coords[1] + NPROC[1] * (coords[0]))); + return ranks[i]; +} + +/** + * Set set the local dimensions and machine topology for QUDA to use + * + * @param layout Struct defining local dimensions and machine topology + * @param infile Input file + */ +void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout, char *infile) +{ + int my_rank; + char prefix[20]; + + for (int dir = 0; dir < 4; ++dir) { + if (layout.N[dir] % 2 != 0) { + errorQuda("Error: Odd lattice dimensions are not supported\n"); + exit(1); + } + } + +#ifdef MULTI_GPU +/* TODO: would we ever want to run with QMP COMMS? */ +#ifdef QMP_COMMS + initCommsGridQuda(4, layout.nproc, nullptr, nullptr); +#else + initCommsGridQuda(4, layout.nproc, rankFromCoords, (void *)(layout.data)); +#endif + static int device = -1; /* enable a default allocation of devices to processes */ +#else + static int device = layout.device; +#endif + + /* must happen *after* communication initialization */ + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + sprintf(prefix, "QUDA (rank=%d): ", my_rank); + + strcpy(qudaState.infile, infile); + if (my_rank == 0) { + KeyValueStore kv; + kv.set_map(&enum_map); + kv.load(qudaState.infile); + qudaState.init.verbosity = kv.get("QUDA", "verbosity", qudaState.init.verbosity); + } + + MPI_Bcast((void *)&qudaState.init.verbosity, sizeof(qudaState.init.verbosity), MPI_INT, 0, MPI_COMM_WORLD); + setVerbosityQuda(qudaState.init.verbosity, prefix, qudaState.init.logfile); + initQuda(device); +} + +static int getLinkPadding(const int dim[4]) +{ + int padding = MAX(dim[1] * dim[2] * dim[3] / 2, dim[0] * dim[2] * dim[3] / 2); + padding = MAX(padding, dim[0] * dim[1] * dim[3] / 2); + padding = MAX(padding, dim[0] * dim[1] * dim[2] / 2); + return padding; +} + +/** + * @brief Creates a new quda parameter struct + * + * @return The quda parameter struct. + */ +static QudaInvertParam newOpenQCDParam(void) +{ + static const QudaVerbosity verbosity = getVerbosity(); + + QudaInvertParam param = newQudaInvertParam(); + + param.verbosity = verbosity; + + param.cpu_prec = QUDA_DOUBLE_PRECISION; /* The precision used by the input fermion fields */ + param.cuda_prec = QUDA_DOUBLE_PRECISION; /* The precision used by the QUDA solver */ + param.cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; /* The precision used by the QUDA eigensolver */ + + param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION; /* The precision used by the QUDA solver */ + param.cuda_prec_precondition = QUDA_HALF_PRECISION; /* The precision used by the QUDA solver */ + + /** + * The order of the input and output fermion fields. Imposes fieldOrder = + * QUDA_OPENQCD_FIELD_ORDER in color_spinor_field.h and + * QUDA_OPENQCD_FIELD_ORDER makes quda to instantiate OpenQCDDiracOrder. + */ + param.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; + + /** + * Gamma basis of the input and output host fields. Specifies the basis change + * into QUDAs internal gamma basis. Note that QUDA applies the basis change U + * to a spinor field when uploading and U^dagger when downloading. + */ + param.gamma_basis = QUDA_OPENQCD_GAMMA_BASIS; + + return param; +} + +/** + * @brief Initialize quda gauge param struct + * + * @param[in] prec Precision + * @param[in] rec QUDA internal gauge field format + * @param[in] t_boundary Time boundary condition + * + * @return The quda gauge parameter struct. + */ +static QudaGaugeParam newOpenQCDGaugeParam(QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary) +{ + QudaGaugeParam param = newQudaGaugeParam(); + + get_local_dims(param.X); + param.cuda_prec_sloppy = param.cpu_prec = param.cuda_prec = prec; + param.type = QUDA_SU3_LINKS; + + param.reconstruct_sloppy = param.reconstruct = rec; + + /* This makes quda to instantiate OpenQCDOrder */ + param.gauge_order = QUDA_OPENQCD_GAUGE_ORDER; + + param.t_boundary = t_boundary; + param.gauge_fix = QUDA_GAUGE_FIXED_NO; + param.scale = 1.0; + param.anisotropy = 1.0; /* 1.0 means not anisotropic */ + param.ga_pad = getLinkPadding(param.X); /* Why this? */ + + return param; +} + +void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout, char *infile) +{ + if (qudaState.initialized) return; + qudaState.init = init; + qudaState.layout = layout; + + qudaopenqcd_called(__func__); + openQCD_qudaSetLayout(qudaState.layout, infile); + qudaopenqcd_called(__func__); + qudaState.initialized = true; +} + +void openQCD_qudaFinalize(void) +{ + for (int id = 0; id < OPENQCD_MAX_INVERTERS; ++id) { + if (qudaState.inv_handles[id] != nullptr) { openQCD_qudaSolverDestroy(id); } + } + + for (int id = 0; id < OPENQCD_MAX_EIGENSOLVERS; ++id) { + if (qudaState.eig_handles[id] != nullptr) { openQCD_qudaEigensolverDestroy(id); } + } + + qudaState.initialized = false; + endQuda(); +} + +double openQCD_qudaPlaquette(void) +{ + double plaq[3]; + + plaqQuda(plaq); + + /* Note different Nc normalization wrt openQCD! */ + return 3.0 * plaq[0]; +} + +void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary) +{ + QudaGaugeParam param = newOpenQCDGaugeParam(prec, rec, t_boundary); + loadGaugeQuda(gauge, ¶m); +} + +void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary) +{ + QudaGaugeParam param = newOpenQCDGaugeParam(prec, rec, t_boundary); + + void *buffer = pool_pinned_malloc((4 * qudaState.init.volume + 7 * qudaState.init.bndry / 4) * 18 * prec); + saveGaugeQuda(buffer, ¶m); + qudaState.init.reorder_gauge_quda_to_openqcd(buffer, gauge); + pool_pinned_free(buffer); +} + +void openQCD_qudaGaugeFree(void) { freeGaugeQuda(); } + +void openQCD_qudaCloverLoad(void *clover, double kappa, double csw) +{ + QudaInvertParam param = newOpenQCDParam(); + param.clover_order = QUDA_OPENQCD_CLOVER_ORDER; + param.dslash_type = QUDA_CLOVER_WILSON_DSLASH; + param.clover_cpu_prec = QUDA_DOUBLE_PRECISION; + param.clover_cuda_prec = QUDA_DOUBLE_PRECISION; + param.clover_cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; + + param.kappa = kappa; + param.clover_csw = csw; + param.clover_coeff = 0.0; + + loadCloverQuda(clover, NULL, ¶m); +} + +void openQCD_qudaCloverFree(void) { freeCloverQuda(); } + +/** + * @brief Set the su3csw corfficient and all related properties. + * + * @param param The parameter struct + * @param[in] su3csw The su3csw coefficient + */ +inline void set_su3csw(QudaInvertParam *param, double su3csw) +{ + param->clover_csw = su3csw; + if (su3csw != 0.0) { + param->clover_location = QUDA_CUDA_FIELD_LOCATION; /* seems to have no effect? */ + param->clover_cpu_prec = QUDA_DOUBLE_PRECISION; + param->clover_cuda_prec = QUDA_DOUBLE_PRECISION; + param->clover_cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; + + param->clover_coeff = 0.0; + + /* Set to Wilson Dirac operator with Clover term */ + param->dslash_type = QUDA_CLOVER_WILSON_DSLASH; + + if (qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3) { + param->clover_order = QUDA_FLOAT8_CLOVER_ORDER; /* what implication has this? */ + param->compute_clover = true; + } else { + param->clover_order = QUDA_OPENQCD_CLOVER_ORDER; + } + } +} + +/** + * @brief Creates a new quda Dirac parameter struct + * + * @param[in] p OpenQCD Dirac parameter struct + * + * @return The quda Dirac parameter struct. + */ +static QudaInvertParam newOpenQCDDiracParam(openQCD_QudaDiracParam_t p) +{ + QudaInvertParam param = newOpenQCDParam(); + + param.dslash_type = QUDA_WILSON_DSLASH; + param.kappa = p.kappa; + param.mu = p.mu; + param.dagger = QUDA_DAG_NO; + param.matpc_type = QUDA_MATPC_EVEN_EVEN; + + set_su3csw(¶m, p.su3csw); + + param.inv_type = QUDA_CG_INVERTER; /* just set some, needed? */ + + /* What is the difference? only works with QUDA_MASS_NORMALIZATION */ + param.mass_normalization = QUDA_MASS_NORMALIZATION; + + /* Extent of the 5th dimension (for domain wall) */ + param.Ls = 1; + + return param; +} + +void openQCD_back_and_forth(void *h_in, void *h_out) +{ + /* sets up the necessary parameters */ + QudaInvertParam param = newOpenQCDParam(); + + /* creates a field on the CPU */ + ColorSpinorParam cpuParam(h_in, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); + ColorSpinorField in_h(cpuParam); + + /* creates a field on the GPU with the same parameter set as the CPU field */ + ColorSpinorParam cudaParam(cpuParam, param, QUDA_CUDA_FIELD_LOCATION); + ColorSpinorField in(cudaParam); + + /* transfer the CPU field to GPU */ + in = in_h; + + /* creates a field on the CPU */ + cpuParam.v = h_out; + cpuParam.location = QUDA_CPU_FIELD_LOCATION; + ColorSpinorField out_h(cpuParam); + + /* creates a zero-field on the GPU */ + cudaParam.create = QUDA_NULL_FIELD_CREATE; + cudaParam.location = QUDA_CUDA_FIELD_LOCATION; + ColorSpinorField out(cudaParam); + + out = in; + + /* transfer the GPU field back to CPU */ + out_h = out; +} + +int openQCD_qudaIndexIpt(const int *x) +{ + int L_openqcd[4]; + openqcd::rotate_coords(qudaState.layout.L, L_openqcd); + return openqcd::ipt(x, L_openqcd); +} + +int openQCD_qudaIndexIup(const int *x, const int mu) +{ + int L_openqcd[4], nproc_openqcd[4]; + openqcd::rotate_coords(qudaState.layout.L, L_openqcd); + openqcd::rotate_coords(qudaState.layout.nproc, nproc_openqcd); + return openqcd::iup(x, mu, L_openqcd, nproc_openqcd); +} + +double openQCD_qudaNorm(void *h_in) +{ + QudaInvertParam param = newOpenQCDParam(); + + ColorSpinorParam cpuParam(h_in, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); + ColorSpinorField in_h(cpuParam); + + ColorSpinorParam cudaParam(cpuParam, param, QUDA_CUDA_FIELD_LOCATION); + ColorSpinorField in(cudaParam); + + in = in_h; + + return blas::norm2(in); +} + +double openQCD_qudaNorm_NoLoads(void *d_in) { return blas::norm2(*reinterpret_cast(d_in)); } + +void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out) +{ + /* sets up the necessary parameters */ + QudaInvertParam param = newOpenQCDParam(); + + /* creates a field on the CPU */ + ColorSpinorParam cpuParam(openQCD_in, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); + ColorSpinorField in_h(cpuParam); + + /* creates a field on the GPU with the same parameter set as the CPU field */ + ColorSpinorParam cudaParam(cpuParam, param, QUDA_CUDA_FIELD_LOCATION); + ColorSpinorField in(cudaParam); + + /* transfer the CPU field to GPU */ + in = in_h; + + /* creates a zero-field on the GPU */ + cudaParam.create = QUDA_NULL_FIELD_CREATE; + cudaParam.location = QUDA_CUDA_FIELD_LOCATION; + ColorSpinorField out(cudaParam); + + /* gamma_i run within QUDA using QUDA fields */ + switch (dir) { + case 0: /* t direction */ gamma3(out, in); break; + case 1: /* x direction */ gamma0(out, in); break; + case 2: /* y direction */ gamma1(out, in); break; + case 3: /* z direction */ gamma2(out, in); break; + case 4: + case 5: + gamma5(out, in); + /* UKQCD uses a different convention for Gamma matrices: + * gamma5_ukqcd = gammax gammay gammaz gammat, + * gamma5_openqcd = gammat gammax gammay gammaz, + * and thus + * gamma5_openqcd = -1 * U gamma5_ukqcd U^dagger, + * with U the transformation matrix from OpenQCD to UKQCD. */ + blas::ax(-1.0, out); + break; + default: errorQuda("Unknown gamma: %d\n", dir); + } + + /* creates a field on the CPU */ + cpuParam.v = openQCD_out; + cpuParam.location = QUDA_CPU_FIELD_LOCATION; + ColorSpinorField out_h(cpuParam); + + /* transfer the GPU field back to CPU */ + out_h = out; +} + +void *openQCD_qudaH2D(void *openQCD_field) +{ + /* sets up the necessary parameters */ + QudaInvertParam param = newOpenQCDParam(); + + /* creates a field on the CPU */ + ColorSpinorParam cpuParam(openQCD_field, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); + ColorSpinorField in_h(cpuParam); + + /* creates a field on the GPU with the same parameter set as the CPU field */ + ColorSpinorParam cudaParam(cpuParam, param, QUDA_CUDA_FIELD_LOCATION); + ColorSpinorField *in = new ColorSpinorField(cudaParam); + + *in = in_h; /* transfer the CPU field to GPU */ + + return in; +} + +void openQCD_qudaSpinorFree(void **quda_field) +{ + delete reinterpret_cast(*quda_field); + *quda_field = nullptr; +} + +void openQCD_qudaD2H(void *quda_field, void *openQCD_field) +{ + int my_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + + /* sets up the necessary parameters */ + QudaInvertParam param = newOpenQCDParam(); + + /* creates a field on the CPU */ + ColorSpinorParam cpuParam(openQCD_field, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); + ColorSpinorField out_h(cpuParam); + + /* transfer the GPU field to CPU */ + out_h = *reinterpret_cast(quda_field); +} + +/** + * @brief Check whether the gauge field from openQCD is in sync with the + * one from QUDA. + * + * @return true/false + */ +inline bool gauge_field_get_up2date(void) +{ + int ud_rev = -2, ad_rev = -2; + + /* get current residing gauge field revision (residing in openqxd) */ + qudaState.layout.get_gfld_flags(&ud_rev, &ad_rev); + return ud_rev == qudaState.ud_rev && ad_rev == qudaState.ad_rev; +} + +/** + * @brief Check whether the gauge field is not (yet) set in openQCD. + * + * @return true/false + */ +inline bool gauge_field_get_unset(void) +{ + int ud_rev = -2, ad_rev = -2; + qudaState.layout.get_gfld_flags(&ud_rev, &ad_rev); + return ud_rev == 0 && ad_rev == 0; +} + +/** + * @brief Check if the current SW field needs to update wrt the parameters from openQCD. + * + * @return true/false + */ +inline bool clover_field_get_up2date(void) +{ + return (gauge_field_get_up2date() && qudaState.swd_ud_rev == qudaState.ud_rev && qudaState.swd_ad_rev == qudaState.ad_rev + && qudaState.swd_kappa == 1.0 / (2.0 * (qudaState.layout.dirac_parms().m0 + 4.0)) + && qudaState.swd_su3csw == qudaState.layout.dirac_parms().su3csw + && qudaState.swd_u1csw == qudaState.layout.dirac_parms().u1csw + && qudaState.swd_qhat == qudaState.layout.dirac_parms().qhat); +} + +/** + * @brief Check whether the multigrid instance associated to the parameter + * struct is up to date with the global gauge field revision, + * parameters are in sync, and clover/gauge fields are up to date. + * + * @param param The parameter struct + * + * @return true/false + */ +inline bool mg_get_up2date(QudaInvertParam *param) +{ + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); + + return (param->preconditioner != nullptr && gauge_field_get_up2date() && clover_field_get_up2date() + && additional_prop->mg_ud_rev == qudaState.ud_rev && additional_prop->mg_ad_rev == qudaState.ad_rev + && additional_prop->mg_kappa == 1.0 / (2.0 * (qudaState.layout.dirac_parms().m0 + 4.0)) + && additional_prop->mg_su3csw == qudaState.layout.dirac_parms().su3csw + && additional_prop->mg_u1csw == qudaState.layout.dirac_parms().u1csw + && additional_prop->mg_qhat == qudaState.layout.dirac_parms().qhat); +} + +/** + * @brief Sets the multigrid instance associated to the parameter struct to + * be in sync with openQxD. + * + * @param param The parameter struct + */ +inline void mg_set_revision(QudaInvertParam *param) +{ + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); + + additional_prop->mg_ud_rev = qudaState.ud_rev; + additional_prop->mg_ad_rev = qudaState.ad_rev; + additional_prop->mg_kappa = 1.0 / (2.0 * (qudaState.layout.dirac_parms().m0 + 4.0)); + additional_prop->mg_su3csw = qudaState.layout.dirac_parms().su3csw; + additional_prop->mg_u1csw = qudaState.layout.dirac_parms().u1csw; + additional_prop->mg_qhat = qudaState.layout.dirac_parms().qhat; +} + +/** + * @brief Set the global revisions numners for the SW field. + */ +inline void clover_field_set_revision(void) +{ + qudaState.swd_ud_rev = qudaState.ud_rev; + qudaState.swd_ad_rev = qudaState.ad_rev; + qudaState.swd_kappa = 1.0 / (2.0 * (qudaState.layout.dirac_parms().m0 + 4.0)); + qudaState.swd_su3csw = qudaState.layout.dirac_parms().su3csw; + qudaState.swd_u1csw = qudaState.layout.dirac_parms().u1csw; + qudaState.swd_qhat = qudaState.layout.dirac_parms().qhat; +} + +/** + * @brief Set the global revisions numners for the gauge field. + */ +inline void gauge_field_set_revision(void) { qudaState.layout.get_gfld_flags(&qudaState.ud_rev, &qudaState.ad_rev); } + +/** + * @brief Check if the solver parameters are in sync with the parameters + * from openQCD. + * + * @param param_ The parameter struct + * + * @return Whether parameters are in sync or not + */ +int openQCD_qudaInvertParamCheck(void *param_) +{ + QudaInvertParam *param = static_cast(param_); + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); + bool ret = true; + + if (param->kappa != (1.0 / (2.0 * (qudaState.layout.dirac_parms().m0 + 4.0)))) { + logQuda( + QUDA_VERBOSE, "Property m0/kappa does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %.6e, QUDA: %.6e)\n", + (1.0 / (2.0 * (qudaState.layout.dirac_parms().m0 + 4.0))), param->kappa); + logQuda(QUDA_VERBOSE, " => need params update\n"); + ret = false; + } + + if (additional_prop->u1csw != qudaState.layout.dirac_parms().u1csw) { + logQuda( + QUDA_VERBOSE, + "Property u1csw does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %.6e, QUDA: %.6e)\n", + qudaState.layout.dirac_parms().u1csw, additional_prop->u1csw); + logQuda(QUDA_VERBOSE, " => need clover field and params update\n"); + ret = false; + } + + if (additional_prop->qhat != qudaState.layout.dirac_parms().qhat) { + logQuda(QUDA_VERBOSE, + "Property qhat does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %d, QUDA: %d)\n", + qudaState.layout.dirac_parms().qhat, additional_prop->qhat); + logQuda(QUDA_VERBOSE, " => need gauge, clover field and params update\n"); + ret = false; + } + + if (param->clover_csw != qudaState.layout.dirac_parms().su3csw) { + logQuda( + QUDA_VERBOSE, "Property su3csw/clover_csw does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %.6e, QUDA: %.6e)\n", + qudaState.layout.dirac_parms().su3csw, param->clover_csw); + logQuda(QUDA_VERBOSE, " => need clover field and params update\n"); + ret = false; + } + + return ret; +} + +/** + * @brief Check if the solver identifier is in bounds + * + * @param[in] id The identifier + */ +void inline check_solver_id(int id) +{ + if (id < -1 || id > OPENQCD_MAX_INVERTERS - 1) { + errorQuda("Solver id %d is out of range [%d, %d).", id, -1, OPENQCD_MAX_INVERTERS); + } +} + +/** + * @brief Check if the eigen-solver identifier is in bounds + * + * @param[in] id The identifier + */ +void inline check_eigensolver_id(int id) +{ + if (id < 0 || id > OPENQCD_MAX_EIGENSOLVERS - 1) { + errorQuda("Eigensolver id %d is out of range [%d, %d).", id, 0, OPENQCD_MAX_EIGENSOLVERS); + } +} + +/** + * @brief Transfer the gauge field if the gauge field was updated in + * openQxD. (Re-)calculate or transfer the clover field if + * parameters have changed or gauge field was updated. Update the + * settings kappa, qhat, su3csw and u1csw in the QudaInvertParam + * struct such that they are in sync with openQxD. Set up or update + * the multigrid instance if set in QudaInvertParam and if gauge- or + * clover-fields or parameters have changed. + * + * @param param_ The parameter struct, where in param->additional_prop a + * pointer to the QudaMultigridParam struct was placed. + */ +void openQCD_qudaSolverUpdate(void *param_) +{ + if (param_ == nullptr) { errorQuda("Solver handle is NULL."); } + + QudaInvertParam *param = static_cast(param_); + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); + + bool do_param_update = !openQCD_qudaInvertParamCheck(param_); + bool do_gauge_transfer = (!gauge_field_get_up2date() && !gauge_field_get_unset()) + || additional_prop->qhat != qudaState.layout.dirac_parms().qhat; + bool do_clover_update = !clover_field_get_up2date() && !gauge_field_get_unset(); + bool do_multigrid_update = param_ != qudaState.dirac_handle && param->inv_type_precondition == QUDA_MG_INVERTER + && !mg_get_up2date(param) && !gauge_field_get_unset(); + bool do_multigrid_fat_update = do_multigrid_update + && (do_gauge_transfer || additional_prop->mg_ud_rev != qudaState.ud_rev + || additional_prop->mg_ad_rev != qudaState.ad_rev); + + if (do_gauge_transfer) { + if (qudaState.layout.h_gauge == nullptr) { errorQuda("qudaState.layout.h_gauge is not set."); } + logQuda(QUDA_VERBOSE, "Loading gauge field from openQCD ...\n"); + void *h_gauge = qudaState.layout.h_gauge(); + PUSH_RANGE("openQCD_qudaGaugeLoad", 3); + QudaReconstructType rec + = qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3 ? QUDA_RECONSTRUCT_8 : QUDA_RECONSTRUCT_9; + + /** + * We set t_boundary = QUDA_ANTI_PERIODIC_T. This setting is a label that + * tells QUDA the current state of the residing gauge field, that is the + * same state as the one we transfer from openqxd. In openqxd the hdfld + * exhibits phases of -1 for the temporal time boundaries, meaning that the + * gauge fields are explicitly multiplied by -1 on the t=0 time slice, see + * chs_hd0() in hflds.c. The QUDA_ANTI_PERIODIC_T flag says that these + * phases are incorporated into the field and that QUDA has to add these + * phases on the t=0 time slice when reconstructing the field from + * QUDA_RECONSTRUCT_8/12, but not from QUDA_RECONSTRUCT_NO. In case of + * QUDA_RECONSTRUCT_NO the value if t_boundary has no effect. + * + * @see https://github.com/lattice/quda/issues/1315 + * @see Reconstruct#Unpack() in gauge_field_order.h + * @see Reconstruct<8,...>#Unpack() in gauge_field_order.h + * @see Reconstruct<12,...>#Unpack() in gauge_field_order.h + */ + openQCD_qudaGaugeLoad(h_gauge, QUDA_DOUBLE_PRECISION, rec, QUDA_ANTI_PERIODIC_T); + gauge_field_set_revision(); + POP_RANGE; + } + + if (do_param_update) { + logQuda(QUDA_VERBOSE, "Syncing kappa, qhat, su3csw, u1csw values from openQCD ...\n"); + param->kappa = 1.0 / (2.0 * (qudaState.layout.dirac_parms().m0 + 4.0)); + additional_prop->u1csw = qudaState.layout.dirac_parms().u1csw; + additional_prop->qhat = qudaState.layout.dirac_parms().qhat; + set_su3csw(param, qudaState.layout.dirac_parms().su3csw); + + QudaInvertParam *mg_inv_param = additional_prop->mg_param->invert_param; + mg_inv_param->kappa = 1.0 / (2.0 * (qudaState.layout.dirac_parms().m0 + 4.0)); + set_su3csw(mg_inv_param, qudaState.layout.dirac_parms().su3csw); + } + + if (do_clover_update) { + if (param->clover_csw == 0.0) { + logQuda(QUDA_VERBOSE, "Deallocating Clover field in QUDA ...\n"); + freeCloverQuda(); + qudaState.swd_ud_rev = 0; + qudaState.swd_ad_rev = 0; + qudaState.swd_kappa = 0.0; + qudaState.swd_su3csw = 0.0; + qudaState.swd_u1csw = 0.0; + qudaState.swd_qhat = 0; + } else { + if (qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3) { + /** + * SU3 case: + * Leaving both h_clover = h_clovinv = NULL allocates the clover field on + * the GPU and finally calls @createCloverQuda to calculate the clover + * field. + */ + logQuda(QUDA_VERBOSE, "Generating Clover field in QUDA ...\n"); + PUSH_RANGE("loadCloverQuda", 3); + loadCloverQuda(NULL, NULL, param); + POP_RANGE; + clover_field_set_revision(); + } else { + /** + * U3 case: Transfer the SW-field from openQCD. + */ + + if (qudaState.layout.h_sw == nullptr) { errorQuda("qudaState.layout.h_sw is not set."); } + + logQuda(QUDA_VERBOSE, "Loading Clover field from openQCD ...\n"); + void *h_sw = qudaState.layout.h_sw(); + PUSH_RANGE("openQCD_qudaCloverLoad", 3); + openQCD_qudaCloverLoad(h_sw, param->kappa, param->clover_csw); + POP_RANGE; + clover_field_set_revision(); + + /*loadCloverQuda(qudaState.layout.h_sw(), NULL, param);*/ + /* TODO: The above line would be prefered over openQCD_qudaCloverLoad, but throws this error, no idea why? + QUDA: ERROR: qudaEventRecord_ returned CUDA_ERROR_ILLEGAL_ADDRESS + (timer.h:82 in start()) + (rank 0, host yoshi, quda_api.cpp:72 in void quda::target::cuda::set_driver_error(CUresult, const char*, const + char*, const char*, const char*, bool)()) QUDA: last kernel called was + (name=N4quda10CopyCloverINS_6clover11FloatNOrderIdLi72ELi2ELb0ELb1ELb0EEENS1_12OpenQCDOrderIdLi72EEEddEE,volume=32x16x16x64,aux=GPU-offline,vol=524288precision=8Nc=3,compute_diagonal)*/ + } + } + } + + /* setup/update the multigrid instance or do nothing */ + if (do_multigrid_update) { + QudaMultigridParam *mg_param = additional_prop->mg_param; + + if (mg_param == nullptr) { errorQuda("No multigrid parameter struct set."); } + + if (do_multigrid_fat_update && param->preconditioner != nullptr) { + logQuda(QUDA_VERBOSE, "Destroying existing multigrid instance ...\n"); + PUSH_RANGE("destroyMultigridQuda", 4); + destroyMultigridQuda(param->preconditioner); + param->preconditioner = nullptr; + POP_RANGE; + + additional_prop->mg_ud_rev = 0; + additional_prop->mg_ad_rev = 0; + additional_prop->mg_kappa = 0.0; + additional_prop->mg_su3csw = 0.0; + additional_prop->mg_u1csw = 0.0; + additional_prop->mg_qhat = 0; + } + + if (param->preconditioner == nullptr) { + logQuda(QUDA_VERBOSE, "Setting up multigrid instance ...\n"); + PUSH_RANGE("newMultigridQuda", 4); + param->preconditioner = newMultigridQuda(mg_param); + POP_RANGE; + mg_set_revision(param); + } else { + logQuda(QUDA_VERBOSE, "Updating existing multigrid instance ...\n"); + PUSH_RANGE("updateMultigridQuda", 4); + updateMultigridQuda(param->preconditioner, mg_param); + POP_RANGE; + mg_set_revision(param); + } + } +} + +void *openQCD_qudaSolverReadIn(int id) +{ + int my_rank; + + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + + /* Allocate on the heap */ + QudaInvertParam *param = new QudaInvertParam(newQudaInvertParam()); + QudaInvertParam *invert_param_mg = new QudaInvertParam(newQudaInvertParam()); + QudaMultigridParam *multigrid_param = new QudaMultigridParam(newQudaMultigridParam()); + std::string section = "Solver " + std::to_string(id); + + /* Some default settings */ + /* Some of them should not be changed */ + param->verbosity = QUDA_SUMMARIZE; + param->cpu_prec = QUDA_DOUBLE_PRECISION; + param->cuda_prec = QUDA_DOUBLE_PRECISION; + param->cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; + param->cuda_prec_sloppy = QUDA_SINGLE_PRECISION; + param->cuda_prec_precondition = QUDA_HALF_PRECISION; + param->dirac_order = QUDA_OPENQCD_DIRAC_ORDER; + param->gamma_basis = QUDA_OPENQCD_GAMMA_BASIS; + param->dslash_type = QUDA_WILSON_DSLASH; + param->kappa = 1.0 / (2.0 * (qudaState.layout.dirac_parms().m0 + 4.0)); + param->mu = 0.0; + param->dagger = QUDA_DAG_NO; + param->solution_type = QUDA_MAT_SOLUTION; + param->solve_type = QUDA_DIRECT_SOLVE; + param->matpc_type = QUDA_MATPC_EVEN_EVEN; + param->solver_normalization = QUDA_DEFAULT_NORMALIZATION; + param->inv_type_precondition = QUDA_INVALID_INVERTER; /* disables any preconditioning */ + param->mass_normalization = QUDA_MASS_NORMALIZATION; + + set_su3csw(param, qudaState.layout.dirac_parms().su3csw); + + if (my_rank == 0 && id != -1) { + + KeyValueStore kv; + kv.set_map(&enum_map); + kv.load(qudaState.infile); + + if (!kv.section_exists(section)) { + errorQuda("Solver section \"%s\" in file %s does not exist.", section.c_str(), qudaState.infile); + } + + param->verbosity = kv.get(section, "verbosity", param->verbosity); + + if (param->verbosity >= QUDA_DEBUG_VERBOSE) { kv.dump(); } + + if (kv.get(section, "solver") != "QUDA") { + errorQuda("Solver section \"%s\" in file %s is not a valid quda-solver section (solver = %s).", section.c_str(), + qudaState.infile, kv.get(section, "solver").c_str()); + } + + /* both fields reside on the CPU */ + param->input_location = kv.get(section, "input_location", QUDA_CPU_FIELD_LOCATION); + param->output_location = kv.get(section, "output_location", QUDA_CPU_FIELD_LOCATION); + + param->inv_type = kv.get(section, "inv_type", param->inv_type); + param->kappa = kv.get(section, "kappa", param->kappa); + param->mu = kv.get(section, "mu", param->mu); + param->tm_rho = kv.get(section, "tm_rho", param->tm_rho); + param->epsilon = kv.get(section, "epsilon", param->epsilon); + param->twist_flavor = kv.get(section, "twist_flavor", param->twist_flavor); + param->laplace3D = kv.get(section, "laplace3D", param->laplace3D); + + /* Solver settings */ + param->tol = kv.get(section, "tol", param->tol); + param->tol_restart = kv.get(section, "tol_restart", param->tol_restart); + param->tol_hq = kv.get(section, "tol_hq", param->tol_hq); + + param->compute_true_res = kv.get(section, "compute_true_res", param->compute_true_res); + param->true_res[0] = kv.get(section, "true_res", param->true_res[0]); /* TODO: mrhs */ + param->true_res_hq[0] = kv.get(section, "true_res_hq", param->true_res_hq[0]); + param->maxiter = kv.get(section, "maxiter", param->maxiter); + param->reliable_delta = kv.get(section, "reliable_delta", param->reliable_delta); + param->reliable_delta_refinement + = kv.get(section, "reliable_delta_refinement", param->reliable_delta_refinement); + param->use_alternative_reliable = kv.get(section, "use_alternative_reliable", param->use_alternative_reliable); + param->use_sloppy_partial_accumulator + = kv.get(section, "use_sloppy_partial_accumulator", param->use_sloppy_partial_accumulator); + + param->solution_accumulator_pipeline + = kv.get(section, "solution_accumulator_pipeline", param->solution_accumulator_pipeline); + + param->max_res_increase = kv.get(section, "max_res_increase", param->max_res_increase); + param->max_res_increase_total = kv.get(section, "max_res_increase_total", param->max_res_increase_total); + param->max_hq_res_increase = kv.get(section, "max_hq_res_increase", param->max_hq_res_increase); + param->max_hq_res_restart_total = kv.get(section, "max_hq_res_restart_total", param->max_hq_res_restart_total); + + param->heavy_quark_check = kv.get(section, "heavy_quark_check", param->heavy_quark_check); + + param->pipeline = kv.get(section, "pipeline", param->pipeline); + param->num_offset = kv.get(section, "num_offset", param->num_offset); + param->num_src = kv.get(section, "num_src", param->num_src); + param->num_src_per_sub_partition + = kv.get(section, "num_src_per_sub_partition", param->num_src_per_sub_partition); + + param->split_grid[0] = kv.get(section, "split_grid[1]", param->split_grid[0]); + param->split_grid[1] = kv.get(section, "split_grid[2]", param->split_grid[1]); + param->split_grid[2] = kv.get(section, "split_grid[3]", param->split_grid[2]); + param->split_grid[3] = kv.get(section, "split_grid[0]", param->split_grid[3]); + + param->overlap = kv.get(section, "overlap", param->overlap); + + for (int i = 0; i < param->num_offset; i++) { + std::string sub_key = "offset[" + std::to_string(i) + "]"; + param->offset[i] = kv.get(section, sub_key, param->offset[i]); + sub_key = "tol_offset[" + std::to_string(i) + "]"; + param->tol_offset[i] = kv.get(section, sub_key, param->tol_offset[i]); + sub_key = "tol_hq_offset[" + std::to_string(i) + "]"; + param->tol_hq_offset[i] = kv.get(section, sub_key, param->tol_hq_offset[i]); + } + + param->compute_action = kv.get(section, "compute_action", param->compute_action); + + param->solution_type = kv.get(section, "solution_type", param->solution_type); + param->solve_type = kv.get(section, "solve_type", param->solve_type); + param->matpc_type = kv.get(section, "matpc_type", param->matpc_type); + param->dagger = kv.get(section, "dagger", param->dagger); + param->mass_normalization = kv.get(section, "mass_normalization", param->mass_normalization); + param->solver_normalization + = kv.get(section, "solver_normalization", param->solver_normalization); + + param->preserve_source = kv.get(section, "preserve_source", param->preserve_source); + + param->cpu_prec = kv.get(section, "cpu_prec", param->cpu_prec); + param->cuda_prec = kv.get(section, "cuda_prec", param->cuda_prec); + param->cuda_prec_sloppy = kv.get(section, "cuda_prec_sloppy", param->cuda_prec_sloppy); + param->cuda_prec_refinement_sloppy + = kv.get(section, "cuda_prec_refinement_sloppy", param->cuda_prec_refinement_sloppy); + param->cuda_prec_precondition + = kv.get(section, "cuda_prec_precondition", param->cuda_prec_precondition); + param->cuda_prec_eigensolver = kv.get(section, "cuda_prec_eigensolver", param->cuda_prec_eigensolver); + + param->clover_location = kv.get(section, "clover_location", param->clover_location); + param->clover_cpu_prec = kv.get(section, "clover_cpu_prec", param->clover_cpu_prec); + param->clover_cuda_prec = kv.get(section, "clover_cuda_prec", param->clover_cuda_prec); + param->clover_cuda_prec_sloppy + = kv.get(section, "clover_cuda_prec_sloppy", param->clover_cuda_prec_sloppy); + param->clover_cuda_prec_refinement_sloppy + = kv.get(section, "clover_cuda_prec_refinement_sloppy", param->clover_cuda_prec_refinement_sloppy); + param->clover_cuda_prec_precondition + = kv.get(section, "clover_cuda_prec_precondition", param->clover_cuda_prec_precondition); + param->clover_cuda_prec_eigensolver + = kv.get(section, "clover_cuda_prec_eigensolver", param->clover_cuda_prec_eigensolver); + + param->use_init_guess = kv.get(section, "use_init_guess", param->use_init_guess); + + param->clover_csw = kv.get(section, "clover_csw", param->clover_csw); + param->clover_coeff = kv.get(section, "clover_coeff", param->clover_coeff); + param->clover_rho = kv.get(section, "clover_rho", param->clover_rho); + param->compute_clover_trlog = kv.get(section, "compute_clover_trlog", param->compute_clover_trlog); + param->Nsteps = kv.get(section, "Nsteps", param->Nsteps); + param->gcrNkrylov = kv.get(section, "gcrNkrylov", param->gcrNkrylov); + + param->inv_type_precondition + = kv.get(section, "inv_type_precondition", param->inv_type_precondition); + param->deflate = kv.get(section, "deflate", param->deflate); + param->verbosity_precondition + = kv.get(section, "verbosity_precondition", param->verbosity_precondition); + param->tol_precondition = kv.get(section, "tol_precondition", param->tol_precondition); + param->maxiter_precondition = kv.get(section, "maxiter_precondition", param->maxiter_precondition); + param->omega = kv.get(section, "omega", param->omega); + param->ca_basis = kv.get(section, "ca_basis", param->ca_basis); + param->ca_lambda_min = kv.get(section, "ca_lambda_min", param->ca_lambda_min); + param->ca_lambda_max = kv.get(section, "ca_lambda_max", param->ca_lambda_max); + param->ca_basis_precondition = kv.get(section, "ca_basis_precondition", param->ca_basis_precondition); + param->ca_lambda_min_precondition + = kv.get(section, "ca_lambda_min_precondition", param->ca_lambda_min_precondition); + param->ca_lambda_max_precondition + = kv.get(section, "ca_lambda_max_precondition", param->ca_lambda_max_precondition); + param->precondition_cycle = kv.get(section, "precondition_cycle", param->precondition_cycle); + param->schwarz_type = kv.get(section, "schwarz_type", param->schwarz_type); + param->accelerator_type_precondition + = kv.get(section, "accelerator_type_precondition", param->accelerator_type_precondition); + + param->madwf_diagonal_suppressor + = kv.get(section, "madwf_diagonal_suppressor", param->madwf_diagonal_suppressor); + param->madwf_ls = kv.get(section, "madwf_ls", param->madwf_ls); + param->madwf_null_miniter = kv.get(section, "madwf_null_miniter", param->madwf_null_miniter); + param->madwf_null_tol = kv.get(section, "madwf_null_tol", param->madwf_null_tol); + param->madwf_train_maxiter = kv.get(section, "madwf_train_maxiter", param->madwf_train_maxiter); + + param->madwf_param_load = kv.get(section, "madwf_param_load", param->madwf_param_load); + param->madwf_param_save = kv.get(section, "madwf_param_save", param->madwf_param_save); + strcpy(param->madwf_param_infile, + kv.get(section, "madwf_param_infile", param->madwf_param_infile).c_str()); + strcpy(param->madwf_param_outfile, + kv.get(section, "madwf_param_outfile", param->madwf_param_outfile).c_str()); + + param->residual_type = kv.get(section, "residual_type", param->residual_type); + + if (param->inv_type_precondition == QUDA_MG_INVERTER) { + + std::string mg_section = section + " Multigrid"; + + if (!kv.section_exists(mg_section)) { + errorQuda("Solver section \"%s\" in file %s does not exist.", mg_section.c_str(), qudaState.infile); + } + + /* (shallow) copy the struct */ + *invert_param_mg = *param; + + /* these have to be fixed, and cannot be overwritten by the input file */ + invert_param_mg->gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + invert_param_mg->dirac_order = QUDA_DIRAC_ORDER; + + multigrid_param->n_level = kv.get(mg_section, "n_level", multigrid_param->n_level, true); + multigrid_param->setup_type = kv.get(mg_section, "setup_type", multigrid_param->setup_type); + multigrid_param->pre_orthonormalize + = kv.get(mg_section, "pre_orthonormalize", multigrid_param->pre_orthonormalize); + multigrid_param->post_orthonormalize + = kv.get(mg_section, "post_orthonormalize", multigrid_param->post_orthonormalize); + multigrid_param->compute_null_vector + = kv.get(mg_section, "compute_null_vector", multigrid_param->compute_null_vector); + multigrid_param->generate_all_levels + = kv.get(mg_section, "generate_all_levels", multigrid_param->generate_all_levels); + multigrid_param->run_verify = kv.get(mg_section, "run_verify", multigrid_param->run_verify); + multigrid_param->run_low_mode_check + = kv.get(mg_section, "run_low_mode_check", multigrid_param->run_low_mode_check); + multigrid_param->run_oblique_proj_check + = kv.get(mg_section, "run_oblique_proj_check", multigrid_param->run_oblique_proj_check); + multigrid_param->coarse_guess = kv.get(mg_section, "coarse_guess", multigrid_param->coarse_guess); + multigrid_param->preserve_deflation + = kv.get(mg_section, "preserve_deflation", multigrid_param->preserve_deflation); + multigrid_param->allow_truncation + = kv.get(mg_section, "allow_truncation", multigrid_param->allow_truncation); + multigrid_param->staggered_kd_dagger_approximation = kv.get( + mg_section, "staggered_kd_dagger_approximation", multigrid_param->staggered_kd_dagger_approximation); + multigrid_param->thin_update_only + = kv.get(mg_section, "thin_update_only", multigrid_param->thin_update_only); + + for (int i = 0; i < multigrid_param->n_level; i++) { + std::string subsection = section + " Multigrid Level " + std::to_string(i); + + if (!kv.section_exists(subsection)) { + errorQuda("Solver section \"%s\" in file %s does not exist.", subsection.c_str(), qudaState.infile); + } + + multigrid_param->geo_block_size[i][0] + = kv.get(subsection, "geo_block_size[1]", multigrid_param->geo_block_size[i][0]); + multigrid_param->geo_block_size[i][1] + = kv.get(subsection, "geo_block_size[2]", multigrid_param->geo_block_size[i][1]); + multigrid_param->geo_block_size[i][2] + = kv.get(subsection, "geo_block_size[3]", multigrid_param->geo_block_size[i][2]); + multigrid_param->geo_block_size[i][3] + = kv.get(subsection, "geo_block_size[0]", multigrid_param->geo_block_size[i][3]); + multigrid_param->spin_block_size[i] + = kv.get(subsection, "spin_block_size", multigrid_param->spin_block_size[i]); + multigrid_param->n_vec[i] = kv.get(subsection, "n_vec", multigrid_param->n_vec[i]); + multigrid_param->n_vec_batch[i] = kv.get(subsection, "n_vec_batch", 1); + multigrid_param->precision_null[i] + = kv.get(subsection, "precision_null", multigrid_param->precision_null[i]); + multigrid_param->n_block_ortho[i] = kv.get(subsection, "n_block_ortho", multigrid_param->n_block_ortho[i]); + multigrid_param->block_ortho_two_pass[i] + = kv.get(subsection, "block_ortho_two_pass", multigrid_param->block_ortho_two_pass[i]); + multigrid_param->verbosity[i] = kv.get(subsection, "verbosity", multigrid_param->verbosity[i]); + multigrid_param->setup_inv_type[i] + = kv.get(subsection, "setup_inv_type", multigrid_param->setup_inv_type[i]); + multigrid_param->setup_use_mma[i] + = kv.get(subsection, "setup_use_mma", multigrid_param->setup_use_mma[i]); + multigrid_param->dslash_use_mma[i] + = kv.get(subsection, "dslash_use_mma", multigrid_param->dslash_use_mma[i]); + multigrid_param->num_setup_iter[i] + = kv.get(subsection, "num_setup_iter", multigrid_param->num_setup_iter[i]); + multigrid_param->setup_tol[i] = kv.get(subsection, "setup_tol", multigrid_param->setup_tol[i]); + multigrid_param->setup_maxiter[i] = kv.get(subsection, "setup_maxiter", multigrid_param->setup_maxiter[i]); + multigrid_param->setup_maxiter_refresh[i] + = kv.get(subsection, "setup_maxiter_refresh", multigrid_param->setup_maxiter_refresh[i]); + multigrid_param->setup_ca_basis[i] + = kv.get(subsection, "setup_ca_basis", multigrid_param->setup_ca_basis[i]); + multigrid_param->setup_ca_basis_size[i] + = kv.get(subsection, "setup_ca_basis_size", multigrid_param->setup_ca_basis_size[i]); + multigrid_param->setup_ca_lambda_min[i] + = kv.get(subsection, "setup_ca_lambda_min", multigrid_param->setup_ca_lambda_min[i]); + multigrid_param->setup_ca_lambda_max[i] + = kv.get(subsection, "setup_ca_lambda_max", multigrid_param->setup_ca_lambda_max[i]); + + multigrid_param->coarse_solver[i] + = kv.get(subsection, "coarse_solver", multigrid_param->coarse_solver[i]); + multigrid_param->coarse_solver_tol[i] + = kv.get(subsection, "coarse_solver_tol", multigrid_param->coarse_solver_tol[i]); + multigrid_param->coarse_solver_maxiter[i] + = kv.get(subsection, "coarse_solver_maxiter", multigrid_param->coarse_solver_maxiter[i]); + multigrid_param->coarse_solver_ca_basis[i] + = kv.get(subsection, "coarse_solver_ca_basis", multigrid_param->coarse_solver_ca_basis[i]); + multigrid_param->coarse_solver_ca_basis_size[i] + = kv.get(subsection, "coarse_solver_ca_basis_size", multigrid_param->coarse_solver_ca_basis_size[i]); + multigrid_param->coarse_solver_ca_lambda_min[i] + = kv.get(subsection, "coarse_solver_ca_lambda_min", multigrid_param->coarse_solver_ca_lambda_min[i]); + multigrid_param->coarse_solver_ca_lambda_max[i] + = kv.get(subsection, "coarse_solver_ca_lambda_max", multigrid_param->coarse_solver_ca_lambda_max[i]); + multigrid_param->smoother[i] = kv.get(subsection, "smoother", multigrid_param->smoother[i]); + multigrid_param->smoother_tol[i] = kv.get(subsection, "smoother_tol", multigrid_param->smoother_tol[i]); + multigrid_param->nu_pre[i] = kv.get(subsection, "nu_pre", multigrid_param->nu_pre[i]); + multigrid_param->nu_post[i] = kv.get(subsection, "nu_post", multigrid_param->nu_post[i]); + multigrid_param->smoother_solver_ca_basis[i] + = kv.get(subsection, "smoother_solver_ca_basis", multigrid_param->smoother_solver_ca_basis[i]); + multigrid_param->smoother_solver_ca_lambda_min[i] = kv.get( + subsection, "smoother_solver_ca_lambda_min", multigrid_param->smoother_solver_ca_lambda_min[i]); + multigrid_param->smoother_solver_ca_lambda_max[i] = kv.get( + subsection, "smoother_solver_ca_lambda_max", multigrid_param->smoother_solver_ca_lambda_max[i]); + multigrid_param->omega[i] = kv.get(subsection, "omega", multigrid_param->omega[i]); + multigrid_param->smoother_halo_precision[i] + = kv.get(subsection, "smoother_halo_precision", multigrid_param->smoother_halo_precision[i]); + multigrid_param->smoother_schwarz_type[i] + = kv.get(subsection, "smoother_schwarz_type", multigrid_param->smoother_schwarz_type[i]); + multigrid_param->smoother_schwarz_cycle[i] + = kv.get(subsection, "smoother_schwarz_cycle", multigrid_param->smoother_schwarz_cycle[i]); + multigrid_param->coarse_grid_solution_type[i] = kv.get( + subsection, "coarse_grid_solution_type", multigrid_param->coarse_grid_solution_type[i]); + multigrid_param->smoother_solve_type[i] + = kv.get(subsection, "smoother_solve_type", multigrid_param->smoother_solve_type[i]); + multigrid_param->cycle_type[i] + = kv.get(subsection, "cycle_type", multigrid_param->cycle_type[i]); + multigrid_param->global_reduction[i] + = kv.get(subsection, "global_reduction", multigrid_param->global_reduction[i]); + multigrid_param->location[i] = kv.get(subsection, "location", multigrid_param->location[i]); + multigrid_param->setup_location[i] + = kv.get(subsection, "setup_location", multigrid_param->setup_location[i]); + multigrid_param->use_eig_solver[i] + = kv.get(subsection, "use_eig_solver", multigrid_param->use_eig_solver[i]); + + multigrid_param->vec_load[i] = kv.get(subsection, "vec_load", multigrid_param->vec_load[i]); + multigrid_param->vec_store[i] = kv.get(subsection, "vec_store", multigrid_param->vec_store[i]); + /*strcpy(multigrid_param->vec_infile[i], kv.get(subsection, "vec_infile", + multigrid_param->vec_infile[i]).c_str()); strcpy(multigrid_param->vec_outfile[i], + kv.get(subsection, "vec_outfile", multigrid_param->vec_outfile[i]).c_str());*/ + + multigrid_param->mu_factor[i] = kv.get(subsection, "mu_factor", multigrid_param->mu_factor[i]); + multigrid_param->transfer_type[i] + = kv.get(subsection, "transfer_type", multigrid_param->transfer_type[i]); + } + } + } + + /* transfer of the struct to all processes */ + MPI_Bcast((void *)param, sizeof(*param), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Bcast((void *)invert_param_mg, sizeof(*invert_param_mg), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Bcast((void *)multigrid_param, sizeof(*multigrid_param), MPI_BYTE, 0, MPI_COMM_WORLD); + multigrid_param->invert_param = invert_param_mg; + + /** + * We need a void* to store the multigrid_param (QudaMultigridParam) struct, + * such that we can access it and setup multigrid in a later stage, for + * instance right before calling invertQuda() if multigrid was not + * instantiated until then. + */ + openQCD_QudaSolver *additional_prop = new openQCD_QudaSolver(); + sprintf(additional_prop->infile, "%s", qudaState.infile); + additional_prop->id = id; + additional_prop->mg_param = multigrid_param; + additional_prop->u1csw = 0.0; + additional_prop->qhat = 0.0; + param->additional_prop = reinterpret_cast(additional_prop); + + return (void *)param; +} + +void *openQCD_qudaSolverGetHandle(int id) +{ + check_solver_id(id); + if (qudaState.inv_handles[id] == nullptr) { + if (id != -1) { + logQuda(QUDA_VERBOSE, "Read in solver parameters from file %s for solver (id=%d)\n", qudaState.infile, id); + } + qudaState.inv_handles[id] = openQCD_qudaSolverReadIn(id); + } + + openQCD_qudaSolverUpdate(qudaState.inv_handles[id]); + return qudaState.inv_handles[id]; +} + +void openQCD_qudaDw_deprecated(void *src, void *dst, openQCD_QudaDiracParam_t p) +{ + QudaInvertParam param = newOpenQCDDiracParam(p); + + /* both fields reside on the CPU */ + param.input_location = QUDA_CPU_FIELD_LOCATION; + param.output_location = QUDA_CPU_FIELD_LOCATION; + + MatQuda(static_cast(dst), static_cast(src), ¶m); + + logQuda(QUDA_DEBUG_VERBOSE, "MatQuda()\n"); + logQuda(QUDA_DEBUG_VERBOSE, " gflops = %.2e\n", param.gflops); + logQuda(QUDA_DEBUG_VERBOSE, " secs = %.2e\n", param.secs); +} + +void openQCD_qudaDw(double mu, void *in, void *out) +{ + if (gauge_field_get_unset()) { errorQuda("Gauge field not populated in openQxD."); } + + QudaInvertParam *param = static_cast(openQCD_qudaSolverGetHandle(-1)); + param->mu = mu; + + if (!openQCD_qudaInvertParamCheck(param)) { + errorQuda("QudaInvertParam struct check failed, parameters/fields between openQxD and QUDA are not in sync."); + } + + /* both fields reside on the CPU */ + param->input_location = QUDA_CPU_FIELD_LOCATION; + param->output_location = QUDA_CPU_FIELD_LOCATION; + + MatQuda(static_cast(out), static_cast(in), param); +} + +void openQCD_qudaDw_NoLoads(double mu, void *d_in, void *d_out) +{ + if (gauge_field_get_unset()) { errorQuda("Gauge field not populated in openQxD."); } + + QudaInvertParam *inv_param = static_cast(openQCD_qudaSolverGetHandle(-1)); + inv_param->mu = mu; + + if (!openQCD_qudaInvertParamCheck(inv_param)) { + errorQuda("QudaInvertParam struct check failed, parameters/fields between openQxD and QUDA are not in sync."); + } + + /* both fields reside on the GPU */ + inv_param->input_location = QUDA_CUDA_FIELD_LOCATION; + inv_param->output_location = QUDA_CUDA_FIELD_LOCATION; + + ColorSpinorField *in = reinterpret_cast(d_in); + ColorSpinorField *out = reinterpret_cast(d_out); + + /* truncated version of what MatQuda does */ + pushVerbosity(inv_param->verbosity); + + bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION || inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); + + DiracParam diracParam; + setDiracParam(diracParam, inv_param, pc); + + Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator + dirac->M(*out, *in); // apply the operator + delete dirac; // clean up + + if (pc) { + if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) { + blas::ax(0.25 / (inv_param->kappa * inv_param->kappa), *out); + } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { + blas::ax(0.5 / inv_param->kappa, *out); + } + } else { + if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION + || inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { + blas::ax(0.5 / inv_param->kappa, *out); + } + } + + popVerbosity(); +} + +/** + * @brief Take the string-hash over a struct using std::hash. + * + * @param[in] in Input struct + * + * @tparam T Type of input struct + * + * @return Hash value + */ +template int hash_struct(T *in) +{ + int hash = 0; + char *cstruct = reinterpret_cast(in); + + for (char *c = cstruct; c < cstruct + sizeof(T); c += strlen(c) + 1) { + if (strlen(c) != 0) { hash ^= (std::hash {}(std::string(c)) << 1); } + } + + return hash; +} + +int openQCD_qudaSolverGetHash(int id) +{ + check_solver_id(id); + if (qudaState.inv_handles[id] != nullptr) { + QudaInvertParam *param = reinterpret_cast(qudaState.inv_handles[id]); + QudaInvertParam hparam = newQudaInvertParam(); + memset(&hparam, '\0', sizeof(QudaInvertParam)); /* set everything to zero */ + + /* Set some properties we want to take the hash over */ + hparam.inv_type = param->inv_type; + hparam.tol = param->tol; + hparam.tol_restart = param->tol_restart; + hparam.tol_hq = param->tol_hq; + hparam.maxiter = param->maxiter; + hparam.reliable_delta = param->reliable_delta; + hparam.solution_type = param->solution_type; + hparam.solve_type = param->solve_type; + hparam.matpc_type = param->matpc_type; + hparam.dagger = param->dagger; + hparam.mass_normalization = param->mass_normalization; + hparam.solver_normalization = param->solver_normalization; + hparam.cpu_prec = param->cpu_prec; + hparam.cuda_prec = param->cuda_prec; + hparam.use_init_guess = param->use_init_guess; + hparam.gcrNkrylov = param->gcrNkrylov; + + return hash_struct(&hparam); + } else { + return 0; + } +} + +void openQCD_qudaSolverPrintSetup(int id) +{ + check_solver_id(id); + if (qudaState.inv_handles[id] != nullptr) { + QudaInvertParam *param = static_cast(qudaState.inv_handles[id]); + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); + + printQudaInvertParam(param); + printfQuda("additional_prop->infile = %s\n", additional_prop->infile); + printfQuda("additional_prop->id = %d\n", additional_prop->id); + printfQuda("additional_prop->mg_param = %p\n", additional_prop->mg_param); + printfQuda("additional_prop->u1csw = %.6e\n", additional_prop->u1csw); + printfQuda("additional_prop->qhat = %d\n", additional_prop->qhat); + printfQuda("additional_prop->mg_ud_rev = %d\n", additional_prop->mg_ud_rev); + printfQuda("additional_prop->mg_ad_rev = %d\n", additional_prop->mg_ad_rev); + printfQuda("additional_prop->mg_kappa = %.6e\n", additional_prop->mg_kappa); + printfQuda("additional_prop->mg_su3csw = %.6e\n", additional_prop->mg_su3csw); + printfQuda("additional_prop->mg_u1csw = %.6e\n", additional_prop->mg_u1csw); + printfQuda("additional_prop->mg_qhat = %d\n", additional_prop->mg_qhat); + printfQuda("handle = %p\n", param); + printfQuda("hash = %d\n", openQCD_qudaSolverGetHash(id)); + + printfQuda("inv_type_precondition = %d\n", param->inv_type_precondition); + + if (param->inv_type_precondition == QUDA_MG_INVERTER) { printQudaMultigridParam(additional_prop->mg_param); } + } else { + printfQuda("\n"); + } +} + +double openQCD_qudaInvert(int id, double mu, void *source, void *solution, int *status) +{ + if (gauge_field_get_unset()) { errorQuda("Gauge field not populated in openQxD."); } + + /** + * This is to make sure we behave in the same way as openQCDs solvers, we call + * h_sw() which in turn calls sw_term(). We have to make sure that the SW-term + * in openQxD is setup and in sync with QUDAs. + */ + if (qudaState.layout.h_sw != nullptr) { + qudaState.layout.h_sw(); + } else { + errorQuda("qudaState.layout.h_sw is not set."); + } + + QudaInvertParam *param = static_cast(openQCD_qudaSolverGetHandle(id)); + param->mu = mu; + + if (!openQCD_qudaInvertParamCheck(param)) { + errorQuda("Solver check failed, parameters/fields between openQxD and QUDA are not in sync."); + } + + logQuda(QUDA_VERBOSE, "Calling invertQuda() ...\n"); + PUSH_RANGE("invertQuda", 5); + invertQuda(static_cast(solution), static_cast(source), param); + POP_RANGE; + + *status = param->true_res[0] <= param->tol ? param->iter : -1; + + logQuda(QUDA_VERBOSE, "openQCD_qudaInvert()\n"); + logQuda(QUDA_VERBOSE, " true_res = %.2e\n", param->true_res[0]); + logQuda(QUDA_VERBOSE, " true_res_hq = %.2e\n", param->true_res_hq[0]); + logQuda(QUDA_VERBOSE, " iter = %d\n", param->iter); + logQuda(QUDA_VERBOSE, " gflops = %.2e\n", param->gflops); + logQuda(QUDA_VERBOSE, " secs = %.2e\n", param->secs); + logQuda(QUDA_VERBOSE, " status = %d\n", *status); + + return param->true_res[0]; +} + +void openQCD_qudaSolverDestroy(int id) +{ + check_solver_id(id); + if (qudaState.inv_handles[id] != nullptr) { + QudaInvertParam *param = static_cast(qudaState.inv_handles[id]); + + if (param->inv_type_precondition == QUDA_MG_INVERTER) { destroyMultigridQuda(param->preconditioner); } + + delete static_cast(param->additional_prop)->mg_param; + delete static_cast(param->additional_prop); + delete param; + qudaState.inv_handles[id] = nullptr; + } +} + +void *openQCD_qudaEigensolverReadIn(int id, int solver_id) +{ + int my_rank; + QudaEigParam *param; + + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + QudaVerbosity verbosity = QUDA_SUMMARIZE; + + /* Allocate on the heap */ + if (qudaState.eig_handles[id] == nullptr) { + param = new QudaEigParam(newQudaEigParam()); + } else { + param = static_cast(qudaState.eig_handles[id]); + } + + if (my_rank == 0) { + + KeyValueStore kv; + kv.set_map(&enum_map); + kv.load(qudaState.infile); + + std::string section = "Eigensolver " + std::to_string(id); + + verbosity = kv.get(section, "verbosity", verbosity); + + if (verbosity >= QUDA_DEBUG_VERBOSE) { kv.dump(); } + + if (kv.get(section, "solver") != "QUDA") { + errorQuda("Eigensolver section \"%s\" in file %s is not a valid quda-eigensolver section (solver = %s)\n", + section.c_str(), qudaState.infile, kv.get(section, "solver").c_str()); + } + + param->eig_type = kv.get(section, "eig_type", param->eig_type); + param->use_poly_acc = kv.get(section, "use_poly_acc", param->use_poly_acc); + param->poly_deg = kv.get(section, "poly_deg", param->poly_deg); + param->a_min = kv.get(section, "a_min", param->a_min); + param->a_max = kv.get(section, "a_max", param->a_max); + param->preserve_deflation = kv.get(section, "preserve_deflation", param->preserve_deflation); + /*param->*preserve_deflation_space = kv.get(section, *"*preserve_deflation_space", param->preserve_deflation_space);*/ + param->preserve_evals = kv.get(section, "preserve_evals", param->preserve_evals); + param->use_dagger = kv.get(section, "use_dagger", param->use_dagger); + param->use_norm_op = kv.get(section, "use_norm_op", param->use_norm_op); + param->use_pc = kv.get(section, "use_pc", param->use_pc); + param->use_eigen_qr = kv.get(section, "use_eigen_qr", param->use_eigen_qr); + param->compute_svd = kv.get(section, "compute_svd", param->compute_svd); + param->compute_gamma5 = kv.get(section, "compute_gamma5", param->compute_gamma5); + param->require_convergence = kv.get(section, "require_convergence", param->require_convergence); + param->spectrum = kv.get(section, "spectrum", param->spectrum); + param->n_ev = kv.get(section, "n_ev", param->n_ev); + param->n_kr = kv.get(section, "n_kr", param->n_kr); + param->n_conv = kv.get(section, "n_conv", param->n_conv); + param->n_ev_deflate = kv.get(section, "n_ev_deflate", param->n_ev_deflate); + param->tol = kv.get(section, "tol", param->tol); + param->qr_tol = kv.get(section, "qr_tol", param->qr_tol); + param->check_interval = kv.get(section, "check_interval", param->check_interval); + param->max_restarts = kv.get(section, "max_restarts", param->max_restarts); + param->batched_rotate = kv.get(section, "batched_rotate", param->batched_rotate); + param->block_size = kv.get(section, "block_size", param->block_size); + param->arpack_check = kv.get(section, "arpack_check", param->arpack_check); + strcpy(param->QUDA_logfile, kv.get(section, "QUDA_logfile", param->QUDA_logfile).c_str()); + strcpy(param->arpack_logfile, kv.get(section, "arpack_logfile", param->arpack_logfile).c_str()); + + param->nk = kv.get(section, "nk", param->nk); + param->np = kv.get(section, "np", param->np); + param->import_vectors = kv.get(section, "import_vectors", param->import_vectors); + param->cuda_prec_ritz = kv.get(section, "cuda_prec_ritz", param->cuda_prec_ritz); + param->mem_type_ritz = kv.get(section, "mem_type_ritz", param->mem_type_ritz); + param->location = kv.get(section, "location", param->location); + param->run_verify = kv.get(section, "run_verify", param->run_verify); + /*strcpy(param->vec_infile, kv.get(section, "vec_infile", param->vec_infile).c_str());*/ + /*strcpy(param->vec_outfile, kv.get(section, "vec_outfile", param->vec_outfile).c_str());*/ + param->vec_outfile[0] = '\0'; + param->vec_infile[0] = '\0'; + param->save_prec = kv.get(section, "save_prec", param->save_prec); + param->io_parity_inflate = kv.get(section, "io_parity_inflate", param->io_parity_inflate); + param->extlib_type = kv.get(section, "extlib_type", param->extlib_type); + } + + /* transfer of the struct to all the processes */ + MPI_Bcast((void *)param, sizeof(*param), MPI_BYTE, 0, MPI_COMM_WORLD); + + void *inv_param = openQCD_qudaSolverGetHandle(solver_id); + param->invert_param = static_cast(inv_param); + + param->invert_param->verbosity = std::max(param->invert_param->verbosity, verbosity); + + if (solver_id != -1 && param->invert_param->verbosity >= QUDA_DEBUG_VERBOSE) { + printQudaInvertParam(param->invert_param); + } + + if (param->invert_param->verbosity >= QUDA_DEBUG_VERBOSE) { printQudaEigParam(param); } + + return (void *)param; +} + +void *openQCD_qudaEigensolverGetHandle(int id, int solver_id) +{ + check_eigensolver_id(id); + check_solver_id(solver_id); + + if (qudaState.eig_handles[id] == nullptr) { + logQuda(QUDA_VERBOSE, "Read in eigensolver parameters from file %s for eigensolver (id=%d)\n", qudaState.infile, id); + qudaState.eig_handles[id] = openQCD_qudaEigensolverReadIn(id, solver_id); + } + + openQCD_qudaSolverUpdate(static_cast(qudaState.eig_handles[id])->invert_param); + return qudaState.eig_handles[id]; +} + +void openQCD_qudaEigensolverPrintSetup(int id, int solver_id) +{ + check_eigensolver_id(id); + check_solver_id(solver_id); + + if (qudaState.eig_handles[id] != nullptr) { + QudaEigParam *param = static_cast(qudaState.eig_handles[id]); + printQudaEigParam(param); + printfQuda("\n"); + openQCD_qudaSolverPrintSetup(solver_id); + } else { + printfQuda("\n"); + } +} + +void openQCD_qudaEigensolve(int id, int solver_id, void **h_evecs, void *h_evals) +{ + if (gauge_field_get_unset()) { errorQuda("Gauge field not populated in openQxD."); } + + if (qudaState.layout.h_sw != nullptr) { + qudaState.layout.h_sw(); + } else { + errorQuda("qudaState.layout.h_sw is not set."); + } + + QudaEigParam *eig_param = static_cast(openQCD_qudaEigensolverGetHandle(id, solver_id)); + + if (!openQCD_qudaInvertParamCheck(eig_param->invert_param)) { + errorQuda("Solver check failed, parameters/fields between openQxD and QUDA are not in sync."); + } + + logQuda(QUDA_VERBOSE, "Calling eigensolveQuda() ...\n"); + PUSH_RANGE("eigensolveQuda", 6); + eigensolveQuda(h_evecs, static_cast(h_evals), eig_param); + POP_RANGE; + + logQuda(QUDA_SUMMARIZE, "openQCD_qudaEigensolve()\n"); + logQuda(QUDA_SUMMARIZE, " gflops = %.2e\n", eig_param->invert_param->gflops); + logQuda(QUDA_SUMMARIZE, " secs = %.2e\n", eig_param->invert_param->secs); + logQuda(QUDA_SUMMARIZE, " iter = %d\n", eig_param->invert_param->iter); +} + +void openQCD_qudaEigensolverDestroy(int id) +{ + check_eigensolver_id(id); + + if (qudaState.eig_handles[id] != nullptr) { + QudaEigParam *eig_param = static_cast(qudaState.eig_handles[id]); + + delete eig_param; + qudaState.eig_handles[id] = nullptr; + } +} diff --git a/lib/quda_fortran.F90 b/lib/quda_fortran.F90 index fc7666dd60..c2005fc464 100644 --- a/lib/quda_fortran.F90 +++ b/lib/quda_fortran.F90 @@ -367,6 +367,9 @@ module quda_fortran ! The t0 parameter for distance preconditioning, the timeslice where the source is located integer(4) distance_pc_t0 + ! Additional user-defined properties + integer(8) :: additional_prop + end type quda_invert_param end module quda_fortran diff --git a/lib/targets/cuda/comm_target.cpp b/lib/targets/cuda/comm_target.cpp index bd9fa1fd3e..89bc7f2a38 100644 --- a/lib/targets/cuda/comm_target.cpp +++ b/lib/targets/cuda/comm_target.cpp @@ -75,8 +75,11 @@ namespace quda // TODO: We maybe can force loopback comms to use the IB path here if (comm_dim(dim) == 1) continue; #endif - // even if comm_dim(2) == 2, we might not have p2p enabled in both directions, so check this - const int num_dir = (comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) && comm_peer2peer_enabled(1, dim)) ? 1 : 2; + // even if comm_dim(dim) == 2, we might not have p2p enabled in both directions, so check this + const int num_dir + = (!comm_dim_cstar(dim) && comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) && comm_peer2peer_enabled(1, dim)) ? + 1 : + 2; for (int dir = 0; dir < num_dir; dir++) { remote[dim][dir] = nullptr; #ifndef NVSHMEM_COMMS diff --git a/lib/targets/hip/comm_target.cpp b/lib/targets/hip/comm_target.cpp index cbb4725253..30812d4440 100644 --- a/lib/targets/hip/comm_target.cpp +++ b/lib/targets/hip/comm_target.cpp @@ -69,9 +69,11 @@ namespace quda // open the remote memory handles and set the send ghost pointers for (int dim = 0; dim < 4; ++dim) { if (comm_dim(dim) == 1) continue; - // even if comm_dim(2) == 2, we might not have p2p enabled in both directions, so check this - const int num_dir - = (comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) && comm_peer2peer_enabled(1, dim)) ? 1 : 2; + // even if comm_dim(dim) == 2, we might not have p2p enabled in both directions, so check this + const int num_dir = (!comm_dim_cstar(dim) && comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) + && comm_peer2peer_enabled(1, dim)) ? + 1 : + 2; for (int dir = 0; dir < num_dir; ++dir) { remote[dim][dir] = nullptr; if (!comm_peer2peer_enabled(dir, dim)) continue;