From 7380064469938e65feb0b1857974ae18a9311db9 Mon Sep 17 00:00:00 2001
From: James Osborn <osborn@alcf.anl.gov>
Date: Sat, 16 Nov 2024 20:43:44 -0600
Subject: [PATCH] add printing gauge field for debugging initialize
 dslash_type_precondition fix CMake to use QUDA_DIRAC_LAPLACE instead of old
 QUDA_LAPLACE

---
 include/gauge_field.h       | 27 +++++++++++--
 include/gauge_field_order.h | 23 +++++++-----
 lib/check_params.h          |  1 +
 lib/color_spinor_util.in.cu |  4 +-
 lib/gauge_field.cpp         |  9 ++++-
 lib/gauge_norm.in.cu        | 75 ++++++++++++++++++++++++++++++++++++-
 tests/CMakeLists.txt        |  6 +--
 7 files changed, 123 insertions(+), 22 deletions(-)
diff --git a/include/gauge_field.h b/include/gauge_field.h
index a62b6a684f..776681e44b 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -250,12 +250,12 @@ namespace quda {
     */
     void setTuningString();
 
+  public:
     /**
        @brief Initialize the padded region to 0
      */
     void zeroPad();
 
-  public:
     /**
        @brief Default constructor
     */
@@ -455,7 +455,7 @@ namespace quda {
     std::enable_if_t<std::is_pointer_v<T> && !std::is_pointer_v<typename std::remove_pointer<T>::type>, T> data() const
     {
       if (is_pointer_array(order)) errorQuda("Non dim-array ordered field requested but order is %d", order);
-      return reinterpret_cast<T>(gauge.data());
+      return static_cast<T>(gauge.data());
     }
 
     /**
@@ -473,7 +473,7 @@ namespace quda {
                     "data() requires a pointer cast type");
       if (d >= (unsigned)geometry) errorQuda("Invalid array index %d for geometry %d field", d, geometry);
       if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order);
-      return reinterpret_cast<T>(gauge_array[d].data());
+      return static_cast<T>(gauge_array[d].data());
     }
 
     void *raw_pointer() const
@@ -500,7 +500,7 @@ namespace quda {
     {
       if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order);
       array<T, QUDA_MAX_DIM> u = {};
-      for (auto d = 0; d < geometry; d++) u[d] = static_cast<T>(gauge_array[d]);
+      for (auto d = 0; d < geometry; d++) u[d] = static_cast<T>(gauge_array[d].data());
       return u;
     }
 
@@ -651,9 +651,28 @@ namespace quda {
       }
     }
 
+    /**
+     * @brief Print the site data
+     * @param[in] parity Parity index
+     * @param[in] dim The dimension in which we are printing
+     * @param[in] x_cb Checkerboard space-time index
+     * @param[in] rank The rank we are requesting from (default is rank = 0)
+     */
+    void PrintMatrix(int dim, int parity, unsigned int x_cb, int rank = 0) const;
+
     friend struct GaugeFieldParam;
   };
 
+  /**
+     @brief Print the value of the field at the requested coordinates
+     @param[in] a The field we are printing from
+     @param[in] dim The dimension in which we are printing
+     @param[in] parity Parity index
+     @param[in] x_cb Checkerboard space-time index
+     @param[in] rank The rank we are requesting from (default is rank = 0)
+  */
+  void genericPrintMatrix(const GaugeField &a, int dim, int parity, unsigned int x_cb, int rank = 0);
+
   /**
      @brief This is a debugging function, where we cast a gauge field
      into a spinor field so we can compute its L1 norm.
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 38079c3cbb..e3d777565e 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -153,8 +153,11 @@ namespace quda {
 
     template <typename Float, typename storeFloat> __host__ __device__ inline constexpr bool fixed_point() { return false; }
     template <> __host__ __device__ inline constexpr bool fixed_point<float, int8_t>() { return true; }
-    template<> __host__ __device__ inline constexpr bool fixed_point<float,short>() { return true; }
-    template<> __host__ __device__ inline constexpr bool fixed_point<float,int>() { return true; }
+    template <> __host__ __device__ inline constexpr bool fixed_point<float, short>() { return true; }
+    template <> __host__ __device__ inline constexpr bool fixed_point<float, int>() { return true; }
+    template <> __host__ __device__ inline constexpr bool fixed_point<double, int8_t>() { return true; }
+    template <> __host__ __device__ inline constexpr bool fixed_point<double, short>() { return true; }
+    template <> __host__ __device__ inline constexpr bool fixed_point<double, int>() { return true; }
 
     template <typename Float, typename storeFloat> __host__ __device__ inline constexpr bool match() { return false; }
     template<> __host__ __device__ inline constexpr bool match<int,int>() { return true; }
@@ -377,7 +380,7 @@ namespace quda {
       {
         for (int d = 0; d < U.Geometry(); d++)
           u[d] = gauge_ ? static_cast<complex<storeFloat> **>(gauge_)[d] : U.data<complex<storeFloat> *>(d);
-        resetScale(U.Scale());
+        resetScale(U.Scale() * (U.LinkMax() == 0.0 ? 1.0 : U.LinkMax()));
       }
 
       void resetScale(Float max)
@@ -459,7 +462,7 @@ namespace quda {
           ghostOffset[d + 4] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor();
         }
 
-        resetScale(U.Scale());
+        resetScale(U.Scale() * (U.LinkMax() == 0.0 ? 1.0 : U.LinkMax()));
       }
 
       void resetScale(Float max)
@@ -494,7 +497,7 @@ namespace quda {
         scale(static_cast<Float>(1.0)),
         scale_inv(static_cast<Float>(1.0))
       {
-        resetScale(U.Scale());
+        resetScale(U.Scale() * (U.LinkMax() == 0.0 ? 1.0 : U.LinkMax()));
       }
 
       void resetScale(Float max)
@@ -583,7 +586,7 @@ namespace quda {
           ghostOffset[d + 4] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor();
         }
 
-        resetScale(U.Scale());
+        resetScale(U.Scale() * (U.LinkMax() == 0.0 ? 1.0 : U.LinkMax()));
       }
 
       void resetScale(Float max)
@@ -636,7 +639,7 @@ namespace quda {
         scale(static_cast<Float>(1.0)),
         scale_inv(static_cast<Float>(1.0))
       {
-	resetScale(U.Scale());
+        resetScale(U.Scale() * (U.LinkMax() == 0.0 ? 1.0 : U.LinkMax()));
       }
 
       void resetScale(Float max)
@@ -715,7 +718,7 @@ namespace quda {
             nullptr;
           ghostVolumeCB[d + 4] = U.Nface() * U.SurfaceCB(d);
         }
-        resetScale(U.Scale());
+        resetScale(U.Scale() * (U.LinkMax() == 0.0 ? 1.0 : U.LinkMax()));
       }
 
       void resetScale(Float max)
@@ -745,8 +748,8 @@ namespace quda {
        @tparam nSpinCoarse Number of "spin degrees of freedom" (for coarse-link fields only)
        @tparam order Storage order of the field
        @tparam native_ghost Whether to use native ghosts (inlined into
+               the padded area for internal-order fields or use a separate array if false)
        @tparam storeFloat_ Underlying storage type for the field
-       the padded area for internal-order fields or use a separate array if false)
      */
     template <typename Float_, int nColor, int nSpinCoarse, QudaGaugeFieldOrder order, bool native_ghost = true,
               typename storeFloat_ = Float_>
@@ -1478,7 +1481,7 @@ namespace quda {
             z *= scale;
 #pragma unroll
             for (int i = 0; i < 9; i++) out[i] = cmul(z, out[i]);
-          } else { // stagic phase
+          } else { // static phase
 #pragma unroll
             for (int i = 0; i < 9; i++) { out[i] *= phase; }
           }
diff --git a/lib/check_params.h b/lib/check_params.h
index f227ee4716..639db3eb3d 100644
--- a/lib/check_params.h
+++ b/lib/check_params.h
@@ -578,6 +578,7 @@ void printQudaInvertParam(QudaInvertParam *param) {
   // domain decomposition parameters
   //P(inv_type_sloppy, QUDA_INVALID_INVERTER); // disable since invalid means no preconditioner
 #if defined INIT_PARAM
+  P(dslash_type_precondition, QUDA_INVALID_DSLASH);
   P(inv_type_precondition, QUDA_INVALID_INVERTER);
   P(preconditioner, 0);
   P(tol_precondition, INVALID_DOUBLE);
diff --git a/lib/color_spinor_util.in.cu b/lib/color_spinor_util.in.cu
index ef370eda18..324184153f 100644
--- a/lib/color_spinor_util.in.cu
+++ b/lib/color_spinor_util.in.cu
@@ -329,7 +329,7 @@ namespace quda {
       printf("rank = %d x = %u, s = %d, { ", comm_rank(), x_cb, s);
       for (int c = 0; c < o.Ncolor(); c++) {
         auto value = complex<double>(o(parity, x_cb, s, c));
-        printf("(%f,%f) ", value.real(), value.imag());
+        printf("(%g,%g) ", value.real(), value.imag());
       }
       printf("}\n");
     }
@@ -340,7 +340,7 @@ namespace quda {
   {
     if (a.isNative()) {
       constexpr auto order = colorspinor::getNative<Float>(nSpin);
-      print_vector(FieldOrderCB<double, nSpin, nColor, 1, order, Float, Float, false, true>(a), parity, x_cb);
+      print_vector(FieldOrderCB<double, nSpin, nColor, 1, order, Float, Float, false, false>(a), parity, x_cb);
     } else if (a.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
       constexpr auto order = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
       print_vector(FieldOrderCB<double, nSpin, nColor, 1, order, Float, Float, false, true>(a), parity, x_cb);
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index 20cf7621ac..40a1351656 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -1355,7 +1355,7 @@ namespace quda {
       qudaMemcpy(buffer, data(), Bytes(), qudaMemcpyDeviceToHost);
     } else {
       if (is_pointer_array(order)) {
-        char *dst_buffer = reinterpret_cast<char *>(buffer);
+        char *dst_buffer = static_cast<char *>(buffer);
         for (int d = 0; d < site_dim; d++) {
           std::memcpy(&dst_buffer[d * bytes / site_dim], gauge_array[d].data(), bytes / site_dim);
         }
@@ -1375,7 +1375,7 @@ namespace quda {
       qudaMemcpy(data(), buffer, Bytes(), qudaMemcpyHostToDevice);
     } else {
       if (is_pointer_array(order)) {
-        const char *dst_buffer = reinterpret_cast<const char *>(buffer);
+        const char *dst_buffer = static_cast<const char *>(buffer);
         for (int d = 0; d < site_dim; d++) {
           std::memcpy(gauge_array[d].data(), &dst_buffer[d * bytes / site_dim], bytes / site_dim);
         }
@@ -1389,4 +1389,9 @@ namespace quda {
     }
   }
 
+  void GaugeField::PrintMatrix(int dim, int parity, unsigned int x_cb, int rank) const
+  {
+    genericPrintMatrix(*this, dim, parity, x_cb, rank);
+  }
+
 } // namespace quda
diff --git a/lib/gauge_norm.in.cu b/lib/gauge_norm.in.cu
index 0da4412d17..1b1f3c6882 100644
--- a/lib/gauge_norm.in.cu
+++ b/lib/gauge_norm.in.cu
@@ -1,5 +1,6 @@
 #include <gauge_field_order.h>
 #include <instantiate.h>
+#include <memory>
 
 namespace quda {
 
@@ -61,7 +62,7 @@ namespace quda {
       norm_ = norm<T, fixed, 2 * nColor>(u, d, type); // factor of two to account for spin with MG fields
     } else {
       if constexpr (sizeof...(N) > 0) {
-        norm_ = norm<T, fixed>(u, d, type, IntList<N...>());        
+        norm_ = norm<T, fixed>(u, d, type, IntList<N...>());
       } else {
         errorQuda("Nc = %d has not been instantiated", u.Ncolor());
       }
@@ -108,4 +109,76 @@ namespace quda {
     return nrm;
   }
 
+  template <class Order> void print_matrix(const Order &o, int d, int parity, unsigned int x_cb)
+  {
+    for (int r = 0; r < o.Ncolor(); r++) {
+      printf("rank %d parity %d x %u row %d", comm_rank(), parity, x_cb, r);
+      for (int c = 0; c < o.Ncolor(); c++) {
+        auto value = complex<double>(o(d, parity, x_cb, r, c));
+        printf(" (%g,%g)", value.real(), value.imag());
+      }
+      printf("\n");
+    }
+  }
+
+  template <typename Float, int nColor>
+  void genericPrintMatrix(const GaugeField &a, int d, int parity, unsigned int x_cb)
+  {
+    switch (a.FieldOrder()) {
+    case QUDA_FLOAT2_GAUGE_ORDER:
+      print_matrix(FieldOrder<double, nColor, 1, QUDA_FLOAT2_GAUGE_ORDER, true, Float>(a), d, parity, x_cb);
+      break;
+    case QUDA_QDP_GAUGE_ORDER:
+      print_matrix(FieldOrder<double, nColor, 1, QUDA_QDP_GAUGE_ORDER, true, Float>(a), d, parity, x_cb);
+      break;
+    case QUDA_MILC_GAUGE_ORDER:
+      print_matrix(FieldOrder<double, nColor, 1, QUDA_MILC_GAUGE_ORDER, true, Float>(a), d, parity, x_cb);
+      break;
+    default: errorQuda("Unsupported field order %d", a.FieldOrder());
+    }
+  }
+
+  template <typename Float, int nColor, int... N>
+  void genericPrintMatrix(const GaugeField &a, int d, int parity, unsigned int x_cb, IntList<nColor, N...>)
+  {
+    if (a.Ncolor() == nColor) {
+      genericPrintMatrix<Float, nColor>(a, d, parity, x_cb);
+    } else {
+      if constexpr (sizeof...(N) > 0) {
+        genericPrintMatrix<Float, N...>(a, d, parity, x_cb, IntList<N...>());
+      } else {
+        errorQuda("Not supported Ncolor = %d", a.Ncolor());
+      }
+    }
+  }
+
+  template <typename Float> void genericPrintMatrix(const GaugeField &a, int d, int parity, unsigned int x_cb)
+  {
+    genericPrintMatrix<Float>(a, d, parity, x_cb, IntList<@QUDA_MULTIGRID_NC_NVEC_LIST @>());
+  }
+
+  void genericPrintMatrix(const GaugeField &a, int d, int parity, unsigned int x_cb, int rank)
+  {
+    if (rank != comm_rank()) return;
+
+    GaugeFieldParam param(a);
+    param.field = const_cast<GaugeField *>(&a);
+    param.location = QUDA_CPU_FIELD_LOCATION;
+    param.create = QUDA_COPY_FIELD_CREATE;
+    // if field is a pinned device field then we need to clone it on the host
+    bool host_clone
+      = (a.Location() == QUDA_CUDA_FIELD_LOCATION && a.MemType() == QUDA_MEMORY_DEVICE && !use_managed_memory()) ? true :
+                                                                                                                   false;
+    std::unique_ptr<GaugeField> clone_a = !host_clone ? nullptr : std::make_unique<GaugeField>(param);
+    const GaugeField &a_ = !host_clone ? a : *clone_a.get();
+
+    switch (a.Precision()) {
+    case QUDA_DOUBLE_PRECISION: genericPrintMatrix<double>(a_, d, parity, x_cb); break;
+    case QUDA_SINGLE_PRECISION: genericPrintMatrix<float>(a_, d, parity, x_cb); break;
+    case QUDA_HALF_PRECISION: genericPrintMatrix<short>(a_, d, parity, x_cb); break;
+    case QUDA_QUARTER_PRECISION: genericPrintMatrix<int8_t>(a_, d, parity, x_cb); break;
+    default: errorQuda("Precision %d not implemented", a.Precision());
+    }
+  }
+
 } // namespace quda
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index b295e00d9c..2beb90ceae 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -963,7 +963,7 @@ endif()
       set_tests_properties(dslash_${DIRAC_NAME}_build_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol2})
     endif()
 
-    if(QUDA_LAPLACE)
+    if(QUDA_DIRAC_LAPLACE)
       set(DIRAC_NAME laplace)
       add_test(NAME dslash_${DIRAC_NAME}_mat_policy${pol2}
                COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_dslash_ctest> ${MPIEXEC_POSTFLAGS}
@@ -1221,7 +1221,7 @@ foreach(prec IN LISTS TEST_PRECS)
       endif()
     endif()
 
-    if (QUDA_LAPLACE)
+    if (QUDA_DIRAC_LAPLACE)
       add_test(NAME invert_test_laplace_${prec}
         COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_invert_test> ${MPIEXEC_POSTFLAGS}
         --dslash-type laplace --ngcrkrylov 8 --compute-fat-long true
@@ -1469,7 +1469,7 @@ foreach(prec IN LISTS TEST_PRECS)
         --gtest_output=xml:staggered_eigensolve_test_staggered_${prec}.xml)
     endif()
 
-    if (QUDA_LAPLACE)
+    if (QUDA_DIRAC_LAPLACE)
       add_test(NAME eigensolve_test_laplace_${prec}
         COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_eigensolve_test> ${MPIEXEC_POSTFLAGS}
         --dslash-type laplace --compute-fat-long true