diff --git a/CarpetX/src/io_meta.cxx b/CarpetX/src/io_meta.cxx
index e63f972af..40b9355e7 100644
--- a/CarpetX/src/io_meta.cxx
+++ b/CarpetX/src/io_meta.cxx
@@ -75,6 +75,9 @@ YAML::Emitter &operator<<(YAML::Emitter &yaml, const reduction_t &reduction) {
   case reduction_t::maximum_location:
     yaml << "maximum_location";
     break;
+  case reduction_t::sum_location:
+    yaml << "sum_location";
+    break;
   default:
     assert(0);
   }
diff --git a/CarpetX/src/io_meta.hxx b/CarpetX/src/io_meta.hxx
index de5c95260..8395ee7fb 100644
--- a/CarpetX/src/io_meta.hxx
+++ b/CarpetX/src/io_meta.hxx
@@ -32,6 +32,7 @@ enum class reduction_t {
   norm_inf,
   minimum_location,
   maximum_location,
+  sum_location,
 };
 
 struct output_file_description_t {
diff --git a/CarpetX/src/io_norm.cxx b/CarpetX/src/io_norm.cxx
index 7f27380ef..3a9a7e319 100644
--- a/CarpetX/src/io_norm.cxx
+++ b/CarpetX/src/io_norm.cxx
@@ -79,6 +79,11 @@ void OutputNorms(const cGH *restrict cctkGH) {
       reductions.push_back(buf.str());
     }
   }
+  for (int d = 0; d < dim; ++d) {
+    std::ostringstream buf;
+    buf << "sumloc[" << d << "]";
+    reductions.push_back(buf.str());
+  }
 
   if (is_root) {
     static std::once_flag create_directory;
@@ -215,6 +220,8 @@ void OutputNorms(const cGH *restrict cctkGH) {
           for (int d = 0; d < dim; ++d)
             file << sep << red.maxloc[d];
         }
+        for (int d = 0; d < dim; ++d)
+          file << sep << red.sumloc[d];
       }
     }
 
@@ -240,8 +247,9 @@ void OutputNorms(const cGH *restrict cctkGH) {
         ofd.reductions.push_back(reduction_t::minimum_location);
         ofd.reductions.push_back(reduction_t::maximum_location);
       }
+      ofd.reductions.push_back(reduction_t::sum_location);
       ofd.format_name = "CarpetX/norms/TSV";
-      ofd.format_version = {1, 0, 0};
+      ofd.format_version = {1, 1, 0};
 
       OutputMeta_RegisterOutputFile(std::move(ofd));
     }
diff --git a/CarpetX/src/reduction.hxx b/CarpetX/src/reduction.hxx
index cf0179a11..6668f3741 100644
--- a/CarpetX/src/reduction.hxx
+++ b/CarpetX/src/reduction.hxx
@@ -1,6 +1,7 @@
 #ifndef CARPETX_CARPETX_REDUCTION_HXX
 #define CARPETX_CARPETX_REDUCTION_HXX
 
+#include "defs.hxx"
 #include "vect.hxx"
 
 #include <cctk.h>
@@ -14,28 +15,16 @@ namespace CarpetX {
 using namespace std;
 using namespace Arith;
 
-template <typename T> constexpr T pow21(T x) noexcept { return x * x; }
-
-template <typename T> constexpr T fmax1(T x, T y) noexcept {
-  if (isnan(x))
-    return x;
-  if (isnan(y))
-    return y;
-  return fmax(x, y);
-}
-template <typename T> constexpr T fmin1(T x, T y) noexcept {
-  if (isnan(x))
-    return x;
-  if (isnan(y))
-    return y;
-  return fmin(x, y);
-}
-
 template <typename T, int D> struct reduction {
   // TODO: contains_inf, contains_nan?
   T min, max, sum, sum2;
   T vol, maxabs, sumabs, sum2abs;
-  vect<T, D> minloc, maxloc;
+  vect<T, D> minloc, maxloc, sumloc;
+
+  // We currently omit minloc/maxloc/sumloc (TODO: fix this)
+  using tuple_type = amrex::GpuTuple<T, T, T, T, T, T, T, T>;
+  constexpr reduction(tuple_type);
+  constexpr operator tuple_type() const;
 
   constexpr reduction();
   constexpr reduction(const vect<T, D> &p, const T &V, const T &x);
@@ -49,9 +38,9 @@ template <typename T, int D> struct reduction {
 
   constexpr T avg() const noexcept { return sum / vol; }
   constexpr T sdv() const noexcept {
-    // Splitting pow21(sum/vol) improves floating-point accuracy
-    // return sqrt(fmax1(T(0), sum2 / vol - pow21(sum / vol)));
-    return sqrt(fmax1(T(0), sum2 / vol - pow21(sum) / pow21(vol)));
+    // Splitting pow2(sum/vol) improves floating-point accuracy
+    // return sqrt(max1(T(0), sum2 / vol - pow2(sum / vol)));
+    return sqrt(max1(T(0), sum2 / vol - pow2(sum) / pow2(vol)));
   }
   constexpr T norm0() const noexcept { return vol; }
   constexpr T norm1() const noexcept { return sumabs / vol; }
@@ -62,27 +51,42 @@ template <typename T, int D> struct reduction {
   friend ostream &operator<<(ostream &os, const reduction<T1, D1> &red);
 };
 
+template <typename T, int D>
+constexpr reduction<T, D>::reduction(tuple_type tuple)
+    : min(amrex::get<0>(tuple)), max(amrex::get<1>(tuple)),
+      sum(amrex::get<2>(tuple)), sum2(amrex::get<3>(tuple)),
+      vol(amrex::get<4>(tuple)), maxabs(amrex::get<5>(tuple)),
+      sumabs(amrex::get<6>(tuple)), sum2abs(amrex::get<7>(tuple)), minloc{},
+      maxloc{}, sumloc{} {}
+
+template <typename T, int D>
+constexpr reduction<T, D>::operator reduction<T, D>::tuple_type() const {
+  return tuple_type{min, max, sum, sum2, vol, maxabs, sumabs, sum2abs};
+}
+
 template <typename T, int D>
 constexpr reduction<T, D>::reduction()
     : min(1.0 / 0.0), max(-1.0 / 0.0), sum(0.0), sum2(0.0), vol(0.0),
       maxabs(0.0), sumabs(0.0), sum2abs(0.0),
-      minloc(vect<T, D>::pure(0.0 / 0.0)), maxloc(vect<T, D>::pure(0.0 / 0.0)) {
-}
+      minloc(vect<T, D>::pure(0.0 / 0.0)), maxloc(vect<T, D>::pure(0.0 / 0.0)),
+      sumloc(vect<T, D>::pure(0.0)) {}
 
 template <typename T, int D>
 constexpr reduction<T, D>::reduction(const vect<T, D> &p, const T &V,
                                      const T &x)
-    : min(x), max(x), sum(V * x), sum2(V * pow21(x)), vol(V), maxabs(fabs(x)),
-      sumabs(V * fabs(x)), sum2abs(V * pow21(fabs(x))), minloc(p), maxloc(p) {}
+    : min(x), max(x), sum(V * x), sum2(V * pow2(x)), vol(V), maxabs(fabs(x)),
+      sumabs(V * fabs(x)), sum2abs(V * pow2(fabs(x))), minloc(p), maxloc(p),
+      sumloc(x * p) {}
 
 template <typename T, int D>
 constexpr reduction<T, D>::reduction(const reduction &x, const reduction &y)
-    : min(fmin1(x.min, y.min)), max(fmax1(x.max, y.max)), sum(x.sum + y.sum),
+    : min(min1(x.min, y.min)), max(max1(x.max, y.max)), sum(x.sum + y.sum),
       sum2(x.sum2 + y.sum2), vol(x.vol + y.vol),
-      maxabs(fmax1(x.maxabs, y.maxabs)), sumabs(x.sumabs + y.sumabs),
+      maxabs(max1(x.maxabs, y.maxabs)), sumabs(x.sumabs + y.sumabs),
       sum2abs(x.sum2abs + y.sum2abs),
       minloc(x.min <= y.min ? x.minloc : y.minloc),
-      maxloc(x.max >= y.max ? x.maxloc : y.maxloc) {}
+      maxloc(x.max >= y.max ? x.maxloc : y.maxloc),
+      sumloc(x.sumloc + y.sumloc) {}
 
 template <typename T, int D>
 ostream &operator<<(ostream &os, const reduction<T, D> &red) {
@@ -97,6 +101,7 @@ ostream &operator<<(ostream &os, const reduction<T, D> &red) {
             << "  sum2abs:  " << red.sum2abs << "\n"
             << "  minloc:   " << red.minloc << "\n"
             << "  maxloc:   " << red.maxloc << "\n"
+            << "  sumloc:   " << red.sumloc << "\n"
             << "  avg:      " << red.avg() << "\n"
             << "  sdv:      " << red.sdv() << "\n"
             << "  norm1:    " << red.norm1() << "\n"
@@ -105,6 +110,17 @@ ostream &operator<<(ostream &os, const reduction<T, D> &red) {
             << "}\n";
 }
 
+template <typename T, int D> struct combine {
+  using value_type = typename reduction<T, D>::tuple_type;
+  constexpr AMREX_GPU_DEVICE value_type init() const {
+    return (value_type)reduction<T, D>();
+  }
+  constexpr AMREX_GPU_DEVICE value_type operator()(const value_type &x,
+                                                   const value_type &y) const {
+    return (value_type)reduction<T, D>(x), reduction<T, D>(y);
+  }
+};
+
 typedef reduction<CCTK_REAL, dim> reduction_CCTK_REAL;
 #pragma omp declare reduction(reduction:reduction_CCTK_REAL : omp_out += omp_in)