diff --git a/examples/combi_example/TaskExample.hpp b/examples/combi_example/TaskExample.hpp index 379817634..6d83aebd3 100644 --- a/examples/combi_example/TaskExample.hpp +++ b/examples/combi_example/TaskExample.hpp @@ -20,7 +20,7 @@ class TaskExample : public Task { */ TaskExample(const LevelVector& l, const std::vector& boundary, real coeff, LoadModel* loadModel, real dt, size_t nsteps, - std::vector p = std::vector(0), + const std::vector& p = std::vector(0), FaultCriterion* faultCrit = (new StaticFaults({0, IndexVector(0), IndexVector(0)}))) : Task(l, boundary, coeff, loadModel, faultCrit), dt_(dt), @@ -31,7 +31,7 @@ class TaskExample : public Task { dfg_(NULL) {} void init(CommunicatorType lcomm, - std::vector decomposition = std::vector()) { + const std::vector& decomposition = std::vector()) { assert(!initialized_); assert(dfg_ == NULL); @@ -108,8 +108,6 @@ class TaskExample : public Task { void run(CommunicatorType lcomm) { assert(initialized_); - int lrank = theMPISystem()->getLocalRank(); - auto elements = dfg_->getData(); // TODO if your Example uses another data structure, you need to copy // the data from elements to that data structure @@ -152,9 +150,9 @@ class TaskExample : public Task { dfg_->gatherFullGrid(fg, r); } - DistributedFullGrid& getDistributedFullGrid(int n = 0) override { return *dfg_; } + DistributedFullGrid& getDistributedFullGrid(size_t n = 0) override { return *dfg_; } - const DistributedFullGrid& getDistributedFullGrid(int n = 0) const override { return *dfg_; } + const DistributedFullGrid& getDistributedFullGrid(size_t n = 0) const override { return *dfg_; } static real myfunction(std::vector& coords, real t) { real u = std::cos(M_PI * t); @@ -183,7 +181,7 @@ class TaskExample : public Task { // new variables that are set by manager. need to be added to serialize real dt_; - size_t nsteps_; + size_t nsteps_ = 0; std::vector p_; // pure local variables that exist only on the worker processes diff --git a/examples/combi_example_faults/TaskExample.hpp b/examples/combi_example_faults/TaskExample.hpp index a98ed3ce5..852f4edf0 100644 --- a/examples/combi_example_faults/TaskExample.hpp +++ b/examples/combi_example_faults/TaskExample.hpp @@ -22,7 +22,7 @@ class TaskExample: public Task { */ TaskExample(DimType dim, const LevelVector& l, const std::vector& boundary, real coeff, LoadModel* loadModel, real dt, size_t nsteps, - std::vector p = std::vector(0), + const std::vector& p = std::vector(0), FaultCriterion* faultCrit = (new StaticFaults({0, IndexVector(0), IndexVector(0)}))) : Task(l, boundary, coeff, loadModel, faultCrit), dt_(dt), @@ -33,7 +33,7 @@ class TaskExample: public Task { combiStep_(0), dfg_(NULL) {} - void init(CommunicatorType lcomm, std::vector decomposition = std::vector()) { + void init(CommunicatorType lcomm, const std::vector& decomposition = std::vector()) { assert(!initialized_); assert(dfg_ == NULL); @@ -110,9 +110,6 @@ class TaskExample: public Task { void run(CommunicatorType lcomm) { assert(initialized_); - int globalRank = theMPISystem()->getGlobalRank(); - int lrank = theMPISystem()->getLocalRank(); - /* pseudo timestepping to demonstrate the behaviour of your typical * time-dependent simulation problem. 
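Note on the default-argument changes in these hunks: binding a default-constructed temporary to a `const&` parameter avoids copying a caller-supplied vector, and the temporary lives until the end of the full call expression. A minimal self-contained sketch of the pattern; `IndexVector` is an illustrative alias here, not the library's definition:

```cpp
#include <iostream>
#include <vector>

using IndexVector = std::vector<long long>;  // illustrative alias only

// The default argument creates an empty temporary per call; it is bound to the
// const reference and is valid for the duration of the call.
void init(const std::vector<IndexVector>& decomposition = std::vector<IndexVector>()) {
  if (decomposition.empty()) {
    std::cout << "using default decomposition\n";
  } else {
    std::cout << "using " << decomposition.size() << " given dimensions\n";
  }
}

int main() {
  init();                               // uses the default temporary
  std::vector<IndexVector> given{{0, 4}, {0, 8}};
  init(given);                          // no copy of 'given' is made
  return 0;
}
```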
*/ auto elements = dfg_->getData(); @@ -147,11 +144,12 @@ class TaskExample: public Task { dfg_->gatherFullGrid(fg, r); } - DistributedFullGrid& getDistributedFullGrid(int n) override { + DistributedFullGrid& getDistributedFullGrid(size_t n) override { return *dfg_; } - const DistributedFullGrid& getDistributedFullGrid(int n) const override{ + const DistributedFullGrid& getDistributedFullGrid(size_t n) const override{ + assert(n == 0); return *dfg_; } @@ -200,7 +198,7 @@ class TaskExample: public Task { // new variables that are set by manager. need to be added to serialize real dt_; - size_t nsteps_; + size_t nsteps_ = 0; size_t stepsTotal_; std::vector p_; diff --git a/examples/combi_workers_only/combi_example_worker_only.cpp b/examples/combi_workers_only/combi_example_worker_only.cpp index af4ce36f4..bdfc4e991 100644 --- a/examples/combi_workers_only/combi_example_worker_only.cpp +++ b/examples/combi_workers_only/combi_example_worker_only.cpp @@ -152,7 +152,7 @@ int main(int argc, char** argv) { interpolationCoords = broadcastParameters::getCoordinatesFromRankZero( interpolationCoordsFile, theMPISystem()->getWorldComm()); - if (interpolationCoords.size() != 1e5) { + if (interpolationCoords.size() != static_cast(1e5)) { sleep(1); throw std::runtime_error("not enough interpolation coordinates"); } diff --git a/examples/distributed_advection/TaskAdvection.hpp b/examples/distributed_advection/TaskAdvection.hpp index 5f955b5bd..3bd2dad31 100644 --- a/examples/distributed_advection/TaskAdvection.hpp +++ b/examples/distributed_advection/TaskAdvection.hpp @@ -33,7 +33,7 @@ class TaskAdvection : public Task { */ TaskAdvection(const LevelVector& l, const std::vector& boundary, real coeff, LoadModel* loadModel, real dt, size_t nsteps, - std::vector p = std::vector(0), + const std::vector& p = std::vector(0), FaultCriterion* faultCrit = (new StaticFaults({0, IndexVector(0), IndexVector(0)}))) : Task(l, boundary, coeff, loadModel, faultCrit), dt_(dt), @@ -42,17 +42,16 @@ class TaskAdvection : public Task { initialized_(false), stepsTotal_(0), dfg_(nullptr) { - for (const auto& b : boundary) { + for ([[maybe_unused]] const auto& b : boundary) { assert(b == 1); } } void init(CommunicatorType lcomm, - std::vector decomposition = std::vector()) override { + const std::vector& decomposition = std::vector()) override { assert(!initialized_); assert(dfg_ == NULL); - auto lrank = theMPISystem()->getLocalRank(); DimType dim = this->getDim(); const LevelVector& l = this->getLevelVector(); @@ -105,7 +104,7 @@ class TaskAdvection : public Task { // swap at the end of each time step auto& u_dot_dphi = *phi_; auto const ElementVector = dfg_->getData(); - for (unsigned int d = 0; d < this->getDim(); ++d) { + for (DimType d = 0; d < this->getDim(); ++d) { static std::vector subarrayExtents; std::vector phi_ghost{}; dfg_->exchangeGhostLayerUpward(d, subarrayExtents, phi_ghost); @@ -183,7 +182,7 @@ class TaskAdvection : public Task { dfg_->gatherFullGrid(fg, r); } - DistributedFullGrid& getDistributedFullGrid(int n = 0) override { return *dfg_; } + DistributedFullGrid& getDistributedFullGrid(size_t n = 0) override { return *dfg_; } void setZero() override { dfg_->setZero(); diff --git a/examples/distributed_third_level/TaskAdvection.hpp b/examples/distributed_third_level/TaskAdvection.hpp index fac0c1ae0..82a4c43c7 100644 --- a/examples/distributed_third_level/TaskAdvection.hpp +++ b/examples/distributed_third_level/TaskAdvection.hpp @@ -33,7 +33,7 @@ class TaskAdvection : public Task { */ TaskAdvection(const 
LevelVector& l, const std::vector& boundary, real coeff, LoadModel* loadModel, real dt, size_t nsteps, - std::vector p = std::vector(0), + const std::vector& p = std::vector(0), FaultCriterion* faultCrit = (new StaticFaults({0, IndexVector(0), IndexVector(0)}))) : Task(l, boundary, coeff, loadModel, faultCrit), dt_(dt), @@ -42,13 +42,13 @@ class TaskAdvection : public Task { initialized_(false), stepsTotal_(0), dfg_(nullptr) { - for (const auto& b : boundary) { + for ([[maybe_unused]] const auto& b : boundary) { assert(b == 1); } } void init(CommunicatorType lcomm, - std::vector decomposition = std::vector()) override { + const std::vector& decomposition = std::vector()) override { assert(!initialized_); assert(dfg_ == NULL); @@ -157,7 +157,7 @@ class TaskAdvection : public Task { } // iterate the lowest layer and update the values, compensating for the wrong update // before - assert(numLocalElements / dfg_->getLocalSizes()[d] == phi_ghost.size()); + assert(static_cast(numLocalElements) / dfg_->getLocalSizes()[d] == phi_ghost.size()); const auto& stride = fullOffsetsInThisDimension; const IndexType jump = stride * dfg_->getLocalSizes()[d]; const IndexType numberOfPolesHigherDimensions = numLocalElements / jump; @@ -235,7 +235,7 @@ class TaskAdvection : public Task { dfg_->gatherFullGrid(fg, r); } - DistributedFullGrid& getDistributedFullGrid(int n = 0) override { return *dfg_; } + DistributedFullGrid& getDistributedFullGrid(size_t n = 0) override { return *dfg_; } void setZero() override { dfg_->setZero(); diff --git a/examples/distributed_third_level/combi_example_worker_only.cpp b/examples/distributed_third_level/combi_example_worker_only.cpp index 91e6ba45b..deceae7cc 100644 --- a/examples/distributed_third_level/combi_example_worker_only.cpp +++ b/examples/distributed_third_level/combi_example_worker_only.cpp @@ -70,7 +70,7 @@ int main(int argc, char** argv) { bool evalMCError = cfg.get("application.mcerror", false); uint16_t numberOfFileParts = cfg.get("io.numberParts", 1); - theMPISystem()->initOuputGroupComm(numberOfFileParts); + theMPISystem()->initOutputGroupComm(numberOfFileParts); // read in third level parameters if available std::string thirdLevelHost, thirdLevelSSHCommand = ""; @@ -166,7 +166,7 @@ int main(int argc, char** argv) { interpolationCoords = broadcastParameters::getCoordinatesFromRankZero( interpolationCoordsFile, theMPISystem()->getWorldComm()); - if (interpolationCoords.size() != 1e5) { + if (interpolationCoords.size() != static_cast(1e5)) { sleep(1); throw std::runtime_error("not enough interpolation coordinates"); } diff --git a/examples/distributed_third_level/tools/worker_only_io_test.cpp b/examples/distributed_third_level/tools/worker_only_io_test.cpp index 303f0ab0b..1da1fdc8c 100644 --- a/examples/distributed_third_level/tools/worker_only_io_test.cpp +++ b/examples/distributed_third_level/tools/worker_only_io_test.cpp @@ -69,7 +69,7 @@ int main(int argc, char** argv) { nsteps = cfg.get("application.nsteps"); bool evalMCError = cfg.get("application.mcerror", false); uint16_t numberOfFileParts = cfg.get("io.numberParts", 1); - theMPISystem()->initOuputGroupComm(numberOfFileParts); + theMPISystem()->initOutputGroupComm(numberOfFileParts); // read in third level parameters if available std::string thirdLevelHost, thirdLevelSSHCommand = ""; @@ -86,8 +86,7 @@ int main(int argc, char** argv) { MIDDLE_PROCESS_EXCLUSIVE_SECTION std::cout << "running in file-based third level mode" << std::endl; } else { - MIDDLE_PROCESS_EXCLUSIVE_SECTION std::cout << "running in 
file-based local mode" - << std::endl; + MIDDLE_PROCESS_EXCLUSIVE_SECTION std::cout << "running in file-based local mode" << std::endl; } // periodic boundary conditions @@ -239,13 +238,12 @@ int main(int argc, char** argv) { readSparseGridFile = "dsg_" + std::to_string((systemNumber + 1) % 2) + "_step" + std::to_string(i); std::string readSparseGridFileToken = readSparseGridFile + "_token.txt"; - OUTPUT_GROUP_EXCLUSIVE_SECTION { - } + OUTPUT_GROUP_EXCLUSIVE_SECTION {} } else { readSparseGridFile = writeSparseGridFile; - OUTPUT_GROUP_EXCLUSIVE_SECTION { - worker.waitForTokenFile(writeSparseGridFileToken); + OUTPUT_GROUP_EXCLUSIVE_SECTION { + worker.waitForTokenFile(writeSparseGridFileToken); Stats::startEvent("read SG"); int numRead = worker.readReduce(readSparseGridFile, true); Stats::stopEvent("read SG"); diff --git a/examples/gene_distributed/src/GeneTask.cpp b/examples/gene_distributed/src/GeneTask.cpp index b3088dd42..d86943213 100644 --- a/examples/gene_distributed/src/GeneTask.cpp +++ b/examples/gene_distributed/src/GeneTask.cpp @@ -163,7 +163,7 @@ void GeneTask::decideToKill(){ //toDo check if combiStep should be included in t /** * This routine initializes GeneTask; currently it only sets a bool value. */ -void GeneTask::init(CommunicatorType lcomm, std::vector decomposition){ +void GeneTask::init(CommunicatorType lcomm, const std::vector& decomposition){ // if( dfg_ == NULL ){ // dfg_ = new OwningDistributedFullGrid( dim_, l_, lcomm, // this->getBoundary(), p_, false); @@ -247,7 +247,7 @@ void GeneTask::getFullGrid( FullGrid& fg, RankType lroot, /** * This routine returns the local part of the fullgrid */ -DistributedFullGrid& GeneTask::getDistributedFullGrid(int specie){ +DistributedFullGrid& GeneTask::getDistributedFullGrid(size_t specie) override { return *dfgVector_[specie]; } diff --git a/examples/gene_distributed/src/GeneTask.hpp b/examples/gene_distributed/src/GeneTask.hpp index 9d7651b43..0a33e749c 100644 --- a/examples/gene_distributed/src/GeneTask.hpp +++ b/examples/gene_distributed/src/GeneTask.hpp @@ -31,7 +31,7 @@ class GeneTask : public combigrid::Task { public: GeneTask(DimType dim, const LevelVector& l, const std::vector& boundary, real coeff, LoadModel* loadModel, std::string& path, real dt, real combitime, size_t nsteps, - real shat, real lx, int ky0_ind, std::vector p = std::vector(0), + real shat, real lx, int ky0_ind, const std::vector& p = std::vector(0), FaultCriterion* faultCrit = (new StaticFaults({0, IndexVector(0), IndexVector(0)})), IndexType numSpecies = 1, bool GENE_Global = false, bool GENE_Linear = true, size_t checkpointFrequency = 1, size_t offsetForDiagnostics = 0); @@ -57,20 +57,21 @@ class GeneTask : public combigrid::Task { */ void changeDir(CommunicatorType lcomm); - //void init(CommunicatorType lcomm); + // void init(CommunicatorType lcomm); /** * This method initializes the task * lcomm is the local communicator of the process group. 
* decomposition is the spatial decomposition of the component grid */ - void init(CommunicatorType lcomm, std::vector decomposition = std::vector()); + void init(CommunicatorType lcomm, + const std::vector& decomposition = std::vector()); /** * This method returns the decomposition of the grid of the specified species */ - std::vector getDecomposition(int species){ - return dfgVector_[species]->getDecomposition(); + std::vector getDecomposition(int species) { + return dfgVector_[species]->getDecomposition(); } /** @@ -108,7 +109,7 @@ class GeneTask : public combigrid::Task { /** * Returns the distributed full grid of the specified specie */ - DistributedFullGrid& getDistributedFullGrid(int specie); + DistributedFullGrid& getDistributedFullGrid(size_t specie); /* * Convert fg to GeneGrid and scatter over processes of pgroup. The fullgrid diff --git a/examples/gene_distributed/tools/testBoundaryZ.cpp b/examples/gene_distributed/tools/testBoundaryZ.cpp index c5554a9d4..4facbb928 100644 --- a/examples/gene_distributed/tools/testBoundaryZ.cpp +++ b/examples/gene_distributed/tools/testBoundaryZ.cpp @@ -19,7 +19,7 @@ using namespace combigrid; -void testBoundaryZ(FullGrid& fg, std::string filePrefix ){ +void testBoundaryZ(FullGrid& fg, const std::string& filePrefix ){ const double tol = 1e-15; MultiArrayRef6 fgData = createMultiArrayRef( fg ); diff --git a/examples/gene_distributed_linear/src/GeneTask.cpp b/examples/gene_distributed_linear/src/GeneTask.cpp index e055b0f06..ba83155f8 100644 --- a/examples/gene_distributed_linear/src/GeneTask.cpp +++ b/examples/gene_distributed_linear/src/GeneTask.cpp @@ -184,7 +184,7 @@ void GeneTask::decideToKill(){ //toDo check if combiStep should be included in t /** * This routine initializes GeneTask; currently it only sets a bool value. */ -void GeneTask::init(CommunicatorType lcomm, std::vector decomposition){ +void GeneTask::init(CommunicatorType lcomm, const std::vector& decomposition){ // if( dfg_ == NULL ){ // dfg_ = new DistributedFullGrid( dim_, l_, lcomm, // this->getBoundary(), p_, false); @@ -279,7 +279,7 @@ void GeneTask::getFullGrid( FullGrid& fg, RankType lroot, /** * This routine returns the local part of the fullgrid */ -DistributedFullGrid& GeneTask::getDistributedFullGrid(int specie){ +DistributedFullGrid& GeneTask::getDistributedFullGrid(size_t specie) override { return *dfgVector_[specie]; } diff --git a/examples/gene_distributed_linear/src/GeneTask.hpp b/examples/gene_distributed_linear/src/GeneTask.hpp index 54f60a770..103f01bd6 100644 --- a/examples/gene_distributed_linear/src/GeneTask.hpp +++ b/examples/gene_distributed_linear/src/GeneTask.hpp @@ -32,7 +32,7 @@ class GeneTask : public combigrid::Task { public: GeneTask(DimType dim, const LevelVector& l, const std::vector& boundary, real coeff, LoadModel* loadModel, std::string& path, real dt, real combitime, size_t nsteps, - real shat, real lx, int ky0_ind, std::vector p = std::vector(0), + real shat, real lx, int ky0_ind, const std::vector& p = std::vector(0), FaultCriterion* faultCrit = (new StaticFaults({0, IndexVector(0), IndexVector(0)})), IndexType numSpecies = 1, bool GENE_Global = false, bool GENE_Linear = true); @@ -51,20 +51,21 @@ class GeneTask : public combigrid::Task { */ void changeDir(CommunicatorType lcomm); - //void init(CommunicatorType lcomm); + // void init(CommunicatorType lcomm); /** * This method initializes the task * lcomm is the local communicator of the process group. 
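On the `getDistributedFullGrid` signatures touched here: the `override` specifier is only valid on the in-class declaration; the out-of-class definition in the `.cpp` file repeats the signature without it. A minimal sketch with simplified types (`GridBase`/`GridTask` are illustrative, not library classes):

```cpp
#include <cstddef>

struct GridBase {
  virtual double& grid(std::size_t n) = 0;
  virtual ~GridBase() = default;
};

struct GridTask : GridBase {
  // 'override' belongs here, on the in-class declaration ...
  double& grid(std::size_t n) override;
  double storage[4] = {};
};

// ... but must not be repeated on the out-of-class definition.
double& GridTask::grid(std::size_t n) { return storage[n]; }

int main() {
  GridTask t;
  t.grid(0) = 1.0;
  return 0;
}
```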
* decomposition is the spatial decomposition of the component grid */ - void init(CommunicatorType lcomm, std::vector decomposition = std::vector()); + void init(CommunicatorType lcomm, + const std::vector& decomposition = std::vector()); /** * This method returns the decomposition of the grid of the specified species */ - std::vector getDecomposition(int species){ - return dfgVector_[species]->getDecomposition(); + std::vector getDecomposition(int species) { + return dfgVector_[species]->getDecomposition(); } /** @@ -102,7 +103,7 @@ class GeneTask : public combigrid::Task { /** * Returns the distributed full grid of the specified specie */ - DistributedFullGrid& getDistributedFullGrid(int specie); + DistributedFullGrid& getDistributedFullGrid(size_t specie); /* * Convert fg to GeneGrid and scatter over processes of pgroup. The fullgrid diff --git a/examples/selalib_distributed/src/SelalibTask.hpp b/examples/selalib_distributed/src/SelalibTask.hpp index e04c0542c..d6e1bdb41 100644 --- a/examples/selalib_distributed/src/SelalibTask.hpp +++ b/examples/selalib_distributed/src/SelalibTask.hpp @@ -97,7 +97,7 @@ class SelalibTask : public combigrid::Task { // MASTER_EXCLUSIVE_SECTION{ // std::cout << "run " << *this << std::endl; // } - bool haveResolution = coeff_ == std::numeric_limits::max(); + // bool haveResolution = coeff_ == std::numeric_limits::max(); Stats::startEvent("BSL run"); if (selalibSimPointer_ == nullptr) { throw std::runtime_error("selalibSimPointer_ is null"); @@ -105,7 +105,8 @@ class SelalibTask : public combigrid::Task { sim_bsl_vp_3d3v_cart_dd_slim_run(simPtrPtr_); Stats::stopEvent("BSL run"); - int32_t* iPtr = ¤tNumTimeStepsRun_; + int32_t currentNumTimeStepsAsInt = static_cast(currentNumTimeStepsRun_); + int32_t* iPtr = ¤tNumTimeStepsAsInt; sim_bsl_vp_3d3v_cart_dd_slim_write_diagnostics(simPtrPtr_, iPtr); changeDir(lcomm, true); } @@ -134,7 +135,7 @@ class SelalibTask : public combigrid::Task { * decomposition is the spatial decomposition of the component grid */ void init(CommunicatorType lcomm, - std::vector decomposition = std::vector()) { + const std::vector& decomposition = std::vector()) { static_assert(!reverseOrderingDFGPartitions, "BSL needs this flag to be false, " "change only if the partitioning does not match between dfg and distribution"); @@ -203,7 +204,7 @@ class SelalibTask : public combigrid::Task { /** * Returns the distributed full grid of the specified specie */ - DistributedFullGrid& getDistributedFullGrid(int specie) { return *dfg_; } + DistributedFullGrid& getDistributedFullGrid(size_t specie) { return *dfg_; } /** * @return double* the pointer to the distribution (in Fortran allocated memory) @@ -225,7 +226,7 @@ class SelalibTask : public combigrid::Task { */ real getCurrentTime() const override { return currentNumTimeStepsRun_ * dt_; } - void setCurrentNumTimeStepsRun(IndexType currentNumTimeStepsRun) { + void setCurrentNumTimeStepsRun(size_t currentNumTimeStepsRun) { currentNumTimeStepsRun_ = currentNumTimeStepsRun; } @@ -237,7 +238,8 @@ class SelalibTask : public combigrid::Task { changeDir(dfg_->getCommunicator(), false); // set selalib distribution from dfg assert(diagnosticsInitialized_); - int32_t* iPtr = ¤tNumTimeStepsRun_; + int32_t currentNumTimeStepsAsInt = static_cast(currentNumTimeStepsRun_); + int32_t* iPtr = ¤tNumTimeStepsAsInt; sim_bsl_vp_3d3v_cart_dd_slim_write_diagnostics(simPtrPtr_, iPtr); changeDir(dfg_->getCommunicator(), true); } @@ -266,7 +268,7 @@ class SelalibTask : public combigrid::Task { /* * simulation time specific 
parameters */ - int32_t currentNumTimeStepsRun_; // current number of time steps already run in the simulation + size_t currentNumTimeStepsRun_; // current number of time steps already run in the simulation real dt_; /**number of time-steps in between two combinations diff --git a/examples/selalib_distributed/src/selalib_distributed.cpp b/examples/selalib_distributed/src/selalib_distributed.cpp index bb2f53be9..ffd1823ef 100644 --- a/examples/selalib_distributed/src/selalib_distributed.cpp +++ b/examples/selalib_distributed/src/selalib_distributed.cpp @@ -165,7 +165,7 @@ int main(int argc, char** argv) { std::cout << "reduceCombinationDimsLmax = " << reduceCombinationDimsLmax << std::endl; std::cout << "boundary = " << boundary << std::endl; std::cout << "hierarchization_dims = " << hierarchizationDims << std::endl; - auto numDOF = printCombiDegreesOfFreedom(combischeme.getCombiSpaces(), boundary); + [[maybe_unused]] auto numDOF = printCombiDegreesOfFreedom(combischeme.getCombiSpaces(), boundary); std::cout << "CombiScheme: " << std::endl; for (size_t i = 0; i < levels.size(); ++i) { std::cout << "\t" << levels[i] << " " << coeffs[i] << std::endl; @@ -177,11 +177,10 @@ int main(int argc, char** argv) { setCheckpointRestart(basename, levels); } else { // using a very high diagnostics interval -> write no diagnostics in the component grids - size_t veryHighNumber = 2147483647; // =2^31-1 - size_t sometimes = 100; - size_t always = 1; + [[maybe_unused]] size_t veryHighNumber = 2147483647; // =2^31-1 + [[maybe_unused]] size_t sometimes = 100; + [[maybe_unused]] size_t always = 1; // create necessary folders and files to run each task in a separate folder - std::vector taskNumbers(levels.size()); // only necessary in case of static task assignment std::iota(taskNumbers.begin(), taskNumbers.end(), 0); createTaskFolders(basename, levels, taskNumbers, p, nsteps, dt, always, nameDiagnostics); diff --git a/examples/selalib_distributed/src/selalib_distributed_workers_only.cpp b/examples/selalib_distributed/src/selalib_distributed_workers_only.cpp index 20e5d1c9d..e0847bd83 100644 --- a/examples/selalib_distributed/src/selalib_distributed_workers_only.cpp +++ b/examples/selalib_distributed/src/selalib_distributed_workers_only.cpp @@ -166,9 +166,9 @@ int main(int argc, char** argv) { setCheckpointRestart(basename, levels); } else { // using a very high diagnostics interval -> write no diagnostics in the component grids - size_t veryHighNumber = 2147483647; // =2^31-1 - size_t sometimes = 100; - size_t always = 1; + [[maybe_unused]] size_t veryHighNumber = 2147483647; // =2^31-1 + [[maybe_unused]] size_t sometimes = 100; + [[maybe_unused]] size_t always = 1; // create necessary folders and files to run each task in a separate folder createTaskFolders(basename, levels, taskNumbers, p, nsteps, dt, veryHighNumber, nameDiagnostics); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 030936704..ffafb516d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -158,9 +158,8 @@ target_link_libraries(discotec PUBLIC MPI::MPI_CXX) include(FetchContent) -find_package(Boost REQUIRED COMPONENTS serialization headers program_options unit_test_framework system filesystem date_time) +find_package(Boost REQUIRED COMPONENTS serialization program_options unit_test_framework system filesystem date_time) -target_include_directories(discotec PRIVATE Boost::headers) target_link_libraries(discotec PRIVATE Boost::serialization Boost::program_options Boost::system Boost::date_time) find_package(GLPK REQUIRED) diff 
--git a/src/combicom/CombiCom.hpp b/src/combicom/CombiCom.hpp index d1b02cc3f..20b475749 100644 --- a/src/combicom/CombiCom.hpp +++ b/src/combicom/CombiCom.hpp @@ -50,14 +50,14 @@ void FGAllreduce(FullGrid& fg, MPI_Comm comm) { } template -int sumAndCheckSubspaceSizes(const SparseGridType& dsg, - const std::vector& subspaceSizes) { - int bsize = 0; +size_t sumAndCheckSubspaceSizes(const SparseGridType& dsg, + const std::vector& subspaceSizes) { + size_t bsize = 0; for (size_t i = 0; i < subspaceSizes.size(); ++i) { // check for implementation errors, the reduced subspace size should not be // different from the size of already initialized subspaces bool check = (subspaceSizes[i] == 0 || dsg.getDataSize(i) == 0 || - subspaceSizes[i] == int(dsg.getDataSize(i))); + subspaceSizes[i] == dsg.getDataSize(i)); if (!check) { int rank; @@ -215,7 +215,6 @@ std::vector>& get static thread_local std::set chunkSubspaces; chunkSubspaces.clear(); - const auto subspaceItBefore = subspaceIt; SubspaceSizeType chunkDataSize = 0; // select chunk of not more than chunkSize, but at least one index auto nextAddedDataSize = dsg.getDataSize(*subspaceIt); @@ -225,7 +224,7 @@ std::vector>& get ++subspaceIt; } while (subspaceIt != siContainer.cend() && (nextAddedDataSize = dsg.getDataSize(*subspaceIt)) && - (chunkDataSize + nextAddedDataSize) < maxChunkSize); + (chunkDataSize + nextAddedDataSize) < static_cast(maxChunkSize)); subspaceIndicesChunks.push_back(std::move(chunkSubspaces)); } return subspaceIndicesChunks; @@ -255,7 +254,7 @@ getReductionDatatypes(const DistributedSparseGridUniform& dsg, arrayOfDisplacements.reserve(subspacesChunk.size()); for (const auto& ss : subspacesChunk) { arrayOfBlocklengths.push_back(dsg.getDataSize(ss)); - arrayOfDisplacements.push_back(dsg.getData(ss) - rawDataStartFirst); + arrayOfDisplacements.push_back(static_cast(dsg.getData(ss) - rawDataStartFirst)); } MPI_Datatype myIndexedDatatype; @@ -298,13 +297,14 @@ void distributedGlobalSubspaceReduce(SparseGridType& dsg, uint32_t maxMiBToSendP datatypesByStartIndex = getReductionDatatypes(dsg, commAndItsSubspaces.second, maxMiBToSendPerThread); } - + /* // // this would be best for outgroup reduce, but leads to MPI truncation // // errors if not ordered (desynchronization between MPI ranks on the same communicators // // probably) // #pragma omp parallel if (dsg.getSubspacesByCommunicator().size() == 1) default(none) \ // shared(dsg, indexedAdd, datatypesByStartIndex, commAndItsSubspaces) // #pragma omp for ordered schedule(static) + */ for (size_t datatypeIndex = 0; datatypeIndex < datatypesByStartIndex.size(); ++datatypeIndex) { // reduce for each datatype auto& subspaceStartIndex = datatypesByStartIndex[datatypeIndex].first; @@ -312,7 +312,7 @@ void distributedGlobalSubspaceReduce(SparseGridType& dsg, uint32_t maxMiBToSendP auto& datatype = datatypesByStartIndex[datatypeIndex].second; if (globalReduceRankThatCollects == MPI_PROC_NULL) { // #pragma omp ordered - auto success = MPI_Allreduce(MPI_IN_PLACE, dsg.getData(subspaceStartIndex), 1, datatype, + [[maybe_unused]] auto success = MPI_Allreduce(MPI_IN_PLACE, dsg.getData(subspaceStartIndex), 1, datatype, indexedAdd, comm); assert(success == MPI_SUCCESS); @@ -394,7 +394,7 @@ static void asyncBcastDsgData(SparseGridType& dsg, RankType root, CommunicatorTy MPI_Datatype dataType = getMPIDatatype(abstraction::getabstractionDataType()); - auto success = MPI_Ibcast(data, dataSize, dataType, root, comm, request); + [[maybe_unused]] auto success = MPI_Ibcast(data, dataSize, dataType, root, 
comm, request); assert(success == MPI_SUCCESS); } @@ -421,7 +421,7 @@ static void asyncBcastOutgroupDsgData(SparseGridType& dsg, RankType root, Commun auto& subspaceStartIndex = datatypesByStartIndex[0].first; auto& datatype = datatypesByStartIndex[0].second; - auto success = MPI_Ibcast(dsg.getData(subspaceStartIndex), 1, datatype, root, comm, request); + [[maybe_unused]] auto success = MPI_Ibcast(dsg.getData(subspaceStartIndex), 1, datatype, root, comm, request); assert(success == MPI_SUCCESS); } } @@ -451,7 +451,8 @@ void reduceSubspaceSizes(SparseGridType& dsg, CommunicatorType comm) { MPI_Datatype dtype = getMPIDatatype(abstraction::getabstractionDataType()); // perform allreduce - assert(dsg.getNumSubspaces() < static_cast(std::numeric_limits::max())); + assert(dsg.getNumSubspaces() < + static_cast(std::numeric_limits::max())); MPI_Allreduce(MPI_IN_PLACE, dsg.getSubspaceDataSizes().data(), static_cast(dsg.getNumSubspaces()), dtype, MPI_MAX, comm); // assume that the sizes changed, the buffer might be the wrong size now @@ -464,7 +465,8 @@ void broadcastSubspaceSizes(SparseGridType& dsg, CommunicatorType comm, RankType MPI_Datatype dtype = getMPIDatatype(abstraction::getabstractionDataType()); // perform broadcast - assert(dsg.getNumSubspaces() < static_cast(std::numeric_limits::max())); + assert(dsg.getNumSubspaces() < + static_cast(std::numeric_limits::max())); MPI_Bcast(dsg.getSubspaceDataSizes().data(), static_cast(dsg.getNumSubspaces()), dtype, sendingRank, comm); // assume that the sizes changed, the buffer might be the wrong size now @@ -480,7 +482,8 @@ void localMaxReduceSubspaceSizes(SparseGridType& dsg, CommunicatorType comm, Ran std::vector subspaceSizes = dsg.getSubspaceDataSizes(); // perform broadcast - assert(dsg.getNumSubspaces() < static_cast(std::numeric_limits::max())); + assert(dsg.getNumSubspaces() < + static_cast(std::numeric_limits::max())); MPI_Bcast(subspaceSizes.data(), static_cast(dsg.getNumSubspaces()), dtype, sendingRank, comm); assert(sumAndCheckSubspaceSizes(dsg, subspaceSizes)); @@ -531,7 +534,7 @@ void receiveSubspaceSizesWithScatter(SparseGridType& dsg, CommunicatorType comm, RankType collectorRank) { auto numSubspaces = static_cast(dsg.getNumSubspaces()); assert(numSubspaces > 0); - assert(numSubspaces == dsg.getSubspaceDataSizes().size()); + assert(static_cast(numSubspaces) == dsg.getSubspaceDataSizes().size()); MPI_Datatype dtype = getMPIDatatype(abstraction::getabstractionDataType()); // receive updated sizes from manager diff --git a/src/combischeme/CombiMinMaxScheme.hpp b/src/combischeme/CombiMinMaxScheme.hpp index 713d766df..8166cef68 100644 --- a/src/combischeme/CombiMinMaxScheme.hpp +++ b/src/combischeme/CombiMinMaxScheme.hpp @@ -163,7 +163,7 @@ class CombiMinMaxSchemeFromFile : public CombiMinMaxScheme { assert(lmin <= lvl); combiSpaces_.push_back(lvl); } else if (c.first == "group_no") { - processGroupNumbers_.push_back(c.second.get_value()); + processGroupNumbers_.push_back(c.second.get_value()); } else { assert(false); } @@ -176,10 +176,10 @@ class CombiMinMaxSchemeFromFile : public CombiMinMaxScheme { assert(coefficients_.size() == combiSpaces_.size()); } - inline const std::vector& getProcessGroupNumbers() const { return processGroupNumbers_; } + inline const std::vector& getProcessGroupNumbers() const { return processGroupNumbers_; } private: - std::vector processGroupNumbers_; + std::vector processGroupNumbers_; }; inline long long int getCombiDegreesOfFreedom(const LevelVector& level, @@ -300,7 +300,7 @@ inline std::vector 
getPartitionedNumDOFSG( IndexType multiplier = 1; for (const auto& d : decomposition) { decompositionOffsets.push_back(multiplier); - multiplier *= d.size(); + multiplier *= static_cast(d.size()); } auto numProcsPerGroup = multiplier; std::vector numDOF(numProcsPerGroup, 0); @@ -319,8 +319,8 @@ inline std::vector getPartitionedNumDOFSG( level_d, decomposition[d][k + 1], referenceLevel[d]); if (highestIndexOfSubspaceInThisDimensionInThisProcess != -1) { subspaceExtentsPerProcessPerDimension[d][k] = - highestIndexOfSubspaceInThisDimensionInThisProcess - - numIndicesProcessedOnPoleMinusOne; + static_cast(highestIndexOfSubspaceInThisDimensionInThisProcess - + numIndicesProcessedOnPoleMinusOne); numIndicesProcessedOnPoleMinusOne = highestIndexOfSubspaceInThisDimensionInThisProcess; } else { subspaceExtentsPerProcessPerDimension[d][k] = 0; @@ -329,10 +329,10 @@ inline std::vector getPartitionedNumDOFSG( // the rest belongs to the last partition subspaceExtentsPerProcessPerDimension[d][subspaceExtentsPerProcessPerDimension[d].size() - 1] = - powerOfTwo[level_d - 1] - numIndicesProcessedOnPoleMinusOne - 1; + powerOfTwo[level_d - 1] - static_cast(numIndicesProcessedOnPoleMinusOne) - 1; if (level_d == 1) { subspaceExtentsPerProcessPerDimension[d][subspaceExtentsPerProcessPerDimension[d].size() - - 1] = 3 - numIndicesProcessedOnPoleMinusOne - 1; + 1] = static_cast(3 - numIndicesProcessedOnPoleMinusOne - 1); assert(std::accumulate(subspaceExtentsPerProcessPerDimension[d].begin(), subspaceExtentsPerProcessPerDimension[d].end(), 0) == 3); } else { @@ -350,8 +350,8 @@ inline std::vector getPartitionedNumDOFSG( size_t numDOFtoAdd = 1; // iterate the vector index entries belonging to linear index i auto tmp = i; - for (int d = dim - 1; d >= 0; --d) { - assert(d < subspaceExtentsPerProcessPerDimension.size()); + for (IndexType d = dim - 1; d >= 0; --d) { + assert(static_cast(d) < subspaceExtentsPerProcessPerDimension.size()); auto decompositionIndexInDimD = tmp / decompositionOffsets[d]; assert(decompositionIndexInDimD < subspaceExtentsPerProcessPerDimension[d].size()); numDOFtoAdd *= subspaceExtentsPerProcessPerDimension[d][decompositionIndexInDimD]; @@ -468,7 +468,7 @@ inline size_t getAssignedLevels(const CombiMinMaxSchemeFromFile& combischeme, throw std::runtime_error( "CombiSchemeFromFile::getAssignedLevels : inconsistent scheme from file"); } - const auto [itMin, itMax] = std::minmax_element(pgNumbers.begin(), pgNumbers.end()); + [[maybe_unused]] const auto [itMin, itMax] = std::minmax_element(pgNumbers.begin(), pgNumbers.end()); assert(*itMin == 0); // make sure it starts with 0 // filter out only those tasks that belong to "our" process group for (size_t taskNo = 0; taskNo < pgNumbers.size(); ++taskNo) { @@ -534,7 +534,7 @@ inline size_t getLoadBalancedLevels(const CombiMinMaxScheme& combischeme, // find the group with the lowest load RankType minGroup = 0; long long minLoad = std::numeric_limits::max(); - for (RankType group = 0; group < totalNumGroups; ++group) { + for (RankType group = 0; group < static_cast(totalNumGroups); ++group) { if (loadAssignedToGroup[group] < minLoad) { minGroup = group; minLoad = loadAssignedToGroup[group]; diff --git a/src/combischeme/CombiThirdLevelScheme.cpp b/src/combischeme/CombiThirdLevelScheme.cpp index 9e66d86dc..724a21776 100644 --- a/src/combischeme/CombiThirdLevelScheme.cpp +++ b/src/combischeme/CombiThirdLevelScheme.cpp @@ -34,7 +34,7 @@ void CombiThirdLevelScheme::decomposeScheme(const std::vector& full std::vector>& decomposedCoeffs, size_t numSystems, 
const std::vector& fractionsOfScheme) { - auto fracSum = + [[maybe_unused]] auto fracSum = std::accumulate(fractionsOfScheme.begin(), fractionsOfScheme.end(), 0., std::plus()); assert(std::abs(fracSum - 1.) < 1e-3); assert(fractionsOfScheme.size() == numSystems); @@ -56,7 +56,7 @@ void CombiThirdLevelScheme::decomposeScheme(const std::vector& full } // assert that all are assigned - assert(std::accumulate(decomposedScheme.begin(), decomposedScheme.end(), 0, + assert(std::accumulate(decomposedScheme.begin(), decomposedScheme.end(), static_cast(0), [](size_t a, const std::vector& l) { return a + l.size(); }) == fullScheme.size()); diff --git a/src/fault_tolerance/FTUtils.cpp b/src/fault_tolerance/FTUtils.cpp index 0f65d940d..32226242f 100644 --- a/src/fault_tolerance/FTUtils.cpp +++ b/src/fault_tolerance/FTUtils.cpp @@ -116,7 +116,7 @@ matrix get_inv_M(const CombigridDict& aux_downset, const int& dim) { return M_inv; } -CombigridDict set_entire_downset_dict(const LevelVectorList levels, +CombigridDict set_entire_downset_dict(const LevelVectorList& levels, const CombigridDict& received_dict, const int& dim) { LevelVector level_min = levels.front(); LevelVector level_max = levels.back(); diff --git a/src/fault_tolerance/FTUtils.hpp b/src/fault_tolerance/FTUtils.hpp index 7d73129e9..fd6c5be86 100644 --- a/src/fault_tolerance/FTUtils.hpp +++ b/src/fault_tolerance/FTUtils.hpp @@ -29,7 +29,7 @@ CombigridDict get_python_data(const std::string& script_run, const int& dim); matrix get_inv_M(const CombigridDict& aux_downset, const int& dim); /* used to create the inverse of M matrix for the interpolation based problem */ -CombigridDict set_entire_downset_dict(const LevelVectorList levels, +CombigridDict set_entire_downset_dict(const LevelVectorList& levels, const CombigridDict& received_dict, const int& dim); /* used to get a vector of the entire downset indices */ diff --git a/src/fault_tolerance/StaticFaults.cpp b/src/fault_tolerance/StaticFaults.cpp index f5f0db20b..61b775479 100644 --- a/src/fault_tolerance/StaticFaults.cpp +++ b/src/fault_tolerance/StaticFaults.cpp @@ -14,7 +14,7 @@ bool StaticFaults::failNow(int ncombi, real t_iter, int globalRank) { std::vector::iterator it; it = std::find(iF.begin(), iF.end(), ncombi); - IndexType idx = std::distance(iF.begin(), it); + auto idx = std::distance(iF.begin(), it); // std::cout << "faultInfo" << iF[0] << " " << rF[0] << "\n"; // Check if current iteration is in iterationFaults_ while (it != iF.end()) { diff --git a/src/fullgrid/DistributedFullGrid.hpp b/src/fullgrid/DistributedFullGrid.hpp index 590a556e5..978f04130 100644 --- a/src/fullgrid/DistributedFullGrid.hpp +++ b/src/fullgrid/DistributedFullGrid.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include "fullgrid/FullGrid.hpp" #include "fullgrid/Tensor.hpp" @@ -16,6 +17,7 @@ #include "mpi/MPITags.hpp" #include "sparsegrid/AnyDistributedSparseGrid.hpp" #include "sparsegrid/DistributedSparseGridUniform.hpp" +#include "utils/DecompositionUtils.hpp" #include "utils/IndexVector.hpp" #include "utils/LevelSetUtils.hpp" #include "utils/LevelVector.hpp" @@ -25,45 +27,24 @@ namespace combigrid { -/* a regular (equidistant) domain decompositioning for an even number of processes - * leads to grid points on the (geometrical) process boundaries. - * with the forwardDecomposition flag it can be decided if the grid points on - * the process boundaries belong to the process on the right-hand side (true) - * of the process boundary, or to the one on the left-hand side (false). 
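The `std::accumulate` init value in the hunk above determines the accumulator's type, so summing container sizes with a plain `0` would carry the sum in an `int` and convert back from `std::size_t` on every step. A short, self-contained illustration with made-up data:

```cpp
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  std::vector<std::vector<int>> scheme{{1, 2}, {3}, {4, 5, 6}};

  // The init argument fixes the accumulator type: casting it to std::size_t
  // keeps the whole sum unsigned and avoids narrowing on each step.
  auto total = std::accumulate(
      scheme.begin(), scheme.end(), static_cast<std::size_t>(0),
      [](std::size_t a, const std::vector<int>& l) { return a + l.size(); });

  std::cout << "total levels: " << total << '\n';  // prints 6
  return total == 6 ? 0 : 1;
}
```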
- */ -static inline std::vector getDefaultDecomposition( - const IndexVector& globalNumPointsPerDimension, - const std::vector& cartesianProcsPerDimension, bool forwardDecomposition) { - auto dim = static_cast(globalNumPointsPerDimension.size()); - assert(cartesianProcsPerDimension.size() == dim); - - // create decomposition vectors - std::vector decomposition(dim); - for (DimType i = 0; i < dim; ++i) { - if (cartesianProcsPerDimension[i] > globalNumPointsPerDimension[i]) { - throw std::invalid_argument( - "change to coarser parallelization! currently not all processes can have points."); - } - IndexVector& llbnd = decomposition[i]; - llbnd.resize(cartesianProcsPerDimension[i]); - - for (int j = 0; j < cartesianProcsPerDimension[i]; ++j) { - double tmp = static_cast(globalNumPointsPerDimension[i]) * static_cast(j) / - static_cast(cartesianProcsPerDimension[i]); - - if (forwardDecomposition) - llbnd[j] = static_cast(std::ceil(tmp)); - else - llbnd[j] = static_cast(std::floor(tmp)); - } - } - return decomposition; -} - /** * @brief DistributedFullGrid : a (non-owning) indexed full grid data structure - * - * @tparam FG_ELEMENT + * + * @tparam FG_ELEMENT the data type to be stored on the grid + * @param dim the dimensionality of the full grid (may become a template parameter in future + * versions) + * @param levels the level vector describing the full grid + * @param comm the Cartesian communicator to be used for the distributed full grid (will not be + * duplicated; ownership is not transferred) + * @param hasBdrPoints a vector of flags to show if the dimension has boundary points (0 for no + * points, 1 for one-sided boundary, 2 for both sides) + * @param dataPointer a pointer to the beginning of the data array + * @param procs a vector of the number of processes in each dimension (must match the decomposition + * in comm) + * @param forwardDecomposition a flag to decide if the middle grid points on the process boundaries + * belong to the process on the right-hand side (true) of the process boundary, or to the one on the + * left-hand side (false). 
+ * @param decomposition a vector of the lower bounds of the grid points in each dimension */ template class DistributedFullGrid { @@ -72,374 +53,83 @@ class DistributedFullGrid { DistributedFullGrid(DimType dim, const LevelVector& levels, CommunicatorType const& comm, const std::vector& hasBdrPoints, FG_ELEMENT* dataPointer, const std::vector& procs, bool forwardDecomposition = true, - const std::vector& decomposition = std::vector()) - : dim_(dim), levels_(levels), hasBoundaryPoints_(hasBdrPoints) { - assert(levels_.size() == dim); - assert(hasBoundaryPoints_.size() == dim); - assert(procs.size() == dim); - - InitMPI(comm, procs); - - IndexVector nrPoints(dim_); - gridSpacing_.resize(dim_); - - // set global num of elements and offsets - IndexType nrElements = 1; - for (DimType j = 0; j < dim_; j++) { - nrPoints[j] = combigrid::getNumDofNodal(levels_[j], hasBoundaryPoints_[j]); - nrElements = nrElements * nrPoints[j]; - if (hasBoundaryPoints_[j] == 1) { - assert(!decomposition.empty() || !forwardDecomposition); - } - assert(levels_[j] < 30); - gridSpacing_[j] = oneOverPowOfTwo[levels_[j]]; - } - - if (decomposition.size() == 0) { - setDecomposition(getDefaultDecomposition( - nrPoints, this->getCartesianUtils().getCartesianDimensions(), forwardDecomposition)); - } else { - setDecomposition(decomposition); - } - globalIndexer_ = TensorIndexer(std::move(nrPoints)); - myPartitionsLowerBounds_ = getLowerBounds(this->getRank()); - myPartitionsFirstGlobalIndex_ = globalIndexer_.sequentialIndex(myPartitionsLowerBounds_); - myPartitionsUpperBounds_ = getUpperBounds(this->getRank()); - - // set local elements and local offsets - auto nrLocalPoints = getUpperBounds() - getLowerBounds(); - localTensor_ = Tensor(dataPointer, std::move(nrLocalPoints)); - } + const std::vector& decomposition = std::vector()); DistributedFullGrid(const DistributedFullGrid& other) = delete; DistributedFullGrid& operator=(const DistributedFullGrid&) = delete; DistributedFullGrid(DistributedFullGrid&& other) = default; DistributedFullGrid& operator=(DistributedFullGrid&& other) = default; - virtual ~DistributedFullGrid() { - for (size_t i = 0; i < upwardSubarrays_.size(); ++i) { - MPI_Type_free(&upwardSubarrays_[i]); - } - for (size_t i = 0; i < downwardSubarrays_.size(); ++i) { - MPI_Type_free(&downwardSubarrays_[i]); - } - } - -inline double getPointDistanceToCoordinate(IndexType oneDimensionalLocalIndex, double coord, - DimType d) const { - const auto& h = getGridSpacing(); - auto coordDistance = - static_cast(this->getLowerBounds()[d] + (hasBoundaryPoints_[d] > 0 ? 
0 : 1) + - oneDimensionalLocalIndex) * - h[d] - - coord; -#ifndef NDEBUG - if (std::abs(coordDistance) > h[d]) { - std::cout << "assert bounds " << coordDistance << coord << h << static_cast(d) - << oneDimensionalLocalIndex << std::endl; - assert(false && - "should only be called for coordinates within the support of this point's basis " - "function"); - } -#endif // ndef NDEBUG - return std::abs(coordDistance); -} + virtual ~DistributedFullGrid(); -inline FG_ELEMENT evalIndexAndAllUpperNeighbors(const IndexVector& localIndex, - const std::vector& coords) const { - FG_ELEMENT result = 0.; - auto localLinearIndex = this->getLocalLinearIndex(localIndex); - for (size_t localIndexIterate = 0; - localIndexIterate < combigrid::powerOfTwoByBitshift(this->getDimension()); - ++localIndexIterate) { -#ifndef NDEBUG - auto neighborVectorIndex = localIndex; -#endif - auto neighborIndex = localLinearIndex; - real phi_c = 1.; // value of product of iterate's basis function on coords - for (DimType d = 0; d < dim_; ++d) { - const auto lastIndexInDim = this->getLocalSizes()[d] - 1; - bool isUpperInDim = std::bitset(localIndexIterate).test(d); - auto iterateIndexInThisDimension = localIndex[d] + isUpperInDim; - if (iterateIndexInThisDimension < 0) { - phi_c = 0.; - break; - } else if (isUpperInDim && this->hasBoundaryPoints_[d] == 1 && - // this->getCartesianUtils().isOnLowerBoundaryInDimension(d) && - iterateIndexInThisDimension == this->getGlobalSizes()[d]) { - // if we have a wrap-around AND are on the lowest cartesian process in this dimension - // we need to set the index in this dim to 0 and shift the coordinate by 1. - neighborIndex -= localIndex[d] * this->getLocalOffsets()[d]; - iterateIndexInThisDimension = 0; - auto coordDistance = - getPointDistanceToCoordinate(iterateIndexInThisDimension, coords[d] - 1.0, d); - auto oneOverHinD = powerOfTwo[levels_[d]]; - phi_c *= 1. - coordDistance * oneOverHinD; - } else if (iterateIndexInThisDimension > lastIndexInDim) { - phi_c = 0.; - break; - } else { - auto coordDistance = - getPointDistanceToCoordinate(iterateIndexInThisDimension, coords[d], d); - auto oneOverHinD = powerOfTwo[levels_[d]]; - phi_c *= 1. - coordDistance * oneOverHinD; - neighborIndex += isUpperInDim * this->getLocalOffsets()[d]; - } -#ifndef NDEBUG - neighborVectorIndex[d] = iterateIndexInThisDimension; -#endif - } - if (phi_c > 0.) 
{ -#ifndef NDEBUG - auto unlinearizedNeighborIndex = neighborVectorIndex; - this->getLocalVectorIndex(neighborIndex, unlinearizedNeighborIndex); - if (unlinearizedNeighborIndex != neighborVectorIndex) { - std::cerr << "expected " << unlinearizedNeighborIndex << " got " << neighborVectorIndex - << std::endl; - } - if (neighborIndex < 0) { - std::cerr << "expected " << unlinearizedNeighborIndex << " got " << neighborVectorIndex - << " or " << neighborIndex << std::endl; - } -#endif - assert(neighborIndex > -1); - assert(neighborIndex < this->getNrLocalElements()); - result += phi_c * this->getData()[neighborIndex]; - } - } - return result; -} + inline double getPointDistanceToCoordinate(IndexType oneDimensionalLocalIndex, double coord, + DimType d) const; -/** evaluates the full grid on the specified coordinates - * @param coords ND coordinates on the unit square [0,1]^D*/ -FG_ELEMENT evalLocal(const std::vector& coords) const { - FG_ELEMENT value; - evalLocal(coords, value); - return value; -} + inline FG_ELEMENT evalIndexAndAllUpperNeighbors(const IndexVector& localIndex, + const std::vector& coords) const; -void evalLocal(const std::vector& coords, FG_ELEMENT& value) const { - assert(coords.size() == this->getDimension()); - // get the lowest-index point of the points - // whose basis functions contribute to the interpolated value - const auto& h = getGridSpacing(); - static thread_local IndexVector localIndexLowerNonzeroNeighborPoint; - localIndexLowerNonzeroNeighborPoint.resize(dim_); - for (DimType d = 0; d < dim_; ++d) { -#ifndef NDEBUG - if (coords[d] < 0. || coords[d] > 1.) { - std::cout << "coords " << coords << " out of bounds" << std::endl; - } - assert(coords[d] >= 0. && coords[d] <= 1.); -#endif // ndef NDEBUG - // this is the local index of the point that is lower than the coordinate - // may also be negative if the coordinate is lower than this processes' coordinates - IndexType localIndexLowerNonzeroNeighborIndexInThisDimension = static_cast( - std::floor((coords[d] - this->getLowerBoundsCoord(d)) * this->getInverseGridSpacingIn(d))); + /** evaluates the full grid on the specified coordinates + * @param coords ND coordinates on the unit square [0,1]^D*/ + FG_ELEMENT evalLocal(const std::vector& coords) const; - // check if we even need to evaluate on this process - if (localIndexLowerNonzeroNeighborIndexInThisDimension < -1) { - // index too small - value = 0.; - return; - } else if ((coords[d] >= 1.0 - h[d]) && this->hasBoundaryPoints_[d] == 1 && - this->getCartesianUtils().isOnLowerBoundaryInDimension(d)) { - // if we have periodic boundary and this process is at the lower end of the dimension d - // we need the periodic coordinate => don't return 0 - } else if (localIndexLowerNonzeroNeighborIndexInThisDimension > this->getLocalSizes()[d] - 1) { - // index too high - value = 0.; - return; - } - localIndexLowerNonzeroNeighborPoint[d] = localIndexLowerNonzeroNeighborIndexInThisDimension; - } - // evaluate at those points and sum up according to the basis function - value = evalIndexAndAllUpperNeighbors(localIndexLowerNonzeroNeighborPoint, coords); -} + void evalLocal(const std::vector& coords, FG_ELEMENT& value) const; -/** evaluates the full grid on the specified coordinates - * @param interpolationCoords vector of ND coordinates on the unit square [0,1]^D*/ -std::vector getInterpolatedValues( - const std::vector>& interpolationCoords) const { - auto numValues = interpolationCoords.size(); - std::vector values; - values.resize(numValues); -#pragma omp parallel for 
default(none) firstprivate(numValues) shared(values, interpolationCoords) \ - schedule(static) - for (size_t i = 0; i < numValues; ++i) { - this->evalLocal(interpolationCoords[i], values[i]); - } - MPI_Allreduce(MPI_IN_PLACE, values.data(), static_cast(numValues), this->getMPIDatatype(), - MPI_SUM, this->getCommunicator()); - return values; -} + /** evaluates the full grid on the specified coordinates + * @param interpolationCoords vector of ND coordinates on the unit square [0,1]^D*/ + std::vector getInterpolatedValues( + const std::vector>& interpolationCoords) const; /** return the coordinates on the unit square corresponding to global idx * @param globalIndex [IN] global linear index of the element i * @param coords [OUT] the vector must be resized already */ - inline void getCoordsGlobal(IndexType globalLinearIndex, std::vector& coords) const { - IndexType ind = 0; - IndexType tmp_add = 0; - - coords.resize(dim_); - - for (DimType j = 0; j < dim_; j++) { - ind = globalLinearIndex % this->getGlobalSizes()[j]; - globalLinearIndex = globalLinearIndex / this->getGlobalSizes()[j]; - // set the coordinate based on if we have boundary points - tmp_add = (hasBoundaryPoints_[j] > 0) ? (0) : (1); - coords[j] = static_cast(ind + tmp_add) * getGridSpacing()[j]; - } - } + inline void getCoordsGlobal(IndexType globalLinearIndex, std::vector& coords) const; /** return coordinates on the unit square corresponding to local idx * @param globalIndex [IN] local linear index of the element i * @param coords [OUT] the vector must be resized already */ - inline void getCoordsLocal(IndexType localLinearIndex, std::vector& coords) const { - // todo: probably very inefficient implementation, if crucial for - // performance implement more direct way of computing the coordinates - assert(localLinearIndex < getNrLocalElements()); - - IndexType globalLinearIndex = getGlobalLinearIndex(localLinearIndex); - - getCoordsGlobal(globalLinearIndex, coords); - } + inline void getCoordsLocal(IndexType localLinearIndex, std::vector& coords) const; /** returns the LI (level,index) notation for a given element in the full grid * @param elementIndex [IN] the linear index of the element * @param levels [OUT] the levels of the point in the LI notation * @param indexes [OUT] the indexes of the point in the LI notation */ - inline void getGlobalLI(IndexType elementIndex, LevelVector& levels, IndexVector& indexes) const { - IndexType startindex, tmp_val; - - assert(elementIndex < this->getNrElements()); - levels.resize(dim_); - indexes.resize(dim_); - - tmp_val = elementIndex; - - // first calculate intermediary indexes - for (DimType k = 0; k < dim_; k++) { - startindex = (hasBoundaryPoints_[k] > 0) ? 0 : 1; - indexes[k] = tmp_val % this->getGlobalSizes()[k] + startindex; - tmp_val = tmp_val / this->getGlobalSizes()[k]; - } - - // The level and index of the element in the hashgridstorage are computed dividing by two the - // index and level in the fullgrid - // until we obtain an impair number for the index, thus obtaining the level and index in the - // hierarchical basis (Aliz Nagy) - // ... 
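The `getGlobalLI` routine above maps a nodal index to hierarchical (level, index) notation by halving even indices and decrementing the level. A standalone one-dimensional sketch of that rule (boundary handling omitted, names illustrative):

```cpp
#include <iostream>
#include <utility>

// Nodal index i on level l -> hierarchical (level, index): strip factors of 2.
// Index 0 (the boundary point) is assigned level 0 by convention.
std::pair<int, long long> toHierarchical(int level, long long nodalIndex) {
  if (nodalIndex == 0) return {0, 0};
  while (nodalIndex % 2 == 0) {
    nodalIndex /= 2;
    --level;
  }
  return {level, nodalIndex};
}

int main() {
  // On level 3 (nodal indices 0..8): index 4 is the level-1 midpoint,
  // index 6 is the level-2 point with hierarchical index 3.
  auto [l4, i4] = toHierarchical(3, 4);
  auto [l6, i6] = toHierarchical(3, 6);
  std::cout << l4 << ' ' << i4 << '\n';  // 1 1
  std::cout << l6 << ' ' << i6 << '\n';  // 2 3
  return 0;
}
```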
- for (DimType k = 0; k < dim_; k++) { - tmp_val = levels_[k]; - - if (indexes[k] != 0) { - // todo: these operations can be optimized - while (indexes[k] % 2 == 0) { - indexes[k] = indexes[k] / 2; - tmp_val--; - } - } else { - tmp_val = 0; - } - - levels[k] = tmp_val; - } - } + inline void getGlobalLI(IndexType elementIndex, LevelVector& levels, IndexVector& indexes) const; /** the global vector index corresponding to a global linear index * @param linIndex [IN] the linear index * @param axisIndex [OUT] the returned vector index */ - inline void getGlobalVectorIndex(IndexType globLinIndex, IndexVector& globAxisIndex) const { - assert(globLinIndex < this->getNrElements()); - assert(globAxisIndex.size() == dim_); - - globAxisIndex = this->globalIndexer_.getVectorIndex(globLinIndex); - } + inline void getGlobalVectorIndex(IndexType globLinIndex, IndexVector& globAxisIndex) const; /** the global vector index corresponding to a local vector index */ inline void getGlobalVectorIndex(const IndexVector& locAxisIndex, - IndexVector& globAxisIndex) const { - assert(locAxisIndex.size() == dim_); - - globAxisIndex = this->getLowerBounds() + locAxisIndex; - } + IndexVector& globAxisIndex) const; /** the local vector index corresponding to a local linear index */ - inline void getLocalVectorIndex(IndexType locLinIndex, IndexVector& locAxisIndex) const { - locAxisIndex = this->localTensor_.getVectorIndex(locLinIndex); - } + inline void getLocalVectorIndex(IndexType locLinIndex, IndexVector& locAxisIndex) const; /** the local vector index corresponding to a global vector index if global index vector not contained in local domain false will be returned */ inline bool getLocalVectorIndex(const IndexVector& globAxisIndex, - IndexVector& locAxisIndex) const { - assert(globAxisIndex.size() == dim_); - - if (this->isGlobalIndexHere(globAxisIndex)) { - locAxisIndex.assign(globAxisIndex.begin(), globAxisIndex.end()); - std::transform(locAxisIndex.begin(), locAxisIndex.end(), this->getLowerBounds().begin(), - locAxisIndex.begin(), std::minus()); - return true; - } else { - return false; - } - } + IndexVector& locAxisIndex) const; /** returns the global linear index corresponding to the global index vector * @param axisIndex [IN] the vector index */ - inline IndexType getGlobalLinearIndex(const IndexVector& globAxisIndex) const { - return globalIndexer_.sequentialIndex(globAxisIndex); - } + inline IndexType getGlobalLinearIndex(const IndexVector& globAxisIndex) const; // the global linear index corresponding to the local linear index - inline IndexType getGlobalLinearIndex(IndexType locLinIndex) const { - assert(locLinIndex < this->getNrLocalElements()); - - // convert to local vector index - static thread_local IndexVector locAxisIndex(dim_); - locAxisIndex.resize(dim_); - getLocalVectorIndex(locLinIndex, locAxisIndex); - - // convert to global linear index - IndexType globLinIndex = - this->myPartitionsFirstGlobalIndex_ + this->getGlobalLinearIndex(locAxisIndex); - assert(globLinIndex < this->getNrElements()); - return globLinIndex; - } + inline IndexType getGlobalLinearIndex(IndexType locLinIndex) const; /** the local linear index corresponding to the local index vector */ - inline IndexType getLocalLinearIndex(const IndexVector& locAxisIndex) const { - return localTensor_.sequentialIndex(locAxisIndex); - } + inline IndexType getLocalLinearIndex(const IndexVector& locAxisIndex) const; /** the global linear index corresponding to the local linear index returns negative value if element not inside local partition 
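The index-conversion helpers above all reduce to row-major linearisation plus the partition's lower-bound offset (the class additionally caches the first global index of the partition). A simplified sketch of that bookkeeping, not the library's exact implementation:

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

using Index = long long;

// Linearisation with dimension 0 as the fastest-running index.
Index linearize(const std::vector<Index>& sizes, const std::vector<Index>& idx) {
  Index lin = 0;
  Index stride = 1;
  for (std::size_t d = 0; d < sizes.size(); ++d) {
    lin += idx[d] * stride;
    stride *= sizes[d];
  }
  return lin;
}

int main() {
  // A 9x9 global grid; this partition starts at global index (0, 4).
  std::vector<Index> globalSizes{9, 9};
  std::vector<Index> lowerBounds{0, 4};

  std::vector<Index> localIdx{2, 1};  // local vector index on this partition
  std::vector<Index> globalIdx{lowerBounds[0] + localIdx[0],
                               lowerBounds[1] + localIdx[1]};

  std::cout << "global linear index: " << linearize(globalSizes, globalIdx) << '\n';
  return 0;
}
```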
*/ - inline IndexType getLocalLinearIndex(IndexType globLinIndex) const { - assert(globLinIndex < this->getNrElements()); + inline IndexType getLocalLinearIndex(IndexType globLinIndex) const; - // convert to global vector index - static thread_local IndexVector globAxisIndex(dim_); - globAxisIndex.resize(dim_); - getGlobalVectorIndex(globLinIndex, globAxisIndex); + inline bool isGlobalIndexHere(IndexVector globalVectorIndex) const; - if (this->isGlobalIndexHere(globAxisIndex)) { - // convert to local linear index - return getLocalLinearIndex(globAxisIndex) - this->myPartitionsFirstGlobalIndex_; - } else { - return -1; - } - } - - inline bool isGlobalIndexHere(IndexVector globalVectorIndex) const { - return (globalVectorIndex >= this->getLowerBounds()) && - (globalVectorIndex < this->getUpperBounds()); - } - - inline bool isGlobalIndexHere(IndexType globLinIndex) const { - return isGlobalIndexHere(this->globalIndexer_.getVectorIndex(globLinIndex)); - } + inline bool isGlobalIndexHere(IndexType globLinIndex) const; /** returns the dimension of the full grid */ inline DimType getDimension() const { return dim_; } @@ -451,21 +141,12 @@ std::vector getInterpolatedValues( /** return the offset in the full grid vector of the dimension */ inline IndexType getOffset(DimType i) const { return this->getOffsets()[i]; } - inline const IndexVector& getOffsets() const { - return globalIndexer_.getOffsetsVector(); - } - - inline const IndexVector& getLocalOffsets() const { - return localTensor_.getOffsetsVector(); - } + inline const IndexVector& getOffsets() const { return globalIndexer_.getOffsetsVector(); } - inline std::tuple,std::vector,std::vector> getSizesSubsizesStartsOfSubtensor() const { - std::vector csizes(this->getGlobalSizes().begin(), this->getGlobalSizes().end()); - std::vector csubsizes(this->getLocalSizes().begin(), this->getLocalSizes().end()); - std::vector cstarts(this->getLowerBounds().begin(), this->getLowerBounds().end()); + inline const IndexVector& getLocalOffsets() const { return localTensor_.getOffsetsVector(); } - return std::make_tuple(csizes, csubsizes, cstarts); - } + inline std::tuple, std::vector, std::vector> + getSizesSubsizesStartsOfSubtensor() const; /** return the level vector */ inline const LevelVector& getLevels() const { return levels_; } @@ -473,54 +154,25 @@ std::vector getInterpolatedValues( /** returns the grid spacing (sometimes called h) */ inline const std::vector& getGridSpacing() const { return gridSpacing_; } - inline double getInverseGridSpacingIn(DimType inDimension) const { - return powerOfTwo[levels_[inDimension]]; - } + inline double getInverseGridSpacingIn(DimType inDimension) const; - inline std::vector getInverseGridSpacing() const { - std::vector oneOverH; - oneOverH.resize(gridSpacing_.size()); - // should be the same as the number of intervals (N) - for (DimType j = 0; j < dim_; j++) { - oneOverH[j] = powerOfTwo[levels_[j]]; - } -#ifndef NDEBUG - std::vector oneOverHByDivision; - oneOverHByDivision.resize(gridSpacing_.size()); - std::transform(gridSpacing_.begin(), gridSpacing_.end(), oneOverHByDivision.begin(), - std::bind(std::divides(), 1, std::placeholders::_1)); - for (DimType j = 0; j < dim_; j++) { - assert(std::abs(oneOverHByDivision[j] - oneOverH[j]) < 1e-10); - assert(oneOverHByDivision[j] == oneOverH[j]); - } -#endif - return oneOverH; - } + inline std::vector getInverseGridSpacing() const; - double getInnerNodalBasisFunctionIntegral() const { - const auto& h = this->getGridSpacing(); - return std::accumulate(h.begin(), h.end(), 1., 
std::multiplies()); - } + double getInnerNodalBasisFunctionIntegral() const; /** returns the number of elements in the entire, global full grid */ - inline IndexType getNrElements() const { - return globalIndexer_.size(); - } + inline IndexType getNrElements() const { return static_cast(globalIndexer_.size()); } /** number of elements in the local partition */ - inline IndexType getNrLocalElements() const { - return localTensor_.size(); - } + inline IndexType getNrLocalElements() const { return static_cast(localTensor_.size()); } /** number of points per dimension i */ - inline IndexType length(DimType i) const { return this->getGlobalSizes()[i]; } + inline IndexType globalNumPointsInDimension(DimType i) const { return this->getGlobalSizes()[i]; } /** vector of flags to show if the dimension has boundary points*/ inline const std::vector& returnBoundaryFlags() const { return hasBoundaryPoints_; } - inline void setZero() { - std::memset(this->getData(), 0, this->getNrLocalElements() * sizeof(FG_ELEMENT)); - } + inline void setZero(); inline FG_ELEMENT* getData() { return localTensor_.getData(); } @@ -534,109 +186,34 @@ std::vector getInterpolatedValues( inline int getCommunicatorSize() const { return this->getCartesianUtils().getCommunicatorSize(); } /** lower Bounds of this process */ - inline const IndexVector& getLowerBounds() const { - // return getLowerBounds(this->getRank()); - return myPartitionsLowerBounds_; - } + inline const IndexVector& getLowerBounds() const; /** lower bounds of rank r */ - inline IndexVector getLowerBounds(RankType r) const { - assert(r >= 0 && r < this->getCommunicatorSize()); - // get coords of r in cart comm - IndexVector lowerBounds(dim_); - const auto& coords = cartesianUtils_.getPartitionCoordsOfRank(r); - - for (DimType i = 0; i < dim_; ++i) { - lowerBounds[i] = getDecomposition()[i][coords[i]]; - } - return lowerBounds; - } + inline IndexVector getLowerBounds(RankType r) const; /** coordinates of this process' lower bounds in some dimension*/ - inline real getLowerBoundsCoord(DimType inDimension) const { - return static_cast(this->getLowerBounds()[inDimension] + - (hasBoundaryPoints_[inDimension] > 0 ? 
0 : 1)) * - this->getGridSpacing()[inDimension]; - } + inline real getLowerBoundsCoord(DimType inDimension) const; /** upper Bounds of this process */ - inline const IndexVector& getUpperBounds() const { - // return getUpperBounds(this->getRank()); - return myPartitionsUpperBounds_; - } + inline const IndexVector& getUpperBounds() const; /** upper bounds of rank r */ - inline IndexVector getUpperBounds(RankType r) const { - assert(r >= 0 && r < this->getCommunicatorSize()); - IndexVector upperBounds(dim_); - const auto& coords = cartesianUtils_.getPartitionCoordsOfRank(r); - - for (DimType i = 0; i < dim_; ++i) { - RankType n; - std::vector nc(coords); - - if (nc[i] < this->getCartesianUtils().getCartesianDimensions()[i] - 1) { - // get rank of next neighbor in dim i - nc[i] += 1; - n = this->getCartesianUtils().getRankFromPartitionCoords(nc); - upperBounds[i] = getLowerBounds(n)[i]; - } else { - // no neighbor in dim i -> end of domain - upperBounds[i] = this->getGlobalSizes()[i]; - } - } - return upperBounds; - } + inline IndexVector getUpperBounds(RankType r) const; /** coordinates of this process' upper bounds */ - inline std::vector getUpperBoundsCoords() const { - return getUpperBoundsCoords(this->getRank()); - } + inline std::vector getUpperBoundsCoords() const; /** coordinates of rank r' upper bounds */ - inline std::vector getUpperBoundsCoords(RankType r) const { - assert(r >= 0 && r < this->getCommunicatorSize()); - - /* the upper bound index vector can correspond to coordinates outside of - * the domain, thus we cannot use the getGlobalLinearIndex method here. - */ - const IndexVector& ubvi = getUpperBounds(r); - std::vector coords(dim_); - IndexType tmp_add = 0; - - for (DimType j = 0; j < dim_; ++j) { - tmp_add = (hasBoundaryPoints_[j] > 0) ? 
(0) : (1); - coords[j] = static_cast(ubvi[j] + tmp_add) * getGridSpacing()[j]; - } - return coords; - } + inline std::vector getUpperBoundsCoords(RankType r) const; /* returns the neighboring process (in the sense that the neighbor has the same * partion coordinates in all other dimensions than d) in dimension d which * contains the point with the one-dimensional index idx1d */ - RankType getNeighbor1dFromAxisIndex(DimType dim, IndexType idx1d) const { - // if global index is outside of domain return negative value - { - if (idx1d < 0) return -1; - if (idx1d > this->getGlobalSizes()[dim] - 1) return -1; - } - const auto& decomp1d = this->getDecomposition()[dim]; - auto lower = std::lower_bound(decomp1d.begin(), decomp1d.end(), idx1d + 1); - int partitionIdx1d = static_cast(std::distance(decomp1d.begin(), lower)) - 1; - RankType r = this->getCartesianUtils().getNeighbor1dFromPartitionIndex(dim, partitionIdx1d); - - // check if global index vector is actually contained in the domain of rank r - assert(idx1d >= this->getLowerBounds(r)[dim]); - assert(idx1d < this->getUpperBounds(r)[dim]); - assert(r < this->getCommunicatorSize()); - return r; - } + RankType getNeighbor1dFromAxisIndex(DimType dim, IndexType idx1d) const; /** Number of Grids in every dimension*/ - inline const std::vector& getParallelization() const { - return this->getCartesianUtils().getCartesianDimensions(); - } + inline const std::vector& getParallelization() const; /** returns the 1d global index of the first point in the local domain * @@ -649,183 +226,36 @@ std::vector getInterpolatedValues( inline IndexType getLastGlobal1dIndex(DimType d) const { return getUpperBounds()[d] - 1; } // returns level of a global 1d index - inline LevelType getLevel(DimType d, IndexType idx1d) const { - // IndexVector givec(dim_, 0); - // givec[d] = idx1d; - // IndexType idx = getGlobalLinearIndex(givec); - // LevelVector levels(dim_); - // IndexVector tmp(dim_); - // getGlobalLI(idx, levels, tmp); - // return levels[d]; - - if (this->hasBoundaryPoints_[d] == 0) { - ++idx1d; - } - // get level of idx1d by rightmost set bit - // (e.g., all points on the finest level already have a 1 at the rightmost bit) - if (idx1d == 0) { - return 0; - } - const LevelType lmax = this->getLevels()[d]; - // auto set_bit = idx1d & ~(idx1d-1); - // LevelType l = lmax - log2(set_bit); - // builtin is fast and should work with gcc and clang - // if it is not available, use the one above (at a slight performance hit) - // or c++20's std::countr_zero - LevelType l; - if constexpr(sizeof(IndexType) == sizeof(long)) { - l = static_cast(lmax - __builtin_ctzl(static_cast(idx1d))); - } else if constexpr(sizeof(IndexType) == sizeof(long long)){ - l = static_cast(lmax - __builtin_ctzll(static_cast(idx1d))); - } else { - l = static_cast(lmax - __builtin_ctz(static_cast(idx1d))); - } - return l; - } + inline LevelType getLevel(DimType d, IndexType idx1d) const; // get 1d index of LeftPredecessor of a point // returns negative number if point has no left predecessor - inline IndexType getLeftPredecessor(DimType d, IndexType idx1d) const { - LevelType l = getLevel(d, idx1d); - - // boundary points - if (l == 0) return -1; - - LevelType ldiff = static_cast(levels_[d] - l); - IndexType lpidx = idx1d - combigrid::powerOfTwoByBitshift(ldiff); - - return lpidx; - } - - inline IndexType getRightPredecessor(DimType d, IndexType idx1d) const { - LevelType l = getLevel(d, idx1d); - - // boundary points - if (l == 0) return -1; - - LevelType ldiff = static_cast(levels_[d] - l); - 
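
getLevel reads the hierarchical level of a 1-D index off the position of its lowest set bit: on a grid of maximum level lmax (boundary included), an index divisible by 2^k but not by 2^(k+1) lies on level lmax - k, and boundary points come out as level 0. A minimal standalone sketch of that arithmetic and of the predecessor offset 2^(lmax - l), using C++20 std::countr_zero in place of the compiler builtins and hypothetical helper names:

#include <bit>
#include <cassert>
#include <cstdint>

// level of a 1-D index on a grid with maximum level lmax (boundary included);
// index 0 and the last index (2^lmax) are boundary points and get level 0
int levelOf1dIndex(std::int64_t idx, int lmax) {
  if (idx == 0) return 0;
  return lmax - static_cast<int>(std::countr_zero(static_cast<std::uint64_t>(idx)));
}

// the left hierarchical predecessor lies 2^(lmax - l) indices away;
// boundary points (level 0) have no predecessor
std::int64_t leftPredecessor(std::int64_t idx, int lmax) {
  const int l = levelOf1dIndex(idx, lmax);
  if (l == 0) return -1;
  return idx - (std::int64_t{1} << (lmax - l));
}

int main() {
  // level-3 grid with boundary: indices 0..8
  assert(levelOf1dIndex(4, 3) == 1);   // midpoint
  assert(levelOf1dIndex(2, 3) == 2);
  assert(levelOf1dIndex(5, 3) == 3);   // odd indices are on the finest level
  assert(leftPredecessor(5, 3) == 4);  // distance 2^(3-3) = 1
  assert(leftPredecessor(4, 3) == 0);  // distance 2^(3-1) = 4
  return 0;
}
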
IndexType rpidx = idx1d + combigrid::powerOfTwoByBitshift(ldiff); + inline IndexType getLeftPredecessor(DimType d, IndexType idx1d) const; - // check if outside of domain - IndexType numElementsD = this->getGlobalSizes()[d]; - - // in case of periodic, "virtual" domain is larger by one - bool oneSidedBoundary = this->returnBoundaryFlags()[d] == 1; - if (oneSidedBoundary && rpidx == numElementsD) return numElementsD; - - if (rpidx > numElementsD - 1) rpidx = -1; - - return rpidx; - } + inline IndexType getRightPredecessor(DimType d, IndexType idx1d) const; // get coordinates of the partition which contains the point specified // by the global index vector - inline void getPartitionCoords(IndexVector& globalAxisIndex, std::vector& partitionCoords) const { - partitionCoords.resize(dim_); - - for (DimType d = 0; d < dim_; ++d) { - partitionCoords[d] = -1; - for (int i = 0; i < this->getCartesianUtils().getCartesianDimensions()[d]; ++i) { - if (globalAxisIndex[d] >= getDecomposition()[d][i]) partitionCoords[d] = i; - } - - // check whether the partition coordinates are valid - assert(partitionCoords[d] > -1 && - partitionCoords[d] < this->getCartesianUtils().getCartesianDimensions()[d]); - } - } + inline void getPartitionCoords(IndexVector& globalAxisIndex, + std::vector& partitionCoords) const; - inline void print() const { - std::visit([&](auto&& arg) { print(arg); }, localTensor_); - } + inline void print() const; // return extents of local grid - inline const IndexVector& getLocalSizes() const { - return localTensor_.getExtentsVector(); - } + inline const IndexVector& getLocalSizes() const { return localTensor_.getExtentsVector(); } // return extents of global grid - inline const IndexVector& getGlobalSizes() const { - return globalIndexer_.getExtentsVector(); - } + inline const IndexVector& getGlobalSizes() const { return globalIndexer_.getExtentsVector(); } // return MPI DataType - inline MPI_Datatype getMPIDatatype() const { - return abstraction::getMPIDatatype(abstraction::getabstractionDataType()); - } + inline MPI_Datatype getMPIDatatype() const; // gather fullgrid on rank r - void gatherFullGrid(FullGrid& fg, RankType root) { - int size = this->getCommunicatorSize(); - int rank = this->getRank(); - CommunicatorType comm = this->getCommunicator(); - - // each rank: send dfg to root - int dest = root; - MPI_Request sendRequest; - MPI_Isend(this->getData(), static_cast(this->getNrLocalElements()), this->getMPIDatatype(), - dest, 0, comm, &sendRequest); - - std::vector requests; - std::vector subarrayTypes; - - // rank r: for each rank create subarray view on fg - if (rank == root) { - if (!fg.isGridCreated()) fg.createFullGrid(); - - for (int r = 0; r < size; ++r) { - IndexVector sizes(fg.getSizes().begin(), fg.getSizes().end()); - IndexVector subsizes = this->getUpperBounds(r) - this->getLowerBounds(r); - IndexVector starts = this->getLowerBounds(r); - - std::vector csizes(sizes.begin(), sizes.end()); - std::vector csubsizes(subsizes.begin(), subsizes.end()); - std::vector cstarts(starts.begin(), starts.end()); - - // create subarray view on data - MPI_Datatype mysubarray; - MPI_Type_create_subarray(static_cast(this->getDimension()), &csizes[0], &csubsizes[0], - &cstarts[0], MPI_ORDER_FORTRAN, this->getMPIDatatype(), - &mysubarray); - MPI_Type_commit(&mysubarray); - subarrayTypes.push_back(mysubarray); - - int src = r; - MPI_Request req; - MPI_Irecv(fg.getData(), 1, mysubarray, src, 0, this->getCommunicator(), &req); - requests.push_back(req); - } - } - - // all - 
MPI_Wait(&sendRequest, MPI_STATUS_IGNORE); - - if (rank == root) { - MPI_Waitall(static_cast(requests.size()), &requests[0], MPI_STATUSES_IGNORE); - } - - // free subarrays - for (size_t i = 0; i < subarrayTypes.size(); ++i) MPI_Type_free(&subarrayTypes[i]); - } + void gatherFullGrid(FullGrid& fg, RankType root); inline void getFGPointsOfSubspaceRecursive(DimType d, IndexType localLinearIndexSum, std::vector& oneDIndices, - std::vector& subspaceIndices) const { - assert(d < dim_); - assert(oneDIndices.size() == dim_); - assert(!oneDIndices.empty()); - - for (const auto idx : oneDIndices[d]) { - auto updatedLocalIndexSum = localLinearIndexSum; - updatedLocalIndexSum += this->getLocalOffsets()[d] * idx; - if (d > 0) { - getFGPointsOfSubspaceRecursive(static_cast(d - 1), updatedLocalIndexSum, - oneDIndices, subspaceIndices); - } else { - subspaceIndices.emplace_back(updatedLocalIndexSum); - } - } - } + std::vector& subspaceIndices) const; /** * @brief Get the indices of the points of the subspace on this partition @@ -833,28 +263,7 @@ std::vector getInterpolatedValues( * @param l level of hierarchical subspace * @return the indices of points on this partition */ - inline std::vector getFGPointsOfSubspace(const LevelVector& l) const { - IndexVector subspaceIndices; - IndexType numPointsOfSubspace = 1; - static thread_local std::vector oneDIndices; - oneDIndices.resize(dim_); - for (DimType d = 0; d < dim_; ++d) { - if (l[d] > levels_[d]) { - return subspaceIndices; - } - get1dIndicesLocal(d, l[d], oneDIndices[d]); - numPointsOfSubspace *= oneDIndices[d].size(); - } - if (numPointsOfSubspace > 0) { - subspaceIndices.reserve(numPointsOfSubspace); - - IndexType localLinearIndexSum = 0; - getFGPointsOfSubspaceRecursive(static_cast(dim_ - 1), localLinearIndexSum, - oneDIndices, subspaceIndices); - } - assert(static_cast(subspaceIndices.size()) == numPointsOfSubspace); - return subspaceIndices; - } + inline std::vector getFGPointsOfSubspace(const LevelVector& l) const; /** * @brief extracts the (hopefully) hierarchical coefficients from dsg @@ -863,99 +272,17 @@ std::vector getInterpolatedValues( * @param dsg the DSG to extract from */ template - size_t extractFromUniformSG(const DistributedSparseGridUniform& dsg) { - assert(dsg.isSubspaceDataCreated()); + size_t extractFromUniformSG(const DistributedSparseGridUniform& dsg); - // all the hierarchical subspaces contained in this full grid - const auto downwardClosedSet = combigrid::getDownSet(levels_); - - // loop over all subspaces (-> somewhat linear access in the sg) - size_t numCopied = 0; - static thread_local IndexVector subspaceIndices; - typename AnyDistributedSparseGrid::SubspaceIndexType sIndex = 0; -#pragma omp parallel for shared(dsg, downwardClosedSet) default(none) schedule(guided) \ - firstprivate(sIndex) reduction(+ : numCopied) - for (const auto& level : downwardClosedSet) { - sIndex = dsg.getIndexInRange(level, sIndex); - bool shouldBeCopied = sIndex > -1 && dsg.getDataSize(sIndex) > 0; - if constexpr (!sparseGridFullyAllocated) { - shouldBeCopied = shouldBeCopied && dsg.isSubspaceCurrentlyAllocated(sIndex); - } - if (shouldBeCopied) { - auto sPointer = dsg.getData(sIndex); - subspaceIndices = std::move(this->getFGPointsOfSubspace(level)); - // #pragma omp simd linear(sPointer : 1) // no simd benefit - for (size_t fIndex = 0; fIndex < subspaceIndices.size(); ++fIndex) { - this->getData()[subspaceIndices[fIndex]] = *sPointer; - ++sPointer; - } - assert(dsg.getDataSize(sIndex) == subspaceIndices.size()); - numCopied += 
subspaceIndices.size(); - } - } - return numCopied; - } - - inline IndexType getStrideForThisLevel(LevelType l, DimType d) const { - assert(d < this->getDimension()); - // special treatment for level 1 suspaces with boundary - return (l == 1 && hasBoundaryPoints_[d] > 0) - ? combigrid::powerOfTwoByBitshift(static_cast(levels_[d] - 1)) - : combigrid::powerOfTwoByBitshift(static_cast(levels_[d] - l + 1)); - } + inline IndexType getStrideForThisLevel(LevelType l, DimType d) const; inline IndexType getLocalStartForThisLevel(LevelType l, DimType d, - IndexType strideForThisLevel) const { - const auto firstGlobal1dIdx = getFirstGlobal1dIndex(d); - - // get global offset to find indices of this level - // this is the first global index that has level l in dimension d - IndexType offsetForThisLevel; - if (hasBoundaryPoints_[d] > 0) { - if (l == 1) { - offsetForThisLevel = 0; - } else { - // offsetForThisLevel = strideForThisLevel / 2; - offsetForThisLevel = - combigrid::powerOfTwoByBitshift(static_cast(levels_[d] - l)); - } - } else { - // offsetForThisLevel = strideForThisLevel / 2 - 1; - offsetForThisLevel = - combigrid::powerOfTwoByBitshift(static_cast(levels_[d] - l)) - 1; - } - assert(offsetForThisLevel > -1); - assert(strideForThisLevel >= offsetForThisLevel); - - // first level-index that is on our partition - const IndexType firstGlobalIndexOnThisPartition = - (firstGlobal1dIdx < offsetForThisLevel) - ? 0 - : (firstGlobal1dIdx - offsetForThisLevel + strideForThisLevel - 1) / strideForThisLevel; - // get global index of first local index which has level l - const IndexType globalStart = - (firstGlobalIndexOnThisPartition)*strideForThisLevel + offsetForThisLevel; - assert(getLevel(d, globalStart) == l || - (getLevel(d, globalStart) == 0 && hasBoundaryPoints_[d] > 0 && l == 1)); - assert(globalStart >= firstGlobal1dIdx); - - return globalStart - firstGlobal1dIdx; - } + IndexType strideForThisLevel) const; inline IndexType getNumPointsOnThisPartition(DimType d, IndexType localStart, - IndexType strideForThisLevel) const { - assert(d < this->getDimension()); - return (this->getLocalSizes()[d] - 1 < localStart) - ? 
0 - : (this->getLocalSizes()[d] - 1 - localStart) / strideForThisLevel + 1; - } + IndexType strideForThisLevel) const; - inline IndexType getNumPointsOnThisPartition(LevelType l, DimType d) const { - assert(!(l > levels_[d])); - const auto strideForThisLevel = getStrideForThisLevel(l, d); - return getNumPointsOnThisPartition(d, getLocalStartForThisLevel(l, d, strideForThisLevel), - strideForThisLevel); - } + inline IndexType getNumPointsOnThisPartition(LevelType l, DimType d) const; /** * @brief get the local 1d indices that correspond to a given level @@ -964,26 +291,7 @@ std::vector getInterpolatedValues( * @param l the level * @param oneDIndices a list of the local vector indices */ - inline void get1dIndicesLocal(DimType d, LevelType l, IndexVector& oneDIndices) const { - assert(l > 0); - assert(d < this->getDimension()); - if (l > levels_[d]) { - oneDIndices.clear(); - return; - } - - const IndexType strideForThisLevel = getStrideForThisLevel(l, d); - IndexType localStart = getLocalStartForThisLevel(l, d, strideForThisLevel); - const IndexType numPointsOnThisPartition = - getNumPointsOnThisPartition(d, localStart, strideForThisLevel); - oneDIndices.resize(numPointsOnThisPartition); - - std::generate(oneDIndices.begin(), oneDIndices.end(), [&localStart, &strideForThisLevel]() { - auto localStartBefore = localStart; - localStart += strideForThisLevel; - return localStartBefore; - }); - } + inline void get1dIndicesLocal(DimType d, LevelType l, IndexVector& oneDIndices) const; /** * @brief Get the Lp Norm of the data on the dfg: @@ -992,459 +300,63 @@ std::vector getInterpolatedValues( * @param p : the (polynomial) parameter, 0 is interpreted as maximum norm * @return real : the norm */ - real getLpNorm(int p) const { - assert(p >= 0); - // special case maximum norm - MPI_Datatype dtype = abstraction::getMPIDatatype(abstraction::getabstractionDataType()); - if (p == 0) { - real max = 0.0; - auto data = this->getData(); -#pragma omp parallel for reduction(max : max) default(none) shared(data) schedule(static) - for (size_t i = 0; i < this->getNrLocalElements(); ++i) { - max = std::max(max, std::abs(data[i])); - } - real globalMax(-1); - MPI_Allreduce(&max, &globalMax, 1, dtype, MPI_MAX, getCommunicator()); - return globalMax; - } else { - real p_f = static_cast(p); - auto data = this->getData(); - real res = 0.0; - -#pragma omp parallel for reduction(+ : res) default(none) shared(data, oneOverPowOfTwo) \ - firstprivate(p_f) schedule(static) - for (size_t i = 0; i < this->getNrLocalElements(); ++i) { - auto isOnBoundary = this->isLocalLinearIndexOnBoundary(i); - auto countBoundary = std::count(isOnBoundary.begin(), isOnBoundary.end(), true); - double hatFcnIntegral = oneOverPowOfTwo[countBoundary]; - // std::cout << hatFcnIntegral << std::endl; - // sumIntegral += hatFcnIntegral; - real abs = std::abs(data[i]); - res += std::pow(abs, p_f) * hatFcnIntegral; - } - // std::cout << " sum " << sumIntegral << std::endl; - real globalRes(0.); - MPI_Allreduce(&res, &globalRes, 1, dtype, MPI_SUM, getCommunicator()); - // TODO this is only correct for 1 norm - // other norms have mixed terms and need additional boundary communication - return std::pow(globalRes * std::pow(this->getInnerNodalBasisFunctionIntegral(), p_f), - 1.0 / p_f); - } - } + real getLpNorm(int p) const; // write data to file using MPI-IO - void writePlotFile(const char* filename) const { - auto dim = getDimension(); - - // create subarray data type - auto [csizes, csubsizes, cstarts] = this->getSizesSubsizesStartsOfSubtensor(); - 
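
getLpNorm above follows a two-stage pattern: an OpenMP reduction over the local data, then an MPI_Allreduce over the process grid (MPI_MAX for the maximum norm, MPI_SUM for p >= 1, where each value is additionally weighted by the nodal basis integral, halved once per boundary direction). A minimal sketch of the maximum-norm case, with hypothetical names and a plain double vector standing in for the dfg data:

#include <mpi.h>
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// maximum norm of data distributed over the ranks of comm:
// thread-local OpenMP reduction followed by a global MPI_MAX reduction
double distributedMaxNorm(const std::vector<double>& localData, MPI_Comm comm) {
  double localMax = 0.0;
#pragma omp parallel for reduction(max : localMax) schedule(static)
  for (std::size_t i = 0; i < localData.size(); ++i) {
    localMax = std::max(localMax, std::abs(localData[i]));
  }
  double globalMax = 0.0;
  MPI_Allreduce(&localMax, &globalMax, 1, MPI_DOUBLE, MPI_MAX, comm);
  return globalMax;
}

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  std::vector<double> local{-3.0, 2.5};  // this rank's share of the data
  const double norm = distributedMaxNorm(local, MPI_COMM_WORLD);
  MPI_Finalize();
  return norm > 0.0 ? 0 : 1;
}
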
- // create subarray view on data - MPI_Datatype mysubarray; - MPI_Type_create_subarray(static_cast(getDimension()), &csizes[0], &csubsizes[0], - &cstarts[0], MPI_ORDER_FORTRAN, getMPIDatatype(), &mysubarray); - MPI_Type_commit(&mysubarray); - - // open file - MPI_File fh; - MPI_File_open(getCommunicator(), filename, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, - &fh); - - MPI_Offset offset = 0; - - // .raw files can be read by paraview, in that case write the header separately - std::string fn = filename; - if (fn.find(".raw") != std::string::npos) { - // rank 0 write human-readable header - if (this->getRank() == 0) { - auto headername = fn + "_header"; - std::ofstream ofs(headername); - - // first line: dimension - ofs << "dimensionality " << getDimension() << std::endl; - - // grid points per dimension in the order - // x_0, x_1, ... , x_d - ofs << "Extents "; - for (auto s : csizes) { - ofs << s << " "; - } - ofs << std::endl; - - // data type - ofs << "Data type size " << sizeof(FG_ELEMENT) << std::endl; - // TODO (pollinta) write endianness and data spacing - } - } else { - // rank 0 write dim and resolution (and data format?) - if (this->getRank() == 0) { - MPI_File_write(fh, &dim, 1, MPI_UNSIGNED_CHAR, MPI_STATUS_IGNORE); - - std::vector res(csizes.begin(), csizes.end()); - MPI_File_write(fh, &res[0], 6, MPI_INT, MPI_STATUS_IGNORE); - } - - // set file view to right offset (in bytes) - offset = (1 + dim) * sizeof(int); - } - - MPI_File_set_view(fh, offset, getMPIDatatype(), mysubarray, "native", MPI_INFO_NULL); - - // write subarray - MPI_File_write_all(fh, getData(), static_cast(getNrLocalElements()), getMPIDatatype(), MPI_STATUS_IGNORE); - // close file - MPI_File_close(&fh); - MPI_Type_free(&mysubarray); - } + void writePlotFile(const char* filename) const; // write data to legacy-type VTK file using MPI-IO - void writePlotFileVTK(const char* filename) const { - auto dim = getDimension(); - assert(dim < 4); // vtk supports only up to 3D - - // create subarray data type - auto [csizes, csubsizes, cstarts] = this->getSizesSubsizesStartsOfSubtensor(); - - // create subarray view on data - MPI_Datatype mysubarray; - MPI_Type_create_subarray(static_cast(getDimension()), &csizes[0], &csubsizes[0], - &cstarts[0], MPI_ORDER_FORTRAN, getMPIDatatype(), &mysubarray); - MPI_Type_commit(&mysubarray); - - // open file - MPI_File fh; - MPI_File_open(getCommunicator(), filename, MPI_MODE_WRONLY + MPI_MODE_CREATE, MPI_INFO_NULL, - &fh); - - std::stringstream vtk_header; - // cf https://lorensen.github.io/VTKExamples/site/VTKFileFormats/ => structured points - vtk_header << "# vtk DataFile Version 2.0.\n" - << "This file contains the combination solution evaluated on a full grid\n" - << "BINARY\n" - << "DATASET STRUCTURED_POINTS\n"; - if (dim == 3) { // TODO change for non-boundary grids using getGridSpacing - vtk_header << "DIMENSIONS " << csizes[0] << " " << csizes[1] << " " << csizes[2] << "\n" - << "ORIGIN 0 0 0\n" - << "SPACING " << 1. / static_cast(csizes[0] - 1) << " " - << 1. / static_cast(csizes[1] - 1) << " " - << 1. / static_cast(csizes[2] - 1) << "\n"; - } else if (dim == 2) { - vtk_header << "DIMENSIONS " << csizes[0] << " " << csizes[1] << " 1\n" - << "ORIGIN 0 0 0\n" - << "SPACING " << 1. / static_cast(csizes[0] - 1) << " " - << 1. / static_cast(csizes[1] - 1) << " 1\n"; - } else if (dim == 1) { - vtk_header << "DIMENSIONS " << csizes[0] << " 1 1\n" - << "ORIGIN 0 0 0\n" - << "SPACING " << 1. 
/ static_cast(csizes[0] - 1) << " 1 1\n"; - } else { - assert(false); - } - vtk_header << "POINT_DATA " - << std::accumulate(csizes.begin(), csizes.end(), 1, std::multiplies()) - << "\n" - << "SCALARS quantity double 1\n" - << "LOOKUP_TABLE default\n"; - // TODO set the right data type from combidatatype, for now double by default - bool rightDataType = std::is_same::value; - assert(rightDataType); - auto header_string = vtk_header.str(); - auto header_size = header_string.size(); - - // rank 0 write header - if (this->getRank() == 0) { - MPI_File_write(fh, header_string.data(), static_cast(header_size), MPI_CHAR, - MPI_STATUS_IGNORE); - } - - // set file view to right offset (in bytes) - MPI_Offset offset = header_size * sizeof(char); - // external32 not supported in OpenMPI < 5. -> writes "native" endianness - // might work with MPICH - MPI_File_set_view(fh, offset, getMPIDatatype(), mysubarray, "external32", MPI_INFO_NULL); - - // write subarray - MPI_File_write_all(fh, getData(), static_cast(getNrLocalElements()), getMPIDatatype(), MPI_STATUS_IGNORE); - - // close file - MPI_File_close(&fh); - MPI_Type_free(&mysubarray); - } + void writePlotFileVTK(const char* filename) const; const std::vector& getDecomposition() const { return decomposition_; } - MPI_Datatype getUpwardSubarray(DimType d) { - // do index calculations - // set lower bounds of subarray - auto subarrayLowerBounds = this->getLowerBounds(); - auto subarrayUpperBounds = this->getUpperBounds(); - subarrayLowerBounds[d] += this->getLocalSizes()[d] - 1; - - auto subarrayExtents = subarrayUpperBounds - subarrayLowerBounds; - assert(subarrayExtents[d] == 1); - auto subarrayStarts = subarrayLowerBounds - this->getLowerBounds(); - - // create MPI datatype - std::vector sizes(this->getLocalSizes().begin(), this->getLocalSizes().end()); - std::vector subsizes(subarrayExtents.begin(), subarrayExtents.end()); - // the starts are local indices - std::vector starts(subarrayStarts.begin(), subarrayStarts.end()); - - // create subarray view on data - MPI_Datatype mysubarray; - MPI_Type_create_subarray(static_cast(this->getDimension()), sizes.data(), subsizes.data(), - starts.data(), MPI_ORDER_FORTRAN, this->getMPIDatatype(), &mysubarray); - MPI_Type_commit(&mysubarray); - return mysubarray; - } + MPI_Datatype getUpwardSubarray(DimType d); - std::vector getUpwardSubarrays() { - // initialize upwardSubarrays_ only once - if (upwardSubarrays_.size() == 0) { - upwardSubarrays_.resize(this->getDimension()); - for (DimType d = 0; d < this->getDimension(); ++d) { - upwardSubarrays_[d] = getUpwardSubarray(d); - } - } - return upwardSubarrays_; - } + std::vector getUpwardSubarrays(); - MPI_Datatype getDownwardSubarray(DimType d) { - // do index calculations - // set upper bounds of subarray - IndexVector subarrayLowerBounds = this->getLowerBounds(); - IndexVector subarrayUpperBounds = this->getUpperBounds(); - subarrayUpperBounds[d] -= this->getLocalSizes()[d] - 1; - - auto subarrayExtents = subarrayUpperBounds - subarrayLowerBounds; - assert(subarrayExtents[d] == 1); - auto subarrayStarts = subarrayLowerBounds - this->getLowerBounds(); - - // create MPI datatype - // also, the data dimensions are reversed - std::vector sizes(this->getLocalSizes().begin(), this->getLocalSizes().end()); - std::vector subsizes(subarrayExtents.begin(), subarrayExtents.end()); - // the starts are local indices - std::vector starts(subarrayStarts.begin(), subarrayStarts.end()); - - // create subarray view on data - MPI_Datatype mysubarray; - 
MPI_Type_create_subarray(static_cast(this->getDimension()), sizes.data(), subsizes.data(), - starts.data(), MPI_ORDER_FORTRAN, this->getMPIDatatype(), &mysubarray); - MPI_Type_commit(&mysubarray); - return mysubarray; - } + MPI_Datatype getDownwardSubarray(DimType d); + + std::vector getDownwardSubarrays(); - std::vector getDownwardSubarrays() { - // initialize downwardSubarrays_ only once - if (downwardSubarrays_.size() == 0) { - downwardSubarrays_.resize(this->getDimension()); - for (DimType d = 0; d < this->getDimension(); ++d) { - downwardSubarrays_[d] = getDownwardSubarray(d); - } - } - return downwardSubarrays_; - } /** * @brief get the ranks of the highest and lowest "neighbor" rank in dimension d * only sets highest and lowest if they actually are my neighbors */ - void getHighestAndLowestNeighbor(DimType d, int& highest, int& lowest) { - //TODO this is not going to work with periodic cartesian communicator - lowest = MPI_PROC_NULL; - highest = MPI_PROC_NULL; - - auto d_reverse = this->getDimension() - d - 1; - if (!reverseOrderingDFGPartitions) { - d_reverse = d; - } - - MPI_Cart_shift(this->getCommunicator(), static_cast(d_reverse), - static_cast(getParallelization()[d] - 1), &lowest, &highest); - - // assert only boundaries have those neighbors (remove in case of periodicity) - // this assumes no periodicity! - if (!this->getLowerBounds()[d] == 0) { - assert(highest < 0); - } - if (!this->getUpperBounds()[d] == this->getGlobalSizes()[d]) { - assert(lowest < 0); - } - } - - void writeLowerBoundaryToUpperBoundary(DimType d) { - assert(hasBoundaryPoints_[d] == 2); - - // create MPI datatypes - auto downSubarray = getDownwardSubarray(d); - auto upSubarray = getUpwardSubarray(d); + void getHighestAndLowestNeighbor(DimType d, int& highest, int& lowest) const; - // if I have the highest neighbor (i. e. I am the lowest rank), I need to send my lowest layer in d to them, - // if I have the lowest neighbor (i. e. I am the highest rank), I can receive it - int lower, higher; - getHighestAndLowestNeighbor(d, higher, lower); - - // TODO asynchronous over d?? - auto success = MPI_Sendrecv(this->getData(), 1, downSubarray, higher, TRANSFER_GHOST_LAYER_TAG, - this->getData(), 1, upSubarray, lower, TRANSFER_GHOST_LAYER_TAG, - this->getCommunicator(), MPI_STATUS_IGNORE); - assert(success == MPI_SUCCESS); - MPI_Type_free(&downSubarray); - MPI_Type_free(&upSubarray); - } + void writeLowerBoundaryToUpperBoundary(DimType d); // non-RVO dependent version of ghost layer exchange void exchangeGhostLayerUpward(DimType d, std::vector& subarrayExtents, std::vector& recvbuffer, - MPI_Request* recvRequest = nullptr) { - subarrayExtents.resize(this->getDimension()); - subarrayExtents.assign(this->getLocalSizes().begin(), this->getLocalSizes().end()); - subarrayExtents[d] = 1; - - // create MPI datatype - auto subarray = getUpwardSubarray(d); - - // if I have a higher neighbor, I need to send my highest layer in d to them, - // if I have a lower neighbor, I can receive it - int lower = MPI_PROC_NULL; - int higher = MPI_PROC_NULL; - - // somehow the cartesian directions in the communicator are reversed - // cf InitMPI(...) 
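
The ghost-layer routines here combine MPI_Cart_shift (with the dimension index reversed when reverseOrderingDFGPartitions is set) with a send/receive of a one-layer-thick subarray; MPI_PROC_NULL neighbors turn the boundary ranks' halves of the exchange into no-ops. A stripped-down sketch of that pattern with hypothetical names, using a contiguous buffer instead of the subarray datatype of the real code:

#include <mpi.h>
#include <vector>

// send my highest layer "up" in cartesian dimension d and receive my lower
// neighbor's highest layer as a ghost layer; cartComm is assumed to be cartesian
void exchangeGhostLayerUp(MPI_Comm cartComm, int d, const std::vector<double>& sendLayer,
                          std::vector<double>& ghostLayer) {
  int lower = MPI_PROC_NULL;
  int higher = MPI_PROC_NULL;
  // rank one step "below" me (source) and one step "above" me (destination) in dimension d
  MPI_Cart_shift(cartComm, d, 1, &lower, &higher);

  ghostLayer.resize(sendLayer.size());
  // MPI_PROC_NULL neighbors make the respective half of the exchange a no-op,
  // so non-periodic domain boundaries need no special casing here
  MPI_Sendrecv(sendLayer.data(), static_cast<int>(sendLayer.size()), MPI_DOUBLE, higher, 0,
               ghostLayer.data(), static_cast<int>(ghostLayer.size()), MPI_DOUBLE, lower, 0,
               cartComm, MPI_STATUS_IGNORE);
}

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int nprocs = 1;
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  int dims[1] = {nprocs};
  int periods[1] = {0};  // non-periodic, like a grid with two-sided boundary
  MPI_Comm cartComm;
  MPI_Cart_create(MPI_COMM_WORLD, 1, dims, periods, 0, &cartComm);
  std::vector<double> myTopLayer{1.0, 2.0};
  std::vector<double> ghost;
  exchangeGhostLayerUp(cartComm, 0, myTopLayer, ghost);
  MPI_Comm_free(&cartComm);
  MPI_Finalize();
  return 0;
}
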
- auto d_reverse = this->getDimension() - d - 1; - if (!reverseOrderingDFGPartitions) { - d_reverse = d; - } - MPI_Cart_shift(this->getCommunicator(), static_cast(d_reverse), 1, &lower, &higher); - - if (this->returnBoundaryFlags()[d] == 1) { - // if one-sided boundary, assert that everyone has neighbors - assert(lower >= 0); - assert(higher >= 0); - } else if (this->returnBoundaryFlags()[d] == 2) { - // assert that boundaries have no neighbors - if (this->getLowerBounds()[d] == 0) { - assert(lower < 0); - } - if (this->getUpperBounds()[d] == this->getGlobalSizes()[d]) { - assert(higher < 0); - } - } - - // create recvbuffer - auto numElements = std::accumulate(subarrayExtents.begin(), subarrayExtents.end(), 1, - std::multiplies()); - if (lower < 0) { - numElements = 0; - std::memset(subarrayExtents.data(), 0, subarrayExtents.size() * sizeof(int)); - } - recvbuffer.resize(numElements); - std::memset(recvbuffer.data(), 0, recvbuffer.size() * sizeof(FG_ELEMENT)); - - if (recvRequest != nullptr) { - auto success = MPI_Irecv(recvbuffer.data(), numElements, this->getMPIDatatype(), lower, - TRANSFER_GHOST_LAYER_TAG, this->getCommunicator(), recvRequest); - assert(success == MPI_SUCCESS); - success = MPI_Send(this->getData(), 1, subarray, higher, TRANSFER_GHOST_LAYER_TAG, - this->getCommunicator()); - assert(success == MPI_SUCCESS); - } else { - auto success = - MPI_Sendrecv(this->getData(), 1, subarray, higher, TRANSFER_GHOST_LAYER_TAG, - recvbuffer.data(), numElements, this->getMPIDatatype(), lower, - TRANSFER_GHOST_LAYER_TAG, this->getCommunicator(), MPI_STATUS_IGNORE); - assert(success == MPI_SUCCESS); - } - MPI_Type_free(&subarray); - } + MPI_Request* recvRequest = nullptr); - std::vector exchangeGhostLayerUpward(DimType d, std::vector& subarrayExtents) { - std::vector recvbuffer{}; - exchangeGhostLayerUpward(d, subarrayExtents, recvbuffer); - return recvbuffer; - } + std::vector exchangeGhostLayerUpward(DimType d, std::vector& subarrayExtents); /** * @brief check if given globalLinearIndex is on the boundary of this DistributedFullGrid */ - std::vector isGlobalLinearIndexOnBoundary(IndexType globalLinearIndex) const { - // this could likely be done way more efficiently, but it's - // currently not in any performance critical spot - std::vector isOnBoundary(this->getDimension(), false); - - // convert to global vector index - IndexVector globalAxisIndex(dim_); - getGlobalVectorIndex(globalLinearIndex, globalAxisIndex); - for (DimType d = 0; d < this->getDimension(); ++d) { - if (this->returnBoundaryFlags()[d] == 2) { - if (globalAxisIndex[d] == 0 || globalAxisIndex[d] == this->length(d) - 1) { - isOnBoundary[d] = true; - } - } - } - return isOnBoundary; - } + std::vector isGlobalLinearIndexOnBoundary(IndexType globalLinearIndex) const; /** * @brief check if given localLinearIndex is on the boundary of this DistributedFullGrid */ - std::vector isLocalLinearIndexOnBoundary(IndexType localLinearIndex) const { - return isGlobalLinearIndexOnBoundary(getGlobalLinearIndex(localLinearIndex)); - } + std::vector isLocalLinearIndexOnBoundary(IndexType localLinearIndex) const; /** * @brief recursive helper function for getCornersGlobal*Indices - * (otherwise, it can't be dimension-independent) * */ std::vector getCornersGlobalVectorIndicesRecursive( - std::vector indicesSoFar, DimType dim) const { - if (dim < this->getDimension()) { - std::vector newIndicesSoFar{}; - for (const auto& indexVec : indicesSoFar) { - newIndicesSoFar.push_back(indexVec); - newIndicesSoFar.back().push_back(0); - 
newIndicesSoFar.push_back(indexVec); - newIndicesSoFar.back().push_back(this->length(dim) - 1); - } - assert(newIndicesSoFar.size() == 2 * indicesSoFar.size()); - return getCornersGlobalVectorIndicesRecursive(newIndicesSoFar, static_cast(dim + 1)); - } else { - return indicesSoFar; - } - } + std::vector indicesSoFar, DimType dim) const; /** * @brief get a vector containing the global vector indices of the 2^d corners of this dfg * */ - std::vector getCornersGlobalVectorIndices() const { - auto emptyVectorInVector = std::vector(1); - assert(emptyVectorInVector.size() == 1); - assert(emptyVectorInVector[0].size() == 0); - std::vector cornersVectors = - getCornersGlobalVectorIndicesRecursive(emptyVectorInVector, 0); - assert(cornersVectors.size() == static_cast(powerOfTwo[this->getDimension()])); - return cornersVectors; - } + std::vector getCornersGlobalVectorIndices() const; - std::vector getCornersValues() const { - std::vector values(powerOfTwo[this->getDimension()]); - auto corners = getCornersGlobalVectorIndices(); - for (size_t cornerNo = 0; cornerNo < corners.size(); ++cornerNo) { - if (this->isGlobalIndexHere(corners[cornerNo])) { - // convert to local vector index, then to linear index - IndexVector locAxisIndex(this->getDimension()); - bool present = getLocalVectorIndex(corners[cornerNo], locAxisIndex); - assert(present); - auto index = getLocalLinearIndex(locAxisIndex); - values[cornerNo] = this->getData()[index]; - } - } - MPI_Allreduce(MPI_IN_PLACE, values.data(), static_cast(values.size()), - this->getMPIDatatype(), MPI_SUM, this->getCommunicator()); - return values; - } + std::vector getCornersValues() const; const MPICartesianUtils& getCartesianUtils() const { return cartesianUtils_; } @@ -1452,6 +364,16 @@ std::vector getInterpolatedValues( inline void setData(FG_ELEMENT* newData) { return localTensor_.setData(newData); } private: + /** + * @brief sets the MPI-related member cartesianUtils_ + * + * @param comm the communicator to use (assumed to be cartesian) + * @param procs the desired partition (for sanity checking) + */ + void InitMPI(MPI_Comm comm, const std::vector& procs); + + void setDecomposition(const std::vector& decomposition); + /** dimension of the full grid */ const DimType dim_; @@ -1461,13 +383,13 @@ std::vector getInterpolatedValues( /** the grid spacing h for each dimension */ std::vector gridSpacing_; - //TODO: make these normal templates, and SomeDistributedFullGrid a std::variant ? + // TODO: make these normal templates, and SomeDistributedFullGrid a std::variant ? 
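
getCornersValues above uses an owner-writes / sum-reduce idiom: every rank fills only the corner entries it holds locally, leaves the rest at zero, and an in-place MPI_Allreduce with MPI_SUM gives all ranks the complete vector. The same idiom in isolation, with hypothetical names:

#include <mpi.h>
#include <cstddef>
#include <vector>

// every rank writes only the entries it owns into a zero-initialized vector;
// the in-place MPI_SUM reduction then gives all ranks the complete result
// (assumes each entry has exactly one owner)
std::vector<double> combineOwnedValues(std::size_t numEntries,
                                       const std::vector<std::size_t>& myIndices,
                                       const std::vector<double>& myValues, MPI_Comm comm) {
  std::vector<double> values(numEntries, 0.0);
  for (std::size_t i = 0; i < myIndices.size(); ++i) {
    values[myIndices[i]] = myValues[i];
  }
  MPI_Allreduce(MPI_IN_PLACE, values.data(), static_cast<int>(values.size()), MPI_DOUBLE, MPI_SUM,
                comm);
  return values;
}

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank = 0, nprocs = 1;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  // rank r owns entry r; after the reduction every rank holds all entries
  auto values = combineOwnedValues(static_cast<std::size_t>(nprocs),
                                   {static_cast<std::size_t>(rank)},
                                   {static_cast<double>(rank) + 1.0}, MPI_COMM_WORLD);
  MPI_Finalize();
  return values.empty() ? 1 : 0;
}
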
/** TensorIndexer , only populated for the used dimensionality**/ /** number of points per axis, boundary included*/ - TensorIndexer globalIndexer_ {}; + TensorIndexer globalIndexer_{}; // /** Tensor -- only populated for the used dimensionality**/ - Tensor localTensor_ {}; + Tensor localTensor_{}; /** flag to show if the dimension has boundary points*/ const std::vector hasBoundaryPoints_; @@ -1481,11 +403,11 @@ std::vector getInterpolatedValues( /** my partition's upper 1D bounds */ IndexVector myPartitionsUpperBounds_; - /** - * the decomposition of the full grid over processors - * contains (for every dimension) the grid point indices at - * which a process boundary is assumed - */ + /** + * the decomposition of the full grid over processors + * contains (for every dimension) the grid point indices at + * which a process boundary is assumed + */ std::vector decomposition_; /** utility to get info about cartesian communicator */ @@ -1494,100 +416,13 @@ std::vector getInterpolatedValues( // the MPI Datatypes representing the boundary layers of the MPI processes' subgrid std::vector downwardSubarrays_; std::vector upwardSubarrays_; - - /** - * @brief sets the MPI-related member cartesianUtils_ - * - * @param comm the communicator to use (assumed to be cartesian) - * @param procs the desired partition (for sanity checking) - */ - void InitMPI(MPI_Comm comm, const std::vector& procs) { - // check if communicator is already cartesian - int status; - MPI_Topo_test(comm, &status); - - if (status == MPI_CART) { - if (comm != cartesianUtils_.getComm()) { - assert(uniformDecomposition); - cartesianUtils_ = MPICartesianUtils(comm); - } - if (procs != cartesianUtils_.getCartesianDimensions()) { - std::cerr << "unpart" << std::endl; - throw std::runtime_error("The given communicator is not partitioned as desired"); - } - } else { - // MPI_Comm_dup(comm, &communicator_); - // cf. 
https://www.researchgate.net/publication/220439585_MPI_on_millions_of_cores - // "Figure 3 shows the memory consumption in all these cases after 32 calls to MPI Comm dup" - std::cerr << "undup" << std::endl; - throw std::runtime_error( - "Currently testing to not duplicate communicator (to save memory), \ - if you do want to use this code please take care that \ - MPI_Comm_free(&communicator_) will be called at some point"); - } - } - - void setDecomposition(const std::vector& decomposition) { -#ifndef NDEBUG - assert(decomposition.size() == dim_); - for (DimType i = 0; i < dim_; ++i) - assert(decomposition[i].size() == - static_cast(this->getCartesianUtils().getCartesianDimensions()[i])); - - // check if 1d bounds given in ascending order - // if not, this might indicate there's something wrong - for (DimType i = 0; i < dim_; ++i) { - IndexVector tmp(decomposition[i]); - std::sort(tmp.begin(), tmp.end()); - assert(tmp == decomposition[i]); - } - - // check if partitions size larger zero - // this can be detected by checking for duplicate entries in - // the 1d decompositions - for (DimType i = 0; i < dim_; ++i) { - IndexVector tmp(decomposition[i]); - IndexVector::iterator last = std::unique(tmp.begin(), tmp.end()); - - // todo: this is not sufficient to check for duplicate entries - // if last points to the same element as tmp.end() - // this means all entries in tmp are unique - assert(last == tmp.end() && "some partition in decomposition is of zero size!"); - } -#endif // not def NDEBUG - - decomposition_ = decomposition; - } }; -// end class - -template -MPICartesianUtils DistributedFullGrid::cartesianUtils_; - -// output operator -template -inline std::ostream& operator<<(std::ostream& os, const DistributedFullGrid& dfg) { - // os << dfg.getRank() << " " << dfg.getDimension() << " " << dfg.getLevels() << std::endl; - // os << "bounds " << dfg.getLowerBounds() << " to " << dfg.getUpperBounds() << " to " << dfg.getUpperBoundsCoords() << std::endl; - // os << "offsets " << dfg.getOffsets() << " " << dfg.getLocalOffsets() << std::endl; - // os << "sizes " << dfg.getLocalSizes() << "; " << dfg.getNrLocalElements() << " " << dfg.getNrLocalElements() << "; " << dfg.getNrElements() << std::endl; - // std::vector> decomposition = dfg.getDecomposition(); - // for (auto& dec : decomposition) { - // os << dec; - // } - // os << " decomposition; parallelization " << dfg.getParallelization() << std::endl; - // if(dfg.getNrLocalElements() < 30) - // os << "elements " << dfg.getElementVector() << std::endl; - - dfg.print(); - - return os; -} template class OwningDistributedFullGrid : public DistributedFullGrid { public: OwningDistributedFullGrid() = default; + explicit OwningDistributedFullGrid( DimType dim, const LevelVector& levels, CommunicatorType const& comm, const std::vector& hasBdrPoints, const std::vector& procs, @@ -1617,32 +452,1361 @@ class OwningDistributedFullGrid : public DistributedFullGrid { std::vector ownedDataVector_{}; }; -static inline std::vector downsampleDecomposition( - const std::vector decomposition, const LevelVector& referenceLevel, - const LevelVector& newLevel, const std::vector& boundary) { - auto newDecomposition = decomposition; - if (decomposition.size() > 0) { - for (DimType d = 0 ; d < static_cast(referenceLevel.size()); ++ d) { - // for now, assume that we never want to interpolate on a level finer than referenceLevel - assert(referenceLevel[d] >= newLevel[d]); - auto levelDiff = referenceLevel[d] - newLevel[d]; - auto stepFactor = oneOverPowOfTwo[levelDiff]; - 
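
downsampleDecomposition maps each 1-D partition bound from the reference level to a coarser level: with boundary points the index is simply scaled and rounded up, ceil(idx * 2^-(lref - lnew)); without boundary points the left protrusion of 2^(lref - lnew) - 1 finer points is subtracted first and the result clamped at zero. A small worked sketch of that per-index mapping (hypothetical function name, mirroring the arithmetic above):

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

// map a 1-D decomposition index from referenceLevel down to the coarser newLevel
std::int64_t downsampleIndex(std::int64_t idx, int referenceLevel, int newLevel,
                             bool hasBoundary) {
  assert(newLevel <= referenceLevel);
  const double stepFactor =
      1.0 / static_cast<double>(std::int64_t{1} << (referenceLevel - newLevel));
  if (hasBoundary) {
    // boundary points exist on every level -> index 0 maps to 0
    return static_cast<std::int64_t>(std::ceil(static_cast<double>(idx) * stepFactor));
  }
  // without boundary points, the finer level protrudes 2^(lref - lnew) - 1 points
  // to the left of the coarse level's first point
  const std::int64_t leftProtrusion = (std::int64_t{1} << (referenceLevel - newLevel)) - 1;
  return static_cast<std::int64_t>(
      std::max(0.0, std::ceil(static_cast<double>(idx - leftProtrusion) * stepFactor)));
}

int main() {
  // level 4 -> level 2 with boundary: index 8 (x = 0.5 of 0..16) maps to 2 (x = 0.5 of 0..4)
  assert(downsampleIndex(8, 4, 2, true) == 2);
  // level 3 -> level 2 without boundary: index 3 (x = 0.5) maps to 1 (x = 0.5 on the coarse grid)
  assert(downsampleIndex(3, 3, 2, false) == 1);
  return 0;
}
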
if(boundary[d] > 0) { - // all levels contain the boundary points -> point 0 is the same - for (auto& dec: newDecomposition[d]) { - dec = static_cast(std::ceil(static_cast(dec)*stepFactor)); - } +template +DistributedFullGrid::DistributedFullGrid(DimType dim, const LevelVector& levels, + CommunicatorType const& comm, + const std::vector& hasBdrPoints, + FG_ELEMENT* dataPointer, + const std::vector& procs, + bool forwardDecomposition, + const std::vector& decomposition) + : dim_(dim), levels_(levels), hasBoundaryPoints_(hasBdrPoints) { + assert(levels_.size() == dim); + assert(hasBoundaryPoints_.size() == dim); + assert(procs.size() == dim); + + InitMPI(comm, procs); + + IndexVector nrPoints(dim_); + gridSpacing_.resize(dim_); + + // set global num of elements and offsets + IndexType nrElements = 1; + for (DimType j = 0; j < dim_; j++) { + nrPoints[j] = combigrid::getNumDofNodal(levels_[j], hasBoundaryPoints_[j]); + nrElements = nrElements * nrPoints[j]; + if (hasBoundaryPoints_[j] == 1) { + assert(!decomposition.empty() || !forwardDecomposition); + } + assert(levels_[j] < 30); + gridSpacing_[j] = oneOverPowOfTwo[levels_[j]]; + } + + if (decomposition.size() == 0) { + setDecomposition(getDefaultDecomposition( + nrPoints, this->getCartesianUtils().getCartesianDimensions(), forwardDecomposition)); + } else { + setDecomposition(decomposition); + } + globalIndexer_ = TensorIndexer(std::move(nrPoints)); + myPartitionsLowerBounds_ = getLowerBounds(this->getRank()); + myPartitionsFirstGlobalIndex_ = globalIndexer_.sequentialIndex(myPartitionsLowerBounds_); + myPartitionsUpperBounds_ = getUpperBounds(this->getRank()); + + // set local elements and local offsets + auto nrLocalPoints = getUpperBounds() - getLowerBounds(); + localTensor_ = Tensor(dataPointer, std::move(nrLocalPoints)); +} + +template +DistributedFullGrid::~DistributedFullGrid() { + for (size_t i = 0; i < upwardSubarrays_.size(); ++i) { + MPI_Type_free(&upwardSubarrays_[i]); + } + for (size_t i = 0; i < downwardSubarrays_.size(); ++i) { + MPI_Type_free(&downwardSubarrays_[i]); + } +} + +template +double DistributedFullGrid::getPointDistanceToCoordinate( + IndexType oneDimensionalLocalIndex, double coord, DimType d) const { + const auto& h = getGridSpacing(); + auto coordDistance = + static_cast(this->getLowerBounds()[d] + (hasBoundaryPoints_[d] > 0 ? 
0 : 1) + + oneDimensionalLocalIndex) * + h[d] - + coord; +#ifndef NDEBUG + if (std::abs(coordDistance) > h[d]) { + std::cout << "assert bounds " << coordDistance << coord << h << static_cast(d) + << oneDimensionalLocalIndex << std::endl; + assert(false && + "should only be called for coordinates within the support of this point's basis " + "function"); + } +#endif // ndef NDEBUG + return std::abs(coordDistance); +} + +template +FG_ELEMENT DistributedFullGrid::evalIndexAndAllUpperNeighbors( + const IndexVector& localIndex, const std::vector& coords) const { + FG_ELEMENT result = 0.; + auto localLinearIndex = this->getLocalLinearIndex(localIndex); + for (size_t localIndexIterate = 0; + localIndexIterate < combigrid::powerOfTwoByBitshift(this->getDimension()); + ++localIndexIterate) { +#ifndef NDEBUG + auto neighborVectorIndex = localIndex; +#endif + auto neighborIndex = localLinearIndex; + real phi_c = 1.; // value of product of iterate's basis function on coords + for (DimType d = 0; d < dim_; ++d) { + const auto lastIndexInDim = this->getLocalSizes()[d] - 1; + bool isUpperInDim = std::bitset(localIndexIterate).test(d); + auto iterateIndexInThisDimension = localIndex[d] + isUpperInDim; + if (iterateIndexInThisDimension < 0) { + phi_c = 0.; + break; + } else if (isUpperInDim && this->hasBoundaryPoints_[d] == 1 && + // this->getCartesianUtils().isOnLowerBoundaryInDimension(d) && + iterateIndexInThisDimension == this->getGlobalSizes()[d]) { + // if we have a wrap-around AND are on the lowest cartesian process in this dimension + // we need to set the index in this dim to 0 and shift the coordinate by 1. + neighborIndex -= localIndex[d] * this->getLocalOffsets()[d]; + iterateIndexInThisDimension = 0; + auto coordDistance = + getPointDistanceToCoordinate(iterateIndexInThisDimension, coords[d] - 1.0, d); + auto oneOverHinD = powerOfTwo[levels_[d]]; + phi_c *= 1. - coordDistance * oneOverHinD; + } else if (iterateIndexInThisDimension > lastIndexInDim) { + phi_c = 0.; + break; } else { - // all levels do not contain the boundary points -> mid point is the same - auto leftProtrusion = powerOfTwo[levelDiff] - 1; - for (auto& dec: newDecomposition[d]) { - // same as before, but subtract the "left" protrusion on the finer level - dec = static_cast(std::max(0., std::ceil(static_cast(dec - leftProtrusion)*stepFactor))); - } + auto coordDistance = + getPointDistanceToCoordinate(iterateIndexInThisDimension, coords[d], d); + auto oneOverHinD = powerOfTwo[levels_[d]]; + phi_c *= 1. - coordDistance * oneOverHinD; + neighborIndex += isUpperInDim * this->getLocalOffsets()[d]; + } +#ifndef NDEBUG + neighborVectorIndex[d] = iterateIndexInThisDimension; +#endif + } + if (phi_c > 0.) 
{ +#ifndef NDEBUG + auto unlinearizedNeighborIndex = neighborVectorIndex; + this->getLocalVectorIndex(neighborIndex, unlinearizedNeighborIndex); + if (unlinearizedNeighborIndex != neighborVectorIndex) { + std::cerr << "expected " << unlinearizedNeighborIndex << " got " << neighborVectorIndex + << std::endl; + } + if (neighborIndex < 0) { + std::cerr << "expected " << unlinearizedNeighborIndex << " got " << neighborVectorIndex + << " or " << neighborIndex << std::endl; } +#endif + assert(neighborIndex > -1); + assert(neighborIndex < this->getNrLocalElements()); + result += phi_c * this->getData()[neighborIndex]; } } - return newDecomposition; + return result; +} + +template +FG_ELEMENT DistributedFullGrid::evalLocal(const std::vector& coords) const { + FG_ELEMENT value; + evalLocal(coords, value); + return value; +} +template +void DistributedFullGrid::evalLocal(const std::vector& coords, + FG_ELEMENT& value) const { + assert(coords.size() == this->getDimension()); + // get the lowest-index point of the points + // whose basis functions contribute to the interpolated value + const auto& h = getGridSpacing(); + static thread_local IndexVector localIndexLowerNonzeroNeighborPoint; + localIndexLowerNonzeroNeighborPoint.resize(dim_); + for (DimType d = 0; d < dim_; ++d) { +#ifndef NDEBUG + if (coords[d] < 0. || coords[d] > 1.) { + std::cout << "coords " << coords << " out of bounds" << std::endl; + } + assert(coords[d] >= 0. && coords[d] <= 1.); +#endif // ndef NDEBUG + // this is the local index of the point that is lower than the coordinate + // may also be negative if the coordinate is lower than this processes' coordinates + IndexType localIndexLowerNonzeroNeighborIndexInThisDimension = static_cast( + std::floor((coords[d] - this->getLowerBoundsCoord(d)) * this->getInverseGridSpacingIn(d))); + + // check if we even need to evaluate on this process + if (localIndexLowerNonzeroNeighborIndexInThisDimension < -1) { + // index too small + value = 0.; + return; + } else if ((coords[d] >= 1.0 - h[d]) && this->hasBoundaryPoints_[d] == 1 && + this->getCartesianUtils().isOnLowerBoundaryInDimension(d)) { + // if we have periodic boundary and this process is at the lower end of the dimension d + // we need the periodic coordinate => don't return 0 + } else if (localIndexLowerNonzeroNeighborIndexInThisDimension > this->getLocalSizes()[d] - 1) { + // index too high + value = 0.; + return; + } + localIndexLowerNonzeroNeighborPoint[d] = localIndexLowerNonzeroNeighborIndexInThisDimension; + } + // evaluate at those points and sum up according to the basis function + value = evalIndexAndAllUpperNeighbors(localIndexLowerNonzeroNeighborPoint, coords); +} + +template +std::vector DistributedFullGrid::getInterpolatedValues( + const std::vector>& interpolationCoords) const { + auto numValues = interpolationCoords.size(); + std::vector values; + values.resize(numValues); +#pragma omp parallel for default(none) firstprivate(numValues) shared(values, interpolationCoords) \ + schedule(static) + for (size_t i = 0; i < numValues; ++i) { + this->evalLocal(interpolationCoords[i], values[i]); + } + MPI_Allreduce(MPI_IN_PLACE, values.data(), static_cast(numValues), this->getMPIDatatype(), + MPI_SUM, this->getCommunicator()); + return values; +} + +template +void DistributedFullGrid::getCoordsGlobal(IndexType globalLinearIndex, + std::vector& coords) const { + IndexType ind = 0; + IndexType tmp_add = 0; + + coords.resize(dim_); + + for (DimType j = 0; j < dim_; j++) { + ind = globalLinearIndex % this->getGlobalSizes()[j]; + 
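
evalLocal evaluates the d-linear hat-function basis: it locates the grid point just below the query coordinate via floor((x - lowerBoundsCoord) * 1/h) and sums the 2^d surrounding points weighted by products of 1-D hat functions; getInterpolatedValues then MPI_SUM-reduces the per-rank results, which are zero on ranks whose subdomain does not contain the point. The 1-D serial core of that weighting, as a minimal sketch with hypothetical names:

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// 1-D piecewise-linear (hat function) interpolation on a uniform grid with
// boundary points: values[i] lives at x = i * h, with h = 1 / (values.size() - 1)
double evalHat1d(const std::vector<double>& values, double x) {
  assert(x >= 0.0 && x <= 1.0);
  const double h = 1.0 / static_cast<double>(values.size() - 1);
  // lower of the two grid points whose hat functions are nonzero at x
  auto lower = static_cast<std::size_t>(std::floor(x / h));
  if (lower == values.size() - 1) return values.back();  // x == 1.0
  const double distance = x - static_cast<double>(lower) * h;
  const double weightUpper = distance / h;  // = 1 - |x - x_upper| / h
  return (1.0 - weightUpper) * values[lower] + weightUpper * values[lower + 1];
}

int main() {
  std::vector<double> v{0.0, 1.0, 0.0};  // level-1 grid with boundary on [0, 1]
  assert(std::abs(evalHat1d(v, 0.25) - 0.5) < 1e-12);
  assert(std::abs(evalHat1d(v, 0.5) - 1.0) < 1e-12);
  return 0;
}
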
globalLinearIndex = globalLinearIndex / this->getGlobalSizes()[j]; + // set the coordinate based on if we have boundary points + tmp_add = (hasBoundaryPoints_[j] > 0) ? (0) : (1); + coords[j] = static_cast(ind + tmp_add) * getGridSpacing()[j]; + } +} + +template +void DistributedFullGrid::getCoordsLocal(IndexType localLinearIndex, + std::vector& coords) const { + // todo: probably very inefficient implementation, if crucial for + // performance implement more direct way of computing the coordinates + assert(localLinearIndex < getNrLocalElements()); + + IndexType globalLinearIndex = getGlobalLinearIndex(localLinearIndex); + + getCoordsGlobal(globalLinearIndex, coords); +} + +template +void DistributedFullGrid::getGlobalLI(IndexType elementIndex, LevelVector& levels, + IndexVector& indexes) const { + IndexType startindex, tmp_val; + + assert(elementIndex < this->getNrElements()); + levels.resize(dim_); + indexes.resize(dim_); + + tmp_val = elementIndex; + + // first calculate intermediary indexes + for (DimType k = 0; k < dim_; k++) { + startindex = (hasBoundaryPoints_[k] > 0) ? 0 : 1; + indexes[k] = tmp_val % this->getGlobalSizes()[k] + startindex; + tmp_val = tmp_val / this->getGlobalSizes()[k]; + } + + // The level and index of the element in the hashgridstorage are computed dividing by two the + // index and level in the fullgrid + // until we obtain an impair number for the index, thus obtaining the level and index in the + // hierarchical basis (Aliz Nagy) + // ... + for (DimType k = 0; k < dim_; k++) { + tmp_val = levels_[k]; + + if (indexes[k] != 0) { + // todo: these operations can be optimized + while (indexes[k] % 2 == 0) { + indexes[k] = indexes[k] / 2; + tmp_val--; + } + } else { + tmp_val = 0; + } + + levels[k] = tmp_val; + } +} + +template +void DistributedFullGrid::getGlobalVectorIndex(IndexType globLinIndex, + IndexVector& globAxisIndex) const { + assert(globLinIndex < this->getNrElements()); + assert(globAxisIndex.size() == dim_); + + globAxisIndex = this->globalIndexer_.getVectorIndex(globLinIndex); +} + +template +void DistributedFullGrid::getGlobalVectorIndex(const IndexVector& locAxisIndex, + IndexVector& globAxisIndex) const { + assert(locAxisIndex.size() == dim_); + + globAxisIndex = this->getLowerBounds() + locAxisIndex; +} + +template +void DistributedFullGrid::getLocalVectorIndex(IndexType locLinIndex, + IndexVector& locAxisIndex) const { + locAxisIndex = this->localTensor_.getVectorIndex(locLinIndex); +} + +template +bool DistributedFullGrid::getLocalVectorIndex(const IndexVector& globAxisIndex, + IndexVector& locAxisIndex) const { + assert(globAxisIndex.size() == dim_); + + if (this->isGlobalIndexHere(globAxisIndex)) { + locAxisIndex.assign(globAxisIndex.begin(), globAxisIndex.end()); + std::transform(locAxisIndex.begin(), locAxisIndex.end(), this->getLowerBounds().begin(), + locAxisIndex.begin(), std::minus()); + return true; + } else { + return false; + } +} + +template +IndexType DistributedFullGrid::getGlobalLinearIndex( + const IndexVector& globAxisIndex) const { + return globalIndexer_.sequentialIndex(globAxisIndex); +} + +template +IndexType DistributedFullGrid::getGlobalLinearIndex(IndexType locLinIndex) const { + assert(locLinIndex < this->getNrLocalElements()); + + // convert to local vector index + static thread_local IndexVector locAxisIndex(dim_); + locAxisIndex.resize(dim_); + getLocalVectorIndex(locLinIndex, locAxisIndex); + + // convert to global linear index + IndexType globLinIndex = + this->myPartitionsFirstGlobalIndex_ + 
this->getGlobalLinearIndex(locAxisIndex); + assert(globLinIndex < this->getNrElements()); + return globLinIndex; +} + +template +IndexType DistributedFullGrid::getLocalLinearIndex( + const IndexVector& locAxisIndex) const { + return localTensor_.sequentialIndex(locAxisIndex); +} + +template +IndexType DistributedFullGrid::getLocalLinearIndex(IndexType globLinIndex) const { + assert(globLinIndex < this->getNrElements()); + + // convert to global vector index + static thread_local IndexVector globAxisIndex(dim_); + globAxisIndex.resize(dim_); + getGlobalVectorIndex(globLinIndex, globAxisIndex); + + if (this->isGlobalIndexHere(globAxisIndex)) { + // convert to local linear index + return getLocalLinearIndex(globAxisIndex) - this->myPartitionsFirstGlobalIndex_; + } else { + return -1; + } +} + +template +bool DistributedFullGrid::isGlobalIndexHere(IndexVector globalVectorIndex) const { + return (globalVectorIndex >= this->getLowerBounds()) && + (globalVectorIndex < this->getUpperBounds()); +} + +template +bool DistributedFullGrid::isGlobalIndexHere(IndexType globLinIndex) const { + return isGlobalIndexHere(this->globalIndexer_.getVectorIndex(globLinIndex)); +} + +template +std::tuple, std::vector, std::vector> +DistributedFullGrid::getSizesSubsizesStartsOfSubtensor() const { + std::vector csizes(this->getGlobalSizes().begin(), this->getGlobalSizes().end()); + std::vector csubsizes(this->getLocalSizes().begin(), this->getLocalSizes().end()); + std::vector cstarts(this->getLowerBounds().begin(), this->getLowerBounds().end()); + + return std::make_tuple(csizes, csubsizes, cstarts); +} + +template +double DistributedFullGrid::getInverseGridSpacingIn(DimType inDimension) const { + return powerOfTwo[levels_[inDimension]]; +} + +template +std::vector DistributedFullGrid::getInverseGridSpacing() const { + std::vector oneOverH; + oneOverH.resize(gridSpacing_.size()); + // should be the same as the number of intervals (N) + for (DimType j = 0; j < dim_; j++) { + oneOverH[j] = powerOfTwo[levels_[j]]; + } +#ifndef NDEBUG + std::vector oneOverHByDivision; + oneOverHByDivision.resize(gridSpacing_.size()); + std::transform(gridSpacing_.begin(), gridSpacing_.end(), oneOverHByDivision.begin(), + std::bind(std::divides(), 1, std::placeholders::_1)); + for (DimType j = 0; j < dim_; j++) { + assert(std::abs(oneOverHByDivision[j] - oneOverH[j]) < 1e-10); + assert(oneOverHByDivision[j] == oneOverH[j]); + } +#endif + return oneOverH; +} + +template +double DistributedFullGrid::getInnerNodalBasisFunctionIntegral() const { + const auto& h = this->getGridSpacing(); + return std::accumulate(h.begin(), h.end(), 1., std::multiplies()); +} + +template +void DistributedFullGrid::setZero() { + std::memset(this->getData(), 0, this->getNrLocalElements() * sizeof(FG_ELEMENT)); +} + +template +const IndexVector& DistributedFullGrid::getLowerBounds() const { + // return getLowerBounds(this->getRank()); + return myPartitionsLowerBounds_; +} + +template +IndexVector DistributedFullGrid::getLowerBounds(RankType r) const { + assert(r >= 0 && r < this->getCommunicatorSize()); + // get coords of r in cart comm + IndexVector lowerBounds(dim_); + const auto& coords = cartesianUtils_.getPartitionCoordsOfRank(r); + + for (DimType i = 0; i < dim_; ++i) { + lowerBounds[i] = getDecomposition()[i][coords[i]]; + } + return lowerBounds; +} + +template +real DistributedFullGrid::getLowerBoundsCoord(DimType inDimension) const { + return static_cast(this->getLowerBounds()[inDimension] + + (hasBoundaryPoints_[inDimension] > 0 ? 
0 : 1)) * + this->getGridSpacing()[inDimension]; +} + +template +const IndexVector& DistributedFullGrid::getUpperBounds() const { + // return getUpperBounds(this->getRank()); + return myPartitionsUpperBounds_; +} + +template +IndexVector DistributedFullGrid::getUpperBounds(RankType r) const { + assert(r >= 0 && r < this->getCommunicatorSize()); + IndexVector upperBounds(dim_); + const auto& coords = cartesianUtils_.getPartitionCoordsOfRank(r); + + for (DimType i = 0; i < dim_; ++i) { + RankType n; + std::vector nc(coords); + + if (nc[i] < this->getCartesianUtils().getCartesianDimensions()[i] - 1) { + // get rank of next neighbor in dim i + nc[i] += 1; + n = this->getCartesianUtils().getRankFromPartitionCoords(nc); + upperBounds[i] = getLowerBounds(n)[i]; + } else { + // no neighbor in dim i -> end of domain + upperBounds[i] = this->getGlobalSizes()[i]; + } + } + return upperBounds; +} + +template +std::vector DistributedFullGrid::getUpperBoundsCoords() const { + return getUpperBoundsCoords(this->getRank()); +} + +template +std::vector DistributedFullGrid::getUpperBoundsCoords(RankType r) const { + assert(r >= 0 && r < this->getCommunicatorSize()); + + /* the upper bound index vector can correspond to coordinates outside of + * the domain, thus we cannot use the getGlobalLinearIndex method here. + */ + const IndexVector& ubvi = getUpperBounds(r); + std::vector coords(dim_); + IndexType tmp_add = 0; + + for (DimType j = 0; j < dim_; ++j) { + tmp_add = (hasBoundaryPoints_[j] > 0) ? (0) : (1); + coords[j] = static_cast(ubvi[j] + tmp_add) * getGridSpacing()[j]; + } + return coords; +} + +template +RankType DistributedFullGrid::getNeighbor1dFromAxisIndex(DimType dim, + IndexType idx1d) const { + // if global index is outside of domain return negative value + { + if (idx1d < 0) return -1; + if (idx1d > this->getGlobalSizes()[dim] - 1) return -1; + } + const auto& decomp1d = this->getDecomposition()[dim]; + auto lower = std::lower_bound(decomp1d.begin(), decomp1d.end(), idx1d + 1); + int partitionIdx1d = static_cast(std::distance(decomp1d.begin(), lower)) - 1; + RankType r = this->getCartesianUtils().getNeighbor1dFromPartitionIndex(dim, partitionIdx1d); + + // check if global index vector is actually contained in the domain of rank r + assert(idx1d >= this->getLowerBounds(r)[dim]); + assert(idx1d < this->getUpperBounds(r)[dim]); + assert(r < this->getCommunicatorSize()); + return r; +} + +template +const std::vector& DistributedFullGrid::getParallelization() const { + return this->getCartesianUtils().getCartesianDimensions(); +} + +template +LevelType DistributedFullGrid::getLevel(DimType d, IndexType idx1d) const { + // IndexVector givec(dim_, 0); + // givec[d] = idx1d; + // IndexType idx = getGlobalLinearIndex(givec); + // LevelVector levels(dim_); + // IndexVector tmp(dim_); + // getGlobalLI(idx, levels, tmp); + // return levels[d]; + + if (this->hasBoundaryPoints_[d] == 0) { + ++idx1d; + } + // get level of idx1d by rightmost set bit + // (e.g., all points on the finest level already have a 1 at the rightmost bit) + if (idx1d == 0) { + return 0; + } + const LevelType lmax = this->getLevels()[d]; + // auto set_bit = idx1d & ~(idx1d-1); + // LevelType l = lmax - log2(set_bit); + // builtin is fast and should work with gcc and clang + // if it is not available, use the one above (at a slight performance hit) + // or c++20's std::countr_zero + LevelType l; + if constexpr (sizeof(IndexType) == sizeof(long)) { + l = static_cast(lmax - __builtin_ctzl(static_cast(idx1d))); + } else if constexpr 
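
As an aside on `getNeighbor1dFromAxisIndex` above: the owning partition of a global 1D index is found by a `std::lower_bound` over the sorted 1D decomposition (each partition's first global index) followed by one step back. A minimal standalone sketch of that search with a made-up decomposition; the helper name `ownerPartition` is illustrative, not DisCoTec API.

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

using IndexType = std::int64_t;

// Returns the index of the partition that owns global 1D index idx1d, given
// the sorted list of partition start indices (decomposition[0] == 0).
int ownerPartition(const std::vector<IndexType>& decomposition, IndexType idx1d) {
  // lower_bound on idx1d + 1 yields the first start index strictly greater
  // than idx1d; the owning partition is the one directly before it.
  auto lower = std::lower_bound(decomposition.begin(), decomposition.end(), idx1d + 1);
  return static_cast<int>(std::distance(decomposition.begin(), lower)) - 1;
}

int main() {
  // four partitions starting at global indices 0, 3, 7, 12
  std::vector<IndexType> decomposition{0, 3, 7, 12};
  assert(ownerPartition(decomposition, 0) == 0);
  assert(ownerPartition(decomposition, 6) == 1);
  assert(ownerPartition(decomposition, 7) == 2);
  assert(ownerPartition(decomposition, 12) == 3);
  std::cout << "index 9 lives on partition " << ownerPartition(decomposition, 9) << '\n';  // 2
  return 0;
}
```
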
(sizeof(IndexType) == sizeof(long long)) { + l = static_cast(lmax - __builtin_ctzll(static_cast(idx1d))); + } else { + l = static_cast(lmax - __builtin_ctz(static_cast(idx1d))); + } + return l; +} + +template +IndexType DistributedFullGrid::getLeftPredecessor(DimType d, IndexType idx1d) const { + LevelType l = getLevel(d, idx1d); + + // boundary points + if (l == 0) return -1; + + LevelType ldiff = static_cast(levels_[d] - l); + IndexType lpidx = idx1d - combigrid::powerOfTwoByBitshift(ldiff); + + return lpidx; +} + +template +IndexType DistributedFullGrid::getRightPredecessor(DimType d, IndexType idx1d) const { + LevelType l = getLevel(d, idx1d); + + // boundary points + if (l == 0) return -1; + + LevelType ldiff = static_cast(levels_[d] - l); + IndexType rpidx = idx1d + combigrid::powerOfTwoByBitshift(ldiff); + + // check if outside of domain + IndexType numElementsD = this->getGlobalSizes()[d]; + + // in case of periodic, "virtual" domain is larger by one + bool oneSidedBoundary = this->returnBoundaryFlags()[d] == 1; + if (oneSidedBoundary && rpidx == numElementsD) return numElementsD; + + if (rpidx > numElementsD - 1) rpidx = -1; + + return rpidx; +} + +template +void DistributedFullGrid::getPartitionCoords(IndexVector& globalAxisIndex, + std::vector& partitionCoords) const { + partitionCoords.resize(dim_); + + for (DimType d = 0; d < dim_; ++d) { + partitionCoords[d] = -1; + for (int i = 0; i < this->getCartesianUtils().getCartesianDimensions()[d]; ++i) { + if (globalAxisIndex[d] >= getDecomposition()[d][i]) partitionCoords[d] = i; + } + + // check whether the partition coordinates are valid + assert(partitionCoords[d] > -1 && + partitionCoords[d] < this->getCartesianUtils().getCartesianDimensions()[d]); + } +} + +template +void DistributedFullGrid::print() const { + std::visit([&](auto&& arg) { print(arg); }, localTensor_); +} + +template +MPI_Datatype DistributedFullGrid::getMPIDatatype() const { + return abstraction::getMPIDatatype(abstraction::getabstractionDataType()); +} +template +void DistributedFullGrid::gatherFullGrid(FullGrid& fg, RankType root) { + int size = this->getCommunicatorSize(); + int rank = this->getRank(); + CommunicatorType comm = this->getCommunicator(); + + // each rank: send dfg to root + int dest = root; + MPI_Request sendRequest; + MPI_Isend(this->getData(), static_cast(this->getNrLocalElements()), this->getMPIDatatype(), + dest, 0, comm, &sendRequest); + + std::vector requests; + std::vector subarrayTypes; + + // rank r: for each rank create subarray view on fg + if (rank == root) { + if (!fg.isGridCreated()) fg.createFullGrid(); + + for (int r = 0; r < size; ++r) { + IndexVector sizes(fg.getSizes().begin(), fg.getSizes().end()); + IndexVector subsizes = this->getUpperBounds(r) - this->getLowerBounds(r); + IndexVector starts = this->getLowerBounds(r); + + std::vector csizes(sizes.begin(), sizes.end()); + std::vector csubsizes(subsizes.begin(), subsizes.end()); + std::vector cstarts(starts.begin(), starts.end()); + + // create subarray view on data + MPI_Datatype mysubarray; + MPI_Type_create_subarray(static_cast(this->getDimension()), &csizes[0], &csubsizes[0], + &cstarts[0], MPI_ORDER_FORTRAN, this->getMPIDatatype(), &mysubarray); + MPI_Type_commit(&mysubarray); + subarrayTypes.push_back(mysubarray); + + int src = r; + MPI_Request req; + MPI_Irecv(fg.getData(), 1, mysubarray, src, 0, this->getCommunicator(), &req); + requests.push_back(req); + } + } + + MPI_Wait(&sendRequest, MPI_STATUS_IGNORE); + + if (rank == root) { + 
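
`getLevel` above recovers the hierarchical level of a 1D index from its number of trailing zero bits; the comment points to C++20's `std::countr_zero` as the portable replacement for the compiler builtins. A self-contained sketch of that relation, assuming a grid with boundary points (indices 0..2^lmax); `levelOfIndex` is an illustrative helper, not the library function.

```cpp
#include <bit>  // std::countr_zero (C++20)
#include <cassert>
#include <cstdint>

using LevelType = int;
using IndexType = std::int64_t;

// Level of global 1D index idx1d on a grid of maximum level lmax with
// boundary points: index 0 gets level 0 via the explicit check, every other
// index gets lmax minus its number of trailing zero bits (so the right
// boundary point 2^lmax also ends up on level 0).
LevelType levelOfIndex(IndexType idx1d, LevelType lmax) {
  if (idx1d == 0) return 0;
  return static_cast<LevelType>(lmax - std::countr_zero(static_cast<std::uint64_t>(idx1d)));
}

int main() {
  const LevelType lmax = 3;  // indices 0..8
  assert(levelOfIndex(0, lmax) == 0);  // left boundary
  assert(levelOfIndex(8, lmax) == 0);  // right boundary: 8 = 2^3, three trailing zeros
  assert(levelOfIndex(4, lmax) == 1);  // midpoint
  assert(levelOfIndex(2, lmax) == 2);
  assert(levelOfIndex(6, lmax) == 2);
  assert(levelOfIndex(1, lmax) == 3);  // odd indices are on the finest level
  assert(levelOfIndex(5, lmax) == 3);
  return 0;
}
```
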
MPI_Waitall(static_cast(requests.size()), &requests[0], MPI_STATUSES_IGNORE); + } + + for (size_t i = 0; i < subarrayTypes.size(); ++i) MPI_Type_free(&subarrayTypes[i]); +} + +template +void DistributedFullGrid::getFGPointsOfSubspaceRecursive( + DimType d, IndexType localLinearIndexSum, std::vector& oneDIndices, + std::vector& subspaceIndices) const { + assert(d < dim_); + assert(oneDIndices.size() == dim_); + assert(!oneDIndices.empty()); + + for (const auto idx : oneDIndices[d]) { + auto updatedLocalIndexSum = localLinearIndexSum; + updatedLocalIndexSum += this->getLocalOffsets()[d] * idx; + if (d > 0) { + getFGPointsOfSubspaceRecursive(static_cast(d - 1), updatedLocalIndexSum, oneDIndices, + subspaceIndices); + } else { + subspaceIndices.emplace_back(updatedLocalIndexSum); + } + } +} +template +std::vector DistributedFullGrid::getFGPointsOfSubspace( + const LevelVector& l) const { + IndexVector subspaceIndices; + IndexType numPointsOfSubspace = 1; + static thread_local std::vector oneDIndices; + oneDIndices.resize(dim_); + for (DimType d = 0; d < dim_; ++d) { + if (l[d] > levels_[d]) { + return subspaceIndices; + } + get1dIndicesLocal(d, l[d], oneDIndices[d]); + numPointsOfSubspace *= static_cast(oneDIndices[d].size()); + } + if (numPointsOfSubspace > 0) { + subspaceIndices.reserve(numPointsOfSubspace); + + IndexType localLinearIndexSum = 0; + getFGPointsOfSubspaceRecursive(static_cast(dim_ - 1), localLinearIndexSum, oneDIndices, + subspaceIndices); + } + assert(static_cast(subspaceIndices.size()) == numPointsOfSubspace); + return subspaceIndices; +} + +template +template +size_t DistributedFullGrid::extractFromUniformSG( + const DistributedSparseGridUniform& dsg) { + assert(dsg.isSubspaceDataCreated()); + + // all the hierarchical subspaces contained in this full grid + const auto downwardClosedSet = combigrid::getDownSet(levels_); + + // loop over all subspaces (-> somewhat linear access in the sg) + size_t numCopied = 0; + static thread_local IndexVector subspaceIndices; + typename AnyDistributedSparseGrid::SubspaceIndexType sIndex = 0; +#pragma omp parallel for shared(dsg, downwardClosedSet) default(none) schedule(guided) \ + firstprivate(sIndex) reduction(+ : numCopied) + for (const auto& level : downwardClosedSet) { + sIndex = dsg.getIndexInRange(level, sIndex); + bool shouldBeCopied = sIndex > -1 && dsg.getDataSize(sIndex) > 0; + if constexpr (!sparseGridFullyAllocated) { + shouldBeCopied = shouldBeCopied && dsg.isSubspaceCurrentlyAllocated(sIndex); + } + if (shouldBeCopied) { + auto sPointer = dsg.getData(sIndex); + subspaceIndices = std::move(this->getFGPointsOfSubspace(level)); + // #pragma omp simd linear(sPointer : 1) // no simd benefit + for (size_t fIndex = 0; fIndex < subspaceIndices.size(); ++fIndex) { + this->getData()[subspaceIndices[fIndex]] = *sPointer; + ++sPointer; + } + assert(dsg.getDataSize(sIndex) == subspaceIndices.size()); + numCopied += subspaceIndices.size(); + } + } + return numCopied; +} + +template +IndexType DistributedFullGrid::getStrideForThisLevel(LevelType l, DimType d) const { + assert(d < this->getDimension()); + // special treatment for level 1 suspaces with boundary + return (l == 1 && hasBoundaryPoints_[d] > 0) + ? 
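
`gatherFullGrid` above stitches the distributed grid together by giving the root one `MPI_Type_create_subarray` view per contributing rank and posting an `MPI_Irecv` straight into the full-grid buffer. A minimal single-rank sketch of the subarray mechanics under the same Fortran ordering; sizes, values, and the self-sendrecv are illustrative only.

```cpp
// mpic++ subarray_demo.cpp && mpirun -n 1 ./a.out
#include <mpi.h>

#include <cstdio>
#include <vector>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);

  // a 4x4 block of doubles, stored in Fortran (column-major) order as in the
  // distributed full grid: element (i, j) lives at data[i + 4 * j]
  const int sizes[2] = {4, 4};
  std::vector<double> data(16);
  for (int j = 0; j < 4; ++j)
    for (int i = 0; i < 4; ++i) data[i + 4 * j] = 10.0 * i + j;

  // subarray view on the highest layer in dimension 0 (i == 3), i.e. the
  // one-element-thick slab that would be sent as an "upward" ghost layer
  const int subsizes[2] = {1, 4};
  const int starts[2] = {3, 0};
  MPI_Datatype slab;
  MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_FORTRAN, MPI_DOUBLE, &slab);
  MPI_Type_commit(&slab);

  // copy that layer into a contiguous receive buffer via a self-sendrecv
  std::vector<double> ghost(4);
  MPI_Sendrecv(data.data(), 1, slab, 0, 0, ghost.data(), 4, MPI_DOUBLE, 0, 0, MPI_COMM_SELF,
               MPI_STATUS_IGNORE);
  for (double v : ghost) std::printf("%g ", v);  // 30 31 32 33
  std::printf("\n");

  MPI_Type_free(&slab);
  MPI_Finalize();
  return 0;
}
```
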
combigrid::powerOfTwoByBitshift(static_cast(levels_[d] - 1)) + : combigrid::powerOfTwoByBitshift(static_cast(levels_[d] - l + 1)); +} + +template +IndexType DistributedFullGrid::getLocalStartForThisLevel( + LevelType l, DimType d, IndexType strideForThisLevel) const { + const auto firstGlobal1dIdx = getFirstGlobal1dIndex(d); + + // get global offset to find indices of this level + // this is the first global index that has level l in dimension d + IndexType offsetForThisLevel; + if (hasBoundaryPoints_[d] > 0) { + if (l == 1) { + offsetForThisLevel = 0; + } else { + // offsetForThisLevel = strideForThisLevel / 2; + offsetForThisLevel = combigrid::powerOfTwoByBitshift(static_cast(levels_[d] - l)); + } + } else { + // offsetForThisLevel = strideForThisLevel / 2 - 1; + offsetForThisLevel = + combigrid::powerOfTwoByBitshift(static_cast(levels_[d] - l)) - 1; + } + assert(offsetForThisLevel > -1); + assert(strideForThisLevel >= offsetForThisLevel); + + // first level-index that is on our partition + const IndexType firstGlobalIndexOnThisPartition = + (firstGlobal1dIdx < offsetForThisLevel) + ? 0 + : (firstGlobal1dIdx - offsetForThisLevel + strideForThisLevel - 1) / strideForThisLevel; + // get global index of first local index which has level l + const IndexType globalStart = + (firstGlobalIndexOnThisPartition)*strideForThisLevel + offsetForThisLevel; + assert(getLevel(d, globalStart) == l || + (getLevel(d, globalStart) == 0 && hasBoundaryPoints_[d] > 0 && l == 1)); + assert(globalStart >= firstGlobal1dIdx); + + return globalStart - firstGlobal1dIdx; +} + +template +IndexType DistributedFullGrid::getNumPointsOnThisPartition( + DimType d, IndexType localStart, IndexType strideForThisLevel) const { + assert(d < this->getDimension()); + return (this->getLocalSizes()[d] - 1 < localStart) + ? 
0 + : (this->getLocalSizes()[d] - 1 - localStart) / strideForThisLevel + 1; +} + +template +IndexType DistributedFullGrid::getNumPointsOnThisPartition(LevelType l, + DimType d) const { + assert(!(l > levels_[d])); + const auto strideForThisLevel = getStrideForThisLevel(l, d); + return getNumPointsOnThisPartition(d, getLocalStartForThisLevel(l, d, strideForThisLevel), + strideForThisLevel); +} + +template +void DistributedFullGrid::get1dIndicesLocal(DimType d, LevelType l, + IndexVector& oneDIndices) const { + assert(l > 0); + assert(d < this->getDimension()); + if (l > levels_[d]) { + oneDIndices.clear(); + return; + } + + const IndexType strideForThisLevel = getStrideForThisLevel(l, d); + IndexType localStart = getLocalStartForThisLevel(l, d, strideForThisLevel); + const IndexType numPointsOnThisPartition = + getNumPointsOnThisPartition(d, localStart, strideForThisLevel); + oneDIndices.resize(numPointsOnThisPartition); + + std::generate(oneDIndices.begin(), oneDIndices.end(), [&localStart, &strideForThisLevel]() { + auto localStartBefore = localStart; + localStart += strideForThisLevel; + return localStartBefore; + }); +} + +template +real DistributedFullGrid::getLpNorm(int p) const { + assert(p >= 0); + // special case maximum norm + MPI_Datatype dtype = abstraction::getMPIDatatype(abstraction::getabstractionDataType()); + if (p == 0) { + real max = 0.0; + auto data = this->getData(); +#pragma omp parallel for reduction(max : max) default(none) shared(data) schedule(static) + for (IndexType i = 0; i < this->getNrLocalElements(); ++i) { + max = std::max(max, std::abs(data[i])); + } + real globalMax(-1); + MPI_Allreduce(&max, &globalMax, 1, dtype, MPI_MAX, getCommunicator()); + return globalMax; + } else { + real p_f = static_cast(p); + auto data = this->getData(); + real res = 0.0; + +#pragma omp parallel for reduction(+ : res) default(none) shared(data, oneOverPowOfTwo) \ + firstprivate(p_f) schedule(static) + for (IndexType i = 0; i < this->getNrLocalElements(); ++i) { + auto isOnBoundary = this->isLocalLinearIndexOnBoundary(i); + auto countBoundary = std::count(isOnBoundary.begin(), isOnBoundary.end(), true); + double hatFcnIntegral = oneOverPowOfTwo[countBoundary]; + // std::cout << hatFcnIntegral << std::endl; + // sumIntegral += hatFcnIntegral; + real abs = std::abs(data[i]); + res += std::pow(abs, p_f) * hatFcnIntegral; + } + // std::cout << " sum " << sumIntegral << std::endl; + real globalRes(0.); + MPI_Allreduce(&res, &globalRes, 1, dtype, MPI_SUM, getCommunicator()); + // TODO this is only correct for 1 norm + // other norms have mixed terms and need additional boundary communication + return std::pow(globalRes * std::pow(this->getInnerNodalBasisFunctionIntegral(), p_f), + 1.0 / p_f); + } +} + +template +void DistributedFullGrid::writePlotFile(const char* filename) const { + auto dim = getDimension(); + + // create subarray data type + auto [csizes, csubsizes, cstarts] = this->getSizesSubsizesStartsOfSubtensor(); + + // create subarray view on data + MPI_Datatype mysubarray; + MPI_Type_create_subarray(static_cast(getDimension()), &csizes[0], &csubsizes[0], &cstarts[0], + MPI_ORDER_FORTRAN, getMPIDatatype(), &mysubarray); + MPI_Type_commit(&mysubarray); + + // open file + MPI_File fh; + MPI_File_open(getCommunicator(), filename, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &fh); + + MPI_Offset offset = 0; + + // .raw files can be read by paraview, in that case write the header separately + std::string fn = filename; + if (fn.find(".raw") != std::string::npos) { + // rank 0 
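
`get1dIndicesLocal` above enumerates the level-`l` points of one dimension as an arithmetic sequence `offset + k * stride`, restricted to the local partition; level 1 on a boundary grid is the special case that also collects the two boundary points. A whole-domain (non-distributed) sketch of that enumeration, assuming a boundary grid; `indicesOfLevel` is an illustrative helper.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

using LevelType = int;
using IndexType = std::int64_t;

// Global 1D indices of hierarchical level l on a boundary grid of level lmax.
// Level 1 is special: it starts at the left boundary (offset 0) with stride
// 2^(lmax-1); all other levels start at 2^(lmax-l) with stride 2^(lmax-l+1).
std::vector<IndexType> indicesOfLevel(LevelType l, LevelType lmax) {
  const IndexType stride =
      (l == 1) ? (IndexType{1} << (lmax - 1)) : (IndexType{1} << (lmax - l + 1));
  const IndexType offset = (l == 1) ? 0 : (IndexType{1} << (lmax - l));
  std::vector<IndexType> indices;
  for (IndexType idx = offset; idx <= (IndexType{1} << lmax); idx += stride) {
    indices.push_back(idx);
  }
  return indices;
}

int main() {
  const LevelType lmax = 3;  // boundary grid with indices 0..8
  assert((indicesOfLevel(1, lmax) == std::vector<IndexType>{0, 4, 8}));
  assert((indicesOfLevel(2, lmax) == std::vector<IndexType>{2, 6}));
  assert((indicesOfLevel(3, lmax) == std::vector<IndexType>{1, 3, 5, 7}));
  return 0;
}
```
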
write human-readable header + if (this->getRank() == 0) { + auto headername = fn + "_header"; + std::ofstream ofs(headername); + + // first line: dimension + ofs << "dimensionality " << getDimension() << std::endl; + + // grid points per dimension in the order + // x_0, x_1, ... , x_d + ofs << "Extents "; + for (auto s : csizes) { + ofs << s << " "; + } + ofs << std::endl; + + // data type + ofs << "Data type size " << sizeof(FG_ELEMENT) << std::endl; + // TODO (pollinta) write endianness and data spacing + } + } else { + // rank 0 write dim and resolution (and data format?) + if (this->getRank() == 0) { + MPI_File_write(fh, &dim, 1, MPI_UNSIGNED_CHAR, MPI_STATUS_IGNORE); + + std::vector res(csizes.begin(), csizes.end()); + MPI_File_write(fh, &res[0], 6, MPI_INT, MPI_STATUS_IGNORE); + } + + // set file view to right offset (in bytes) + offset = (1 + dim) * sizeof(int); + } + + MPI_File_set_view(fh, offset, getMPIDatatype(), mysubarray, "native", MPI_INFO_NULL); + + // write subarray + MPI_File_write_all(fh, getData(), static_cast(getNrLocalElements()), getMPIDatatype(), + MPI_STATUS_IGNORE); + // close file + MPI_File_close(&fh); + MPI_Type_free(&mysubarray); +} + +template +void DistributedFullGrid::writePlotFileVTK(const char* filename) const { + auto dim = getDimension(); + assert(dim < 4); // vtk supports only up to 3D + + // create subarray data type + auto [csizes, csubsizes, cstarts] = this->getSizesSubsizesStartsOfSubtensor(); + + // create subarray view on data + MPI_Datatype mysubarray; + MPI_Type_create_subarray(static_cast(getDimension()), &csizes[0], &csubsizes[0], &cstarts[0], + MPI_ORDER_FORTRAN, getMPIDatatype(), &mysubarray); + MPI_Type_commit(&mysubarray); + + // open file + MPI_File fh; + MPI_File_open(getCommunicator(), filename, MPI_MODE_WRONLY + MPI_MODE_CREATE, MPI_INFO_NULL, &fh); + + std::stringstream vtk_header; + // cf https://lorensen.github.io/VTKExamples/site/VTKFileFormats/ => structured points + vtk_header << "# vtk DataFile Version 2.0.\n" + << "This file contains the combination solution evaluated on a full grid\n" + << "BINARY\n" + << "DATASET STRUCTURED_POINTS\n"; + if (dim == 3) { // TODO change for non-boundary grids using getGridSpacing + vtk_header << "DIMENSIONS " << csizes[0] << " " << csizes[1] << " " << csizes[2] << "\n" + << "ORIGIN 0 0 0\n" + << "SPACING " << 1. / static_cast(csizes[0] - 1) << " " + << 1. / static_cast(csizes[1] - 1) << " " + << 1. / static_cast(csizes[2] - 1) << "\n"; + } else if (dim == 2) { + vtk_header << "DIMENSIONS " << csizes[0] << " " << csizes[1] << " 1\n" + << "ORIGIN 0 0 0\n" + << "SPACING " << 1. / static_cast(csizes[0] - 1) << " " + << 1. / static_cast(csizes[1] - 1) << " 1\n"; + } else if (dim == 1) { + vtk_header << "DIMENSIONS " << csizes[0] << " 1 1\n" + << "ORIGIN 0 0 0\n" + << "SPACING " << 1. 
/ static_cast(csizes[0] - 1) << " 1 1\n"; + } else { + assert(false); + } + vtk_header << "POINT_DATA " + << std::accumulate(csizes.begin(), csizes.end(), 1, std::multiplies()) << "\n" + << "SCALARS quantity double 1\n" + << "LOOKUP_TABLE default\n"; + // TODO set the right data type from combidatatype, for now double by default + [[maybe_unused]] bool rightDataType = std::is_same::value; + assert(rightDataType); + auto header_string = vtk_header.str(); + auto header_size = header_string.size(); + + // rank 0 write header + if (this->getRank() == 0) { + MPI_File_write(fh, header_string.data(), static_cast(header_size), MPI_CHAR, + MPI_STATUS_IGNORE); + } + + // set file view to right offset (in bytes) + MPI_Offset offset = header_size * sizeof(char); + // external32 not supported in OpenMPI < 5. -> writes "native" endianness + // might work with MPICH + MPI_File_set_view(fh, offset, getMPIDatatype(), mysubarray, "external32", MPI_INFO_NULL); + + // write subarray + MPI_File_write_all(fh, getData(), static_cast(getNrLocalElements()), getMPIDatatype(), + MPI_STATUS_IGNORE); + + // close file + MPI_File_close(&fh); + MPI_Type_free(&mysubarray); +} + +template +MPI_Datatype DistributedFullGrid::getUpwardSubarray(DimType d) { + // do index calculations + // set lower bounds of subarray + auto subarrayLowerBounds = this->getLowerBounds(); + auto subarrayUpperBounds = this->getUpperBounds(); + subarrayLowerBounds[d] += this->getLocalSizes()[d] - 1; + + auto subarrayExtents = subarrayUpperBounds - subarrayLowerBounds; + assert(subarrayExtents[d] == 1); + auto subarrayStarts = subarrayLowerBounds - this->getLowerBounds(); + + // create MPI datatype + std::vector sizes(this->getLocalSizes().begin(), this->getLocalSizes().end()); + std::vector subsizes(subarrayExtents.begin(), subarrayExtents.end()); + // the starts are local indices + std::vector starts(subarrayStarts.begin(), subarrayStarts.end()); + + // create subarray view on data + MPI_Datatype mysubarray; + MPI_Type_create_subarray(static_cast(this->getDimension()), sizes.data(), subsizes.data(), + starts.data(), MPI_ORDER_FORTRAN, this->getMPIDatatype(), &mysubarray); + MPI_Type_commit(&mysubarray); + return mysubarray; +} + +template +std::vector DistributedFullGrid::getUpwardSubarrays() { + // initialize upwardSubarrays_ only once + if (upwardSubarrays_.size() == 0) { + upwardSubarrays_.resize(this->getDimension()); + for (DimType d = 0; d < this->getDimension(); ++d) { + upwardSubarrays_[d] = getUpwardSubarray(d); + } + } + return upwardSubarrays_; +} + +template +MPI_Datatype DistributedFullGrid::getDownwardSubarray(DimType d) { + // do index calculations + // set upper bounds of subarray + IndexVector subarrayLowerBounds = this->getLowerBounds(); + IndexVector subarrayUpperBounds = this->getUpperBounds(); + subarrayUpperBounds[d] -= this->getLocalSizes()[d] - 1; + + auto subarrayExtents = subarrayUpperBounds - subarrayLowerBounds; + assert(subarrayExtents[d] == 1); + auto subarrayStarts = subarrayLowerBounds - this->getLowerBounds(); + + // create MPI datatype + // also, the data dimensions are reversed + std::vector sizes(this->getLocalSizes().begin(), this->getLocalSizes().end()); + std::vector subsizes(subarrayExtents.begin(), subarrayExtents.end()); + // the starts are local indices + std::vector starts(subarrayStarts.begin(), subarrayStarts.end()); + + // create subarray view on data + MPI_Datatype mysubarray; + MPI_Type_create_subarray(static_cast(this->getDimension()), sizes.data(), subsizes.data(), + starts.data(), 
MPI_ORDER_FORTRAN, this->getMPIDatatype(), &mysubarray); + MPI_Type_commit(&mysubarray); + return mysubarray; +} + +template +std::vector DistributedFullGrid::getDownwardSubarrays() { + // initialize downwardSubarrays_ only once + if (downwardSubarrays_.size() == 0) { + downwardSubarrays_.resize(this->getDimension()); + for (DimType d = 0; d < this->getDimension(); ++d) { + downwardSubarrays_[d] = getDownwardSubarray(d); + } + } + return downwardSubarrays_; +} + +template +void DistributedFullGrid::getHighestAndLowestNeighbor(DimType d, int& highest, + int& lowest) const { + lowest = MPI_PROC_NULL; + highest = MPI_PROC_NULL; + + auto d_reverse = this->getDimension() - d - 1; + if (!reverseOrderingDFGPartitions) { + d_reverse = d; + } + + MPI_Cart_shift(this->getCommunicator(), static_cast(d_reverse), + static_cast(getParallelization()[d] - 1), &lowest, &highest); + + // assert only boundaries have those neighbors (remove in case of periodicity) + // this assumes no periodicity! + if (!this->getLowerBounds()[d] == 0) { + assert(highest < 0); + } + if (!this->getUpperBounds()[d] == this->getGlobalSizes()[d]) { + assert(lowest < 0); + } +} + +template +void DistributedFullGrid::writeLowerBoundaryToUpperBoundary(DimType d) { + assert(hasBoundaryPoints_[d] == 2); + + // create MPI datatypes + auto downSubarray = getDownwardSubarray(d); + auto upSubarray = getUpwardSubarray(d); + + // if I have the highest neighbor (i. e. I am the lowest rank), I need to send my lowest layer + // in d to them, if I have the lowest neighbor (i. e. I am the highest rank), I can receive it + int lower, higher; + getHighestAndLowestNeighbor(d, higher, lower); + + // TODO asynchronous over d?? + [[maybe_unused]] auto success = MPI_Sendrecv( + this->getData(), 1, downSubarray, higher, TRANSFER_GHOST_LAYER_TAG, this->getData(), 1, + upSubarray, lower, TRANSFER_GHOST_LAYER_TAG, this->getCommunicator(), MPI_STATUS_IGNORE); + assert(success == MPI_SUCCESS); + MPI_Type_free(&downSubarray); + MPI_Type_free(&upSubarray); +} + +template +void DistributedFullGrid::exchangeGhostLayerUpward(DimType d, + std::vector& subarrayExtents, + std::vector& recvbuffer, + MPI_Request* recvRequest) { + subarrayExtents.resize(this->getDimension()); + subarrayExtents.assign(this->getLocalSizes().begin(), this->getLocalSizes().end()); + subarrayExtents[d] = 1; + + // create MPI datatype + auto subarray = getUpwardSubarray(d); + + // if I have a higher neighbor, I need to send my highest layer in d to them, + // if I have a lower neighbor, I can receive it + int lower = MPI_PROC_NULL; + int higher = MPI_PROC_NULL; + + // somehow the cartesian directions in the communicator are reversed + // cf InitMPI(...) 
+ auto d_reverse = this->getDimension() - d - 1; + if (!reverseOrderingDFGPartitions) { + d_reverse = d; + } + MPI_Cart_shift(this->getCommunicator(), static_cast(d_reverse), 1, &lower, &higher); + + if (this->returnBoundaryFlags()[d] == 1) { + // if one-sided boundary, assert that everyone has neighbors + assert(lower >= 0); + assert(higher >= 0); + } else if (this->returnBoundaryFlags()[d] == 2) { + // assert that boundaries have no neighbors + if (this->getLowerBounds()[d] == 0) { + assert(lower < 0); + } + if (this->getUpperBounds()[d] == this->getGlobalSizes()[d]) { + assert(higher < 0); + } + } + + // create recvbuffer + auto numElements = std::accumulate(subarrayExtents.begin(), subarrayExtents.end(), 1, + std::multiplies()); + if (lower < 0) { + numElements = 0; + std::memset(subarrayExtents.data(), 0, subarrayExtents.size() * sizeof(int)); + } + recvbuffer.resize(numElements); + std::memset(recvbuffer.data(), 0, recvbuffer.size() * sizeof(FG_ELEMENT)); + + if (recvRequest != nullptr) { + [[maybe_unused]] auto success = + MPI_Irecv(recvbuffer.data(), numElements, this->getMPIDatatype(), lower, + TRANSFER_GHOST_LAYER_TAG, this->getCommunicator(), recvRequest); + assert(success == MPI_SUCCESS); + success = MPI_Send(this->getData(), 1, subarray, higher, TRANSFER_GHOST_LAYER_TAG, + this->getCommunicator()); + assert(success == MPI_SUCCESS); + } else { + [[maybe_unused]] auto success = + MPI_Sendrecv(this->getData(), 1, subarray, higher, TRANSFER_GHOST_LAYER_TAG, + recvbuffer.data(), numElements, this->getMPIDatatype(), lower, + TRANSFER_GHOST_LAYER_TAG, this->getCommunicator(), MPI_STATUS_IGNORE); + assert(success == MPI_SUCCESS); + } + MPI_Type_free(&subarray); +} + +template +std::vector DistributedFullGrid::exchangeGhostLayerUpward( + DimType d, std::vector& subarrayExtents) { + std::vector recvbuffer{}; + exchangeGhostLayerUpward(d, subarrayExtents, recvbuffer); + return recvbuffer; +} + +template +std::vector DistributedFullGrid::isGlobalLinearIndexOnBoundary( + IndexType globalLinearIndex) const { + // this could likely be done way more efficiently, but it's + // currently not in any performance critical spot + std::vector isOnBoundary(this->getDimension(), false); + + // convert to global vector index + IndexVector globalAxisIndex(dim_); + getGlobalVectorIndex(globalLinearIndex, globalAxisIndex); + for (DimType d = 0; d < this->getDimension(); ++d) { + if (this->returnBoundaryFlags()[d] == 2) { + if (globalAxisIndex[d] == 0 || + globalAxisIndex[d] == this->globalNumPointsInDimension(d) - 1) { + isOnBoundary[d] = true; + } + } + } + return isOnBoundary; +} + +template +std::vector DistributedFullGrid::isLocalLinearIndexOnBoundary( + IndexType localLinearIndex) const { + return isGlobalLinearIndexOnBoundary(getGlobalLinearIndex(localLinearIndex)); +} +template +std::vector DistributedFullGrid::getCornersGlobalVectorIndicesRecursive( + std::vector indicesSoFar, DimType dim) const { + if (dim < this->getDimension()) { + std::vector newIndicesSoFar{}; + for (const auto& indexVec : indicesSoFar) { + newIndicesSoFar.push_back(indexVec); + newIndicesSoFar.back().push_back(0); + newIndicesSoFar.push_back(indexVec); + newIndicesSoFar.back().push_back(this->globalNumPointsInDimension(dim) - 1); + } + assert(newIndicesSoFar.size() == 2 * indicesSoFar.size()); + return getCornersGlobalVectorIndicesRecursive(newIndicesSoFar, static_cast(dim + 1)); + } else { + return indicesSoFar; + } +} + +template +std::vector DistributedFullGrid::getCornersGlobalVectorIndices() const { + auto 
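
The ghost-layer exchange above pairs `MPI_Cart_shift` neighbors with a single `MPI_Sendrecv`, relying on `MPI_PROC_NULL` at non-periodic boundaries to turn the unmatched side into a no-op. A minimal multi-rank sketch of that pattern on a 1D Cartesian communicator; it deliberately ignores the reversed dimension ordering and the subarray datatypes of the real code.

```cpp
// mpic++ cart_shift_demo.cpp && mpirun -n 4 ./a.out
#include <mpi.h>

#include <cstdio>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int worldSize = 0;
  MPI_Comm_size(MPI_COMM_WORLD, &worldSize);

  // 1D Cartesian communicator over all ranks, non-periodic like a
  // two-sided-boundary dimension of the distributed full grid
  int dims[1] = {worldSize};
  int periods[1] = {0};
  MPI_Comm cartComm;
  MPI_Cart_create(MPI_COMM_WORLD, 1, dims, periods, /*reorder=*/0, &cartComm);

  int cartRank = 0;
  MPI_Comm_rank(cartComm, &cartRank);

  // neighbors one step down/up in dimension 0; boundary ranks get
  // MPI_PROC_NULL, which makes the Sendrecv below a no-op on that side
  int lower = MPI_PROC_NULL, higher = MPI_PROC_NULL;
  MPI_Cart_shift(cartComm, 0, 1, &lower, &higher);

  // send "my highest layer" (here: one value) upward, receive the lower
  // neighbor's layer -- the same shape as exchangeGhostLayerUpward
  double myTopValue = 100.0 + cartRank;
  double ghostFromBelow = -1.0;
  MPI_Sendrecv(&myTopValue, 1, MPI_DOUBLE, higher, 0, &ghostFromBelow, 1, MPI_DOUBLE, lower, 0,
               cartComm, MPI_STATUS_IGNORE);
  std::printf("rank %d received ghost value %g\n", cartRank, ghostFromBelow);

  MPI_Comm_free(&cartComm);
  MPI_Finalize();
  return 0;
}
```
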
emptyVectorInVector = std::vector(1); + assert(emptyVectorInVector.size() == 1); + assert(emptyVectorInVector[0].size() == 0); + std::vector cornersVectors = + getCornersGlobalVectorIndicesRecursive(emptyVectorInVector, 0); + assert(cornersVectors.size() == static_cast(powerOfTwo[this->getDimension()])); + return cornersVectors; +} + +template +std::vector DistributedFullGrid::getCornersValues() const { + std::vector values(powerOfTwo[this->getDimension()]); + auto corners = getCornersGlobalVectorIndices(); + for (size_t cornerNo = 0; cornerNo < corners.size(); ++cornerNo) { + if (this->isGlobalIndexHere(corners[cornerNo])) { + // convert to local vector index, then to linear index + IndexVector locAxisIndex(this->getDimension()); + [[maybe_unused]] bool present = getLocalVectorIndex(corners[cornerNo], locAxisIndex); + assert(present); + auto index = getLocalLinearIndex(locAxisIndex); + values[cornerNo] = this->getData()[index]; + } + } + MPI_Allreduce(MPI_IN_PLACE, values.data(), static_cast(values.size()), + this->getMPIDatatype(), MPI_SUM, this->getCommunicator()); + return values; +} + +template +void DistributedFullGrid::InitMPI(MPI_Comm comm, const std::vector& procs) { + // check if communicator is already cartesian + int status; + MPI_Topo_test(comm, &status); + + if (status == MPI_CART) { + if (comm != cartesianUtils_.getComm()) { + assert(uniformDecomposition); + cartesianUtils_ = MPICartesianUtils(comm); + } + if (procs != cartesianUtils_.getCartesianDimensions()) { + std::cerr << "unpart" << std::endl; + throw std::runtime_error("The given communicator is not partitioned as desired"); + } + } else { + // MPI_Comm_dup(comm, &communicator_); + // cf. https://www.researchgate.net/publication/220439585_MPI_on_millions_of_cores + // "Figure 3 shows the memory consumption in all these cases after 32 calls to MPI Comm dup" + std::cerr << "undup" << std::endl; + throw std::runtime_error( + "Currently testing to not duplicate communicator (to save memory), \ + if you do want to use this code please take care that \ + MPI_Comm_free(&communicator_) will be called at some point"); + } +} + +template +void DistributedFullGrid::setDecomposition( + const std::vector& decomposition) { +#ifndef NDEBUG + assert(decomposition.size() == dim_); + for (DimType i = 0; i < dim_; ++i) + assert(decomposition[i].size() == + static_cast(this->getCartesianUtils().getCartesianDimensions()[i])); + + // check if 1d bounds given in ascending order + // if not, this might indicate there's something wrong + for (DimType i = 0; i < dim_; ++i) { + IndexVector tmp(decomposition[i]); + std::sort(tmp.begin(), tmp.end()); + assert(tmp == decomposition[i]); + } + + // check if partitions size larger zero + // this can be detected by checking for duplicate entries in + // the 1d decompositions + for (DimType i = 0; i < dim_; ++i) { + IndexVector tmp(decomposition[i]); + IndexVector::iterator last = std::unique(tmp.begin(), tmp.end()); + + // todo: this is not sufficient to check for duplicate entries + // if last points to the same element as tmp.end() + // this means all entries in tmp are unique + assert(last == tmp.end() && "some partition in decomposition is of zero size!"); + } +#endif // not def NDEBUG + + decomposition_ = decomposition; +} + +template +MPICartesianUtils DistributedFullGrid::cartesianUtils_; + +// output operator +template +inline std::ostream& operator<<(std::ostream& os, const DistributedFullGrid& dfg) { + dfg.print(); + + return os; } } // namespace combigrid diff --git 
a/src/fullgrid/MultiArray.hpp b/src/fullgrid/MultiArray.hpp index 8befc84c8..77f90f96e 100644 --- a/src/fullgrid/MultiArray.hpp +++ b/src/fullgrid/MultiArray.hpp @@ -13,7 +13,7 @@ template using MultiArrayRef = boost::multi_array_ref; /* create a multiarray ref to the local data in a distributed fullgrid */ -template +template static MultiArrayRef createMultiArrayRef(DistributedFullGrid& dfg) { /* note that the sizes of dfg are in reverse order compared to shape */ std::vector shape(dfg.getLocalSizes().rbegin(), dfg.getLocalSizes().rend()); @@ -30,7 +30,7 @@ static MultiArrayRef createMultiArrayRef(FullGrid& fg) { assert(fg.isGridCreated()); /* note that the sizes of dfg are in reverse order compared to shape */ - std::vector shape(fg.getSizes().rbegin(), fg.getSizes().rend()); + IndexVector shape(fg.getSizes().rbegin(), fg.getSizes().rend()); if (!reverseOrderingDFGPartitions) { assert(false && "this is not adapted to normal ordering of DFG partitions yet"); } diff --git a/src/fullgrid/Tensor.hpp b/src/fullgrid/Tensor.hpp index 380e2b769..644d53497 100644 --- a/src/fullgrid/Tensor.hpp +++ b/src/fullgrid/Tensor.hpp @@ -26,7 +26,7 @@ class TensorIndexer { localOffsets_[j] = nrElements; nrElements = nrElements * extents_[j]; } - assert(this->size() == nrElements); + assert(this->size() == static_cast(nrElements)); } // have only move constructors for now @@ -58,7 +58,7 @@ class TensorIndexer { } auto size = std::accumulate(this->extents_.begin(), this->extents_.end(), 1U, std::multiplies()); - assert(size < std::numeric_limits::max()); + assert(size < static_cast(std::numeric_limits::max())); return size; } @@ -99,7 +99,7 @@ class Tensor : public TensorIndexer { public: Tensor() = default; explicit Tensor(Type* data, IndexVector&& extents) - : data_(data), TensorIndexer(std::move(extents)) {} + : TensorIndexer(std::move(extents)), data_(data) {} // have only move constructors for now Tensor(Tensor const&) = delete; diff --git a/src/hierarchization/DistributedHierarchization.hpp b/src/hierarchization/DistributedHierarchization.hpp index 04fcd0ba7..2fbd2bb98 100644 --- a/src/hierarchization/DistributedHierarchization.hpp +++ b/src/hierarchization/DistributedHierarchization.hpp @@ -123,10 +123,12 @@ static void sendAndReceiveIndicesBlock(const std::map static void exchangeAllData1d(const DistributedFullGrid& dfg, DimType dim, RemoteDataCollector& remoteData) { // send every index to all neighboring ranks in dimension dim - const auto globalIdxMax = dfg.length(dim); + const auto globalIdxMax = dfg.globalNumPointsInDimension(dim); const IndexType idxMin = dfg.getFirstGlobal1dIndex(dim); const IndexType idxMax = dfg.getLastGlobal1dIndex(dim); @@ -268,15 +270,18 @@ static void exchangeAllData1d(const DistributedFullGrid& dfg, DimTyp } std::map> send1dIndices; std::map> recv1dIndices; - + /* // #pragma omp parallel for schedule(static) default(none) \ -// shared(poleNeighbors, send1dIndices, allMyIndices) + // shared(poleNeighbors, send1dIndices, allMyIndices) + */ for (const auto& r : poleNeighbors) { #pragma omp critical send1dIndices[r] = allMyIndices; } + /* // #pragma omp parallel for schedule(static) default(none) \ -// shared(poleNeighbors, recv1dIndices) + // shared(poleNeighbors, recv1dIndices) + */ for (const auto& r : poleNeighbors) { #pragma omp critical recv1dIndices[r] = {}; @@ -325,11 +330,11 @@ static void exchangeData1d(const DistributedFullGrid& dfg, DimType d std::map> send1dIndices; // main loop - IndexType idxMin = dfg.getFirstGlobal1dIndex(dim); - IndexType idxMax = 
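
The `TensorIndexer` hunk above accumulates column-major offsets (`localOffsets_[j]` is the product of all extents before dimension `j`), so a linear index is just the dot product of a vector index with those offsets. A small sketch of that indexing scheme; `makeOffsets` and `sequentialIndex` are illustrative stand-ins for the member functions.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <vector>

using IndexType = std::int64_t;
using IndexVector = std::vector<IndexType>;

// Column-major (Fortran-order) offsets as in TensorIndexer: offset[j] is the
// product of all extents before dimension j, so the first dimension varies fastest.
IndexVector makeOffsets(const IndexVector& extents) {
  IndexVector offsets(extents.size());
  IndexType nrElements = 1;
  for (std::size_t j = 0; j < extents.size(); ++j) {
    offsets[j] = nrElements;
    nrElements *= extents[j];
  }
  return offsets;
}

// sequential (linear) index of a vector index = dot product with the offsets
IndexType sequentialIndex(const IndexVector& vectorIndex, const IndexVector& offsets) {
  return std::inner_product(vectorIndex.begin(), vectorIndex.end(), offsets.begin(), IndexType{0});
}

int main() {
  IndexVector extents{4, 3, 2};                // 24 elements
  IndexVector offsets = makeOffsets(extents);  // {1, 4, 12}
  assert((offsets == IndexVector{1, 4, 12}));
  assert(sequentialIndex({0, 0, 0}, offsets) == 0);
  assert(sequentialIndex({3, 0, 0}, offsets) == 3);   // first dimension is contiguous
  assert(sequentialIndex({0, 1, 0}, offsets) == 4);
  assert(sequentialIndex({1, 2, 1}, offsets) == 21);  // 1 + 2*4 + 1*12
  return 0;
}
```
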
dfg.getLastGlobal1dIndex(dim); - bool oneSidedBoundary = dfg.returnBoundaryFlags()[dim] == 1; - auto globalIdxMax = dfg.length(dim); - LevelType lmax = dfg.getLevels()[dim]; + const IndexType idxMin = dfg.getFirstGlobal1dIndex(dim); + const IndexType idxMax = dfg.getLastGlobal1dIndex(dim); + const bool oneSidedBoundary = dfg.returnBoundaryFlags()[dim] == 1; + const auto globalIdxMax = dfg.globalNumPointsInDimension(dim); + const LevelType lmax = dfg.getLevels()[dim]; IndexType idx = idxMin; @@ -981,7 +986,8 @@ void hierarchizeNoBoundary(DistributedFullGrid& dfg, for (IndexType nn = 0; nn < nbrOfPoles; ++nn) { // integer operations form bottleneck here -- nested loops are twice as slow divresult = std::lldiv(nn, stride); - start = divresult.quot * jump + divresult.rem; // localer lin index start of pole + start = static_cast(divresult.quot * jump + + divresult.rem); // localer lin index start of pole #ifndef NDEBUG IndexVector localIndexVector(dfg.getDimension()); @@ -1063,7 +1069,8 @@ void dehierarchizeNoBoundary(DistributedFullGrid& dfg, for (IndexType nn = 0; nn < nbrOfPoles; ++nn) { // integer operations form bottleneck here -- nested loops are twice as slow divresult = std::lldiv(nn, stride); - start = divresult.quot * jump + divresult.rem; // localer lin index start of pole + start = static_cast(divresult.quot * jump + + divresult.rem); // localer lin index start of pole #ifndef NDEBUG IndexVector localIndexVector(dfg.getDimension()); diff --git a/src/io/BroadcastParameters.cpp b/src/io/BroadcastParameters.cpp index 3c3c0a8c1..a94943438 100644 --- a/src/io/BroadcastParameters.cpp +++ b/src/io/BroadcastParameters.cpp @@ -18,11 +18,9 @@ std::string getFileContentsFromRankZero(const std::string& fileName, const MPI_C // only one rank reads inputs and broadcasts to others auto mpiRank = getCommRank(comm); std::string contentString; - int contentStringSize = 0; if (mpiRank == 0) { std::ifstream ifs(fileName); contentString.assign((std::istreambuf_iterator(ifs)), (std::istreambuf_iterator())); - contentStringSize = contentString.size(); } MPIUtils::broadcastContainer(contentString, 0, comm); return contentString; @@ -57,12 +55,12 @@ std::vector> getCoordinatesFromRankZero(const std::string& h5F auto mpiRank = getCommRank(comm); if (mpiRank == 0) { h5io::readH5Coordinates(coordinates, h5FileName); - coordinateSizes[0] = coordinates.size(); + coordinateSizes[0] = static_cast(coordinates.size()); assert(coordinateSizes[0] > 0); - coordinateSizes[1] = coordinates[0].size(); + coordinateSizes[1] = static_cast(coordinates[0].size()); assert(coordinateSizes[1] > 0); coordinatesSerialized = combigrid::serializeInterpolationCoords(coordinates); - assert(coordinatesSerialized.size() == coordinateSizes[0] * coordinateSizes[1]); + assert(coordinatesSerialized.size() == static_cast(coordinateSizes[0] * coordinateSizes[1])); } // broadcast vector sizes @@ -70,7 +68,7 @@ std::vector> getCoordinatesFromRankZero(const std::string& h5F coordinatesSerialized.resize(coordinateSizes[0] * coordinateSizes[1]); // broadcast serialized vector - MPI_Bcast(&coordinatesSerialized[0], coordinatesSerialized.size(), + MPI_Bcast(&coordinatesSerialized[0], static_cast(coordinatesSerialized.size()), abstraction::getMPIDatatype(abstraction::getabstractionDataType()), 0, comm); // split vector into coordinates diff --git a/src/io/FileInputOutput.cpp b/src/io/FileInputOutput.cpp index b3fdee679..6b472d3ab 100644 --- a/src/io/FileInputOutput.cpp +++ b/src/io/FileInputOutput.cpp @@ -1,54 +1,59 @@ #include 
"io/FileInputOutput.hpp" + #include #include #include #include -namespace combigrid { -void writeConcatenatedFileRootOnly(const char* data, int sizeOfData, const std::string& path, MPI_Comm comm, bool replaceExistingFile) { - int mpi_size, mpi_rank; - MPI_Comm_rank(comm, &mpi_rank); - MPI_Comm_size(comm, &mpi_size); +#include "mpi/MPISystem.hpp" - std::vector recvCounts; +namespace combigrid { +void writeConcatenatedFileRootOnly(const char* data, size_t sizeOfData, const std::string& path, + MPI_Comm comm, bool replaceExistingFile) { + int mpi_size, mpi_rank; + MPI_Comm_rank(comm, &mpi_rank); + MPI_Comm_size(comm, &mpi_size); - if (mpi_rank == 0) { - recvCounts.resize(mpi_size); - } + std::vector recvCounts; - MPI_Gather(&sizeOfData, 1, MPI_INT, recvCounts.data(), 1, MPI_INT, 0, comm); + if (mpi_rank == 0) { + recvCounts.resize(mpi_size); + } + int sizeAsInt = static_cast(sizeOfData); + MPI_Datatype datatype = MPI_INT; + MPI_Gather(&sizeAsInt, 1, datatype, recvCounts.data(), 1, datatype, 0, comm); - std::string recvBuffer; - std::vector displacement; + std::string recvBuffer; + std::vector displacement; - if (mpi_rank == 0) { - displacement.resize(mpi_size); - std::exclusive_scan(recvCounts.begin(), recvCounts.end(), displacement.begin(), 0); - recvBuffer.resize(std::accumulate(recvCounts.begin(),recvCounts.end(), 0)); - } + if (mpi_rank == 0) { + displacement.resize(mpi_size); + std::exclusive_scan(recvCounts.begin(), recvCounts.end(), displacement.begin(), 0); + recvBuffer.resize(std::accumulate(recvCounts.begin(), recvCounts.end(), 0)); + } - MPI_Gatherv(data, sizeOfData, MPI_CHAR, &recvBuffer[0], recvCounts.data(), displacement.data(), MPI_CHAR, 0, comm); + MPI_Gatherv(data, sizeAsInt, MPI_CHAR, &recvBuffer[0], recvCounts.data(), displacement.data(), + MPI_CHAR, 0, comm); - if (mpi_rank == 0) { - if(replaceExistingFile == true){ - bool fileExist = std::filesystem::exists(path); - if(fileExist == true){ - std::remove(path.c_str()); - } + if (mpi_rank == 0) { + if (replaceExistingFile == true) { + bool fileExist = std::filesystem::exists(path); + if (fileExist == true) { + std::remove(path.c_str()); } - std::ofstream out(path.c_str()); - out << recvBuffer; - out.close(); } + std::ofstream out(path.c_str()); + out << recvBuffer; + out.close(); + } } -bool getFileExistsRootOnly(const std::string& fileName, CommunicatorType comm, RankType myRank, - RankType rootRank) { +bool getFileExistsRootOnly(const std::string& fileName, CommunicatorType comm, RankType rootRank) { bool doesItExist = false; - if (myRank == rootRank) { + if (combigrid::getCommRank(comm) == rootRank) { doesItExist = std::filesystem::exists(fileName); } MPI_Bcast(&doesItExist, 1, MPI_C_BOOL, rootRank, comm); return doesItExist; } -} // namespace combigrid \ No newline at end of file +} // namespace combigrid diff --git a/src/io/FileInputOutput.hpp b/src/io/FileInputOutput.hpp index eca5941e1..3b1104928 100644 --- a/src/io/FileInputOutput.hpp +++ b/src/io/FileInputOutput.hpp @@ -9,10 +9,9 @@ #include "utils/Types.hpp" namespace combigrid { -void writeConcatenatedFileRootOnly(const char* data, int sizeOfData, const std::string& path, +void writeConcatenatedFileRootOnly(const char* data, size_t sizeOfData, const std::string& path, MPI_Comm comm, bool replaceExistingFile = false); -bool getFileExistsRootOnly(const std::string& fileName, CommunicatorType comm, RankType myRank, - RankType rootRank = 0); +bool getFileExistsRootOnly(const std::string& fileName, CommunicatorType comm, RankType rootRank = 0); -} // namespace 
combigrid \ No newline at end of file +} // namespace combigrid diff --git a/src/io/H5InputOutput.cpp b/src/io/H5InputOutput.cpp index abd3d0f9a..c45ae11bf 100644 --- a/src/io/H5InputOutput.cpp +++ b/src/io/H5InputOutput.cpp @@ -4,11 +4,11 @@ namespace combigrid { namespace h5io { // some instantiations -void readH5Coordinates(std::vector>& coordinates, std::string saveFilePath) { +void readH5Coordinates(std::vector>& coordinates, const std::string& saveFilePath) { return h5io::readValuesFromH5File(coordinates, saveFilePath); } -void readH5Values(std::vector& values, std::string saveFilePath) { +void readH5Values(std::vector& values, const std::string& saveFilePath) { return h5io::readValuesFromH5File(values, saveFilePath); } diff --git a/src/io/H5InputOutput.hpp b/src/io/H5InputOutput.hpp index db57430ba..f695f8a0b 100644 --- a/src/io/H5InputOutput.hpp +++ b/src/io/H5InputOutput.hpp @@ -62,9 +62,9 @@ void readValuesFromH5File(T& values, const std::string& fileName) { } // some instantiations -void readH5Coordinates(std::vector>& coordinates, std::string saveFilePath); +void readH5Coordinates(std::vector>& coordinates, const std::string& saveFilePath); -void readH5Values(std::vector& values, std::string saveFilePath); +void readH5Values(std::vector& values, const std::string& saveFilePath); } // namespace h5io } // namespace combigrid \ No newline at end of file diff --git a/src/manager/CombiParameters.hpp b/src/manager/CombiParameters.hpp index dadec05e8..928f8bc27 100644 --- a/src/manager/CombiParameters.hpp +++ b/src/manager/CombiParameters.hpp @@ -17,7 +17,7 @@ class CombiParameters { CombiParameters(DimType dim, LevelVector lmin, LevelVector lmax, std::vector& boundary, std::vector& levels, std::vector& coeffs, std::vector& taskIDs, - IndexType numberOfCombinations, IndexType numGrids = 1, + size_t numberOfCombinations, IndexType numGrids = 1, CombinationVariant combinationVariant = CombinationVariant::sparseGridReduce, const std::vector& parallelization = {0}, LevelVector reduceCombinationDimsLmin = LevelVector(0), @@ -56,7 +56,7 @@ class CombiParameters { // constructor variant w/o combination scheme specified -- the workers have their partial list // imlpicitly as tasks vector CombiParameters(DimType dim, LevelVector lmin, LevelVector lmax, - std::vector& boundary, IndexType numberOfCombinations, + std::vector& boundary, size_t numberOfCombinations, IndexType numGrids = 1, CombinationVariant combinationVariant = CombinationVariant::sparseGridReduce, const std::vector& parallelization = {0}, @@ -95,7 +95,7 @@ class CombiParameters { CombiParameters(DimType dim, LevelVector lmin, LevelVector lmax, const std::vector& boundary, std::vector& levels, std::vector& coeffs, std::vector& hierarchizationDims, - std::vector& taskIDs, IndexType numberOfCombinations, + std::vector& taskIDs, size_t numberOfCombinations, IndexType numGrids = 1, CombinationVariant combinationVariant = CombinationVariant::sparseGridReduce, LevelVector reduceCombinationDimsLmin = LevelVector(0), @@ -213,11 +213,12 @@ class CombiParameters { inline DimType getDim() const { return dim_; } inline size_t getNumLevels() const { return levels_.size(); } + /** * this method returns the number of grids a task contains * in case we have multiple grids in our simulation */ - inline IndexType getNumGrids() const { return numGridsPerTask_; } + inline size_t getNumGrids() const { return numGridsPerTask_; } inline const std::vector& getHierarchizationDims() const { return hierarchizationDims_; } @@ -264,7 +265,7 @@ class 
CombiParameters { return procs_; } - inline const IndexType& getNumberOfCombinations() const { return numberOfCombinations_; } + inline const size_t& getNumberOfCombinations() const { return numberOfCombinations_; } inline CombinationVariant getCombinationVariant() const { return combinationVariant_; } @@ -313,7 +314,7 @@ class CombiParameters { assert(decomposition[d][0] == 0); auto numPoints = combigrid::getNumDofNodal(lmax_[d], boundary_[d]); assert(decomposition[d].back() < numPoints); - assert(procs_[d] == decomposition[d].size()); + assert(static_cast(procs_[d]) == decomposition[d].size()); } #endif // not def NDEBUG } @@ -357,13 +358,11 @@ class CombiParameters { bool forwardDecomposition_; friend class boost::serialization::access; - IndexType numberOfCombinations_; // total number of combinations - IndexType numGridsPerTask_; // number of grids per task + size_t numberOfCombinations_; // total number of combinations + size_t numGridsPerTask_; // number of grids per task CombinationVariant combinationVariant_; - uint32_t sizeForChunkedCommunicationInMebibyte_; - /** * This level vector indicates which dimension of lmin should be decreased by how many levels * for constructing the distributed sparse grid. @@ -385,6 +384,7 @@ class CombiParameters { */ LevelVector reduceCombinationDimsLmax_; + uint32_t sizeForChunkedCommunicationInMebibyte_; std::string thirdLevelHost_; @@ -451,29 +451,6 @@ inline static void setCombiParametersHierarchicalBasesUniform(CombiParameters& c } } -inline static std::vector getStandardDecomposition(LevelVector lref, - std::vector procsRef) { - assert(lref.size() == procsRef.size()); - std::vector decomposition; - for (DimType d = 0; d < static_cast(lref.size()); ++d) { - IndexVector di; - if (procsRef[d] == 1) { - di = {0}; - } else if (procsRef[d] == 2) { - di = {0, powerOfTwo[lref[d]] / procsRef[d] + 1}; - } else if (procsRef[d] == 3) { - di = {0, powerOfTwo[lref[d]] / procsRef[d] + 1, 2 * powerOfTwo[lref[d]] / procsRef[d] + 1}; - } else if (procsRef[d] == 4) { - di = {0, powerOfTwo[lref[d]] / procsRef[d] + 1, 2 * powerOfTwo[lref[d]] / procsRef[d] + 1, - 3 * powerOfTwo[lref[d]] / procsRef[d] + 1}; - } else { - throw std::runtime_error("please implement a test decomposition matching procs and lref"); - } - decomposition.push_back(di); - } - return decomposition; -} - } // namespace combigrid #endif /* SRC_SGPP_COMBIGRID_MANAGER_COMBIPARAMETERS_HPP_ */ diff --git a/src/manager/InterpolationWorker.hpp b/src/manager/InterpolationWorker.hpp index 28b2d06fb..8000a2240 100644 --- a/src/manager/InterpolationWorker.hpp +++ b/src/manager/InterpolationWorker.hpp @@ -90,17 +90,16 @@ static void writeInterpolatedValuesSingleFile( static void writeVTKPlotFilesOfAllTasks(const std::vector>& tasks, int numberOfGrids) { - for (const auto& task : tasks) { #ifdef USE_VTK + for (const auto& task : tasks) { for (int g = 0; g < numberOfGrids; ++g) { DistributedFullGrid& dfg = task->getDistributedFullGrid(g); DFGPlotFileWriter writer{dfg, g}; writer.writePlotFile(); } + } #else - std::cout << "Warning: no vtk output produced as DisCoTec was compiled without VTK." - << std::endl; + std::cout << "Warning: no vtk output produced as DisCoTec was compiled without VTK." 
<< std::endl; #endif /* USE_VTK */ - } } } /* namespace combigrid */ diff --git a/src/manager/ProcessGroupManager.cpp b/src/manager/ProcessGroupManager.cpp index f49019914..94ffff804 100644 --- a/src/manager/ProcessGroupManager.cpp +++ b/src/manager/ProcessGroupManager.cpp @@ -200,9 +200,9 @@ bool ProcessGroupManager::pretendCombineThirdLevelForWorkers(CombiParameters& pa const CommunicatorType& comm = thirdLevelComms[params.getThirdLevelPG()]; // exchange dsgus - IndexType numGrids = params.getNumGrids(); + auto numGrids = params.getNumGrids(); std::vector dsguData; - for (IndexType g = 0; g < numGrids; g++) { + for (size_t g = 0; g < numGrids; g++) { for (RankType p = 0; p < (RankType)theMPISystem()->getNumProcs(); p++) { // we assume here that all dsgus have the same size otherwise size collection must change size_t dsguSize = (size_t)(dsguDataSizePerWorker_[(size_t)p] / numGrids); @@ -332,7 +332,7 @@ bool ProcessGroupManager::pretendReduceLocalAndRemoteSubspaceSizes( from = to; } assert(to == sendBuff.end()); - for (const auto& dataSize : formerDsguDataSizePerWorker_) { + for ([[maybe_unused]] const auto& dataSize : formerDsguDataSizePerWorker_) { assert(dataSize > 0); } @@ -365,9 +365,9 @@ void ProcessGroupManager::exchangeDsgus(const ThirdLevelUtils& thirdLevel, Combi const CommunicatorType& comm = thirdLevelComms[params.getThirdLevelPG()]; // exchange dsgus - IndexType numGrids = params.getNumGrids(); + auto numGrids = params.getNumGrids(); std::vector dsguData; - for (IndexType g = 0; g < numGrids; g++) { + for (size_t g = 0; g < numGrids; g++) { for (RankType p = 0; p < (RankType)theMPISystem()->getNumProcs(); p++) { // we assume here that all dsgus have the same size otherwise size collection must change size_t dsguSize = (size_t)(dsguDataSizePerWorker_[(size_t)p] / numGrids); @@ -583,7 +583,7 @@ void ProcessGroupManager::writeSparseGridMinMaxCoefficients(const std::string& f } void ProcessGroupManager::doDiagnostics(size_t taskID) { - auto status = waitStatus(); + [[maybe_unused]] auto status = waitStatus(); assert(status == PROCESS_GROUP_WAIT); for (auto task : tasks_) { if (task->getID() == taskID) { @@ -659,10 +659,10 @@ std::vector ProcessGroupManager::evalErrorOnDFG(const LevelVector& leval void ProcessGroupManager::interpolateValues(const std::vector& interpolationCoordsSerial, std::vector& values, - MPI_Request* request, std::string filenamePrefix) { + MPI_Request* request, const std::string& filenamePrefix) { assert(interpolationCoordsSerial.size() < static_cast(std::numeric_limits::max()) && "needs chunking!"); - for (const auto& coord : interpolationCoordsSerial) { + for ([[maybe_unused]] const auto& coord : interpolationCoordsSerial) { assert(coord >= 0.0 && coord <= 1.0); } // if no request was passed, assume we are not waiting for a reply from that group diff --git a/src/manager/ProcessGroupManager.hpp b/src/manager/ProcessGroupManager.hpp index a47f89a82..f3fb1302e 100644 --- a/src/manager/ProcessGroupManager.hpp +++ b/src/manager/ProcessGroupManager.hpp @@ -96,7 +96,7 @@ class ProcessGroupManager { void interpolateValues(const std::vector& interpolationCoordsSerial, std::vector& values, MPI_Request* request = nullptr, - std::string filenamePrefix = ""); + const std::string& filenamePrefix = ""); void writeInterpolatedValuesPerGrid(const std::vector& interpolationCoordsSerial, const std::string& filenamePrefix); diff --git a/src/manager/ProcessGroupWorker.cpp b/src/manager/ProcessGroupWorker.cpp index 8959549ce..7d37f9710 100644 --- 
a/src/manager/ProcessGroupWorker.cpp +++ b/src/manager/ProcessGroupWorker.cpp @@ -34,7 +34,7 @@ std::string receiveStringFromManagerAndBroadcastToGroup() { } void sendNormsToManager(const std::vector lpnorms) { - for (int p = 0; p < lpnorms.size(); ++p) { + for (size_t p = 0; p < lpnorms.size(); ++p) { // send from master to manager MASTER_EXCLUSIVE_SECTION { MPI_Send(&lpnorms[p], 1, MPI_DOUBLE, theMPISystem()->getManagerRank(), TRANSFER_NORM_TAG, @@ -77,7 +77,7 @@ std::vector> receiveAndBroadcastInterpolationCoords(DimType di MASTER_EXCLUSIVE_SECTION { MPI_Status status; status.MPI_ERROR = MPI_SUCCESS; - int result = MPI_Probe(theMPISystem()->getManagerRank(), TRANSFER_INTERPOLATION_TAG, + [[maybe_unused]] int result = MPI_Probe(theMPISystem()->getManagerRank(), TRANSFER_INTERPOLATION_TAG, theMPISystem()->getGlobalComm(), &status); #ifndef NDEBUG assert(result == MPI_SUCCESS); @@ -110,7 +110,7 @@ std::vector> receiveAndBroadcastInterpolationCoords(DimType di std::cout << "error recv: " << errorString << std::endl; } assert(status.MPI_ERROR == MPI_SUCCESS); - for (const auto& coord : interpolationCoordsSerial) { + for ([[maybe_unused]] const auto& coord : interpolationCoordsSerial) { assert(coord >= 0.0 && coord <= 1.0); } #endif // NDEBUG @@ -121,7 +121,7 @@ std::vector> receiveAndBroadcastInterpolationCoords(DimType di interpolationCoordsSerial.resize(coordsSize); MPI_Bcast(interpolationCoordsSerial.data(), coordsSize, realType, theMPISystem()->getMasterRank(), theMPISystem()->getLocalComm()); - for (const auto& coord : interpolationCoordsSerial) { + for ([[maybe_unused]] const auto& coord : interpolationCoordsSerial) { assert(coord >= 0.0 && coord <= 1.0); } // split vector into coordinates @@ -153,7 +153,8 @@ SignalType ProcessGroupWorker::wait() { Stats::startEvent("run first"); auto& currentTask = this->getTaskWorker().getLastTask(); currentTask->run(theMPISystem()->getLocalComm()); - Stats::Event e = Stats::stopEvent("run first"); + // Stats::Event e = Stats::stopEvent("run first"); + Stats::stopEvent("run first"); } break; case RUN_NEXT: { assert(this->getTaskWorker().getTasks().size() > 0); @@ -254,7 +255,7 @@ SignalType ProcessGroupWorker::wait() { case WRITE_DSGS_TO_DISK: { Stats::startEvent("write to disk"); std::string filenamePrefix = receiveStringFromManagerAndBroadcastToGroup(); - writeDSGsToDisk(filenamePrefix); + writeDSGsToDisk(filenamePrefix, ""); Stats::stopEvent("write to disk"); } break; case READ_DSGS_FROM_DISK: { @@ -341,7 +342,7 @@ SignalType ProcessGroupWorker::wait() { interpolateValues(receiveAndBroadcastInterpolationCoords(combiParameters_.getDim())); // send result MASTER_EXCLUSIVE_SECTION { - MPI_Send(values.data(), values.size(), + MPI_Send(values.data(), static_cast(values.size()), abstraction::getMPIDatatype(abstraction::getabstractionDataType()), theMPISystem()->getManagerRank(), TRANSFER_INTERPOLATION_TAG, theMPISystem()->getGlobalComm()); @@ -720,8 +721,6 @@ void ProcessGroupWorker::combineThirdLevel() { assert(theMPISystem()->getThirdLevelComms().size() == 1 && "init thirdLevel communicator failed"); const CommunicatorType& managerComm = theMPISystem()->getThirdLevelComms()[0]; - const CommunicatorType& globalReduceComm = theMPISystem()->getGlobalReduceComm(); - const RankType& globalReduceRank = theMPISystem()->getGlobalReduceRank(); const RankType& manager = theMPISystem()->getThirdLevelManagerRank(); std::vector requests; @@ -769,7 +768,7 @@ void ProcessGroupWorker::combineThirdLevel() { // wait for bcasts to other pgs in globalReduceComm 
Stats::startEvent("wait for bcasts"); for (MPI_Request& request : requests) { - auto returnedValue = MPI_Wait(&request, MPI_STATUS_IGNORE); + [[maybe_unused]] auto returnedValue = MPI_Wait(&request, MPI_STATUS_IGNORE); assert(returnedValue == MPI_SUCCESS); } Stats::stopEvent("wait for bcasts"); @@ -777,36 +776,17 @@ void ProcessGroupWorker::combineThirdLevel() { int ProcessGroupWorker::combineThirdLevelFileBasedWrite( const std::string& filenamePrefixToWrite, const std::string& writeCompleteTokenFileName) { - assert(this->getSparseGridWorker().getNumberOfGrids() > 0); - assert(combiParametersSet_); - - // write sparse grid and corresponding token file - Stats::startEvent("write SG"); - int numWritten = this->writeDSGsToDisk(filenamePrefixToWrite); - MASTER_EXCLUSIVE_SECTION { std::ofstream tokenFile(writeCompleteTokenFileName); } - Stats::stopEvent("write SG"); - return numWritten; -} + OUTPUT_GROUP_EXCLUSIVE_SECTION { + assert(this->getSparseGridWorker().getNumberOfGrids() > 0); + assert(combiParametersSet_); -void ProcessGroupWorker::removeReadingFiles(const std::string& filenamePrefixToRead, - const std::string& startReadingTokenFileName, bool keepSparseGridFiles) const { - OUTPUT_GROUP_EXCLUSIVE_SECTION{ - MASTER_EXCLUSIVE_SECTION { - // remove reading token - std::filesystem::remove(startReadingTokenFileName); - // remove sparse grid file(s) - if (!keepSparseGridFiles) { - std::filesystem::remove(filenamePrefixToRead + "_" + std::to_string(0)); - for (const auto& entry : std::filesystem::directory_iterator(".")) { - if (entry.path().string().find(filenamePrefixToRead + "_" + std::to_string(0) + ".part") != - std::string::npos) { - assert(entry.is_regular_file()); - std::filesystem::remove(entry.path()); - } - } - } - } - } else { + // write sparse grid and corresponding token file + Stats::startEvent("write SG"); + int numWritten = this->writeDSGsToDisk(filenamePrefixToWrite, writeCompleteTokenFileName); + Stats::stopEvent("write SG"); + return numWritten; + } + else { throw std::runtime_error("should only be called from output group"); } } @@ -828,36 +808,19 @@ void ProcessGroupWorker::waitForTokenFile(const std::string& startReadingTokenFi } } -int ProcessGroupWorker::readReduce(const std::string& filenamePrefixToRead, bool overwrite) { - return getSparseGridWorker().readReduce( - filenamePrefixToRead, this->combiParameters_.getChunkSizeInMebibybtePerThread(), overwrite); +int ProcessGroupWorker::readReduce(const std::vector& filenamePrefixesToRead, + const std::vector& startReadingTokenFileNames, + bool overwriteInMemory) { + return getSparseGridWorker().readReduce(filenamePrefixesToRead, startReadingTokenFileNames, + this->combiParameters_.getChunkSizeInMebibybtePerThread(), + overwriteInMemory); } void ProcessGroupWorker::combineThirdLevelFileBasedReadReduce( const std::vector& filenamePrefixesToRead, - const std::vector& startReadingTokenFileNames, bool overwrite, + const std::vector& startReadingTokenFileNames, bool overwriteInMemory, bool keepSparseGridFiles) { - // wait until we can start to read any of the files - std::set indicesStillToReadReduce; - for (size_t i = 0; i < filenamePrefixesToRead.size(); ++i) { - indicesStillToReadReduce.insert(i); - } - while (!indicesStillToReadReduce.empty()) { - for (auto it = indicesStillToReadReduce.begin(); it != indicesStillToReadReduce.end(); ++it) { - if (combigrid::getFileExistsRootOnly(startReadingTokenFileNames[*it], - theMPISystem()->getOutputGroupComm(), - theMPISystem()->getOutputGroupRank())) { - overwrite ? 
Stats::startEvent("read SG") : Stats::startEvent("read/reduce SG"); - int numRead = this->readReduce(filenamePrefixesToRead[*it], overwrite); - assert(numRead > 0); - overwrite ? Stats::stopEvent("read SG") : Stats::stopEvent("read/reduce SG"); - indicesStillToReadReduce.erase(it); - break; // because iterators may be invalidated - } - } - // wait for 200ms - std::this_thread::sleep_for(std::chrono::milliseconds(200)); - } + this->readReduce(filenamePrefixesToRead, startReadingTokenFileNames, overwriteInMemory); MPI_Request request = MPI_REQUEST_NULL; if (this->combiParameters_.getCombinationVariant() == @@ -875,23 +838,22 @@ void ProcessGroupWorker::combineThirdLevelFileBasedReadReduce( // update fgs updateFullFromCombinedSparseGrids(); } - for (size_t i = 0; i < filenamePrefixesToRead.size(); ++i) { - // remove reading token and sparse grid file(s) - this->removeReadingFiles(filenamePrefixesToRead[i], startReadingTokenFileNames[i], - keepSparseGridFiles); - } - auto returnedValue = MPI_Wait(&request, MPI_STATUS_IGNORE); + // remove reading token and sparse grid file(s) + this->getSparseGridWorker().removeReadingFiles(filenamePrefixesToRead, startReadingTokenFileNames, + keepSparseGridFiles); + + [[maybe_unused]] auto returnedValue = MPI_Wait(&request, MPI_STATUS_IGNORE); assert(returnedValue == MPI_SUCCESS); } void ProcessGroupWorker::combineReadDistributeSystemWide( const std::vector& filenamePrefixesToRead, - const std::vector& startReadingTokenFileNames, bool overwrite, + const std::vector& startReadingTokenFileNames, bool overwriteInMemory, bool keepSparseGridFiles) { OUTPUT_GROUP_EXCLUSIVE_SECTION { this->combineThirdLevelFileBasedReadReduce(filenamePrefixesToRead, startReadingTokenFileNames, - overwrite, keepSparseGridFiles); + overwriteInMemory, keepSparseGridFiles); } else { if (combiParameters_.getCombinationVariant() == chunkedOutgroupSparseGridReduce) { @@ -968,7 +930,7 @@ int ProcessGroupWorker::reduceExtraSubspaceSizesFileBased( const std::string& filenamePrefixToWrite, const std::string& writeCompleteTokenFileName, const std::vector& filenamePrefixesToRead, const std::vector& startReadingTokenFileNames) { - int numSizesWritten = 0; + [[maybe_unused]] int numSizesWritten = 0; int numSizesReduced = 0; // we only need to write and read something if we are the I/O group OUTPUT_GROUP_EXCLUSIVE_SECTION { setExtraSparseGrid(true); } @@ -1015,13 +977,12 @@ void ProcessGroupWorker::waitForThirdLevelCombiResult(bool fromOutputGroup) { // receive third level combi result from third level pgroup (global reduce comm) broadcastSender = (RankType)combiParameters_.getThirdLevelPG(); } - CommunicatorType globalReduceComm = theMPISystem()->getGlobalReduceComm(); Stats::startEvent("wait for bcasts"); MPI_Request request; this->getSparseGridWorker().startSingleBroadcastDSGs(combiParameters_.getCombinationVariant(), broadcastSender, &request); - auto returnedValue = MPI_Wait(&request, MPI_STATUS_IGNORE); + [[maybe_unused]] auto returnedValue = MPI_Wait(&request, MPI_STATUS_IGNORE); assert(returnedValue == MPI_SUCCESS); Stats::stopEvent("wait for bcasts"); @@ -1032,14 +993,17 @@ void ProcessGroupWorker::zeroDsgsData() { this->getSparseGridWorker().zeroDsgsData(combiParameters_.getCombinationVariant()); } -int ProcessGroupWorker::writeDSGsToDisk(const std::string& filenamePrefix) { - return this->getSparseGridWorker().writeDSGsToDisk( +int ProcessGroupWorker::writeDSGsToDisk(const std::string& filenamePrefix, + const std::string& writeCompleteTokenFileName) { + auto numWritten = 
this->getSparseGridWorker().writeDSGsToDisk( filenamePrefix, this->getCombiParameters().getCombinationVariant()); + this->getSparseGridWorker().writeTokenFiles(writeCompleteTokenFileName); + return numWritten; } int ProcessGroupWorker::readDSGsFromDisk(const std::string& filenamePrefix, bool alwaysReadFullDSG) { - return this->getSparseGridWorker().readDSGsFromDisk(filenamePrefix, alwaysReadFullDSG); + return this->getSparseGridWorker().readDSGsFromDisk(filenamePrefix, true, alwaysReadFullDSG); } } /* namespace combigrid */ diff --git a/src/manager/ProcessGroupWorker.hpp b/src/manager/ProcessGroupWorker.hpp index 7715aed2c..d73fe5f54 100644 --- a/src/manager/ProcessGroupWorker.hpp +++ b/src/manager/ProcessGroupWorker.hpp @@ -73,7 +73,8 @@ class ProcessGroupWorker { void writeSparseGridMinMaxCoefficients(const std::string& fileNamePrefix) const; /** write extra SGs to disk (binary w/ MPI-IO) */ - int writeDSGsToDisk(const std::string& filenamePrefix); + int writeDSGsToDisk(const std::string& filenamePrefix, + const std::string& writeCompleteTokenFileName); /** read extra SGs from disk (binary w/ MPI-IO) */ int readDSGsFromDisk(const std::string& filenamePrefix, bool alwaysReadFullDSG = false); @@ -95,11 +96,8 @@ class ProcessGroupWorker { void waitForTokenFile(const std::string& startReadingTokenFileName) const; - void removeReadingFiles(const std::string& filenamePrefixToRead, - const std::string& startReadingTokenFileName, - bool keepSparseGridFiles) const; - - int readReduce(const std::string& filenamePrefixToRead, bool overwrite); + int readReduce(const std::vector& filenamePrefixesToRead, + const std::vector& startReadingTokenFileNames, bool overwrite); void combineThirdLevelFileBasedReadReduce( const std::vector& filenamePrefixesToRead, diff --git a/src/manager/ProcessManager.cpp b/src/manager/ProcessManager.cpp index fc5d749f2..07e2bddbb 100644 --- a/src/manager/ProcessManager.cpp +++ b/src/manager/ProcessManager.cpp @@ -57,7 +57,7 @@ void ProcessManager::receiveDurationsOfTasksFromGroupMasters(size_t numDurations } for (size_t i = 0; i < numDurationsToReceive; ++i) { DurationInformation recvbuf; - for(const auto& t : pgroups_[i]->getTaskContainer()){ + for([[maybe_unused]] const auto& t : pgroups_[i]->getTaskContainer()){ // this assumes that the manager rank is the highest in globalComm MPIUtils::receiveClass(&recvbuf, i, theMPISystem()->getGlobalComm()); @@ -104,7 +104,7 @@ void ProcessManager::exit() { // send exit signal to each group for (size_t i = 0; i < pgroups_.size(); ++i) { - bool success = pgroups_[i]->exit(); + [[maybe_unused]] bool success = pgroups_[i]->exit(); assert(success); } } @@ -125,7 +125,7 @@ void ProcessManager::initDsgus() { // tell groups to init Dsgus for (size_t i = 0; i < pgroups_.size(); ++i) { - bool success = pgroups_[i]->initDsgus(); + [[maybe_unused]] bool success = pgroups_[i]->initDsgus(); assert(success); } @@ -136,13 +136,13 @@ void ProcessManager::initDsgus() { void ProcessManager::updateCombiParameters() { Stats::startEvent("manager update parameters"); { - bool fail = waitAllFinished(); + [[maybe_unused]] bool fail = waitAllFinished(); assert(!fail && "should not fail here"); } for (auto g : pgroups_) g->updateCombiParameters(params_); { - bool fail = waitAllFinished(); + [[maybe_unused]] bool fail = waitAllFinished(); assert(!fail && "should not fail here"); } Stats::stopEvent("manager update parameters"); @@ -411,7 +411,7 @@ void ProcessManager::waitForAllGroupsToWait() const { void ProcessManager::parallelEval(const LevelVector& leval, 
std::string& filename, size_t groupID) { // actually it would be enough to wait for the group which does the eval { - bool fail = waitAllFinished(); + [[maybe_unused]] bool fail = waitAllFinished(); assert(!fail && "should not fail here"); } @@ -422,7 +422,7 @@ void ProcessManager::parallelEval(const LevelVector& leval, std::string& filenam g->parallelEval(leval, filename); { - bool fail = waitAllFinished(); + [[maybe_unused]] bool fail = waitAllFinished(); assert(!fail && "should not fail here"); } @@ -494,7 +494,7 @@ std::vector ProcessManager::interpolateValues( } void ProcessManager::writeInterpolatedValuesPerGrid( - const std::vector>& interpolationCoords, std::string filenamePrefix) { + const std::vector>& interpolationCoords, const std::string& filenamePrefix) { // send interpolation coords as a single array std::vector interpolationCoordsSerial = serializeInterpolationCoords(interpolationCoords); @@ -504,7 +504,7 @@ void ProcessManager::writeInterpolatedValuesPerGrid( } void ProcessManager::writeInterpolatedValuesSingleFile( - const std::vector>& interpolationCoords, std::string filenamePrefix) { + const std::vector>& interpolationCoords, const std::string& filenamePrefix) { Stats::startEvent("manager write interpolated"); // send interpolation coords as a single array std::vector interpolationCoordsSerial = serializeInterpolationCoords(interpolationCoords); @@ -521,7 +521,7 @@ void ProcessManager::writeInterpolatedValuesSingleFile( } void ProcessManager::writeInterpolationCoordinates( - const std::vector>& interpolationCoords, std::string filenamePrefix) const { + const std::vector>& interpolationCoords, const std::string& filenamePrefix) const { std::string saveFilePath = filenamePrefix + "_coords.h5"; h5io::writeValuesToH5File(interpolationCoords, saveFilePath, "manager", "coordinates"); } @@ -630,12 +630,12 @@ void ProcessManager::writeCombigridsToVTKPlotFile(ProcessGroupManagerID pg) { #endif /* defined(USE_VTK) */ } -void ProcessManager::writeDSGsToDisk(std::string filenamePrefix) { +void ProcessManager::writeDSGsToDisk(const std::string& filenamePrefix) { thirdLevelPGroup_->writeDSGsToDisk(filenamePrefix); waitForPG(thirdLevelPGroup_); } -void ProcessManager::readDSGsFromDisk(std::string filenamePrefix) { +void ProcessManager::readDSGsFromDisk(const std::string& filenamePrefix) { thirdLevelPGroup_->readDSGsFromDisk(filenamePrefix); waitForPG(thirdLevelPGroup_); } diff --git a/src/manager/ProcessManager.hpp b/src/manager/ProcessManager.hpp index ee8fbc935..96271520d 100644 --- a/src/manager/ProcessManager.hpp +++ b/src/manager/ProcessManager.hpp @@ -134,13 +134,13 @@ class ProcessManager { const std::vector>& interpolationCoords); void writeInterpolatedValuesSingleFile(const std::vector>& interpolationCoords, - std::string filenamePrefix); + const std::string& filenamePrefix); void writeInterpolatedValuesPerGrid(const std::vector>& interpolationCoords, - std::string filenamePrefix); + const std::string& filenamePrefix); void writeInterpolationCoordinates(const std::vector>& interpolationCoords, - std::string filenamePrefix) const; + const std::string& filenamePrefix) const; void writeSparseGridMinMaxCoefficients(const std::string& filename); @@ -175,9 +175,9 @@ class ProcessManager { void writeCombigridsToVTKPlotFile(ProcessGroupManagerID pg); - void writeDSGsToDisk(std::string filenamePrefix); + void writeDSGsToDisk(const std::string& filenamePrefix); - void readDSGsFromDisk(std::string filenamePrefix); + void readDSGsFromDisk(const std::string& filenamePrefix); private: 
ProcessGroupManagerContainer& pgroups_; @@ -267,7 +267,7 @@ void ProcessManager::combine() { // send signal to each group for (size_t i = 0; i < pgroups_.size(); ++i) { - bool success = pgroups_[i]->combine(); + [[maybe_unused]] bool success = pgroups_[i]->combine(); assert(success); } @@ -290,7 +290,7 @@ void ProcessManager::combineSystemWide() { // tell groups to combine local and global for (size_t i = 0; i < pgroups_.size(); ++i) { - bool success = pgroups_[i]->combineSystemWide(); + [[maybe_unused]] bool success = pgroups_[i]->combineSystemWide(); assert(success); } diff --git a/src/manager/SparseGridWorker.hpp b/src/manager/SparseGridWorker.hpp index 1cca2d500..45d2c38fe 100644 --- a/src/manager/SparseGridWorker.hpp +++ b/src/manager/SparseGridWorker.hpp @@ -1,5 +1,7 @@ #pragma once +#include + #include "combicom/CombiCom.hpp" #include "fullgrid/DistributedFullGrid.hpp" #include "hierarchization/DistributedHierarchization.hpp" @@ -8,7 +10,6 @@ #include "mpi/MPIUtils.hpp" #include "sparsegrid/DistributedSparseGridIO.hpp" #include "sparsegrid/DistributedSparseGridUniform.hpp" - namespace combigrid { // template @@ -26,9 +27,9 @@ class SparseGridWorker { uint32_t maxMiBToSendPerThread, bool collectMinMaxCoefficients = false); - inline void copyFromExtraDsgToPartialDSG(int gridNumber = 0); + inline void copyFromExtraDsgToPartialDSG(size_t gridNumber = 0); - inline void copyFromPartialDsgToExtraDSG(int gridNumber = 0); + inline void copyFromPartialDsgToExtraDSG(size_t gridNumber = 0); /* free DSG memory as intermediate step */ inline void deleteDsgsData(); @@ -53,13 +54,13 @@ class SparseGridWorker { inline const std::vector>>& getExtraUniDSGVector() const; - inline int getNumberOfGrids() const; + inline size_t getNumberOfGrids() const; inline std::unique_ptr>& getSparseGridToUseForThirdLevel(bool thirdLevelExtraSparseGrid); inline void initCombinedUniDSGVector(const LevelVector& lmin, LevelVector lmax, - const LevelVector& reduceLmaxByVector, int numGrids, + const LevelVector& reduceLmaxByVector, size_t numGrids, CombinationVariant combinationVariant, bool clearLevels = false); @@ -71,14 +72,17 @@ class SparseGridWorker { inline void maxReduceSubspaceSizesInOutputGroup(); - inline int readDSGsFromDisk(const std::string& filenamePrefix, bool alwaysReadFullDSG = false); + inline int readDSGsFromDisk(const std::string& filenamePrefixToRead, + bool addSpeciesNumberToFileName, bool alwaysReadFullDSG = false); inline int readDSGsFromDiskAndReduce(const std::string& filenamePrefixToRead, uint32_t maxMiBToReadPerThread, + bool addSpeciesNumberToFileName, bool alwaysReadFullDSG = false); - inline int readReduce(const std::string& filenamePrefixToRead, uint32_t maxMiBToReadPerThread, - bool overwrite); + inline int readReduce(const std::vector& filenamePrefixesToRead, + const std::vector& startReadingTokenFileNames, + uint32_t maxMiBToReadPerThread, bool overwrite); inline int reduceExtraSubspaceSizes(const std::vector& filenamesToRead, CombinationVariant combinationVariant, bool overwrite); @@ -90,6 +94,10 @@ class SparseGridWorker { inline void reduceSubspaceSizesBetweenGroups(CombinationVariant combinationVariant); + inline void removeReadingFiles(const std::vector& filenamePrefixToRead, + const std::vector& startReadingTokenFileName, + bool keepSparseGridFiles) const; + inline void setExtraSparseGrid(bool initializeSizes = true); inline void startSingleBroadcastDSGs(CombinationVariant combinationVariant, @@ -102,6 +110,8 @@ class SparseGridWorker { inline void writeMinMaxCoefficients(const 
std::string& fileNamePrefix) const; + inline void writeTokenFiles(const std::string& writeCompleteTokenFileName) const; + inline int writeSubspaceSizesToFile(const std::string& filenamePrefixToWrite) const; inline void zeroDsgsData(CombinationVariant combinationVariant); @@ -126,7 +136,7 @@ class SparseGridWorker { * @param g the dimension index (in the case that there are multiple different full grids per * task) */ - inline void fillDFGFromDSGU(DistributedFullGrid& dfg, int g, + inline void fillDFGFromDSGU(DistributedFullGrid& dfg, size_t g, const std::vector& boundary, const std::vector& hierarchizationDims, const std::vector& hierarchicalBases, @@ -150,7 +160,7 @@ inline void SparseGridWorker::collectReduceDistribute(CombinationVariant combina auto chunkSize = combigrid::CombiCom::getGlobalReduceChunkSize(maxMiBToSendPerThread); - for (int g = 0; g < numGrids; ++g) { + for (size_t g = 0; g < numGrids; ++g) { auto& dsg = this->getCombinedUniDSGVector()[g]; if (!dsg->getSubspacesByCommunicator().empty()) { auto& chunkedSubspaces = combigrid::CombiCom::getChunkedSubspaces( @@ -158,7 +168,7 @@ inline void SparseGridWorker::collectReduceDistribute(CombinationVariant combina for (auto& subspaceChunk : chunkedSubspaces) { // allocate new subspace vector dsg->allocateDifferentSubspaces(std::move(subspaceChunk)); - assert(dsg->getRawDataSize() <= chunkSize); + assert(dsg->getRawDataSize() <= static_cast(chunkSize)); #ifndef NDEBUG auto myRawDataSize = dsg->getRawDataSize(); decltype(myRawDataSize) maxRawDataSize = 0; @@ -174,8 +184,7 @@ inline void SparseGridWorker::collectReduceDistribute(CombinationVariant combina // local reduce (fg -> sg, within rank) for (const auto& t : this->taskWorkerRef_.getTasks()) { - const DistributedFullGrid& dfg = - t->getDistributedFullGrid(static_cast(g)); + const DistributedFullGrid& dfg = t->getDistributedFullGrid(g); dsg->addDistributedFullGrid(dfg, t->getCoefficient()); } // global reduce (across process groups) @@ -227,7 +236,7 @@ inline void SparseGridWorker::collectReduceDistribute(CombinationVariant combina } } -inline void SparseGridWorker::copyFromPartialDsgToExtraDSG(int gridNumber) { +inline void SparseGridWorker::copyFromPartialDsgToExtraDSG(size_t gridNumber) { assert(gridNumber == 0); assert(this->getCombinedUniDSGVector().size() == 1); assert(this->getExtraUniDSGVector().size() == 1); @@ -242,14 +251,14 @@ inline void SparseGridWorker::copyFromPartialDsgToExtraDSG(int gridNumber) { std::remove_if(subspacesToCopy.begin(), subspacesToCopy.end(), [&extraDSG](const auto& s) { return extraDSG->getDataSize(s) == 0; }), subspacesToCopy.end()); - for (const auto& s : subspacesToCopy) { + for ([[maybe_unused]] const auto& s : subspacesToCopy) { assert(extraDSG->getDataSize(s) == extraDSG->getAllocatedDataSize(s)); assert(dsg->getDataSize(s) == dsg->getAllocatedDataSize(s)); } extraDSG->copyDataFrom(*dsg, subspacesToCopy); } -inline void SparseGridWorker::copyFromExtraDsgToPartialDSG(int gridNumber) { +inline void SparseGridWorker::copyFromExtraDsgToPartialDSG(size_t gridNumber) { assert(gridNumber == 0); assert(this->getCombinedUniDSGVector().size() == 1); assert(this->getExtraUniDSGVector().size() == 1); @@ -261,7 +270,7 @@ inline void SparseGridWorker::copyFromExtraDsgToPartialDSG(int gridNumber) { std::remove_if(subspacesToCopy.begin(), subspacesToCopy.end(), [&extraDSG](const auto& s) { return extraDSG->getDataSize(s) == 0; }), subspacesToCopy.end()); - for (const auto& s : subspacesToCopy) { + for ([[maybe_unused]] const auto& s : subspacesToCopy) { 
assert(extraDSG->getDataSize(s) == extraDSG->getAllocatedDataSize(s)); assert(dsg->getDataSize(s) == dsg->getAllocatedDataSize(s)); } @@ -310,7 +319,7 @@ inline void SparseGridWorker::distributeChunkedBroadcasts(uint32_t maxMiBToSendP assert(this->getCombinedUniDSGVector().size() == 1); for (auto& dsg : this->getCombinedUniDSGVector()) { // make sure data sizes are set - for (auto& subspace : subspaces) { + for ([[maybe_unused]] auto& subspace : subspaces) { assert(dsg->getDataSize(subspace) > 0); } @@ -332,7 +341,7 @@ inline void SparseGridWorker::distributeChunkedBroadcasts(uint32_t maxMiBToSendP else { // non-output ranks need to wait before they can extract if (roundNumber == 0) Stats::startEvent("wait 1st bcast"); - auto returnedValue = MPI_Wait(&request, MPI_STATUS_IGNORE); + [[maybe_unused]] auto returnedValue = MPI_Wait(&request, MPI_STATUS_IGNORE); if (roundNumber == 0) Stats::stopEvent("wait 1st bcast"); assert(returnedValue == MPI_SUCCESS); } @@ -343,7 +352,7 @@ inline void SparseGridWorker::distributeChunkedBroadcasts(uint32_t maxMiBToSendP } OUTPUT_GROUP_EXCLUSIVE_SECTION { // output ranks can wait later - auto returnedValue = MPI_Wait(&request, MPI_STATUS_IGNORE); + [[maybe_unused]] auto returnedValue = MPI_Wait(&request, MPI_STATUS_IGNORE); assert(returnedValue == MPI_SUCCESS); } ++roundNumber; @@ -363,7 +372,7 @@ inline void SparseGridWorker::distributeChunkedBroadcasts(uint32_t maxMiBToSendP } for (auto& dsg : this->getCombinedUniDSGVector()) { // make sure data sizes are set - for (auto& subspace : subspaces) { + for ([[maybe_unused]] auto& subspace : subspaces) { assert(dsg->getDataSize(subspace) > 0); } @@ -386,7 +395,7 @@ inline void SparseGridWorker::distributeCombinedSolutionToTasks() { // not better than the parallel loop within extractFromUniformSG // #pragma omp parallel for collapse(2) default(none) schedule(static) for (auto& taskToUpdate : this->taskWorkerRef_.getTasks()) { - for (int g = 0; g < this->getNumberOfGrids(); ++g) { + for (size_t g = 0; g < this->getNumberOfGrids(); ++g) { // fill dfg with hierarchical coefficients from distributed sparse grid taskToUpdate->getDistributedFullGrid(g).extractFromUniformSG( *this->getCombinedUniDSGVector()[g]); @@ -395,7 +404,7 @@ inline void SparseGridWorker::distributeCombinedSolutionToTasks() { } inline void SparseGridWorker::fillDFGFromDSGU( - DistributedFullGrid& dfg, int g, const std::vector& boundary, + DistributedFullGrid& dfg, size_t g, const std::vector& boundary, const std::vector& hierarchizationDims, const std::vector& hierarchicalBases, const LevelVector& lmin) const { // fill dfg with hierarchical coefficients from distributed sparse grid @@ -416,7 +425,7 @@ inline void SparseGridWorker::fillDFGFromDSGU( inline void SparseGridWorker::fillDFGFromDSGU( Task& t, const std::vector& hierarchizationDims, const std::vector& hierarchicalBases, const LevelVector& lmin) const { - for (int g = 0; g < this->getNumberOfGrids(); g++) { + for (size_t g = 0; g < this->getNumberOfGrids(); g++) { assert(this->getCombinedUniDSGVector()[g] != nullptr); this->fillDFGFromDSGU(t.getDistributedFullGrid(g), g, t.getBoundary(), hierarchizationDims, hierarchicalBases, lmin); @@ -443,7 +452,9 @@ SparseGridWorker::getExtraUniDSGVector() const { return extraUniDSGVector_; } -inline int SparseGridWorker::getNumberOfGrids() const { return this->combinedUniDSGVector_.size(); } +inline size_t SparseGridWorker::getNumberOfGrids() const { + return this->combinedUniDSGVector_.size(); +} inline std::unique_ptr>& 
SparseGridWorker::getSparseGridToUseForThirdLevel(bool thirdLevelExtraSparseGrid) { @@ -466,7 +477,7 @@ SparseGridWorker::getSparseGridToUseForThirdLevel(bool thirdLevelExtraSparseGrid */ inline void SparseGridWorker::initCombinedUniDSGVector(const LevelVector& lmin, LevelVector lmax, const LevelVector& reduceLmaxByVector, - int numGrids, + size_t numGrids, CombinationVariant combinationVariant, bool clearLevels) { if (this->taskWorkerRef_.getTasks().size() == 0) { @@ -481,7 +492,7 @@ inline void SparseGridWorker::initCombinedUniDSGVector(const LevelVector& lmin, // get all subspaces in the (optimized) combischeme, create dsgs combinedUniDSGVector_.reserve(static_cast(numGrids)); - for (int g = 0; g < numGrids; ++g) { + for (size_t g = 0; g < numGrids; ++g) { combinedUniDSGVector_.emplace_back(std::unique_ptr>( new DistributedSparseGridUniform(static_cast(lmax.size()), lmax, lmin, theMPISystem()->getLocalComm()))); @@ -525,7 +536,7 @@ inline void SparseGridWorker::interpolateAndPlotOnLevel( OwningDistributedFullGrid dfg(static_cast(levelToEvaluate.size()), levelToEvaluate, theMPISystem()->getLocalComm(), boundary, parallelization, false, decomposition); - for (IndexType g = 0; g < this->getNumberOfGrids(); ++g) { // loop over all grids and plot them + for (size_t g = 0; g < this->getNumberOfGrids(); ++g) { // loop over all grids and plot them this->fillDFGFromDSGU(dfg, g, boundary, hierarchizationDims, hierarchicalBases, lmin); // save dfg to file with MPI-IO if (endsWith(filename, ".vtk")) { @@ -560,19 +571,22 @@ inline void SparseGridWorker::maxReduceSubspaceSizesInOutputGroup() { } } -inline int SparseGridWorker::readDSGsFromDisk(const std::string& filenamePrefix, +inline int SparseGridWorker::readDSGsFromDisk(const std::string& filenamePrefixToRead, + bool addSpeciesNumberToFileName, bool alwaysReadFullDSG) { int numRead = 0; for (size_t i = 0; i < this->getNumberOfGrids(); ++i) { auto uniDsg = this->getCombinedUniDSGVector()[i].get(); auto dsgToUse = uniDsg; + auto filename = filenamePrefixToRead; + if (addSpeciesNumberToFileName) { + filename += "_" + std::to_string(i); + } if (this->getExtraUniDSGVector().size() > 0 && !alwaysReadFullDSG) { dsgToUse = this->getExtraUniDSGVector()[i].get(); - numRead += DistributedSparseGridIO::readSomeFiles(*dsgToUse, - filenamePrefix + "_" + std::to_string(i)); + numRead += DistributedSparseGridIO::readSomeFiles(*dsgToUse, filename); } else { - numRead += - DistributedSparseGridIO::readOneFile(*dsgToUse, filenamePrefix + "_" + std::to_string(i)); + numRead += DistributedSparseGridIO::readOneFile(*dsgToUse, filename); } if (this->getExtraUniDSGVector().size() > 0 && uniDsg->isSubspaceDataCreated()) { // copy partial data from extraDSG back to uniDSG @@ -584,18 +598,23 @@ inline int SparseGridWorker::readDSGsFromDisk(const std::string& filenamePrefix, inline int SparseGridWorker::readDSGsFromDiskAndReduce(const std::string& filenamePrefixToRead, uint32_t maxMiBToReadPerThread, + bool addSpeciesNumberToFileName, bool alwaysReadFullDSG) { int numReduced = 0; for (size_t i = 0; i < this->getNumberOfGrids(); ++i) { + auto filename = filenamePrefixToRead; + if (addSpeciesNumberToFileName) { + filename += "_" + std::to_string(i); + } auto uniDsg = this->getCombinedUniDSGVector()[i].get(); auto dsgToUse = uniDsg; if (this->getExtraUniDSGVector().size() > 0 && !alwaysReadFullDSG) { dsgToUse = this->getExtraUniDSGVector()[i].get(); - numReduced += DistributedSparseGridIO::readSomeFilesAndReduce( - *dsgToUse, filenamePrefixToRead + "_" + std::to_string(i), 
maxMiBToReadPerThread); + numReduced += DistributedSparseGridIO::readSomeFilesAndReduce(*dsgToUse, filename, + maxMiBToReadPerThread); } else { - numReduced += DistributedSparseGridIO::readOneFileAndReduce( - *dsgToUse, filenamePrefixToRead + "_" + std::to_string(i), maxMiBToReadPerThread); + numReduced += + DistributedSparseGridIO::readOneFileAndReduce(*dsgToUse, filename, maxMiBToReadPerThread); } if (this->getExtraUniDSGVector().size() > 0 && uniDsg->isSubspaceDataCreated()) { // copy partial data from extraDSG back to uniDSG @@ -605,13 +624,55 @@ inline int SparseGridWorker::readDSGsFromDiskAndReduce(const std::string& filena return numReduced; } -inline int SparseGridWorker::readReduce(const std::string& filenamePrefixToRead, +inline int SparseGridWorker::readReduce(const std::vector& filenamePrefixesToRead, + const std::vector& startReadingTokenFileNames, uint32_t maxMiBToReadPerThread, bool overwrite) { int numRead = 0; - if (overwrite) { - numRead = this->readDSGsFromDisk(filenamePrefixToRead); - } else { - numRead = this->readDSGsFromDiskAndReduce(filenamePrefixToRead, maxMiBToReadPerThread); + assert(this->getNumberOfGrids() == 1); // for more species, todo loop + + // wait until we can start to read any of the files + std::set indicesStillToReadReduce; + for (size_t i = 0; i < filenamePrefixesToRead.size(); ++i) { + indicesStillToReadReduce.insert(i); + } + auto filePart = theMPISystem()->getFilePartNumber(); + bool firstTry = true; + while (!indicesStillToReadReduce.empty()) { + std::string filePartTokenToRead; + std::string filePartNameToRead; + for (auto it = indicesStillToReadReduce.begin(); it != indicesStillToReadReduce.end(); ++it) { + if (combigrid::theMPISystem()->getOutputComm() != MPI_COMM_NULL) { + filePartTokenToRead = startReadingTokenFileNames[*it] + ".part" + std::to_string(filePart); + } else { + filePartTokenToRead = startReadingTokenFileNames[*it]; + } + if (firstTry) { + std::cout << "trying to read " << filePartTokenToRead << std::endl; + firstTry = false; + } + if (combigrid::getFileExistsRootOnly(filePartTokenToRead, theMPISystem()->getOutputComm())) { + if (combigrid::theMPISystem()->getOutputComm() != MPI_COMM_NULL) { + filePartNameToRead = + filenamePrefixesToRead[*it] + "_0" + ".part" + std::to_string(filePart); + } else { + filePartNameToRead = filenamePrefixesToRead[*it] + "_0"; + } + std::cout << " read from " << filePartNameToRead << std::endl; + overwrite ? Stats::startEvent("read SG") : Stats::startEvent("read/reduce SG"); + if (overwrite) { + numRead += this->readDSGsFromDisk(filePartNameToRead, false); + } else { + numRead += + this->readDSGsFromDiskAndReduce(filePartNameToRead, maxMiBToReadPerThread, false); + } + assert(numRead > 0); + overwrite ? 
Stats::stopEvent("read SG") : Stats::stopEvent("read/reduce SG"); + indicesStillToReadReduce.erase(it); + break; // because iterators may be invalidated + } + } + // wait for 200ms + std::this_thread::sleep_for(std::chrono::milliseconds(200)); } if (this->getNumberOfGrids() != 1) { throw std::runtime_error("Combining more than one DSG is not implemented yet"); @@ -710,14 +771,14 @@ inline void SparseGridWorker::reduceLocalAndGlobal(CombinationVariant combinatio this->zeroDsgsData(combinationVariant); // local reduce (within rank) for (const auto& t : this->taskWorkerRef_.getTasks()) { - for (int g = 0; g < numGrids; ++g) { + for (size_t g = 0; g < numGrids; ++g) { const DistributedFullGrid& dfg = t->getDistributedFullGrid(static_cast(g)); this->getCombinedUniDSGVector()[g]->addDistributedFullGrid(dfg, t->getCoefficient()); } } // global reduce (across process groups) - for (int g = 0; g < numGrids; ++g) { + for (size_t g = 0; g < numGrids; ++g) { if (combinationVariant == CombinationVariant::sparseGridReduce) { CombiCom::distributedGlobalSparseGridReduce( *this->getCombinedUniDSGVector()[g], maxMiBToSendPerThread, globalReduceRankThatCollects); @@ -756,6 +817,34 @@ inline void SparseGridWorker::reduceSubspaceSizesBetweenGroups( } } +void SparseGridWorker::removeReadingFiles( + const std::vector& filenamePrefixesToRead, + const std::vector& startReadingTokenFileNames, bool keepSparseGridFiles) const { + assert(filenamePrefixesToRead.size() == startReadingTokenFileNames.size()); + OUTPUT_ROOT_EXCLUSIVE_SECTION { + for (size_t i = 0; i < filenamePrefixesToRead.size(); ++i) { + auto filePart = theMPISystem()->getFilePartNumber(); + // remove reading token + auto partTokenString = startReadingTokenFileNames[i]; + if (combigrid::theMPISystem()->getOutputComm() != MPI_COMM_NULL) { + partTokenString += ".part" + std::to_string(filePart); + } + std::filesystem::remove(partTokenString); + // remove sparse grid file(s) + if (!keepSparseGridFiles) { + std::filesystem::remove(filenamePrefixesToRead[i] + "_" + std::to_string(0)); + for (const auto& entry : std::filesystem::directory_iterator(".")) { + if (entry.path().string().find(filenamePrefixesToRead[i] + "_" + std::to_string(0) + + ".part" + std::to_string(filePart)) != std::string::npos) { + assert(entry.is_regular_file()); + std::filesystem::remove(entry.path()); + } + } + } + } + } +} + inline void SparseGridWorker::setExtraSparseGrid(bool initializeSizes) { if (this->getNumberOfGrids() != 1) { throw std::runtime_error("this->getCombinedUniDSGVector() is empty"); @@ -778,7 +867,8 @@ inline void SparseGridWorker::setExtraSparseGrid(bool initializeSizes) { // create Kahan buffer now (at zero size), because summation is not needed on this sparse grid extraUniDSG->createKahanBuffer(); if (initializeSizes) { - for (size_t i = 0; i < extraUniDSG->getNumSubspaces(); ++i) { + for (typename AnyDistributedSparseGrid::SubspaceIndexType i = 0; + i < extraUniDSG->getNumSubspaces(); ++i) { extraUniDSG->setDataSize(i, this->getCombinedUniDSGVector()[0]->getDataSize(i)); } } @@ -813,6 +903,7 @@ inline int SparseGridWorker::writeDSGsToDisk(const std::string& filenamePrefix, auto uniDsg = this->getCombinedUniDSGVector()[i].get(); auto dsgToUse = uniDsg; if (this->getExtraUniDSGVector().size() > 0) { + assert(theMPISystem()->getOutputComm() != MPI_COMM_NULL); dsgToUse = this->getExtraUniDSGVector()[i].get(); if (combinationVariant == CombinationVariant::sparseGridReduce || combinationVariant == CombinationVariant::outgroupSparseGridReduce) { @@ -844,6 +935,18 @@ 
inline void SparseGridWorker::writeMinMaxCoefficients(const std::string& fileNam } } +inline void SparseGridWorker::writeTokenFiles(const std::string& writeCompleteTokenFileName) const { + OUTPUT_ROOT_EXCLUSIVE_SECTION { + if (combigrid::theMPISystem()->getOutputComm() == MPI_COMM_NULL) { + // write single token + std::ofstream tokenFile(writeCompleteTokenFileName); + } else { + auto filePart = theMPISystem()->getFilePartNumber(); + std::ofstream tokenFile(writeCompleteTokenFileName + ".part" + std::to_string(filePart)); + } + } +} + inline int SparseGridWorker::writeSubspaceSizesToFile( const std::string& filenamePrefixToWrite) const { return DistributedSparseGridIO::writeSubspaceSizesToFile(*this->getCombinedUniDSGVector()[0], diff --git a/src/mpi/MPICartesianUtils.hpp b/src/mpi/MPICartesianUtils.hpp index eba767046..51d38541a 100644 --- a/src/mpi/MPICartesianUtils.hpp +++ b/src/mpi/MPICartesianUtils.hpp @@ -82,7 +82,7 @@ class MPICartesianUtils { assert(!partitionCoords_.empty()); // find rank r in partitionCoords_ auto findIt = std::find(partitionCoords_.begin(), partitionCoords_.end(), r); - IndexType rIndex = std::distance(partitionCoords_.begin(), findIt); + IndexType rIndex = static_cast(std::distance(partitionCoords_.begin(), findIt)); return partitionCoordsTensor_.getVectorIndex(rIndex); } @@ -145,7 +145,7 @@ class MPICartesianUtils { } inline int getCommunicatorSize() const { - return partitionCoords_.size(); + return static_cast(partitionCoords_.size()); } inline RankType getCommunicatorRank() const { return rank_; } diff --git a/src/mpi/MPIMemory.cpp b/src/mpi/MPIMemory.cpp index ee77e2e94..b2c01ab3d 100644 --- a/src/mpi/MPIMemory.cpp +++ b/src/mpi/MPIMemory.cpp @@ -2,6 +2,8 @@ #include "mpi/MPISystem.hpp" +#include + namespace combigrid { namespace mpimemory { /* @@ -18,8 +20,8 @@ int get_memory_usage_kb(unsigned long* vmrss_kb, unsigned long* vmsize_kb) { FILE* procfile = fopen("/proc/self/status", "r"); long to_read = 8192; - char buffer[to_read]; - fread(buffer, sizeof(char), to_read, procfile); + std::vector buffer(to_read); + fread(buffer.data(), sizeof(char), to_read, procfile); fclose(procfile); short found_vmrss = 0; @@ -28,7 +30,7 @@ int get_memory_usage_kb(unsigned long* vmrss_kb, unsigned long* vmsize_kb) { /* Look through proc status contents line by line */ char delims[] = "\n"; - char* line = strtok(buffer, delims); + char* line = strtok(buffer.data(), delims); while (line != NULL && (found_vmrss == 0 || found_vmsize == 0)) { search_result = strstr(line, "VmRSS:"); diff --git a/src/mpi/MPISystem.cpp b/src/mpi/MPISystem.cpp index 3e51f2e5b..e306fa871 100644 --- a/src/mpi/MPISystem.cpp +++ b/src/mpi/MPISystem.cpp @@ -139,7 +139,7 @@ void MPISystem::init(size_t ngroup, size_t nprocs, bool withWorldManager) { initGlobalReduceCommm(); - initOuputGroupComm(); + initOutputGroupComm(); if (withWorldManager) { /* @@ -183,7 +183,7 @@ void MPISystem::init(size_t ngroup, size_t nprocs, CommunicatorType lcomm, bool initGlobalReduceCommm(); - initOuputGroupComm(); + initOutputGroupComm(); if (withWorldManager) { initThirdLevelComms(); @@ -212,7 +212,7 @@ void MPISystem::initWorldReusable(CommunicatorType wcomm, size_t ngroup, size_t initGlobalReduceCommm(); - initOuputGroupComm(); + initOutputGroupComm(); /* create global communicator which contains only the manager and the master * process of each process group @@ -291,7 +291,7 @@ void MPISystem::storeLocalComm(CommunicatorType lcomm) { // in principle this does not have to be 0 masterRank_ = 0; - int localSize = 
getCommSize(localComm_); + [[maybe_unused]] int localSize = getCommSize(localComm_); assert(masterRank_ < localSize); } @@ -319,7 +319,7 @@ void MPISystem::initGlobalComm(bool withWorldManager) { } if (withWorldManager) { assert(managerRankWorld_ >= 0); - assert(managerRankWorld_ < nprocs_ * ngroup_ + 1); + assert(managerRankWorld_ < static_cast(nprocs_ * ngroup_ + 1)); ranks.push_back(managerRankWorld_); } @@ -334,12 +334,12 @@ void MPISystem::initGlobalComm(bool withWorldManager) { int worldSize = 0; MPI_Group_size(worldGroup, &worldSize); - assert(worldSize == nprocs_ * ngroup_ + (withWorldManager ? 1 : 0)); + assert(worldSize == static_cast(nprocs_ * ngroup_ + (withWorldManager ? 1 : 0))); for (const auto& r : ranks) { assert(r >= 0); assert(r < worldSize); } - assert(ranks.size() <= worldSize); + assert(ranks.size() <= static_cast(worldSize)); #endif MPI_Group globalGroup; @@ -447,7 +447,7 @@ std::vector& getDiagonalRanks(size_t nprocs, size_t ngroup) { return ranks; } -void MPISystem::initOuputGroupComm(uint16_t numFileParts) { +void MPISystem::initOutputGroupComm(uint16_t numFileParts) { assert(numFileParts > 0); if (outputComm_ != MPI_COMM_NULL && outputComm_ != outputGroupComm_) { MPI_Comm_free(&outputComm_); @@ -881,7 +881,7 @@ bool MPISystem::recoverCommunicators(bool groupAlive, deleteCommFTAndCcomm(&globalReduceCommFT_, &globalReduceComm_); initGlobalReduceCommm(); - initOuputGroupComm(); + initOutputGroupComm(); // toDo return fixed process group IDs std::cout << "returning \n"; diff --git a/src/mpi/MPISystem.hpp b/src/mpi/MPISystem.hpp index be097f928..c7dd7bc34 100644 --- a/src/mpi/MPISystem.hpp +++ b/src/mpi/MPISystem.hpp @@ -5,6 +5,7 @@ // to resolve https://github.com/open-mpi/ompi/issues/5157 #define OMPI_SKIP_MPICXX 1 #include + #include #include #include @@ -18,17 +19,28 @@ #define FIRST_GROUP_EXCLUSIVE_SECTION if (combigrid::theMPISystem()->getProcessGroupNumber() == 0) // output group does output (in no-manager case) -#define OUTPUT_GROUP_EXCLUSIVE_SECTION if (combigrid::theMPISystem()->getOutputGroupComm() != MPI_COMM_NULL) +#define OUTPUT_GROUP_EXCLUSIVE_SECTION \ + if (combigrid::theMPISystem()->getOutputGroupComm() != MPI_COMM_NULL) + +// output root: either, the split Output comm is not set (then the master of output group), +// or the zeroth rank of each split of the output group comm +#define OUTPUT_ROOT_EXCLUSIVE_SECTION \ + if ((combigrid::theMPISystem()->getOutputGroupComm() != MPI_COMM_NULL) && \ + ((combigrid::theMPISystem()->getOutputComm() == MPI_COMM_NULL && \ + combigrid::theMPISystem()->getOutputGroupRank() == 0) || \ + (combigrid::getCommRank(combigrid::theMPISystem()->getOutputComm()) == 0))) // last group does some other (potentially concurrent) output -#define OTHER_OUTPUT_GROUP_EXCLUSIVE_SECTION \ - if (combigrid::theMPISystem()->getProcessGroupNumber() == combigrid::theMPISystem()->getNumGroups() - 1) +#define OTHER_OUTPUT_GROUP_EXCLUSIVE_SECTION \ + if (combigrid::theMPISystem()->getProcessGroupNumber() == \ + static_cast(combigrid::theMPISystem()->getNumGroups() - 1)) // middle process can do command line output -#define MIDDLE_PROCESS_EXCLUSIVE_SECTION \ - if (combigrid::theMPISystem()->getWorldRank() == \ - (combigrid::theMPISystem()->getNumGroups() * combigrid::theMPISystem()->getNumProcs()) / 2) - +#define MIDDLE_PROCESS_EXCLUSIVE_SECTION \ + if (combigrid::theMPISystem()->getWorldRank() == \ + static_cast( \ + (combigrid::theMPISystem()->getNumGroups() * combigrid::theMPISystem()->getNumProcs()) / \ + 2)) #define 
GLOBAL_MANAGER_EXCLUSIVE_SECTION if (combigrid::theMPISystem()->isGlobalManager()) @@ -160,6 +172,8 @@ class MPISystem { inline const CommunicatorType& getOutputComm() const; + inline RankType getFilePartNumber() const; + inline const std::vector& getThirdLevelComms() const; /** @@ -312,7 +326,7 @@ class MPISystem { void storeLocalComm(CommunicatorType lcomm); /* let the output "group" be distributed across the actual process groups */ - void initOuputGroupComm(uint16_t numFileParts = 1); + void initOutputGroupComm(uint16_t numFileParts = 1); private: explicit MPISystem(); @@ -494,6 +508,18 @@ class MPISystem { std::vector reusableRanks_; }; +static inline int getCommSize(const CommunicatorType& comm) { + int commSize; + MPI_Comm_size(comm, &commSize); + return commSize; +} + +static inline int getCommRank(const CommunicatorType& comm) { + int commRank; + MPI_Comm_rank(comm, &commRank); + return commRank; +} + /*!\name MPI communication system setup functions */ //@{ inline MPISystemID theMPISystem(); @@ -548,19 +574,31 @@ inline const CommunicatorType& MPISystem::getGlobalReduceComm() const { return globalReduceComm_; } -inline const CommunicatorType& MPISystem::getOutputGroupComm() const { - return outputGroupComm_; -} +inline const CommunicatorType& MPISystem::getOutputGroupComm() const { return outputGroupComm_; } inline const CommunicatorType& MPISystem::getOutputComm() const { + OUTPUT_GROUP_EXCLUSIVE_SECTION { return outputComm_; } + else { + throw std::runtime_error("Called from outside output group!"); + } +} + +inline RankType MPISystem::getFilePartNumber() const { + int filePart = -1; OUTPUT_GROUP_EXCLUSIVE_SECTION { - return outputComm_; - } else { + if (combigrid::theMPISystem()->getOutputComm() != MPI_COMM_NULL) { + auto comm = theMPISystem()->getOutputComm(); + auto outputGroupSize = combigrid::getCommSize(comm); + filePart = theMPISystem()->getOutputGroupRank() / outputGroupSize; + } + } + else { throw std::runtime_error("Called from outside output group!"); } + return filePart; } -inline const std::vector& MPISystem::getThirdLevelComms() const{ +inline const std::vector& MPISystem::getThirdLevelComms() const { checkPreconditions(); return thirdLevelComms_; @@ -613,7 +651,7 @@ inline const RankType& MPISystem::getOutputGroupRank() const{ inline RankType MPISystem::getOutputRankInGlobalReduceComm() const { // my local rank gives the index of the global reduce comm, // and the output rank in the global reduce comm is the index modulo the number of groups - return localRank_ % ngroup_; + return static_cast(localRank_ % ngroup_); } inline const RankType& MPISystem::getGlobalRank() const { @@ -676,17 +714,6 @@ inline size_t MPISystem::getNumProcs() const { inline bool MPISystem::isInitialized() const { return initialized_; } -static inline int getCommSize(const CommunicatorType& comm) { - int commSize; - MPI_Comm_size(comm, &commSize); - return commSize; -} - -static inline int getCommRank(const CommunicatorType& comm) { - int commRank; - MPI_Comm_rank(comm, &commRank); - return commRank; -} inline RankType MPISystem::getWorldSize() const { int worldSize = getCommSize(getWorldComm()); diff --git a/src/sparsegrid/AnyDistributedSparseGrid.cpp b/src/sparsegrid/AnyDistributedSparseGrid.cpp index a0eaf1c65..e6f31dea3 100644 --- a/src/sparsegrid/AnyDistributedSparseGrid.cpp +++ b/src/sparsegrid/AnyDistributedSparseGrid.cpp @@ -12,7 +12,7 @@ namespace combigrid { AnyDistributedSparseGrid::AnyDistributedSparseGrid(size_t numSubspaces, CommunicatorType comm) : comm_(comm), 
subspacesDataSizes_(numSubspaces) { // make sure numSubspaces fits into SubspaceIndexType - assert(numSubspaces <= std::numeric_limits::max()); + assert(numSubspaces <= static_cast(std::numeric_limits::max())); MPI_Comm_rank(comm_, &rank_); } size_t AnyDistributedSparseGrid::getAccumulatedDataSize() const { @@ -56,6 +56,10 @@ SubspaceSizeType AnyDistributedSparseGrid::getDataSize(SubspaceIndexType i) cons return subspacesDataSizes_[i]; } +SubspaceSizeType AnyDistributedSparseGrid::getDataSize(size_t i) const { + return this->getDataSize(static_cast(i)); +} + std::set& AnyDistributedSparseGrid::getIngroupSubspaces() const { assert(this->getSubspacesByCommunicator().size() < 2); @@ -134,7 +138,7 @@ std::vector getSubspaceVote( // allocate vector of long long std::vector subspaceVote(subspacesDataSizes.size(), 0); // set to mySummand if we have data in this subspace - for (AnyDistributedSparseGrid::SubspaceIndexType i = 0; i < subspacesDataSizes.size(); ++i) { + for (size_t i = 0; i < subspacesDataSizes.size(); ++i) { if (subspacesDataSizes[i] > 0) { subspaceVote[i] = mySummand; } diff --git a/src/sparsegrid/AnyDistributedSparseGrid.hpp b/src/sparsegrid/AnyDistributedSparseGrid.hpp index c79aeb078..b09fa90f4 100644 --- a/src/sparsegrid/AnyDistributedSparseGrid.hpp +++ b/src/sparsegrid/AnyDistributedSparseGrid.hpp @@ -40,6 +40,7 @@ class AnyDistributedSparseGrid { // data size of the subspace at index i SubspaceSizeType getDataSize(SubspaceIndexType i) const; + SubspaceSizeType getDataSize(size_t i) const; std::set& getIngroupSubspaces() const; diff --git a/src/sparsegrid/DistributedSparseGridIO.hpp b/src/sparsegrid/DistributedSparseGridIO.hpp index 6e110ad52..58cd3ffca 100644 --- a/src/sparsegrid/DistributedSparseGridIO.hpp +++ b/src/sparsegrid/DistributedSparseGridIO.hpp @@ -17,15 +17,15 @@ namespace DistributedSparseGridIO { template void reduceVectorTowardsMe(std::vector& vectorToReduce, MPI_Comm comm, MPI_Op operation) { MPI_Datatype dataType = getMPIDatatype(abstraction::getabstractionDataType()); - MPI_Reduce(MPI_IN_PLACE, vectorToReduce.data(), vectorToReduce.size(), dataType, operation, 0, - comm); + MPI_Reduce(MPI_IN_PLACE, vectorToReduce.data(), static_cast(vectorToReduce.size()), dataType, + operation, 0, comm); } template void reduceVectorTowardsThem(std::vector& vectorToReduce, MPI_Comm comm, MPI_Op operation) { MPI_Datatype dataType = getMPIDatatype(abstraction::getabstractionDataType()); - MPI_Reduce(vectorToReduce.data(), MPI_IN_PLACE, vectorToReduce.size(), dataType, operation, 0, - comm); + MPI_Reduce(vectorToReduce.data(), MPI_IN_PLACE, static_cast(vectorToReduce.size()), dataType, + operation, 0, comm); } template @@ -131,18 +131,19 @@ int readOneFileAndReduce(SparseGridType& dsg, const std::string& fileName, uint32_t maxMiBToReadPerThread) { auto comm = dsg.getCommunicator(); - const int numElementsToBuffer = - CombiCom::getGlobalReduceChunkSize( - maxMiBToReadPerThread); - // get offset in file const MPI_Offset len = dsg.getRawDataSize(); auto data = dsg.getRawData(); #ifdef DISCOTEC_USE_LZ4 - int numReduced = mpiio::readReduceCompressedValuesConsecutive( - data, len, fileName, comm, //numElementsToBuffer, - std::plus{}); + int numReduced = + mpiio::readReduceCompressedValuesConsecutive( + data, len, fileName, comm, // numElementsToBuffer, + std::plus{}); #else + const int numElementsToBuffer = + CombiCom::getGlobalReduceChunkSize( + maxMiBToReadPerThread); + int numReduced = mpiio::readReduceValuesConsecutive( data, len, fileName, comm, numElementsToBuffer, 
std::plus{}); @@ -172,11 +173,8 @@ int writeSomeFiles(const SparseGridType& dsg, const std::string& fileName, } template -int readSomeFiles(SparseGridType& dsg, const std::string& fileName) { +int readSomeFiles(SparseGridType& dsg, const std::string& filePartName) { auto comm = theMPISystem()->getOutputComm(); - auto outputGroupSize = getCommSize(comm); - auto filePart = theMPISystem()->getOutputGroupRank() / outputGroupSize; - std::string filePartName = fileName + ".part" + std::to_string(filePart); // get offset in file MPI_Offset len = dsg.getRawDataSize(); @@ -192,26 +190,22 @@ int readSomeFiles(SparseGridType& dsg, const std::string& fileName) { } template -int readSomeFilesAndReduce(SparseGridType& dsg, const std::string& fileName, +int readSomeFilesAndReduce(SparseGridType& dsg, const std::string& filePartName, uint32_t maxMiBToReadPerThread) { auto comm = theMPISystem()->getOutputComm(); - auto outputGroupSize = getCommSize(comm); - auto filePart = theMPISystem()->getOutputGroupRank() / outputGroupSize; - std::string filePartName = fileName + ".part" + std::to_string(filePart); - - const int numElementsToBuffer = - CombiCom::getGlobalReduceChunkSize( - maxMiBToReadPerThread); - // get offset in file const MPI_Offset len = dsg.getRawDataSize(); auto data = dsg.getRawData(); #ifdef DISCOTEC_USE_LZ4 - int numReduced = mpiio::readReduceCompressedValuesConsecutive( - data, len, filePartName, comm, //numElementsToBuffer, - std::plus{}); + int numReduced = + mpiio::readReduceCompressedValuesConsecutive( + data, len, filePartName, comm, // numElementsToBuffer, + std::plus{}); #else + const int numElementsToBuffer = + CombiCom::getGlobalReduceChunkSize( + maxMiBToReadPerThread); int numReduced = mpiio::readReduceValuesConsecutive( data, len, filePartName, comm, numElementsToBuffer, std::plus{}); @@ -246,7 +240,7 @@ int readReduceSubspaceSizesFromFile(SparseGridType& dsg, const std::string& file auto comm = dsg.getCommunicator(); MPI_Offset len = dsg.getNumSubspaces(); if (numElementsToBuffer == 0) { - numElementsToBuffer = len; + numElementsToBuffer = static_cast(len); } int numReduced = mpiio::readReduceValuesConsecutive( @@ -263,7 +257,7 @@ int readReduceSubspaceSizesFromFiles(SparseGridType& dsg, const std::vector(len); } int numReduced = mpiio::readMultipleReduceValuesConsecutive( diff --git a/src/sparsegrid/DistributedSparseGridUniform.hpp b/src/sparsegrid/DistributedSparseGridUniform.hpp index c350f4992..d8b90df64 100644 --- a/src/sparsegrid/DistributedSparseGridUniform.hpp +++ b/src/sparsegrid/DistributedSparseGridUniform.hpp @@ -53,7 +53,7 @@ class DistributedSparseGridDataContainer { SubspaceSizeType offset = 0; for (size_t i = 0; i < subspaces_.size(); i++) { subspaces_[i] = subspacesData_.data() + offset; - if (subspacesWithData_.find(i) != subspacesWithData_.end()) { + if (subspacesWithData_.find(static_cast(i)) != subspacesWithData_.end()) { offset += dsgu_.getSubspaceDataSizes()[i]; } } @@ -84,7 +84,7 @@ class DistributedSparseGridDataContainer { SubspaceSizeType offset = 0; for (size_t i = 0; i < kahanDataBegin_.size(); i++) { kahanDataBegin_[i] = kahanData_.data() + offset; - if (subspacesWithData_.find(i) != subspacesWithData_.end()) { + if (subspacesWithData_.find(static_cast(i)) != subspacesWithData_.end()) { offset += dsgu_.getSubspaceDataSizes()[i]; } } @@ -213,6 +213,7 @@ class DistributedSparseGridUniform : public AnyDistributedSparseGrid { // return level vector of subspace i inline const LevelVector& getLevelVector(SubspaceIndexType i) const; + inline const 
LevelVector& getLevelVector(size_t i) const; inline SubspaceIndexType getIndexInRange(const LevelVector& l, IndexType lowerBound) const; @@ -422,7 +423,7 @@ std::vector DistributedSparseGridUniform::createLevels( combigrid::createTruncatedHierarchicalLevels(nmax, lmin, created); // std::sort(created.begin(), created.end()); assert(std::is_sorted(created.begin(), created.end())); - if (created.size() > std::numeric_limits::max()) { + if (created.size() > static_cast(std::numeric_limits::max())) { throw std::runtime_error("number of subspaces exceeds the maximum value of SubspaceIndexType"); } return created; @@ -442,6 +443,11 @@ inline const LevelVector& DistributedSparseGridUniform::getLevelVect return *levelIterator; } +template +inline const LevelVector& DistributedSparseGridUniform::getLevelVector(size_t i) const { + return this->getLevelVector(static_cast(i)); +} + template typename DistributedSparseGridUniform::SubspaceIndexType DistributedSparseGridUniform::getIndexInRange(const LevelVector& l, @@ -473,7 +479,7 @@ template inline FG_ELEMENT* DistributedSparseGridUniform::getData(SubspaceIndexType i) { #ifndef NDEBUG assert(isSubspaceDataCreated()); - assert(i < this->subspacesDataContainer_.subspaces_.size()); + assert(static_cast(i) < this->subspacesDataContainer_.subspaces_.size()); assert(this->subspacesDataContainer_.subspaces_[i] <= &(*(this->subspacesDataContainer_.subspacesData_.end()))); // if (this->subspacesDataContainer_.subspaces_[i] == @@ -489,7 +495,7 @@ inline const FG_ELEMENT* DistributedSparseGridUniform::getData( SubspaceIndexType i) const { #ifndef NDEBUG assert(isSubspaceDataCreated()); - assert(i < this->subspacesDataContainer_.subspaces_.size()); + assert(static_cast(i) < this->subspacesDataContainer_.subspaces_.size()); assert(this->subspacesDataContainer_.subspaces_[i] <= &(*(this->subspacesDataContainer_.subspacesData_.end()))); if (this->subspacesDataContainer_.subspaces_[i] == @@ -617,7 +623,7 @@ void DistributedSparseGridUniform::accumulateMinMaxCoefficients() { this->getCurrentlyAllocatedSubspaces().cend()); #pragma omp parallel for default(none) schedule(guided) \ shared(currentlyAllocatedSubspaceIndices, smaller_real) - for (SubspaceIndexType iAllocated = 0; iAllocated < currentlyAllocatedSubspaceIndices.size(); + for (size_t iAllocated = 0; iAllocated < currentlyAllocatedSubspaceIndices.size(); ++iAllocated) { SubspaceIndexType i = currentlyAllocatedSubspaceIndices[iAllocated]; if (this->getSubspaceDataSizes()[i] > 0) { @@ -664,7 +670,7 @@ inline void DistributedSparseGridUniform::registerDistributedFullGri if (subSgDataSize == 0) { this->setDataSize(index, numPointsOfSubspace); } else { - ASSERT(subSgDataSize == numPointsOfSubspace, + ASSERT(static_cast(subSgDataSize) == numPointsOfSubspace, "subSgDataSize: " << subSgDataSize << ", numPointsOfSubspace: " << numPointsOfSubspace << " , level " << level << " , rank " << this->rank_ << std::endl); @@ -713,7 +719,7 @@ inline void DistributedSparseGridUniform::addDistributedFullGrid( assert(sDataSize == this->getDataSize(sIndex)); assert(sDataSize == this->getAllocatedDataSize(sIndex)); assert(std::distance(this->getData(0), sPointer) < - this->subspacesDataContainer_.subspacesData_.size()); + static_cast(this->subspacesDataContainer_.subspacesData_.size())); auto kDataSize = this->subspacesDataContainer_.kahanDataBegin_[sIndex + 1] - kPointer; assert(kDataSize == this->getDataSize(sIndex)); } diff --git a/src/task/Task.cpp b/src/task/Task.cpp index 1417afcb2..d106b89ed 100644 --- a/src/task/Task.cpp +++ 
b/src/task/Task.cpp @@ -28,7 +28,7 @@ Task::Task(const LevelVector& l, const std::vector& boundary, real Task::~Task() { delete faultCriterion_; } -const DistributedFullGrid& Task::getDistributedFullGrid(int n) const { +const DistributedFullGrid& Task::getDistributedFullGrid(size_t n) const { throw std::runtime_error("const getDistributedFullGrid called but not implemented"); } diff --git a/src/task/Task.hpp b/src/task/Task.hpp index 2e2ef5574..9b12e7842 100644 --- a/src/task/Task.hpp +++ b/src/task/Task.hpp @@ -77,7 +77,7 @@ class Task { } virtual void init(CommunicatorType lcomm, - std::vector decomposition = std::vector()) = 0; + const std::vector& decomposition = std::vector()) = 0; // inline real estimateRuntime() const; @@ -90,9 +90,9 @@ class Task { int n = 0) = 0; // This method returns the local part of the n-th distributedFullGrid - virtual DistributedFullGrid& getDistributedFullGrid(int n = 0) = 0; + virtual DistributedFullGrid& getDistributedFullGrid(size_t n = 0) = 0; - virtual const DistributedFullGrid& getDistributedFullGrid(int n) const; + virtual const DistributedFullGrid& getDistributedFullGrid(size_t n) const; virtual void setZero() = 0; diff --git a/src/third_level/NetworkUtils.cpp b/src/third_level/NetworkUtils.cpp index e38fa9d9a..1c744fc15 100644 --- a/src/third_level/NetworkUtils.cpp +++ b/src/third_level/NetworkUtils.cpp @@ -94,7 +94,6 @@ bool ClientSocket::recvall(std::string& mesg, size_t len, int flags) const { assert(isInitialized() && "Client Socket not initialized"); assert(len > 0); //std::cout << "Trying to receive " << len << "Bytes" << std::endl; - ssize_t recvd = -1; std::unique_ptr buff(new char[len]); auto ret = this->recvall(buff.get(), len, flags); mesg = std::string(buff.get(), len); diff --git a/src/third_level/NetworkUtils.hpp b/src/third_level/NetworkUtils.hpp index 605d86324..0794a375e 100644 --- a/src/third_level/NetworkUtils.hpp +++ b/src/third_level/NetworkUtils.hpp @@ -189,7 +189,7 @@ bool ClientSocket::recvallBinaryAndCorrectInPlace(FG_ELEMENT* buff, assert(isInitialized() && "Client Socket not initialized"); // receive endianness char temp = ' '; - auto recvAllSuccess = recvall(&temp, 1); + [[maybe_unused]] auto recvAllSuccess = recvall(&temp, 1); assert(recvAllSuccess && "Receiving Endianess failed"); bool hasSameEndianness = bool(temp) == NetworkUtils::isLittleEndian(); @@ -300,7 +300,7 @@ bool ClientSocket::recvallBinaryAndReduceInPlace(FG_ELEMENT* buff, assert(isInitialized() && "Client Socket not initialized"); // receive endianness char temp = ' '; - auto recvSuccess = recvall(&temp, 1); + [[maybe_unused]] auto recvSuccess = recvall(&temp, 1); assert(recvSuccess && "Receiving Endianess failed"); bool hasSameEndianness = bool(temp) == NetworkUtils::isLittleEndian(); diff --git a/src/third_level/ThirdLevelUtils.hpp b/src/third_level/ThirdLevelUtils.hpp index c3ffe32cb..b5212e09f 100644 --- a/src/third_level/ThirdLevelUtils.hpp +++ b/src/third_level/ThirdLevelUtils.hpp @@ -94,9 +94,9 @@ namespace combigrid { { assert(isConnected_); size_t rawSize = receiveSize(); - size_t recvSize = (rawSize - 1) / sizeof(FG_ELEMENT); // - 1 due to endianness + [[maybe_unused]] size_t recvSize = (rawSize - 1) / sizeof(FG_ELEMENT); // - 1 due to endianness assert(recvSize == size && "Size mismatch receiving data size does not match expected"); - bool success = connection_->recvallBinaryAndCorrectInPlace(data, size); + [[maybe_unused]] bool success = connection_->recvallBinaryAndCorrectInPlace(data, size); assert(success && "receiving dsgu data failed"); 
} @@ -106,9 +106,9 @@ namespace combigrid { { assert(isConnected_); size_t rawSize = receiveSize(); - size_t recvSize = (rawSize - 1) / sizeof(FG_ELEMENT); // - 1 due to endianness + [[maybe_unused]] size_t recvSize = (rawSize - 1) / sizeof(FG_ELEMENT); // - 1 due to endianness assert(recvSize == size && "Size mismatch cannot add vectors of different size"); - bool success = connection_->recvallBinaryAndReduceInPlace(data, size, + [[maybe_unused]] bool success = connection_->recvallBinaryAndReduceInPlace(data, size, [](const FG_ELEMENT& lhs, const FG_ELEMENT& rhs) -> FG_ELEMENT {return lhs + rhs;}); assert(success && "receiving dsgu data failed"); } diff --git a/src/utils/DecompositionUtils.hpp b/src/utils/DecompositionUtils.hpp new file mode 100644 index 000000000..b5201ad6b --- /dev/null +++ b/src/utils/DecompositionUtils.hpp @@ -0,0 +1,94 @@ +#pragma once +#include + +#include "utils/Types.hpp" +namespace combigrid { + +/* a regular (equidistant) domain decompositioning for an even number of processes + * leads to grid points on the (geometrical) process boundaries. + * with the forwardDecomposition flag it can be decided if the grid points on + * the process boundaries belong to the process on the right-hand side (true) + * of the process boundary, or to the one on the left-hand side (false). + */ +static inline std::vector getDefaultDecomposition( + const IndexVector& globalNumPointsPerDimension, + const std::vector& cartesianProcsPerDimension, bool forwardDecomposition) { + auto dim = static_cast(globalNumPointsPerDimension.size()); + assert(cartesianProcsPerDimension.size() == dim); + + // create decomposition vectors + std::vector decomposition(dim); + for (DimType i = 0; i < dim; ++i) { + if (cartesianProcsPerDimension[i] > globalNumPointsPerDimension[i]) { + throw std::invalid_argument( + "change to coarser parallelization! 
currently not all processes can have points."); + } + IndexVector& llbnd = decomposition[i]; + llbnd.resize(cartesianProcsPerDimension[i]); + + for (int j = 0; j < cartesianProcsPerDimension[i]; ++j) { + double tmp = static_cast(globalNumPointsPerDimension[i]) * static_cast(j) / + static_cast(cartesianProcsPerDimension[i]); + + if (forwardDecomposition) + llbnd[j] = static_cast(std::ceil(tmp)); + else + llbnd[j] = static_cast(std::floor(tmp)); + } + } + return decomposition; +} + +inline static std::vector getStandardDecomposition(LevelVector lref, + std::vector& procsRef) { + assert(lref.size() == procsRef.size()); + std::vector decomposition; + for (DimType d = 0; d < static_cast(lref.size()); ++d) { + IndexVector di; + if (procsRef[d] == 1) { + di = {0}; + } else if (procsRef[d] == 2) { + di = {0, powerOfTwo[lref[d]] / procsRef[d] + 1}; + } else if (procsRef[d] == 3) { + di = {0, powerOfTwo[lref[d]] / procsRef[d] + 1, 2 * powerOfTwo[lref[d]] / procsRef[d] + 1}; + } else if (procsRef[d] == 4) { + di = {0, powerOfTwo[lref[d]] / procsRef[d] + 1, 2 * powerOfTwo[lref[d]] / procsRef[d] + 1, + 3 * powerOfTwo[lref[d]] / procsRef[d] + 1}; + } else { + throw std::runtime_error("please implement a test decomposition matching procs and lref"); + } + decomposition.push_back(di); + } + return decomposition; +} + +static inline std::vector downsampleDecomposition( + const std::vector& decomposition, const LevelVector& referenceLevel, + const LevelVector& newLevel, const std::vector& boundary) { + auto newDecomposition = decomposition; + if (decomposition.size() > 0) { + for (DimType d = 0; d < static_cast(referenceLevel.size()); ++d) { + // for now, assume that we never want to interpolate on a level finer than referenceLevel + assert(referenceLevel[d] >= newLevel[d]); + auto levelDiff = referenceLevel[d] - newLevel[d]; + auto stepFactor = oneOverPowOfTwo[levelDiff]; + if (boundary[d] > 0) { + // all levels contain the boundary points -> point 0 is the same + for (auto& dec : newDecomposition[d]) { + dec = static_cast(std::ceil(static_cast(dec) * stepFactor)); + } + } else { + // all levels do not contain the boundary points -> mid point is the same + auto leftProtrusion = powerOfTwo[levelDiff] - 1; + for (auto& dec : newDecomposition[d]) { + // same as before, but subtract the "left" protrusion on the finer level + dec = static_cast( + std::max(0., std::ceil(static_cast(dec - leftProtrusion) * stepFactor))); + } + } + } + } + return newDecomposition; +} + +} // namespace combigrid \ No newline at end of file diff --git a/src/utils/LevelSetUtils.cpp b/src/utils/LevelSetUtils.cpp index 8eb17cc56..fcabc46bc 100644 --- a/src/utils/LevelSetUtils.cpp +++ b/src/utils/LevelSetUtils.cpp @@ -11,8 +11,10 @@ void getDownSetRecursively(combigrid::LevelVector const& l, combigrid::LevelVect } else { // for the whole range of values in dimension d size_t d = fixedDimensions.size(); -// #pragma omp parallel for shared(l, fixedDimensions, downSet) firstprivate(d) default(none) \ -// schedule(static) //leads to unordered downSet, avoid in favor of registration + /* + // #pragma omp parallel for shared(l, fixedDimensions, downSet) firstprivate(d) default(none) \ + // schedule(static) //leads to unordered downSet, avoid in favor of registration + */ for (LevelType i = 1; i <= l[d]; ++i) { // levels start at 1 here, in compliance with our // definition of DistributedSparseGrid auto moreDimensionsFixed = fixedDimensions; diff --git a/src/utils/LevelSetUtils.hpp b/src/utils/LevelSetUtils.hpp index a6f9c4f3d..a0c89fec6 
100644 --- a/src/utils/LevelSetUtils.hpp +++ b/src/utils/LevelSetUtils.hpp @@ -116,7 +116,7 @@ class HypercubeDownSetGenerator { { if (!this->isFinished()) { // find first dimension that can be increased - for (int i = (levelUpTo_.size() - 1); i > -1; --i) { + for (auto i = static_cast(levelUpTo_.size() - 1); i > -1; --i) { if (currentLevel_[i] < levelUpTo_[i]) { ++currentLevel_[i]; break; diff --git a/src/utils/MonteCarlo.cpp b/src/utils/MonteCarlo.cpp index e92093f17..b587cb142 100644 --- a/src/utils/MonteCarlo.cpp +++ b/src/utils/MonteCarlo.cpp @@ -50,7 +50,7 @@ std::vector> getRandomCoordinates(int numCoordinates, size_t d }; for (auto & coord : randomCoords) { std::generate(begin(coord), end(coord), gen); - for (auto & c : coord){ + for ([[maybe_unused]] const auto & c : coord){ assert( c <= 1. && c >=0. ); } } diff --git a/tests/TaskConst.hpp b/tests/TaskConst.hpp index 3e3a84a01..03a4c74bb 100644 --- a/tests/TaskConst.hpp +++ b/tests/TaskConst.hpp @@ -17,7 +17,7 @@ class TaskConst : public combigrid::Task { assert(l.size() == 2); } - void init(CommunicatorType lcomm, std::vector decomposition) { + void init(CommunicatorType lcomm, const std::vector& decomposition) { // parallelization // assert(dfg_ == nullptr); auto nprocs = getCommSize(lcomm); @@ -65,7 +65,7 @@ class TaskConst : public combigrid::Task { dfg_->gatherFullGrid(fg, r); } - DistributedFullGrid& getDistributedFullGrid(int n = 0) { return *dfg_; } + DistributedFullGrid& getDistributedFullGrid(size_t n = 0) override { return *dfg_; } void setZero() { BOOST_CHECK(true); } diff --git a/tests/TaskConstParaboloid.hpp b/tests/TaskConstParaboloid.hpp index 686f94f4f..06435cac0 100644 --- a/tests/TaskConstParaboloid.hpp +++ b/tests/TaskConstParaboloid.hpp @@ -59,7 +59,7 @@ class TaskConstParaboloid : public combigrid::Task { BOOST_TEST_CHECKPOINT("TaskConstParaboloid constructor"); } - void init(CommunicatorType lcomm, std::vector decomposition) override { + void init(CommunicatorType lcomm, const std::vector& decomposition) override { // parallelization // assert(dfg_ == nullptr); auto nprocs = getCommSize(lcomm); @@ -107,18 +107,18 @@ class TaskConstParaboloid : public combigrid::Task { dfg_->gatherFullGrid(fg, r); } - DistributedFullGrid& getDistributedFullGrid(int n = 0) override { + DistributedFullGrid& getDistributedFullGrid(size_t n = 0) override { BOOST_TEST_CHECKPOINT("TaskConstParaboloid getDFG"); return *dfg_; } - const DistributedFullGrid& getDistributedFullGrid(int n = 0) const override { + const DistributedFullGrid& getDistributedFullGrid(size_t n = 0) const override { BOOST_TEST_CHECKPOINT("TaskConstParaboloid getDFG const"); return *dfg_; } real getCurrentTime() const override { - return nsteps_; + return static_cast(nsteps_); } void setZero() override { BOOST_CHECK(true); } diff --git a/tests/TaskCount.hpp b/tests/TaskCount.hpp index 0af089078..44bdd6bbb 100644 --- a/tests/TaskCount.hpp +++ b/tests/TaskCount.hpp @@ -36,7 +36,7 @@ class TaskCount : public combigrid::Task { BOOST_TEST_CHECKPOINT("TaskCount constructor"); } - void init(CommunicatorType lcomm, std::vector decomposition) override { + void init(CommunicatorType lcomm, const std::vector& decomposition) override { auto nprocs = getCommSize(lcomm); std::vector p; if (decomposition.size() == 0) { @@ -85,7 +85,7 @@ class TaskCount : public combigrid::Task { dfg_->gatherFullGrid(fg, r); } - DistributedFullGrid& getDistributedFullGrid(int n = 0) override { return *dfg_; } + DistributedFullGrid& getDistributedFullGrid(size_t n = 0) override { return *dfg_; } 
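Reviewer note (not part of the patch): the HypercubeDownSetGenerator change above replaces a plain int loop index with a cast of levelUpTo_.size() - 1; the cast's target type is not visible in this rendering, so std::int64_t in the sketch below is only an assumption. The point of the change is that a descending loop compared against -1 needs a signed index: with an unsigned index, 'i > -1' promotes -1 to the maximum unsigned value and the body never runs, while 'i >= 0' never terminates.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> v{1, 2, 3};
  // Cast the unsigned size to a signed type before counting down to -1.
  for (auto i = static_cast<std::int64_t>(v.size()) - 1; i > -1; --i) {
    std::cout << v[static_cast<std::size_t>(i)] << '\n';
  }
  return 0;
}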
void setZero() override { BOOST_TEST_CHECKPOINT("TaskCount setZero"); diff --git a/tests/test_distributedfullgrid.cpp b/tests/test_distributedfullgrid.cpp index 8ccc6bbc8..19fb08e8a 100644 --- a/tests/test_distributedfullgrid.cpp +++ b/tests/test_distributedfullgrid.cpp @@ -229,7 +229,7 @@ void checkDistributedFullgrid(LevelVector& levels, std::vector& procs, // check that InnerNodalBasisFunctionIntegral is correct double numFullInnerBasisFcns = 1.; for (DimType d = 0; d < dim; ++d) { - numFullInnerBasisFcns *= static_cast(dfg.length(d) - 1); + numFullInnerBasisFcns *= static_cast(dfg.globalNumPointsInDimension(d) - 1); } BOOST_CHECK_CLOSE(dfg.getInnerNodalBasisFunctionIntegral(), 1. / numFullInnerBasisFcns, TestHelper::tolerance); @@ -271,7 +271,7 @@ void checkDistributedFullgrid(LevelVector& levels, std::vector& procs, --globAxisIndex[d]; if (boundary[d] == 1 && globAxisIndex[d] < 0) { // wrap around - globAxisIndex[d] += dfg.length(d); + globAxisIndex[d] += dfg.globalNumPointsInDimension(d); } std::vector coords(dim); dfg.getCoordsGlobal(dfg.getGlobalLinearIndex(globAxisIndex), coords); diff --git a/tests/test_distributedsparsegrid.cpp b/tests/test_distributedsparsegrid.cpp index e25a16a2f..f441bfac1 100644 --- a/tests/test_distributedsparsegrid.cpp +++ b/tests/test_distributedsparsegrid.cpp @@ -19,6 +19,7 @@ #include "sparsegrid/DistributedSparseGridIO.hpp" #include "sparsegrid/DistributedSparseGridUniform.hpp" #include "sparsegrid/SGrid.hpp" +#include "utils/DecompositionUtils.hpp" #include "utils/IndexVector.hpp" #include "utils/LevelSetUtils.hpp" #include "utils/MonteCarlo.hpp" @@ -185,7 +186,7 @@ void checkDistributedSparsegrid(LevelVector& lmin, LevelVector& lmax, std::vecto } auto reduceSuccess = DistributedSparseGridIO::readOneFileAndReduce(*uniDSGfromSubspaces, "test_sg_all", 1); - BOOST_CHECK_EQUAL(readSuccess, uniDSGfromSubspaces->getRawDataSize()); + BOOST_CHECK_EQUAL(reduceSuccess, uniDSGfromSubspaces->getRawDataSize()); if (readSuccess) { BOOST_TEST_CHECKPOINT("compare double values"); for (size_t i = 0; i < uniDSG->getRawDataSize(); ++i) { @@ -1037,9 +1038,7 @@ BOOST_AUTO_TEST_CASE(test_sparseGridAndSubspaceReduce) { const auto& subspacesByComm = uniDSG->getSubspacesByCommunicator(); std::set checkedSubspaces{}; for (const auto& subspaces : subspacesByComm) { - auto commSize = combigrid::getCommSize(subspaces.first); for (const auto& subspace : subspaces.second) { - auto subspaceStart = uniDSG->getData(subspace); for (SubspaceSizeType j = 0; j < uniDSG->getDataSize(subspace); ++j) { // BOOST_CHECK_EQUAL(subspaceStart[j], ??); } @@ -1082,9 +1081,7 @@ BOOST_AUTO_TEST_CASE(test_sparseGridAndSubspaceReduce) { // check that the data is correct checkedSubspaces.clear(); for (const auto& subspaces : subspacesByComm) { - auto commSize = combigrid::getCommSize(subspaces.first); for (const auto& subspace : subspaces.second) { - auto subspaceStart = uniDSG->getData(subspace); for (SubspaceSizeType j = 0; j < uniDSG->getDataSize(subspace); ++j) { // BOOST_CHECK_EQUAL(subspaceStart[j], ??); } diff --git a/tests/test_ftolerance.cpp b/tests/test_ftolerance.cpp index ac6cbbd8a..cac7a5712 100644 --- a/tests/test_ftolerance.cpp +++ b/tests/test_ftolerance.cpp @@ -65,7 +65,7 @@ class TaskAdvFDM : public combigrid::Task { } void init(CommunicatorType lcomm, - std::vector decomposition = std::vector()) { + const std::vector& decomposition = std::vector()) override { // only use one process per group std::vector p(getDim(), 1); @@ -94,8 +94,8 @@ class TaskAdvFDM : public combigrid::Task { // 
gradient of phi std::vector dphi(getDim()); - IndexType l0 = dfg_->length(0); - IndexType l1 = dfg_->length(1); + IndexType l0 = dfg_->globalNumPointsInDimension(0); + IndexType l1 = dfg_->globalNumPointsInDimension(1); double h0 = 1.0 / (double)l0; double h1 = 1.0 / (double)l1; @@ -139,7 +139,7 @@ class TaskAdvFDM : public combigrid::Task { dfg_->gatherFullGrid(fg, r); } - DistributedFullGrid& getDistributedFullGrid(int n = 0) { return *dfg_; } + DistributedFullGrid& getDistributedFullGrid(size_t n = 0) override { return *dfg_; } void setZero() {} @@ -189,7 +189,7 @@ class TaskAdvFDM : public combigrid::Task { //real t = dt_ * nsteps_ * combiStep_; if (combiStep_ != 0 && faultCriterion_->failNow(combiStep_, -1.0, globalRank)){ std::cout<<"Rank "<< globalRank <<" failed at iteration "<getManagerRank(), TRANSFER_STATUS_TAG, theMPISystem()->getGlobalCommFT() ); diff --git a/tests/test_helper.hpp b/tests/test_helper.hpp index c76e1762c..5b8bfc20c 100644 --- a/tests/test_helper.hpp +++ b/tests/test_helper.hpp @@ -39,6 +39,9 @@ namespace TestHelper{ return MPI_COMM_NULL; } } + static inline MPI_Comm getComm(size_t nprocs) { + return getComm(static_cast(nprocs)); + } static inline MPI_Comm getCommSelfAsCartesian(combigrid::DimType dimensionality) { std::vector dims(dimensionality, 1); diff --git a/tests/test_hierarchization.cpp b/tests/test_hierarchization.cpp index 9caf0bfaf..5939459c7 100644 --- a/tests/test_hierarchization.cpp +++ b/tests/test_hierarchization.cpp @@ -192,7 +192,7 @@ real checkConservationOfMomentum(DistributedFullGrid& dfg, BOOST_CHECK(std::all_of(boundary.begin(), boundary.end(), [](BoundaryType b) { return b == 2; })); real mcMassBefore = getMonteCarloMass(dfg, nPointsMonteCarlo); auto mcMomentaBefore = getMonteCarloMomenta(dfg, nPointsMonteCarlo); - real mcMomentumBefore = mcMomentaBefore.back(); + // real mcMomentumBefore = mcMomentaBefore.back(); BOOST_TEST_CHECKPOINT("begin hierarchization"); auto dim = dfg.getDimension(); @@ -1428,7 +1428,7 @@ BOOST_AUTO_TEST_CASE(momentum) { fillDFGbyFunction(constFctn, dfg3); // usually, momentum will not be conserved for periodic BC, but for constant functions it // should work - auto momentum = checkConservationOfMomentum( + [[maybe_unused]] auto momentum = checkConservationOfMomentum( dfg3, DistributedHierarchization::hierarchizeBiorthogonalPeriodic); } { diff --git a/tests/test_integration.cpp b/tests/test_integration.cpp index 691f2a405..c7a46565b 100644 --- a/tests/test_integration.cpp +++ b/tests/test_integration.cpp @@ -358,7 +358,7 @@ void checkPassingHierarchicalBases(size_t ngroup = 1, size_t nprocs = 1) { combischeme.createAdaptiveCombischeme(); std::vector levels = combischeme.getCombiSpaces(); std::vector coeffs = combischeme.getCoeffs(); - auto numDOF = printCombiDegreesOfFreedom(levels, boundary); + [[maybe_unused]] auto numDOF = printCombiDegreesOfFreedom(levels, boundary); // create Tasks TaskContainer tasks; diff --git a/tests/test_io.cpp b/tests/test_io.cpp index b9b37a0bf..4d59cb9d9 100644 --- a/tests/test_io.cpp +++ b/tests/test_io.cpp @@ -86,7 +86,7 @@ void checkCompressionWithHeader(size_t numValues = 1000000) { std::string filename = "compression_testfile_" + std::to_string(theMPISystem()->getProcessGroupNumber()); - auto numBytesWritten = mpiio::writeCompressedValuesConsecutive( + mpiio::writeCompressedValuesConsecutive( originalValues.data(), originalValues.size(), filename, localComm); std::vector decompressedValues(originalValues.size()); diff --git a/tests/test_reduce.cpp b/tests/test_reduce.cpp index 
c07ad53f5..8d805e8ae 100644 --- a/tests/test_reduce.cpp +++ b/tests/test_reduce.cpp @@ -37,7 +37,7 @@ class TaskConst : public combigrid::Task { assert(l.size() == 2); } - void init(CommunicatorType lcomm, std::vector decomposition) { + void init(CommunicatorType lcomm, const std::vector& decomposition) { // parallelization // assert(dfg_ == nullptr); auto nprocs = getCommSize(lcomm); @@ -87,7 +87,7 @@ class TaskConst : public combigrid::Task { dfg_->gatherFullGrid(fg, r); } - DistributedFullGrid& getDistributedFullGrid(int n = 0) { return *dfg_; } + DistributedFullGrid& getDistributedFullGrid(size_t n = 0) override { return *dfg_; } void setZero() {} diff --git a/tests/test_rescheduling.cpp b/tests/test_rescheduling.cpp index 6dcd02f9b..b4e13081b 100644 --- a/tests/test_rescheduling.cpp +++ b/tests/test_rescheduling.cpp @@ -77,7 +77,7 @@ class TestingTask : public combigrid::Task { assert(l.size() == 2); } - void init(CommunicatorType lcomm, std::vector decomposition) override { + void init(CommunicatorType lcomm, const std::vector& decomposition) override { // parallelization // assert(dfg_ == nullptr); auto nprocs = getCommSize(lcomm); @@ -113,7 +113,7 @@ class TestingTask : public combigrid::Task { dfg_->gatherFullGrid(fg, r); } - DistributedFullGrid& getDistributedFullGrid(int n = 0) override { return *dfg_; } + DistributedFullGrid& getDistributedFullGrid(size_t n = 0) override { return *dfg_; } void setZero() override {} diff --git a/tests/test_task.cpp b/tests/test_task.cpp index b6cd770f9..7d482b6a7 100644 --- a/tests/test_task.cpp +++ b/tests/test_task.cpp @@ -26,7 +26,7 @@ class TaskTest : public combigrid::Task { : Task(l, boundary, coeff, loadModel), test(t) {} void init(CommunicatorType lcomm, - std::vector decomposition = std::vector()) override { + const std::vector& decomposition = std::vector()) override { // create dummy dfg std::vector p(getDim(), 1); dfg_ = new OwningDistributedFullGrid(getDim(), getLevelVector(), lcomm, @@ -39,7 +39,7 @@ class TaskTest : public combigrid::Task { dfg_->gatherFullGrid(fg, r); } - DistributedFullGrid& getDistributedFullGrid(int n = 0) override { return *dfg_; } + DistributedFullGrid& getDistributedFullGrid(size_t n = 0) override { return *dfg_; } void setZero() override {} diff --git a/tests/test_tensor.cpp b/tests/test_tensor.cpp index 131ad2808..168a550ff 100644 --- a/tests/test_tensor.cpp +++ b/tests/test_tensor.cpp @@ -102,7 +102,7 @@ BOOST_AUTO_TEST_CASE(test_iterate_lower_boundaries_6d) { IndexType numberOfPointsVisited = 0; const auto strideInThisDimension = tensor.getOffsetsVector()[d]; const IndexType jump = strideInThisDimension * tensor.getExtentsVector()[d]; - const IndexType numberOfPolesHigherDimensions = tensor.size() / jump; + const IndexType numberOfPolesHigherDimensions = static_cast(tensor.size()) / jump; IndexType tensorLowestLayerIteratedIndex = 0; BOOST_TEST_CHECKPOINT("iterate lowest layer of tensor"); diff --git a/tests/test_thirdLevel.cpp b/tests/test_thirdLevel.cpp index d5b73299c..b257fc87f 100644 --- a/tests/test_thirdLevel.cpp +++ b/tests/test_thirdLevel.cpp @@ -22,6 +22,7 @@ #include "sparsegrid/DistributedSparseGridUniform.hpp" #include "task/Task.hpp" #include "utils/Config.hpp" +#include "utils/DecompositionUtils.hpp" #include "utils/MonteCarlo.hpp" #include "utils/Types.hpp" #include "stdlib.h" @@ -581,7 +582,7 @@ void testCombineThirdLevelWithoutManagers( true); if (testParams.nprocs > 1 && thirdLevelExtraSparseGrid) { - theMPISystem()->initOuputGroupComm(testParams.nprocs / 2); + 
theMPISystem()->initOutputGroupComm(testParams.nprocs / 2); } auto loadmodel = std::unique_ptr(new LinearLoadModel()); std::vector boundary(testParams.dim, testParams.boundary); @@ -612,7 +613,7 @@ void testCombineThirdLevelWithoutManagers( for (size_t i = 0; i < systemLevels.size(); ++i) { // assign round-robin to process groups - if (i % theMPISystem()->getNumGroups() == theMPISystem()->getProcessGroupNumber()) { + if (static_cast(i % theMPISystem()->getNumGroups()) == theMPISystem()->getProcessGroupNumber()) { // find index in full list auto position = std::find(systemLevels.begin(), systemLevels.end(), systemLevels[i]); BOOST_REQUIRE(position != systemLevels.end()); @@ -790,9 +791,7 @@ void testPretendThirdLevel(TestParams& testParams) { BOOST_CHECK(testParams.comm != MPI_COMM_NULL); combigrid::Stats::initialize(); - - size_t procsPerSys = testParams.ngroup * testParams.nprocs + 1; - + // size_t procsPerSys = testParams.ngroup * testParams.nprocs + 1; theMPISystem()->initWorldReusable(testParams.comm, testParams.ngroup, testParams.nprocs); WORLD_MANAGER_EXCLUSIVE_SECTION { diff --git a/tests/test_worker_only.cpp b/tests/test_worker_only.cpp index cdde15666..c3539c46c 100644 --- a/tests/test_worker_only.cpp +++ b/tests/test_worker_only.cpp @@ -199,7 +199,7 @@ void checkWorkerOnly(size_t ngroup = 1, size_t nprocs = 1, BoundaryType boundary if (pretendThirdLevel || params.getCombinationVariant() == CombinationVariant::sparseGridReduce) { BOOST_TEST_CHECKPOINT("write DSGS " + filename); - numWritten = worker.writeDSGsToDisk(filename); + numWritten = worker.writeDSGsToDisk(filename, ""); BOOST_CHECK(numWritten > 0); } } diff --git a/third_level_manager/test_uftp.cpp b/third_level_manager/test_uftp.cpp index 7b89636f3..ccd2f514b 100644 --- a/third_level_manager/test_uftp.cpp +++ b/third_level_manager/test_uftp.cpp @@ -24,7 +24,7 @@ using namespace combigrid; -void mockUpDSGWriteToDisk(std::string filePrefix, +void mockUpDSGWriteToDisk(const std::string& filePrefix, const std::vector& dsgPartitionSizes) { for (size_t partitionIndex = 0; partitionIndex < dsgPartitionSizes.size(); ++partitionIndex) { std::string myFilename = filePrefix + std::to_string(partitionIndex); @@ -42,7 +42,7 @@ void mockUpDSGWriteToDisk(std::string filePrefix, } } -void writeRandomDataToDisk(std::string filePrefix, +void writeRandomDataToDisk(const std::string& filePrefix, const std::vector& dsgPartitionSizes) { Stats::startEvent("uftp write"); auto actualPartitionSizes = dsgPartitionSizes; @@ -53,7 +53,7 @@ void writeRandomDataToDisk(std::string filePrefix, auto sum = std::accumulate(actualPartitionSizes.begin(), actualPartitionSizes.end(), 0); actualPartitionSizes.back() += (dsgPartitionSizes[0] - sum); } - auto sumDOF = std::accumulate(actualPartitionSizes.begin(), actualPartitionSizes.end(), 0); + // [[maybe_unused]] auto sumDOF = std::accumulate(actualPartitionSizes.begin(), actualPartitionSizes.end(), 0); std::string myFilename = filePrefix + std::to_string(0); // cf. https://stackoverflow.com/a/47742514 { @@ -75,7 +75,7 @@ void writeRandomDataToDisk(std::string filePrefix, Stats::stopEvent("uftp write"); } -void createLargeFile(std::string filePrefix, const std::vector& dsgPartitionSizes) { +void createLargeFile(const std::string& filePrefix, const std::vector& dsgPartitionSizes) { Stats::startEvent("uftp create file"); // cf. 
https://stackoverflow.com/a/47742514 { @@ -90,7 +90,7 @@ void createLargeFile(std::string filePrefix, const std::vector& d Stats::stopEvent("uftp create file"); } -void validateExchangedData(std::string filePrefix, std::string tokenToWaitFor, +void validateExchangedData(const std::string& filePrefix, const std::string& tokenToWaitFor, const std::vector& dsgPartitionSizes) { // generate data on heap std::unique_ptr> mockUpData(new std::vector(dsgPartitionSizes[0])); @@ -118,7 +118,7 @@ void validateExchangedData(std::string filePrefix, std::string tokenToWaitFor, } } -void checkSizeOfFile(std::string filePrefix, std::string tokenToWaitFor, +void checkSizeOfFile(const std::string& filePrefix, const std::string& tokenToWaitFor, const std::vector& dsgPartitionSizes) { Stats::startEvent("uftp wait check size"); Stats::startEvent("uftp wait"); @@ -146,7 +146,7 @@ void checkSizeOfFile(std::string filePrefix, std::string tokenToWaitFor, Stats::stopEvent("uftp wait check size"); } -void readAndInvertDSGFromDisk(std::string filePrefixIn, std::string filePrefixOut, +void readAndInvertDSGFromDisk(const std::string& filePrefixIn, const std::string& filePrefixOut, std::string tokenToWaitFor, const std::vector& dsgPartitionSizes) { std::unique_ptr> readData(new std::vector(dsgPartitionSizes[0])); @@ -224,7 +224,6 @@ int main(int argc, char** argv) { // read in third level parameters if available std::string thirdLevelHost, thirdLevelSSHCommand = ""; unsigned int systemNumber = 0, numSystems = 1; - unsigned short thirdLevelPort = 0; bool hasThirdLevel = static_cast(cfg.get_child_optional("thirdLevel")); std::vector fractionsOfScheme; if (hasThirdLevel) { @@ -232,7 +231,6 @@ int main(int argc, char** argv) { thirdLevelHost = cfg.get("thirdLevel.host"); systemNumber = cfg.get("thirdLevel.systemNumber"); numSystems = cfg.get("thirdLevel.numSystems"); - thirdLevelPort = cfg.get("thirdLevel.port"); thirdLevelSSHCommand = cfg.get("thirdLevel.sshCommand", ""); bool hasFractions = static_cast(cfg.get_child_optional("thirdLevel.fractionsOfScheme")); if (hasFractions) { @@ -261,8 +259,7 @@ int main(int argc, char** argv) { std::vector levels; std::vector coeffs; std::vector taskNumbers; // only used in case of static task assignment - bool useStaticTaskAssignment = false; - long long int ctDOF = 0; + [[maybe_unused]] long long int ctDOF = 0; if (ctschemeFile == "") { /* generate a list of levelvectors and coefficients * CombiMinMaxScheme will create a classical combination scheme. 
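Reviewer note (not part of the patch): the signature changes in test_uftp.cpp above all follow one pattern, taking read-only std::string parameters by const reference so each call avoids a copy of the string. A minimal before/after sketch with illustrative names, not functions from the codebase:

#include <iostream>
#include <string>

// before: void printPrefixed(std::string prefix, int index);  // copies on every call
// after: read-only access, binds to the caller's string without copying
static void printPrefixed(const std::string& prefix, int index) {
  std::cout << prefix << index << '\n';
}

int main() {
  const std::string prefix = "dsg_part_";
  for (int i = 0; i < 3; ++i) {
    printPrefixed(prefix, i);  // no per-call allocation
  }
  return 0;
}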
@@ -288,10 +285,10 @@ int main(int argc, char** argv) { new CombiMinMaxSchemeFromFile(dim, lmin, lmax, ctschemeFile)); const auto& pgNumbers = scheme->getProcessGroupNumbers(); if (pgNumbers.size() > 0) { - useStaticTaskAssignment = true; + // useStaticTaskAssignment = true; const auto& allCoeffs = scheme->getCoeffs(); const auto& allLevels = scheme->getCombiSpaces(); - const auto [itMin, itMax] = std::minmax_element(pgNumbers.begin(), pgNumbers.end()); + [[maybe_unused]] const auto [itMin, itMax] = std::minmax_element(pgNumbers.begin(), pgNumbers.end()); assert(*itMin == 0); // make sure it starts with 0 // assert(*itMax == ngroup - 1); // and goes up to the maximum group //TODO // filter out only those tasks that belong to "our" process group diff --git a/third_level_manager/thirdLevelManager.cpp b/third_level_manager/thirdLevelManager.cpp index f7475a6a2..88562cfd9 100644 --- a/third_level_manager/thirdLevelManager.cpp +++ b/third_level_manager/thirdLevelManager.cpp @@ -77,7 +77,7 @@ void ThirdLevelManager::init() std::cout << "ThirdLevelManager::init(): Params are not loaded!" << std::endl; exit(0); } - auto serverInitSuccess = server_.init(); + [[maybe_unused]] auto serverInitSuccess = server_.init(); assert(serverInitSuccess); if (not server_.isInitialized()) { @@ -292,7 +292,7 @@ size_t ThirdLevelManager::forwardData(const System& sender, const System& receiv return dataSize; } -void ThirdLevelManager::writeStatistics(std::string filename) +void ThirdLevelManager::writeStatistics(const std::string& filename) { size_t walltime = stats_.getWallTime(); size_t numCombinations = stats_.getNumCombinations(); diff --git a/third_level_manager/thirdLevelManager.hpp b/third_level_manager/thirdLevelManager.hpp index dcb153ed9..9bd933ec7 100644 --- a/third_level_manager/thirdLevelManager.hpp +++ b/third_level_manager/thirdLevelManager.hpp @@ -44,7 +44,7 @@ class ThirdLevelManager void runtimeLoop(); - void writeStatistics(std::string filename = ""); + void writeStatistics(const std::string& filename = ""); }; } diff --git a/tools/errorCalc.cpp b/tools/errorCalc.cpp index 961056148..917b39a2c 100644 --- a/tools/errorCalc.cpp +++ b/tools/errorCalc.cpp @@ -58,8 +58,8 @@ int main(int argc, char** argv) { real tmp1 = 1.0 / l2norm1; real tmp2 = 1.0 / l2norm2; if (mode[0] == 'n') { - for (auto i = 0; i < data1.size(); ++i) data1[i] *= tmp1; - for (auto i = 0; i < data2.size(); ++i) data2[i] *= tmp2; + for (size_t i = 0; i < data1.size(); ++i) data1[i] *= tmp1; + for (size_t i = 0; i < data2.size(); ++i) data2[i] *= tmp2; } else if (mode[0] == 'a') { ; } else { @@ -76,7 +76,7 @@ int main(int argc, char** argv) { // calc l2 norm of values real err = 0.0; - for (auto i = 0; i < data1.size(); ++i) { + for (size_t i = 0; i < data1.size(); ++i) { real tmp = std::abs(data1[i] - data2[i]); if (std::abs(tmp) / std::abs(data1[i]) > 1e-12) @@ -121,7 +121,7 @@ void readPlotFile(const char* pltFileName, std::vector& data, pltFile.read((char*)&dim, sizeof(int)); std::vector res(dim); - for (size_t i = 0; i < dim; ++i) pltFile.read((char*)&res[i], sizeof(int)); + for (int i = 0; i < dim; ++i) pltFile.read((char*)&res[i], sizeof(int)); resolution.assign(res.begin(), res.end()); std::cout << "resolution " << resolution << std::endl; @@ -147,12 +147,12 @@ void readPlotFile(const char* pltFileName, std::vector& data, // copy data from local checkpoint to dfg // note that on the last process in some dimensions dfg is larger than the // local checkpoint - for (size_t n = 0; n < shape[0]; ++n) { - for (size_t m = 0; m < 
shape[1]; ++m) { - for (size_t l = 0; l < shape[2]; ++l) { - for (size_t k = 0; k < shape[3]; ++k) { - for (size_t j = 0; j < shape[4]; ++j) { - for (size_t i = 0; i < shape[5]; ++i) { + for (int n = 0; n < shape[0]; ++n) { + for (int m = 0; m < shape[1]; ++m) { + for (int l = 0; l < shape[2]; ++l) { + for (int k = 0; k < shape[3]; ++k) { + for (int j = 0; j < shape[4]; ++j) { + for (int i = 0; i < shape[5]; ++i) { data.push_back(grid[n][m][l][k][j][i]); } } @@ -162,11 +162,11 @@ void readPlotFile(const char* pltFileName, std::vector& data, } } else if (dim == 5) { auto grid = createMultiArrayRef(&tmp[0], shape); - for (size_t n = 0; n < shape[0]; ++n) { - for (size_t m = 0; m < shape[1]; ++m) { - for (size_t l = 0; l < shape[2]; ++l) { - for (size_t k = 0; k < shape[3]; ++k) { - for (size_t j = 0; j < shape[4]; ++j) { + for (int n = 0; n < shape[0]; ++n) { + for (int m = 0; m < shape[1]; ++m) { + for (int l = 0; l < shape[2]; ++l) { + for (int k = 0; k < shape[3]; ++k) { + for (int j = 0; j < shape[4]; ++j) { data.push_back(grid[n][m][l][k][j]); } } @@ -175,10 +175,10 @@ void readPlotFile(const char* pltFileName, std::vector& data, } } else if (dim == 4) { auto grid = createMultiArrayRef(&tmp[0], shape); - for (size_t n = 0; n < shape[0]; ++n) { - for (size_t m = 0; m < shape[1]; ++m) { - for (size_t l = 0; l < shape[2]; ++l) { - for (size_t k = 0; k < shape[3]; ++k) { + for (int n = 0; n < shape[0]; ++n) { + for (int m = 0; m < shape[1]; ++m) { + for (int l = 0; l < shape[2]; ++l) { + for (int k = 0; k < shape[3]; ++k) { data.push_back(grid[n][m][l][k]); } } @@ -186,17 +186,17 @@ void readPlotFile(const char* pltFileName, std::vector& data, } } else if (dim == 3) { auto grid = createMultiArrayRef(&tmp[0], shape); - for (size_t n = 0; n < shape[0]; ++n) { - for (size_t m = 0; m < shape[1]; ++m) { - for (size_t l = 0; l < shape[2]; ++l) { + for (int n = 0; n < shape[0]; ++n) { + for (int m = 0; m < shape[1]; ++m) { + for (int l = 0; l < shape[2]; ++l) { data.push_back(grid[n][m][l]); } } } } else if (dim == 2) { auto grid = createMultiArrayRef(&tmp[0], shape); - for (size_t n = 0; n < shape[0]; ++n) { - for (size_t m = 0; m < shape[1]; ++m) { + for (int n = 0; n < shape[0]; ++n) { + for (int m = 0; m < shape[1]; ++m) { data.push_back(grid[n][m]); } } diff --git a/tools/subspace_writer/subspace_writer.cpp b/tools/subspace_writer/subspace_writer.cpp index 1232ed817..41dcbd8fa 100644 --- a/tools/subspace_writer/subspace_writer.cpp +++ b/tools/subspace_writer/subspace_writer.cpp @@ -130,7 +130,6 @@ int main(int argc, char** argv) { // read in first CT scheme std::unique_ptr scheme( new CombiMinMaxSchemeFromFile(dim, lmin, lmax, ctschemeFile)); - const auto& allLevels = scheme->getCombiSpaces(); // generate distributed sparse grid uniDSGs.emplace_back(schemeFileToSparseGridAndSizesFile(