diff --git a/maintainer/benchmarks/lb.py b/maintainer/benchmarks/lb.py
index 04453c5ea7..7d47461bad 100644
--- a/maintainer/benchmarks/lb.py
+++ b/maintainer/benchmarks/lb.py
@@ -50,6 +50,18 @@
 parser.add_argument("--output", metavar="FILEPATH", action="store",
                     type=str, required=False, default="benchmarks.csv",
                     help="Output file (default: benchmarks.csv)")
+parser.add_argument("--divided_block", action="store",
+                    type=int, default=1, required=False,
+                    help="blocks^(1/3) per mpi rank")
+parser.add_argument("--divided_block_x", action="store",
+                    type=int, default=0, required=False,
+                    help="The number of divided blocks for x direction")
+parser.add_argument("--divided_block_y", action="store",
+                    type=int, default=0, required=False,
+                    help="The number of divided blocks for x direction")
+parser.add_argument("--divided_block_z", action="store",
+                    type=int, default=0, required=False,
+                    help="The number of divided blocks for x direction")
 
 args = parser.parse_args()
 
@@ -85,7 +97,10 @@
 n_proc = system.cell_system.get_state()["n_nodes"]
 n_part = n_proc * args.particles_per_core
 if n_part == 0:
-    box_l = 3 * args.box_l if len(args.box_l) == 1 else args.box_l
+    if len(args.box_l) == 1:
+        box_l = 3 * args.box_l
+    elif len(args.box_l) == 3:
+        box_l = args.box_l
     agrid = 1.
     lb_grid = box_l
     measurement_steps = 80
@@ -101,13 +116,21 @@
     lb_grid = 3 * [lb_grid]
     box_l = 3 * [box_l]
 
-print(f"box length: {box_l}")
-print(f"LB shape: {lb_grid}")
-print(f"LB agrid: {agrid:.3f}")
+divided_block_x = args.divided_block_x
+divided_block_y = args.divided_block_y
+divided_block_z = args.divided_block_z
+if divided_block_x != 0 and divided_block_y != 0 and divided_block_z != 0:
+    blocks_per_mpi_rank = [divided_block_x,
+                           divided_block_y, divided_block_z]
+else:
+    divided_block = args.divided_block
+    blocks_per_mpi_rank = [divided_block] * 3
 
 # System
 #############################################################
-system.box_l = box_l
+system.box_l = box_l * system.cell_system.node_grid
+print(f"LB agrid: {agrid:.3f}")
+print("LB shape", system.box_l)
 
 # Integration parameters
 #############################################################
@@ -145,7 +168,7 @@
 if args.multi_gpu:
     system.cuda_init_handle.call_method("set_device_id_per_rank")
 lbf = lb_class(agrid=agrid, tau=system.time_step, kinematic_viscosity=1.,
-               density=1., single_precision=args.single_precision)
+               density=1., single_precision=args.single_precision, blocks_per_mpi_rank=blocks_per_mpi_rank)
 system.lb = lbf
 if n_part:
     system.thermostat.set_lb(LB_fluid=lbf, gamma=1., seed=42)
diff --git a/src/core/integrate.cpp b/src/core/integrate.cpp
index 9ed1d628a4..badbc8f142 100644
--- a/src/core/integrate.cpp
+++ b/src/core/integrate.cpp
@@ -633,12 +633,18 @@ int System::System::integrate(int n_steps, int reuse_forces) {
           ek.propagate();
         }
       } else if (lb_active) {
+#ifdef CALIPER
+        CALI_MARK_BEGIN("LB.PROPAGATE");
+#endif
         auto const md_steps_per_lb_step = calc_md_steps_per_tau(lb.get_tau());
         propagation.lb_skipped_md_steps += 1;
         if (propagation.lb_skipped_md_steps >= md_steps_per_lb_step) {
           propagation.lb_skipped_md_steps = 0;
           lb.propagate();
         }
+#ifdef CALIPER
+        CALI_MARK_END("LB.PROPAGATE");
+#endif
       } else if (ek_active) {
         auto const md_steps_per_ek_step = calc_md_steps_per_tau(ek.get_tau());
         propagation.ek_skipped_md_steps += 1;
diff --git a/src/core/lb/LBWalberla.cpp b/src/core/lb/LBWalberla.cpp
index 9944d05408..37f3d78e64 100644
--- a/src/core/lb/LBWalberla.cpp
+++ b/src/core/lb/LBWalberla.cpp
@@ -40,6 +40,10 @@
 #include <optional>
 #include <variant>
 
+#ifdef CALIPER
+#include <caliper/cali.h>
+#endif
+
 namespace LB {
 
 bool LBWalberla::is_gpu() const { return lb_fluid->is_gpu(); }
@@ -50,7 +54,15 @@ Utils::VectorXd<9> LBWalberla::get_pressure_tensor() const {
   return lb_fluid->get_pressure_tensor();
 }
 
-void LBWalberla::propagate() { lb_fluid->integrate(); }
+void LBWalberla::propagate() {
+#ifdef CALIPER
+  CALI_MARK_BEGIN("LBWalberla.PROPAGATE");
+#endif
+  lb_fluid->integrate();
+#ifdef CALIPER
+  CALI_MARK_END("LBWalberla.PROPAGATE");
+#endif
+}
 
 void LBWalberla::ghost_communication() { lb_fluid->ghost_communication(); }
 
diff --git a/src/core/lb/Solver.cpp b/src/core/lb/Solver.cpp
index 758f36c4d7..9a75558057 100644
--- a/src/core/lb/Solver.cpp
+++ b/src/core/lb/Solver.cpp
@@ -47,6 +47,10 @@
 #include <variant>
 #include <vector>
 
+#ifdef CALIPER
+#include <caliper/cali.h>
+#endif
+
 namespace LB {
 
 Solver::Solver() { impl = std::make_unique<Implementation>(); }
@@ -69,8 +73,14 @@ void Solver::reset() {
 }
 
 void Solver::propagate() {
+#ifdef CALIPER
+  CALI_MARK_BEGIN("SOLVER.PROPAGATE");
+#endif
   check_solver(impl);
   std::visit([](auto &ptr) { ptr->propagate(); }, *impl->solver);
+#ifdef CALIPER
+  CALI_MARK_END("SOLVER.PROPAGATE");
+#endif
 }
 
 void Solver::ghost_communication() {
diff --git a/src/core/unit_tests/ek_interface_test.cpp b/src/core/unit_tests/ek_interface_test.cpp
index 0abe8917bd..a80b2fa2fa 100644
--- a/src/core/unit_tests/ek_interface_test.cpp
+++ b/src/core/unit_tests/ek_interface_test.cpp
@@ -83,7 +83,8 @@ static auto make_ek_actor() {
   auto constexpr n_ghost_layers = 1u;
   auto constexpr single_precision = true;
   ek_lattice = std::make_shared<LatticeWalberla>(
-      params.grid_dimensions, ::communicator.node_grid, n_ghost_layers);
+      params.grid_dimensions, ::communicator.node_grid,
+      ::communicator.node_grid, n_ghost_layers);
   ek_container = std::make_shared<EK::EKWalberla::ek_container_type>(
       params.tau, walberla::new_ek_poisson_none(ek_lattice, single_precision));
   ek_reactions = std::make_shared<EK::EKWalberla::ek_reactions_type>();
diff --git a/src/core/unit_tests/lb_particle_coupling_test.cpp b/src/core/unit_tests/lb_particle_coupling_test.cpp
index 97e0f4c2e8..4b6c875360 100644
--- a/src/core/unit_tests/lb_particle_coupling_test.cpp
+++ b/src/core/unit_tests/lb_particle_coupling_test.cpp
@@ -102,7 +102,8 @@ static auto make_lb_actor() {
   auto constexpr single_precision = false;
   lb_params = std::make_shared<LB::LBWalberlaParams>(params.agrid, params.tau);
   lb_lattice = std::make_shared<LatticeWalberla>(
-      params.grid_dimensions, ::communicator.node_grid, n_ghost_layers);
+      params.grid_dimensions, ::communicator.node_grid,
+      ::communicator.node_grid, n_ghost_layers);
   lb_fluid = new_lb_walberla_cpu(lb_lattice, params.viscosity, params.density,
                                  single_precision);
   lb_fluid->set_collision_model(params.kT, params.seed);
@@ -535,7 +536,8 @@ bool test_lb_domain_mismatch_local() {
   auto const params = std::make_shared<LB::LBWalberlaParams>(0.5, 0.01);
   ::communicator.node_grid = node_grid_reversed;
   auto const lattice = std::make_shared<LatticeWalberla>(
-      Utils::Vector3i{12, 12, 12}, node_grid_original, n_ghost_layers);
+      Utils::Vector3i{12, 12, 12}, node_grid_original, node_grid_original,
+      n_ghost_layers);
   auto const ptr = new_lb_walberla_cpu(lattice, 1.0, 1.0, false);
   ptr->set_collision_model(0.0, 0);
   ::communicator.node_grid = node_grid_original;
diff --git a/src/python/espressomd/detail/walberla.py b/src/python/espressomd/detail/walberla.py
index 6ec64dc94a..964832cc4a 100644
--- a/src/python/espressomd/detail/walberla.py
+++ b/src/python/espressomd/detail/walberla.py
@@ -47,7 +47,7 @@ def __init__(self, *args, **kwargs):
             super().__init__(**kwargs)
 
     def valid_keys(self):
-        return {"agrid", "n_ghost_layers"}
+        return {"agrid", "n_ghost_layers", "blocks_per_mpi_rank"}
 
     def required_keys(self):
         return self.valid_keys()
diff --git a/src/python/espressomd/lb.py b/src/python/espressomd/lb.py
index e4b870a307..5b7f588edb 100644
--- a/src/python/espressomd/lb.py
+++ b/src/python/espressomd/lb.py
@@ -58,14 +58,14 @@ def validate_params(self, params):
 
     def valid_keys(self):
         return {"agrid", "tau", "density", "ext_force_density",
-                "kinematic_viscosity", "lattice", "kT", "seed"}
+                "kinematic_viscosity", "lattice", "kT", "seed", "blocks_per_mpi_rank"}
 
     def required_keys(self):
         return {"lattice", "density", "kinematic_viscosity", "tau"}
 
     def default_params(self):
         return {"lattice": None, "seed": 0, "kT": 0.,
-                "ext_force_density": [0.0, 0.0, 0.0]}
+                "ext_force_density": [0.0, 0.0, 0.0], "blocks_per_mpi_rank": [1, 1, 1]}
 
     def mach_limit(self):
         """
@@ -141,6 +141,8 @@ class LBFluidWalberla(HydrodynamicInteraction,
         Required for a thermalized fluid. Must be positive.
     single_precision : :obj:`bool`, optional
         Use single-precision floating-point arithmetic.
+    blocks_per_mpi_rank : (3,) array_like of :obj:`int`, optional
+        Ditribute more than one block to each CPU.
 
     Methods
     -------
@@ -240,7 +242,7 @@ def validate_params(self, params):
             if "agrid" not in params:
                 raise ValueError("missing argument 'lattice' or 'agrid'")
             params["lattice"] = LatticeWalberla(
-                agrid=params.pop("agrid"), n_ghost_layers=1)
+                agrid=params.pop("agrid"), n_ghost_layers=1, blocks_per_mpi_rank=params.get("blocks_per_mpi_rank"))
         elif "agrid" in params:
             raise ValueError("cannot provide both 'lattice' and 'agrid'")
 
diff --git a/src/script_interface/walberla/LBFluid.cpp b/src/script_interface/walberla/LBFluid.cpp
index 5b3bf4cabc..954fa3fce8 100644
--- a/src/script_interface/walberla/LBFluid.cpp
+++ b/src/script_interface/walberla/LBFluid.cpp
@@ -139,6 +139,12 @@ void LBFluidGPU::make_instance(VariantMap const &params) {
   auto const visc = get_value<double>(params, "kinematic_viscosity");
   auto const dens = get_value<double>(params, "density");
   auto const precision = get_value<bool>(params, "single_precision");
+  auto const blocks_per_mpi_rank = get_value_or<Utils::Vector3i>(
+      params, "blocks_per_mpi_rank", Utils::Vector3i{{1, 1, 1}});
+  if (blocks_per_mpi_rank != Utils::Vector3i{{1, 1, 1}}) {
+    throw std::runtime_error(
+        "GPU architecture PROHIBITED allocating many blocks to 1 CPU.");
+  }
   auto const lb_lattice = m_lattice->lattice();
   auto const lb_visc = m_conv_visc * visc;
   auto const lb_dens = m_conv_dens * dens;
diff --git a/src/script_interface/walberla/LatticeWalberla.hpp b/src/script_interface/walberla/LatticeWalberla.hpp
index 999513f0a7..d438bee616 100644
--- a/src/script_interface/walberla/LatticeWalberla.hpp
+++ b/src/script_interface/walberla/LatticeWalberla.hpp
@@ -43,6 +43,7 @@ class LatticeWalberla : public AutoParameters<LatticeWalberla> {
   std::shared_ptr<::LatticeWalberla> m_lattice;
   double m_agrid;
   Utils::Vector3d m_box_l;
+  Utils::Vector3i m_blocks_per_mpi_rank;
 
 public:
   LatticeWalberla() {
@@ -53,6 +54,8 @@ class LatticeWalberla : public AutoParameters<LatticeWalberla> {
         {"shape", AutoParameter::read_only,
          [this]() { return m_lattice->get_grid_dimensions(); }},
         {"_box_l", AutoParameter::read_only, [this]() { return m_box_l; }},
+        {"blocks_per_mpi_rank", AutoParameter::read_only,
+         [this]() { return m_blocks_per_mpi_rank; }},
     });
   }
 
@@ -60,7 +63,16 @@ class LatticeWalberla : public AutoParameters<LatticeWalberla> {
     auto const &box_geo = *::System::get_system().box_geo;
     m_agrid = get_value<double>(args, "agrid");
     m_box_l = get_value_or<Utils::Vector3d>(args, "_box_l", box_geo.length());
+    m_blocks_per_mpi_rank = get_value_or<Utils::Vector3i>(
+        args, "blocks_per_mpi_rank", Utils::Vector3i{{1, 1, 1}});
     auto const n_ghost_layers = get_value<int>(args, "n_ghost_layers");
+    auto const block_grid =
+        Utils::Vector3i{{static_cast<int>(::communicator.node_grid[0] *
+                                          m_blocks_per_mpi_rank[0]),
+                         static_cast<int>(::communicator.node_grid[1] *
+                                          m_blocks_per_mpi_rank[1]),
+                         static_cast<int>(::communicator.node_grid[2] *
+                                          m_blocks_per_mpi_rank[2])}};
 
     context()->parallel_try_catch([&]() {
       if (m_agrid <= 0.) {
@@ -72,7 +84,7 @@ class LatticeWalberla : public AutoParameters<LatticeWalberla> {
       auto const grid_dim =
           ::LatticeWalberla::calc_grid_dimensions(m_box_l, m_agrid);
       m_lattice = std::make_shared<::LatticeWalberla>(
-          grid_dim, ::communicator.node_grid,
+          grid_dim, ::communicator.node_grid, block_grid,
           static_cast<unsigned int>(n_ghost_layers));
     });
   }
diff --git a/src/walberla_bridge/include/walberla_bridge/LatticeWalberla.hpp b/src/walberla_bridge/include/walberla_bridge/LatticeWalberla.hpp
index 03c5ff6291..b49693e848 100644
--- a/src/walberla_bridge/include/walberla_bridge/LatticeWalberla.hpp
+++ b/src/walberla_bridge/include/walberla_bridge/LatticeWalberla.hpp
@@ -52,6 +52,7 @@ class LatticeWalberla {
 public:
   LatticeWalberla(Utils::Vector3i const &grid_dimensions,
                   Utils::Vector3i const &node_grid,
+                  Utils::Vector3i const &block_grid,
                   unsigned int n_ghost_layers);
 
   // Grid, domain, halo
diff --git a/src/walberla_bridge/src/BoundaryPackInfo.hpp b/src/walberla_bridge/src/BoundaryPackInfo.hpp
index 83a26fa91d..48e3d4258c 100644
--- a/src/walberla_bridge/src/BoundaryPackInfo.hpp
+++ b/src/walberla_bridge/src/BoundaryPackInfo.hpp
@@ -96,7 +96,7 @@ class BoundaryPackInfo : public PackInfo<GhostLayerField_T> {
     WALBERLA_ASSERT_EQUAL(bSize, buf_size);
 #endif
 
-    auto const offset = std::get<0>(m_lattice->get_local_grid_range());
+    auto const offset = to_vector3i(receiver->getAABB().min());
     typename Boundary_T::value_type value;
     for (auto it = begin(flag_field); it != flag_field->end(); ++it) {
       if (isFlagSet(it, boundary_flag)) {
@@ -133,7 +133,7 @@ class BoundaryPackInfo : public PackInfo<GhostLayerField_T> {
            << buf_size;
 #endif
 
-    auto const offset = std::get<0>(m_lattice->get_local_grid_range());
+    auto const offset = to_vector3i(sender->getAABB().min());
     for (auto it = begin(flag_field); it != flag_field->end(); ++it) {
       if (isFlagSet(it, boundary_flag)) {
         auto const node = offset + Utils::Vector3i{{it.x(), it.y(), it.z()}};
diff --git a/src/walberla_bridge/src/LatticeWalberla.cpp b/src/walberla_bridge/src/LatticeWalberla.cpp
index 2dc2943a40..6551da010a 100644
--- a/src/walberla_bridge/src/LatticeWalberla.cpp
+++ b/src/walberla_bridge/src/LatticeWalberla.cpp
@@ -40,6 +40,7 @@
 
 LatticeWalberla::LatticeWalberla(Utils::Vector3i const &grid_dimensions,
                                  Utils::Vector3i const &node_grid,
+                                 Utils::Vector3i const &block_grid,
                                  unsigned int n_ghost_layers)
     : m_grid_dimensions{grid_dimensions}, m_n_ghost_layers{n_ghost_layers} {
   using walberla::real_t;
@@ -50,21 +51,28 @@ LatticeWalberla::LatticeWalberla(Utils::Vector3i const &grid_dimensions,
       throw std::runtime_error(
           "Lattice grid dimensions and MPI node grid are not compatible.");
     }
+    if (m_grid_dimensions[i] % block_grid[i] != 0) {
+      throw std::runtime_error(
+          "Lattice grid dimensions and block grid are not compatible.");
+    }
   }
 
   auto constexpr lattice_constant = real_t{1};
-  auto const cells_block = Utils::hadamard_division(grid_dimensions, node_grid);
+  auto const cells_block =
+      Utils::hadamard_division(grid_dimensions, block_grid);
 
   m_blocks = walberla::blockforest::createUniformBlockGrid(
       // number of blocks in each direction
-      uint_c(node_grid[0]), uint_c(node_grid[1]), uint_c(node_grid[2]),
+      uint_c(block_grid[0]), uint_c(block_grid[1]), uint_c(block_grid[2]),
       // number of cells per block in each direction
       uint_c(cells_block[0]), uint_c(cells_block[1]), uint_c(cells_block[2]),
       lattice_constant,
       // number of cpus per direction
       uint_c(node_grid[0]), uint_c(node_grid[1]), uint_c(node_grid[2]),
       // periodicity
-      true, true, true);
+      true, true, true,
+      // keep global block information
+      false);
   for (IBlock &block : *m_blocks) {
     m_cached_blocks.push_back(&block);
   }
@@ -73,11 +81,44 @@ LatticeWalberla::LatticeWalberla(Utils::Vector3i const &grid_dimensions,
 [[nodiscard]] std::pair<Utils::Vector3d, Utils::Vector3d>
 LatticeWalberla::get_local_domain() const {
   using walberla::to_vector3d;
-  // We only have one block per mpi rank
-  assert(++(m_blocks->begin()) == m_blocks->end());
-
-  auto const ab = m_blocks->begin()->getAABB();
-  return {to_vector3d(ab.min()), to_vector3d(ab.max())};
+  // Get upper and lower corner of BlockForest assigned to a mpi rank.
+  // Since we can allocate multiple blocks per mpi rank,
+  // the corners of all Blocks are compared.
+  int64_t const stride_y = m_grid_dimensions[2];
+  int64_t const stride_x = m_grid_dimensions[1] * stride_y;
+  auto aa = m_blocks->begin()->getAABB();
+  auto bb = m_blocks->begin()->getAABB();
+  int64_t aa_index = stride_x * static_cast<int>(aa.min()[0]) +
+                     stride_y * static_cast<int>(aa.min()[1]) +
+                     static_cast<int>(aa.min()[2]);
+  int64_t bb_index = stride_x * static_cast<int>(bb.max()[0]) +
+                     stride_y * static_cast<int>(bb.max()[1]) +
+                     static_cast<int>(bb.max()[2]);
+  for (auto b = m_blocks->begin(); b != m_blocks->end(); ++b) {
+    auto cc = b->getAABB();
+    for (auto const i : {0u, 1u, 2u}) {
+      if ((cc.max()[i] - cc.min()[i]) != 0) {
+        assert(m_grid_dimensions[i] %
+                   static_cast<int>(cc.max()[i] - cc.min()[i]) ==
+               0);
+      }
+    }
+    int64_t min_index = stride_x * static_cast<int>(cc.min()[0]) +
+                        stride_y * static_cast<int>(cc.min()[1]) +
+                        static_cast<int>(cc.min()[2]);
+    int64_t max_index = stride_x * static_cast<int>(cc.max()[0]) +
+                        stride_y * static_cast<int>(cc.max()[1]) +
+                        static_cast<int>(cc.max()[2]);
+    if (min_index < aa_index) {
+      aa = cc;
+      aa_index = min_index;
+    }
+    if (max_index > bb_index) {
+      bb = cc;
+      bb_index = max_index;
+    }
+  }
+  return {to_vector3d(aa.min()), to_vector3d(bb.max())};
 }
 
 [[nodiscard]] bool
diff --git a/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp b/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
index 3c1b11219f..8cbd6981ea 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
@@ -378,6 +378,7 @@ class LBWalberlaImpl : public LBWalberlaBase {
   std::shared_ptr<gpu::HostFieldAllocator<FloatType>> m_host_field_allocator;
 #endif
 
+  // Interval within not global but mpi rank
   [[nodiscard]] std::optional<CellInterval>
   get_interval(Utils::Vector3i const &lower_corner,
                Utils::Vector3i const &upper_corner) const {
@@ -389,8 +390,56 @@ class LBWalberlaImpl : public LBWalberlaBase {
     if (not lower_bc or not upper_bc) {
       return std::nullopt;
     }
-    assert(&(*(lower_bc->block)) == &(*(upper_bc->block)));
-    return {CellInterval(lower_bc->cell, upper_bc->cell)};
+    Cell const global_lower_cell = lower_bc->cell;
+    Cell const global_upper_cell =
+        Cell(static_cast<int>(upper_bc->cell[0] +
+                              upper_bc->block->getAABB().min()[0] -
+                              lower_bc->block->getAABB().min()[0]),
+             static_cast<int>(upper_bc->cell[1] +
+                              upper_bc->block->getAABB().min()[1] -
+                              lower_bc->block->getAABB().min()[1]),
+             static_cast<int>(upper_bc->cell[2] +
+                              upper_bc->block->getAABB().min()[2] -
+                              lower_bc->block->getAABB().min()[2]));
+    return {CellInterval(global_lower_cell, global_upper_cell)};
+  }
+
+  // Interval within local block
+  [[nodiscard]] std::optional<CellInterval> get_block_interval(
+      Utils::Vector3i const &lower_corner, Utils::Vector3i const &upper_corner,
+      Utils::Vector3i const &local_offset, IBlock const &block) const {
+    auto block_lower_corner = to_vector3i(block.getAABB().min());
+    if (upper_corner[0] < block_lower_corner[0] or
+        upper_corner[1] < block_lower_corner[1] or
+        upper_corner[2] < block_lower_corner[2]) {
+      return std::nullopt;
+    }
+    for (uint_t f = 0u; f < 3u; ++f) {
+      if (block_lower_corner[f] < lower_corner[f]) {
+        block_lower_corner[f] = lower_corner[f];
+      }
+    }
+    auto block_upper_corner = to_vector3i(block.getAABB().max());
+    if (lower_corner[0] > block_upper_corner[0] or
+        lower_corner[1] > block_upper_corner[1] or
+        lower_corner[2] > block_upper_corner[2]) {
+      return std::nullopt;
+    }
+    for (uint_t f = 0u; f < 3u; ++f) {
+      if (block_upper_corner[f] > upper_corner[f]) {
+        block_upper_corner[f] = upper_corner[f];
+      }
+    }
+    block_upper_corner -= Utils::Vector3i::broadcast(1);
+    Cell const block_lower_cell =
+        Cell(static_cast<int>(block_lower_corner[0] - local_offset[0]),
+             static_cast<int>(block_lower_corner[1] - local_offset[1]),
+             static_cast<int>(block_lower_corner[2] - local_offset[2]));
+    Cell const block_upper_cell =
+        Cell(static_cast<int>(block_upper_corner[0] - local_offset[0]),
+             static_cast<int>(block_upper_corner[1] - local_offset[1]),
+             static_cast<int>(block_upper_corner[2] - local_offset[2]));
+    return {CellInterval(block_lower_cell, block_upper_cell)};
   }
 
   /**
@@ -614,6 +663,7 @@ class LBWalberlaImpl : public LBWalberlaBase {
     if (m_has_boundaries) {
       integrate_boundaries(blocks);
     }
+
     // LB stream
     integrate_stream(blocks);
     // Mark pending ghost layer updates
@@ -868,40 +918,62 @@ class LBWalberlaImpl : public LBWalberlaBase {
   get_slice_velocity(Utils::Vector3i const &lower_corner,
                      Utils::Vector3i const &upper_corner) const override {
     std::vector<double> out;
+    uint_t values_size = 0;
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
+      out = std::vector<double>(static_cast<unsigned int>(3u * ci->numCells()));
+      int64_t const stride_y = (ci->max().z() - ci->min().z() + 1u);
+      int64_t const stride_x = (ci->max().y() - ci->min().y() + 1u) * stride_y;
       auto const &lattice = get_lattice();
-      auto const &block = *(lattice.get_blocks()->begin());
-      auto const field =
-          block.template getData<VectorField>(m_velocity_field_id);
-      auto const values = lbm::accessor::Vector::get(field, *ci);
-      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
-      assert(values.size() == 3u * ci->numCells());
-      if constexpr (std::is_same_v<typename decltype(values)::value_type,
-                                   double>) {
-        out = std::move(values);
-      } else {
-        out = std::vector<double>(values.begin(), values.end());
-      }
-      auto const local_offset = std::get<0>(lattice.get_local_grid_range());
-      auto const lower_cell = ci->min();
-      auto const upper_cell = ci->max();
-      auto it = out.begin();
-      for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
-        for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
-          for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
-            auto const node = local_offset + Utils::Vector3i{{x, y, z}};
-            if (m_boundary->node_is_boundary(node)) {
-              auto const &vec = m_boundary->get_node_value_at_boundary(node);
-              for (uint_t f = 0u; f < 3u; ++f) {
-                (*it) = double_c(vec[f]);
-                std::advance(it, 1l);
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto const &block = *b;
+        auto const local_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                local_offset, block)) {
+          auto const field =
+              block.template getData<VectorField>(m_velocity_field_id);
+          auto const values = lbm::accessor::Vector::get(field, *bci);
+          assert(values.size() == 3u * bci->numCells());
+          values_size += 3u * bci->numCells();
+          int64_t const stride_ly = (bci->max().z() - bci->min().z() + 1u);
+          int64_t const stride_lx =
+              (bci->max().y() - bci->min().y() + 1u) * stride_ly;
+          auto const lower_cell = bci->min();
+          auto const upper_cell = bci->max();
+          // The field data "values" knows about block-local indices
+          // In the loop, x,y,z are in block-local coordinates
+          // It is converted to BlockForest (lattice) coordinates assigned to a
+          // mpi rank The same applies to other get_slice methods
+          for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
+            for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
+              for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
+                auto const node = local_offset + Utils::Vector3i{{x, y, z}};
+                auto const index = stride_x * (node[0] - lower_corner[0]) +
+                                   stride_y * (node[1] - lower_corner[1]) +
+                                   node[2] - lower_corner[2];
+                auto const local_index = stride_lx * (x - lower_cell.x()) +
+                                         stride_ly * (y - lower_cell.y()) + z -
+                                         lower_cell.z();
+                if (m_boundary->node_is_boundary(node)) {
+                  auto const &vec =
+                      m_boundary->get_node_value_at_boundary(node);
+                  for (uint_t f = 0u; f < 3u; ++f) {
+                    out[static_cast<unsigned int>(3u * index + f)] =
+                        double_c(vec[f]);
+                  }
+                } else {
+                  for (uint_t f = 0u; f < 3u; ++f) {
+                    out[static_cast<unsigned int>(3u * index + f)] =
+                        double_c(values[static_cast<unsigned int>(
+                            3u * local_index + f)]);
+                  }
+                }
               }
-            } else {
-              std::advance(it, 3l);
             }
           }
         }
       }
+      assert(values_size == 3u * ci->numCells());
     }
     return out;
   }
@@ -912,17 +984,55 @@ class LBWalberlaImpl : public LBWalberlaBase {
     m_pending_ghost_comm.set(GhostComm::PDF);
     m_pending_ghost_comm.set(GhostComm::VEL);
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
-      auto const &lattice = get_lattice();
-      auto &block = *(lattice.get_blocks()->begin());
-      auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
-      auto force_field =
-          block.template getData<VectorField>(m_last_applied_force_field_id);
-      auto vel_field = block.template getData<VectorField>(m_velocity_field_id);
-      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
       assert(velocity.size() == 3u * ci->numCells());
-      std::vector<FloatType> const values(velocity.begin(), velocity.end());
-      lbm::accessor::Velocity::set(pdf_field, vel_field, force_field, values,
-                                   *ci);
+      int64_t const stride_y = (ci->max().z() - ci->min().z() + 1u);
+      int64_t const stride_x = (ci->max().y() - ci->min().y() + 1u) * stride_y;
+      auto const &lattice = get_lattice();
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto &block = *b;
+        auto const local_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                local_offset, block)) {
+          auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
+          auto force_field = block.template getData<VectorField>(
+              m_last_applied_force_field_id);
+          auto vel_field =
+              block.template getData<VectorField>(m_velocity_field_id);
+          std::vector<FloatType> values = std::vector<FloatType>(
+              static_cast<unsigned int>(3u * bci->numCells()));
+          int64_t const stride_ly = (bci->max().z() - bci->min().z() + 1u);
+          int64_t const stride_lx =
+              (bci->max().y() - bci->min().y() + 1u) * stride_ly;
+          auto const lower_cell = bci->min();
+          auto const upper_cell = bci->max();
+          // In the loop, x,y,z are in block-local coordinates
+          // The field data given in the argument knows about BlockForest
+          // (lattice) indices from lower_corner to upper_corner It is converted
+          // to block-local coordinates The same applies to other set_slice
+          // methods
+          for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
+            for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
+              for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
+                auto const node = local_offset + Utils::Vector3i{{x, y, z}};
+                auto const index = stride_x * (node[0] - lower_corner[0]) +
+                                   stride_y * (node[1] - lower_corner[1]) +
+                                   node[2] - lower_corner[2];
+                auto const local_index = stride_lx * (x - lower_cell.x()) +
+                                         stride_ly * (y - lower_cell.y()) + z -
+                                         lower_cell.z();
+                for (uint_t f = 0u; f < 3u; ++f) {
+                  values[static_cast<unsigned int>(3u * local_index + f)] =
+                      numeric_cast<FloatType>(
+                          velocity[static_cast<unsigned int>(3u * index + f)]);
+                }
+              }
+            }
+          }
+          lbm::accessor::Velocity::set(pdf_field, vel_field, force_field,
+                                       values, *bci);
+        }
+      }
     }
   }
 
@@ -1073,8 +1183,11 @@ class LBWalberlaImpl : public LBWalberlaBase {
       return false;
     auto const force_at_node = [this, &force](std::array<int, 3> const node,
                                               double weight) {
-      auto const bc =
-          get_block_and_cell(get_lattice(), Utils::Vector3i(node), true);
+      auto bc = get_block_and_cell(get_lattice(), Utils::Vector3i(node), false);
+      if (!bc) {
+        bc = get_block_and_cell(get_lattice(), Utils::Vector3i(node), true);
+      }
+
       if (bc) {
         auto const weighted_force = to_vector3<FloatType>(weight * force);
         auto force_field =
@@ -1137,18 +1250,47 @@ class LBWalberlaImpl : public LBWalberlaBase {
       Utils::Vector3i const &upper_corner) const override {
     std::vector<double> out;
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
+      out = std::vector<double>(static_cast<unsigned int>(3u * ci->numCells()));
+      int64_t const stride_y = (ci->max().z() - ci->min().z() + 1u);
+      int64_t const stride_x = (ci->max().y() - ci->min().y() + 1u) * stride_y;
       auto const &lattice = get_lattice();
-      auto const &block = *(lattice.get_blocks()->begin());
-      auto const field =
-          block.template getData<VectorField>(m_last_applied_force_field_id);
-      auto const values = lbm::accessor::Vector::get(field, *ci);
-      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
-      assert(values.size() == 3u * ci->numCells());
-      if constexpr (std::is_same_v<typename decltype(values)::value_type,
-                                   double>) {
-        out = std::move(values);
-      } else {
-        out = std::vector<double>(values.begin(), values.end());
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto const &block = *b;
+        auto const local_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                local_offset, block)) {
+          auto const field = block.template getData<VectorField>(
+              m_last_applied_force_field_id);
+          auto const values = lbm::accessor::Vector::get(field, *bci);
+          assert(values.size() == 3u * bci->numCells());
+          int64_t const stride_ly = (bci->max().z() - bci->min().z() + 1u);
+          int64_t const stride_lx =
+              (bci->max().y() - bci->min().y() + 1u) * stride_ly;
+          auto const lower_cell = bci->min();
+          auto const upper_cell = bci->max();
+          // The field data "values" knows about block-local indices
+          // In the loop, x,y,z are in block-local coordinates
+          // It is converted to BlockForest (lattice) coordinates assigned to a
+          // mpi rank The same applies to other get_slice methods
+          for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
+            for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
+              for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
+                auto const node = local_offset + Utils::Vector3i{{x, y, z}};
+                auto const index = stride_x * (node[0] - lower_corner[0]) +
+                                   stride_y * (node[1] - lower_corner[1]) +
+                                   node[2] - lower_corner[2];
+                auto const local_index = stride_lx * (x - lower_cell.x()) +
+                                         stride_ly * (y - lower_cell.y()) + z -
+                                         lower_cell.z();
+                for (uint_t f = 0u; f < 3u; ++f) {
+                  out[static_cast<unsigned int>(3u * index + f)] =
+                      values[static_cast<unsigned int>(3u * local_index + f)];
+                }
+              }
+            }
+          }
+        }
       }
     }
     return out;
@@ -1160,16 +1302,55 @@ class LBWalberlaImpl : public LBWalberlaBase {
     m_pending_ghost_comm.set(GhostComm::VEL);
     m_pending_ghost_comm.set(GhostComm::LAF);
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
-      auto const &lattice = get_lattice();
-      auto &block = *(lattice.get_blocks()->begin());
-      auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
-      auto force_field =
-          block.template getData<VectorField>(m_last_applied_force_field_id);
-      auto vel_field = block.template getData<VectorField>(m_velocity_field_id);
-      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
       assert(force.size() == 3u * ci->numCells());
-      std::vector<FloatType> const values(force.begin(), force.end());
-      lbm::accessor::Force::set(pdf_field, vel_field, force_field, values, *ci);
+      int64_t const stride_y = (ci->max().z() - ci->min().z() + 1u);
+      int64_t const stride_x = (ci->max().y() - ci->min().y() + 1u) * stride_y;
+      auto const &lattice = get_lattice();
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto &block = *b;
+        auto const local_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                local_offset, block)) {
+          auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
+          auto force_field = block.template getData<VectorField>(
+              m_last_applied_force_field_id);
+          auto vel_field =
+              block.template getData<VectorField>(m_velocity_field_id);
+          std::vector<FloatType> values = std::vector<FloatType>(
+              static_cast<unsigned int>(3u * bci->numCells()));
+          int64_t const stride_ly = (bci->max().z() - bci->min().z() + 1u);
+          int64_t const stride_lx =
+              (bci->max().y() - bci->min().y() + 1u) * stride_ly;
+          auto const lower_cell = bci->min();
+          auto const upper_cell = bci->max();
+          // In the loop, x,y,z are in block-local coordinates
+          // The field data given in the argument knows about BlockForest
+          // (lattice) indices from lower_corner to upper_corner It is converted
+          // to block-local coordinates The same applies to other set_slice
+          // methods
+          for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
+            for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
+              for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
+                auto const node = local_offset + Utils::Vector3i{{x, y, z}};
+                auto const index = stride_x * (node[0] - lower_corner[0]) +
+                                   stride_y * (node[1] - lower_corner[1]) +
+                                   node[2] - lower_corner[2];
+                auto const local_index = stride_lx * (x - lower_cell.x()) +
+                                         stride_ly * (y - lower_cell.y()) + z -
+                                         lower_cell.z();
+                for (uint_t f = 0u; f < 3u; ++f) {
+                  values[static_cast<unsigned int>(3u * local_index + f)] =
+                      numeric_cast<FloatType>(
+                          force[static_cast<unsigned int>(3u * index + f)]);
+                }
+              }
+            }
+          }
+          lbm::accessor::Force::set(pdf_field, vel_field, force_field, values,
+                                    *bci);
+        }
+      }
     }
   }
 
@@ -1220,17 +1401,49 @@ class LBWalberlaImpl : public LBWalberlaBase {
                        Utils::Vector3i const &upper_corner) const override {
     std::vector<double> out;
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
+      out = std::vector<double>(
+          static_cast<unsigned int>(stencil_size() * ci->numCells()));
+      int64_t const stride_y = (ci->max().z() - ci->min().z() + 1u);
+      int64_t const stride_x = (ci->max().y() - ci->min().y() + 1u) * stride_y;
       auto const &lattice = get_lattice();
-      auto const &block = *(lattice.get_blocks()->begin());
-      auto const pdf_field = block.template getData<PdfField>(m_pdf_field_id);
-      auto const values = lbm::accessor::Population::get(pdf_field, *ci);
-      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
-      assert(values.size() == stencil_size() * ci->numCells());
-      if constexpr (std::is_same_v<typename decltype(values)::value_type,
-                                   double>) {
-        out = std::move(values);
-      } else {
-        out = std::vector<double>(values.begin(), values.end());
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto const &block = *b;
+        auto const local_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                local_offset, block)) {
+          auto const pdf_field =
+              block.template getData<PdfField>(m_pdf_field_id);
+          auto const values = lbm::accessor::Population::get(pdf_field, *bci);
+          assert(values.size() == stencil_size() * bci->numCells());
+          int64_t const stride_ly = (bci->max().z() - bci->min().z() + 1u);
+          int64_t const stride_lx =
+              (bci->max().y() - bci->min().y() + 1u) * stride_ly;
+          auto const lower_cell = bci->min();
+          auto const upper_cell = bci->max();
+          // The field data "values" knows about block-local indices
+          // In the loop, x,y,z are in block-local coordinates
+          // It is converted to BlockForest (lattice) coordinates assigned to a
+          // mpi rank The same applies to other get_slice methods
+          for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
+            for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
+              for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
+                auto const node = local_offset + Utils::Vector3i{{x, y, z}};
+                auto const index = stride_x * (node[0] - lower_corner[0]) +
+                                   stride_y * (node[1] - lower_corner[1]) +
+                                   node[2] - lower_corner[2];
+                auto const local_index = stride_lx * (x - lower_cell.x()) +
+                                         stride_ly * (y - lower_cell.y()) + z -
+                                         lower_cell.z();
+                for (uint_t f = 0u; f < stencil_size(); ++f) {
+                  out[static_cast<unsigned int>(stencil_size() * index + f)] =
+                      values[static_cast<unsigned int>(
+                          stencil_size() * local_index + f)];
+                }
+              }
+            }
+          }
+        }
       }
     }
     return out;
@@ -1240,17 +1453,57 @@ class LBWalberlaImpl : public LBWalberlaBase {
                             Utils::Vector3i const &upper_corner,
                             std::vector<double> const &population) override {
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
-      auto const &lattice = get_lattice();
-      auto &block = *(lattice.get_blocks()->begin());
-      auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
-      auto force_field =
-          block.template getData<VectorField>(m_last_applied_force_field_id);
-      auto vel_field = block.template getData<VectorField>(m_velocity_field_id);
       assert(population.size() == stencil_size() * ci->numCells());
-      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
-      std::vector<FloatType> const values(population.begin(), population.end());
-      lbm::accessor::Population::set(pdf_field, vel_field, force_field, values,
-                                     *ci);
+      int64_t const stride_y = (ci->max().z() - ci->min().z() + 1u);
+      int64_t const stride_x = (ci->max().y() - ci->min().y() + 1u) * stride_y;
+      auto const &lattice = get_lattice();
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto &block = *b;
+        auto const local_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                local_offset, block)) {
+          auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
+          auto force_field = block.template getData<VectorField>(
+              m_last_applied_force_field_id);
+          auto vel_field =
+              block.template getData<VectorField>(m_velocity_field_id);
+          std::vector<FloatType> values = std::vector<FloatType>(
+              static_cast<unsigned int>(stencil_size() * bci->numCells()));
+          int64_t const stride_ly = (bci->max().z() - bci->min().z() + 1u);
+          int64_t const stride_lx =
+              (bci->max().y() - bci->min().y() + 1u) * stride_ly;
+          auto const lower_cell = bci->min();
+          auto const upper_cell = bci->max();
+          // In the loop, x,y,z are in block-local coordinates
+          // The field data given in the argument knows about BlockForest
+          // (lattice) indices from lower_corner to upper_corner It is converted
+          // to block-local coordinates The same applies to other set_slice
+          // methods
+          for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
+            for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
+              for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
+                auto const node = local_offset + Utils::Vector3i{{x, y, z}};
+                auto const index = stride_x * (node[0] - lower_corner[0]) +
+                                   stride_y * (node[1] - lower_corner[1]) +
+                                   node[2] - lower_corner[2];
+                auto const local_index = stride_lx * (x - lower_cell.x()) +
+                                         stride_ly * (y - lower_cell.y()) + z -
+                                         lower_cell.z();
+                for (uint_t f = 0u; f < stencil_size(); ++f) {
+                  values[static_cast<unsigned int>(
+                      stencil_size() * local_index + f)] =
+                      numeric_cast<FloatType>(
+                          population[static_cast<unsigned int>(
+                              stencil_size() * index + f)]);
+                }
+              }
+            }
+          }
+          lbm::accessor::Population::set(pdf_field, vel_field, force_field,
+                                         values, *bci);
+        }
+      }
     }
   }
 
@@ -1286,17 +1539,44 @@ class LBWalberlaImpl : public LBWalberlaBase {
                     Utils::Vector3i const &upper_corner) const override {
     std::vector<double> out;
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
+      out = std::vector<double>(ci->numCells());
+      int64_t const stride_y = (ci->max().z() - ci->min().z() + 1u);
+      int64_t const stride_x = (ci->max().y() - ci->min().y() + 1u) * stride_y;
       auto const &lattice = get_lattice();
-      auto const &block = *(lattice.get_blocks()->begin());
-      auto const pdf_field = block.template getData<PdfField>(m_pdf_field_id);
-      auto const values = lbm::accessor::Density::get(pdf_field, *ci);
-      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
-      assert(values.size() == ci->numCells());
-      if constexpr (std::is_same_v<typename decltype(values)::value_type,
-                                   double>) {
-        out = std::move(values);
-      } else {
-        out = std::vector<double>(values.begin(), values.end());
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto const &block = *b;
+        auto const local_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                local_offset, block)) {
+          auto const pdf_field =
+              block.template getData<PdfField>(m_pdf_field_id);
+          auto const values = lbm::accessor::Density::get(pdf_field, *bci);
+          assert(values.size() == bci->numCells());
+          int64_t const stride_ly = (bci->max().z() - bci->min().z() + 1u);
+          int64_t const stride_lx =
+              (bci->max().y() - bci->min().y() + 1u) * stride_ly;
+          auto const lower_cell = bci->min();
+          auto const upper_cell = bci->max();
+          // The field data "values" knows about block-local indices
+          // In the loop, x,y,z are in block-local coordinates
+          // It is converted to BlockForest (lattice) coordinates assigned to a
+          // mpi rank The same applies to other get_slice methods
+          for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
+            for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
+              for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
+                auto const node = local_offset + Utils::Vector3i{{x, y, z}};
+                auto const index = stride_x * (node[0] - lower_corner[0]) +
+                                   stride_y * (node[1] - lower_corner[1]) +
+                                   node[2] - lower_corner[2];
+                auto const local_index = stride_lx * (x - lower_cell.x()) +
+                                         stride_ly * (y - lower_cell.y()) + z -
+                                         lower_cell.z();
+                out[index] = values[local_index];
+              }
+            }
+          }
+        }
       }
     }
     return out;
@@ -1307,13 +1587,46 @@ class LBWalberlaImpl : public LBWalberlaBase {
                          std::vector<double> const &density) override {
     m_pending_ghost_comm.set(GhostComm::PDF);
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
-      auto const &lattice = get_lattice();
-      auto &block = *(lattice.get_blocks()->begin());
-      auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
       assert(density.size() == ci->numCells());
-      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
-      std::vector<FloatType> const values(density.begin(), density.end());
-      lbm::accessor::Density::set(pdf_field, values, *ci);
+      int64_t const stride_y = (ci->max().z() - ci->min().z() + 1u);
+      int64_t const stride_x = (ci->max().y() - ci->min().y() + 1u) * stride_y;
+      auto const &lattice = get_lattice();
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto &block = *b;
+        auto const local_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                local_offset, block)) {
+          auto pdf_field = block.template getData<PdfField>(m_pdf_field_id);
+          std::vector<FloatType> values =
+              std::vector<FloatType>(bci->numCells());
+          int64_t const stride_ly = (bci->max().z() - bci->min().z() + 1u);
+          int64_t const stride_lx =
+              (bci->max().y() - bci->min().y() + 1u) * stride_ly;
+          auto const lower_cell = bci->min();
+          auto const upper_cell = bci->max();
+          // In the loop, x,y,z are in block-local coordinates
+          // The field data given in the argument knows about BlockForest
+          // (lattice) indices from lower_corner to upper_corner It is converted
+          // to block-local coordinates The same applies to other set_slice
+          // methods
+          for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
+            for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
+              for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
+                auto const node = local_offset + Utils::Vector3i{{x, y, z}};
+                auto const index = stride_x * (node[0] - lower_corner[0]) +
+                                   stride_y * (node[1] - lower_corner[1]) +
+                                   node[2] - lower_corner[2];
+                auto const local_index = stride_lx * (x - lower_cell.x()) +
+                                         stride_ly * (y - lower_cell.y()) + z -
+                                         lower_cell.z();
+                values[local_index] = numeric_cast<FloatType>(density[index]);
+              }
+            }
+          }
+          lbm::accessor::Density::set(pdf_field, values, *bci);
+        }
+      }
     }
   }
 
@@ -1332,7 +1645,10 @@ class LBWalberlaImpl : public LBWalberlaBase {
                                      Utils::Vector3d const &velocity) override {
     on_boundary_add();
     m_pending_ghost_comm.set(GhostComm::UBB);
-    auto bc = get_block_and_cell(get_lattice(), node, true);
+    auto bc = get_block_and_cell(get_lattice(), node, false);
+    if (!bc) {
+      bc = get_block_and_cell(get_lattice(), node, true);
+    }
     if (bc) {
       m_boundary->set_node_value_at_boundary(
           node, to_vector3<FloatType>(velocity), *bc);
@@ -1345,26 +1661,40 @@ class LBWalberlaImpl : public LBWalberlaBase {
       Utils::Vector3i const &upper_corner) const override {
     std::vector<std::optional<Utils::Vector3d>> out;
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
+      out = std::vector<std::optional<Utils::Vector3d>>(ci->numCells());
+      int64_t const stride_y = (ci->max().z() - ci->min().z() + 1u);
+      int64_t const stride_x = (ci->max().y() - ci->min().y() + 1u) * stride_y;
       auto const &lattice = get_lattice();
-      auto const local_offset = std::get<0>(lattice.get_local_grid_range());
-      auto const lower_cell = ci->min();
-      auto const upper_cell = ci->max();
-      auto const n_values = ci->numCells();
-      out.reserve(n_values);
-      for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
-        for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
-          for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
-            auto const node = local_offset + Utils::Vector3i{{x, y, z}};
-            if (m_boundary->node_is_boundary(node)) {
-              out.emplace_back(
-                  to_vector3d(m_boundary->get_node_value_at_boundary(node)));
-            } else {
-              out.emplace_back(std::nullopt);
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto const &block = *b;
+        auto const local_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                local_offset, block)) {
+          auto const lower_cell = bci->min();
+          auto const upper_cell = bci->max();
+          // In the loop, x,y,z are in block-local coordinates
+          // It is converted to BlockForest (lattice) coordinates assigned to a
+          // mpi rank The same applies to other get_slice methods
+          for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
+            for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
+              for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
+                auto const node = local_offset + Utils::Vector3i{{x, y, z}};
+                auto const index = stride_x * (node[0] - lower_corner[0]) +
+                                   stride_y * (node[1] - lower_corner[1]) +
+                                   node[2] - lower_corner[2];
+                if (m_boundary->node_is_boundary(node)) {
+                  out[index] =
+                      to_vector3d(m_boundary->get_node_value_at_boundary(node));
+                } else {
+                  out[index] = std::nullopt;
+                }
+              }
             }
           }
         }
       }
-      assert(out.size() == n_values);
+      assert(out.size() == ci->numCells());
     }
     return out;
   }
@@ -1375,25 +1705,41 @@ class LBWalberlaImpl : public LBWalberlaBase {
     on_boundary_add();
     m_pending_ghost_comm.set(GhostComm::UBB);
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
-      auto const &lattice = get_lattice();
-      auto const local_offset = std::get<0>(lattice.get_local_grid_range());
-      auto const lower_cell = ci->min();
-      auto const upper_cell = ci->max();
-      auto it = velocity.begin();
       assert(velocity.size() == ci->numCells());
-      for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
-        for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
-          for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
-            auto const node = local_offset + Utils::Vector3i{{x, y, z}};
-            auto const bc = get_block_and_cell(lattice, node, false);
-            auto const &opt = *it;
-            if (opt) {
-              m_boundary->set_node_value_at_boundary(
-                  node, to_vector3<FloatType>(*opt), *bc);
-            } else {
-              m_boundary->remove_node_from_boundary(node, *bc);
+      int64_t const stride_y = (ci->max().z() - ci->min().z() + 1u);
+      int64_t const stride_x = (ci->max().y() - ci->min().y() + 1u) * stride_y;
+      auto const &lattice = get_lattice();
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto &block = *b;
+        auto const local_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                local_offset, block)) {
+          auto const lower_cell = bci->min();
+          auto const upper_cell = bci->max();
+          // In the loop, x,y,z are in block-local coordinates
+          // The field data given in the argument knows about BlockForest
+          // (lattice) indices from lower_corner to upper_corner It is converted
+          // to block-local coordinates The same applies to other set_slice
+          // methods
+          for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
+            for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
+              for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
+                auto const node = local_offset + Utils::Vector3i{{x, y, z}};
+                auto const index = stride_x * (node[0] - lower_corner[0]) +
+                                   stride_y * (node[1] - lower_corner[1]) +
+                                   node[2] - lower_corner[2];
+                auto const bc = get_block_and_cell(lattice, node, false);
+                assert(bc->block->getAABB() == block.getAABB());
+                auto const &opt = velocity[index];
+                if (opt) {
+                  m_boundary->set_node_value_at_boundary(
+                      node, to_vector3<FloatType>(*opt), *bc);
+                } else {
+                  m_boundary->remove_node_from_boundary(node, *bc);
+                }
+              }
             }
-            ++it;
           }
         }
       }
@@ -1410,7 +1756,10 @@ class LBWalberlaImpl : public LBWalberlaBase {
   }
 
   bool remove_node_from_boundary(Utils::Vector3i const &node) override {
-    auto bc = get_block_and_cell(get_lattice(), node, true);
+    auto bc = get_block_and_cell(get_lattice(), node, false);
+    if (!bc) {
+      bc = get_block_and_cell(get_lattice(), node, true);
+    }
     if (bc) {
       m_boundary->remove_node_from_boundary(node, *bc);
     }
@@ -1433,21 +1782,35 @@ class LBWalberlaImpl : public LBWalberlaBase {
                         Utils::Vector3i const &upper_corner) const override {
     std::vector<bool> out;
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
+      out = std::vector<bool>(ci->numCells());
+      int64_t const stride_y = (ci->max().z() - ci->min().z() + 1u);
+      int64_t const stride_x = (ci->max().y() - ci->min().y() + 1u) * stride_y;
       auto const &lattice = get_lattice();
-      auto const local_offset = std::get<0>(lattice.get_local_grid_range());
-      auto const lower_cell = ci->min();
-      auto const upper_cell = ci->max();
-      auto const n_values = ci->numCells();
-      out.reserve(n_values);
-      for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
-        for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
-          for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
-            auto const node = local_offset + Utils::Vector3i{x, y, z};
-            out.emplace_back(m_boundary->node_is_boundary(node));
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto const &block = *b;
+        auto const local_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                local_offset, block)) {
+          auto const lower_cell = bci->min();
+          auto const upper_cell = bci->max();
+          // In the loop, x,y,z are in block-local coordinates
+          // It is converted to BlockForest (lattice) coordinates assigned to a
+          // mpi rank The same applies to other get_slice methods
+          for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
+            for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
+              for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
+                auto const node = local_offset + Utils::Vector3i{{x, y, z}};
+                auto const index = stride_x * (node[0] - lower_corner[0]) +
+                                   stride_y * (node[1] - lower_corner[1]) +
+                                   node[2] - lower_corner[2];
+                out[index] = m_boundary->node_is_boundary(node);
+              }
+            }
           }
         }
       }
-      assert(out.size() == n_values);
+      assert(out.size() == ci->numCells());
     }
     return out;
   }
@@ -1501,20 +1864,49 @@ class LBWalberlaImpl : public LBWalberlaBase {
       Utils::Vector3i const &upper_corner) const override {
     std::vector<double> out;
     if (auto const ci = get_interval(lower_corner, upper_corner)) {
+      out = std::vector<double>(static_cast<unsigned int>(9u * ci->numCells()));
+      int64_t const stride_y = (ci->max().z() - ci->min().z() + 1u);
+      int64_t const stride_x = (ci->max().y() - ci->min().y() + 1u) * stride_y;
       auto const &lattice = get_lattice();
-      auto const &block = *(lattice.get_blocks()->begin());
-      auto const pdf_field = block.template getData<PdfField>(m_pdf_field_id);
-      auto values = lbm::accessor::PressureTensor::get(pdf_field, *ci);
-      assert(++(lattice.get_blocks()->begin()) == lattice.get_blocks()->end());
-      assert(values.size() == 9u * ci->numCells());
-      for (auto it = values.begin(); it != values.end(); std::advance(it, 9l)) {
-        pressure_tensor_correction(std::span<FloatType, 9ul>(it, 9ul));
-      }
-      if constexpr (std::is_same_v<typename decltype(values)::value_type,
-                                   double>) {
-        out = std::move(values);
-      } else {
-        out = std::vector<double>(values.begin(), values.end());
+      for (auto b = lattice.get_blocks()->begin();
+           b != lattice.get_blocks()->end(); ++b) {
+        auto const &block = *b;
+        auto const local_offset = to_vector3i(block.getAABB().min());
+        if (auto const bci = get_block_interval(lower_corner, upper_corner,
+                                                local_offset, block)) {
+          auto const pdf_field =
+              block.template getData<PdfField>(m_pdf_field_id);
+          auto values = lbm::accessor::PressureTensor::get(pdf_field, *bci);
+          assert(values.size() == 9u * bci->numCells());
+          int64_t const stride_ly = (bci->max().z() - bci->min().z() + 1u);
+          int64_t const stride_lx =
+              (bci->max().y() - bci->min().y() + 1u) * stride_ly;
+          auto const lower_cell = bci->min();
+          auto const upper_cell = bci->max();
+          // The field data "values" knows about block-local indices
+          // In the loop, x,y,z are in block-local coordinates
+          // It is converted to BlockForest (lattice) coordinates assigned to a
+          // mpi rank The same applies to other get_slice methods
+          for (auto x = lower_cell.x(); x <= upper_cell.x(); ++x) {
+            for (auto y = lower_cell.y(); y <= upper_cell.y(); ++y) {
+              for (auto z = lower_cell.z(); z <= upper_cell.z(); ++z) {
+                auto const node = local_offset + Utils::Vector3i{{x, y, z}};
+                auto const index = stride_x * (node[0] - lower_corner[0]) +
+                                   stride_y * (node[1] - lower_corner[1]) +
+                                   node[2] - lower_corner[2];
+                auto const local_index = stride_lx * (x - lower_cell.x()) +
+                                         stride_ly * (y - lower_cell.y()) + z -
+                                         lower_cell.z();
+                pressure_tensor_correction(std::span<FloatType, 9ul>(
+                    &values[static_cast<unsigned int>(9u * local_index)], 9ul));
+                for (uint_t f = 0u; f < 9u; ++f) {
+                  out[static_cast<unsigned int>(9u * index + f)] =
+                      values[static_cast<unsigned int>(9u * local_index + f)];
+                }
+              }
+            }
+          }
+        }
       }
     }
     return out;
diff --git a/src/walberla_bridge/src/utils/boundary.hpp b/src/walberla_bridge/src/utils/boundary.hpp
index 719c028aa4..dbd2a9ab25 100644
--- a/src/walberla_bridge/src/utils/boundary.hpp
+++ b/src/walberla_bridge/src/utils/boundary.hpp
@@ -85,15 +85,15 @@ void set_boundary_from_grid(BoundaryModel &boundary,
 
   auto const &conv = es2walberla<DataType, typename BoundaryModel::value_type>;
   auto const grid_size = lattice.get_grid_dimensions();
-  auto const offset = lattice.get_local_grid_range().first;
   auto const gl = static_cast<int>(lattice.get_ghost_layers());
   assert(raster_flat.size() ==
          static_cast<std::size_t>(Utils::product(grid_size)));
   auto const n_y = static_cast<std::size_t>(grid_size[1]);
   auto const n_z = static_cast<std::size_t>(grid_size[2]);
 
-  for (auto const &block : *lattice.get_blocks()) {
+  for (auto &block : *lattice.get_blocks()) {
     auto const [size_i, size_j, size_k] = boundary.block_dims(block);
+    auto const offset = to_vector3i(block.getAABB().min());
     // Get field data which knows about the indices
     // In the loop, i,j,k are in block-local coordinates
     for (int i = -gl; i < size_i + gl; ++i) {
@@ -106,8 +106,9 @@ void set_boundary_from_grid(BoundaryModel &boundary,
                              static_cast<std::size_t>(idx[2]);
           if (raster_flat[index]) {
             auto const &value = data_flat[index];
-            auto const bc = get_block_and_cell(lattice, node, true);
-            assert(bc.has_value());
+            std::optional<BlockAndCell> bc;
+            bc->block = &block;
+            bc->cell = Cell(i, j, k);
             boundary.set_node_value_at_boundary(node, conv(value), *bc);
           }
         }
diff --git a/src/walberla_bridge/src/utils/types_conversion.hpp b/src/walberla_bridge/src/utils/types_conversion.hpp
index 6f196cb57a..72968a25de 100644
--- a/src/walberla_bridge/src/utils/types_conversion.hpp
+++ b/src/walberla_bridge/src/utils/types_conversion.hpp
@@ -68,6 +68,10 @@ inline Utils::VectorXd<9> to_vector9d(Matrix3<float> const &m) {
           double_c(m[3]), double_c(m[4]), double_c(m[5]),
           double_c(m[6]), double_c(m[7]), double_c(m[8])};
 }
+inline Utils::Vector3i to_vector3i(Vector3<float> const &v) {
+  return Utils::Vector3i{
+      {static_cast<int>(v[0]), static_cast<int>(v[1]), static_cast<int>(v[2])}};
+}
 
 template <typename Function>
 void interpolate_bspline_at_pos(Utils::Vector3d const &pos, Function const &f) {
diff --git a/src/walberla_bridge/tests/CMakeLists.txt b/src/walberla_bridge/tests/CMakeLists.txt
index 7b3a85ab1b..c5d7805960 100644
--- a/src/walberla_bridge/tests/CMakeLists.txt
+++ b/src/walberla_bridge/tests/CMakeLists.txt
@@ -30,9 +30,12 @@ function(ESPRESSO_ADD_TEST)
                                                espresso::walberla_codegen_cuda)
   endif()
   if(${TEST_SRC} MATCHES ".*\.cu$")
-    target_link_libraries(${TEST_NAME} PRIVATE espresso::walberla::cuda_flags)
+    target_link_libraries(
+      ${TEST_NAME} PRIVATE espresso::walberla::cuda_flags
+                           espresso::walberla_cuda espresso::config)
   else()
-    target_link_libraries(${TEST_NAME} PRIVATE espresso::walberla::cpp_flags)
+    target_link_libraries(${TEST_NAME} PRIVATE espresso::walberla::cpp_flags
+                                               espresso::config)
   endif()
   set_target_properties(${TEST_NAME} PROPERTIES CXX_CLANG_TIDY "")
   target_include_directories(${TEST_NAME} PRIVATE ${WALBERLA_INCLUDE_DIRS}
@@ -50,6 +53,7 @@ espresso_add_test(SRC LBWalberlaImpl_unit_tests.cpp DEPENDS Boost::mpi NUM_PROC
 espresso_add_test(SRC LBWalberlaImpl_bspline_tests.cpp DEPENDS Boost::mpi
                   NUM_PROC 2)
 espresso_add_test(SRC LBWalberlaImpl_flow_tests.cpp DEPENDS Boost::mpi)
+espresso_configure_walberla_target(espresso_walberla_codegen)
 espresso_add_test(SRC LBWalberlaImpl_lees_edwards_tests.cpp DEPENDS Boost::mpi)
 espresso_add_test(SRC EKinWalberlaImpl_unit_tests.cpp DEPENDS Boost::mpi
                   NUM_PROC 2)
diff --git a/src/walberla_bridge/tests/EKinWalberlaImpl_unit_tests.cpp b/src/walberla_bridge/tests/EKinWalberlaImpl_unit_tests.cpp
index 210b5edb57..3e086d7c63 100644
--- a/src/walberla_bridge/tests/EKinWalberlaImpl_unit_tests.cpp
+++ b/src/walberla_bridge/tests/EKinWalberlaImpl_unit_tests.cpp
@@ -570,8 +570,8 @@ int main(int argc, char **argv) {
   params.ext_efield = Vector3d{0.01, 0.02, 0.03};
   params.grid_dimensions = Vector3i{12, 12, 18};
   params.box_dimensions = Vector3d{12, 12, 18};
-  params.lattice =
-      std::make_shared<LatticeWalberla>(params.grid_dimensions, mpi_shape, 1u);
+  params.lattice = std::make_shared<LatticeWalberla>(params.grid_dimensions,
+                                                     mpi_shape, mpi_shape, 1u);
 
   auto const res = boost::unit_test::unit_test_main(init_unit_test, argc, argv);
   MPI_Finalize();
diff --git a/src/walberla_bridge/tests/LBWalberlaImpl_bspline_tests.cpp b/src/walberla_bridge/tests/LBWalberlaImpl_bspline_tests.cpp
index a0123cbe67..3be29c54d1 100644
--- a/src/walberla_bridge/tests/LBWalberlaImpl_bspline_tests.cpp
+++ b/src/walberla_bridge/tests/LBWalberlaImpl_bspline_tests.cpp
@@ -156,8 +156,8 @@ int main(int argc, char **argv) {
   params.density = 1.4;
   params.grid_dimensions = Vector3i{12, 6, 9};
   params.box_dimensions = Vector3d{12, 6, 9};
-  params.lattice =
-      std::make_shared<LatticeWalberla>(params.grid_dimensions, mpi_shape, 1u);
+  params.lattice = std::make_shared<LatticeWalberla>(params.grid_dimensions,
+                                                     mpi_shape, mpi_shape, 1u);
 
   auto const res = boost::unit_test::unit_test_main(init_unit_test, argc, argv);
   MPI_Finalize();
diff --git a/src/walberla_bridge/tests/LBWalberlaImpl_field_accessors_tests.cu b/src/walberla_bridge/tests/LBWalberlaImpl_field_accessors_tests.cu
index 30c98ab2e7..f02c76c188 100644
--- a/src/walberla_bridge/tests/LBWalberlaImpl_field_accessors_tests.cu
+++ b/src/walberla_bridge/tests/LBWalberlaImpl_field_accessors_tests.cu
@@ -106,11 +106,13 @@ boost::test_tools::predicate_result almost_equal(R const &val, R const &ref,
   for (auto i = 0ul; i < val.size(); ++i) {
     if (auto const diff = std::abs(val[i] - ref[i]); diff > atol) {
       res = false;
-      res.message() << "val{" << print_first_n(val) << "} and " << "ref{"
-                    << print_first_n(ref) << "} mismatch: " << "val[" << i
-                    << "]{" << val[i] << "} != " << "ref[" << i << "]{"
-                    << ref[i] << "} " << "(difference{" << diff << "} > delta{"
-                    << atol << "})";
+      // clang-format off
+      res.message() << "val{" << print_first_n(val) << "} and "
+                    << "ref{" << print_first_n(ref) << "} mismatch: "
+		    << "val[" << i << "]{" << val[i] << "} != "
+		    << "ref[" << i << "]{" << ref[i] << "} "
+		    << "(difference{" << diff << "} > delta{" << atol << "})";
+      // clang-format on
       break;
     }
   }
@@ -156,7 +158,8 @@ template <typename FT, lbmpy::Arch Architecture> struct Fixture {
     auto const grid_dim = Utils::Vector3i::broadcast(4);
     auto const viscosity = FT(1.5);
     auto const density = FT(0.9);
-    lattice = std::make_shared<::LatticeWalberla>(grid_dim, mpi_shape, 1u);
+    lattice =
+        std::make_shared<::LatticeWalberla>(grid_dim, mpi_shape, mpi_shape, 1u);
     lbfluid = std::make_shared<LBWalberlaImplTest<FT, Architecture>>(
         lattice, viscosity, density);
   }
diff --git a/src/walberla_bridge/tests/LBWalberlaImpl_flow_tests.cpp b/src/walberla_bridge/tests/LBWalberlaImpl_flow_tests.cpp
index 36526ee3ce..96049bff27 100644
--- a/src/walberla_bridge/tests/LBWalberlaImpl_flow_tests.cpp
+++ b/src/walberla_bridge/tests/LBWalberlaImpl_flow_tests.cpp
@@ -167,8 +167,8 @@ int main(int argc, char **argv) {
   params.density = 1.4;
   params.grid_dimensions = Vector3i{12, 12, 18};
   params.box_dimensions = Vector3d{6, 6, 9};
-  params.lattice =
-      std::make_shared<LatticeWalberla>(params.grid_dimensions, mpi_shape, 1u);
+  params.lattice = std::make_shared<LatticeWalberla>(params.grid_dimensions,
+                                                     mpi_shape, mpi_shape, 1u);
 
   auto const res = boost::unit_test::unit_test_main(init_unit_test, argc, argv);
   MPI_Finalize();
diff --git a/src/walberla_bridge/tests/LBWalberlaImpl_lees_edwards_tests.cpp b/src/walberla_bridge/tests/LBWalberlaImpl_lees_edwards_tests.cpp
index 8e66ed037e..44667b4fa0 100644
--- a/src/walberla_bridge/tests/LBWalberlaImpl_lees_edwards_tests.cpp
+++ b/src/walberla_bridge/tests/LBWalberlaImpl_lees_edwards_tests.cpp
@@ -71,8 +71,8 @@ BOOST_AUTO_TEST_CASE(test_transient_shear) {
   using LBImplementation = walberla::LBWalberlaImpl<double, lbmpy::Arch::CPU>;
   double density = 1;
   double viscosity = 1. / 7.;
-  auto lattice =
-      std::make_shared<LatticeWalberla>(Vector3i{8, 64, 8}, mpi_shape, 1);
+  auto lattice = std::make_shared<LatticeWalberla>(Vector3i{8, 64, 8},
+                                                   mpi_shape, mpi_shape, 1);
   auto lb = LBImplementation(lattice, viscosity, density);
   auto le_pack = std::make_unique<LeesEdwardsPack>(
       0u, 1u, []() { return 0.0; }, [=]() { return v0; });
@@ -96,8 +96,8 @@ static auto setup_lb_with_offset(double offset) {
   using LBImplementation = walberla::LBWalberlaImpl<double, lbmpy::Arch::CPU>;
   auto density = 1.;
   auto viscosity = 1. / 7.;
-  auto lattice =
-      std::make_shared<LatticeWalberla>(Vector3i{10, 10, 10}, mpi_shape, 1);
+  auto lattice = std::make_shared<LatticeWalberla>(Vector3i{10, 10, 10},
+                                                   mpi_shape, mpi_shape, 1);
   auto lb = std::make_shared<LBImplementation>(lattice, viscosity, density);
   auto le_pack = std::make_unique<LeesEdwardsPack>(
       0u, 1u, [=]() { return offset; }, []() { return 0.0; });
diff --git a/src/walberla_bridge/tests/LBWalberlaImpl_statistical_tests.cpp b/src/walberla_bridge/tests/LBWalberlaImpl_statistical_tests.cpp
index 9732bc8a71..30e4b4b695 100644
--- a/src/walberla_bridge/tests/LBWalberlaImpl_statistical_tests.cpp
+++ b/src/walberla_bridge/tests/LBWalberlaImpl_statistical_tests.cpp
@@ -132,8 +132,8 @@ int main(int argc, char **argv) {
   params.density = 1.4;
   params.grid_dimensions = Vector3i{12, 12, 18};
   params.box_dimensions = Vector3d{6, 6, 9};
-  params.lattice =
-      std::make_shared<LatticeWalberla>(params.grid_dimensions, mpi_shape, 1u);
+  params.lattice = std::make_shared<LatticeWalberla>(params.grid_dimensions,
+                                                     mpi_shape, mpi_shape, 1u);
 
   auto const res = boost::unit_test::unit_test_main(init_unit_test, argc, argv);
   MPI_Finalize();
diff --git a/src/walberla_bridge/tests/LBWalberlaImpl_unit_tests.cpp b/src/walberla_bridge/tests/LBWalberlaImpl_unit_tests.cpp
index c3352fcbed..c473a4fc78 100644
--- a/src/walberla_bridge/tests/LBWalberlaImpl_unit_tests.cpp
+++ b/src/walberla_bridge/tests/LBWalberlaImpl_unit_tests.cpp
@@ -587,8 +587,8 @@ BOOST_DATA_TEST_CASE(vtk_exceptions,
 
 BOOST_AUTO_TEST_CASE(lb_exceptions) {
   using LB = walberla::LBWalberlaImpl<double, lbmpy::Arch::CPU>;
-  auto lb_lattice_without_ghosts =
-      std::make_shared<LatticeWalberla>(params.grid_dimensions, mpi_shape, 0u);
+  auto lb_lattice_without_ghosts = std::make_shared<LatticeWalberla>(
+      params.grid_dimensions, mpi_shape, mpi_shape, 0u);
   BOOST_CHECK_THROW(LB(lb_lattice_without_ghosts, 1., 1.), std::runtime_error);
 }
 
@@ -630,8 +630,8 @@ int main(int argc, char **argv) {
   params.density = 1.4;
   params.grid_dimensions = Vector3i{12, 12, 18};
   params.box_dimensions = Vector3d{12, 12, 18};
-  params.lattice =
-      std::make_shared<LatticeWalberla>(params.grid_dimensions, mpi_shape, 1u);
+  params.lattice = std::make_shared<LatticeWalberla>(params.grid_dimensions,
+                                                     mpi_shape, mpi_shape, 1u);
 
   auto const res = boost::unit_test::unit_test_main(init_unit_test, argc, argv);
   MPI_Finalize();
diff --git a/src/walberla_bridge/tests/LatticeWalberla_unit_tests.cpp b/src/walberla_bridge/tests/LatticeWalberla_unit_tests.cpp
index 977586ad89..8385981e93 100644
--- a/src/walberla_bridge/tests/LatticeWalberla_unit_tests.cpp
+++ b/src/walberla_bridge/tests/LatticeWalberla_unit_tests.cpp
@@ -52,8 +52,8 @@ static LatticeTestParameters params; // populated in main()
 static Vector3i mpi_shape;           // populated in main
 
 BOOST_DATA_TEST_CASE(domain_and_halo, bdata::xrange(3u), n_ghost_layers) {
-  auto const lattice =
-      LatticeWalberla(params.grid_dimensions, mpi_shape, n_ghost_layers);
+  auto const lattice = LatticeWalberla(params.grid_dimensions, mpi_shape,
+                                       mpi_shape, n_ghost_layers);
   auto const [my_left, my_right] = lattice.get_local_domain();
 
   for (auto const &n : all_nodes_incl_ghosts(lattice)) {
@@ -104,7 +104,7 @@ BOOST_AUTO_TEST_CASE(exceptions) {
     auto grid_dims = Vector3i::broadcast(1);
     grid_dims[i] = 3;
     node_grid[i] = 2;
-    BOOST_CHECK_THROW(LatticeWalberla(grid_dims, node_grid, 1u),
+    BOOST_CHECK_THROW(LatticeWalberla(grid_dims, node_grid, node_grid, 1u),
                       std::runtime_error);
   }
 }
diff --git a/testsuite/python/lb.py b/testsuite/python/lb.py
index a16ff6c5a3..8fad535b3b 100644
--- a/testsuite/python/lb.py
+++ b/testsuite/python/lb.py
@@ -517,7 +517,11 @@ def test_agrid_rounding(self):
         phi = 0.05
         lj_sig = 1.0
         l = (n_part * 4. / 3. * np.pi * (lj_sig / 2.)**3 / phi)**(1. / 3.)
-        system.box_l = [l] * 3 * np.array(system.cell_system.node_grid)
+        if hasattr(self, 'blocks_per_mpi_rank'):
+            system.box_l = [
+                l] * 3 * np.array(system.cell_system.node_grid) * np.array(self.blocks_per_mpi_rank)
+        else:
+            system.box_l = [l] * 3 * np.array(system.cell_system.node_grid)
         lbf = self.lb_class(agrid=l / 31, density=1, kinematic_viscosity=1, kT=0,
                             tau=system.time_step, **self.lb_params)
         system.lb = lbf
@@ -825,6 +829,25 @@ def params_with_tau(tau):
         np.testing.assert_allclose(v1, v2, rtol=1e-2)
         np.testing.assert_allclose(f1, f2, rtol=1e-2)
 
+    def test_raise_block_grid_mismatch(self):
+        if not hasattr(self, 'blocks_per_mpi_rank'):
+            self.skipTest(
+                "Skipping test: this test is only for the systme allocating multiple blocks to one mpi rank")
+        with self.assertRaisesRegex(RuntimeError, "Lattice grid dimensions and block grid are not compatible"):
+            self.lb_class(
+                **self.params, single_precision=self.lb_params["single_precision"], blocks_per_mpi_rank=[11, 1, 1])
+
+    @utx.skipIfMissingGPU()
+    def test_raise_blocks_for_GPU(self):
+        if self.lb_class != espressomd.lb.LBFluidWalberlaGPU:
+            self.skipTest(
+                "Skipping test: this test is only for LBFluidWalberlaGPU")
+        blocks_per_mpi_rank = [2, 2, 2]
+        self.lb_params = {"single_precision": False,
+                          "blocks_per_mpi_rank": blocks_per_mpi_rank}
+        with self.assertRaisesRegex(RuntimeError, "GPU architecture PROHIBITED allocating many blocks to 1 CPU"):
+            self.lb_class(**self.params, **self.lb_params)
+
 
 @utx.skipIfMissingFeatures("WALBERLA")
 class LBTestWalberlaDoublePrecisionCPU(LBTest, ut.TestCase):
@@ -864,5 +887,27 @@ class LBTestWalberlaSinglePrecisionGPU(LBTest, ut.TestCase):
     rtol = 2e-4
 
 
+@utx.skipIfMissingFeatures("WALBERLA")
+class LBTestWalberlaDoublePrecisionBlocksCPU(LBTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_lattice_class = espressomd.lb.LatticeWalberla
+    blocks_per_mpi_rank = [2, 2, 2]
+    lb_params = {"single_precision": False,
+                 "blocks_per_mpi_rank": blocks_per_mpi_rank}
+    atol = 1e-10
+    rtol = 1e-7
+
+
+@utx.skipIfMissingFeatures("WALBERLA")
+class LBTestWalberlaSinglePrecisionBlocksCPU(LBTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_lattice_class = espressomd.lb.LatticeWalberla
+    blocks_per_mpi_rank = [2, 2, 2]
+    lb_params = {"single_precision": True,
+                 "blocks_per_mpi_rank": blocks_per_mpi_rank}
+    atol = 1e-6
+    rtol = 2e-4
+
+
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/lb_boundary.py b/testsuite/python/lb_boundary.py
index 6ad5a6c0ad..7d46007335 100644
--- a/testsuite/python/lb_boundary.py
+++ b/testsuite/python/lb_boundary.py
@@ -125,5 +125,11 @@ class LBBoundariesWalberlaSinglePrecisionGPU(LBBoundariesBase, ut.TestCase):
     lb_params = {"single_precision": True}
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBBoundariesWalberlaDoublePrecisionCPU(LBBoundariesBase, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [2, 1, 1]}
+
+
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/lb_boundary_ghost_layer.py b/testsuite/python/lb_boundary_ghost_layer.py
index 84ce9180f0..46bcb36d3f 100644
--- a/testsuite/python/lb_boundary_ghost_layer.py
+++ b/testsuite/python/lb_boundary_ghost_layer.py
@@ -117,5 +117,12 @@ class LBPoiseuilleWalberlaDoublePrecisionGPU(TestCommon, ut.TestCase):
     lb_params = {"single_precision": False}
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+# @ut.skipIf(TestCommon.n_nodes != 2, "only runs for 2 MPI ranks")
+class LBPoiseuilleWalberlaDoublePrecisionBlocksCPU(TestCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [2, 1, 1]}
+
+
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/lb_boundary_volume_force.py b/testsuite/python/lb_boundary_volume_force.py
index 76beda388f..9f402839ba 100644
--- a/testsuite/python/lb_boundary_volume_force.py
+++ b/testsuite/python/lb_boundary_volume_force.py
@@ -111,5 +111,11 @@ class LBBoundaryForceWalberlaSinglePrecision(
     lb_params = {"single_precision": True}
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBBoundaryForceWalberlaBlocks(LBBoundaryForceCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [2, 2, 2]}
+
+
 if __name__ == '__main__':
     ut.main()
diff --git a/testsuite/python/lb_circular_couette.py b/testsuite/python/lb_circular_couette.py
index f16b6bbf24..2c9b1a1ad7 100644
--- a/testsuite/python/lb_circular_couette.py
+++ b/testsuite/python/lb_circular_couette.py
@@ -172,5 +172,17 @@ class LBCircularCouetteWalberlaSinglePrecisionGPU(LBCouetteTest, ut.TestCase):
     lb_params = {"single_precision": True}
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBCircularCouetteWalberlaDoublePRecisionBlocksCPU(LBCouetteTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [2, 2, 2]}
+
+
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBCircularCouetteWalberlaSinglePRecisionBlocksCPU(LBCouetteTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": True, "blocks_per_mpi_rank": [2, 2, 2]}
+
+
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/lb_couette_xy.py b/testsuite/python/lb_couette_xy.py
new file mode 100644
index 0000000000..742f03ff2c
--- /dev/null
+++ b/testsuite/python/lb_couette_xy.py
@@ -0,0 +1,147 @@
+#
+# Copyright (C) 2021-2023 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import espressomd.lb
+import espressomd.lees_edwards
+
+import unittest as ut
+import unittest_decorators as utx
+import numpy as np
+
+
+def analytical(x, t, nu, v, h, k_max):
+    """
+    Analytical solution with Fourier series of the Navier-Stokes equation.
+
+    Parameters
+    ----------
+    x : :obj:`float`
+        Height within the channel
+    t : :obj:`float`
+        Time since the start up of the shear flow
+    nu: :obj:`float`
+        Kinematic kinematic_viscosity
+    v: :obj:`float`
+        Shearing velocity
+    h : :obj:`float`
+        Distance between shear planes
+    k_max : :obj:`int`
+        Upper limit of sums for sinus series
+
+    """
+    u = x / h - 0.5
+    for k in np.arange(1, k_max + 1):
+        wave = 2 * np.pi * k / h
+        u += np.exp(-nu * wave ** 2 * t) * np.sin(wave * x) / (np.pi * k)
+    return v * u
+
+
+LB_PARAMS = {'agrid': 1.,
+             'density': 1.,
+             'kinematic_viscosity': 1. / 6.,
+             'tau': 1.}
+
+system = espressomd.System(box_l=[32, 32, 32])
+system.time_step = LB_PARAMS['tau']
+system.cell_system.skin = 0.1
+system.cell_system.set_n_square()
+n_nodes = np.prod(system.cell_system.node_grid)
+
+coord_indexes = {"x": 0, "y": 1, "z": 2}
+
+
+class LBCouetteFlowCommon:
+
+    def setUp(self):
+        system.time = 0.
+
+    # def tearDown(self):
+        system.lb = None
+        system.lees_edwards.protocol = None
+
+    def check_profile(self, u_getter, **kwargs):
+        # carefully select the domain decomposition
+        assert kwargs["shear_plane_normal"] == "y"
+        assert system.cell_system.node_grid[coord_indexes[kwargs["shear_direction"]]] == 1
+        h = system.box_l[coord_indexes[kwargs["shear_plane_normal"]]]
+        shear_velocity = 0.05
+        k_max = 100
+
+        protocol = espressomd.lees_edwards.LinearShear(
+            shear_velocity=shear_velocity, initial_pos_offset=0., time_0=0.)
+        system.lees_edwards.set_boundary_conditions(
+            protocol=protocol, **kwargs)
+        agrid = LB_PARAMS["agrid"]
+
+        lbf = self.lb_class(**LB_PARAMS, **self.lb_params)
+        system.lb = lbf
+
+        # warmup
+        system.integrator.run(8)
+
+        # sampling
+        for i in range(4, 9):
+            steps = (2**i - 2**(i - 1))
+            system.integrator.run(steps)
+            pos = np.array(range(int(h))) + agrid / 2.
+            u_ref = analytical(pos, system.time - 1., lbf.kinematic_viscosity,
+                               shear_velocity, h, k_max)
+            u_lbf = np.copy(u_getter(lbf).reshape([-1]))
+            np.testing.assert_allclose(u_lbf, u_ref, atol=1e-4, rtol=0.)
+
+    @ut.skipIf(n_nodes == 1, "test is designed to run on multiple MPI ranks")
+    @ut.expectedFailure
+    def test_profile_xy_divided_shear_direction(self):
+        system.cell_system.node_grid = [n_nodes, 1, 1]
+        self.check_profile(lambda lbf: lbf[5, :, 0].velocity[:, 0],
+                           shear_direction="x", shear_plane_normal="y")
+
+    @ut.skip("TODO: LB+Lees Edwards doesnt'work for certian node grids")  # TODO
+    @ut.skipIf(n_nodes == 1, "test is designed to run on multiple MPI ranks")
+    def test_profile_xy_divided_normal_direction(self):
+        system.cell_system.node_grid = [1, n_nodes, 1]
+        self.check_profile(lambda lbf: lbf[5, :, 0].velocity[:, 0],
+                           shear_direction="x", shear_plane_normal="y")
+
+    def test_profile_xy_divided_z_direction(self):
+        system.cell_system.node_grid = [1, 1, n_nodes]
+        self.check_profile(lambda lbf: lbf[5, :, 0].velocity[:, 0],
+                           shear_direction="x", shear_plane_normal="y")
+
+
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBCouetteFlowWalberla(LBCouetteFlowCommon, ut.TestCase):
+
+    """Test for the Walberla implementation of the LB in double-precision."""
+
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False}
+
+
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBCouetteFlowWalberlaSinglePrecision(LBCouetteFlowCommon, ut.TestCase):
+
+    """Test for the Walberla implementation of the LB in single-precision."""
+
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": True}
+
+
+if __name__ == '__main__':
+    ut.main()
diff --git a/testsuite/python/lb_interpolation.py b/testsuite/python/lb_interpolation.py
index 3a2bd9a491..f67fcdf268 100644
--- a/testsuite/python/lb_interpolation.py
+++ b/testsuite/python/lb_interpolation.py
@@ -55,6 +55,7 @@ class LBInterpolation:
     system = espressomd.System(box_l=[BOX_L] * 3)
     system.cell_system.skin = 0.4 * AGRID
     system.time_step = TIME_STEP
+    system.periodicity = [False, True, True]
 
     def setUp(self):
         self.lbf = self.lb_class(**LB_PARAMETERS, **self.lb_params)
@@ -187,5 +188,17 @@ class LBInterpolationWalberlaSinglePrecisionGPU(LBInterpolation, ut.TestCase):
     lb_params = {"single_precision": True}
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBInterpolationWalberlaDoublePrecisionBlocksCPU(LBInterpolation, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [2, 2, 2]}
+
+
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBInterpolationWalberlaSinglePrecisionBlocksCPU(LBInterpolation, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": True, "blocks_per_mpi_rank": [2, 2, 2]}
+
+
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/lb_mass_conservation.py b/testsuite/python/lb_mass_conservation.py
index fcbbab66b6..15d4be7f29 100644
--- a/testsuite/python/lb_mass_conservation.py
+++ b/testsuite/python/lb_mass_conservation.py
@@ -41,7 +41,7 @@ class LBMassCommon:
 
     """Check the lattice-Boltzmann mass conservation."""
 
-    system = espressomd.System(box_l=[3.0, 3.0, 3.0])
+    system = espressomd.System(box_l=[6.0, 6.0, 6.0])
     system.time_step = TIME_STEP
     system.cell_system.skin = 0.4 * AGRID
 
@@ -96,5 +96,14 @@ class LBMassWalberlaSinglePrecisionGPU(LBMassCommon, ut.TestCase):
     atol = 5e-7
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBMassWalberlaDoublePrecisionBlocksCPU(LBMassCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    blocks_per_mpi_rank = [2, 2, 2]
+    lb_params = {"single_precision": False,
+                 "blocks_per_mpi_rank": blocks_per_mpi_rank}
+    atol = 1e-10
+
+
 if __name__ == '__main__':
     ut.main()
diff --git a/testsuite/python/lb_momentum_conservation.py b/testsuite/python/lb_momentum_conservation.py
index 0d72f83ec5..f64c0543a5 100644
--- a/testsuite/python/lb_momentum_conservation.py
+++ b/testsuite/python/lb_momentum_conservation.py
@@ -218,5 +218,19 @@ def set_cellsystem(self):
         self.system.cell_system.set_n_square()
 
 
+@ut.skipIf(TestLBMomentumConservation.n_nodes != 1,
+           "LB with regular decomposition already tested with 2 MPI ranks")
+@utx.skipIfMissingFeatures(["WALBERLA", "EXTERNAL_FORCES"])
+class TestLBMomentumConservationRegularDoublePrecisionWalberlaBlocksCPU(
+        TestLBMomentumConservation, ut.TestCase):
+
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [2, 2, 2]}
+    atol = 1.2e-4
+
+    def set_cellsystem(self):
+        self.system.cell_system.set_regular_decomposition()
+
+
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/lb_planar_couette.py b/testsuite/python/lb_planar_couette.py
index 7295128b86..6edda76921 100644
--- a/testsuite/python/lb_planar_couette.py
+++ b/testsuite/python/lb_planar_couette.py
@@ -116,6 +116,9 @@ def test_profile_xy(self):
 
     @ut.skipIf(n_nodes > 1, "Skipping test: only runs for n_nodes == 1")
     def test_profile_zy(self):
+        if hasattr(self, 'blocks_per_mpi_rank'):
+            self.skipTest(
+                "Skipping test: only runs for blocks_per_mpi_rank=[1,1,1]")
         self.check_profile(lambda lbf: lbf[0, :, 5].velocity[:, 0],
                            shear_direction="z", shear_plane_normal="y")
 
@@ -142,5 +145,18 @@ class LBCouetteFlowWalberlaSinglePrecision(LBCouetteFlowCommon, ut.TestCase):
     lb_params = {"single_precision": True}
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+@ut.skipIf(LBCouetteFlowCommon.n_nodes > 2,
+           "Skipping test: only runs for n_nodes <= 2")
+class LBCouetteFlowWalberlaBlocks(LBCouetteFlowCommon, ut.TestCase):
+
+    """Test for the Walberla implementation of the LB in double-precision."""
+
+    lb_class = espressomd.lb.LBFluidWalberla
+    blocks_per_mpi_rank = [2, 1, 1]
+    lb_params = {"single_precision": False,
+                 "blocks_per_mpi_rank": blocks_per_mpi_rank}
+
+
 if __name__ == '__main__':
     ut.main()
diff --git a/testsuite/python/lb_poiseuille.py b/testsuite/python/lb_poiseuille.py
index 9a4178d7af..4b259653e7 100644
--- a/testsuite/python/lb_poiseuille.py
+++ b/testsuite/python/lb_poiseuille.py
@@ -117,6 +117,7 @@ def test_profile(self):
                                      EXT_FORCE,
                                      KINEMATIC_VISC * DENS)
         np.testing.assert_allclose(v_measured, v_expected, rtol=5E-5)
+        # np.testing.assert_allclose(v_measured, v_expected, rtol=5E-5, atol=8E-4)
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
@@ -145,5 +146,17 @@ class LBPoiseuilleWalberlaSinglePrecisionGPU(LBPoiseuilleCommon, ut.TestCase):
     lb_params = {"single_precision": True}
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBPoiseuilleWalberlaDoublePrecisionBlocksCPU(LBPoiseuilleCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [2, 2, 2]}
+
+
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBPoiseuilleWalberlaSinglePrecisionBlocksCPU(LBPoiseuilleCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": True, "blocks_per_mpi_rank": [2, 2, 2]}
+
+
 if __name__ == '__main__':
     ut.main()
diff --git a/testsuite/python/lb_poiseuille_cylinder.py b/testsuite/python/lb_poiseuille_cylinder.py
index 4499f8661d..aa6493b48c 100644
--- a/testsuite/python/lb_poiseuille_cylinder.py
+++ b/testsuite/python/lb_poiseuille_cylinder.py
@@ -222,5 +222,11 @@ class LBPoiseuilleWalberlaSinglePrecisionGPU(LBPoiseuilleCommon, ut.TestCase):
     lb_params = {"single_precision": True}
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBPoiseuilleWalberlaDoublePrecisionBlocksCPU(LBPoiseuilleCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [2, 2, 2]}
+
+
 if __name__ == '__main__':
     ut.main()
diff --git a/testsuite/python/lb_pressure_tensor.py b/testsuite/python/lb_pressure_tensor.py
index 0c0b4df3a6..263ff9295f 100644
--- a/testsuite/python/lb_pressure_tensor.py
+++ b/testsuite/python/lb_pressure_tensor.py
@@ -156,6 +156,14 @@ class TestLBPressureTensorCPU(TestLBPressureTensor, ut.TestCase):
     steps = 5000
 
 
+@utx.skipIfMissingFeatures("WALBERLA")
+class TestLBPressureTensorBlocksCPU(TestLBPressureTensor, ut.TestCase):
+
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": True, "blocks_per_mpi_rank": [2, 2, 2]}
+    steps = 5000
+
+
 # TODO WALBERLA
 """
 @utx.skipIfMissingFeatures("WALBERLA")
diff --git a/testsuite/python/lb_shear.py b/testsuite/python/lb_shear.py
index fef0838ba6..1b7cf59a1f 100644
--- a/testsuite/python/lb_shear.py
+++ b/testsuite/python/lb_shear.py
@@ -31,7 +31,7 @@
 # Box size will be H +2 AGRID to make room for walls.
 # The number of grid cells should be divisible by four and 3 in all directions
 # for testing on multiple mpi nodes.
-H = 12 * AGRID
+H = 10 * AGRID
 W = 6 * AGRID
 SHEAR_VELOCITY = 0.3
 
@@ -85,7 +85,7 @@ class LBShearCommon:
     system.cell_system.skin = 0.4 * AGRID
 
     def setUp(self):
-        self.lbf = self.lb_class(**LB_PARAMS, **self.lb_params)
+        self.system.lb = None
 
     def tearDown(self):
         self.system.lb = None
@@ -96,9 +96,14 @@ def check_profile(self, shear_plane_normal, shear_direction):
         the exact solution.
         """
         self.tearDown()
-        self.system.box_l = np.max(
-            ((W, W, W), shear_plane_normal * (H + 2 * AGRID)), 0)
-        self.setUp()
+        if hasattr(self, 'blocks_per_mpi_rank'):
+            self.system.box_l = np.max(
+                ((W, W, W) * np.array(self.blocks_per_mpi_rank),
+                 shear_plane_normal * (H + 2 * AGRID) * np.array(self.blocks_per_mpi_rank)), 0)
+        else:
+            self.system.box_l = np.max(
+                ((W, W, W), shear_plane_normal * (H + 2 * AGRID)), 0)
+        self.lbf = self.lb_class(**LB_PARAMS, **self.lb_params)
         self.system.lb = self.lbf
         self.lbf.clear_boundaries()
 
@@ -204,5 +209,18 @@ class LBShearWalberlaSinglePrecision(LBShearCommon, ut.TestCase):
     rtol = 5e-3
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBShearWalberlaBlocks(LBShearCommon, ut.TestCase):
+
+    """Test for the Walberla implementation of the LB in double-precision."""
+
+    lb_class = espressomd.lb.LBFluidWalberla
+    blocks_per_mpi_rank = [2, 2, 2]
+    lb_params = {"single_precision": False,
+                 "blocks_per_mpi_rank": blocks_per_mpi_rank}
+    atol = 5e-5
+    rtol = 5e-4
+
+
 if __name__ == '__main__':
     ut.main()
diff --git a/testsuite/python/lb_slice.py b/testsuite/python/lb_slice.py
index 09a49dc4bd..c2a43def65 100644
--- a/testsuite/python/lb_slice.py
+++ b/testsuite/python/lb_slice.py
@@ -200,5 +200,19 @@ class LBTestWalberlaSinglePrecisionGPU(LBTest, ut.TestCase):
     lb_params = {"single_precision": True}
 
 
+@utx.skipIfMissingFeatures("WALBERLA")
+class LBTestWalberlaDoublePrecisionBlocksCPU(LBTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_lattice_class = espressomd.lb.LatticeWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [1, 1, 2]}
+
+
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBTestWalberlaSinglePrecisionBlocksCPU(LBTest, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_lattice_class = espressomd.lb.LatticeWalberla
+    lb_params = {"single_precision": True, "blocks_per_mpi_rank": [1, 1, 2]}
+
+
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/lb_streaming.py b/testsuite/python/lb_streaming.py
index ad9fefa350..8798d1474f 100644
--- a/testsuite/python/lb_streaming.py
+++ b/testsuite/python/lb_streaming.py
@@ -163,5 +163,13 @@ class LBStreamingWalberlaSinglePrecisionGPU(LBStreamingCommon, ut.TestCase):
     rtol = 1e-5
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBStreamingWalberlaDoublePrecisionBlocksCPU(LBStreamingCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [1, 2, 2]}
+    box_l = [3., 2., 2.]
+    rtol = 1e-10
+
+
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/lb_thermostat.py b/testsuite/python/lb_thermostat.py
index 53de0ea8ca..a5027cf699 100644
--- a/testsuite/python/lb_thermostat.py
+++ b/testsuite/python/lb_thermostat.py
@@ -245,5 +245,11 @@ class LBThermostatWalberlaSinglePrecisionGPU(LBThermostatCommon, ut.TestCase):
     lb_params = {"single_precision": True}
 
 
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBThermostatWalberlaDoublePrecisionBlocksCPU(LBThermostatCommon, ut.TestCase):
+    lb_class = espressomd.lb.LBFluidWalberla
+    lb_params = {"single_precision": False, "blocks_per_mpi_rank": [2, 2, 2]}
+
+
 if __name__ == '__main__':
     ut.main()