
Commit

Merge remote-tracking branch 'origin/stack/ogl_0.6' into stack/ogl_0.6
greole committed Feb 3, 2025
2 parents 6932c08 + 26f98a0 commit c577ed5
Showing 14 changed files with 263 additions and 187 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -79,6 +79,7 @@ adaptMinIter | true | based on the previous solution set minIter to be relaxatio
relaxationFactor | 0.8 | use relaxationFactor*previousIters as new minIters
scaling | 1.0 | Scale the complete system by the scaling factor
forceHostBuffer | false | whether to copy to host before MPI calls
splitComm | true | whether to split the communicator into separate host- and device-side communicators
export | false | write the complete system (matrix and rhs) to disk as .mtx file using controlDict/writeControl
writeGlobal | false | convert all indices to global indices when exporting .mtx files

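For orientation, here is a hedged sketch of how controls like these reach OGL: they are read from the solver controls dictionary with lookupOrDefault. The keys and defaults are copied from the lookupOrDefault calls in ExecutorHandler.hpp further down in this commit; the wrapping function itself is hypothetical.

#include "dictionary.H"

// Sketch only: keys and defaults mirror the reads in ExecutorHandler.hpp below.
void readOglControls(const Foam::dictionary &solverControls)
{
    using namespace Foam;

    const word executor =
        solverControls.lookupOrDefault("executor", word("reference"));
    const label verbose = solverControls.lookupOrDefault("verbose", label(0));
    const label ranksPerGPU =
        solverControls.lookupOrDefault("ranksPerGPU", label(1));
    const bool forceHostBuffer =
        solverControls.lookupOrDefault("forceHostBuffer", false);
    const bool splitComm =
        solverControls.lookupOrDefault("splitMPIComm", true);

    // these values are then handed to the ExecutorHandler / DeviceIdHandler
    (void)executor;
    (void)verbose;
    (void)ranksPerGPU;
    (void)forceHostBuffer;
    (void)splitComm;
}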
9 changes: 3 additions & 6 deletions include/OGL/CommunicationPattern.hpp
@@ -117,6 +117,9 @@ std::vector<label> gather_labels_to_owner(const ExecutorHandler &exec_handler,
const label *send_buffer,
label send_size, label offset = 0);

/* @class CommunicationPattern
* @brief Struct to store communication-related data
*/
struct CommunicationPattern {
using comm_size_type = label;

@@ -149,12 +152,6 @@ struct CommunicationPattern {
}
}

const gko::experimental::mpi::communicator &get_comm() const
{
return *exec_handler.get_communicator().get();
}


/* @brief concatenate all separate send idxs arrays into one contiguous
* array
*/
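The hunk above shows only the doc comment of the concatenation helper, not its body. As a rough illustration of what concatenating per-partner send index arrays into one contiguous buffer can look like, here is a minimal standalone C++ sketch; the function name, the offsets layout, and using int as a stand-in for label are assumptions for this example, not OGL's actual implementation.

#include <cstddef>
#include <utility>
#include <vector>

using label = int;  // stand-in for OGL's label type

// Concatenate one index array per communication partner into a single
// contiguous buffer, and record where each partner's block starts.
std::pair<std::vector<label>, std::vector<label>> concatenate(
    const std::vector<std::vector<label>> &send_idxs)
{
    std::vector<label> offsets(send_idxs.size() + 1, 0);
    for (std::size_t i = 0; i < send_idxs.size(); ++i) {
        offsets[i + 1] = offsets[i] + static_cast<label>(send_idxs[i].size());
    }
    std::vector<label> flat;
    flat.reserve(offsets.back());
    for (const auto &idxs : send_idxs) {
        flat.insert(flat.end(), idxs.begin(), idxs.end());
    }
    return {flat, offsets};
}

int main()
{
    auto [flat, offsets] = concatenate({{0, 1}, {4}, {2, 3, 5}});
    // flat    == {0, 1, 4, 2, 3, 5}
    // offsets == {0, 2, 3, 6}
    return (flat.size() == 6 && offsets.back() == 6) ? 0 : 1;
}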
181 changes: 120 additions & 61 deletions include/OGL/DevicePersistent/ExecutorHandler.hpp
@@ -13,26 +13,81 @@

namespace Foam {

struct ExecutorInitFunctor {
mutable std::shared_ptr<gko::experimental::mpi::communicator> comm_;

const label device_id_;
struct DeviceIdHandler {
// ratio of host ranks to active device-side ranks, i.e. how many host ranks share one GPU
label ranks_per_gpu;

/*
* @param ranks_per_gpu_in ratio between host ranks and active device ranks
* on a node, i.e. if ranks_per_gpu == 1 all host ranks are active on the
* GPU, which might lead to oversubscription; if ranks_per_gpu == 2 only
* every second rank is active, and so on.
*/
DeviceIdHandler(label ranks_per_gpu_in) : ranks_per_gpu(ranks_per_gpu_in)
{
bool par_run = Pstream::parRun();
if (!par_run) {
FatalErrorInFunction << "Only parallel runs are supported for OGL"
<< exit(FatalError);
}
}

/* @brief compute the local device id
*
* @param num_devices_per_node number of devices per node
* @returns the local (per-node) device id for this rank
*/
label compute_device_id(label num_devices_per_node) const
{
// if no devices are present the device id is always zero
if (num_devices_per_node == 0) {
return 0;
}
label global_rank = Pstream::myProcNo();
// clang-format off
/* example: machine with 4 cpu cores and 2 accelerators per node, 2 nodes
* global ranks                 [0, 1, 2, 3 | 4, 5, 6, 7]
* global device id w/o repart  [0, 1, 2, 3 | 4, 5, 6, 7]
* global device id w repart    [0, x, 1, x | 2, x, 3, x] x-inactive
* local device_id w/o repart   [0, 1, 0, 1 | 0, 1, 0, 1]
* local device_id w repart     [0, x, 1, x | 0, x, 1, x] x-inactive
*/
// clang-format on

// global_id rpg = 1: [0, 1, 2, 3 | 4, 5, 6, 7]
// global_id rpg = 2: [0, 0, 1, 1 | 2, 2, 3, 3]
label device_global_id = global_rank / ranks_per_gpu;

// compute the local round-robin id:
// device_global_id modulo num_devices_per_node
return device_global_id % num_devices_per_node;
}

/* @brief compute the group id for the split communicator
*
* @returns 0 if this rank owns a device, 1 otherwise
*/
label compute_group() const
{
label rank = Pstream::myProcNo();
label owner_rank = rank - (rank % ranks_per_gpu);
bool is_owner = owner_rank == rank;
return (is_owner) ? 0 : 1;
}
};

struct ExecutorInitFunctor {
const DeviceIdHandler device_id_handler_;

const word executor_name_;

const word field_name_;

const label verbose_;

ExecutorInitFunctor(bool par_run, const word executor_name,
const word field_name, const label verbose,
const label gpus_per_rank,
const bool force_host_buffer = false)
: comm_((par_run) // TODO make this DRY
? std::make_shared<gko::experimental::mpi::communicator>(
MPI_COMM_WORLD, force_host_buffer)
: NULL),
device_id_((par_run) ? comm_->rank() / gpus_per_rank : 0),
ExecutorInitFunctor(const word executor_name, const word field_name,
const label verbose,
const DeviceIdHandler device_id_handler)
: device_id_handler_(device_id_handler),
executor_name_(executor_name),
field_name_(field_name),
verbose_(verbose)
@@ -47,17 +102,24 @@ struct ExecutorInitFunctor {
{
auto host_exec = gko::share(gko::ReferenceExecutor::create());

auto msg = [](auto exec, auto id) {
std::string s;
s += std::string("Create ") + std::string(exec) +
std::string(" executor on device ") + std::to_string(id) +
std::string(" on rank ") + std::to_string(Pstream::myProcNo());
return s;
};

if (executor_name_ == "cuda") {
if (version.cuda_version.tag == not_compiled_tag) {
FatalErrorInFunction
<< "CUDA Backend was not compiled. Recompile OGL/Ginkgo "
"with CUDA backend enabled."
<< abort(FatalError);
}
label id = device_id_ % gko::CudaExecutor::get_num_devices();
word msg = "Create CUDA executor on device " + std::to_string(id) +
" on rank " + std::to_string(comm_->rank());
LOG_0(verbose_, msg)
label id = device_id_handler_.compute_device_id(
gko::CudaExecutor::get_num_devices());
LOG_0(verbose_, msg(executor_name_, id))
return gko::share(gko::CudaExecutor::create(id, host_exec));
}
if (executor_name_ == "sycl" || executor_name_ == "dpcpp") {
@@ -67,21 +129,10 @@
"with SYCL backend enabled."
<< abort(FatalError);
}

if (executor_name_ == "dpcpp") {
Info << "Warning: the executor name dpcpp is deprecated.\n"
<< "Use sycl as the executor name for the same executor "
"instead."
<< endl;
}

auto devices = gko::DpcppExecutor::get_num_devices("gpu");
if (devices == 0) {
return gko::share(gko::DpcppExecutor::create(0, host_exec));
} else {
return gko::share(gko::DpcppExecutor::create(
device_id_ % devices, host_exec));
}
label id = device_id_handler_.compute_device_id(
gko::DpcppExecutor::get_num_devices("gpu"));
LOG_0(verbose_, msg(executor_name_, id))
return gko::share(gko::DpcppExecutor::create(id, host_exec));
}
if (executor_name_ == "hip") {
if (version.hip_version.tag == not_compiled_tag) {
@@ -90,10 +141,9 @@
"with HIP backend enabled."
<< abort(FatalError);
}
label id = device_id_ % gko::HipExecutor::get_num_devices();
word msg = "Create HIP executor on device " + std::to_string(id) +
" on rank " + std::to_string(comm_->rank());
LOG_0(verbose_, msg)
label id = device_id_handler_.compute_device_id(
gko::HipExecutor::get_num_devices());
LOG_0(verbose_, msg(executor_name_, id))
auto ret = gko::share(gko::HipExecutor::create(id, host_exec));
return ret;
}
@@ -125,37 +175,52 @@ class ExecutorHandler

const bool non_orig_device_comm_;

const bool par_run_;
const bool split_comm_;

mutable std::shared_ptr<gko::experimental::mpi::communicator> host_comm_;

const bool host_rank_;

mutable std::shared_ptr<gko::experimental::mpi::communicator> device_comm_;

const word device_executor_name_;

public:
ExecutorHandler(const objectRegistry &db, const dictionary &solverControls,
const word field_name, bool par_run = Pstream::parRun())
const word field_name, DeviceIdHandler device_id_handler
)
: PersistentBase<gko::Executor, ExecutorInitFunctor>(
solverControls.lookupOrDefault("executor", word("reference")) +
+"_" + field_name,
db,
ExecutorInitFunctor(
par_run,
solverControls.lookupOrDefault("executor", word("reference")),
field_name,
solverControls.lookupOrDefault("verbose", label(0)),
solverControls.lookupOrDefault("ranksPerGPU", label(1)),
solverControls.lookupOrDefault("forceHostBuffer", false)),
device_id_handler
),
true, 0),
gko_force_host_buffer_(
solverControls.lookupOrDefault("forceHostBuffer", false)),
non_orig_device_comm_(
solverControls.lookupOrDefault("MPIxRankOffload", false)),
par_run_(par_run),
split_comm_(
solverControls.lookupOrDefault("splitMPIComm", true)),
host_comm_(std::make_shared<gko::experimental::mpi::communicator>(
MPI_COMM_WORLD, gko_force_host_buffer_)),
host_rank_(host_comm_->rank()),
device_comm_(
(par_run_)
? std::make_shared<gko::experimental::mpi::communicator>(
MPI_COMM_WORLD, gko_force_host_buffer_)
: NULL),
(split_comm_)
? [this, device_id_handler](){
label group = device_id_handler.compute_group();
MPI_Comm gko_comm;
label host_rank = 0;
MPI_Comm_split(MPI_COMM_WORLD, group, host_rank, &gko_comm);

return std::make_shared<gko::experimental::mpi::communicator>(
gko_comm, gko_force_host_buffer_);
}()
: host_comm_),
device_executor_name_(
solverControls.lookupOrDefault("executor", word("reference")))
{}
@@ -168,10 +233,7 @@ class ExecutorHandler
/* whether the MPI implementation allows sending data directly to a remote
* rank via pair-wise communication
*/
bool get_non_orig_device_comm() const
{
return non_orig_device_comm_;
}
bool get_non_orig_device_comm() const { return non_orig_device_comm_; }

const std::shared_ptr<gko::Executor> get_device_exec() const
{
@@ -185,26 +247,23 @@

word get_exec_name() const { return device_executor_name_; }

std::shared_ptr<gko::experimental::mpi::communicator>
get_gko_mpi_host_comm() const
{
return std::make_shared<gko::experimental::mpi::communicator>(
MPI_COMM_WORLD, gko_force_host_buffer_);
}

std::shared_ptr<const gko::experimental::mpi::communicator>
get_gko_mpi_device_comm() const
get_device_comm() const
{
return this->device_comm_;
}

std::shared_ptr<const gko::experimental::mpi::communicator>
get_communicator() const
std::shared_ptr<const gko::experimental::mpi::communicator> get_host_comm()
const
{
return this->device_comm_;
return this->host_comm_;
}

label get_rank() const { return get_communicator()->rank(); };
bool get_split_comm() const { return split_comm_; };

label get_host_rank() const { return get_host_comm()->rank(); };

label get_device_rank() const { return get_device_comm()->rank(); };
};

using PersistentExecutor = ExecutorHandler;
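To see the round-robin device assignment and the new communicator split end to end, here is a minimal standalone MPI sketch. The arithmetic in compute_device_id and compute_group is copied from the DeviceIdHandler hunk above; everything else (the program itself, ranks_per_gpu = 2, devices_per_node = 2) is an illustrative assumption and not part of this commit.

#include <mpi.h>

#include <cstdio>

// Same arithmetic as DeviceIdHandler::compute_device_id above.
int compute_device_id(int rank, int ranks_per_gpu, int num_devices_per_node)
{
    if (num_devices_per_node == 0) return 0;
    int device_global_id = rank / ranks_per_gpu;
    return device_global_id % num_devices_per_node;
}

// Same arithmetic as DeviceIdHandler::compute_group above.
int compute_group(int rank, int ranks_per_gpu)
{
    int owner_rank = rank - (rank % ranks_per_gpu);
    return (owner_rank == rank) ? 0 : 1;  // 0: owns a device, 1: inactive
}

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    const int ranks_per_gpu = 2;     // assumed ranksPerGPU setting
    const int devices_per_node = 2;  // assumed accelerators per node

    int device_id = compute_device_id(rank, ranks_per_gpu, devices_per_node);
    int group = compute_group(rank, ranks_per_gpu);

    // Mirror of the split used to build device_comm_: owner ranks (group 0)
    // end up in one communicator, inactive ranks (group 1) in another.
    MPI_Comm split_comm;
    MPI_Comm_split(MPI_COMM_WORLD, group, /*key=*/0, &split_comm);

    int split_rank = 0;
    MPI_Comm_rank(split_comm, &split_rank);
    std::printf("world rank %d: device id %d, group %d, split rank %d\n",
                rank, device_id, group, split_rank);

    MPI_Comm_free(&split_comm);
    MPI_Finalize();
    return 0;
}

Run with mpirun -np 8, this reproduces the "w repart" rows of the comment block above: ranks 0, 2, 4 and 6 report group 0 and local device ids 0, 1, 0, 1, while the odd ranks land in group 1.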
10 changes: 5 additions & 5 deletions include/OGL/DevicePersistent/Vector.hpp
@@ -53,7 +53,7 @@ struct VectorInitFunctor {
{
auto ref_exec = exec_.get_ref_exec();
auto exec = exec_.get_device_exec();
auto comm = exec_.get_communicator();
auto comm = exec_.get_host_comm();
auto repartitioner = dist_matrix_->get_repartitioner();
auto host_size = repartitioner->get_orig_size();
auto repart_size = repartitioner->get_repart_size();
@@ -78,7 +78,6 @@
{
auto exec = exec_.get_device_exec();
auto ref_exec = exec_.get_ref_exec();
auto comm = exec_.get_communicator();
auto repartitioner = dist_matrix_->get_repartitioner();
auto host_size = repartitioner->get_orig_size();
auto repart_size = repartitioner->get_repart_size();
@@ -100,8 +99,9 @@
communicate_values(exec_, comm_pattern, host_view.get_const_data(),
values.get_data());

auto device_comm = exec_.get_device_comm();
auto ret = gko::share(dist_vec::create(
exec, *comm.get(),
exec, *device_comm.get(),
vec::create(
exec, gko::dim<2>{static_cast<gko::size_type>(repart_size), 1},
values, 1)));
@@ -174,9 +174,9 @@ class PersistentVector
void copy_back()
{
auto exec = exec_.get_device_exec();
auto rank = exec_.get_rank();
auto rank = exec_.get_host_rank();
auto ref_exec = exec_.get_ref_exec();
auto comm = exec_.get_communicator();
auto comm = exec_.get_host_comm();
bool host_buffer = !exec_.get_non_orig_device_comm();

auto repartitioner = dist_matrix_->get_repartitioner();
11 changes: 6 additions & 5 deletions include/OGL/Repartitioner.hpp
@@ -63,11 +63,12 @@ class Repartitioner {
exec_handler)),
ranks_per_gpu_(ranks_per_gpu),
verbose_(verbose),
// TODO we build the partition from the local size before repartitioning,
// thus we need the host comm to compute the correct sizes
orig_partition_(gko::share(
gko::experimental::distributed::build_partition_from_local_size<
label, label>(exec_handler.get_ref_exec(),
*exec_handler.get_communicator().get(),
size))){};
*exec_handler.get_host_comm().get(), size))){};

/* returns the owner rank for a given rank */
label get_owner_rank(label rank) const
@@ -78,14 +79,14 @@
/* returns the owner rank for the current rank of the given exec_handler */
label get_owner_rank(const ExecutorHandler &exec_handler) const
{
return get_owner_rank(exec_handler.get_rank());
return get_owner_rank(exec_handler.get_host_rank());
};

/* returns whether the current rank is an owner */
bool is_owner(const ExecutorHandler &exec_handler) const
{
return exec_handler.get_rank() ==
get_owner_rank(exec_handler.get_rank());
return exec_handler.get_host_rank() ==
get_owner_rank(exec_handler.get_host_rank());
};

/* @brief check if the given rank gets local after repartitioning
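The body of Repartitioner::get_owner_rank is not shown in this hunk. Assuming it follows the same convention as DeviceIdHandler::compute_group above (the owner is the rank rounded down to a multiple of ranksPerGPU), a small self-contained check of the is_owner logic could look like this; the helper name and ranks_per_gpu = 2 are assumptions for illustration.

#include <cstdio>

// Assumed owner-rank convention, consistent with
// DeviceIdHandler::compute_group in ExecutorHandler.hpp above; the actual
// Repartitioner::get_owner_rank implementation is not shown in this hunk.
int owner_rank(int rank, int ranks_per_gpu)
{
    return rank - (rank % ranks_per_gpu);
}

int main()
{
    const int ranks_per_gpu = 2;  // assumed setting
    for (int rank = 0; rank < 8; ++rank) {
        bool is_owner = owner_rank(rank, ranks_per_gpu) == rank;
        std::printf("rank %d -> owner %d%s\n", rank,
                    owner_rank(rank, ranks_per_gpu),
                    is_owner ? " (owner)" : "");
    }
    return 0;
}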
