
Commit

Merge remote-tracking branch 'origin/stack/ogl_0.6' into stack/ogl_0.6
greole committed Feb 3, 2025
2 parents 6932c08 + 26f98a0 commit c577ed5
Showing 14 changed files with 263 additions and 187 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -79,6 +79,7 @@ adaptMinIter | true | based on the previous solution set minIter to be relaxatio
relaxationFactor | 0.8 | use relaxationFactor*previousIters as new minIters
scaling | 1.0 | Scale the complete system by the scaling factor
forceHostBuffer | false | whether to copy to host before MPI calls
splitComm | true | whether to split the communicator into separate host- and device-side communicators
export | false | write the complete system (matrix and rhs) to disk as .mtx file using controlDict/writeControl
writeGlobal | false | convert all indices to global indices when exporting .mtx files

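For orientation, here is a hedged sketch of how controls like these reach OGL: they are read from the solver controls dictionary with lookupOrDefault. The keys and defaults are copied from the lookupOrDefault calls in ExecutorHandler.hpp further down in this commit; the wrapping function itself is hypothetical.

#include "dictionary.H"

// Sketch only: keys and defaults mirror the reads in ExecutorHandler.hpp below.
void readOglControls(const Foam::dictionary &solverControls)
{
    using namespace Foam;

    const word executor =
        solverControls.lookupOrDefault("executor", word("reference"));
    const label verbose = solverControls.lookupOrDefault("verbose", label(0));
    const label ranksPerGPU =
        solverControls.lookupOrDefault("ranksPerGPU", label(1));
    const bool forceHostBuffer =
        solverControls.lookupOrDefault("forceHostBuffer", false);
    const bool splitComm =
        solverControls.lookupOrDefault("splitMPIComm", true);

    // these values are then handed to the ExecutorHandler / DeviceIdHandler
    (void)executor;
    (void)verbose;
    (void)ranksPerGPU;
    (void)forceHostBuffer;
    (void)splitComm;
}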
9 changes: 3 additions & 6 deletions include/OGL/CommunicationPattern.hpp
@@ -117,6 +117,9 @@ std::vector<label> gather_labels_to_owner(const ExecutorHandler &exec_handler,
const label *send_buffer,
label send_size, label offset = 0);

/* @class CommunicationPattern
* @brief Struct to store communication-related data
*/
struct CommunicationPattern {
using comm_size_type = label;

@@ -149,12 +152,6 @@ struct CommunicationPattern {
}
}

const gko::experimental::mpi::communicator &get_comm() const
{
return *exec_handler.get_communicator().get();
}


/* @brief concatenate all separate send idxs arrays into one contiguous
* array
*/
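The hunk above shows only the doc comment of the concatenation helper, not its body. As a rough illustration of what concatenating per-partner send index arrays into one contiguous buffer can look like, here is a minimal standalone C++ sketch; the function name, the offsets layout, and using int as a stand-in for label are assumptions for this example, not OGL's actual implementation.

#include <cstddef>
#include <utility>
#include <vector>

using label = int;  // stand-in for OGL's label type

// Concatenate one index array per communication partner into a single
// contiguous buffer, and record where each partner's block starts.
std::pair<std::vector<label>, std::vector<label>> concatenate(
    const std::vector<std::vector<label>> &send_idxs)
{
    std::vector<label> offsets(send_idxs.size() + 1, 0);
    for (std::size_t i = 0; i < send_idxs.size(); ++i) {
        offsets[i + 1] = offsets[i] + static_cast<label>(send_idxs[i].size());
    }
    std::vector<label> flat;
    flat.reserve(offsets.back());
    for (const auto &idxs : send_idxs) {
        flat.insert(flat.end(), idxs.begin(), idxs.end());
    }
    return {flat, offsets};
}

int main()
{
    auto [flat, offsets] = concatenate({{0, 1}, {4}, {2, 3, 5}});
    // flat    == {0, 1, 4, 2, 3, 5}
    // offsets == {0, 2, 3, 6}
    return (flat.size() == 6 && offsets.back() == 6) ? 0 : 1;
}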
181 changes: 120 additions & 61 deletions include/OGL/DevicePersistent/ExecutorHandler.hpp
@@ -13,26 +13,81 @@

namespace Foam {

struct ExecutorInitFunctor {
mutable std::shared_ptr<gko::experimental::mpi::communicator> comm_;

const label device_id_;
struct DeviceIdHandler {
// ratio of host ranks to active device-side ranks, i.e. how many host ranks share one GPU
label ranks_per_gpu;

/*
* @param ranks_per_gpu_in ratio between host ranks and active device ranks
* on a node, i.e. if ranks_per_gpu == 1 all host ranks are active on the
* GPU, which might lead to oversubscription; if ranks_per_gpu == 2 only
* every second rank is active, and so on.
*/
DeviceIdHandler(label ranks_per_gpu_in) : ranks_per_gpu(ranks_per_gpu_in)
{
bool par_run = Pstream::parRun();
if (!par_run) {
FatalErrorInFunction << "Only parallel runs are supported for OGL"
<< exit(FatalError);
}
}

/* @brief compute the local device id
*
* @param num_devices_per_node number of devices per node
* @returns the local (per-node) device id for this rank
*/
label compute_device_id(label num_devices_per_node) const
{
// if no devices are present the device id is always zero
if (num_devices_per_node == 0) {
return 0;
}
label global_rank = Pstream::myProcNo();
// clang-format off
/* example: machine with 4 cpu cores and 2 accelerators per node, 2 nodes
* global ranks                 [0, 1, 2, 3 | 4, 5, 6, 7]
* global device id w/o repart  [0, 1, 2, 3 | 4, 5, 6, 7]
* global device id w repart    [0, x, 1, x | 2, x, 3, x] x-inactive
* local device_id w/o repart   [0, 1, 0, 1 | 0, 1, 0, 1]
* local device_id w repart     [0, x, 1, x | 0, x, 1, x] x-inactive
*/
// clang-format on

// global_id rpg = 1: [0, 1, 2, 3 | 4, 5, 6, 7]
// global_id rpg = 2: [0, 0, 1, 1 | 2, 2, 3, 3]
label device_global_id = global_rank / ranks_per_gpu;

// compute the local round-robin id:
// device_global_id modulo num_devices_per_node
return device_global_id % num_devices_per_node;
}

/* @brief compute the group id for the split communicator
*
* @returns 0 if this rank owns a device, 1 otherwise
*/
label compute_group() const
{
label rank = Pstream::myProcNo();
label owner_rank = rank - (rank % ranks_per_gpu);
bool is_owner = owner_rank == rank;
return (is_owner) ? 0 : 1;
}
};

struct ExecutorInitFunctor {
const DeviceIdHandler device_id_handler_;

const word executor_name_;

const word field_name_;

const label verbose_;

ExecutorInitFunctor(bool par_run, const word executor_name,
const word field_name, const label verbose,
const label gpus_per_rank,
const bool force_host_buffer = false)
: comm_((par_run) // TODO make this DRY
? std::make_shared<gko::experimental::mpi::communicator>(
MPI_COMM_WORLD, force_host_buffer)
: NULL),
device_id_((par_run) ? comm_->rank() / gpus_per_rank : 0),
ExecutorInitFunctor(const word executor_name, const word field_name,
const label verbose,
const DeviceIdHandler device_id_handler)
: device_id_handler_(device_id_handler),
executor_name_(executor_name),
field_name_(field_name),
verbose_(verbose)
@@ -47,17 +102,24 @@ struct ExecutorInitFunctor {
{
auto host_exec = gko::share(gko::ReferenceExecutor::create());

auto msg = [](auto exec, auto id) {
std::string s;
s += std::string("Create ") + std::string(exec) +
std::string(" executor on device ") + std::to_string(id) +
std::string(" on rank ") + std::to_string(Pstream::myProcNo());
return s;
};

if (executor_name_ == "cuda") {
if (version.cuda_version.tag == not_compiled_tag) {
FatalErrorInFunction
<< "CUDA Backend was not compiled. Recompile OGL/Ginkgo "
"with CUDA backend enabled."
<< abort(FatalError);
}
label id = device_id_ % gko::CudaExecutor::get_num_devices();
word msg = "Create CUDA executor on device " + std::to_string(id) +
" on rank " + std::to_string(comm_->rank());
LOG_0(verbose_, msg)
label id = device_id_handler_.compute_device_id(
gko::CudaExecutor::get_num_devices());
LOG_0(verbose_, msg(executor_name_, id))
return gko::share(gko::CudaExecutor::create(id, host_exec));
}
if (executor_name_ == "sycl" || executor_name_ == "dpcpp") {
@@ -67,21 +129,10 @@
"with SYCL backend enabled."
<< abort(FatalError);
}

if (executor_name_ == "dpcpp") {
Info << "Warning: the executor name dpcpp is deprecated.\n"
<< "Use sycl as the executor name for the same executor "
"instead."
<< endl;
}

auto devices = gko::DpcppExecutor::get_num_devices("gpu");
if (devices == 0) {
return gko::share(gko::DpcppExecutor::create(0, host_exec));
} else {
return gko::share(gko::DpcppExecutor::create(
device_id_ % devices, host_exec));
}
label id = device_id_handler_.compute_device_id(
gko::DpcppExecutor::get_num_devices("gpu"));
LOG_0(verbose_, msg(executor_name_, id))
return gko::share(gko::DpcppExecutor::create(id, host_exec));
}
if (executor_name_ == "hip") {
if (version.hip_version.tag == not_compiled_tag) {
@@ -90,10 +141,9 @@
"with HIP backend enabled."
<< abort(FatalError);
}
label id = device_id_ % gko::HipExecutor::get_num_devices();
word msg = "Create HIP executor on device " + std::to_string(id) +
" on rank " + std::to_string(comm_->rank());
LOG_0(verbose_, msg)
label id = device_id_handler_.compute_device_id(
gko::HipExecutor::get_num_devices());
LOG_0(verbose_, msg(executor_name_, id))
auto ret = gko::share(gko::HipExecutor::create(id, host_exec));
return ret;
}
@@ -125,37 +175,52 @@ class ExecutorHandler

const bool non_orig_device_comm_;

const bool par_run_;
const bool split_comm_;

mutable std::shared_ptr<gko::experimental::mpi::communicator> host_comm_;

const bool host_rank_;

mutable std::shared_ptr<gko::experimental::mpi::communicator> device_comm_;

const word device_executor_name_;

public:
ExecutorHandler(const objectRegistry &db, const dictionary &solverControls,
const word field_name, bool par_run = Pstream::parRun())
const word field_name, DeviceIdHandler device_id_handler
)
: PersistentBase<gko::Executor, ExecutorInitFunctor>(
solverControls.lookupOrDefault("executor", word("reference")) +
+"_" + field_name,
db,
ExecutorInitFunctor(
par_run,
solverControls.lookupOrDefault("executor", word("reference")),
field_name,
solverControls.lookupOrDefault("verbose", label(0)),
solverControls.lookupOrDefault("ranksPerGPU", label(1)),
solverControls.lookupOrDefault("forceHostBuffer", false)),
device_id_handler
),
true, 0),
gko_force_host_buffer_(
solverControls.lookupOrDefault("forceHostBuffer", false)),
non_orig_device_comm_(
solverControls.lookupOrDefault("MPIxRankOffload", false)),
par_run_(par_run),
split_comm_(
solverControls.lookupOrDefault("splitMPIComm", true)),
host_comm_(std::make_shared<gko::experimental::mpi::communicator>(
MPI_COMM_WORLD, gko_force_host_buffer_)),
host_rank_(host_comm_->rank()),
device_comm_(
(par_run_)
? std::make_shared<gko::experimental::mpi::communicator>(
MPI_COMM_WORLD, gko_force_host_buffer_)
: NULL),
(split_comm_)
? [this, device_id_handler](){
label group = device_id_handler.compute_group();
MPI_Comm gko_comm;
label host_rank = 0;
MPI_Comm_split(MPI_COMM_WORLD, group, host_rank, &gko_comm);

return std::make_shared<gko::experimental::mpi::communicator>(
gko_comm, gko_force_host_buffer_);
}()
: host_comm_),
device_executor_name_(
solverControls.lookupOrDefault("executor", word("reference")))
{}
@@ -168,10 +233,7 @@ class ExecutorHandler
/* whether the MPI implementation allows sending data directly to a remote
* rank via pair-wise communication
*/
bool get_non_orig_device_comm() const
{
return non_orig_device_comm_;
}
bool get_non_orig_device_comm() const { return non_orig_device_comm_; }

const std::shared_ptr<gko::Executor> get_device_exec() const
{
@@ -185,26 +247,23 @@

word get_exec_name() const { return device_executor_name_; }

std::shared_ptr<gko::experimental::mpi::communicator>
get_gko_mpi_host_comm() const
{
return std::make_shared<gko::experimental::mpi::communicator>(
MPI_COMM_WORLD, gko_force_host_buffer_);
}

std::shared_ptr<const gko::experimental::mpi::communicator>
get_gko_mpi_device_comm() const
get_device_comm() const
{
return this->device_comm_;
}

std::shared_ptr<const gko::experimental::mpi::communicator>
get_communicator() const
std::shared_ptr<const gko::experimental::mpi::communicator> get_host_comm()
const
{
return this->device_comm_;
return this->host_comm_;
}

label get_rank() const { return get_communicator()->rank(); };
bool get_split_comm() const { return split_comm_; };

label get_host_rank() const { return get_host_comm()->rank(); };

label get_device_rank() const { return get_device_comm()->rank(); };
};

using PersistentExecutor = ExecutorHandler;
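To see the round-robin device assignment and the new communicator split end to end, here is a minimal standalone MPI sketch. The arithmetic in compute_device_id and compute_group is copied from the DeviceIdHandler hunk above; everything else (the program itself, ranks_per_gpu = 2, devices_per_node = 2) is an illustrative assumption and not part of this commit.

#include <mpi.h>

#include <cstdio>

// Same arithmetic as DeviceIdHandler::compute_device_id above.
int compute_device_id(int rank, int ranks_per_gpu, int num_devices_per_node)
{
    if (num_devices_per_node == 0) return 0;
    int device_global_id = rank / ranks_per_gpu;
    return device_global_id % num_devices_per_node;
}

// Same arithmetic as DeviceIdHandler::compute_group above.
int compute_group(int rank, int ranks_per_gpu)
{
    int owner_rank = rank - (rank % ranks_per_gpu);
    return (owner_rank == rank) ? 0 : 1;  // 0: owns a device, 1: inactive
}

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    const int ranks_per_gpu = 2;     // assumed ranksPerGPU setting
    const int devices_per_node = 2;  // assumed accelerators per node

    int device_id = compute_device_id(rank, ranks_per_gpu, devices_per_node);
    int group = compute_group(rank, ranks_per_gpu);

    // Mirror of the split used to build device_comm_: owner ranks (group 0)
    // end up in one communicator, inactive ranks (group 1) in another.
    MPI_Comm split_comm;
    MPI_Comm_split(MPI_COMM_WORLD, group, /*key=*/0, &split_comm);

    int split_rank = 0;
    MPI_Comm_rank(split_comm, &split_rank);
    std::printf("world rank %d: device id %d, group %d, split rank %d\n",
                rank, device_id, group, split_rank);

    MPI_Comm_free(&split_comm);
    MPI_Finalize();
    return 0;
}

Run with mpirun -np 8, this reproduces the "w repart" rows of the comment block above: ranks 0, 2, 4 and 6 report group 0 and local device ids 0, 1, 0, 1, while the odd ranks land in group 1.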
10 changes: 5 additions & 5 deletions include/OGL/DevicePersistent/Vector.hpp
@@ -53,7 +53,7 @@ struct VectorInitFunctor {
{
auto ref_exec = exec_.get_ref_exec();
auto exec = exec_.get_device_exec();
auto comm = exec_.get_communicator();
auto comm = exec_.get_host_comm();
auto repartitioner = dist_matrix_->get_repartitioner();
auto host_size = repartitioner->get_orig_size();
auto repart_size = repartitioner->get_repart_size();
@@ -78,7 +78,6 @@
{
auto exec = exec_.get_device_exec();
auto ref_exec = exec_.get_ref_exec();
auto comm = exec_.get_communicator();
auto repartitioner = dist_matrix_->get_repartitioner();
auto host_size = repartitioner->get_orig_size();
auto repart_size = repartitioner->get_repart_size();
@@ -100,8 +99,9 @@
communicate_values(exec_, comm_pattern, host_view.get_const_data(),
values.get_data());

auto device_comm = exec_.get_device_comm();
auto ret = gko::share(dist_vec::create(
exec, *comm.get(),
exec, *device_comm.get(),
vec::create(
exec, gko::dim<2>{static_cast<gko::size_type>(repart_size), 1},
values, 1)));
@@ -174,9 +174,9 @@ class PersistentVector
void copy_back()
{
auto exec = exec_.get_device_exec();
auto rank = exec_.get_rank();
auto rank = exec_.get_host_rank();
auto ref_exec = exec_.get_ref_exec();
auto comm = exec_.get_communicator();
auto comm = exec_.get_host_comm();
bool host_buffer = !exec_.get_non_orig_device_comm();

auto repartitioner = dist_matrix_->get_repartitioner();
11 changes: 6 additions & 5 deletions include/OGL/Repartitioner.hpp
@@ -63,11 +63,12 @@ class Repartitioner {
exec_handler)),
ranks_per_gpu_(ranks_per_gpu),
verbose_(verbose),
// TODO we build the partition from the local size before repartitioning,
// thus we need the host comm to compute the correct sizes
orig_partition_(gko::share(
gko::experimental::distributed::build_partition_from_local_size<
label, label>(exec_handler.get_ref_exec(),
*exec_handler.get_communicator().get(),
size))){};
*exec_handler.get_host_comm().get(), size))){};

/* returns the owner rank for a given rank */
label get_owner_rank(label rank) const
@@ -78,14 +79,14 @@
/* returns the owner rank for the current rank of the given exec_handler */
label get_owner_rank(const ExecutorHandler &exec_handler) const
{
return get_owner_rank(exec_handler.get_rank());
return get_owner_rank(exec_handler.get_host_rank());
};

/* returns whether the current rank is an owner */
bool is_owner(const ExecutorHandler &exec_handler) const
{
return exec_handler.get_rank() ==
get_owner_rank(exec_handler.get_rank());
return exec_handler.get_host_rank() ==
get_owner_rank(exec_handler.get_host_rank());
};

/* @brief check if the given rank gets local after repartitioning
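The body of Repartitioner::get_owner_rank is not shown in this hunk. Assuming it follows the same convention as DeviceIdHandler::compute_group above (the owner is the rank rounded down to a multiple of ranksPerGPU), a small self-contained check of the is_owner logic could look like this; the helper name and ranks_per_gpu = 2 are assumptions for illustration.

#include <cstdio>

// Assumed owner-rank convention, consistent with
// DeviceIdHandler::compute_group in ExecutorHandler.hpp above; the actual
// Repartitioner::get_owner_rank implementation is not shown in this hunk.
int owner_rank(int rank, int ranks_per_gpu)
{
    return rank - (rank % ranks_per_gpu);
}

int main()
{
    const int ranks_per_gpu = 2;  // assumed setting
    for (int rank = 0; rank < 8; ++rank) {
        bool is_owner = owner_rank(rank, ranks_per_gpu) == rank;
        std::printf("rank %d -> owner %d%s\n", rank,
                    owner_rank(rank, ranks_per_gpu),
                    is_owner ? " (owner)" : "");
    }
    return 0;
}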
