diff --git a/docs/sphinx/api/languages/python_api.rst b/docs/sphinx/api/languages/python_api.rst index d84ddc427c..3c26a73e6d 100644 --- a/docs/sphinx/api/languages/python_api.rst +++ b/docs/sphinx/api/languages/python_api.rst @@ -157,6 +157,8 @@ Data Types .. autoclass:: cudaq.operator.cudm_state.CuDensityMatState :members: +.. autoclass:: cudaq.operator.helpers.InitialState + .. autofunction:: cudaq.operator.cudm_state.to_cupy_array .. autoclass:: cudaq::SampleResult diff --git a/docs/sphinx/examples/python/dynamics/mgmn/initial_state.py b/docs/sphinx/examples/python/dynamics/mgmn/initial_state.py new file mode 100644 index 0000000000..1bd06a8f94 --- /dev/null +++ b/docs/sphinx/examples/python/dynamics/mgmn/initial_state.py @@ -0,0 +1,71 @@ +import cudaq +from cudaq import operators, spin, Schedule, RungeKuttaIntegrator + +import numpy as np +import matplotlib.pyplot as plt +import os + +# On a system with multiple GPUs, `mpiexec` can be used as follows: +# `mpiexec -np python3 multi_gpu.py ` +cudaq.mpi.initialize() + +# Set the target to our dynamics simulator +cudaq.set_target("dynamics") + +# Large number of spins +N = 20 +dimensions = {} +for i in range(N): + dimensions[i] = 2 + +# Observable is the average magnetization operator +avg_magnetization_op = operators.zero() +for i in range(N): + avg_magnetization_op += (spin.z(i) / N) + +# Arbitrary coupling constant +g = 1.0 +# Construct the Hamiltonian +H = operators.zero() +for i in range(N): + H += 2 * np.pi * spin.x(i) + H += 2 * np.pi * spin.y(i) +for i in range(N - 1): + H += 2 * np.pi * g * spin.x(i) * spin.x(i + 1) + H += 2 * np.pi * g * spin.y(i) * spin.z(i + 1) + +steps = np.linspace(0.0, 1, 200) +schedule = Schedule(steps, ["time"]) + +# Initial state (expressed as an enum) +psi0 = cudaq.operator.InitialState.ZERO +# This can also be used to initialize a uniformly-distributed wave-function instead. +# `psi0 = cudaq.operator.InitialState.UNIFORM` + +# Run the simulation +evolution_result = cudaq.evolve(H, + dimensions, + schedule, + psi0, + observables=[avg_magnetization_op], + collapse_operators=[], + store_intermediate_results=True, + integrator=RungeKuttaIntegrator()) + +exp_val = [ + exp_vals[0].expectation() + for exp_vals in evolution_result.expectation_values() +] + +if cudaq.mpi.rank() == 0: + # Plot the results + fig = plt.figure(figsize=(12, 6)) + plt.plot(steps, exp_val) + plt.ylabel("Average Magnetization") + plt.xlabel("Time") + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + fig.savefig("spin_model.png", dpi=fig.dpi) + +cudaq.mpi.finalize() diff --git a/docs/sphinx/examples/python/dynamics/mgmn/multi_gpu.py b/docs/sphinx/examples/python/dynamics/mgmn/multi_gpu.py new file mode 100644 index 0000000000..06f9ef7645 --- /dev/null +++ b/docs/sphinx/examples/python/dynamics/mgmn/multi_gpu.py @@ -0,0 +1,94 @@ +import cudaq +from cudaq import operators, spin, Schedule, RungeKuttaIntegrator + +import numpy as np +import cupy as cp +import matplotlib.pyplot as plt +import os + +# On a system with multiple GPUs, `mpiexec` can be used as follows: +# `mpiexec -np python3 multi_gpu.py ` +cudaq.mpi.initialize() + +# Set the target to our dynamics simulator +cudaq.set_target("dynamics") + +# In this example, we solve the Quantum Heisenberg model (https://en.wikipedia.org/wiki/Quantum_Heisenberg_model), +# which exhibits the so-called quantum quench effect. +# e.g., see `Quantum quenches in the anisotropic spin-1/2 Heisenberg chain: different approaches to many-body dynamics far from equilibrium` +# (New J. Phys. 12 055017) +# Large number of spins +N = 21 +dimensions = {} +for i in range(N): + dimensions[i] = 2 + +# Initial state: alternating spin up and down +spin_state = '' +for i in range(N): + spin_state += str(int(i % 2)) + +# Observable is the staggered magnetization operator +staggered_magnetization_op = operators.zero() +for i in range(N): + if i % 2 == 0: + staggered_magnetization_op += spin.z(i) + else: + staggered_magnetization_op -= spin.z(i) + +staggered_magnetization_op /= N + +observe_results = [] +for g in [0.25, 4.0]: + # Heisenberg model spin coupling strength + Jx = 1.0 + Jy = 1.0 + Jz = g + + # Construct the Hamiltonian + H = operators.zero() + + for i in range(N - 1): + H += Jx * spin.x(i) * spin.x(i + 1) + H += Jy * spin.y(i) * spin.y(i + 1) + H += Jz * spin.z(i) * spin.z(i + 1) + + steps = np.linspace(0.0, 1, 100) + schedule = Schedule(steps, ["time"]) + + # Prepare the initial state vector + psi0_ = cp.zeros(2**N, dtype=cp.complex128) + psi0_[int(spin_state, 2)] = 1.0 + psi0 = cudaq.State.from_data(psi0_) + + # Run the simulation + evolution_result = cudaq.evolve(H, + dimensions, + schedule, + psi0, + observables=[staggered_magnetization_op], + collapse_operators=[], + store_intermediate_results=True, + integrator=RungeKuttaIntegrator()) + + exp_val = [ + exp_vals[0].expectation() + for exp_vals in evolution_result.expectation_values() + ] + + observe_results.append((g, exp_val)) + +if cudaq.mpi.rank() == 0: + # Plot the results + fig = plt.figure(figsize=(12, 6)) + for g, exp_val in observe_results: + plt.plot(steps, exp_val, label=f'$ g = {g}$') + plt.legend(fontsize=16) + plt.ylabel("Staggered Magnetization") + plt.xlabel("Time") + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + fig.savefig("heisenberg_model.png", dpi=fig.dpi) + +cudaq.mpi.finalize() diff --git a/docs/sphinx/snippets/python/using/backends/dynamics.py b/docs/sphinx/snippets/python/using/backends/dynamics.py index aa94c745f9..6e71395dff 100644 --- a/docs/sphinx/snippets/python/using/backends/dynamics.py +++ b/docs/sphinx/snippets/python/using/backends/dynamics.py @@ -172,3 +172,44 @@ def compute_value(param_name, step_idx): time_dependence = parameter_values(numpy.linspace(0, 1, 100)) cudaq.evolve(system_operator, system_dimensions, time_dependence, initial_state) #[End Schedule2] + +import cudaq +from cudaq import operators, spin, Schedule, RungeKuttaIntegrator + +N = 4 +dimensions = {} +for i in range(N): + dimensions[i] = 2 +g = 1.0 +H = operators.zero() +for i in range(N): + H += 2 * np.pi * spin.x(i) + H += 2 * np.pi * spin.y(i) +for i in range(N - 1): + H += 2 * np.pi * g * spin.x(i) * spin.x(i + 1) + H += 2 * np.pi * g * spin.y(i) * spin.z(i + 1) + +steps = np.linspace(0.0, 1, 200) +schedule = Schedule(steps, ["time"]) + +#[Begin MPI] +cudaq.mpi.initialize() + +# Set the target to our dynamics simulator +cudaq.set_target("dynamics") + +# Initial state (expressed as an enum) +psi0 = cudaq.operator.InitialState.ZERO + +# Run the simulation +evolution_result = cudaq.evolve(H, + dimensions, + schedule, + psi0, + observables=[], + collapse_operators=[], + store_intermediate_results=True, + integrator=RungeKuttaIntegrator()) + +cudaq.mpi.finalize() +#[End MPI] diff --git a/docs/sphinx/using/backends/dynamics.rst b/docs/sphinx/using/backends/dynamics.rst index a5b8c056b8..6e6e48e567 100644 --- a/docs/sphinx/using/backends/dynamics.rst +++ b/docs/sphinx/using/backends/dynamics.rst @@ -84,6 +84,8 @@ For example, we can plot the Pauli expectation value for the above simulation as In particular, for each time step, `evolve` captures an array of expectation values, one for each observable. Hence, we convert them into sequences for plotting purposes. +Examples that illustrate how to use the ``dynamics`` target are available +in the `CUDA-Q repository `__. Operator +++++++++++ @@ -272,4 +274,45 @@ backend target. If the output is a '`None`' string, it indicates that your Torch installation does not support CUDA. In this case, you need to install a CUDA-enabled Torch package via other mechanisms, e.g., building Torch from source or using their Docker images. - \ No newline at end of file + +Multi-GPU Multi-Node Execution ++++++++++++++++++++++++++++++++ + +.. _cudensitymat_mgmn: + +CUDA-Q ``dynamics`` target supports parallel execution on multiple GPUs. +To enable parallel execution, the application must initialize MPI as follows. + + +.. tab:: Python + + .. literalinclude:: ../../snippets/python/using/backends/dynamics.py + :language: python + :start-after: [Begin MPI] + :end-before: [End MPI] + + .. code:: bash + + mpiexec -np python3 program.py + + where ``N`` is the number of processes. + + +By initializing the MPI execution environment (via `cudaq.mpi.initialize()`) in the application code and +invoking it via an MPI launcher, we have activated the multi-node multi-GPU feature of the ``dynamics`` target. +Specifically, it will detect the number of processes (GPUs) and distribute the computation across all available GPUs. + + +.. note:: + The number of MPI processes must be a power of 2, one GPU per process. + +.. note:: + Not all integrators are capable of handling distributed state. Errors will be raised if parallel execution is activated + but the selected integrator does not support distributed state. + +.. warning:: + As of cuQuantum version 24.11, there are a couple of `known limitations `__ for parallel execution: + + - Computing the expectation value of a mixed quantum state is not supported. Thus, `collapse_operators` are not supported if expectation calculation is required. + + - Some combinations of quantum states and quantum many-body operators are not supported. Errors will be raised in those cases. diff --git a/python/cudaq/operator/__init__.py b/python/cudaq/operator/__init__.py index 1ddad5562a..c46aed64bd 100644 --- a/python/cudaq/operator/__init__.py +++ b/python/cudaq/operator/__init__.py @@ -9,5 +9,5 @@ from .definitions import operators, spin from .evolution import evolve, evolve_async from .expressions import Operator, OperatorSum, ProductOperator, ElementaryOperator, ScalarOperator, RydbergHamiltonian -from .helpers import NumericType +from .helpers import NumericType, InitialState from .schedule import Schedule diff --git a/python/cudaq/operator/cudm_solver.py b/python/cudaq/operator/cudm_solver.py index 85a9cc536b..f1dad8c35d 100644 --- a/python/cudaq/operator/cudm_solver.py +++ b/python/cudaq/operator/cudm_solver.py @@ -16,6 +16,7 @@ from ..mlir._mlir_libs._quakeDialects import cudaq_runtime from .cudm_helpers import cudm, CudmStateType from .cudm_state import CuDensityMatState, as_cudm_state +from .helpers import InitialState, InitialStateArgT from .integrator import BaseIntegrator from .integrators.builtin_integrators import RungeKuttaIntegrator, cuDensityMatTimeStepper import cupy @@ -28,7 +29,7 @@ def evolve_dynamics( hamiltonian: Operator, dimensions: Mapping[int, int], schedule: Schedule, - initial_state: cudaq_runtime.State, + initial_state: InitialStateArgT, collapse_operators: Sequence[Operator] = [], observables: Sequence[Operator] = [], store_intermediate_results=False, @@ -43,8 +44,21 @@ def evolve_dynamics( schedule.reset() hilbert_space_dims = tuple(dimensions[d] for d in range(len(dimensions))) - with ScopeTimer("evolve.as_cudm_state") as timer: - initial_state = as_cudm_state(initial_state) + # Check that the integrator can support distributed state if this is a distributed simulation. + if cudaq_runtime.mpi.is_initialized() and cudaq_runtime.mpi.num_ranks( + ) > 1 and integrator is not None and not integrator.support_distributed_state( + ): + raise ValueError( + f"Integrator {type(integrator).__name__} does not support distributed state." + ) + + if isinstance(initial_state, InitialState): + has_collapse_operators = len(collapse_operators) > 0 + initial_state = CuDensityMatState.create_initial_state( + initial_state, hilbert_space_dims, has_collapse_operators) + else: + with ScopeTimer("evolve.as_cudm_state") as timer: + initial_state = as_cudm_state(initial_state) if not isinstance(initial_state, CuDensityMatState): raise ValueError("Unknown type") diff --git a/python/cudaq/operator/cudm_state.py b/python/cudaq/operator/cudm_state.py index f1d8fd25b4..1cb5e47113 100644 --- a/python/cudaq/operator/cudm_state.py +++ b/python/cudaq/operator/cudm_state.py @@ -11,6 +11,13 @@ from typing import Sequence from cupy.cuda.memory import MemoryPointer, UnownedMemory from ..mlir._mlir_libs._quakeDialects import cudaq_runtime +from cuquantum.bindings import cudensitymat as cudm +from .helpers import InitialState + + +def is_multi_processes(): + return cudaq_runtime.mpi.is_initialized() and cudaq_runtime.mpi.num_ranks( + ) > 1 # Wrap state data (on device memory) as a `cupy` array. @@ -35,7 +42,17 @@ class CuDensityMatState(object): def __init__(self, data): if self.__ctx is None: - self.__ctx = WorkStream() + if (is_multi_processes()): + self.__ctx = WorkStream(device_id=cupy.cuda.runtime.getDevice()) + # FIXME: use the below once `cudensitymat` supports raw MPI Comm pointer. + # `ctx.set_communicator(comm=cudaq_runtime.mpi.comm_dup(), provider="MPI")` + # At the moment, only `mpi4py` communicator objects are supported, thus we use the underlying `reset_distributed_configuration` API. + _comm_ptr, _size = cudaq_runtime.mpi.comm_dup() + cudm.reset_distributed_configuration( + self.__ctx._handle._validated_ptr, + cudm.DistributedProvider.MPI, _comm_ptr, _size) + else: + self.__ctx = WorkStream() self.hilbert_space_dims = None if isinstance(data, DenseMixedState) or isinstance( @@ -46,28 +63,40 @@ def __init__(self, data): self.raw_data = data self.state = None + def try_init_state(self, shape): + """Try initialize state according to a shape, e.g., density matrix or state vector.""" + slice_shape, slice_offsets = self.state.local_info + state_size = numpy.prod(shape) + if state_size == self.raw_data.size: + slice_obj = tuple( + slice(offset, offset + size) for offset, size in zip( + slice_offsets, slice_shape))[:len(shape)] + self.raw_data = cupy.asfortranarray(self.raw_data.reshape(shape)) + self.raw_data = cupy.asfortranarray(self.raw_data[slice_obj].copy()) + self.state.attach_storage(self.raw_data) + else: + self.raw_data = cupy.asfortranarray( + self.raw_data.reshape(slice_shape)) + self.state.attach_storage(self.raw_data) + def init_state(self, hilbert_space_dims: Sequence[int]): if self.state is None: self.hilbert_space_dims = hilbert_space_dims dm_shape = hilbert_space_dims * 2 sv_shape = hilbert_space_dims try: - self.raw_data = cupy.asfortranarray( - self.raw_data.reshape(dm_shape)) self.state = DenseMixedState(self.__ctx, self.hilbert_space_dims, batch_size=1, dtype="complex128") - self.state.attach_storage(self.raw_data) + self.try_init_state(dm_shape) except: try: - self.raw_data = cupy.asfortranarray( - self.raw_data.reshape(sv_shape)) self.state = DensePureState(self.__ctx, self.hilbert_space_dims, batch_size=1, dtype="complex128") - self.state.attach_storage(self.raw_data) + self.try_init_state(sv_shape) except: raise ValueError( f"Invalid state data: state data must be either a state vector (equivalent to {sv_shape} shape) or a density matrix (equivalent to {dm_shape} shape)." @@ -83,6 +112,75 @@ def is_density_matrix(self) -> bool: def from_data(data): return CuDensityMatState(data) + @staticmethod + def create_initial_state(type: InitialState, + hilbert_space_dims: Sequence[int], + mix_state: bool): + new_state = CuDensityMatState(None) + new_state.hilbert_space_dims = hilbert_space_dims + + if mix_state: + new_state.state = DenseMixedState(new_state.__ctx, + new_state.hilbert_space_dims, + batch_size=1, + dtype="complex128") + else: + new_state.state = DensePureState(new_state.__ctx, + new_state.hilbert_space_dims, + batch_size=1, + dtype="complex128") + required_buffer_size = new_state.state.storage_size + slice_shape, slice_offsets = new_state.state.local_info + + if type == InitialState.ZERO: + buffer = cupy.asfortranarray( + cupy.zeros((required_buffer_size,), + dtype="complex128", + order="F")) + bitstring_is_local = False + # Follow `cudensitymat` example to set the amplitude based on `local_info` + for slice_dim, slice_offset in zip(slice_shape, slice_offsets): + bitstring_is_local = 0 in range(slice_offset, + slice_offset + slice_dim) + if not bitstring_is_local: + break + if bitstring_is_local: + buffer[0] = 1.0 + new_state.state.raw_data = cupy.asfortranarray( + buffer.reshape(slice_shape)) + new_state.state.attach_storage(new_state.state.raw_data) + elif type == InitialState.UNIFORM: + buffer = cupy.asfortranarray( + cupy.zeros((required_buffer_size,), + dtype="complex128", + order="F")) + hilberg_space_size = numpy.cumprod(hilbert_space_dims)[-1] + if mix_state: + dm_shape = hilbert_space_dims * 2 + # FIXME: currently, we use host-device data transfer, hence a host allocation is required. + # A custom GPU memory initialization can be also used so that no host allocation is needed. + host_array = (1. / (hilberg_space_size)) * numpy.identity( + hilberg_space_size, dtype="complex128") + host_array = host_array.reshape(dm_shape) + slice_obj = [] + for i in range(len(slice_offsets) - 1): + slice_obj.append( + slice(slice_offsets[i], + slice_offsets[i] + slice_shape[i])) + slice_obj = tuple(slice_obj) + sliced_host_array = numpy.ravel(host_array[slice_obj].copy()) + buffer = cupy.array(sliced_host_array) + else: + buffer[:] = 1. / numpy.sqrt(hilberg_space_size) + + new_state.state.raw_data = cupy.asfortranarray( + buffer.reshape(slice_shape)) + new_state.state.attach_storage(new_state.state.raw_data) + else: + raise ValueError("Unsupported initial state type") + + return new_state + def get_impl(self): return self.state diff --git a/python/cudaq/operator/evolution.py b/python/cudaq/operator/evolution.py index 7d1f68020b..a4a9cd0bcd 100644 --- a/python/cudaq/operator/evolution.py +++ b/python/cudaq/operator/evolution.py @@ -16,10 +16,9 @@ import warnings from .expressions import Operator, RydbergHamiltonian -from .helpers import NumericType +from .helpers import NumericType, InitialState, InitialStateArgT from .integrator import BaseIntegrator from .schedule import Schedule - from ..kernel.register_op import register_operation from ..kernel.utils import ahkPrefix from ..mlir._mlir_libs._quakeDialects import cudaq_runtime @@ -193,7 +192,7 @@ def evolve_single( hamiltonian: Operator, dimensions: Mapping[int, int], schedule: Schedule, - initial_state: cudaq_runtime.State, + initial_state: InitialStateArgT, collapse_operators: Sequence[Operator] = [], observables: Sequence[Operator] = [], store_intermediate_results=False, @@ -251,6 +250,25 @@ def evolve_single( step_parameters, dt) if shots_count is None: shots_count = -1 + if isinstance(initial_state, InitialState): + # This is an initial state enum, create concrete state. + state_size = 2**num_qubits + if initial_state == InitialState.ZERO: + state_data = numpy.zeros(state_size, dtype=numpy.complex128) + state_data[0] = 1.0 + elif initial_state == InitialState.UNIFORM: + state_data = (1. / numpy.sqrt(state_size)) * numpy.ones( + state_size, dtype=numpy.complex128) + else: + raise ValueError("Unsupported initial state type") + + sim_name = cudaq_runtime.get_target().simulator.strip() + if sim_name == "dm": + initial_state = cudaq_runtime.State.from_data( + numpy.outer(state_data, numpy.conj(state_data))) + else: + initial_state = cudaq_runtime.State.from_data(state_data) + if store_intermediate_results: evolution = _evolution_kernel( num_qubits, @@ -292,7 +310,7 @@ def evolve( hamiltonian: Operator, dimensions: Mapping[int, int] = {}, schedule: Schedule = None, - initial_state: cudaq_runtime.State | Sequence[cudaq_runtime.State] = None, + initial_state: InitialStateArgT | Sequence[InitialStateArgT] = None, collapse_operators: Sequence[Operator] = [], observables: Sequence[Operator] = [], store_intermediate_results=False, @@ -395,7 +413,7 @@ def evolve_single_async( hamiltonian: Operator, dimensions: Mapping[int, int], schedule: Schedule, - initial_state: cudaq_runtime.State, + initial_state: InitialStateArgT, collapse_operators: Sequence[Operator] = [], observables: Sequence[Operator] = [], store_intermediate_results=False, @@ -506,7 +524,7 @@ def evolve_async( hamiltonian: Operator, dimensions: Mapping[int, int] = {}, schedule: Schedule = None, - initial_state: cudaq_runtime.State | Sequence[cudaq_runtime.State] = None, + initial_state: InitialStateArgT | Sequence[InitialStateArgT] = None, collapse_operators: Sequence[Operator] = [], observables: Sequence[Operator] = [], store_intermediate_results=False, diff --git a/python/cudaq/operator/helpers.py b/python/cudaq/operator/helpers.py index be4971fe8f..d88810723d 100644 --- a/python/cudaq/operator/helpers.py +++ b/python/cudaq/operator/helpers.py @@ -9,6 +9,7 @@ import inspect, itertools, numpy, os, re, sys, typing # type: ignore from typing import Any, Callable, Iterable, Mapping, Optional, Sequence, Tuple from numpy.typing import NDArray +from enum import Enum from ..mlir._mlir_libs._quakeDialects import cudaq_runtime @@ -21,6 +22,17 @@ NumericType = typing.Union[numpy.complexfloating, complex, float, int] +class InitialState(Enum): + """ + Enum to specify the initial quantum state. + """ + ZERO = 1 + UNIFORM = 2 + + +InitialStateArgT = cudaq_runtime.State | InitialState + + class _OperatorHelpers: @staticmethod diff --git a/python/cudaq/operator/integrator.py b/python/cudaq/operator/integrator.py index d7837d80e7..010565bacc 100644 --- a/python/cudaq/operator/integrator.py +++ b/python/cudaq/operator/integrator.py @@ -75,3 +75,9 @@ def get_state(self) -> Tuple[float, TState]: Obtain the state of the integrator as a pair (t, state). """ return (self.t, self.state) + + def support_distributed_state(self): + """ + Returns true if the integrator supports distributed state else returns false. Default is set to false. + """ + return False diff --git a/python/cudaq/operator/integrators/builtin_integrators.py b/python/cudaq/operator/integrators/builtin_integrators.py index cff2a8e30c..355524541f 100644 --- a/python/cudaq/operator/integrators/builtin_integrators.py +++ b/python/cudaq/operator/integrators/builtin_integrators.py @@ -62,6 +62,9 @@ def __init__(self, super().__init__(**kwargs) self.stepper = stepper + def support_distributed_state(self): + return True + def __post_init__(self): if "nsteps" in self.integrator_options: self.n_steps = self.integrator_options["nsteps"] diff --git a/python/extension/CUDAQuantumExtension.cpp b/python/extension/CUDAQuantumExtension.cpp index 12911c3af7..56bc5954f1 100644 --- a/python/extension/CUDAQuantumExtension.cpp +++ b/python/extension/CUDAQuantumExtension.cpp @@ -169,6 +169,14 @@ PYBIND11_MODULE(_quakeDialects, m) { "Returns true if MPI has already been initialized."); mpiSubmodule.def( "finalize", []() { cudaq::mpi::finalize(); }, "Finalize MPI."); + mpiSubmodule.def( + "comm_dup", + []() { + const auto [commPtr, commSize] = cudaq::mpi::comm_dup(); + return std::make_pair(reinterpret_cast(commPtr), commSize); + }, + "Duplicates the communicator. Return the new communicator address (as an " + "integer) and its size in bytes"); auto orcaSubmodule = cudaqRuntime.def_submodule("orca"); orcaSubmodule.def( diff --git a/python/tests/operator/test_evolve_dynamics.py b/python/tests/operator/test_evolve_dynamics.py index 7ae3197abf..ebb6fed859 100644 --- a/python/tests/operator/test_evolve_dynamics.py +++ b/python/tests/operator/test_evolve_dynamics.py @@ -7,12 +7,12 @@ # ============================================================================ # import pytest import cudaq -# Note: the test model may create state, hence need to set the target to "dynamics" -cudaq.set_target("dynamics") if cudaq.num_available_gpus() == 0: pytest.skip("Skipping GPU tests", allow_module_level=True) else: + # Note: the test model may create state, hence need to set the target to "dynamics" + cudaq.set_target("dynamics") from system_models import * diff --git a/python/tests/operator/test_evolve_simulators.py b/python/tests/operator/test_evolve_simulators.py index 85e2b6b9dc..2fb2bac0e0 100644 --- a/python/tests/operator/test_evolve_simulators.py +++ b/python/tests/operator/test_evolve_simulators.py @@ -171,7 +171,8 @@ def do_something(): ] -def test_evolve(): +@pytest.mark.parametrize("init_state", ["array", "enum"]) +def test_evolve(init_state): # Set random seed for shots-based observe test. cudaq.set_random_seed(13) @@ -182,8 +183,11 @@ def test_evolve(): dimensions = {0: 2} # Initial state of the system (ground state). - rho0 = cudaq.State.from_data( - np.array([[1.0, 0.0], [0.0, 0.0]], dtype=np.complex128)) + if init_state == "array": + rho0 = cudaq.State.from_data( + np.array([[1.0, 0.0], [0.0, 0.0]], dtype=np.complex128)) + elif init_state == "enum": + rho0 = InitialState.ZERO # Schedule of time steps. steps = np.linspace(0, 10, 101) diff --git a/python/tests/parallel/CMakeLists.txt b/python/tests/parallel/CMakeLists.txt index 7b1741e5ec..651d4cddcd 100644 --- a/python/tests/parallel/CMakeLists.txt +++ b/python/tests/parallel/CMakeLists.txt @@ -26,3 +26,13 @@ set_tests_properties( "PYTHONPATH=${CMAKE_BINARY_DIR}/python" LABELS "gpu_required;mgpus_required") + +add_test( + NAME cudaq-py-parallel-dynamics + COMMAND ${MPIEXEC_EXECUTABLE} --allow-run-as-root -np 2 ${PYTHON_EXECUTABLE} test_mpi_dynamics.py + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) +set_tests_properties( + cudaq-py-parallel-dynamics PROPERTIES ENVIRONMENT + "PYTHONPATH=${CMAKE_BINARY_DIR}/python" + LABELS + "gpu_required;mgpus_required") diff --git a/python/tests/parallel/test_mpi_dynamics.py b/python/tests/parallel/test_mpi_dynamics.py new file mode 100644 index 0000000000..ba8ee367a2 --- /dev/null +++ b/python/tests/parallel/test_mpi_dynamics.py @@ -0,0 +1,71 @@ +# ============================================================================ # +# Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +import cudaq, os, pytest +from cudaq import operators, spin, Schedule, RungeKuttaIntegrator +import numpy as np + +skipIfUnsupported = pytest.mark.skipif( + not (cudaq.num_available_gpus() > 1 and cudaq.has_target('dynamics')), + reason="dynamics backend not available or not a multi-GPU machine") + + +@pytest.fixture(autouse=True) +def do_something(): + cudaq.mpi.initialize() + cudaq.set_target('dynamics') + yield + cudaq.reset_target() + cudaq.mpi.finalize() + + +@skipIfUnsupported +def testMpiRun(): + # Large number of spins + N = 20 + dimensions = {} + for i in range(N): + dimensions[i] = 2 + + # Observable is the average magnetization operator + avg_magnetization_op = operators.zero() + for i in range(N): + avg_magnetization_op += (spin.z(i) / N) + + # Arbitrary coupling constant + g = 1.0 + # Construct the Hamiltonian + H = operators.zero() + for i in range(N): + H += 2 * np.pi * spin.x(i) + H += 2 * np.pi * spin.y(i) + for i in range(N - 1): + H += 2 * np.pi * g * spin.x(i) * spin.x(i + 1) + H += 2 * np.pi * g * spin.y(i) * spin.z(i + 1) + + steps = np.linspace(0.0, 1, 100) + schedule = Schedule(steps, ["time"]) + + # Initial state (expressed as an enum) + psi0 = cudaq.operator.InitialState.ZERO + + # Run the simulation + evolution_result = cudaq.evolve(H, + dimensions, + schedule, + psi0, + observables=[avg_magnetization_op], + collapse_operators=[], + store_intermediate_results=False, + integrator=RungeKuttaIntegrator()) + + +# leave for gdb debugging +if __name__ == "__main__": + loc = os.path.abspath(__file__) + pytest.main([loc, "-s"]) diff --git a/runtime/cudaq.h b/runtime/cudaq.h index adf27cf38a..fb5f6b9182 100644 --- a/runtime/cudaq.h +++ b/runtime/cudaq.h @@ -307,6 +307,10 @@ void broadcast(std::vector &data, int rootRank); /// @brief Broadcast a string from a process (rootRank) to all other processes. void broadcast(std::string &data, int rootRank); +/// @brief Duplicate the communicator. Returns the new communicator (as a void*) +/// and its size. +std::pair comm_dup(); + /// @brief Finalize MPI. This function /// is a no-op if there CUDA-Q has not been built /// against MPI. diff --git a/runtime/cudaq/cudaq.cpp b/runtime/cudaq/cudaq.cpp index 5e8617ff38..071f658f43 100644 --- a/runtime/cudaq/cudaq.cpp +++ b/runtime/cudaq/cudaq.cpp @@ -172,6 +172,16 @@ void broadcast(std::string &data, int rootRank) { commPlugin->broadcast(data, rootRank); } +std::pair comm_dup() { + auto *commPlugin = getMpiPlugin(); + cudaqDistributedCommunicator_t *dupComm = nullptr; + cudaqDistributedCommunicator_t *comm = commPlugin->getComm(); + const auto dupStatus = commPlugin->get()->CommDup(comm, &dupComm); + if (dupStatus != 0 || dupComm == nullptr) + throw std::runtime_error("Failed to duplicate the MPI communicator."); + return std::make_pair(dupComm->commPtr, dupComm->commSize); +} + void finalize() { if (rank() == 0) cudaq::info("Finalizing MPI."); diff --git a/runtime/cudaq/distributed/builtin/mpi_comm_impl.cpp b/runtime/cudaq/distributed/builtin/mpi_comm_impl.cpp index 04ef3c066a..fb027d261c 100644 --- a/runtime/cudaq/distributed/builtin/mpi_comm_impl.cpp +++ b/runtime/cudaq/distributed/builtin/mpi_comm_impl.cpp @@ -230,7 +230,15 @@ static int mpi_AllgatherV(const cudaqDistributedCommunicator_t *comm, /// @brief Wrapper of MPI_Isend and track pending requests for synchronization static int mpi_SendAsync(const cudaqDistributedCommunicator_t *comm, const void *buf, int count, DataType dataType, - int peer, int32_t tag) { + int peer, int32_t tag, void *mpiRequest) { + // If a specific request was provided, just make MPI_Irecv call + if (mpiRequest) + return MPI_Isend(buf, count, convertType(dataType), peer, tag, + unpackMpiCommunicator(comm), (MPI_Request *)mpiRequest); + + // Otherwise, use the implicit request tracking mechanism (1 send and 1 + // receive async call) + std::scoped_lock lock(PendingRequest::g_mutex); if (PendingRequest::g_requests[comm].nActiveRequests == 2) return -1; @@ -249,7 +257,15 @@ static int mpi_SendAsync(const cudaqDistributedCommunicator_t *comm, /// @brief Wrapper of MPI_Irecv and track pending requests for synchronization static int mpi_RecvAsync(const cudaqDistributedCommunicator_t *comm, void *buf, - int count, DataType dataType, int peer, int32_t tag) { + int count, DataType dataType, int peer, int32_t tag, + void *mpiRequest) { + // If a specific request was provided, just make MPI_Irecv call + if (mpiRequest) + return MPI_Irecv(buf, count, convertType(dataType), peer, tag, + unpackMpiCommunicator(comm), (MPI_Request *)mpiRequest); + + // Otherwise, use the implicit request tracking mechanism (1 send and 1 + // receive async call) std::scoped_lock lock(PendingRequest::g_mutex); if (PendingRequest::g_requests[comm].nActiveRequests == 2) return -1; @@ -339,6 +355,50 @@ static int mpi_CommSplit(const cudaqDistributedCommunicator_t *comm, return status; } +/// @brief Wrapper MPI_Request malloc +static int mpi_CreateRequest(void **request) { + *request = malloc(sizeof(MPI_Request)); + if (*request == NULL) + return -1; + return 0; +} + +/// @brief Wrapper MPI_Request free +static int mpi_DestroyRequest(void *request) { + if (request == NULL) + return -1; + free(request); + return 0; +} + +/// @brief Wrapper MPI_Wait +static int mpi_WaitRequest(void *request) { + MPI_Status waitStatus; + return MPI_Wait((MPI_Request *)request, &waitStatus); +} + +/// @brief Wrapper MPI_Test +static int mpi_TestRequest(void *request, int32_t *completed) { + MPI_Status testStatus; + return MPI_Test((MPI_Request *)request, completed, &testStatus); +} + +/// @brief Wrapper MPI_Send +static int mpi_Send(const cudaqDistributedCommunicator_t *comm, + const void *buffer, int count, DataType datatype, + int destination, int32_t tag) { + return MPI_Send(buffer, count, convertType(datatype), destination, tag, + unpackMpiCommunicator(comm)); +} + +/// @brief Wrapper MPI_Recv +static int mpi_Recv(const cudaqDistributedCommunicator_t *comm, void *buffer, + int count, DataType datatype, int source, int32_t tag) { + MPI_Status recvStatus; + return MPI_Recv(buffer, count, convertType(datatype), source, tag, + unpackMpiCommunicator(comm), &recvStatus); +} + /// @brief Return the underlying MPI_Comm as a type-erased object cudaqDistributedCommunicator_t *getMpiCommunicator() { static MPI_Comm pluginComm = MPI_COMM_WORLD; @@ -370,7 +430,14 @@ cudaqDistributedInterface_t *getDistributedInterface() { mpi_Synchronize, mpi_Abort, mpi_CommDup, - mpi_CommSplit}; + mpi_CommSplit, + mpi_CreateRequest, + mpi_DestroyRequest, + mpi_WaitRequest, + mpi_TestRequest, + mpi_Send, + mpi_Recv, + }; return &cudaqDistributedInterface; } } diff --git a/runtime/cudaq/distributed/distributed_capi.h b/runtime/cudaq/distributed/distributed_capi.h index 3541b1fcc8..e52fb39eee 100644 --- a/runtime/cudaq/distributed/distributed_capi.h +++ b/runtime/cudaq/distributed/distributed_capi.h @@ -94,14 +94,14 @@ typedef struct { /// `Synchronize` should be called as appropriate to resolve in-flight /// requests before making new ones. int (*SendAsync)(const cudaqDistributedCommunicator_t *, const void *, int, - DataType, int, int32_t); + DataType, int, int32_t, void *); /// @brief MPI_Irecv /// @note The MPI plugin API allows for a maximum of two concurrent /// non-blocking requests (e.g., one `Isend` and one `Irecv`). Hence, /// `Synchronize` should be called as appropriate to resolve in-flight /// requests before making new ones. int (*RecvAsync)(const cudaqDistributedCommunicator_t *, void *, int, - DataType, int, int32_t); + DataType, int, int32_t, void *); /// @brief MPI_Isend and MPI_Irecv in one call /// @note The MPI plugin API allows for a maximum of two concurrent /// non-blocking requests (e.g., one `Isend` and one `Irecv`). Since this @@ -119,5 +119,19 @@ typedef struct { /// @brief MPI_Comm_split int (*CommSplit)(const cudaqDistributedCommunicator_t *, int32_t, int32_t, cudaqDistributedCommunicator_t **); + /// @brief Create an MPI_Request + int (*CreateRequest)(void **); + /// @brief Free a previously-allocated MPI_Request + int (*DestroyRequest)(void *); + /// @brief Wait for the completion of a MPI_Request + int (*WaitRequest)(void *); + /// @brief Test for the completion of a MPI_Request + int (*TestRequest)(void *, int32_t *); + /// @brief MPI_Send + int (*Send)(const cudaqDistributedCommunicator_t *, const void *, int, + DataType, int, int32_t); + /// @brief MPI_Recv + int (*Recv)(const cudaqDistributedCommunicator_t *, void *, int, DataType, + int, int32_t); } cudaqDistributedInterface_t; -} \ No newline at end of file +} diff --git a/runtime/nvqir/cudensitymat/CMakeLists.txt b/runtime/nvqir/cudensitymat/CMakeLists.txt index 426bd819ec..38a69559aa 100644 --- a/runtime/nvqir/cudensitymat/CMakeLists.txt +++ b/runtime/nvqir/cudensitymat/CMakeLists.txt @@ -11,13 +11,27 @@ find_package(CUDAToolkit REQUIRED) set(INTERFACE_POSITION_INDEPENDENT_CODE ON) set(LIBRARY_NAME nvqir-dynamics) -add_library(${LIBRARY_NAME} SHARED CuDensityMatSim.cpp) + +# Find cudensitymat header (for MPI plugin interface typedef) +find_file(CUDENSITYMAT_INC + NAMES cudensitymat.h + HINTS + $ENV{CUQUANTUM_INSTALL_PREFIX}/include + /usr/include + ENV CPATH +) + +message(STATUS "cudensitymat header: ${CUDENSITYMAT_INC}") +get_filename_component(CUDENSITYMAT_INCLUDE_DIR ${CUDENSITYMAT_INC} DIRECTORY) + +add_library(${LIBRARY_NAME} SHARED CuDensityMatSim.cpp mpi_support.cpp) target_include_directories(${LIBRARY_NAME} PRIVATE . .. ${CUDAToolkit_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}/runtime/common + ${CUDENSITYMAT_INCLUDE_DIR} ) target_link_libraries(${LIBRARY_NAME} diff --git a/runtime/nvqir/cudensitymat/CuDensityMatSim.cpp b/runtime/nvqir/cudensitymat/CuDensityMatSim.cpp index ac0ad95f0f..ba56d6fdbf 100644 --- a/runtime/nvqir/cudensitymat/CuDensityMatSim.cpp +++ b/runtime/nvqir/cudensitymat/CuDensityMatSim.cpp @@ -8,7 +8,54 @@ #include "CircuitSimulator.h" #include "CuDensityMatState.h" +#include "cudaq.h" +#include "cudaq/distributed/mpi_plugin.h" + namespace { +// Hook to query this shared lib file location at runtime. +extern "C" { +void cuDensityMatSimCppFileMarker() { return; } +} +/// @brief Query the full path to the this lib. +static const char *getThisSharedLibFilePath() { + static thread_local std::string LIB_PATH; + if (LIB_PATH.empty()) { + // Use dladdr query this .so file + void *funcPtrToFind = (void *)(intptr_t)cuDensityMatSimCppFileMarker; + Dl_info DLInfo; + int err = dladdr(funcPtrToFind, &DLInfo); + if (err != 0) { + char link_path[PATH_MAX]; + // If the filename is a symlink, we need to resolve and return the + // location of the actual .so file. + if (realpath(DLInfo.dli_fname, link_path)) + LIB_PATH = link_path; + } + } + + return LIB_PATH.c_str(); +} + +/// @brief Retrieve the path to the plugin implementation +std::string getMpiPluginFilePath() { + auto mpiPlugin = cudaq::mpi::getMpiPlugin(); + if (!mpiPlugin) + throw std::runtime_error("Failed to retrieve MPI plugin"); + + return mpiPlugin->getPluginPath(); +} + +void initCuDensityMatCommLib() { + // If CUDENSITYMAT_COMM_LIB environment variable is not set, + // use this builtin plugin shim (redirect MPI calls to CUDA-Q plugin) + if (std::getenv("CUDENSITYMAT_COMM_LIB") == nullptr) { + cudaq::info("Enabling cuDensityMat MPI without environment variable " + "CUDENSITYMAT_COMM_LIB. \nUse the builtin cuTensorNet " + "communicator lib from '{}' - CUDA-Q MPI plugin {}.", + getThisSharedLibFilePath(), getMpiPluginFilePath()); + setenv("CUDENSITYMAT_COMM_LIB", getThisSharedLibFilePath(), 0); + } +} class CuDensityMatSim : public nvqir::CircuitSimulatorBase { protected: @@ -30,7 +77,15 @@ class CuDensityMatSim : public nvqir::CircuitSimulatorBase { public: /// @brief The constructor - CuDensityMatSim() {} + CuDensityMatSim() { + int numDevices{0}; + HANDLE_CUDA_ERROR(cudaGetDeviceCount(&numDevices)); + const int deviceId = + cudaq::mpi::is_initialized() ? cudaq::mpi::rank() % numDevices : 0; + if (cudaq::mpi::is_initialized()) + initCuDensityMatCommLib(); + HANDLE_CUDA_ERROR(cudaSetDevice(deviceId)); + } /// The destructor virtual ~CuDensityMatSim() = default; diff --git a/runtime/nvqir/cudensitymat/mpi_support.cpp b/runtime/nvqir/cudensitymat/mpi_support.cpp new file mode 100644 index 0000000000..5d8605e33d --- /dev/null +++ b/runtime/nvqir/cudensitymat/mpi_support.cpp @@ -0,0 +1,246 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ +#include "common/Logger.h" +#include "cudaq/distributed/mpi_plugin.h" +#include +#include +#include +#include +#include + +/// @brief Retrieve the MPI plugin comm interface +static cudaqDistributedInterface_t *getMpiPluginInterface() { + auto mpiPlugin = cudaq::mpi::getMpiPlugin(); + if (!mpiPlugin) + throw std::runtime_error("Failed to retrieve MPI plugin"); + cudaqDistributedInterface_t *mpiInterface = mpiPlugin->get(); + if (!mpiInterface) + throw std::runtime_error("Invalid MPI distributed plugin encountered"); + return mpiInterface; +} + +// Implementing cudensitymat's COMM interface by delegating wrapped MPI calls to +// the underlying CUDA-Q MPI plugin. This will make this library +// compatible with CUDENSITYMAT_COMM_LIB API. Converts CUDA data type to the +// corresponding CUDA-Q shim type enum + +/// Convert cudensitymat CUDA datatype enum +static DataType convertCudaToMpiDataType(const cudaDataType_t cudaDataType) { + switch (cudaDataType) { + case CUDA_R_8I: + return INT_8; + case CUDA_R_16I: + return INT_16; + case CUDA_R_32I: + return INT_32; + case CUDA_R_64I: + return INT_64; + case CUDA_R_32F: + return FLOAT_32; + case CUDA_R_64F: + return FLOAT_64; + case CUDA_C_32F: + return FLOAT_COMPLEX; + case CUDA_C_64F: + return DOUBLE_COMPLEX; + default: + throw std::runtime_error("Unsupported data type encountered in " + "cudensitymat communicator plugin"); + } + __builtin_unreachable(); +} + +/// Convert the type-erased Comm object +static cudaqDistributedCommunicator_t +convertMpiCommunicator(const cudensitymatDistributedCommunicator_t *cutnComm) { + cudaqDistributedCommunicator_t comm{cutnComm->commPtr, cutnComm->commSize}; + return comm; +} + +#ifdef __cplusplus +extern "C" { +#endif + +// Implementation of cudensitymat distributed interface by delegating to CUDA-Q +// MPI plugin API. +int cudensitymatMpiCommSize(const cudensitymatDistributedCommunicator_t *comm, + int32_t *numRanks) { + ScopedTraceWithContext(__FUNCTION__); + auto cudaqComm = convertMpiCommunicator(comm); + return getMpiPluginInterface()->getNumRanks(&cudaqComm, numRanks); +} + +int cudensitymatMpiCommSizeShared( + const cudensitymatDistributedCommunicator_t *comm, int32_t *numRanks) { + ScopedTraceWithContext(__FUNCTION__); + auto cudaqComm = convertMpiCommunicator(comm); + return getMpiPluginInterface()->getCommSizeShared(&cudaqComm, numRanks); +} + +int cudensitymatMpiCommRank(const cudensitymatDistributedCommunicator_t *comm, + int32_t *procRank) { + ScopedTraceWithContext(__FUNCTION__); + auto cudaqComm = convertMpiCommunicator(comm); + return getMpiPluginInterface()->getProcRank(&cudaqComm, procRank); +} + +int cudensitymatMpiBarrier(const cudensitymatDistributedCommunicator_t *comm) { + ScopedTraceWithContext(__FUNCTION__); + auto cudaqComm = convertMpiCommunicator(comm); + return getMpiPluginInterface()->Barrier(&cudaqComm); +} + +int cudensitymatMpiCreateRequest(cudensitymatDistributedRequest_t *request) { + ScopedTraceWithContext(__FUNCTION__); + return getMpiPluginInterface()->CreateRequest(request); +} + +int cudensitymatMpiDestroyRequest(cudensitymatDistributedRequest_t request) { + ScopedTraceWithContext(__FUNCTION__); + return getMpiPluginInterface()->DestroyRequest(request); +} + +int cudensitymatMpiWaitRequest(cudensitymatDistributedRequest_t request) { + ScopedTraceWithContext(__FUNCTION__); + return getMpiPluginInterface()->WaitRequest(request); +} + +int cudensitymatMpiTestRequest(cudensitymatDistributedRequest_t request, + int32_t *completed) { + ScopedTraceWithContext(__FUNCTION__); + return getMpiPluginInterface()->TestRequest(request, completed); +} + +int cudensitymatMpiSend(const cudensitymatDistributedCommunicator_t *comm, + const void *buffer, int32_t count, + cudaDataType_t datatype, int32_t destination, + int32_t tag) { + ScopedTraceWithContext(__FUNCTION__); + auto cudaqComm = convertMpiCommunicator(comm); + return getMpiPluginInterface()->Send(&cudaqComm, buffer, count, + convertCudaToMpiDataType(datatype), + destination, tag); +} + +int cudensitymatMpiSendAsync(const cudensitymatDistributedCommunicator_t *comm, + const void *buffer, int32_t count, + cudaDataType_t datatype, int32_t destination, + int32_t tag, + cudensitymatDistributedRequest_t request) { + ScopedTraceWithContext(__FUNCTION__); + auto cudaqComm = convertMpiCommunicator(comm); + return getMpiPluginInterface()->SendAsync(&cudaqComm, buffer, count, + convertCudaToMpiDataType(datatype), + destination, tag, request); +} + +int cudensitymatMpiRecv(const cudensitymatDistributedCommunicator_t *comm, + void *buffer, int32_t count, cudaDataType_t datatype, + int32_t source, int32_t tag) { + ScopedTraceWithContext(__FUNCTION__); + auto cudaqComm = convertMpiCommunicator(comm); + return getMpiPluginInterface()->Recv(&cudaqComm, buffer, count, + convertCudaToMpiDataType(datatype), + source, tag); +} + +int cudensitymatMpiRecvAsync(const cudensitymatDistributedCommunicator_t *comm, + void *buffer, int32_t count, + cudaDataType_t datatype, int32_t source, + int32_t tag, + cudensitymatDistributedRequest_t request) { + ScopedTraceWithContext(__FUNCTION__); + auto cudaqComm = convertMpiCommunicator(comm); + return getMpiPluginInterface()->RecvAsync(&cudaqComm, buffer, count, + convertCudaToMpiDataType(datatype), + source, tag, request); +} + +int cudensitymatMpiBcast(const cudensitymatDistributedCommunicator_t *comm, + void *buffer, int32_t count, cudaDataType_t datatype, + int32_t root) { + ScopedTraceWithContext(__FUNCTION__); + auto cudaqComm = convertMpiCommunicator(comm); + return getMpiPluginInterface()->Bcast( + &cudaqComm, buffer, count, convertCudaToMpiDataType(datatype), root); +} + +int cudensitymatMpiAllreduce(const cudensitymatDistributedCommunicator_t *comm, + const void *bufferIn, void *bufferOut, + int32_t count, cudaDataType_t datatype) { + ScopedTraceWithContext(__FUNCTION__); + // cudensitymat expects MPI_SUM in this API + auto cudaqComm = convertMpiCommunicator(comm); + return getMpiPluginInterface()->Allreduce( + &cudaqComm, bufferIn, bufferOut, count, + convertCudaToMpiDataType(datatype), SUM); +} + +int cudensitymatMpiAllreduceInPlace( + const cudensitymatDistributedCommunicator_t *comm, void *buffer, + int32_t count, cudaDataType_t datatype) { + ScopedTraceWithContext(__FUNCTION__); + auto cudaqComm = convertMpiCommunicator(comm); + // cudensitymat expects MPI_SUM in this API + return getMpiPluginInterface()->AllreduceInPlace( + &cudaqComm, buffer, count, convertCudaToMpiDataType(datatype), SUM); +} + +int cudensitymatMpiAllreduceInPlaceMin( + const cudensitymatDistributedCommunicator_t *comm, void *buffer, + int32_t count, cudaDataType_t datatype) { + ScopedTraceWithContext(__FUNCTION__); + auto cudaqComm = convertMpiCommunicator(comm); + // cudensitymat expects MPI_SUM in this API + return getMpiPluginInterface()->AllreduceInPlace( + &cudaqComm, buffer, count, convertCudaToMpiDataType(datatype), MIN); +} + +int cudensitymatMpiAllreduceDoubleIntMinloc( + const cudensitymatDistributedCommunicator_t *comm, const void *bufferIn, + void *bufferOut) { + ScopedTraceWithContext(__FUNCTION__); + auto cudaqComm = convertMpiCommunicator(comm); + return getMpiPluginInterface()->Allreduce(&cudaqComm, bufferIn, bufferOut, 1, + FLOAT_64, MIN_LOC); +} + +int cudensitymatMpiAllgather(const cudensitymatDistributedCommunicator_t *comm, + const void *bufferIn, void *bufferOut, + int32_t count, cudaDataType_t datatype) { + ScopedTraceWithContext(__FUNCTION__); + auto cudaqComm = convertMpiCommunicator(comm); + return getMpiPluginInterface()->Allgather(&cudaqComm, bufferIn, bufferOut, + count, + convertCudaToMpiDataType(datatype)); +} + +cudensitymatDistributedInterface_t cudensitymatCommInterface = { + CUDENSITYMAT_DISTRIBUTED_INTERFACE_VERSION, + cudensitymatMpiCommSize, + cudensitymatMpiCommSizeShared, + cudensitymatMpiCommRank, + cudensitymatMpiBarrier, + cudensitymatMpiCreateRequest, + cudensitymatMpiDestroyRequest, + cudensitymatMpiWaitRequest, + cudensitymatMpiTestRequest, + cudensitymatMpiSend, + cudensitymatMpiSendAsync, + cudensitymatMpiRecv, + cudensitymatMpiRecvAsync, + cudensitymatMpiBcast, + cudensitymatMpiAllreduce, + cudensitymatMpiAllreduceInPlace, + cudensitymatMpiAllreduceInPlaceMin, + cudensitymatMpiAllreduceDoubleIntMinloc, + cudensitymatMpiAllgather}; + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/unittests/mpi/mpi_tester.cpp b/unittests/mpi/mpi_tester.cpp index 826ff59d4d..e097df62c0 100644 --- a/unittests/mpi/mpi_tester.cpp +++ b/unittests/mpi/mpi_tester.cpp @@ -166,10 +166,10 @@ TEST(MPITester, checkSendAndRecv) { cudaqDistributedCommunicator_t *comm = mpiPlugin->getComm(); EXPECT_TRUE(comm != nullptr); EXPECT_EQ(mpiInterface->RecvAsync(comm, recvBuffer.data(), nElems, FLOAT_64, - recvRank, 0), + recvRank, 0, nullptr), 0); EXPECT_EQ(mpiInterface->SendAsync(comm, sendBuffer.data(), nElems, FLOAT_64, - sendRank, 0), + sendRank, 0, nullptr), 0); EXPECT_EQ(mpiInterface->Synchronize(comm), 0); for (std::size_t i = 0; i < refBuffer.size(); ++i) {