diff --git a/docs/sphinx/api/languages/python_api.rst b/docs/sphinx/api/languages/python_api.rst
index d84ddc427c..3c26a73e6d 100644
--- a/docs/sphinx/api/languages/python_api.rst
+++ b/docs/sphinx/api/languages/python_api.rst
@@ -157,6 +157,8 @@ Data Types
 .. autoclass:: cudaq.operator.cudm_state.CuDensityMatState
     :members:
 
+.. autoclass:: cudaq.operator.helpers.InitialState
+
 .. autofunction:: cudaq.operator.cudm_state.to_cupy_array
 
 .. autoclass:: cudaq::SampleResult
diff --git a/docs/sphinx/examples/python/dynamics/mgmn/initial_state.py b/docs/sphinx/examples/python/dynamics/mgmn/initial_state.py
new file mode 100644
index 0000000000..1bd06a8f94
--- /dev/null
+++ b/docs/sphinx/examples/python/dynamics/mgmn/initial_state.py
@@ -0,0 +1,71 @@
+import cudaq
+from cudaq import operators, spin, Schedule, RungeKuttaIntegrator
+
+import numpy as np
+import matplotlib.pyplot as plt
+import os
+
+# On a system with multiple GPUs, `mpiexec` can be used as follows:
+# `mpiexec -np <N> python3 multi_gpu.py `
+cudaq.mpi.initialize()
+
+# Set the target to our dynamics simulator
+cudaq.set_target("dynamics")
+
+# Large number of spins
+N = 20
+dimensions = {}
+for i in range(N):
+    dimensions[i] = 2
+
+# Observable is the average magnetization operator
+avg_magnetization_op = operators.zero()
+for i in range(N):
+    avg_magnetization_op += (spin.z(i) / N)
+
+# Arbitrary coupling constant
+g = 1.0
+# Construct the Hamiltonian
+H = operators.zero()
+for i in range(N):
+    H += 2 * np.pi * spin.x(i)
+    H += 2 * np.pi * spin.y(i)
+for i in range(N - 1):
+    H += 2 * np.pi * g * spin.x(i) * spin.x(i + 1)
+    H += 2 * np.pi * g * spin.y(i) * spin.z(i + 1)
+
+steps = np.linspace(0.0, 1, 200)
+schedule = Schedule(steps, ["time"])
+
+# Initial state (expressed as an enum)
+psi0 = cudaq.operator.InitialState.ZERO
+# This can also be used to initialize a uniformly-distributed wave-function instead.
+# `psi0 = cudaq.operator.InitialState.UNIFORM`
+
+# Run the simulation
+evolution_result = cudaq.evolve(H,
+                                dimensions,
+                                schedule,
+                                psi0,
+                                observables=[avg_magnetization_op],
+                                collapse_operators=[],
+                                store_intermediate_results=True,
+                                integrator=RungeKuttaIntegrator())
+
+exp_val = [
+    exp_vals[0].expectation()
+    for exp_vals in evolution_result.expectation_values()
+]
+
+if cudaq.mpi.rank() == 0:
+    # Plot the results
+    fig = plt.figure(figsize=(12, 6))
+    plt.plot(steps, exp_val)
+    plt.ylabel("Average Magnetization")
+    plt.xlabel("Time")
+    abspath = os.path.abspath(__file__)
+    dname = os.path.dirname(abspath)
+    os.chdir(dname)
+    fig.savefig("spin_model.png", dpi=fig.dpi)
+
+cudaq.mpi.finalize()
diff --git a/docs/sphinx/examples/python/dynamics/mgmn/multi_gpu.py b/docs/sphinx/examples/python/dynamics/mgmn/multi_gpu.py
new file mode 100644
index 0000000000..06f9ef7645
--- /dev/null
+++ b/docs/sphinx/examples/python/dynamics/mgmn/multi_gpu.py
@@ -0,0 +1,94 @@
+import cudaq
+from cudaq import operators, spin, Schedule, RungeKuttaIntegrator
+
+import numpy as np
+import cupy as cp
+import matplotlib.pyplot as plt
+import os
+
+# On a system with multiple GPUs, `mpiexec` can be used as follows:
+# `mpiexec -np <N> python3 multi_gpu.py `
+cudaq.mpi.initialize()
+
+# Set the target to our dynamics simulator
+cudaq.set_target("dynamics")
+
+# In this example, we solve the Quantum Heisenberg model (https://en.wikipedia.org/wiki/Quantum_Heisenberg_model),
+# which exhibits the so-called quantum quench effect.
+# e.g., see `Quantum quenches in the anisotropic spin-1/2 Heisenberg chain: different approaches to many-body dynamics far from equilibrium`
+# (New J. Phys. 12 055017)
+# Large number of spins
+N = 21
+dimensions = {}
+for i in range(N):
+    dimensions[i] = 2
+
+# Initial state: alternating spin up and down
+spin_state = ''
+for i in range(N):
+    spin_state += str(int(i % 2))
+
+# Observable is the staggered magnetization operator
+staggered_magnetization_op = operators.zero()
+for i in range(N):
+    if i % 2 == 0:
+        staggered_magnetization_op += spin.z(i)
+    else:
+        staggered_magnetization_op -= spin.z(i)
+
+staggered_magnetization_op /= N
+
+observe_results = []
+for g in [0.25, 4.0]:
+    # Heisenberg model spin coupling strength
+    Jx = 1.0
+    Jy = 1.0
+    Jz = g
+
+    # Construct the Hamiltonian
+    H = operators.zero()
+
+    for i in range(N - 1):
+        H += Jx * spin.x(i) * spin.x(i + 1)
+        H += Jy * spin.y(i) * spin.y(i + 1)
+        H += Jz * spin.z(i) * spin.z(i + 1)
+
+    steps = np.linspace(0.0, 1, 100)
+    schedule = Schedule(steps, ["time"])
+
+    # Prepare the initial state vector
+    psi0_ = cp.zeros(2**N, dtype=cp.complex128)
+    psi0_[int(spin_state, 2)] = 1.0
+    psi0 = cudaq.State.from_data(psi0_)
+
+    # Run the simulation
+    evolution_result = cudaq.evolve(H,
+                                    dimensions,
+                                    schedule,
+                                    psi0,
+                                    observables=[staggered_magnetization_op],
+                                    collapse_operators=[],
+                                    store_intermediate_results=True,
+                                    integrator=RungeKuttaIntegrator())
+
+    exp_val = [
+        exp_vals[0].expectation()
+        for exp_vals in evolution_result.expectation_values()
+    ]
+
+    observe_results.append((g, exp_val))
+
+if cudaq.mpi.rank() == 0:
+    # Plot the results
+    fig = plt.figure(figsize=(12, 6))
+    for g, exp_val in observe_results:
+        plt.plot(steps, exp_val, label=f'$ g = {g}$')
+    plt.legend(fontsize=16)
+    plt.ylabel("Staggered Magnetization")
+    plt.xlabel("Time")
+    abspath = os.path.abspath(__file__)
+    dname = os.path.dirname(abspath)
+    os.chdir(dname)
+    fig.savefig("heisenberg_model.png", dpi=fig.dpi)
+
+cudaq.mpi.finalize()
diff --git a/docs/sphinx/snippets/python/using/backends/dynamics.py b/docs/sphinx/snippets/python/using/backends/dynamics.py
index aa94c745f9..6e71395dff 100644
--- a/docs/sphinx/snippets/python/using/backends/dynamics.py
+++ b/docs/sphinx/snippets/python/using/backends/dynamics.py
@@ -172,3 +172,44 @@ def compute_value(param_name, step_idx):
 time_dependence = parameter_values(numpy.linspace(0, 1, 100))
 cudaq.evolve(system_operator, system_dimensions, time_dependence, initial_state)
 #[End Schedule2]
+
+import cudaq
+from cudaq import operators, spin, Schedule, RungeKuttaIntegrator
+
+N = 4
+dimensions = {}
+for i in range(N):
+    dimensions[i] = 2
+g = 1.0
+H = operators.zero()
+for i in range(N):
+    H += 2 * np.pi * spin.x(i)
+    H += 2 * np.pi * spin.y(i)
+for i in range(N - 1):
+    H += 2 * np.pi * g * spin.x(i) * spin.x(i + 1)
+    H += 2 * np.pi * g * spin.y(i) * spin.z(i + 1)
+
+steps = np.linspace(0.0, 1, 200)
+schedule = Schedule(steps, ["time"])
+
+#[Begin MPI]
+cudaq.mpi.initialize()
+
+# Set the target to our dynamics simulator
+cudaq.set_target("dynamics")
+
+# Initial state (expressed as an enum)
+psi0 = cudaq.operator.InitialState.ZERO
+
+# Run the simulation
+evolution_result = cudaq.evolve(H,
+                                dimensions,
+                                schedule,
+                                psi0,
+                                observables=[],
+                                collapse_operators=[],
+                                store_intermediate_results=True,
+                                integrator=RungeKuttaIntegrator())
+
+cudaq.mpi.finalize()
+#[End MPI]
diff --git a/docs/sphinx/using/backends/dynamics.rst b/docs/sphinx/using/backends/dynamics.rst
index a5b8c056b8..6e6e48e567 100644
--- a/docs/sphinx/using/backends/dynamics.rst
+++ b/docs/sphinx/using/backends/dynamics.rst
@@ -84,6 +84,8 @@ For example, we can plot the Pauli expectation value for the above simulation as
 In particular, for each time step, `evolve` captures an array of expectation values, one for each  
 observable. Hence, we convert them into sequences for plotting purposes.
 
+Examples that illustrate how to use the ``dynamics`` target are available 
+in the `CUDA-Q repository <https://github.com/NVIDIA/cuda-quantum/tree/main/docs/sphinx/examples/python/dynamics>`__. 
 
 Operator
 +++++++++++
@@ -272,4 +274,45 @@ backend target.
     If the output is a '`None`' string, it indicates that your Torch installation does not support CUDA.
     In this case, you need to install a CUDA-enabled Torch package via other mechanisms, e.g., building Torch from source or
     using their Docker images.
- 
\ No newline at end of file
+
+Multi-GPU Multi-Node Execution
++++++++++++++++++++++++++++++++
+
+.. _cudensitymat_mgmn:
+
+CUDA-Q ``dynamics`` target supports parallel execution on multiple GPUs. 
+To enable parallel execution, the application must initialize MPI as follows.
+
+
+.. tab:: Python
+
+  .. literalinclude:: ../../snippets/python/using/backends/dynamics.py
+        :language: python
+        :start-after: [Begin MPI]
+        :end-before: [End MPI]
+
+  .. code:: bash 
+
+        mpiexec -np <N> python3 program.py 
+  
+  where ``N`` is the number of processes.
+
+
+By initializing the MPI execution environment (via `cudaq.mpi.initialize()`) in the application code and
+invoking it via an MPI launcher, we have activated the multi-node multi-GPU feature of the ``dynamics`` target.
+Specifically, it will detect the number of processes (GPUs) and distribute the computation across all available GPUs.
+
+
+.. note::
+    The number of MPI processes must be a power of 2, one GPU per process.
+
+.. note::
+    Not all integrators are capable of handling distributed state. Errors will be raised if parallel execution is activated 
+    but the selected integrator does not support distributed state. 
+
+.. warning:: 
+    As of cuQuantum version 24.11, there are a couple of `known limitations <https://docs.nvidia.com/cuda/cuquantum/24.11.0/cudensitymat/index.html>`__ for parallel execution:
+
+    - Computing the expectation value of a mixed quantum state is not supported. Thus, `collapse_operators` are not supported if expectation calculation is required.
+
+    - Some combinations of quantum states and quantum many-body operators are not supported. Errors will be raised in those cases. 
diff --git a/python/cudaq/operator/__init__.py b/python/cudaq/operator/__init__.py
index 1ddad5562a..c46aed64bd 100644
--- a/python/cudaq/operator/__init__.py
+++ b/python/cudaq/operator/__init__.py
@@ -9,5 +9,5 @@
 from .definitions import operators, spin
 from .evolution import evolve, evolve_async
 from .expressions import Operator, OperatorSum, ProductOperator, ElementaryOperator, ScalarOperator, RydbergHamiltonian
-from .helpers import NumericType
+from .helpers import NumericType, InitialState
 from .schedule import Schedule
diff --git a/python/cudaq/operator/cudm_solver.py b/python/cudaq/operator/cudm_solver.py
index 85a9cc536b..f1dad8c35d 100644
--- a/python/cudaq/operator/cudm_solver.py
+++ b/python/cudaq/operator/cudm_solver.py
@@ -16,6 +16,7 @@
 from ..mlir._mlir_libs._quakeDialects import cudaq_runtime
 from .cudm_helpers import cudm, CudmStateType
 from .cudm_state import CuDensityMatState, as_cudm_state
+from .helpers import InitialState, InitialStateArgT
 from .integrator import BaseIntegrator
 from .integrators.builtin_integrators import RungeKuttaIntegrator, cuDensityMatTimeStepper
 import cupy
@@ -28,7 +29,7 @@ def evolve_dynamics(
         hamiltonian: Operator,
         dimensions: Mapping[int, int],
         schedule: Schedule,
-        initial_state: cudaq_runtime.State,
+        initial_state: InitialStateArgT,
         collapse_operators: Sequence[Operator] = [],
         observables: Sequence[Operator] = [],
         store_intermediate_results=False,
@@ -43,8 +44,21 @@ def evolve_dynamics(
     schedule.reset()
     hilbert_space_dims = tuple(dimensions[d] for d in range(len(dimensions)))
 
-    with ScopeTimer("evolve.as_cudm_state") as timer:
-        initial_state = as_cudm_state(initial_state)
+    # Check that the integrator can support distributed state if this is a distributed simulation.
+    if cudaq_runtime.mpi.is_initialized() and cudaq_runtime.mpi.num_ranks(
+    ) > 1 and integrator is not None and not integrator.support_distributed_state(
+    ):
+        raise ValueError(
+            f"Integrator {type(integrator).__name__} does not support distributed state."
+        )
+
+    if isinstance(initial_state, InitialState):
+        has_collapse_operators = len(collapse_operators) > 0
+        initial_state = CuDensityMatState.create_initial_state(
+            initial_state, hilbert_space_dims, has_collapse_operators)
+    else:
+        with ScopeTimer("evolve.as_cudm_state") as timer:
+            initial_state = as_cudm_state(initial_state)
 
     if not isinstance(initial_state, CuDensityMatState):
         raise ValueError("Unknown type")
diff --git a/python/cudaq/operator/cudm_state.py b/python/cudaq/operator/cudm_state.py
index f1d8fd25b4..1cb5e47113 100644
--- a/python/cudaq/operator/cudm_state.py
+++ b/python/cudaq/operator/cudm_state.py
@@ -11,6 +11,13 @@
 from typing import Sequence
 from cupy.cuda.memory import MemoryPointer, UnownedMemory
 from ..mlir._mlir_libs._quakeDialects import cudaq_runtime
+from cuquantum.bindings import cudensitymat as cudm
+from .helpers import InitialState
+
+
+def is_multi_processes():
+    return cudaq_runtime.mpi.is_initialized() and cudaq_runtime.mpi.num_ranks(
+    ) > 1
 
 
 # Wrap state data (on device memory) as a `cupy` array.
@@ -35,7 +42,17 @@ class CuDensityMatState(object):
 
     def __init__(self, data):
         if self.__ctx is None:
-            self.__ctx = WorkStream()
+            if (is_multi_processes()):
+                self.__ctx = WorkStream(device_id=cupy.cuda.runtime.getDevice())
+                # FIXME: use the below once `cudensitymat` supports raw MPI Comm pointer.
+                # `ctx.set_communicator(comm=cudaq_runtime.mpi.comm_dup(), provider="MPI")`
+                # At the moment, only `mpi4py` communicator objects are supported, thus we use the underlying `reset_distributed_configuration` API.
+                _comm_ptr, _size = cudaq_runtime.mpi.comm_dup()
+                cudm.reset_distributed_configuration(
+                    self.__ctx._handle._validated_ptr,
+                    cudm.DistributedProvider.MPI, _comm_ptr, _size)
+            else:
+                self.__ctx = WorkStream()
 
         self.hilbert_space_dims = None
         if isinstance(data, DenseMixedState) or isinstance(
@@ -46,28 +63,40 @@ def __init__(self, data):
             self.raw_data = data
             self.state = None
 
+    def try_init_state(self, shape):
+        """Try initialize state according to a shape, e.g., density matrix or state vector."""
+        slice_shape, slice_offsets = self.state.local_info
+        state_size = numpy.prod(shape)
+        if state_size == self.raw_data.size:
+            slice_obj = tuple(
+                slice(offset, offset + size) for offset, size in zip(
+                    slice_offsets, slice_shape))[:len(shape)]
+            self.raw_data = cupy.asfortranarray(self.raw_data.reshape(shape))
+            self.raw_data = cupy.asfortranarray(self.raw_data[slice_obj].copy())
+            self.state.attach_storage(self.raw_data)
+        else:
+            self.raw_data = cupy.asfortranarray(
+                self.raw_data.reshape(slice_shape))
+            self.state.attach_storage(self.raw_data)
+
     def init_state(self, hilbert_space_dims: Sequence[int]):
         if self.state is None:
             self.hilbert_space_dims = hilbert_space_dims
             dm_shape = hilbert_space_dims * 2
             sv_shape = hilbert_space_dims
             try:
-                self.raw_data = cupy.asfortranarray(
-                    self.raw_data.reshape(dm_shape))
                 self.state = DenseMixedState(self.__ctx,
                                              self.hilbert_space_dims,
                                              batch_size=1,
                                              dtype="complex128")
-                self.state.attach_storage(self.raw_data)
+                self.try_init_state(dm_shape)
             except:
                 try:
-                    self.raw_data = cupy.asfortranarray(
-                        self.raw_data.reshape(sv_shape))
                     self.state = DensePureState(self.__ctx,
                                                 self.hilbert_space_dims,
                                                 batch_size=1,
                                                 dtype="complex128")
-                    self.state.attach_storage(self.raw_data)
+                    self.try_init_state(sv_shape)
                 except:
                     raise ValueError(
                         f"Invalid state data: state data must be either a state vector (equivalent to {sv_shape} shape) or a density matrix (equivalent to {dm_shape} shape)."
@@ -83,6 +112,75 @@ def is_density_matrix(self) -> bool:
     def from_data(data):
         return CuDensityMatState(data)
 
+    @staticmethod
+    def create_initial_state(type: InitialState,
+                             hilbert_space_dims: Sequence[int],
+                             mix_state: bool):
+        new_state = CuDensityMatState(None)
+        new_state.hilbert_space_dims = hilbert_space_dims
+
+        if mix_state:
+            new_state.state = DenseMixedState(new_state.__ctx,
+                                              new_state.hilbert_space_dims,
+                                              batch_size=1,
+                                              dtype="complex128")
+        else:
+            new_state.state = DensePureState(new_state.__ctx,
+                                             new_state.hilbert_space_dims,
+                                             batch_size=1,
+                                             dtype="complex128")
+        required_buffer_size = new_state.state.storage_size
+        slice_shape, slice_offsets = new_state.state.local_info
+
+        if type == InitialState.ZERO:
+            buffer = cupy.asfortranarray(
+                cupy.zeros((required_buffer_size,),
+                           dtype="complex128",
+                           order="F"))
+            bitstring_is_local = False
+            # Follow `cudensitymat` example to set the amplitude based on `local_info`
+            for slice_dim, slice_offset in zip(slice_shape, slice_offsets):
+                bitstring_is_local = 0 in range(slice_offset,
+                                                slice_offset + slice_dim)
+                if not bitstring_is_local:
+                    break
+            if bitstring_is_local:
+                buffer[0] = 1.0
+            new_state.state.raw_data = cupy.asfortranarray(
+                buffer.reshape(slice_shape))
+            new_state.state.attach_storage(new_state.state.raw_data)
+        elif type == InitialState.UNIFORM:
+            buffer = cupy.asfortranarray(
+                cupy.zeros((required_buffer_size,),
+                           dtype="complex128",
+                           order="F"))
+            hilberg_space_size = numpy.cumprod(hilbert_space_dims)[-1]
+            if mix_state:
+                dm_shape = hilbert_space_dims * 2
+                # FIXME: currently, we use host-device data transfer, hence a host allocation is required.
+                # A custom GPU memory initialization can be also used so that no host allocation is needed.
+                host_array = (1. / (hilberg_space_size)) * numpy.identity(
+                    hilberg_space_size, dtype="complex128")
+                host_array = host_array.reshape(dm_shape)
+                slice_obj = []
+                for i in range(len(slice_offsets) - 1):
+                    slice_obj.append(
+                        slice(slice_offsets[i],
+                              slice_offsets[i] + slice_shape[i]))
+                slice_obj = tuple(slice_obj)
+                sliced_host_array = numpy.ravel(host_array[slice_obj].copy())
+                buffer = cupy.array(sliced_host_array)
+            else:
+                buffer[:] = 1. / numpy.sqrt(hilberg_space_size)
+
+            new_state.state.raw_data = cupy.asfortranarray(
+                buffer.reshape(slice_shape))
+            new_state.state.attach_storage(new_state.state.raw_data)
+        else:
+            raise ValueError("Unsupported initial state type")
+
+        return new_state
+
     def get_impl(self):
         return self.state
 
diff --git a/python/cudaq/operator/evolution.py b/python/cudaq/operator/evolution.py
index 7d1f68020b..a4a9cd0bcd 100644
--- a/python/cudaq/operator/evolution.py
+++ b/python/cudaq/operator/evolution.py
@@ -16,10 +16,9 @@
 import warnings
 
 from .expressions import Operator, RydbergHamiltonian
-from .helpers import NumericType
+from .helpers import NumericType, InitialState, InitialStateArgT
 from .integrator import BaseIntegrator
 from .schedule import Schedule
-
 from ..kernel.register_op import register_operation
 from ..kernel.utils import ahkPrefix
 from ..mlir._mlir_libs._quakeDialects import cudaq_runtime
@@ -193,7 +192,7 @@ def evolve_single(
         hamiltonian: Operator,
         dimensions: Mapping[int, int],
         schedule: Schedule,
-        initial_state: cudaq_runtime.State,
+        initial_state: InitialStateArgT,
         collapse_operators: Sequence[Operator] = [],
         observables: Sequence[Operator] = [],
         store_intermediate_results=False,
@@ -251,6 +250,25 @@ def evolve_single(
         step_parameters, dt)
     if shots_count is None:
         shots_count = -1
+    if isinstance(initial_state, InitialState):
+        # This is an initial state enum, create concrete state.
+        state_size = 2**num_qubits
+        if initial_state == InitialState.ZERO:
+            state_data = numpy.zeros(state_size, dtype=numpy.complex128)
+            state_data[0] = 1.0
+        elif initial_state == InitialState.UNIFORM:
+            state_data = (1. / numpy.sqrt(state_size)) * numpy.ones(
+                state_size, dtype=numpy.complex128)
+        else:
+            raise ValueError("Unsupported initial state type")
+
+        sim_name = cudaq_runtime.get_target().simulator.strip()
+        if sim_name == "dm":
+            initial_state = cudaq_runtime.State.from_data(
+                numpy.outer(state_data, numpy.conj(state_data)))
+        else:
+            initial_state = cudaq_runtime.State.from_data(state_data)
+
     if store_intermediate_results:
         evolution = _evolution_kernel(
             num_qubits,
@@ -292,7 +310,7 @@ def evolve(
     hamiltonian: Operator,
     dimensions: Mapping[int, int] = {},
     schedule: Schedule = None,
-    initial_state: cudaq_runtime.State | Sequence[cudaq_runtime.State] = None,
+    initial_state: InitialStateArgT | Sequence[InitialStateArgT] = None,
     collapse_operators: Sequence[Operator] = [],
     observables: Sequence[Operator] = [],
     store_intermediate_results=False,
@@ -395,7 +413,7 @@ def evolve_single_async(
         hamiltonian: Operator,
         dimensions: Mapping[int, int],
         schedule: Schedule,
-        initial_state: cudaq_runtime.State,
+        initial_state: InitialStateArgT,
         collapse_operators: Sequence[Operator] = [],
         observables: Sequence[Operator] = [],
         store_intermediate_results=False,
@@ -506,7 +524,7 @@ def evolve_async(
     hamiltonian: Operator,
     dimensions: Mapping[int, int] = {},
     schedule: Schedule = None,
-    initial_state: cudaq_runtime.State | Sequence[cudaq_runtime.State] = None,
+    initial_state: InitialStateArgT | Sequence[InitialStateArgT] = None,
     collapse_operators: Sequence[Operator] = [],
     observables: Sequence[Operator] = [],
     store_intermediate_results=False,
diff --git a/python/cudaq/operator/helpers.py b/python/cudaq/operator/helpers.py
index be4971fe8f..d88810723d 100644
--- a/python/cudaq/operator/helpers.py
+++ b/python/cudaq/operator/helpers.py
@@ -9,6 +9,7 @@
 import inspect, itertools, numpy, os, re, sys, typing  # type: ignore
 from typing import Any, Callable, Iterable, Mapping, Optional, Sequence, Tuple
 from numpy.typing import NDArray
+from enum import Enum
 
 from ..mlir._mlir_libs._quakeDialects import cudaq_runtime
 
@@ -21,6 +22,17 @@
     NumericType = typing.Union[numpy.complexfloating, complex, float, int]
 
 
+class InitialState(Enum):
+    """
+    Enum to specify the initial quantum state.
+    """
+    ZERO = 1
+    UNIFORM = 2
+
+
+InitialStateArgT = cudaq_runtime.State | InitialState
+
+
 class _OperatorHelpers:
 
     @staticmethod
diff --git a/python/cudaq/operator/integrator.py b/python/cudaq/operator/integrator.py
index d7837d80e7..010565bacc 100644
--- a/python/cudaq/operator/integrator.py
+++ b/python/cudaq/operator/integrator.py
@@ -75,3 +75,9 @@ def get_state(self) -> Tuple[float, TState]:
         Obtain the state of the integrator as a pair (t, state).
         """
         return (self.t, self.state)
+
+    def support_distributed_state(self):
+        """
+        Returns true if the integrator supports distributed state else returns false. Default is set to false.
+        """
+        return False
diff --git a/python/cudaq/operator/integrators/builtin_integrators.py b/python/cudaq/operator/integrators/builtin_integrators.py
index cff2a8e30c..355524541f 100644
--- a/python/cudaq/operator/integrators/builtin_integrators.py
+++ b/python/cudaq/operator/integrators/builtin_integrators.py
@@ -62,6 +62,9 @@ def __init__(self,
         super().__init__(**kwargs)
         self.stepper = stepper
 
+    def support_distributed_state(self):
+        return True
+
     def __post_init__(self):
         if "nsteps" in self.integrator_options:
             self.n_steps = self.integrator_options["nsteps"]
diff --git a/python/extension/CUDAQuantumExtension.cpp b/python/extension/CUDAQuantumExtension.cpp
index 12911c3af7..56bc5954f1 100644
--- a/python/extension/CUDAQuantumExtension.cpp
+++ b/python/extension/CUDAQuantumExtension.cpp
@@ -169,6 +169,14 @@ PYBIND11_MODULE(_quakeDialects, m) {
       "Returns true if MPI has already been initialized.");
   mpiSubmodule.def(
       "finalize", []() { cudaq::mpi::finalize(); }, "Finalize MPI.");
+  mpiSubmodule.def(
+      "comm_dup",
+      []() {
+        const auto [commPtr, commSize] = cudaq::mpi::comm_dup();
+        return std::make_pair(reinterpret_cast<intptr_t>(commPtr), commSize);
+      },
+      "Duplicates the communicator. Return the new communicator address (as an "
+      "integer) and its size in bytes");
 
   auto orcaSubmodule = cudaqRuntime.def_submodule("orca");
   orcaSubmodule.def(
diff --git a/python/tests/operator/test_evolve_dynamics.py b/python/tests/operator/test_evolve_dynamics.py
index 7ae3197abf..ebb6fed859 100644
--- a/python/tests/operator/test_evolve_dynamics.py
+++ b/python/tests/operator/test_evolve_dynamics.py
@@ -7,12 +7,12 @@
 # ============================================================================ #
 import pytest
 import cudaq
-# Note: the test model may create state, hence need to set the target to "dynamics"
-cudaq.set_target("dynamics")
 
 if cudaq.num_available_gpus() == 0:
     pytest.skip("Skipping GPU tests", allow_module_level=True)
 else:
+    # Note: the test model may create state, hence need to set the target to "dynamics"
+    cudaq.set_target("dynamics")
     from system_models import *
 
 
diff --git a/python/tests/operator/test_evolve_simulators.py b/python/tests/operator/test_evolve_simulators.py
index 85e2b6b9dc..2fb2bac0e0 100644
--- a/python/tests/operator/test_evolve_simulators.py
+++ b/python/tests/operator/test_evolve_simulators.py
@@ -171,7 +171,8 @@ def do_something():
 ]
 
 
-def test_evolve():
+@pytest.mark.parametrize("init_state", ["array", "enum"])
+def test_evolve(init_state):
     # Set random seed for shots-based observe test.
     cudaq.set_random_seed(13)
 
@@ -182,8 +183,11 @@ def test_evolve():
     dimensions = {0: 2}
 
     # Initial state of the system (ground state).
-    rho0 = cudaq.State.from_data(
-        np.array([[1.0, 0.0], [0.0, 0.0]], dtype=np.complex128))
+    if init_state == "array":
+        rho0 = cudaq.State.from_data(
+            np.array([[1.0, 0.0], [0.0, 0.0]], dtype=np.complex128))
+    elif init_state == "enum":
+        rho0 = InitialState.ZERO
 
     # Schedule of time steps.
     steps = np.linspace(0, 10, 101)
diff --git a/python/tests/parallel/CMakeLists.txt b/python/tests/parallel/CMakeLists.txt
index 7b1741e5ec..651d4cddcd 100644
--- a/python/tests/parallel/CMakeLists.txt
+++ b/python/tests/parallel/CMakeLists.txt
@@ -26,3 +26,13 @@ set_tests_properties(
                                    "PYTHONPATH=${CMAKE_BINARY_DIR}/python"
                                    LABELS 
                                    "gpu_required;mgpus_required")
+
+add_test(
+  NAME cudaq-py-parallel-dynamics
+  COMMAND ${MPIEXEC_EXECUTABLE} --allow-run-as-root -np 2 ${PYTHON_EXECUTABLE} test_mpi_dynamics.py
+  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+set_tests_properties(
+  cudaq-py-parallel-dynamics PROPERTIES ENVIRONMENT
+                                    "PYTHONPATH=${CMAKE_BINARY_DIR}/python"
+                                    LABELS 
+                                    "gpu_required;mgpus_required")
diff --git a/python/tests/parallel/test_mpi_dynamics.py b/python/tests/parallel/test_mpi_dynamics.py
new file mode 100644
index 0000000000..ba8ee367a2
--- /dev/null
+++ b/python/tests/parallel/test_mpi_dynamics.py
@@ -0,0 +1,71 @@
+# ============================================================================ #
+# Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                   #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under     #
+# the terms of the Apache License 2.0 which accompanies this distribution.     #
+# ============================================================================ #
+
+import cudaq, os, pytest
+from cudaq import operators, spin, Schedule, RungeKuttaIntegrator
+import numpy as np
+
+skipIfUnsupported = pytest.mark.skipif(
+    not (cudaq.num_available_gpus() > 1 and cudaq.has_target('dynamics')),
+    reason="dynamics backend not available or not a multi-GPU machine")
+
+
+@pytest.fixture(autouse=True)
+def do_something():
+    cudaq.mpi.initialize()
+    cudaq.set_target('dynamics')
+    yield
+    cudaq.reset_target()
+    cudaq.mpi.finalize()
+
+
+@skipIfUnsupported
+def testMpiRun():
+    # Large number of spins
+    N = 20
+    dimensions = {}
+    for i in range(N):
+        dimensions[i] = 2
+
+    # Observable is the average magnetization operator
+    avg_magnetization_op = operators.zero()
+    for i in range(N):
+        avg_magnetization_op += (spin.z(i) / N)
+
+    # Arbitrary coupling constant
+    g = 1.0
+    # Construct the Hamiltonian
+    H = operators.zero()
+    for i in range(N):
+        H += 2 * np.pi * spin.x(i)
+        H += 2 * np.pi * spin.y(i)
+    for i in range(N - 1):
+        H += 2 * np.pi * g * spin.x(i) * spin.x(i + 1)
+        H += 2 * np.pi * g * spin.y(i) * spin.z(i + 1)
+
+    steps = np.linspace(0.0, 1, 100)
+    schedule = Schedule(steps, ["time"])
+
+    # Initial state (expressed as an enum)
+    psi0 = cudaq.operator.InitialState.ZERO
+
+    # Run the simulation
+    evolution_result = cudaq.evolve(H,
+                                    dimensions,
+                                    schedule,
+                                    psi0,
+                                    observables=[avg_magnetization_op],
+                                    collapse_operators=[],
+                                    store_intermediate_results=False,
+                                    integrator=RungeKuttaIntegrator())
+
+
+# leave for gdb debugging
+if __name__ == "__main__":
+    loc = os.path.abspath(__file__)
+    pytest.main([loc, "-s"])
diff --git a/runtime/cudaq.h b/runtime/cudaq.h
index adf27cf38a..fb5f6b9182 100644
--- a/runtime/cudaq.h
+++ b/runtime/cudaq.h
@@ -307,6 +307,10 @@ void broadcast(std::vector<double> &data, int rootRank);
 /// @brief Broadcast a string from a process (rootRank) to all other processes.
 void broadcast(std::string &data, int rootRank);
 
+/// @brief Duplicate the communicator. Returns the new communicator (as a void*)
+/// and its size.
+std::pair<void *, std::size_t> comm_dup();
+
 /// @brief Finalize MPI. This function
 /// is a no-op if there CUDA-Q has not been built
 /// against MPI.
diff --git a/runtime/cudaq/cudaq.cpp b/runtime/cudaq/cudaq.cpp
index 5e8617ff38..071f658f43 100644
--- a/runtime/cudaq/cudaq.cpp
+++ b/runtime/cudaq/cudaq.cpp
@@ -172,6 +172,16 @@ void broadcast(std::string &data, int rootRank) {
   commPlugin->broadcast(data, rootRank);
 }
 
+std::pair<void *, std::size_t> comm_dup() {
+  auto *commPlugin = getMpiPlugin();
+  cudaqDistributedCommunicator_t *dupComm = nullptr;
+  cudaqDistributedCommunicator_t *comm = commPlugin->getComm();
+  const auto dupStatus = commPlugin->get()->CommDup(comm, &dupComm);
+  if (dupStatus != 0 || dupComm == nullptr)
+    throw std::runtime_error("Failed to duplicate the MPI communicator.");
+  return std::make_pair(dupComm->commPtr, dupComm->commSize);
+}
+
 void finalize() {
   if (rank() == 0)
     cudaq::info("Finalizing MPI.");
diff --git a/runtime/cudaq/distributed/builtin/mpi_comm_impl.cpp b/runtime/cudaq/distributed/builtin/mpi_comm_impl.cpp
index 04ef3c066a..fb027d261c 100644
--- a/runtime/cudaq/distributed/builtin/mpi_comm_impl.cpp
+++ b/runtime/cudaq/distributed/builtin/mpi_comm_impl.cpp
@@ -230,7 +230,15 @@ static int mpi_AllgatherV(const cudaqDistributedCommunicator_t *comm,
 /// @brief Wrapper of MPI_Isend and track pending requests for synchronization
 static int mpi_SendAsync(const cudaqDistributedCommunicator_t *comm,
                          const void *buf, int count, DataType dataType,
-                         int peer, int32_t tag) {
+                         int peer, int32_t tag, void *mpiRequest) {
+  // If a specific request was provided, just make MPI_Irecv call
+  if (mpiRequest)
+    return MPI_Isend(buf, count, convertType(dataType), peer, tag,
+                     unpackMpiCommunicator(comm), (MPI_Request *)mpiRequest);
+
+  // Otherwise, use the implicit request tracking mechanism (1 send and 1
+  // receive async call)
+
   std::scoped_lock<std::mutex> lock(PendingRequest::g_mutex);
   if (PendingRequest::g_requests[comm].nActiveRequests == 2)
     return -1;
@@ -249,7 +257,15 @@ static int mpi_SendAsync(const cudaqDistributedCommunicator_t *comm,
 
 /// @brief Wrapper of MPI_Irecv and track pending requests for synchronization
 static int mpi_RecvAsync(const cudaqDistributedCommunicator_t *comm, void *buf,
-                         int count, DataType dataType, int peer, int32_t tag) {
+                         int count, DataType dataType, int peer, int32_t tag,
+                         void *mpiRequest) {
+  // If a specific request was provided, just make MPI_Irecv call
+  if (mpiRequest)
+    return MPI_Irecv(buf, count, convertType(dataType), peer, tag,
+                     unpackMpiCommunicator(comm), (MPI_Request *)mpiRequest);
+
+  // Otherwise, use the implicit request tracking mechanism (1 send and 1
+  // receive async call)
   std::scoped_lock<std::mutex> lock(PendingRequest::g_mutex);
   if (PendingRequest::g_requests[comm].nActiveRequests == 2)
     return -1;
@@ -339,6 +355,50 @@ static int mpi_CommSplit(const cudaqDistributedCommunicator_t *comm,
   return status;
 }
 
+/// @brief Wrapper MPI_Request malloc
+static int mpi_CreateRequest(void **request) {
+  *request = malloc(sizeof(MPI_Request));
+  if (*request == NULL)
+    return -1;
+  return 0;
+}
+
+/// @brief Wrapper MPI_Request free
+static int mpi_DestroyRequest(void *request) {
+  if (request == NULL)
+    return -1;
+  free(request);
+  return 0;
+}
+
+/// @brief Wrapper MPI_Wait
+static int mpi_WaitRequest(void *request) {
+  MPI_Status waitStatus;
+  return MPI_Wait((MPI_Request *)request, &waitStatus);
+}
+
+/// @brief Wrapper MPI_Test
+static int mpi_TestRequest(void *request, int32_t *completed) {
+  MPI_Status testStatus;
+  return MPI_Test((MPI_Request *)request, completed, &testStatus);
+}
+
+/// @brief Wrapper MPI_Send
+static int mpi_Send(const cudaqDistributedCommunicator_t *comm,
+                    const void *buffer, int count, DataType datatype,
+                    int destination, int32_t tag) {
+  return MPI_Send(buffer, count, convertType(datatype), destination, tag,
+                  unpackMpiCommunicator(comm));
+}
+
+/// @brief Wrapper MPI_Recv
+static int mpi_Recv(const cudaqDistributedCommunicator_t *comm, void *buffer,
+                    int count, DataType datatype, int source, int32_t tag) {
+  MPI_Status recvStatus;
+  return MPI_Recv(buffer, count, convertType(datatype), source, tag,
+                  unpackMpiCommunicator(comm), &recvStatus);
+}
+
 /// @brief Return the underlying MPI_Comm as a type-erased object
 cudaqDistributedCommunicator_t *getMpiCommunicator() {
   static MPI_Comm pluginComm = MPI_COMM_WORLD;
@@ -370,7 +430,14 @@ cudaqDistributedInterface_t *getDistributedInterface() {
       mpi_Synchronize,
       mpi_Abort,
       mpi_CommDup,
-      mpi_CommSplit};
+      mpi_CommSplit,
+      mpi_CreateRequest,
+      mpi_DestroyRequest,
+      mpi_WaitRequest,
+      mpi_TestRequest,
+      mpi_Send,
+      mpi_Recv,
+  };
   return &cudaqDistributedInterface;
 }
 }
diff --git a/runtime/cudaq/distributed/distributed_capi.h b/runtime/cudaq/distributed/distributed_capi.h
index 3541b1fcc8..e52fb39eee 100644
--- a/runtime/cudaq/distributed/distributed_capi.h
+++ b/runtime/cudaq/distributed/distributed_capi.h
@@ -94,14 +94,14 @@ typedef struct {
   /// `Synchronize` should be called as appropriate to resolve in-flight
   /// requests before making new ones.
   int (*SendAsync)(const cudaqDistributedCommunicator_t *, const void *, int,
-                   DataType, int, int32_t);
+                   DataType, int, int32_t, void *);
   /// @brief MPI_Irecv
   /// @note The MPI plugin API allows for a maximum of two concurrent
   /// non-blocking requests (e.g., one `Isend` and one `Irecv`). Hence,
   /// `Synchronize` should be called as appropriate to resolve in-flight
   /// requests before making new ones.
   int (*RecvAsync)(const cudaqDistributedCommunicator_t *, void *, int,
-                   DataType, int, int32_t);
+                   DataType, int, int32_t, void *);
   /// @brief MPI_Isend and MPI_Irecv in one call
   /// @note The MPI plugin API allows for a maximum of two concurrent
   /// non-blocking requests (e.g., one `Isend` and one `Irecv`). Since this
@@ -119,5 +119,19 @@ typedef struct {
   /// @brief MPI_Comm_split
   int (*CommSplit)(const cudaqDistributedCommunicator_t *, int32_t, int32_t,
                    cudaqDistributedCommunicator_t **);
+  /// @brief Create an MPI_Request
+  int (*CreateRequest)(void **);
+  /// @brief Free a previously-allocated MPI_Request
+  int (*DestroyRequest)(void *);
+  /// @brief Wait for the completion of a MPI_Request
+  int (*WaitRequest)(void *);
+  /// @brief Test for the completion of a MPI_Request
+  int (*TestRequest)(void *, int32_t *);
+  /// @brief MPI_Send
+  int (*Send)(const cudaqDistributedCommunicator_t *, const void *, int,
+              DataType, int, int32_t);
+  /// @brief MPI_Recv
+  int (*Recv)(const cudaqDistributedCommunicator_t *, void *, int, DataType,
+              int, int32_t);
 } cudaqDistributedInterface_t;
-}
\ No newline at end of file
+}
diff --git a/runtime/nvqir/cudensitymat/CMakeLists.txt b/runtime/nvqir/cudensitymat/CMakeLists.txt
index 426bd819ec..38a69559aa 100644
--- a/runtime/nvqir/cudensitymat/CMakeLists.txt
+++ b/runtime/nvqir/cudensitymat/CMakeLists.txt
@@ -11,13 +11,27 @@ find_package(CUDAToolkit REQUIRED)
 
 set(INTERFACE_POSITION_INDEPENDENT_CODE ON)
 set(LIBRARY_NAME nvqir-dynamics)
-add_library(${LIBRARY_NAME} SHARED CuDensityMatSim.cpp)
+
+# Find cudensitymat header (for MPI plugin interface typedef)
+find_file(CUDENSITYMAT_INC
+    NAMES   cudensitymat.h
+    HINTS   
+        $ENV{CUQUANTUM_INSTALL_PREFIX}/include      
+        /usr/include    
+        ENV CPATH
+)
+
+message(STATUS "cudensitymat header: ${CUDENSITYMAT_INC}")
+get_filename_component(CUDENSITYMAT_INCLUDE_DIR ${CUDENSITYMAT_INC} DIRECTORY)
+
+add_library(${LIBRARY_NAME} SHARED CuDensityMatSim.cpp mpi_support.cpp)
 
 target_include_directories(${LIBRARY_NAME}
   PRIVATE 
     . .. 
     ${CUDAToolkit_INCLUDE_DIRS} 
     ${CMAKE_SOURCE_DIR}/runtime/common
+    ${CUDENSITYMAT_INCLUDE_DIR}
 )
 
 target_link_libraries(${LIBRARY_NAME} 
diff --git a/runtime/nvqir/cudensitymat/CuDensityMatSim.cpp b/runtime/nvqir/cudensitymat/CuDensityMatSim.cpp
index ac0ad95f0f..ba56d6fdbf 100644
--- a/runtime/nvqir/cudensitymat/CuDensityMatSim.cpp
+++ b/runtime/nvqir/cudensitymat/CuDensityMatSim.cpp
@@ -8,7 +8,54 @@
 
 #include "CircuitSimulator.h"
 #include "CuDensityMatState.h"
+#include "cudaq.h"
+#include "cudaq/distributed/mpi_plugin.h"
+
 namespace {
+// Hook to query this shared lib file location at runtime.
+extern "C" {
+void cuDensityMatSimCppFileMarker() { return; }
+}
+/// @brief Query the full path to the this lib.
+static const char *getThisSharedLibFilePath() {
+  static thread_local std::string LIB_PATH;
+  if (LIB_PATH.empty()) {
+    // Use dladdr query this .so file
+    void *funcPtrToFind = (void *)(intptr_t)cuDensityMatSimCppFileMarker;
+    Dl_info DLInfo;
+    int err = dladdr(funcPtrToFind, &DLInfo);
+    if (err != 0) {
+      char link_path[PATH_MAX];
+      // If the filename is a symlink, we need to resolve and return the
+      // location of the actual .so file.
+      if (realpath(DLInfo.dli_fname, link_path))
+        LIB_PATH = link_path;
+    }
+  }
+
+  return LIB_PATH.c_str();
+}
+
+/// @brief Retrieve the path to the plugin implementation
+std::string getMpiPluginFilePath() {
+  auto mpiPlugin = cudaq::mpi::getMpiPlugin();
+  if (!mpiPlugin)
+    throw std::runtime_error("Failed to retrieve MPI plugin");
+
+  return mpiPlugin->getPluginPath();
+}
+
+void initCuDensityMatCommLib() {
+  // If CUDENSITYMAT_COMM_LIB environment variable is not set,
+  // use this builtin plugin shim (redirect MPI calls to CUDA-Q plugin)
+  if (std::getenv("CUDENSITYMAT_COMM_LIB") == nullptr) {
+    cudaq::info("Enabling cuDensityMat MPI without environment variable "
+                "CUDENSITYMAT_COMM_LIB. \nUse the builtin cuTensorNet "
+                "communicator lib from '{}' - CUDA-Q MPI plugin {}.",
+                getThisSharedLibFilePath(), getMpiPluginFilePath());
+    setenv("CUDENSITYMAT_COMM_LIB", getThisSharedLibFilePath(), 0);
+  }
+}
 
 class CuDensityMatSim : public nvqir::CircuitSimulatorBase<double> {
 protected:
@@ -30,7 +77,15 @@ class CuDensityMatSim : public nvqir::CircuitSimulatorBase<double> {
 
 public:
   /// @brief The constructor
-  CuDensityMatSim() {}
+  CuDensityMatSim() {
+    int numDevices{0};
+    HANDLE_CUDA_ERROR(cudaGetDeviceCount(&numDevices));
+    const int deviceId =
+        cudaq::mpi::is_initialized() ? cudaq::mpi::rank() % numDevices : 0;
+    if (cudaq::mpi::is_initialized())
+      initCuDensityMatCommLib();
+    HANDLE_CUDA_ERROR(cudaSetDevice(deviceId));
+  }
 
   /// The destructor
   virtual ~CuDensityMatSim() = default;
diff --git a/runtime/nvqir/cudensitymat/mpi_support.cpp b/runtime/nvqir/cudensitymat/mpi_support.cpp
new file mode 100644
index 0000000000..5d8605e33d
--- /dev/null
+++ b/runtime/nvqir/cudensitymat/mpi_support.cpp
@@ -0,0 +1,246 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+#include "common/Logger.h"
+#include "cudaq/distributed/mpi_plugin.h"
+#include <cassert>
+#include <cudensitymat.h>
+#include <dlfcn.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/// @brief Retrieve the MPI plugin comm interface
+static cudaqDistributedInterface_t *getMpiPluginInterface() {
+  auto mpiPlugin = cudaq::mpi::getMpiPlugin();
+  if (!mpiPlugin)
+    throw std::runtime_error("Failed to retrieve MPI plugin");
+  cudaqDistributedInterface_t *mpiInterface = mpiPlugin->get();
+  if (!mpiInterface)
+    throw std::runtime_error("Invalid MPI distributed plugin encountered");
+  return mpiInterface;
+}
+
+// Implementing cudensitymat's COMM interface by delegating wrapped MPI calls to
+// the underlying CUDA-Q MPI plugin. This will make this library
+// compatible with CUDENSITYMAT_COMM_LIB API. Converts CUDA data type to the
+// corresponding CUDA-Q shim type enum
+
+/// Convert cudensitymat CUDA datatype enum
+static DataType convertCudaToMpiDataType(const cudaDataType_t cudaDataType) {
+  switch (cudaDataType) {
+  case CUDA_R_8I:
+    return INT_8;
+  case CUDA_R_16I:
+    return INT_16;
+  case CUDA_R_32I:
+    return INT_32;
+  case CUDA_R_64I:
+    return INT_64;
+  case CUDA_R_32F:
+    return FLOAT_32;
+  case CUDA_R_64F:
+    return FLOAT_64;
+  case CUDA_C_32F:
+    return FLOAT_COMPLEX;
+  case CUDA_C_64F:
+    return DOUBLE_COMPLEX;
+  default:
+    throw std::runtime_error("Unsupported data type encountered in "
+                             "cudensitymat communicator plugin");
+  }
+  __builtin_unreachable();
+}
+
+/// Convert the type-erased Comm object
+static cudaqDistributedCommunicator_t
+convertMpiCommunicator(const cudensitymatDistributedCommunicator_t *cutnComm) {
+  cudaqDistributedCommunicator_t comm{cutnComm->commPtr, cutnComm->commSize};
+  return comm;
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Implementation of cudensitymat distributed interface by delegating to CUDA-Q
+// MPI plugin API.
+int cudensitymatMpiCommSize(const cudensitymatDistributedCommunicator_t *comm,
+                            int32_t *numRanks) {
+  ScopedTraceWithContext(__FUNCTION__);
+  auto cudaqComm = convertMpiCommunicator(comm);
+  return getMpiPluginInterface()->getNumRanks(&cudaqComm, numRanks);
+}
+
+int cudensitymatMpiCommSizeShared(
+    const cudensitymatDistributedCommunicator_t *comm, int32_t *numRanks) {
+  ScopedTraceWithContext(__FUNCTION__);
+  auto cudaqComm = convertMpiCommunicator(comm);
+  return getMpiPluginInterface()->getCommSizeShared(&cudaqComm, numRanks);
+}
+
+int cudensitymatMpiCommRank(const cudensitymatDistributedCommunicator_t *comm,
+                            int32_t *procRank) {
+  ScopedTraceWithContext(__FUNCTION__);
+  auto cudaqComm = convertMpiCommunicator(comm);
+  return getMpiPluginInterface()->getProcRank(&cudaqComm, procRank);
+}
+
+int cudensitymatMpiBarrier(const cudensitymatDistributedCommunicator_t *comm) {
+  ScopedTraceWithContext(__FUNCTION__);
+  auto cudaqComm = convertMpiCommunicator(comm);
+  return getMpiPluginInterface()->Barrier(&cudaqComm);
+}
+
+int cudensitymatMpiCreateRequest(cudensitymatDistributedRequest_t *request) {
+  ScopedTraceWithContext(__FUNCTION__);
+  return getMpiPluginInterface()->CreateRequest(request);
+}
+
+int cudensitymatMpiDestroyRequest(cudensitymatDistributedRequest_t request) {
+  ScopedTraceWithContext(__FUNCTION__);
+  return getMpiPluginInterface()->DestroyRequest(request);
+}
+
+int cudensitymatMpiWaitRequest(cudensitymatDistributedRequest_t request) {
+  ScopedTraceWithContext(__FUNCTION__);
+  return getMpiPluginInterface()->WaitRequest(request);
+}
+
+int cudensitymatMpiTestRequest(cudensitymatDistributedRequest_t request,
+                               int32_t *completed) {
+  ScopedTraceWithContext(__FUNCTION__);
+  return getMpiPluginInterface()->TestRequest(request, completed);
+}
+
+int cudensitymatMpiSend(const cudensitymatDistributedCommunicator_t *comm,
+                        const void *buffer, int32_t count,
+                        cudaDataType_t datatype, int32_t destination,
+                        int32_t tag) {
+  ScopedTraceWithContext(__FUNCTION__);
+  auto cudaqComm = convertMpiCommunicator(comm);
+  return getMpiPluginInterface()->Send(&cudaqComm, buffer, count,
+                                       convertCudaToMpiDataType(datatype),
+                                       destination, tag);
+}
+
+int cudensitymatMpiSendAsync(const cudensitymatDistributedCommunicator_t *comm,
+                             const void *buffer, int32_t count,
+                             cudaDataType_t datatype, int32_t destination,
+                             int32_t tag,
+                             cudensitymatDistributedRequest_t request) {
+  ScopedTraceWithContext(__FUNCTION__);
+  auto cudaqComm = convertMpiCommunicator(comm);
+  return getMpiPluginInterface()->SendAsync(&cudaqComm, buffer, count,
+                                            convertCudaToMpiDataType(datatype),
+                                            destination, tag, request);
+}
+
+int cudensitymatMpiRecv(const cudensitymatDistributedCommunicator_t *comm,
+                        void *buffer, int32_t count, cudaDataType_t datatype,
+                        int32_t source, int32_t tag) {
+  ScopedTraceWithContext(__FUNCTION__);
+  auto cudaqComm = convertMpiCommunicator(comm);
+  return getMpiPluginInterface()->Recv(&cudaqComm, buffer, count,
+                                       convertCudaToMpiDataType(datatype),
+                                       source, tag);
+}
+
+int cudensitymatMpiRecvAsync(const cudensitymatDistributedCommunicator_t *comm,
+                             void *buffer, int32_t count,
+                             cudaDataType_t datatype, int32_t source,
+                             int32_t tag,
+                             cudensitymatDistributedRequest_t request) {
+  ScopedTraceWithContext(__FUNCTION__);
+  auto cudaqComm = convertMpiCommunicator(comm);
+  return getMpiPluginInterface()->RecvAsync(&cudaqComm, buffer, count,
+                                            convertCudaToMpiDataType(datatype),
+                                            source, tag, request);
+}
+
+int cudensitymatMpiBcast(const cudensitymatDistributedCommunicator_t *comm,
+                         void *buffer, int32_t count, cudaDataType_t datatype,
+                         int32_t root) {
+  ScopedTraceWithContext(__FUNCTION__);
+  auto cudaqComm = convertMpiCommunicator(comm);
+  return getMpiPluginInterface()->Bcast(
+      &cudaqComm, buffer, count, convertCudaToMpiDataType(datatype), root);
+}
+
+int cudensitymatMpiAllreduce(const cudensitymatDistributedCommunicator_t *comm,
+                             const void *bufferIn, void *bufferOut,
+                             int32_t count, cudaDataType_t datatype) {
+  ScopedTraceWithContext(__FUNCTION__);
+  // cudensitymat expects MPI_SUM in this API
+  auto cudaqComm = convertMpiCommunicator(comm);
+  return getMpiPluginInterface()->Allreduce(
+      &cudaqComm, bufferIn, bufferOut, count,
+      convertCudaToMpiDataType(datatype), SUM);
+}
+
+int cudensitymatMpiAllreduceInPlace(
+    const cudensitymatDistributedCommunicator_t *comm, void *buffer,
+    int32_t count, cudaDataType_t datatype) {
+  ScopedTraceWithContext(__FUNCTION__);
+  auto cudaqComm = convertMpiCommunicator(comm);
+  // cudensitymat expects MPI_SUM in this API
+  return getMpiPluginInterface()->AllreduceInPlace(
+      &cudaqComm, buffer, count, convertCudaToMpiDataType(datatype), SUM);
+}
+
+int cudensitymatMpiAllreduceInPlaceMin(
+    const cudensitymatDistributedCommunicator_t *comm, void *buffer,
+    int32_t count, cudaDataType_t datatype) {
+  ScopedTraceWithContext(__FUNCTION__);
+  auto cudaqComm = convertMpiCommunicator(comm);
+  // cudensitymat expects MPI_SUM in this API
+  return getMpiPluginInterface()->AllreduceInPlace(
+      &cudaqComm, buffer, count, convertCudaToMpiDataType(datatype), MIN);
+}
+
+int cudensitymatMpiAllreduceDoubleIntMinloc(
+    const cudensitymatDistributedCommunicator_t *comm, const void *bufferIn,
+    void *bufferOut) {
+  ScopedTraceWithContext(__FUNCTION__);
+  auto cudaqComm = convertMpiCommunicator(comm);
+  return getMpiPluginInterface()->Allreduce(&cudaqComm, bufferIn, bufferOut, 1,
+                                            FLOAT_64, MIN_LOC);
+}
+
+int cudensitymatMpiAllgather(const cudensitymatDistributedCommunicator_t *comm,
+                             const void *bufferIn, void *bufferOut,
+                             int32_t count, cudaDataType_t datatype) {
+  ScopedTraceWithContext(__FUNCTION__);
+  auto cudaqComm = convertMpiCommunicator(comm);
+  return getMpiPluginInterface()->Allgather(&cudaqComm, bufferIn, bufferOut,
+                                            count,
+                                            convertCudaToMpiDataType(datatype));
+}
+
+cudensitymatDistributedInterface_t cudensitymatCommInterface = {
+    CUDENSITYMAT_DISTRIBUTED_INTERFACE_VERSION,
+    cudensitymatMpiCommSize,
+    cudensitymatMpiCommSizeShared,
+    cudensitymatMpiCommRank,
+    cudensitymatMpiBarrier,
+    cudensitymatMpiCreateRequest,
+    cudensitymatMpiDestroyRequest,
+    cudensitymatMpiWaitRequest,
+    cudensitymatMpiTestRequest,
+    cudensitymatMpiSend,
+    cudensitymatMpiSendAsync,
+    cudensitymatMpiRecv,
+    cudensitymatMpiRecvAsync,
+    cudensitymatMpiBcast,
+    cudensitymatMpiAllreduce,
+    cudensitymatMpiAllreduceInPlace,
+    cudensitymatMpiAllreduceInPlaceMin,
+    cudensitymatMpiAllreduceDoubleIntMinloc,
+    cudensitymatMpiAllgather};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/unittests/mpi/mpi_tester.cpp b/unittests/mpi/mpi_tester.cpp
index 826ff59d4d..e097df62c0 100644
--- a/unittests/mpi/mpi_tester.cpp
+++ b/unittests/mpi/mpi_tester.cpp
@@ -166,10 +166,10 @@ TEST(MPITester, checkSendAndRecv) {
   cudaqDistributedCommunicator_t *comm = mpiPlugin->getComm();
   EXPECT_TRUE(comm != nullptr);
   EXPECT_EQ(mpiInterface->RecvAsync(comm, recvBuffer.data(), nElems, FLOAT_64,
-                                    recvRank, 0),
+                                    recvRank, 0, nullptr),
             0);
   EXPECT_EQ(mpiInterface->SendAsync(comm, sendBuffer.data(), nElems, FLOAT_64,
-                                    sendRank, 0),
+                                    sendRank, 0, nullptr),
             0);
   EXPECT_EQ(mpiInterface->Synchronize(comm), 0);
   for (std::size_t i = 0; i < refBuffer.size(); ++i) {