[WIP] Enable multi-GPU for dynamics #2509

Draft · wants to merge 21 commits into main
2 changes: 2 additions & 0 deletions docs/sphinx/api/languages/python_api.rst
@@ -157,6 +157,8 @@ Data Types
.. autoclass:: cudaq.operator.cudm_state.CuDensityMatState
    :members:

.. autoclass:: cudaq.operator.helpers.InitialState

.. autofunction:: cudaq.operator.cudm_state.to_cupy_array

.. autoclass:: cudaq::SampleResult
71 changes: 71 additions & 0 deletions docs/sphinx/examples/python/dynamics/mgmn/initial_state.py
@@ -0,0 +1,71 @@
import cudaq
from cudaq import operators, spin, Schedule, RungeKuttaIntegrator

import numpy as np
import matplotlib.pyplot as plt
import os

# On a system with multiple GPUs, `mpiexec` can be used as follows:
# `mpiexec -np <N> python3 initial_state.py`
cudaq.mpi.initialize()

# Set the target to our dynamics simulator
cudaq.set_target("dynamics")

# Large number of spins
N = 20
dimensions = {}
for i in range(N):
    dimensions[i] = 2

# Observable is the average magnetization operator
avg_magnetization_op = operators.zero()
for i in range(N):
    avg_magnetization_op += (spin.z(i) / N)

# Arbitrary coupling constant
g = 1.0
# Construct the Hamiltonian
H = operators.zero()
for i in range(N):
    H += 2 * np.pi * spin.x(i)
    H += 2 * np.pi * spin.y(i)
for i in range(N - 1):
    H += 2 * np.pi * g * spin.x(i) * spin.x(i + 1)
    H += 2 * np.pi * g * spin.y(i) * spin.z(i + 1)

steps = np.linspace(0.0, 1, 200)
schedule = Schedule(steps, ["time"])

# Initial state (expressed as an enum)
psi0 = cudaq.operator.InitialState.ZERO
# This can also be used to initialize a uniformly-distributed wave-function instead.
# `psi0 = cudaq.operator.InitialState.UNIFORM`

# Run the simulation
evolution_result = cudaq.evolve(H,
                                dimensions,
                                schedule,
                                psi0,
                                observables=[avg_magnetization_op],
                                collapse_operators=[],
                                store_intermediate_results=True,
                                integrator=RungeKuttaIntegrator())

exp_val = [
    exp_vals[0].expectation()
    for exp_vals in evolution_result.expectation_values()
]

if cudaq.mpi.rank() == 0:
    # Plot the results
    fig = plt.figure(figsize=(12, 6))
    plt.plot(steps, exp_val)
    plt.ylabel("Average Magnetization")
    plt.xlabel("Time")
    abspath = os.path.abspath(__file__)
    dname = os.path.dirname(abspath)
    os.chdir(dname)
    fig.savefig("spin_model.png", dpi=fig.dpi)

cudaq.mpi.finalize()
94 changes: 94 additions & 0 deletions docs/sphinx/examples/python/dynamics/mgmn/multi_gpu.py
@@ -0,0 +1,94 @@
import cudaq
from cudaq import operators, spin, Schedule, RungeKuttaIntegrator

import numpy as np
import cupy as cp
import matplotlib.pyplot as plt
import os

# On a system with multiple GPUs, `mpiexec` can be used as follows:
# `mpiexec -np <N> python3 multi_gpu.py `
cudaq.mpi.initialize()

# Set the target to our dynamics simulator
cudaq.set_target("dynamics")

# In this example, we solve the Quantum Heisenberg model (https://en.wikipedia.org/wiki/Quantum_Heisenberg_model),
# which exhibits the so-called quantum quench effect.
# e.g., see `Quantum quenches in the anisotropic spin-1/2 Heisenberg chain: different approaches to many-body dynamics far from equilibrium`
# (New J. Phys. 12 055017)
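# The Hamiltonian built below is the anisotropic Heisenberg chain
#   H = sum_i (Jx * X_i X_{i+1} + Jy * Y_i Y_{i+1} + Jz * Z_i Z_{i+1}),
# with Jx = Jy = 1 and anisotropy Jz = g, and the observable is the
# staggered magnetization M_st = (1/N) * sum_i (-1)^i Z_i.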
# Large number of spins
N = 21
dimensions = {}
for i in range(N):
    dimensions[i] = 2

# Initial state: alternating spin up and down
spin_state = ''
for i in range(N):
    spin_state += str(int(i % 2))

# Observable is the staggered magnetization operator
staggered_magnetization_op = operators.zero()
for i in range(N):
    if i % 2 == 0:
        staggered_magnetization_op += spin.z(i)
    else:
        staggered_magnetization_op -= spin.z(i)

staggered_magnetization_op /= N

observe_results = []
for g in [0.25, 4.0]:
    # Heisenberg model spin coupling strength
    Jx = 1.0
    Jy = 1.0
    Jz = g

    # Construct the Hamiltonian
    H = operators.zero()

    for i in range(N - 1):
        H += Jx * spin.x(i) * spin.x(i + 1)
        H += Jy * spin.y(i) * spin.y(i + 1)
        H += Jz * spin.z(i) * spin.z(i + 1)

    steps = np.linspace(0.0, 1, 100)
    schedule = Schedule(steps, ["time"])

    # Prepare the initial state vector
    psi0_ = cp.zeros(2**N, dtype=cp.complex128)
    psi0_[int(spin_state, 2)] = 1.0
    psi0 = cudaq.State.from_data(psi0_)

    # Run the simulation
    evolution_result = cudaq.evolve(H,
                                    dimensions,
                                    schedule,
                                    psi0,
                                    observables=[staggered_magnetization_op],
                                    collapse_operators=[],
                                    store_intermediate_results=True,
                                    integrator=RungeKuttaIntegrator())

    exp_val = [
        exp_vals[0].expectation()
        for exp_vals in evolution_result.expectation_values()
    ]

    observe_results.append((g, exp_val))

if cudaq.mpi.rank() == 0:
    # Plot the results
    fig = plt.figure(figsize=(12, 6))
    for g, exp_val in observe_results:
        plt.plot(steps, exp_val, label=f'$ g = {g}$')
    plt.legend(fontsize=16)
    plt.ylabel("Staggered Magnetization")
    plt.xlabel("Time")
    abspath = os.path.abspath(__file__)
    dname = os.path.dirname(abspath)
    os.chdir(dname)
    fig.savefig("heisenberg_model.png", dpi=fig.dpi)

cudaq.mpi.finalize()
41 changes: 41 additions & 0 deletions docs/sphinx/snippets/python/using/backends/dynamics.py
@@ -172,3 +172,44 @@ def compute_value(param_name, step_idx):
time_dependence = parameter_values(numpy.linspace(0, 1, 100))
cudaq.evolve(system_operator, system_dimensions, time_dependence, initial_state)
#[End Schedule2]

import cudaq
from cudaq import operators, spin, Schedule, RungeKuttaIntegrator
import numpy as np

N = 4
dimensions = {}
for i in range(N):
    dimensions[i] = 2
g = 1.0
H = operators.zero()
for i in range(N):
    H += 2 * np.pi * spin.x(i)
    H += 2 * np.pi * spin.y(i)
for i in range(N - 1):
    H += 2 * np.pi * g * spin.x(i) * spin.x(i + 1)
    H += 2 * np.pi * g * spin.y(i) * spin.z(i + 1)

steps = np.linspace(0.0, 1, 200)
schedule = Schedule(steps, ["time"])

#[Begin MPI]
cudaq.mpi.initialize()

# Set the target to our dynamics simulator
cudaq.set_target("dynamics")

# Initial state (expressed as an enum)
psi0 = cudaq.operator.InitialState.ZERO

# Run the simulation
evolution_result = cudaq.evolve(H,
                                dimensions,
                                schedule,
                                psi0,
                                observables=[],
                                collapse_operators=[],
                                store_intermediate_results=True,
                                integrator=RungeKuttaIntegrator())

cudaq.mpi.finalize()
#[End MPI]
40 changes: 39 additions & 1 deletion docs/sphinx/using/backends/dynamics.rst
@@ -272,4 +272,42 @@ backend target.
If the output is a '`None`' string, it indicates that your Torch installation does not support CUDA.
In this case, you need to install a CUDA-enabled Torch package via other mechanisms, e.g., building Torch from source or
using their Docker images.


Multi-GPU Multi-Node Execution
+++++++++++++++++++++++++++++++

.. _cudensitymat_mgmn:

The CUDA-Q ``dynamics`` target supports parallel execution on multiple GPUs.
To enable parallel execution, the application must initialize MPI as follows.


.. tab:: Python

    .. literalinclude:: ../../snippets/python/using/backends/dynamics.py
        :language: python
        :start-after: [Begin MPI]
        :end-before: [End MPI]

    .. code:: bash

        mpiexec -np <N> python3 program.py

By initializing the MPI execution environment (via `cudaq.mpi.initialize()`) in the application code and
launching the application with an MPI launcher, the multi-node multi-GPU feature of the ``dynamics`` target is activated.
Specifically, the target detects the number of MPI processes (one GPU per process) and distributes the computation across all available GPUs.
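For example, a minimal rank-aware program skeleton (a sketch assuming the `cudaq.mpi` helpers used elsewhere in this PR, including `cudaq.mpi.num_ranks()`):

    import cudaq

    cudaq.mpi.initialize()
    cudaq.set_target("dynamics")

    # Each MPI process drives one GPU; the distributed state is managed by the target.
    print(f"rank {cudaq.mpi.rank()} of {cudaq.mpi.num_ranks()}")

    # ... build the Hamiltonian, dimensions, and schedule, then call cudaq.evolve(...) ...

    # Restrict I/O (plotting, saving results) to a single rank.
    if cudaq.mpi.rank() == 0:
        print("simulation finished")

    cudaq.mpi.finalize()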


.. note::
    The number of MPI processes must be a power of 2, one GPU per process.

.. note::
    Not all integrators are capable of handling distributed state. Errors will be raised if parallel execution is activated
    but the selected integrator does not support distributed state.
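For illustration, a minimal sketch of this guard, mirroring the check added in `cudm_solver.py` below (`support_distributed_state()` and the `cudaq.mpi` helpers are assumed to be available as shown in this PR):

    import cudaq
    from cudaq import RungeKuttaIntegrator

    integrator = RungeKuttaIntegrator()
    # A distributed run (more than one MPI rank) requires integrator support.
    if cudaq.mpi.is_initialized() and cudaq.mpi.num_ranks() > 1:
        if not integrator.support_distributed_state():
            raise ValueError(
                f"Integrator {type(integrator).__name__} does not support distributed state."
            )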

.. warning::
    As of cuQuantum version 24.11, there are a couple of `known limitations <https://docs.nvidia.com/cuda/cuquantum/24.11.0/cudensitymat/index.html>`__ for parallel execution:

    - Computing the expectation value of a mixed quantum state is not supported. Thus, `collapse_operators` are not supported if expectation calculation is required.

    - Some combinations of quantum states and quantum many-body operators are not supported. Errors will be raised in those cases.
2 changes: 1 addition & 1 deletion python/cudaq/operator/__init__.py
@@ -9,5 +9,5 @@
from .definitions import operators, spin
from .evolution import evolve, evolve_async
from .expressions import Operator, OperatorSum, ProductOperator, ElementaryOperator, ScalarOperator, RydbergHamiltonian
from .helpers import NumericType
from .helpers import NumericType, InitialState
from .schedule import Schedule
20 changes: 17 additions & 3 deletions python/cudaq/operator/cudm_solver.py
@@ -16,6 +16,7 @@
from ..mlir._mlir_libs._quakeDialects import cudaq_runtime
from .cudm_helpers import cudm, CudmStateType
from .cudm_state import CuDensityMatState, as_cudm_state
from .helpers import InitialState, InitialStateArgT
from .integrator import BaseIntegrator
from .integrators.builtin_integrators import RungeKuttaIntegrator, cuDensityMatTimeStepper
import cupy
@@ -28,7 +29,7 @@ def evolve_dynamics(
        hamiltonian: Operator,
        dimensions: Mapping[int, int],
        schedule: Schedule,
        initial_state: cudaq_runtime.State,
        initial_state: InitialStateArgT,
        collapse_operators: Sequence[Operator] = [],
        observables: Sequence[Operator] = [],
        store_intermediate_results=False,
@@ -43,8 +44,21 @@
    schedule.reset()
    hilbert_space_dims = tuple(dimensions[d] for d in range(len(dimensions)))

    with ScopeTimer("evolve.as_cudm_state") as timer:
        initial_state = as_cudm_state(initial_state)
    # Check that the integrator can support distributed state if this is a distributed simulation.
    if cudaq_runtime.mpi.is_initialized() and cudaq_runtime.mpi.num_ranks(
    ) > 1 and integrator is not None and not integrator.support_distributed_state(
    ):
        raise ValueError(
            f"Integrator {type(integrator).__name__} does not support distributed state."
        )

    if isinstance(initial_state, InitialState):
        has_collapse_operators = len(collapse_operators) > 0
        initial_state = CuDensityMatState.create_initial_state(
            initial_state, hilbert_space_dims, has_collapse_operators)
    else:
        with ScopeTimer("evolve.as_cudm_state") as timer:
            initial_state = as_cudm_state(initial_state)

    if not isinstance(initial_state, CuDensityMatState):
        raise ValueError("Unknown type")