[WIP] Enable multi-GPU for dynamics #2509

Draft · wants to merge 21 commits into main
2 changes: 2 additions & 0 deletions docs/sphinx/api/languages/python_api.rst
@@ -157,6 +157,8 @@ Data Types
.. autoclass:: cudaq.operator.cudm_state.CuDensityMatState
    :members:

.. autoclass:: cudaq.operator.helpers.InitialState

.. autofunction:: cudaq.operator.cudm_state.to_cupy_array

.. autoclass:: cudaq::SampleResult
71 changes: 71 additions & 0 deletions docs/sphinx/examples/python/dynamics/mgmn/initial_state.py
@@ -0,0 +1,71 @@
import cudaq
from cudaq import operators, spin, Schedule, RungeKuttaIntegrator

import numpy as np
import matplotlib.pyplot as plt
import os

# On a system with multiple GPUs, `mpiexec` can be used as follows:
# `mpiexec -np <N> python3 initial_state.py`
cudaq.mpi.initialize()

# Set the target to our dynamics simulator
cudaq.set_target("dynamics")

# Large number of spins
N = 20
dimensions = {}
for i in range(N):
    dimensions[i] = 2

# Observable is the average magnetization operator
avg_magnetization_op = operators.zero()
for i in range(N):
    avg_magnetization_op += (spin.z(i) / N)

# Arbitrary coupling constant
g = 1.0
# Construct the Hamiltonian
H = operators.zero()
for i in range(N):
    H += 2 * np.pi * spin.x(i)
    H += 2 * np.pi * spin.y(i)
for i in range(N - 1):
    H += 2 * np.pi * g * spin.x(i) * spin.x(i + 1)
    H += 2 * np.pi * g * spin.y(i) * spin.z(i + 1)

steps = np.linspace(0.0, 1, 200)
schedule = Schedule(steps, ["time"])

# Initial state (expressed as an enum)
psi0 = cudaq.operator.InitialState.ZERO
# This can also be used to initialize a uniformly-distributed wave-function instead.
# `psi0 = cudaq.operator.InitialState.UNIFORM`

# Run the simulation
evolution_result = cudaq.evolve(H,
                                dimensions,
                                schedule,
                                psi0,
                                observables=[avg_magnetization_op],
                                collapse_operators=[],
                                store_intermediate_results=True,
                                integrator=RungeKuttaIntegrator())

exp_val = [
    exp_vals[0].expectation()
    for exp_vals in evolution_result.expectation_values()
]

if cudaq.mpi.rank() == 0:
    # Plot the results
    fig = plt.figure(figsize=(12, 6))
    plt.plot(steps, exp_val)
    plt.ylabel("Average Magnetization")
    plt.xlabel("Time")
    abspath = os.path.abspath(__file__)
    dname = os.path.dirname(abspath)
    os.chdir(dname)
    fig.savefig("spin_model.png", dpi=fig.dpi)

cudaq.mpi.finalize()
94 changes: 94 additions & 0 deletions docs/sphinx/examples/python/dynamics/mgmn/multi_gpu.py
@@ -0,0 +1,94 @@
import cudaq
from cudaq import operators, spin, Schedule, RungeKuttaIntegrator

import numpy as np
import cupy as cp
import matplotlib.pyplot as plt
import os

# On a system with multiple GPUs, `mpiexec` can be used as follows:
# `mpiexec -np <N> python3 multi_gpu.py `
cudaq.mpi.initialize()

# Set the target to our dynamics simulator
cudaq.set_target("dynamics")

# In this example, we solve the Quantum Heisenberg model (https://en.wikipedia.org/wiki/Quantum_Heisenberg_model),
# which exhibits the so-called quantum quench effect.
# e.g., see `Quantum quenches in the anisotropic spin-1/2 Heisenberg chain: different approaches to many-body dynamics far from equilibrium`
# (New J. Phys. 12 055017)
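# The Hamiltonian built below is the anisotropic Heisenberg chain
#   H = sum_i (Jx * X_i X_{i+1} + Jy * Y_i Y_{i+1} + Jz * Z_i Z_{i+1}),
# with Jx = Jy = 1 and anisotropy Jz = g, and the observable is the
# staggered magnetization M_st = (1/N) * sum_i (-1)^i Z_i.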
# Large number of spins
N = 21
dimensions = {}
for i in range(N):
    dimensions[i] = 2

# Initial state: alternating spin up and down
spin_state = ''
for i in range(N):
    spin_state += str(int(i % 2))

# Observable is the staggered magnetization operator
staggered_magnetization_op = operators.zero()
for i in range(N):
    if i % 2 == 0:
        staggered_magnetization_op += spin.z(i)
    else:
        staggered_magnetization_op -= spin.z(i)

staggered_magnetization_op /= N

observe_results = []
for g in [0.25, 4.0]:
    # Heisenberg model spin coupling strength
    Jx = 1.0
    Jy = 1.0
    Jz = g

    # Construct the Hamiltonian
    H = operators.zero()

    for i in range(N - 1):
        H += Jx * spin.x(i) * spin.x(i + 1)
        H += Jy * spin.y(i) * spin.y(i + 1)
        H += Jz * spin.z(i) * spin.z(i + 1)

    steps = np.linspace(0.0, 1, 100)
    schedule = Schedule(steps, ["time"])

    # Prepare the initial state vector
    psi0_ = cp.zeros(2**N, dtype=cp.complex128)
    psi0_[int(spin_state, 2)] = 1.0
    psi0 = cudaq.State.from_data(psi0_)

    # Run the simulation
    evolution_result = cudaq.evolve(H,
                                    dimensions,
                                    schedule,
                                    psi0,
                                    observables=[staggered_magnetization_op],
                                    collapse_operators=[],
                                    store_intermediate_results=True,
                                    integrator=RungeKuttaIntegrator())

    exp_val = [
        exp_vals[0].expectation()
        for exp_vals in evolution_result.expectation_values()
    ]

    observe_results.append((g, exp_val))

if cudaq.mpi.rank() == 0:
    # Plot the results
    fig = plt.figure(figsize=(12, 6))
    for g, exp_val in observe_results:
        plt.plot(steps, exp_val, label=f'$ g = {g}$')
    plt.legend(fontsize=16)
    plt.ylabel("Staggered Magnetization")
    plt.xlabel("Time")
    abspath = os.path.abspath(__file__)
    dname = os.path.dirname(abspath)
    os.chdir(dname)
    fig.savefig("heisenberg_model.png", dpi=fig.dpi)

cudaq.mpi.finalize()
41 changes: 41 additions & 0 deletions docs/sphinx/snippets/python/using/backends/dynamics.py
@@ -172,3 +172,44 @@ def compute_value(param_name, step_idx):
time_dependence = parameter_values(numpy.linspace(0, 1, 100))
cudaq.evolve(system_operator, system_dimensions, time_dependence, initial_state)
#[End Schedule2]

import cudaq
from cudaq import operators, spin, Schedule, RungeKuttaIntegrator
import numpy as np

N = 4
dimensions = {}
for i in range(N):
    dimensions[i] = 2
g = 1.0
H = operators.zero()
for i in range(N):
    H += 2 * np.pi * spin.x(i)
    H += 2 * np.pi * spin.y(i)
for i in range(N - 1):
    H += 2 * np.pi * g * spin.x(i) * spin.x(i + 1)
    H += 2 * np.pi * g * spin.y(i) * spin.z(i + 1)

steps = np.linspace(0.0, 1, 200)
schedule = Schedule(steps, ["time"])

#[Begin MPI]
cudaq.mpi.initialize()

# Set the target to our dynamics simulator
cudaq.set_target("dynamics")

# Initial state (expressed as an enum)
psi0 = cudaq.operator.InitialState.ZERO

# Run the simulation
evolution_result = cudaq.evolve(H,
                                dimensions,
                                schedule,
                                psi0,
                                observables=[],
                                collapse_operators=[],
                                store_intermediate_results=True,
                                integrator=RungeKuttaIntegrator())

cudaq.mpi.finalize()
#[End MPI]
40 changes: 39 additions & 1 deletion docs/sphinx/using/backends/dynamics.rst
@@ -272,4 +272,42 @@ backend target.
If the output is a '`None`' string, it indicates that your Torch installation does not support CUDA.
In this case, you need to install a CUDA-enabled Torch package via other mechanisms, e.g., building Torch from source or
using their Docker images.


Multi-GPU Multi-Node Execution
+++++++++++++++++++++++++++++++

.. _cudensitymat_mgmn:

The CUDA-Q ``dynamics`` target supports parallel execution on multiple GPUs.
To enable parallel execution, the application must initialize MPI as follows.


.. tab:: Python

    .. literalinclude:: ../../snippets/python/using/backends/dynamics.py
        :language: python
        :start-after: [Begin MPI]
        :end-before: [End MPI]

    .. code:: bash

        mpiexec -np <N> python3 program.py

By initializing the MPI execution environment (via `cudaq.mpi.initialize()`) in the application code and
launching the application with an MPI launcher, the multi-node multi-GPU feature of the ``dynamics`` target is activated.
Specifically, the target detects the number of MPI processes (one GPU per process) and distributes the computation across all available GPUs.
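For example, a minimal rank-aware program skeleton (a sketch assuming the `cudaq.mpi` helpers used elsewhere in this PR, including `cudaq.mpi.num_ranks()`):

    import cudaq

    cudaq.mpi.initialize()
    cudaq.set_target("dynamics")

    # Each MPI process drives one GPU; the distributed state is managed by the target.
    print(f"rank {cudaq.mpi.rank()} of {cudaq.mpi.num_ranks()}")

    # ... build the Hamiltonian, dimensions, and schedule, then call cudaq.evolve(...) ...

    # Restrict I/O (plotting, saving results) to a single rank.
    if cudaq.mpi.rank() == 0:
        print("simulation finished")

    cudaq.mpi.finalize()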


.. note::
    The number of MPI processes must be a power of 2, one GPU per process.

.. note::
    Not all integrators are capable of handling distributed state. Errors will be raised if parallel execution is activated
    but the selected integrator does not support distributed state.
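For illustration, a minimal sketch of this guard, mirroring the check added in `cudm_solver.py` below (`support_distributed_state()` and the `cudaq.mpi` helpers are assumed to be available as shown in this PR):

    import cudaq
    from cudaq import RungeKuttaIntegrator

    integrator = RungeKuttaIntegrator()
    # A distributed run (more than one MPI rank) requires integrator support.
    if cudaq.mpi.is_initialized() and cudaq.mpi.num_ranks() > 1:
        if not integrator.support_distributed_state():
            raise ValueError(
                f"Integrator {type(integrator).__name__} does not support distributed state."
            )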

.. warning::
    As of cuQuantum version 24.11, there are a couple of `known limitations <https://docs.nvidia.com/cuda/cuquantum/24.11.0/cudensitymat/index.html>`__ for parallel execution:

    - Computing the expectation value of a mixed quantum state is not supported. Thus, `collapse_operators` are not supported if expectation calculation is required.

    - Some combinations of quantum states and quantum many-body operators are not supported. Errors will be raised in those cases.
2 changes: 1 addition & 1 deletion python/cudaq/operator/__init__.py
@@ -9,5 +9,5 @@
from .definitions import operators, spin
from .evolution import evolve, evolve_async
from .expressions import Operator, OperatorSum, ProductOperator, ElementaryOperator, ScalarOperator, RydbergHamiltonian
from .helpers import NumericType
from .helpers import NumericType, InitialState
from .schedule import Schedule
20 changes: 17 additions & 3 deletions python/cudaq/operator/cudm_solver.py
@@ -16,6 +16,7 @@
from ..mlir._mlir_libs._quakeDialects import cudaq_runtime
from .cudm_helpers import cudm, CudmStateType
from .cudm_state import CuDensityMatState, as_cudm_state
from .helpers import InitialState, InitialStateArgT
from .integrator import BaseIntegrator
from .integrators.builtin_integrators import RungeKuttaIntegrator, cuDensityMatTimeStepper
import cupy
@@ -28,7 +29,7 @@ def evolve_dynamics(
        hamiltonian: Operator,
        dimensions: Mapping[int, int],
        schedule: Schedule,
        initial_state: cudaq_runtime.State,
        initial_state: InitialStateArgT,
        collapse_operators: Sequence[Operator] = [],
        observables: Sequence[Operator] = [],
        store_intermediate_results=False,
@@ -43,8 +44,21 @@
    schedule.reset()
    hilbert_space_dims = tuple(dimensions[d] for d in range(len(dimensions)))

    with ScopeTimer("evolve.as_cudm_state") as timer:
        initial_state = as_cudm_state(initial_state)
    # Check that the integrator can support distributed state if this is a distributed simulation.
    if cudaq_runtime.mpi.is_initialized() and cudaq_runtime.mpi.num_ranks(
    ) > 1 and integrator is not None and not integrator.support_distributed_state(
    ):
        raise ValueError(
            f"Integrator {type(integrator).__name__} does not support distributed state."
        )

    if isinstance(initial_state, InitialState):
        has_collapse_operators = len(collapse_operators) > 0
        initial_state = CuDensityMatState.create_initial_state(
            initial_state, hilbert_space_dims, has_collapse_operators)
    else:
        with ScopeTimer("evolve.as_cudm_state") as timer:
            initial_state = as_cudm_state(initial_state)

    if not isinstance(initial_state, CuDensityMatState):
        raise ValueError("Unknown type")