diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index ab53d3e8e6..2999d78cdf 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -2,6 +2,9 @@ ### New features since last release +* Add `mid-circuit measurements` support to `lightning.gpu`'s single-GPU backend. + [(#931)](https://github.com/PennyLaneAI/pennylane-lightning/pull/931) + * Add Matrix Product Operator (MPO) for all gates support to `lightning.tensor`. Note current C++ implementation only works for MPO sites data provided by users. [(#859)](https://github.com/PennyLaneAI/pennylane-lightning/pull/859) diff --git a/mpitests/conftest.py b/mpitests/conftest.py index 552cf9f330..11be17824a 100644 --- a/mpitests/conftest.py +++ b/mpitests/conftest.py @@ -15,9 +15,10 @@ Pytest configuration file for PennyLane-Lightning-GPU test suite. """ # pylint: disable=missing-function-docstring,wrong-import-order,unused-import - import itertools import os +from functools import reduce +from typing import Sequence import pennylane as qml import pytest @@ -125,3 +126,79 @@ def _device(wires): ) return _device + + +####################################################################### + + +def validate_counts(shots, results1, results2): + """Compares two counts. + If the results are ``Sequence``s, loop over entries. + Fails if a key of ``results1`` is not found in ``results2``. + Passes if counts are too low, chosen as ``100``. + Otherwise, fails if counts differ by more than ``20`` plus 20 percent. + """ + if isinstance(results1, Sequence): + assert isinstance(results2, Sequence) + assert len(results1) == len(results2) + for r1, r2 in zip(results1, results2): + validate_counts(shots, r1, r2) + return + for key1, val1 in results1.items(): + val2 = results2[key1] + if abs(val1 + val2) > 100: + assert np.allclose(val1, val2, rtol=20, atol=0.2) + + +def validate_samples(shots, results1, results2): + """Compares two samples. + If the results are ``Sequence``s, loop over entries. + Fails if the results do not have the same shape, within ``20`` entries plus 20 percent. + This is to handle cases when post-selection yields variable shapes. + Otherwise, fails if the sums of samples differ by more than ``20`` plus 20 percent. + """ + if isinstance(shots, Sequence): + assert isinstance(results1, Sequence) + assert isinstance(results2, Sequence) + assert len(results1) == len(results2) + for s, r1, r2 in zip(shots, results1, results2): + validate_samples(s, r1, r2) + else: + sh1, sh2 = results1.shape[0], results2.shape[0] + assert np.allclose(sh1, sh2, rtol=20, atol=0.2) + assert results1.ndim == results2.ndim + if results2.ndim > 1: + assert results1.shape[1] == results2.shape[1] + np.allclose(np.sum(results1), np.sum(results2), rtol=20, atol=0.2) + + +def validate_others(shots, results1, results2): + """Compares two expval, probs or var. + If the results are ``Sequence``s, validate the average of items. + If ``shots is None``, validate using ``np.allclose``'s default parameters. + Otherwise, fails if the results do not match within ``0.01`` plus 20 percent. + """ + if isinstance(results1, Sequence): + assert isinstance(results2, Sequence) + assert len(results1) == len(results2) + results1 = reduce(lambda x, y: x + y, results1) / len(results1) + results2 = reduce(lambda x, y: x + y, results2) / len(results2) + validate_others(shots, results1, results2) + return + if shots is None: + assert np.allclose(results1, results2) + return + assert np.allclose(results1, results2, atol=0.01, rtol=0.2) + + +def validate_measurements(func, shots, results1, results2): + """Calls the correct validation function based on measurement type.""" + if func is qml.counts: + validate_counts(shots, results1, results2) + return + + if func is qml.sample: + validate_samples(shots, results1, results2) + return + + validate_others(shots, results1, results2) diff --git a/mpitests/test_native_mcm.py b/mpitests/test_native_mcm.py new file mode 100644 index 0000000000..eaafe0ff03 --- /dev/null +++ b/mpitests/test_native_mcm.py @@ -0,0 +1,348 @@ +# Copyright 2024 Xanadu Quantum Technologies Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for default qubit preprocessing.""" +from functools import reduce +from typing import Sequence + +import numpy as np +import pennylane as qml +import pytest +from conftest import LightningDevice, device_name, validate_measurements +from flaky import flaky +from mpi4py import MPI + +if not LightningDevice._CPP_BINARY_AVAILABLE: # pylint: disable=protected-access + pytest.skip("No binary module found. Skipping.", allow_module_level=True) + + +def get_device(wires, **kwargs): + kwargs.setdefault("shots", None) + return qml.device(device_name, wires=wires, mpi=True, **kwargs) + + +def test_all_invalid_shots_circuit(): + """Test all invalid cases: expval, probs, var measurements.""" + comm = MPI.COMM_WORLD + dev = qml.device(device_name, wires=2) + dq = qml.device("default.qubit", wires=2) + + def circuit_op(): + m = qml.measure(0, postselect=1) + qml.cond(m, qml.PauliX)(1) + return ( + qml.expval(op=qml.PauliZ(1)), + qml.probs(op=qml.PauliY(0) @ qml.PauliZ(1)), + qml.var(op=qml.PauliZ(1)), + ) + + comm.Barrier() + res1 = qml.QNode(circuit_op, dq)() + res2 = qml.QNode(circuit_op, dev)(shots=10) + for r1, r2 in zip(res1, res2): + if isinstance(r1, Sequence): + assert len(r1) == len(r2) + assert np.all(np.isnan(r1)) + assert np.all(np.isnan(r2)) + + def circuit_mcm(): + m = qml.measure(0, postselect=1) + qml.cond(m, qml.PauliX)(1) + return qml.expval(op=m), qml.probs(op=m), qml.var(op=m) + + res1 = qml.QNode(circuit_mcm, dq)() + res2 = qml.QNode(circuit_mcm, dev)(shots=10) + + comm.Barrier() + for r1, r2 in zip(res1, res2): + if isinstance(r1, Sequence): + assert len(r1) == len(r2) + assert np.all(np.isnan(r1)) + assert np.all(np.isnan(r2)) + + +def test_unsupported_measurement(): + """Test unsupported ``qml.classical_shadow`` measurement on ``lightning.gpu`` .""" + comm = MPI.COMM_WORLD + dev = qml.device(device_name, wires=2, mpi=True, shots=1000) + params = np.pi / 4 * np.ones(2) + + @qml.qnode(dev) + def func(x, y): + qml.RX(x, wires=0) + m0 = qml.measure(0) + qml.cond(m0, qml.RY)(y, wires=1) + return qml.classical_shadow(wires=0) + + comm.Barrier() + if device_name == "lightning.qubit": + with pytest.raises( + qml.DeviceError, + match=f"not accepted with finite shots on lightning.qubit", + ): + func(*params) + if device_name in ("lightning.kokkos", "lightning.gpu"): + with pytest.raises( + qml.DeviceError, + match=r"Measurement shadow\(wires=\[0\]\) not accepted with finite shots on " + + device_name, + ): + func(*params) + + +@pytest.mark.parametrize("mcm_method", ["deferred", "one-shot"]) +def test_qnode_mcm_method(mcm_method, mocker): + """Test that user specified qnode arg for mid-circuit measurements transform are used correctly""" + comm = MPI.COMM_WORLD + spy = ( + mocker.spy(qml.dynamic_one_shot, "_transform") + if mcm_method == "one-shot" + else mocker.spy(qml.defer_measurements, "_transform") + ) + other_spy = ( + mocker.spy(qml.defer_measurements, "_transform") + if mcm_method == "one-shot" + else mocker.spy(qml.dynamic_one_shot, "_transform") + ) + + shots = 10 + device = qml.device(device_name, wires=3, mpi=True, shots=shots) + comm.Barrier() + + @qml.qnode(device, mcm_method=mcm_method) + def f(x): + qml.RX(x, 0) + _ = qml.measure(0) + qml.CNOT([0, 1]) + return qml.sample(wires=[0, 1]) + + _ = f(np.pi / 8) + comm.Barrier() + + spy.assert_called_once() + other_spy.assert_not_called() + + +@pytest.mark.parametrize("postselect_mode", ["hw-like", "fill-shots"]) +def test_qnode_postselect_mode(postselect_mode): + """Test that user specified qnode arg for discarding invalid shots is used correctly""" + comm = MPI.COMM_WORLD + shots = 100 + device = qml.device(device_name, wires=3, mpi=True, shots=shots) + postselect = 1 + + @qml.qnode(device, postselect_mode=postselect_mode) + def f(x): + qml.RX(x, 0) + _ = qml.measure(0, postselect=postselect) + qml.CNOT([0, 1]) + return qml.sample(qml.Identity(1)) + + comm.Barrier() + # Using small-ish rotation angle ensures the number of valid shots will be less than the + # original number of shots. This helps avoid stochastic failures for the assertion below + res = f(np.pi / 2) + + comm.Barrier() + + if postselect_mode == "hw-like": + assert res.size < shots + else: + assert len(res) == shots + assert np.allclose(res, postselect) + + +# pylint: disable=unused-argument +def obs_tape(x, y, z, reset=False, postselect=None): + qml.RX(x, 0) + qml.RZ(np.pi / 4, 0) + m0 = qml.measure(0, reset=reset) + qml.cond(m0 == 0, qml.RX)(np.pi / 4, 0) + qml.cond(m0 == 0, qml.RZ)(np.pi / 4, 0) + qml.cond(m0 == 1, qml.RX)(-np.pi / 4, 0) + qml.cond(m0 == 1, qml.RZ)(-np.pi / 4, 0) + qml.RX(y, 1) + qml.RZ(np.pi / 4, 1) + m1 = qml.measure(1, postselect=postselect) + qml.cond(m1 == 0, qml.RX)(np.pi / 4, 1) + qml.cond(m1 == 0, qml.RZ)(np.pi / 4, 1) + qml.cond(m1 == 1, qml.RX)(-np.pi / 4, 1) + qml.cond(m1 == 1, qml.RZ)(-np.pi / 4, 1) + return m0, m1 + + +@flaky(max_runs=5) +@pytest.mark.parametrize("shots", [5000, [5000, 5001]]) +@pytest.mark.parametrize("postselect", [None, 0, 1]) +@pytest.mark.parametrize("measure_f", [qml.counts, qml.expval, qml.probs, qml.sample, qml.var]) +@pytest.mark.parametrize( + "meas_obj", + [qml.PauliZ(0), qml.PauliY(1), [0], [0, 1], [1, 0], "mcm", "composite_mcm", "mcm_list"], +) +def test_simple_dynamic_circuit(shots, measure_f, postselect, meas_obj): + """Tests that LightningQubit handles a simple dynamic circuit with the following measurements: + * qml.counts with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list + * qml.expval with obs (comp basis or not), MCM, f(MCM), MCM list + * qml.probs with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list + * qml.sample with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list + * qml.var with obs (comp basis or not), MCM, f(MCM), MCM list + The above combinations should work for finite shots, shot vectors and post-selecting of either the 0 or 1 branch. + """ + comm = MPI.COMM_WORLD + + if measure_f in (qml.expval, qml.var) and ( + isinstance(meas_obj, list) or meas_obj == "mcm_list" + ): + pytest.skip("Can't use wires/mcm lists with var or expval") + + dq = qml.device("default.qubit", shots=shots) + dev = get_device(wires=3, shots=shots) + params = [np.pi / 2.5, np.pi / 3, -np.pi / 3.5] + + def func(x, y, z): + m0, m1 = obs_tape(x, y, z, postselect=postselect) + mid_measure = ( + m0 if meas_obj == "mcm" else (0.5 * m0 if meas_obj == "composite_mcm" else [m0, m1]) + ) + measurement_key = "wires" if isinstance(meas_obj, list) else "op" + measurement_value = mid_measure if isinstance(meas_obj, str) else meas_obj + return measure_f(**{measurement_key: measurement_value}) + + results1 = qml.QNode(func, dev, mcm_method="one-shot")(*params) + results2 = qml.QNode(func, dq, mcm_method="deferred")(*params) + comm.Barrier() + + validate_measurements(measure_f, shots, results1, results2) + + +@pytest.mark.parametrize("postselect", [None, 0, 1]) +@pytest.mark.parametrize("reset", [False, True]) +def test_multiple_measurements_and_reset(postselect, reset): + """Tests that LightningQubit handles a circuit with a single mid-circuit measurement with reset + and a conditional gate. Multiple measurements of the mid-circuit measurement value are + performed. This function also tests `reset` parametrizing over the parameter.""" + comm = MPI.COMM_WORLD + shots = 5000 + dq = qml.device("default.qubit", shots=shots) + dev = get_device(wires=3, shots=shots) + params = [np.pi / 2.5, np.pi / 3, -np.pi / 3.5] + obs = qml.PauliY(1) + + def func(x, y, z): + mcms = obs_tape(x, y, z, reset=reset, postselect=postselect) + return ( + qml.counts(op=obs), + qml.expval(op=mcms[0]), + qml.probs(op=obs), + qml.sample(op=mcms[0]), + qml.var(op=obs), + ) + + results1 = qml.QNode(func, dev, mcm_method="one-shot")(*params) + results2 = qml.QNode(func, dq, mcm_method="deferred")(*params) + comm.Barrier() + + for measure_f, r1, r2 in zip( + [qml.counts, qml.expval, qml.probs, qml.sample, qml.var], results1, results2 + ): + validate_measurements(measure_f, shots, r1, r2) + + +@pytest.mark.parametrize( + "mcm_f", + [ + lambda x: x * -1, + lambda x: x * 1, + lambda x: x * 2, + lambda x: 1 - x, + lambda x: x + 1, + lambda x: x & 3, + "mix", + "list", + ], +) +@pytest.mark.parametrize("measure_f", [qml.counts, qml.expval, qml.probs, qml.sample, qml.var]) +def test_composite_mcms(mcm_f, measure_f): + """Tests that LightningQubit handles a circuit with a composite mid-circuit measurement and a + conditional gate. A single measurement of a composite mid-circuit measurement is performed + at the end.""" + comm = MPI.COMM_WORLD + if measure_f in (qml.expval, qml.var) and (mcm_f in ("list", "mix")): + pytest.skip( + "expval/var does not support measuring sequences of measurements or observables." + ) + + if measure_f == qml.probs and mcm_f == "mix": + pytest.skip( + "Cannot use qml.probs() when measuring multiple mid-circuit measurements collected using arithmetic operators." + ) + + shots = 3000 + + dq = qml.device("default.qubit", shots=shots) + dev = get_device(wires=3, shots=shots) + param = np.pi / 3 + + @qml.qnode(dev) + def func(x): + qml.RX(x, 0) + m0 = qml.measure(0) + qml.RX(0.5 * x, 1) + m1 = qml.measure(1) + qml.cond((m0 + m1) == 2, qml.RY)(2.0 * x, 0) + m2 = qml.measure(0) + obs = ( + (m0 - 2 * m1) * m2 + 7 + if mcm_f == "mix" + else ([m0, m1, m2] if mcm_f == "list" else mcm_f(m2)) + ) + return measure_f(op=obs) + + results1 = qml.QNode(func, dev, mcm_method="one-shot")(param) + results2 = qml.QNode(func, dq, mcm_method="deferred")(param) + + comm.Barrier() + + validate_measurements(measure_f, shots, results1, results2) + + +@pytest.mark.parametrize( + "mcm_f", + [ + lambda x, y: x + y, + lambda x, y: x - 7 * y, + lambda x, y: x & y, + lambda x, y: x == y, + lambda x, y: 4.0 * x + 2.0 * y, + ], +) +def test_counts_return_type(mcm_f): + """Tests that LightningQubit returns the same keys for ``qml.counts`` measurements with ``dynamic_one_shot`` and ``defer_measurements``.""" + comm = MPI.COMM_WORLD + shots = 500 + + dq = qml.device("default.qubit", shots=shots) + dev = get_device(wires=3, shots=shots) + param = np.pi / 3 + + @qml.qnode(dev) + def func(x): + qml.RX(x, 0) + m0 = qml.measure(0) + qml.RX(0.5 * x, 1) + m1 = qml.measure(1) + qml.cond((m0 + m1) == 2, qml.RY)(2.0 * x, 0) + return qml.counts(op=mcm_f(m0, m1)) + + results1 = qml.QNode(func, dev, mcm_method="one-shot")(param) + results2 = qml.QNode(func, dq, mcm_method="deferred")(param) + comm.Barrier() + for r1, r2 in zip(results1.keys(), results2.keys()): + assert r1 == r2 diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index 59d75bd653..cff4ff5e0b 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.39.0-dev42" +__version__ = "0.39.0-dev43" diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp index 964c5e69ce..794c026de5 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp @@ -334,6 +334,79 @@ class StateVectorCudaMPI final mpi_manager_.Barrier(); } + /** + * @brief Collapse the state vector after having measured one of the qubit. + * + * Note: The branch parameter imposes the measurement result on the given + * wire. + * + * @param wire Wire to measure. + * @param branch Branch 0 or 1. + */ + void collapse(const std::size_t wire, const bool branch) { + /* + PL_ABORT_IF_NOT(wire < this->getTotalNumQubits(), + "Invalid wire index."); + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); + + const int wireInt = + static_cast(this->getTotalNumQubits() - 1 - wire); + + if (static_cast(wireInt) < getNumLocalQubits()) { + // local wire + collapse_local_(wireInt, branch); + } else { + // global wire + constexpr int local_wire = 0; + std::vector wirePairs{make_int2(wireInt, local_wire)}; + applyMPI_Dispatcher(wirePairs, &StateVectorCudaMPI::collapse_local_, + local_wire, branch); + + PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(localStream_.get())); + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + } + + mpi_manager_.Barrier(); + */ + PL_ABORT_IF_NOT(wire < this->getTotalNumQubits(), + "Invalid wire index."); + + std::vector matrix(4, ComplexT(0.0, 0.0)); + + for (std::size_t i = 0; i < matrix.size(); i++) { + matrix[i] = ((i == 0 && branch == 0) || (i == 3 && branch == 1)) + ? ComplexT{1.0, 0.0} + : ComplexT{0.0, 0.0}; + } + + mpi_manager_.Barrier(); + + applyMatrix(matrix, {wire}, false); + + auto local_norm2 = norm2_CUDA( + BaseType::getData(), BaseType::getLength(), + BaseType::getDataBuffer().getDevTag().getDeviceID(), + BaseType::getDataBuffer().getDevTag().getStreamID(), + this->getCublasCaller()); + + local_norm2 *= local_norm2; + + mpi_manager_.Barrier(); + + auto norm2 = mpi_manager_.allreduce(local_norm2, "sum"); + + norm2 = std::sqrt(norm2); + + normalize_CUDA( + norm2, BaseType::getData(), BaseType::getLength(), + BaseType::getDataBuffer().getDevTag().getDeviceID(), + BaseType::getDataBuffer().getDevTag().getStreamID(), + this->getCublasCaller()); + + mpi_manager_.Barrier(); + } + /** * @brief Apply a single gate to the state-vector. Offloads to custatevec * specific API calls if available. If unable, attempts to use prior cached @@ -350,6 +423,8 @@ class StateVectorCudaMPI final const std::vector &wires, bool adjoint, const std::vector ¶ms, [[maybe_unused]] const std::vector &matrix) { + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); std::vector matrix_cu(matrix.size()); std::transform(matrix.begin(), matrix.end(), matrix_cu.begin(), [](const std::complex &x) { @@ -375,6 +450,8 @@ class StateVectorCudaMPI final const std::string &opName, const std::vector &wires, bool adjoint = false, const std::vector ¶ms = {0.0}, [[maybe_unused]] const std::vector &gate_matrix = {}) { + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); const auto ctrl_offset = (BaseType::getCtrlMap().find(opName) != BaseType::getCtrlMap().end()) ? BaseType::getCtrlMap().at(opName) @@ -434,6 +511,8 @@ class StateVectorCudaMPI final gate_cache_.get_gate_device_ptr(opName, par[0]), ctrls_local, tgts_local, adjoint); } + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); } /** @@ -491,6 +570,8 @@ class StateVectorCudaMPI final const std::vector &wires, bool adjoint = false) { PL_ABORT_IF(wires.empty(), "Number of wires must be larger than 0"); + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); const std::string opName = "Matrix"; std::size_t n = std::size_t{1} << wires.size(); const std::vector> matrix(gate_matrix, @@ -502,6 +583,8 @@ class StateVectorCudaMPI final x); }); applyOperation(opName, wires, adjoint, {}, matrix_cu); + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); } /** @@ -1617,6 +1700,54 @@ class StateVectorCudaMPI final mpi_manager_.Barrier(); } + /** + * @brief collapse the state vector to a given basis state. + * + * @param wire_local Local wire index. + * @param branch Branch index. + */ + void collapse_local_(const int wire_local, const bool branch) { + cudaDataType_t data_type; + + if constexpr (std::is_same_v || + std::is_same_v) { + data_type = CUDA_C_64F; + } else { + data_type = CUDA_C_32F; + } + + std::vector basisBits(1, wire_local); + + double abs2sum0_local, abs2sum1_local; + + PL_CUSTATEVEC_IS_SUCCESS(custatevecAbs2SumOnZBasis( + /* custatevecHandle_t */ handle_.get(), + /* void *sv */ BaseType::getData(), + /* cudaDataType_t */ data_type, + /* const uint32_t nIndexBits */ getNumLocalQubits(), + /* double * */ &abs2sum0_local, + /* double * */ &abs2sum1_local, + /* const int32_t * */ basisBits.data(), + /* const uint32_t nBasisBits */ basisBits.size())); + + auto abs2sum0 = mpi_manager_.allreduce(abs2sum0_local, "sum"); + auto abs2sum1 = mpi_manager_.allreduce(abs2sum1_local, "sum"); + + double norm = (branch == 0) ? abs2sum0 : abs2sum1; + + const int parity = static_cast(branch); + + PL_CUSTATEVEC_IS_SUCCESS(custatevecCollapseOnZBasis( + /* custatevecHandle_t */ handle_.get(), + /* void *sv */ BaseType::getData(), + /* cudaDataType_t */ data_type, + /* const uint32_t nIndexBits */ getNumLocalQubits(), + /* const int32_t parity */ parity, + /* const int32_t *basisBits */ basisBits.data(), + /* const uint32_t nBasisBits */ basisBits.size(), + /* double norm */ norm)); + } + /** * @brief Get expectation value for a sum of Pauli words. * @@ -1899,6 +2030,8 @@ class StateVectorCudaMPI final const std::vector &ctrls, const std::vector &tgts, bool use_adjoint = false) { + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); std::vector ctrlsInt(ctrls.size()); std::vector tgtsInt(tgts.size()); diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp index 9e68592dfd..7ade4ab2b9 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp @@ -487,6 +487,54 @@ class StateVectorCudaManaged applyMatrix(gate_matrix.data(), wires, adjoint); } + /** + * @brief Collapse the state vector after having measured one of the qubit. + * + * Note: The branch parameter imposes the measurement result on the given + * wire. + * + * @param wire Wire to measure. + * @param branch Branch 0 or 1. + */ + void collapse(const std::size_t wire, const bool branch) { + PL_ABORT_IF_NOT(wire < BaseType::getNumQubits(), "Invalid wire index."); + cudaDataType_t data_type; + + if constexpr (std::is_same_v || + std::is_same_v) { + data_type = CUDA_C_64F; + } else { + data_type = CUDA_C_32F; + } + + std::vector basisBits(1, BaseType::getNumQubits() - 1 - wire); + + double abs2sum0, abs2sum1; + PL_CUSTATEVEC_IS_SUCCESS(custatevecAbs2SumOnZBasis( + /* custatevecHandle_t */ handle_.get(), + /* void *sv */ BaseType::getData(), + /* cudaDataType_t */ data_type, + /* const uint32_t nIndexBits */ BaseType::getNumQubits(), + /* double * */ &abs2sum0, + /* double * */ &abs2sum1, + /* const int32_t * */ basisBits.data(), + /* const uint32_t nBasisBits */ basisBits.size())); + + double norm = (branch == 0) ? abs2sum0 : abs2sum1; + + const int parity = static_cast(branch); + + PL_CUSTATEVEC_IS_SUCCESS(custatevecCollapseOnZBasis( + /* custatevecHandle_t */ handle_.get(), + /* void *sv */ BaseType::getData(), + /* cudaDataType_t */ data_type, + /* const uint32_t nIndexBits */ BaseType::getNumQubits(), + /* const int32_t parity */ parity, + /* const int32_t *basisBits */ basisBits.data(), + /* const uint32_t nBasisBits */ basisBits.size(), + /* double norm */ norm)); + } + //****************************************************************************// // Explicit gate calls for bindings //****************************************************************************// diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp index c361bd6ed9..145097b30e 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp @@ -150,6 +150,8 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) { }, py::arg("async") = false, "Initialize the statevector data to the |0...0> state") + .def("collapse", &StateVectorT::collapse, + "Collapse the statevector onto the 0 or 1 branch of a given wire.") .def( "apply", [](StateVectorT &sv, const std::string &str, diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp index 2d3313f694..529f5ae75e 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp @@ -100,6 +100,8 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) { }, "Set State Vector on GPU with values for the state vector and " "wires on the host memory.") + .def("collapse", &StateVectorT::collapse, + "Collapse the statevector onto the 0 or 1 branch of a given wire.") .def( "DeviceToDevice", [](StateVectorT &sv, const StateVectorT &other, bool async) { diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu b/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu index 4e3e93ea79..8a62e89e84 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu @@ -59,7 +59,7 @@ void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value, cudaStream_t stream_id); /** - * @brief The CUDA kernel that setS state vector data on GPU device from the + * @brief The CUDA kernel that sets state vector data on GPU device from the * input values (on device) and their corresponding indices (on device) * information. * diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp index 6fee1711d2..710930ba54 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp @@ -265,7 +265,7 @@ class MeasurementsMPI final * number between 0 and num_samples-1. */ auto generate_samples(std::size_t num_samples) -> std::vector { - double epsilon = 1e-15; + double epsilon = std::numeric_limits::epsilon() * 1.0e2; std::size_t nSubSvs = 1UL << (this->_statevector.getNumGlobalQubits()); std::vector rand_nums(num_samples); std::vector samples( @@ -280,8 +280,8 @@ class MeasurementsMPI final bitOrdering[i] = i; } - std::vector localBitStrings(num_samples); - std::vector globalBitStrings(num_samples); + std::vector localBitStrings(num_samples, 0); + std::vector globalBitStrings(num_samples, 0); if (mpi_manager_.getRank() == 0) { for (std::size_t n = 0; n < num_samples; n++) { @@ -320,6 +320,8 @@ class MeasurementsMPI final /* custatevecHandle_t */ this->_statevector.getCusvHandle(), /* custatevecSamplerDescriptor_t */ sampler, /* double * */ &subNorm)); + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); int source = (mpi_manager_.getRank() - 1 + mpi_manager_.getSize()) % mpi_manager_.getSize(); @@ -354,6 +356,8 @@ class MeasurementsMPI final /* double */ precumulative, /* double */ norm)); + norm = (norm < epsilon) ? epsilon : norm; + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); auto low = std::lower_bound(rand_nums.begin(), rand_nums.end(), cumulative / norm); diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp index 4003395b53..0301970390 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp @@ -266,3 +266,45 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::StateVectorCudaManaged", REQUIRE(std::is_constructible_v); } } + +TEMPLATE_TEST_CASE("StateVectorCudaManaged::collapse", + "[StateVectorCudaManaged]", float, double) { + using PrecisionT = TestType; + using ComplexT = typename StateVectorCudaManaged::ComplexT; + using CFP_t = typename StateVectorCudaManaged::CFP_t; + using TestVectorT = TestVector; + + std::size_t wire = GENERATE(0, 1, 2); + std::size_t branch = GENERATE(0, 1); + const std::size_t num_qubits = 3; + + // TODO @tomlqc use same template for testing all Lightning flavours? + + SECTION("Collapse the state vector after having measured one of the " + "qubits.") { + TestVectorT init_state = createPlusState_(num_qubits); + + const ComplexT coef{0.5, PrecisionT{0.0}}; + const ComplexT zero{PrecisionT{0.0}, PrecisionT{0.0}}; + + std::vector>> expected_state = { + {{coef, coef, coef, coef, zero, zero, zero, zero}, + {coef, coef, zero, zero, coef, coef, zero, zero}, + {coef, zero, coef, zero, coef, zero, coef, zero}}, + {{zero, zero, zero, zero, coef, coef, coef, coef}, + {zero, zero, coef, coef, zero, zero, coef, coef}, + {zero, coef, zero, coef, zero, coef, zero, coef}}, + }; + + StateVectorCudaManaged sv( + reinterpret_cast(init_state.data()), init_state.size()); + + sv.collapse(wire, branch); + + PrecisionT eps = std::numeric_limits::epsilon() * 1e2; + REQUIRE(isApproxEqual(sv.getDataVector().data(), + sv.getDataVector().size(), + expected_state[branch][wire].data(), + expected_state[branch][wire].size(), eps)); + } +} diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp index 4b5a2dd349..401f6056b3 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp @@ -317,4 +317,76 @@ TEMPLATE_PRODUCT_TEST_CASE("StateVectorCudaMPI::applyOperations", {false, false}, {{0.0}}), LightningException, "must all be equal"); // invalid parameters } -} \ No newline at end of file +} + +TEMPLATE_TEST_CASE("StateVectorCudaManaged::collapse", + "[StateVectorCudaManaged]", float, double) { + using PrecisionT = TestType; + using ComplexT = typename StateVectorCudaMPI::ComplexT; + using TestVectorT = TestVector; + + std::size_t wire = GENERATE(0, 1, 2); + std::size_t branch = GENERATE(0, 1); + const std::size_t num_qubits = 3; + + using StateVectorT = StateVectorCudaMPI; + MPIManager mpi_manager(MPI_COMM_WORLD); + REQUIRE(mpi_manager.getSize() == 2); + + std::size_t mpi_buffersize = 1; + + int nGlobalIndexBits = + std::bit_width(static_cast(mpi_manager.getSize())) - 1; + int nLocalIndexBits = num_qubits - nGlobalIndexBits; + mpi_manager.Barrier(); + + int nDevices = 0; // Number of GPU devices per node + cudaGetDeviceCount(&nDevices); + REQUIRE(nDevices >= 2); + int deviceId = mpi_manager.getRank() % nDevices; + cudaSetDevice(deviceId); + DevTag dt_local(deviceId, 0); + + TestVectorT init_state = createPlusState_(num_qubits); + + std::size_t subSvLength = 1 << nLocalIndexBits; + + mpi_manager.Barrier(); + + std::vector local_state(subSvLength); + + mpi_manager.Scatter(init_state.data(), local_state.data(), subSvLength, 0); + mpi_manager.Barrier(); + + // TODO @tomlqc use same template for testing all Lightning flavours? + + SECTION("Collapse the state vector after having measured one of the " + "qubits.") { + const ComplexT coef{0.5, PrecisionT{0.0}}; + const ComplexT zero{PrecisionT{0.0}, PrecisionT{0.0}}; + + std::vector>> expected_state = { + {{coef, coef, coef, coef, zero, zero, zero, zero}, + {coef, coef, zero, zero, coef, coef, zero, zero}, + {coef, zero, coef, zero, coef, zero, coef, zero}}, + {{zero, zero, zero, zero, coef, coef, coef, coef}, + {zero, zero, coef, coef, zero, zero, coef, coef}, + {zero, coef, zero, coef, zero, coef, zero, coef}}, + }; + + StateVectorT sv(mpi_manager, dt_local, mpi_buffersize, nGlobalIndexBits, + nLocalIndexBits); + + sv.CopyHostDataToGpu(local_state.data(), local_state.size(), false); + + sv.collapse(wire, branch); + + auto expected_local_state = + mpi_manager.scatter(expected_state[branch][wire], 0); + + PrecisionT eps = std::numeric_limits::epsilon() * 1e2; + REQUIRE(isApproxEqual( + sv.getDataVector().data(), sv.getDataVector().size(), + expected_local_state.data(), expected_local_state.size(), eps)); + } +} diff --git a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp index cd422899b5..984a9d2358 100644 --- a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp +++ b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp @@ -274,6 +274,61 @@ inline auto scaleC_CUDA(const CFP_t a, T *v1, const int data_size, data_type); } +/** + * @brief cuBLAS backed GPU data normalization. + * + * @tparam CFP_t Complex float data-type. Accepts cuDoubleComplex and cuComplex + * @tparam DevTypeID Integer type of device id. + * + * @param v1 Device data pointer + * @param data_size Length of device data. + * @param dev_id the device on which the function should be executed. + * @param stream_id the CUDA stream on which the operation should be executed. + * @param cublas the CublasCaller object that manages the cuBLAS handle. + */ +template +inline auto norm2_CUDA(CFP_t *v1, const int data_size, DevTypeID dev_id, + cudaStream_t stream_id, const CublasCaller &cublas) { + if constexpr (std::is_same_v || + std::is_same_v) { + double norm{0.0}; + cublas.call(cublasDznrm2, dev_id, stream_id, data_size, v1, 1, &norm); + return norm; + } else { + float norm{0.0}; + cublas.call(cublasScnrm2, dev_id, stream_id, data_size, v1, 1, &norm); + return norm; + } +} + +/** + * @brief cuBLAS backed GPU data normalization. + * + * @tparam T Float data-type. Accepts float and double + * @tparam CFP_t Complex float data-type. Accepts cuDoubleComplex and cuComplex + * + * @param norm2 Norm of the vector + * @param v1 Device data pointer + * @param data_size Length of device data. + * @param dev_id the device on which the function should be executed. + * @param stream_id the CUDA stream on which the operation should be executed. + * @param cublas the CublasCaller object that manages the cuBLAS handle. + */ +template +inline auto normalize_CUDA(T norm2, CFP_t *v1, const int data_size, + DevTypeID dev_id, cudaStream_t stream_id, + const CublasCaller &cublas) { + if constexpr (std::is_same_v || + std::is_same_v) { + const double alpha = 1.0 / norm2; + cublas.call(cublasZdscal, dev_id, stream_id, data_size, &alpha, v1, 1); + } else { + const float alpha = 1.0 / norm2; + cublas.call(cublasCsscal, dev_id, stream_id, data_size, &alpha, v1, 1); + } +} + /** @brief `%CudaScopedDevice` uses RAII to select a CUDA device context. * * @see https://taskflow.github.io/taskflow/classtf_1_1cudaScopedDevice.html diff --git a/pennylane_lightning/lightning_gpu/_measurements.py b/pennylane_lightning/lightning_gpu/_measurements.py index 4b95762ccc..337f6273e3 100644 --- a/pennylane_lightning/lightning_gpu/_measurements.py +++ b/pennylane_lightning/lightning_gpu/_measurements.py @@ -34,6 +34,7 @@ except ImportError as error_import: warn(str(error_import), UserWarning) +from functools import reduce from typing import List import numpy as np @@ -105,8 +106,9 @@ def _measure_with_samples_diagonalizing_gates( self._apply_diagonalizing_gates(mps) # Specific for LGPU: - total_indices = self._qubit_state.num_wires - wires = qml.wires.Wires(range(total_indices)) + # total_indices = self._qubit_state.num_wires + # wires = qml.wires.Wires(range(total_indices)) + wires = reduce(sum, (mp.wires for mp in mps)) def _process_single_shot(samples): processed = [] diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py index a000443563..77e453778b 100644 --- a/pennylane_lightning/lightning_gpu/_state_vector.py +++ b/pennylane_lightning/lightning_gpu/_state_vector.py @@ -36,13 +36,17 @@ import numpy as np import pennylane as qml from pennylane import DeviceError +from pennylane.measurements import MidMeasureMP +from pennylane.ops import Conditional from pennylane.ops.op_math import Adjoint +from pennylane.tape import QuantumScript from pennylane.wires import Wires # pylint: disable=ungrouped-imports from pennylane_lightning.core._serialize import global_phase_diagonal from pennylane_lightning.core._state_vector_base import LightningBaseStateVector +from ._measurements import LightningGPUMeasurements from ._mpi_handler import MPIHandler gate_cache_needs_hash = ( @@ -247,15 +251,33 @@ def _apply_lightning_controlled(self, operation): matrix = global_phase_diagonal(param, self.wires, control_wires, control_values) state.apply(name, wires, inv, [[param]], matrix) - def _apply_lightning_midmeasure(self): + def _apply_lightning_midmeasure( + self, operation: MidMeasureMP, mid_measurements: dict, postselect_mode: str + ): """Execute a MidMeasureMP operation and return the sample in mid_measurements. Args: + operation (~pennylane.operation.Operation): mid-circuit measurement + mid_measurements (None, dict): Dictionary of mid-circuit measurements + postselect_mode (str): Configuration for handling shots with mid-circuit measurement + postselection. Use ``"hw-like"`` to discard invalid shots and ``"fill-shots"`` to + keep the same number of shots. Returns: None """ - raise DeviceError("LightningGPU does not support Mid-circuit measurements.") + wires = self.wires.indices(operation.wires) + wire = list(wires)[0] + if postselect_mode == "fill-shots" and operation.postselect is not None: + sample = operation.postselect + else: + circuit = QuantumScript([], [qml.sample(wires=operation.wires)], shots=1) + sample = LightningGPUMeasurements(self).measure_final_state(circuit) + sample = np.squeeze(sample) + mid_measurements[operation] = sample + getattr(self.state_vector, "collapse")(wire, bool(sample)) + if operation.reset and bool(sample): + self.apply_operations([qml.PauliX(operation.wires)], mid_measurements=mid_measurements) # pylint: disable=unused-argument def _apply_lightning( @@ -289,7 +311,14 @@ def _apply_lightning( method = getattr(state, name, None) wires = list(operation.wires) - if method is not None: # apply specialized gate + if isinstance(operation, Conditional): + if operation.meas_val.concretize(mid_measurements): + self._apply_lightning([operation.base]) + elif isinstance(operation, MidMeasureMP): + self._apply_lightning_midmeasure( + operation, mid_measurements, postselect_mode=postselect_mode + ) + elif method is not None: # apply specialized gate param = operation.parameters method(wires, invert_param, param) elif isinstance(operation, qml.ops.Controlled) and isinstance( diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py index 2b295c4990..d6f75b2c5e 100644 --- a/pennylane_lightning/lightning_gpu/lightning_gpu.py +++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py @@ -173,10 +173,7 @@ def stopping_condition(op: Operator) -> bool: def stopping_condition_shots(op: Operator) -> bool: """A function that determines whether or not an operation is supported by ``lightning.gpu`` with finite shots.""" - if isinstance(op, (MidMeasureMP, qml.ops.op_math.Conditional)): - # LightningGPU does not support Mid-circuit measurements. - return False - return stopping_condition(op) + return stopping_condition(op) or isinstance(op, (MidMeasureMP, qml.ops.op_math.Conditional)) def accepted_observables(obs: Operator) -> bool: @@ -460,6 +457,7 @@ def execute( self.simulate( circuit, self._statevector, + postselect_mode=execution_config.mcm_config.postselect_mode, ) ) @@ -494,12 +492,16 @@ def simulate( self, circuit: QuantumScript, state: LightningGPUStateVector, + postselect_mode: Optional[str] = None, ) -> Result: """Simulate a single quantum script. Args: circuit (QuantumTape): The single circuit to simulate state (LightningGPUStateVector): handle to Lightning state vector + postselect_mode (str): Configuration for handling shots with mid-circuit measurement + postselection. Use ``"hw-like"`` to discard invalid shots and ``"fill-shots"`` to + keep the same number of shots. Default is ``None``. Returns: Tuple[TensorLike]: The results of the simulation @@ -507,7 +509,25 @@ def simulate( Note that this function can return measurements for non-commuting observables simultaneously. """ if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)): - raise qml.DeviceError("LightningGPU does not support Mid-circuit measurements.") + results = [] + aux_circ = QuantumScript( + circuit.operations, + circuit.measurements, + shots=[1], + trainable_params=circuit.trainable_params, + ) + for _ in range(circuit.shots.total_shots): + state.reset_state() + mid_measurements = {} + final_state = state.get_final_state( + aux_circ, mid_measurements=mid_measurements, postselect_mode=postselect_mode + ) + results.append( + self.LightningMeasurements(final_state).measure_final_state( + aux_circ, mid_measurements=mid_measurements + ) + ) + return tuple(results) state.reset_state() final_state = state.get_final_state(circuit) diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.toml b/pennylane_lightning/lightning_gpu/lightning_gpu.toml index 518315de09..b18470da6b 100644 --- a/pennylane_lightning/lightning_gpu/lightning_gpu.toml +++ b/pennylane_lightning/lightning_gpu/lightning_gpu.toml @@ -98,7 +98,7 @@ qjit_compatible = false # If the device requires run time generation of the quantum circuit. runtime_code_generation = false # If the device supports mid circuit measurements natively -mid_circuit_measurement = false +mid_circuit_measurement = true # This field is currently unchecked but it is reserved for the purpose of # determining if the device supports dynamic qubit allocation/deallocation. diff --git a/tests/test_native_mcm.py b/tests/test_native_mcm.py index 07281fb48a..050e1d27c6 100644 --- a/tests/test_native_mcm.py +++ b/tests/test_native_mcm.py @@ -21,7 +21,7 @@ from conftest import LightningDevice, device_name, validate_measurements from flaky import flaky -if device_name not in ("lightning.qubit", "lightning.kokkos"): +if device_name not in ("lightning.qubit", "lightning.kokkos", "lightning.gpu"): pytest.skip("Native MCM not supported. Skipping.", allow_module_level=True) if not LightningDevice._CPP_BINARY_AVAILABLE: # pylint: disable=protected-access @@ -89,7 +89,7 @@ def func(x, y): match=f"not accepted with finite shots on lightning.qubit", ): func(*params) - if device_name == "lightning.kokkos": + if device_name in ("lightning.kokkos", "lightning.gpu"): with pytest.raises( qml.DeviceError, match=r"Measurement shadow\(wires=\[0\]\) not accepted with finite shots on "