From 29cf4f0a3677377420c0cf8c9a5916870d35863d Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Wed, 2 Oct 2024 20:09:57 +0000 Subject: [PATCH 01/23] Initial commit --- .github/CHANGELOG.md | 3 + .../lightning_gpu/StateVectorCudaManaged.hpp | 45 +++++++++++ .../src/simulators/lightning_gpu/initSV.cu | 78 ++++++++++++++++++- .../tests/Test_StateVectorCudaManaged.cpp | 42 ++++++++++ .../core/src/utils/cuda_utils/LinearAlg.hpp | 28 +++++++ 5 files changed, 195 insertions(+), 1 deletion(-) diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index 6a9b55cf8f..affc06c261 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -2,6 +2,9 @@ ### New features since last release +* Add `collapse()` support to `lightning.gpu` C++ layer. + [(#931)](https://github.com/PennyLaneAI/pennylane-lightning/pull/931) + * Add Matrix Product Operator (MPO) for all gates support to `lightning.tensor`. Note current C++ implementation only works for MPO sites data provided by users. [(#859)](https://github.com/PennyLaneAI/pennylane-lightning/pull/859) diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp index 716d95c89f..dc9c3ffcd9 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp @@ -67,6 +67,17 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value, const std::size_t index, bool async, cudaStream_t stream_id); +extern void collapseStateVector_CUDA(cuComplex *sv, const std::size_t num_sv, + const std::size_t stride, const bool k, + std::size_t thread_per_block, + cudaStream_t stream_id); + +extern void collapseStateVector_CUDA(cuDoubleComplex *sv, + const std::size_t num_sv, + const std::size_t stride, const bool k, + std::size_t thread_per_block, + cudaStream_t stream_id); + extern void 
globalPhaseStateVector_CUDA(cuComplex *sv, std::size_t num_sv, cuComplex phase, std::size_t thread_per_block, @@ -434,6 +445,40 @@ class StateVectorCudaManaged applyMatrix(gate_matrix.data(), wires, adjoint); } + /** + * @brief Collapse the state vector after having measured one of the qubits. + * + * Note: The branch parameter imposes the measurement result on the given + * wire. + * + * @tparam thread_per_block Number of threads per block. Default is 256. + * @param wire Wire to measure. + * @param branch Branch 0 or 1. + */ + template + void collapse(const std::size_t wire, const bool branch) { + PL_ABORT_IF_NOT(wire < BaseType::getNumQubits(), "Invalid wire index."); + + const std::size_t stride = std::size_t{1U} + << (BaseType::getNumQubits() - (1 + wire)); + // zero half the entries + // the "half" entries depend on the stride + // *_*_*_*_ for stride 1 + // **__**__ for stride 2 + // ****____ for stride 4 + const bool k = branch ? 0 : 1; + + collapseStateVector_CUDA( + BaseType::getData(), BaseType::getLength(), stride, k, + thread_per_block, + BaseType::getDataBuffer().getDevTag().getStreamID()); + + normalize_CUDA( + BaseType::getData(), BaseType::getLength(), + BaseType::getDataBuffer().getDevTag().getDeviceID(), + BaseType::getDataBuffer().getDevTag().getStreamID(), + this->getCublasCaller()); + } //****************************************************************************// // Explicit gate calls for bindings //****************************************************************************// diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu b/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu index 4e3e93ea79..6d0aee75a4 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu @@ -59,7 +59,7 @@ void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value, cudaStream_t stream_id); /** - * @brief The CUDA kernel that setS 
state vector data on GPU device from the + * @brief The CUDA kernel that sets state vector data on GPU device from the * input values (on device) and their corresponding indices (on device) * information. * @@ -106,6 +106,66 @@ void setStateVector_CUDA_call(GPUDataT *sv, index_type &num_indices, PL_CUDA_IS_SUCCESS(cudaGetLastError()); } +/** + * @brief The CUDA kernel that collapses the state vector data on GPU device + * based on the input values (on device) and their corresponding indices (on + * device) information. + * + * @param sv Complex data pointer of state vector on device. + * @param half_num_sv Number of state vector elements. + * @param stride Number of elements in the input values array. + * @param k Boolean flag to indicate whether to collapse or not. + */ +template +__global__ void +collapseStateVectorKernel(GPUDataT *sv, const std::size_t half_num_sv, + const std::size_t stride, const bool k) { + const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= half_num_sv) { + return; + } + + const unsigned int id = i / stride; + const unsigned int ids = i % stride; + const unsigned int offset = (2 * id + k) * stride; + const unsigned int idx = offset + ids; + + sv[idx].x = 0.0; + sv[idx].y = 0.0; +} + +/** + * @brief The CUDA kernel call wrapper that collapses the state vector data on + * GPU device based on the input values (on device) and their corresponding + * indices (on device) information. + * + * @param sv Complex data pointer of state vector on device. + * @param num_sv Number of state vector elements. + * @param stride Number of elements in the input values array. + * @param k Boolean flag to indicate whether to collapse or not. + * @param thread_per_block Number of threads set per block. 
+ * @param stream_id Stream id of CUDA calls + */ +template +void collapseStateVector_CUDA_call(GPUDataT *sv, std::size_t num_sv, + const std::size_t stride, const bool k, + const std::size_t thread_per_block, + cudaStream_t stream_id) { + const std::size_t half_num_sv = num_sv / 2; + auto dv = std::div(static_cast(half_num_sv), + static_cast(thread_per_block)); + + const std::size_t num_blocks = dv.quot + (dv.rem == 0 ? 0 : 1); + const std::size_t block_per_grid = (num_blocks == 0 ? 1 : num_blocks); + dim3 blockSize(thread_per_block, 1, 1); + dim3 gridSize(block_per_grid, 1); + + collapseStateVectorKernel + <<>>(sv, half_num_sv, stride, k); + + PL_CUDA_IS_SUCCESS(cudaGetLastError()); +} + /** * @brief The CUDA kernel that multiplies the state vector data on GPU device * by a global phase. @@ -243,6 +303,22 @@ void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value, setBasisState_CUDA_call(sv, value, index, async, stream_id); } +void collapseStateVector_CUDA(cuComplex *sv, const std::size_t num_sv, + const std::size_t stride, const bool k, + std::size_t thread_per_block, + cudaStream_t stream_id) { + collapseStateVector_CUDA_call(sv, num_sv, stride, k, thread_per_block, + stream_id); +} + +void collapseStateVector_CUDA(cuDoubleComplex *sv, std::size_t num_sv, + const std::size_t stride, const bool k, + std::size_t thread_per_block, + cudaStream_t stream_id) { + collapseStateVector_CUDA_call(sv, num_sv, stride, k, thread_per_block, + stream_id); +} + void globalPhaseStateVector_CUDA(cuComplex *sv, std::size_t num_sv, cuComplex phase, std::size_t thread_per_block, cudaStream_t stream_id) { diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp index 4003395b53..0301970390 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp +++ 
b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp @@ -266,3 +266,45 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::StateVectorCudaManaged", REQUIRE(std::is_constructible_v); } } + +TEMPLATE_TEST_CASE("StateVectorCudaManaged::collapse", + "[StateVectorCudaManaged]", float, double) { + using PrecisionT = TestType; + using ComplexT = typename StateVectorCudaManaged::ComplexT; + using CFP_t = typename StateVectorCudaManaged::CFP_t; + using TestVectorT = TestVector; + + std::size_t wire = GENERATE(0, 1, 2); + std::size_t branch = GENERATE(0, 1); + const std::size_t num_qubits = 3; + + // TODO @tomlqc use same template for testing all Lightning flavours? + + SECTION("Collapse the state vector after having measured one of the " + "qubits.") { + TestVectorT init_state = createPlusState_(num_qubits); + + const ComplexT coef{0.5, PrecisionT{0.0}}; + const ComplexT zero{PrecisionT{0.0}, PrecisionT{0.0}}; + + std::vector>> expected_state = { + {{coef, coef, coef, coef, zero, zero, zero, zero}, + {coef, coef, zero, zero, coef, coef, zero, zero}, + {coef, zero, coef, zero, coef, zero, coef, zero}}, + {{zero, zero, zero, zero, coef, coef, coef, coef}, + {zero, zero, coef, coef, zero, zero, coef, coef}, + {zero, coef, zero, coef, zero, coef, zero, coef}}, + }; + + StateVectorCudaManaged sv( + reinterpret_cast(init_state.data()), init_state.size()); + + sv.collapse(wire, branch); + + PrecisionT eps = std::numeric_limits::epsilon() * 1e2; + REQUIRE(isApproxEqual(sv.getDataVector().data(), + sv.getDataVector().size(), + expected_state[branch][wire].data(), + expected_state[branch][wire].size(), eps)); + } +} diff --git a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp index cd422899b5..d8e189ead4 100644 --- a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp +++ b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp @@ -274,6 +274,34 @@ inline auto 
scaleC_CUDA(const CFP_t a, T *v1, const int data_size, data_type); } +/** + * @brief cuBLAS backed GPU data normalization. + * + * @tparam T Float data-type. Accepts float and double + * @param a scaling factor + * @param v1 Device data pointer + * @param data_size Length of device data. + * @param dev_id the device on which the function should be executed. + * @param stream_id the CUDA stream on which the operation should be executed. + * @param cublas the CublasCaller object that manages the cuBLAS handle. + */ +template +inline auto normalize_CUDA(CFP_t *v1, const int data_size, DevTypeID dev_id, + cudaStream_t stream_id, const CublasCaller &cublas) { + if constexpr (std::is_same_v || + std::is_same_v) { + double norm{0.0}; + cublas.call(cublasDznrm2, dev_id, stream_id, data_size, v1, 1, &norm); + const double alpha = 1.0 / norm; + cublas.call(cublasZdscal, dev_id, stream_id, data_size, &alpha, v1, 1); + } else { + float norm{0.0}; + cublas.call(cublasScnrm2, dev_id, stream_id, data_size, v1, 1, &norm); + const float alpha = 1.0 / norm; + cublas.call(cublasCsscal, dev_id, stream_id, data_size, &alpha, v1, 1); + } +} + /** @brief `%CudaScopedDevice` uses RAII to select a CUDA device context. 
* * @see https://taskflow.github.io/taskflow/classtf_1_1cudaScopedDevice.html From 0c449bf18550c042c41e50148f8e313a2c259cd9 Mon Sep 17 00:00:00 2001 From: ringo-but-quantum Date: Wed, 2 Oct 2024 20:27:43 +0000 Subject: [PATCH 02/23] Auto update version from '0.39.0-dev34' to '0.39.0-dev36' --- pennylane_lightning/core/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index 05a5ab3841..9b3cc2f16a 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.39.0-dev34" +__version__ = "0.39.0-dev36" From 6c36e5cfa8458bfbabdcdb03ccc20e6c38ded4c6 Mon Sep 17 00:00:00 2001 From: ringo-but-quantum Date: Mon, 7 Oct 2024 20:46:19 +0000 Subject: [PATCH 03/23] Auto update version from '0.39.0-dev39' to '0.39.0-dev40' --- pennylane_lightning/core/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index 4f9f650221..c6e0b5050a 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.39.0-dev39" +__version__ = "0.39.0-dev40" From 80c3a62fc4f48155058b4e104f3883162ce36df8 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 8 Oct 2024 00:03:00 +0000 Subject: [PATCH 04/23] tidy up code and add mpi support --- .../lightning_gpu/StateVectorCudaMPI.hpp | 48 ++++++++++++ .../lightning_gpu/StateVectorCudaManaged.hpp | 46 +++++------ .../src/simulators/lightning_gpu/initSV.cu | 76 ------------------- .../tests/mpi/Test_StateVectorCudaMPI.cpp | 74 +++++++++++++++++- .../core/src/utils/cuda_utils/LinearAlg.hpp | 41 ++++++++-- 5 files changed, 173 insertions(+), 112 deletions(-) diff --git 
a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp index 577c510f9e..97a05d546d 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp @@ -347,6 +347,54 @@ class StateVectorCudaMPI final mpi_manager_.Barrier(); } + /** + * @brief Collapse the state vector after having measured one of the qubits. + * + * Note: The branch parameter imposes the measurement result on the given + * wire. + * + * @param wire Wire to measure. + * @param branch Branch 0 or 1. + */ + void collapse(const std::size_t wire, const bool branch) { + PL_ABORT_IF_NOT(wire < this->getTotalNumQubits(), + "Invalid wire index."); + + std::vector matrix(4, ComplexT(0.0, 0.0)); + + for (std::size_t i = 0; i < matrix.size(); i++) { + matrix[i] = ((i == 0 && branch == 0) || (i == 3 && branch == 1)) + ? ComplexT{1.0, 0.0} + : ComplexT{0.0, 0.0}; + } + + mpi_manager_.Barrier(); + + applyMatrix(matrix, {wire}, false); + + auto local_norm2 = norm2_CUDA( + BaseType::getData(), BaseType::getLength(), + BaseType::getDataBuffer().getDevTag().getDeviceID(), + BaseType::getDataBuffer().getDevTag().getStreamID(), + this->getCublasCaller()); + + local_norm2 *= local_norm2; + + mpi_manager_.Barrier(); + + auto norm2 = mpi_manager_.allreduce(local_norm2, "sum"); + + norm2 = std::sqrt(norm2); + + normalize_CUDA( + norm2, BaseType::getData(), BaseType::getLength(), + BaseType::getDataBuffer().getDevTag().getDeviceID(), + BaseType::getDataBuffer().getDevTag().getStreamID(), + this->getCublasCaller()); + + mpi_manager_.Barrier(); + } + /** * @brief Apply a single gate to the state-vector. Offloads to custatevec * specific API calls if available. 
If unable, attempts to use prior cached diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp index 7b61c7a473..aa3429ceca 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp @@ -67,17 +67,6 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value, const std::size_t index, bool async, cudaStream_t stream_id); -extern void collapseStateVector_CUDA(cuComplex *sv, const std::size_t num_sv, - const std::size_t stride, const bool k, - std::size_t thread_per_block, - cudaStream_t stream_id); - -extern void collapseStateVector_CUDA(cuDoubleComplex *sv, - const std::size_t num_sv, - const std::size_t stride, const bool k, - std::size_t thread_per_block, - cudaStream_t stream_id); - extern void globalPhaseStateVector_CUDA(cuComplex *sv, std::size_t num_sv, cuComplex phase, std::size_t thread_per_block, @@ -494,34 +483,35 @@ class StateVectorCudaManaged * Note: The branch parameter imposes the measurement result on the given * wire. * - * @tparam thread_per_block Number of threads per block. Default is 256. * @param wire Wire to measure. * @param branch Branch 0 or 1. */ - template void collapse(const std::size_t wire, const bool branch) { PL_ABORT_IF_NOT(wire < BaseType::getNumQubits(), "Invalid wire index."); - const std::size_t stride = std::size_t{1U} - << (BaseType::getNumQubits() - (1 + wire)); - // zero half the entries - // the "half" entries depend on the stride - // *_*_*_*_ for stride 1 - // **__**__ for stride 2 - // ****____ for stride 4 - const bool k = branch ? 
0 : 1; - - collapseStateVector_CUDA( - BaseType::getData(), BaseType::getLength(), stride, k, - thread_per_block, - BaseType::getDataBuffer().getDevTag().getStreamID()); - - normalize_CUDA( + std::vector matrix(4, ComplexT(0.0, 0.0)); + + for (std::size_t i = 0; i < matrix.size(); i++) { + matrix[i] = ((i == 0 && branch == 0) || (i == 3 && branch == 1)) + ? ComplexT{1.0, 0.0} + : ComplexT{0.0, 0.0}; + } + + applyMatrix(matrix, {wire}, false); + + auto norm2 = norm2_CUDA( BaseType::getData(), BaseType::getLength(), BaseType::getDataBuffer().getDevTag().getDeviceID(), BaseType::getDataBuffer().getDevTag().getStreamID(), this->getCublasCaller()); + + normalize_CUDA( + norm2, BaseType::getData(), BaseType::getLength(), + BaseType::getDataBuffer().getDevTag().getDeviceID(), + BaseType::getDataBuffer().getDevTag().getStreamID(), + this->getCublasCaller()); } + //****************************************************************************// // Explicit gate calls for bindings //****************************************************************************// diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu b/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu index 6d0aee75a4..8a62e89e84 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu @@ -106,66 +106,6 @@ void setStateVector_CUDA_call(GPUDataT *sv, index_type &num_indices, PL_CUDA_IS_SUCCESS(cudaGetLastError()); } -/** - * @brief The CUDA kernel that collapses the state vector data on GPU device - * based on the input values (on device) and their corresponding indices (on - * device) information. - * - * @param sv Complex data pointer of state vector on device. - * @param half_num_sv Number of state vector elements. - * @param stride Number of elements in the input values array. - * @param k Boolean flag to indicate whether to collapse or not. 
- */ -template -__global__ void -collapseStateVectorKernel(GPUDataT *sv, const std::size_t half_num_sv, - const std::size_t stride, const bool k) { - const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= half_num_sv) { - return; - } - - const unsigned int id = i / stride; - const unsigned int ids = i % stride; - const unsigned int offset = (2 * id + k) * stride; - const unsigned int idx = offset + ids; - - sv[idx].x = 0.0; - sv[idx].y = 0.0; -} - -/** - * @brief The CUDA kernel call wrapper that collapses the state vector data on - * GPU device based on the input values (on device) and their corresponding - * indices (on device) information. - * - * @param sv Complex data pointer of state vector on device. - * @param num_sv Number of state vector elements. - * @param stride Number of elements in the input values array. - * @param k Boolean flag to indicate whether to collapse or not. - * @param thread_per_block Number of threads set per block. - * @param stream_id Stream id of CUDA calls - */ -template -void collapseStateVector_CUDA_call(GPUDataT *sv, std::size_t num_sv, - const std::size_t stride, const bool k, - const std::size_t thread_per_block, - cudaStream_t stream_id) { - const std::size_t half_num_sv = num_sv / 2; - auto dv = std::div(static_cast(half_num_sv), - static_cast(thread_per_block)); - - const std::size_t num_blocks = dv.quot + (dv.rem == 0 ? 0 : 1); - const std::size_t block_per_grid = (num_blocks == 0 ? 1 : num_blocks); - dim3 blockSize(thread_per_block, 1, 1); - dim3 gridSize(block_per_grid, 1); - - collapseStateVectorKernel - <<>>(sv, half_num_sv, stride, k); - - PL_CUDA_IS_SUCCESS(cudaGetLastError()); -} - /** * @brief The CUDA kernel that multiplies the state vector data on GPU device * by a global phase. 
@@ -303,22 +243,6 @@ void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value, setBasisState_CUDA_call(sv, value, index, async, stream_id); } -void collapseStateVector_CUDA(cuComplex *sv, const std::size_t num_sv, - const std::size_t stride, const bool k, - std::size_t thread_per_block, - cudaStream_t stream_id) { - collapseStateVector_CUDA_call(sv, num_sv, stride, k, thread_per_block, - stream_id); -} - -void collapseStateVector_CUDA(cuDoubleComplex *sv, std::size_t num_sv, - const std::size_t stride, const bool k, - std::size_t thread_per_block, - cudaStream_t stream_id) { - collapseStateVector_CUDA_call(sv, num_sv, stride, k, thread_per_block, - stream_id); -} - void globalPhaseStateVector_CUDA(cuComplex *sv, std::size_t num_sv, cuComplex phase, std::size_t thread_per_block, cudaStream_t stream_id) { diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp index 4b5a2dd349..401f6056b3 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp @@ -317,4 +317,76 @@ TEMPLATE_PRODUCT_TEST_CASE("StateVectorCudaMPI::applyOperations", {false, false}, {{0.0}}), LightningException, "must all be equal"); // invalid parameters } -} \ No newline at end of file +} + +TEMPLATE_TEST_CASE("StateVectorCudaManaged::collapse", + "[StateVectorCudaManaged]", float, double) { + using PrecisionT = TestType; + using ComplexT = typename StateVectorCudaMPI::ComplexT; + using TestVectorT = TestVector; + + std::size_t wire = GENERATE(0, 1, 2); + std::size_t branch = GENERATE(0, 1); + const std::size_t num_qubits = 3; + + using StateVectorT = StateVectorCudaMPI; + MPIManager mpi_manager(MPI_COMM_WORLD); + REQUIRE(mpi_manager.getSize() == 2); + + std::size_t mpi_buffersize = 1; + + int 
nGlobalIndexBits = + std::bit_width(static_cast(mpi_manager.getSize())) - 1; + int nLocalIndexBits = num_qubits - nGlobalIndexBits; + mpi_manager.Barrier(); + + int nDevices = 0; // Number of GPU devices per node + cudaGetDeviceCount(&nDevices); + REQUIRE(nDevices >= 2); + int deviceId = mpi_manager.getRank() % nDevices; + cudaSetDevice(deviceId); + DevTag dt_local(deviceId, 0); + + TestVectorT init_state = createPlusState_(num_qubits); + + std::size_t subSvLength = 1 << nLocalIndexBits; + + mpi_manager.Barrier(); + + std::vector local_state(subSvLength); + + mpi_manager.Scatter(init_state.data(), local_state.data(), subSvLength, 0); + mpi_manager.Barrier(); + + // TODO @tomlqc use same template for testing all Lightning flavours? + + SECTION("Collapse the state vector after having measured one of the " + "qubits.") { + const ComplexT coef{0.5, PrecisionT{0.0}}; + const ComplexT zero{PrecisionT{0.0}, PrecisionT{0.0}}; + + std::vector>> expected_state = { + {{coef, coef, coef, coef, zero, zero, zero, zero}, + {coef, coef, zero, zero, coef, coef, zero, zero}, + {coef, zero, coef, zero, coef, zero, coef, zero}}, + {{zero, zero, zero, zero, coef, coef, coef, coef}, + {zero, zero, coef, coef, zero, zero, coef, coef}, + {zero, coef, zero, coef, zero, coef, zero, coef}}, + }; + + StateVectorT sv(mpi_manager, dt_local, mpi_buffersize, nGlobalIndexBits, + nLocalIndexBits); + + sv.CopyHostDataToGpu(local_state.data(), local_state.size(), false); + + sv.collapse(wire, branch); + + auto expected_local_state = + mpi_manager.scatter(expected_state[branch][wire], 0); + + PrecisionT eps = std::numeric_limits::epsilon() * 1e2; + REQUIRE(isApproxEqual( + sv.getDataVector().data(), sv.getDataVector().size(), + expected_local_state.data(), expected_local_state.size(), eps)); + } +} diff --git a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp index d8e189ead4..984a9d2358 100644 --- 
a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp +++ b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp @@ -277,8 +277,9 @@ inline auto scaleC_CUDA(const CFP_t a, T *v1, const int data_size, /** * @brief cuBLAS backed GPU data normalization. * - * @tparam T Float data-type. Accepts float and double - * @param a scaling factor + * @tparam CFP_t Complex float data-type. Accepts cuDoubleComplex and cuComplex + * @tparam DevTypeID Integer type of device id. + * * @param v1 Device data pointer * @param data_size Length of device data. * @param dev_id the device on which the function should be executed. @@ -286,18 +287,44 @@ inline auto scaleC_CUDA(const CFP_t a, T *v1, const int data_size, * @param cublas the CublasCaller object that manages the cuBLAS handle. */ template -inline auto normalize_CUDA(CFP_t *v1, const int data_size, DevTypeID dev_id, - cudaStream_t stream_id, const CublasCaller &cublas) { +inline auto norm2_CUDA(CFP_t *v1, const int data_size, DevTypeID dev_id, + cudaStream_t stream_id, const CublasCaller &cublas) { if constexpr (std::is_same_v || std::is_same_v) { double norm{0.0}; cublas.call(cublasDznrm2, dev_id, stream_id, data_size, v1, 1, &norm); - const double alpha = 1.0 / norm; - cublas.call(cublasZdscal, dev_id, stream_id, data_size, &alpha, v1, 1); + return norm; } else { float norm{0.0}; cublas.call(cublasScnrm2, dev_id, stream_id, data_size, v1, 1, &norm); - const float alpha = 1.0 / norm; + return norm; + } +} + +/** + * @brief cuBLAS backed GPU data normalization. + * + * @tparam T Float data-type. Accepts float and double + * @tparam CFP_t Complex float data-type. Accepts cuDoubleComplex and cuComplex + * + * @param norm2 Norm of the vector + * @param v1 Device data pointer + * @param data_size Length of device data. + * @param dev_id the device on which the function should be executed. + * @param stream_id the CUDA stream on which the operation should be executed. 
+ * @param cublas the CublasCaller object that manages the cuBLAS handle. + */ +template +inline auto normalize_CUDA(T norm2, CFP_t *v1, const int data_size, + DevTypeID dev_id, cudaStream_t stream_id, + const CublasCaller &cublas) { + if constexpr (std::is_same_v || + std::is_same_v) { + const double alpha = 1.0 / norm2; + cublas.call(cublasZdscal, dev_id, stream_id, data_size, &alpha, v1, 1); + } else { + const float alpha = 1.0 / norm2; cublas.call(cublasCsscal, dev_id, stream_id, data_size, &alpha, v1, 1); } } From b2a3758b10d1cd8d4da2373b5c305fc4b3f6a7ae Mon Sep 17 00:00:00 2001 From: ringo-but-quantum Date: Tue, 8 Oct 2024 00:03:23 +0000 Subject: [PATCH 05/23] Auto update version from '0.39.0-dev40' to '0.39.0-dev41' --- pennylane_lightning/core/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index c6e0b5050a..ab5a1f0f5d 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.39.0-dev40" +__version__ = "0.39.0-dev41" From 960dc842ecaaca82ab8672d291b96acfa5b6714e Mon Sep 17 00:00:00 2001 From: Shuli Shu <31480676+multiphaseCFD@users.noreply.github.com> Date: Mon, 7 Oct 2024 20:03:08 -0400 Subject: [PATCH 06/23] Add native `setStateVector` support to `lightning.gpu` (#930) Please complete the following checklist when submitting a PR: - [ ] All new features must include a unit test. If you've fixed a bug or added code that should be tested, add a test to the [`tests`](../tests) directory! - [ ] All new functions and code must be clearly commented and documented. If you do make documentation changes, make sure that the docs build and render correctly by running `make docs`. - [x] Ensure that the test suite passes, by running `make test`. 
- [x] Add a new entry to the `.github/CHANGELOG.md` file, summarizing the change, and including a link back to the PR. - [x] Ensure that code is properly formatted by running `make format`. When all the above are checked, delete everything above the dashed line and fill in the pull request template. ------------------------------------------------------------------------------------------------------------ **Context:** [SC-74668] `setStateVector` via state and wires is supported in the C++ layer in `lightning.gpu` **Description of the Change:** **Benefits:** **Possible Drawbacks:** **Related GitHub Issues:** --------- Co-authored-by: ringo-but-quantum --- .github/CHANGELOG.md | 3 + pennylane_lightning/core/_version.py | 2 +- .../lightning_gpu/StateVectorCudaMPI.hpp | 150 ++++++++++++------ .../lightning_gpu/StateVectorCudaManaged.hpp | 101 ++++++++---- .../lightning_gpu/bindings/LGPUBindings.hpp | 25 +-- .../bindings/LGPUBindingsMPI.hpp | 25 +-- .../Test_StateVectorCudaManaged_NonParam.cpp | 63 +------- .../mpi/Test_StateVectorCudaMPI_NonParam.cpp | 21 ++- .../lightning_gpu/_state_vector.py | 29 +--- .../lightning_gpu/lightning_gpu.py | 11 +- 10 files changed, 219 insertions(+), 211 deletions(-) diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index b921117f4d..7fa9decd0c 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -46,6 +46,9 @@ ### Improvements +* Add `setStateVector(state, wire)` support to the `lightning.gpu` C++ layer. + [(#930)](https://github.com/PennyLaneAI/pennylane-lightning/pull/930) + * Add zero-state initialization to both `StateVectorCudaManaged` and `StateVectorCudaMPI` constructors to remove the `reset_state` in the python layer ctor and refactor `setBasisState(state, wires)` in the C++ layer. 
[(#933)](https://github.com/PennyLaneAI/pennylane-lightning/pull/933) diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index ab5a1f0f5d..c6e0b5050a 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.39.0-dev41" +__version__ = "0.39.0-dev40" diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp index 97a05d546d..f985d1b19e 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp @@ -289,61 +289,48 @@ class StateVectorCudaMPI final BaseType::getDataBuffer().zeroInit(); setBasisState_(value, index, use_async); } + /** - * @brief Set values for a batch of elements of the state-vector. This - * method is implemented by the customized CUDA kernel defined in the - * DataBuffer class. + * @brief Set values for a batch of elements of the state-vector. * - * @param num_indices Number of elements to be passed to the state vector. - * @param values Pointer to values to be set for the target elements. - * @param indices Pointer to indices of the target elements. - * @param async Use an asynchronous memory copy. + * @param state_ptr Pointer to initial state data. + * @param num_states Length of initial state data. + * @param wires Wires. + * @param use_async Use an asynchronous memory copy. Default is false. 
*/ - template - void setStateVector(const index_type num_indices, - const std::complex *values, - const index_type *indices, const bool async = false) { - BaseType::getDataBuffer().zeroInit(); + void setStateVector(const ComplexT *state_ptr, const std::size_t num_states, + const std::vector &wires, + bool use_async = false) { + PL_ABORT_IF_NOT(num_states == Pennylane::Util::exp2(wires.size()), + "Inconsistent state and wires dimensions."); - std::vector indices_local; - std::vector> values_local; + const auto num_qubits = this->getTotalNumQubits(); - for (std::size_t i = 0; i < static_cast(num_indices); - i++) { - int index = indices[i]; - PL_ASSERT(index >= 0); - std::size_t rankId = - static_cast(index) >> BaseType::getNumQubits(); + PL_ABORT_IF_NOT(std::find_if(wires.begin(), wires.end(), + [&num_qubits](const auto i) { + return i >= num_qubits; + }) == wires.end(), + "Invalid wire index."); - if (rankId == mpi_manager_.getRank()) { - int local_index = static_cast( - compute_local_index(static_cast(index), - this->getNumLocalQubits())); - indices_local.push_back(local_index); - values_local.push_back(values[i]); + using index_type = + typename std::conditional::value, + int32_t, int64_t>::type; + + // Calculate the indices of the state-vector to be set. + // TODO: Could move to GPU/MPI calculation if the state size is large. 
+ std::vector indices(num_states); + const std::size_t num_wires = wires.size(); + constexpr std::size_t one{1U}; + for (std::size_t i = 0; i < num_states; i++) { + std::size_t index{0U}; + for (std::size_t j = 0; j < num_wires; j++) { + const std::size_t bit = (i & (one << j)) >> j; + index |= bit << (num_qubits - 1 - wires[num_wires - 1 - j]); } + indices[i] = static_cast(index); } - - auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID(); - auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID(); - - index_type num_elements = indices_local.size(); - - DataBuffer d_indices{ - static_cast(num_elements), device_id, stream_id, true}; - - DataBuffer d_values{static_cast(num_elements), - device_id, stream_id, true}; - - d_indices.CopyHostDataToGpu(indices_local.data(), d_indices.getLength(), - async); - d_values.CopyHostDataToGpu(values_local.data(), d_values.getLength(), - async); - - setStateVector_CUDA(BaseType::getData(), num_elements, - d_values.getData(), d_indices.getData(), - thread_per_block, stream_id); - PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + setStateVector_(num_states, state_ptr, indices.data(), + use_async); mpi_manager_.Barrier(); } @@ -1596,6 +1583,62 @@ class StateVectorCudaMPI final return t_indices; } + /** + * @brief Set values for a batch of elements of the state-vector. This + * method is implemented by the customized CUDA kernel defined in the + * DataBuffer class. + * + * @param num_indices Number of elements to be passed to the state vector. + * @param values Pointer to values to be set for the target elements. + * @param indices Pointer to indices of the target elements. + * @param async Use an asynchronous memory copy. 
+ */ + template + void setStateVector_(const index_type num_indices, + const std::complex *values, + const index_type *indices, const bool async = false) { + BaseType::getDataBuffer().zeroInit(); + + std::vector indices_local; + std::vector> values_local; + + for (std::size_t i = 0; i < static_cast(num_indices); + i++) { + int index = indices[i]; + PL_ASSERT(index >= 0); + std::size_t rankId = + static_cast(index) >> BaseType::getNumQubits(); + + if (rankId == mpi_manager_.getRank()) { + int local_index = static_cast( + compute_local_index(static_cast(index), + this->getNumLocalQubits())); + indices_local.push_back(local_index); + values_local.push_back(values[i]); + } + } + + auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID(); + auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID(); + + index_type num_elements = indices_local.size(); + + DataBuffer d_indices{ + static_cast(num_elements), device_id, stream_id, true}; + + DataBuffer d_values{static_cast(num_elements), + device_id, stream_id, true}; + + d_indices.CopyHostDataToGpu(indices_local.data(), d_indices.getLength(), + async); + d_values.CopyHostDataToGpu(values_local.data(), d_values.getLength(), + async); + + setStateVector_CUDA(BaseType::getData(), num_elements, + d_values.getData(), d_indices.getData(), + thread_per_block, stream_id); + } + /** * @brief Set value for a single element of the state-vector on device. This * method is implemented by cudaMemcpy. @@ -1685,8 +1728,8 @@ class StateVectorCudaMPI final } /** - * @brief Apply parametric Pauli gates to local statevector using custateVec - * calls. + * @brief Apply parametric Pauli gates to local statevector using + * custateVec calls. * * @param pauli_words List of Pauli words representing operation. 
* @param ctrls Control wires @@ -1756,7 +1799,8 @@ class StateVectorCudaMPI final }); // Initialize a vector to store the status of wires and default its - // elements as zeros, which assumes there is no target and control wire. + // elements as zeros, which assumes there is no target and control + // wire. std::vector statusWires(this->getTotalNumQubits(), WireStatus::Default); @@ -1916,7 +1960,8 @@ class StateVectorCudaMPI final }); // Initialize a vector to store the status of wires and default its - // elements as zeros, which assumes there is no target and control wire. + // elements as zeros, which assumes there is no target and control + // wire. std::vector statusWires(this->getTotalNumQubits(), WireStatus::Default); @@ -2057,7 +2102,8 @@ class StateVectorCudaMPI final }); // Initialize a vector to store the status of wires and default its - // elements as zeros, which assumes there is no target and control wire. + // elements as zeros, which assumes there is no target and control + // wire. std::vector statusWires(this->getTotalNumQubits(), WireStatus::Default); diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp index aa3429ceca..4d31b03e3e 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp @@ -215,36 +215,46 @@ class StateVectorCudaManaged } /** - * @brief Set values for a batch of elements of the state-vector. This - * method is implemented by the customized CUDA kernel defined in the - * DataBuffer class. + * @brief Set values for a batch of elements of the state-vector. * - * @param num_indices Number of elements to be passed to the state vector. - * @param values Pointer to values to be set for the target elements. - * @param indices Pointer to indices of the target elements. 
- * @param async Use an asynchronous memory copy. + * @param state_ptr Pointer to the initial state data. + * @param num_states Length of the initial state data. + * @param wires Wires. + * @param use_async Use an asynchronous memory copy. Default is false. */ - template - void setStateVector(const index_type num_indices, - const std::complex *values, - const index_type *indices, const bool async = false) { - BaseType::getDataBuffer().zeroInit(); - - auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID(); - auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID(); - - index_type num_elements = num_indices; - DataBuffer d_indices{ - static_cast(num_elements), device_id, stream_id, true}; - DataBuffer d_values{static_cast(num_elements), - device_id, stream_id, true}; + void setStateVector(const ComplexT *state_ptr, const std::size_t num_states, + const std::vector &wires, + bool use_async = false) { + PL_ABORT_IF_NOT(num_states == Pennylane::Util::exp2(wires.size()), + "Inconsistent state and wires dimensions."); - d_indices.CopyHostDataToGpu(indices, d_indices.getLength(), async); - d_values.CopyHostDataToGpu(values, d_values.getLength(), async); + const auto num_qubits = BaseType::getNumQubits(); - setStateVector_CUDA(BaseType::getData(), num_elements, - d_values.getData(), d_indices.getData(), - thread_per_block, stream_id); + PL_ABORT_IF_NOT(std::find_if(wires.begin(), wires.end(), + [&num_qubits](const auto i) { + return i >= num_qubits; + }) == wires.end(), + "Invalid wire index."); + + using index_type = + typename std::conditional::value, + int32_t, int64_t>::type; + + // Calculate the indices of the state-vector to be set. + // TODO: Could move to GPU calculation if the state size is large. 
+ std::vector indices(num_states); + const std::size_t num_wires = wires.size(); + constexpr std::size_t one{1U}; + for (std::size_t i = 0; i < num_states; i++) { + std::size_t index{0U}; + for (std::size_t j = 0; j < num_wires; j++) { + const std::size_t bit = (i & (one << j)) >> j; + index |= bit << (num_qubits - 1 - wires[num_wires - 1 - j]); + } + indices[i] = static_cast(index); + } + setStateVector_(num_states, state_ptr, indices.data(), + use_async); } /** @@ -1381,9 +1391,8 @@ class StateVectorCudaManaged return t_indices; } - /** - * @brief Set value for a single element of the state-vector on device. This - * method is implemented by cudaMemcpy. + /** @brief Set value for a single element of the state-vector on device. + * This method is implemented by cudaMemcpy. * * @param value Value to be set for the target element. * @param index Index of the target element. @@ -1397,6 +1406,40 @@ class StateVectorCudaManaged stream_id); } + /** + * @brief Set values for a batch of elements of the state-vector. This + * method is implemented by the customized CUDA kernel defined in the + * DataBuffer class. + * + * @param num_indices Number of elements to be passed to the state vector. + * @param values Pointer to values to be set for the target elements. + * @param indices Pointer to indices of the target elements. + * @param async Use an asynchronous memory copy. 
+ */ + template + void setStateVector_(const index_type num_indices, + const std::complex *values, + const index_type *indices, const bool async = false) { + BaseType::getDataBuffer().zeroInit(); + + auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID(); + auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID(); + + index_type num_elements = num_indices; + DataBuffer d_indices{ + static_cast(num_elements), device_id, stream_id, true}; + DataBuffer d_values{static_cast(num_elements), + device_id, stream_id, true}; + + d_indices.CopyHostDataToGpu(indices, d_indices.getLength(), async); + d_values.CopyHostDataToGpu(values, d_values.getLength(), async); + + setStateVector_CUDA(BaseType::getData(), num_elements, + d_values.getData(), d_indices.getData(), + thread_per_block, stream_id); + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + } + /** * @brief Apply parametric Pauli gates using custateVec calls. * diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp index b2d03eba5a..c361bd6ed9 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp @@ -63,10 +63,6 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) { using ParamT = PrecisionT; // Parameter's data precision using np_arr_c = py::array_t, py::array::c_style | py::array::forcecast>; - using np_arr_sparse_ind = typename std::conditional< - std::is_same::value, - py::array_t, - py::array_t>::type; registerGatesForStateVector(pyclass); @@ -91,20 +87,15 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) { "Set the state vector to a basis state on GPU.") .def( "setStateVector", - [](StateVectorT &sv, const np_arr_sparse_ind &indices, - const np_arr_c &state, const bool use_async) { - using index_type = typename std::conditional< - 
std::is_same::value, int32_t, int64_t>::type; - - sv.template setStateVector( - static_cast(indices.request().size), - static_cast *>( - state.request().ptr), - static_cast(indices.request().ptr), - use_async); + [](StateVectorT &sv, const np_arr_c &state, + const std::vector &wires, const bool async) { + const auto state_buffer = state.request(); + const auto state_ptr = + static_cast *>(state_buffer.ptr); + sv.setStateVector(state_ptr, state_buffer.size, wires, async); }, - "Set State Vector on GPU with values and their corresponding " - "indices for the state vector on device") + "Set State Vector on GPU with values for the state vector and " + "wires on the host memory.") .def( "DeviceToDevice", [](StateVectorT &sv, const StateVectorT &other, bool async) { diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp index f6a933aca3..2d3313f694 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp @@ -63,10 +63,6 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) { using ParamT = PrecisionT; // Parameter's data precision using np_arr_c = py::array_t, py::array::c_style | py::array::forcecast>; - using np_arr_sparse_ind = typename std::conditional< - std::is_same::value, - py::array_t, - py::array_t>::type; registerGatesForStateVector(pyclass); @@ -95,20 +91,15 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) { "Set the state vector to a basis state on GPU.") .def( "setStateVector", - [](StateVectorT &sv, const np_arr_sparse_ind &indices, - const np_arr_c &state, const bool use_async) { - using index_type = typename std::conditional< - std::is_same::value, int32_t, int64_t>::type; - - sv.template setStateVector( - static_cast(indices.request().size), - static_cast *>( - 
state.request().ptr), - static_cast(indices.request().ptr), - use_async); + [](StateVectorT &sv, const np_arr_c &state, + const std::vector &wires, const bool async) { + const auto state_buffer = state.request(); + const auto state_ptr = + static_cast *>(state_buffer.ptr); + sv.setStateVector(state_ptr, state_buffer.size, wires, async); }, - "Set State Vector on GPU with values and their corresponding " - "indices for the state vector on device") + "Set State Vector on GPU with values for the state vector and " + "wires on the host memory.") .def( "DeviceToDevice", [](StateVectorT &sv, const StateVectorT &other, bool async) { diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp index dc0c1a7128..af864d8b01 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp @@ -1069,68 +1069,15 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::SetStateVector", } StateVectorCudaManaged sv{num_qubits}; - sv.CopyHostDataToGpu(init_state.data(), init_state.size()); - - using index_type = - typename std::conditional::value, - int32_t, int64_t>::type; - // The setStates will shuffle the state vector values on the device with - // the following indices and values setting on host. For example, the - // values[i] is used to set the indices[i] th element of state vector on - // the device. For example, values[2] (init_state[5]) will be copied to - // indices[2]th or (4th) element of the state vector. 
- std::vector indices = {0, 2, 4, 6, 1, 3, 5, 7}; - - std::vector> values = { - init_state[1], init_state[3], init_state[5], init_state[7], - init_state[0], init_state[2], init_state[4], init_state[6]}; - - sv.template setStateVector(values.size(), values.data(), - indices.data(), false); - CHECK(expected_state == Pennylane::Util::approx(sv.getDataVector())); - } -} -// LCOV_EXCL_START -TEMPLATE_TEST_CASE("StateVectorCudaManaged::SetStateVectorwith_thread_setting", - "[StateVectorCudaManaged_Nonparam]", float, double) { - using PrecisionT = TestType; - const std::size_t num_qubits = 3; - std::mt19937 re{1337}; - - SECTION("SetStates with a non-default GPU thread setting") { - auto init_state = - createRandomStateVectorData(re, num_qubits); - auto expected_state = init_state; + std::vector> values(init_state.begin(), + init_state.end()); - for (std::size_t i = 0; i < Pennylane::Util::exp2(num_qubits - 1); - i++) { - std::swap(expected_state[i * 2], expected_state[i * 2 + 1]); - } - - StateVectorCudaManaged sv{num_qubits}; - sv.CopyHostDataToGpu(init_state.data(), init_state.size()); - - using index_type = - typename std::conditional::value, - int32_t, int64_t>::type; - - std::vector indices = {0, 2, 4, 6, 1, 3, 5, 7}; - - std::vector> values = { - init_state[1], init_state[3], init_state[5], init_state[7], - init_state[0], init_state[2], init_state[4], init_state[6]}; - - // default setting of the number of threads in a block is 256. 
- const std::size_t threads_per_block = 1024; - - sv.template setStateVector( - values.size(), values.data(), indices.data(), false); - - CHECK(expected_state == Pennylane::Util::approx(sv.getDataVector())); + sv.setStateVector(values.data(), values.size(), + std::vector{0, 1, 2}); + CHECK(init_state == Pennylane::Util::approx(sv.getDataVector())); } } -// LCOV_EXCL_STOP TEMPLATE_TEST_CASE("StateVectorCudaManaged::SetIthStates", "[StateVectorCudaManaged_Nonparam]", float, double) { diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp index b9ed7fcbef..968badd4dc 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -158,21 +159,17 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::SetStateVector", "the host") { StateVectorCudaMPI sv(mpi_manager, dt_local, mpi_buffersize, nGlobalIndexBits, nLocalIndexBits); - // The setStates will shuffle the state vector values on the device with - // the following indices and values setting on host. For example, the - // values[i] is used to set the indices[i] th element of state vector on - // the device. For example, values[2] (init_state[5]) will be copied to - // indices[2]th or (4th) element of the state vector. 
- sv.template setStateVector( - init_state.size(), init_state.data(), indices.data(), false); + std::vector> values(init_state.begin(), + init_state.end()); + std::vector wires(num_qubits); + std::iota(wires.begin(), wires.end(), 0); + sv.setStateVector(values.data(), values.size(), wires); - mpi_manager.Barrier(); - sv.CopyGpuDataToHost(local_state.data(), - static_cast(subSvLength)); - mpi_manager.Barrier(); + auto expected_local_state_vector = mpi_manager.scatter(values, 0); - CHECK(expected_local_state == Pennylane::Util::approx(local_state)); + CHECK(expected_local_state_vector == + Pennylane::Util::approx(sv.getDataVector())); } } diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py index faca301e8c..a000443563 100644 --- a/pennylane_lightning/lightning_gpu/_state_vector.py +++ b/pennylane_lightning/lightning_gpu/_state_vector.py @@ -31,7 +31,6 @@ except ImportError as ex: warn(str(ex), UserWarning) -from itertools import product from typing import Union import numpy as np @@ -69,7 +68,7 @@ class LightningGPUStateVector(LightningBaseStateVector): device_name(string): state vector device name. Options: ["lightning.gpu"] mpi_handler(MPIHandler): MPI handler for PennyLane Lightning GPU device. Provides functionality to distribute the state-vector to multiple devices. - sync (bool): is host-device data copy synchronized or not. + use_async (bool): whether the host-device data copy is asynchronous.
""" def __init__( @@ -77,7 +76,7 @@ def __init__( num_wires: int, dtype: Union[np.complex128, np.complex64] = np.complex128, mpi_handler: MPIHandler = None, - sync: bool = True, + use_async: bool = False, ): super().__init__(num_wires, dtype) @@ -92,7 +91,7 @@ def __init__( self._num_local_wires = mpi_handler.num_local_wires self._mpi_handler = mpi_handler - self._sync = sync + self._use_async = use_async # Initialize the state vector if self._mpi_handler.use_mpi: # using MPI @@ -120,7 +119,7 @@ def _state_dtype(self): # without MPI return StateVectorC128 if self.dtype == np.complex128 else StateVectorC64 - def syncD2H(self, state_vector, use_async=False): + def syncD2H(self, state_vector, use_async: bool = False): """Copy the state vector data on device to a state vector on the host provided by the user. Args: state_vector(array[complex]): the state vector array on host. @@ -155,7 +154,7 @@ def state(self): self.syncD2H(state) return state - def syncH2D(self, state_vector, use_async=False): + def syncH2D(self, state_vector, use_async: bool = False): """Copy the state vector data on host provided by the user to the state vector on the device Args: state_vector(array[complex]): the state vector array on host. @@ -189,7 +188,7 @@ def _asarray(arr, dtype=None): return arr - def _apply_state_vector(self, state, device_wires, use_async=False): + def _apply_state_vector(self, state, device_wires, use_async: bool = False): """Initialize the state vector on GPU with a specified state on host. Note that any use of this method will introduce host-overheads. 
Args: @@ -224,20 +223,8 @@ def _apply_state_vector(self, state, device_wires, use_async=False): self.syncH2D(np.reshape(local_state, output_shape)) return - # generate basis states on subset of qubits via the cartesian product - basis_states = np.array(list(product([0, 1], repeat=len(device_wires)))) - - # get basis states to alter on full set of qubits - unravelled_indices = np.zeros((2 ** len(device_wires), self.num_wires), dtype=int) - unravelled_indices[:, device_wires] = basis_states - - # get indices for which the state is changed to input state vector elements - ravelled_indices = np.ravel_multi_index(unravelled_indices.T, [2] * self.num_wires) - - # set the state vector on GPU with the unravelled_indices and their corresponding values - self._qubit_state.setStateVector( - ravelled_indices, state, use_async - ) # this operation on device + # set the state vector on GPU with the provided state and its corresponding wires + self._qubit_state.setStateVector(state, list(device_wires), use_async) def _apply_lightning_controlled(self, operation): """Apply an arbitrary controlled operation to the state tensor. diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py index 84d7dd31e6..2b295c4990 100644 --- a/pennylane_lightning/lightning_gpu/lightning_gpu.py +++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py @@ -296,7 +296,7 @@ class LightningGPU(LightningBase): is built with MPI. Default is False. mpi (bool): declare if the device will use the MPI support. mpi_buf_size (int): size of GPU memory (in MiB) set for MPI operation and its default value is 64 MiB. - sync (bool): is host-device data copy synchronized or not. + use_async (bool): whether the host-device data copy is asynchronous.
""" # General device options @@ -326,7 +326,7 @@ def __init__( # pylint: disable=too-many-arguments # GPU and MPI arguments mpi: bool = False, mpi_buf_size: int = 0, - sync: bool = False, + use_async: bool = False, ): if not self._CPP_BINARY_AVAILABLE: raise ImportError( @@ -349,13 +349,16 @@ def __init__( # pylint: disable=too-many-arguments # GPU specific options self._dp = DevPool() - self._sync = sync + self._use_async = use_async # Creating the state vector self._mpi_handler = MPIHandler(mpi, mpi_buf_size, len(self.wires), c_dtype) self._statevector = self.LightningStateVector( - num_wires=len(self.wires), dtype=c_dtype, mpi_handler=self._mpi_handler, sync=self._sync + num_wires=len(self.wires), + dtype=c_dtype, + mpi_handler=self._mpi_handler, + use_async=self._use_async, ) @property From c7ea1a81112c0db6b4591dda49202d740a90b688 Mon Sep 17 00:00:00 2001 From: ringo-but-quantum Date: Tue, 8 Oct 2024 00:04:44 +0000 Subject: [PATCH 07/23] Auto update version from '0.39.0-dev40' to '0.39.0-dev41' --- pennylane_lightning/core/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index c6e0b5050a..ab5a1f0f5d 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.39.0-dev40" +__version__ = "0.39.0-dev41" From 829f8eb748b31882b163d9249a28ffc1e726df7f Mon Sep 17 00:00:00 2001 From: ringo-but-quantum Date: Tue, 8 Oct 2024 00:13:31 +0000 Subject: [PATCH 08/23] Auto update version from '0.39.0-dev40' to '0.39.0-dev41' --- pennylane_lightning/core/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index c6e0b5050a..ab5a1f0f5d 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number 
(major.minor.patch[-label]) """ -__version__ = "0.39.0-dev40" +__version__ = "0.39.0-dev41" From 2493260f20ffdf6018f1ecc8c26b43efc470cfee Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 8 Oct 2024 22:30:37 +0000 Subject: [PATCH 09/23] update collapse with custatevec apis --- .../lightning_gpu/StateVectorCudaMPI.hpp | 91 +++++++++++++------ .../lightning_gpu/StateVectorCudaManaged.hpp | 47 ++++++---- .../core/src/utils/cuda_utils/LinearAlg.hpp | 55 ----------- 3 files changed, 92 insertions(+), 101 deletions(-) diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp index f985d1b19e..389da68b47 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp @@ -347,37 +347,22 @@ class StateVectorCudaMPI final PL_ABORT_IF_NOT(wire < this->getTotalNumQubits(), "Invalid wire index."); - std::vector matrix(4, ComplexT(0.0, 0.0)); + const int wireInt = + static_cast(this->getTotalNumQubits() - 1 - wire); - for (std::size_t i = 0; i < matrix.size(); i++) { - matrix[i] = ((i == 0 && branch == 0) || (i == 3 && branch == 1)) - ? 
ComplexT{1.0, 0.0} - : ComplexT{0.0, 0.0}; - } - - mpi_manager_.Barrier(); - - applyMatrix(matrix, {wire}, false); - - auto local_norm2 = norm2_CUDA( - BaseType::getData(), BaseType::getLength(), - BaseType::getDataBuffer().getDevTag().getDeviceID(), - BaseType::getDataBuffer().getDevTag().getStreamID(), - this->getCublasCaller()); - - local_norm2 *= local_norm2; - - mpi_manager_.Barrier(); - - auto norm2 = mpi_manager_.allreduce(local_norm2, "sum"); - - norm2 = std::sqrt(norm2); + if (static_cast(wireInt) < BaseType::getNumQubits()) { + // local wire + collapse_local_(wireInt, branch); + } else { + // global wire + int local_wire = 0; + std::vector wirePairs{make_int2(wireInt, local_wire)}; + applyMPI_Dispatcher(wirePairs, &StateVectorCudaMPI::collapse_local_, + local_wire, branch); - normalize_CUDA( - norm2, BaseType::getData(), BaseType::getLength(), - BaseType::getDataBuffer().getDevTag().getDeviceID(), - BaseType::getDataBuffer().getDevTag().getStreamID(), - this->getCublasCaller()); + PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(localStream_.get())); + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + } mpi_manager_.Barrier(); } @@ -1665,6 +1650,54 @@ class StateVectorCudaMPI final mpi_manager_.Barrier(); } + /** + * @brief Collapse the state vector onto a measurement branch of a local wire. + * + * @param wire_local Local wire index. + * @param branch Branch index.
+ */ + void collapse_local_(const int wire_local, const bool branch) { + cudaDataType_t data_type; + + if constexpr (std::is_same_v || + std::is_same_v) { + data_type = CUDA_C_64F; + } else { + data_type = CUDA_C_32F; + } + + std::vector basisBits(1, wire_local); + + double abs2sum0_local, abs2sum1_local; + + PL_CUSTATEVEC_IS_SUCCESS(custatevecAbs2SumOnZBasis( + /* custatevecHandle_t */ handle_.get(), + /* void *sv */ BaseType::getData(), + /* cudaDataType_t */ data_type, + /* const uint32_t nIndexBits */ BaseType::getNumQubits(), + /* double * */ &abs2sum0_local, + /* double * */ &abs2sum1_local, + /* const int32_t * */ basisBits.data(), + /* const uint32_t nBasisBits */ basisBits.size())); + + auto abs2sum0 = mpi_manager_.allreduce(abs2sum0_local, "sum"); + auto abs2sum1 = mpi_manager_.allreduce(abs2sum1_local, "sum"); + + double norm = (branch == 0) ? abs2sum0 : abs2sum1; + + int parity = branch; + + PL_CUSTATEVEC_IS_SUCCESS(custatevecCollapseOnZBasis( + /* custatevecHandle_t */ handle_.get(), + /* void *sv */ BaseType::getData(), + /* cudaDataType_t */ data_type, + /* const uint32_t nIndexBits */ BaseType::getNumQubits(), + /* const int32_t parity */ parity, + /* const int32_t *basisBits */ basisBits.data(), + /* const uint32_t nBasisBits */ basisBits.size(), + /* double norm */ norm)); + } + /** * @brief Get expectation value for a sum of Pauli words. 
* diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp index 4d31b03e3e..377b6d729d 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp @@ -498,28 +498,41 @@ class StateVectorCudaManaged */ void collapse(const std::size_t wire, const bool branch) { PL_ABORT_IF_NOT(wire < BaseType::getNumQubits(), "Invalid wire index."); + cudaDataType_t data_type; - std::vector matrix(4, ComplexT(0.0, 0.0)); - - for (std::size_t i = 0; i < matrix.size(); i++) { - matrix[i] = ((i == 0 && branch == 0) || (i == 3 && branch == 1)) - ? ComplexT{1.0, 0.0} - : ComplexT{0.0, 0.0}; + if constexpr (std::is_same_v || + std::is_same_v) { + data_type = CUDA_C_64F; + } else { + data_type = CUDA_C_32F; } - applyMatrix(matrix, {wire}, false); + std::vector basisBits(1, BaseType::getNumQubits() - 1 - wire); - auto norm2 = norm2_CUDA( - BaseType::getData(), BaseType::getLength(), - BaseType::getDataBuffer().getDevTag().getDeviceID(), - BaseType::getDataBuffer().getDevTag().getStreamID(), - this->getCublasCaller()); + double abs2sum0, abs2sum1; + PL_CUSTATEVEC_IS_SUCCESS(custatevecAbs2SumOnZBasis( + /* custatevecHandle_t */ handle_.get(), + /* void *sv */ BaseType::getData(), + /* cudaDataType_t */ data_type, + /* const uint32_t nIndexBits */ BaseType::getNumQubits(), + /* double * */ &abs2sum0, + /* double * */ &abs2sum1, + /* const int32_t * */ basisBits.data(), + /* const uint32_t nBasisBits */ basisBits.size())); + + double norm = (branch == 0) ? 
abs2sum0 : abs2sum1; - normalize_CUDA( - norm2, BaseType::getData(), BaseType::getLength(), - BaseType::getDataBuffer().getDevTag().getDeviceID(), - BaseType::getDataBuffer().getDevTag().getStreamID(), - this->getCublasCaller()); + int parity = branch; + + PL_CUSTATEVEC_IS_SUCCESS(custatevecCollapseOnZBasis( + /* custatevecHandle_t */ handle_.get(), + /* void *sv */ BaseType::getData(), + /* cudaDataType_t */ data_type, + /* const uint32_t nIndexBits */ BaseType::getNumQubits(), + /* const int32_t parity */ parity, + /* const int32_t *basisBits */ basisBits.data(), + /* const uint32_t nBasisBits */ basisBits.size(), + /* double norm */ norm)); } //****************************************************************************// diff --git a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp index 984a9d2358..cd422899b5 100644 --- a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp +++ b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp @@ -274,61 +274,6 @@ inline auto scaleC_CUDA(const CFP_t a, T *v1, const int data_size, data_type); } -/** - * @brief cuBLAS backed GPU data normalization. - * - * @tparam CFP_t Complex float data-type. Accepts cuDoubleComplex and cuComplex - * @tparam DevTypeID Integer type of device id. - * - * @param v1 Device data pointer - * @param data_size Length of device data. - * @param dev_id the device on which the function should be executed. - * @param stream_id the CUDA stream on which the operation should be executed. - * @param cublas the CublasCaller object that manages the cuBLAS handle. 
- */ -template -inline auto norm2_CUDA(CFP_t *v1, const int data_size, DevTypeID dev_id, - cudaStream_t stream_id, const CublasCaller &cublas) { - if constexpr (std::is_same_v || - std::is_same_v) { - double norm{0.0}; - cublas.call(cublasDznrm2, dev_id, stream_id, data_size, v1, 1, &norm); - return norm; - } else { - float norm{0.0}; - cublas.call(cublasScnrm2, dev_id, stream_id, data_size, v1, 1, &norm); - return norm; - } -} - -/** - * @brief cuBLAS backed GPU data normalization. - * - * @tparam T Float data-type. Accepts float and double - * @tparam CFP_t Complex float data-type. Accepts cuDoubleComplex and cuComplex - * - * @param norm2 Norm of the vector - * @param v1 Device data pointer - * @param data_size Length of device data. - * @param dev_id the device on which the function should be executed. - * @param stream_id the CUDA stream on which the operation should be executed. - * @param cublas the CublasCaller object that manages the cuBLAS handle. - */ -template -inline auto normalize_CUDA(T norm2, CFP_t *v1, const int data_size, - DevTypeID dev_id, cudaStream_t stream_id, - const CublasCaller &cublas) { - if constexpr (std::is_same_v || - std::is_same_v) { - const double alpha = 1.0 / norm2; - cublas.call(cublasZdscal, dev_id, stream_id, data_size, &alpha, v1, 1); - } else { - const float alpha = 1.0 / norm2; - cublas.call(cublasCsscal, dev_id, stream_id, data_size, &alpha, v1, 1); - } -} - /** @brief `%CudaScopedDevice` uses RAII to select a CUDA device context. 
* * @see https://taskflow.github.io/taskflow/classtf_1_1cudaScopedDevice.html From 4238aeb3fe10dcdbe96a7deadf2e612948424a32 Mon Sep 17 00:00:00 2001 From: ringo-but-quantum Date: Tue, 8 Oct 2024 22:31:01 +0000 Subject: [PATCH 10/23] Auto update version from '0.39.0-dev41' to '0.39.0-dev42' --- pennylane_lightning/core/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index ab5a1f0f5d..59d75bd653 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.39.0-dev41" +__version__ = "0.39.0-dev42" From 15c7e2f6c374224c13b3cd742491a2d4447ed3ba Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 8 Oct 2024 23:14:55 +0000 Subject: [PATCH 11/23] add python layer --- .../lightning_gpu/bindings/LGPUBindings.hpp | 2 ++ .../bindings/LGPUBindingsMPI.hpp | 2 ++ .../lightning_gpu/_state_vector.py | 35 +++++++++++++++++-- .../lightning_gpu/lightning_gpu.py | 32 ++++++++++++++--- tests/test_native_mcm.py | 2 +- 5 files changed, 64 insertions(+), 9 deletions(-) diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp index c361bd6ed9..145097b30e 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp @@ -150,6 +150,8 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) { }, py::arg("async") = false, "Initialize the statevector data to the |0...0> state") + .def("collapse", &StateVectorT::collapse, + "Collapse the statevector onto the 0 or 1 branch of a given wire.") .def( "apply", [](StateVectorT &sv, const std::string &str, diff --git 
a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp index 2d3313f694..360af02fe9 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp @@ -154,6 +154,8 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) { }, py::arg("async") = false, "Initialize the statevector data to the |0...0> state") + .def("collapse", &StateVectorT::collapse, + "Collapse the statevector onto the 0 or 1 branch of a given wire.") .def( "apply", [](StateVectorT &sv, const std::string &str, diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py index a000443563..77e453778b 100644 --- a/pennylane_lightning/lightning_gpu/_state_vector.py +++ b/pennylane_lightning/lightning_gpu/_state_vector.py @@ -36,13 +36,17 @@ import numpy as np import pennylane as qml from pennylane import DeviceError +from pennylane.measurements import MidMeasureMP +from pennylane.ops import Conditional from pennylane.ops.op_math import Adjoint +from pennylane.tape import QuantumScript from pennylane.wires import Wires # pylint: disable=ungrouped-imports from pennylane_lightning.core._serialize import global_phase_diagonal from pennylane_lightning.core._state_vector_base import LightningBaseStateVector +from ._measurements import LightningGPUMeasurements from ._mpi_handler import MPIHandler gate_cache_needs_hash = ( @@ -247,15 +251,33 @@ def _apply_lightning_controlled(self, operation): matrix = global_phase_diagonal(param, self.wires, control_wires, control_values) state.apply(name, wires, inv, [[param]], matrix) - def _apply_lightning_midmeasure(self): + def _apply_lightning_midmeasure( + self, operation: MidMeasureMP, mid_measurements: dict, postselect_mode: str + ): """Execute a MidMeasureMP operation 
and return the sample in mid_measurements. Args: + operation (~pennylane.operation.Operation): mid-circuit measurement + mid_measurements (None, dict): Dictionary of mid-circuit measurements + postselect_mode (str): Configuration for handling shots with mid-circuit measurement + postselection. Use ``"hw-like"`` to discard invalid shots and ``"fill-shots"`` to + keep the same number of shots. Returns: None """ - raise DeviceError("LightningGPU does not support Mid-circuit measurements.") + wires = self.wires.indices(operation.wires) + wire = list(wires)[0] + if postselect_mode == "fill-shots" and operation.postselect is not None: + sample = operation.postselect + else: + circuit = QuantumScript([], [qml.sample(wires=operation.wires)], shots=1) + sample = LightningGPUMeasurements(self).measure_final_state(circuit) + sample = np.squeeze(sample) + mid_measurements[operation] = sample + getattr(self.state_vector, "collapse")(wire, bool(sample)) + if operation.reset and bool(sample): + self.apply_operations([qml.PauliX(operation.wires)], mid_measurements=mid_measurements) # pylint: disable=unused-argument def _apply_lightning( @@ -289,7 +311,14 @@ def _apply_lightning( method = getattr(state, name, None) wires = list(operation.wires) - if method is not None: # apply specialized gate + if isinstance(operation, Conditional): + if operation.meas_val.concretize(mid_measurements): + self._apply_lightning([operation.base]) + elif isinstance(operation, MidMeasureMP): + self._apply_lightning_midmeasure( + operation, mid_measurements, postselect_mode=postselect_mode + ) + elif method is not None: # apply specialized gate param = operation.parameters method(wires, invert_param, param) elif isinstance(operation, qml.ops.Controlled) and isinstance( diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py index 2b295c4990..c568caceb2 100644 --- a/pennylane_lightning/lightning_gpu/lightning_gpu.py +++ 
b/pennylane_lightning/lightning_gpu/lightning_gpu.py @@ -173,10 +173,7 @@ def stopping_condition(op: Operator) -> bool: def stopping_condition_shots(op: Operator) -> bool: """A function that determines whether or not an operation is supported by ``lightning.gpu`` with finite shots.""" - if isinstance(op, (MidMeasureMP, qml.ops.op_math.Conditional)): - # LightningGPU does not support Mid-circuit measurements. - return False - return stopping_condition(op) + return stopping_condition(op) or isinstance(op, (MidMeasureMP, qml.ops.op_math.Conditional)) def accepted_observables(obs: Operator) -> bool: @@ -460,6 +457,7 @@ def execute( self.simulate( circuit, self._statevector, + postselect_mode=execution_config.mcm_config.postselect_mode, ) ) @@ -494,20 +492,44 @@ def simulate( self, circuit: QuantumScript, state: LightningGPUStateVector, + postselect_mode: Optional[str] = None, ) -> Result: """Simulate a single quantum script. Args: circuit (QuantumTape): The single circuit to simulate state (LightningGPUStateVector): handle to Lightning state vector + postselect_mode (str): Configuration for handling shots with mid-circuit measurement + postselection. Use ``"hw-like"`` to discard invalid shots and ``"fill-shots"`` to + keep the same number of shots. Default is ``None``. Returns: Tuple[TensorLike]: The results of the simulation Note that this function can return measurements for non-commuting observables simultaneously. 
""" + #if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)): + # raise qml.DeviceError("LightningGPU does not support Mid-circuit measurements.") if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)): - raise qml.DeviceError("LightningGPU does not support Mid-circuit measurements.") + results = [] + aux_circ = qml.tape.QuantumScript( + circuit.operations, + circuit.measurements, + shots=[1], + trainable_params=circuit.trainable_params, + ) + for _ in range(circuit.shots.total_shots): + state.reset_state() + mid_measurements = {} + final_state = state.get_final_state( + aux_circ, mid_measurements=mid_measurements, postselect_mode=postselect_mode + ) + results.append( + self.LightningMeasurements(final_state).measure_final_state( + aux_circ, mid_measurements=mid_measurements + ) + ) + return tuple(results) state.reset_state() final_state = state.get_final_state(circuit) diff --git a/tests/test_native_mcm.py b/tests/test_native_mcm.py index 07281fb48a..516edab0cc 100644 --- a/tests/test_native_mcm.py +++ b/tests/test_native_mcm.py @@ -21,7 +21,7 @@ from conftest import LightningDevice, device_name, validate_measurements from flaky import flaky -if device_name not in ("lightning.qubit", "lightning.kokkos"): +if device_name not in ("lightning.qubit", "lightning.kokkos", "lightning.gpu"): pytest.skip("Native MCM not supported. 
Skipping.", allow_module_level=True) if not LightningDevice._CPP_BINARY_AVAILABLE: # pylint: disable=protected-access From 6dc7c03223d90f57d2508cbb978a423a964d1e57 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 8 Oct 2024 23:16:33 +0000 Subject: [PATCH 12/23] update changelog --- .github/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index 7fa9decd0c..4efd8f948b 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -2,7 +2,7 @@ ### New features since last release -* Add `collapse()` support to `lightning.gpu` C++ layer. +* Add `mid-circuit measurements` support to `lightning.gpu`. [(#931)](https://github.com/PennyLaneAI/pennylane-lightning/pull/931) * Add Matrix Product Operator (MPO) for all gates support to `lightning.tensor`. Note current C++ implementation only works for MPO sites data provided by users. From a57b5eeed2e873a698d3f14ca14220571d9da4c6 Mon Sep 17 00:00:00 2001 From: ringo-but-quantum Date: Tue, 8 Oct 2024 23:21:21 +0000 Subject: [PATCH 13/23] Auto update version from '0.39.0-dev41' to '0.39.0-dev42' --- pennylane_lightning/core/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index ab5a1f0f5d..59d75bd653 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.39.0-dev41" +__version__ = "0.39.0-dev42" From d608f06e064bcc2ba412f71308bf18a2a4f67249 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Wed, 9 Oct 2024 00:08:30 +0000 Subject: [PATCH 14/23] make format --- pennylane_lightning/core/src/utils/Util.hpp | 2 +- pennylane_lightning/lightning_gpu/lightning_gpu.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/pennylane_lightning/core/src/utils/Util.hpp 
b/pennylane_lightning/core/src/utils/Util.hpp index 5544c96ba9..5478cdbdcb 100644 --- a/pennylane_lightning/core/src/utils/Util.hpp +++ b/pennylane_lightning/core/src/utils/Util.hpp @@ -21,12 +21,12 @@ #include #include #include +#include // integral, floating_point #include #include // transform_reduce #include #include // is_same_v #include -#include // integral, floating_point #include "Error.hpp" #include "TypeTraits.hpp" // remove_complex_t diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py index c568caceb2..3407f5cbf9 100644 --- a/pennylane_lightning/lightning_gpu/lightning_gpu.py +++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py @@ -508,8 +508,6 @@ def simulate( Note that this function can return measurements for non-commuting observables simultaneously. """ - #if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)): - # raise qml.DeviceError("LightningGPU does not support Mid-circuit measurements.") if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)): results = [] aux_circ = qml.tape.QuantumScript( From 63facdd28e131742a44c17734ea93cfe2dc2640f Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Wed, 9 Oct 2024 15:26:55 +0000 Subject: [PATCH 15/23] drop mpi support for mid-measurement --- .github/CHANGELOG.md | 2 +- mpitests/test_native_mcm.py | 48 +++++++++++++++++++ .../bindings/LGPUBindingsMPI.hpp | 2 - .../lightning_gpu/lightning_gpu.py | 4 ++ .../lightning_gpu/lightning_gpu.toml | 2 +- tests/test_native_mcm.py | 2 +- 6 files changed, 55 insertions(+), 5 deletions(-) create mode 100644 mpitests/test_native_mcm.py diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index f20bf9324e..6cfe74e276 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -2,7 +2,7 @@ ### New features since last release -* Add `mid-circuit measurements` support to `lightning.gpu`. 
+* Add `mid-circuit measurements` support to `lightning.gpu`'s single-GPU backend. [(#931)](https://github.com/PennyLaneAI/pennylane-lightning/pull/931) * Add Matrix Product Operator (MPO) for all gates support to `lightning.tensor`. Note current C++ implementation only works for MPO sites data provided by users. diff --git a/mpitests/test_native_mcm.py b/mpitests/test_native_mcm.py new file mode 100644 index 0000000000..758376af35 --- /dev/null +++ b/mpitests/test_native_mcm.py @@ -0,0 +1,48 @@ +# Copyright 2024 Xanadu Quantum Technologies Inc. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for default qubit preprocessing.""" +from mpi4py import MPI +import numpy as np +import pennylane as qml +import pytest +from conftest import LightningDevice, device_name + +if device_name not in ("lightning.qubit", "lightning.kokkos", "lightning.gpu"): + pytest.skip("Native MCM not supported. Skipping.", allow_module_level=True) + +if not LightningDevice._CPP_BINARY_AVAILABLE: # pylint: disable=protected-access + pytest.skip("No binary module found. 
Skipping.", allow_module_level=True) + + +def test_unspported_mid_measurement(): + """Test unsupported mid_measurement for lightning.gpu-mpi.""" + comm = MPI.COMM_WORLD + dev = qml.device(device_name, wires=2, mpi=True, shots=1000) + params = np.pi / 4 * np.ones(2) + + @qml.qnode(dev) + def func(x, y): + qml.RX(x, wires=0) + m0 = qml.measure(0) + qml.cond(m0, qml.RY)(y, wires=1) + return qml.probs(wires=0) + + comm.Barrier() + + with pytest.raises( + qml.DeviceError, + match=f"LightningGPU-MPI does not support Mid-circuit measurements.", + ): + func(*params) + diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp index 360af02fe9..2d3313f694 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp @@ -154,8 +154,6 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) { }, py::arg("async") = false, "Initialize the statevector data to the |0...0> state") - .def("collapse", &StateVectorT::collapse, - "Collapse the statevector onto the 0 or 1 branch of a given wire.") .def( "apply", [](StateVectorT &sv, const std::string &str, diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py index 3407f5cbf9..2794e126d7 100644 --- a/pennylane_lightning/lightning_gpu/lightning_gpu.py +++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py @@ -350,6 +350,7 @@ def __init__( # pylint: disable=too-many-arguments # Creating the state vector self._mpi_handler = MPIHandler(mpi, mpi_buf_size, len(self.wires), c_dtype) + self._use_mpi = mpi self._statevector = self.LightningStateVector( num_wires=len(self.wires), @@ -509,6 +510,9 @@ def simulate( Note that this function can return measurements for non-commuting observables simultaneously. 
""" if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)): + if self._use_mpi : + raise qml.DeviceError("LightningGPU-MPI does not support Mid-circuit measurements.") + results = [] aux_circ = qml.tape.QuantumScript( circuit.operations, diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.toml b/pennylane_lightning/lightning_gpu/lightning_gpu.toml index 518315de09..b18470da6b 100644 --- a/pennylane_lightning/lightning_gpu/lightning_gpu.toml +++ b/pennylane_lightning/lightning_gpu/lightning_gpu.toml @@ -98,7 +98,7 @@ qjit_compatible = false # If the device requires run time generation of the quantum circuit. runtime_code_generation = false # If the device supports mid circuit measurements natively -mid_circuit_measurement = false +mid_circuit_measurement = true # This field is currently unchecked but it is reserved for the purpose of # determining if the device supports dynamic qubit allocation/deallocation. diff --git a/tests/test_native_mcm.py b/tests/test_native_mcm.py index 516edab0cc..050e1d27c6 100644 --- a/tests/test_native_mcm.py +++ b/tests/test_native_mcm.py @@ -89,7 +89,7 @@ def func(x, y): match=f"not accepted with finite shots on lightning.qubit", ): func(*params) - if device_name == "lightning.kokkos": + if device_name in ("lightning.kokkos", "lightning.gpu"): with pytest.raises( qml.DeviceError, match=r"Measurement shadow\(wires=\[0\]\) not accepted with finite shots on " From 53ec72b5dc992df5115f7e834c89c2d0850f8b8e Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Wed, 9 Oct 2024 15:27:26 +0000 Subject: [PATCH 16/23] make format --- mpitests/test_native_mcm.py | 3 +-- pennylane_lightning/lightning_gpu/lightning_gpu.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/mpitests/test_native_mcm.py b/mpitests/test_native_mcm.py index 758376af35..af489f7160 100644 --- a/mpitests/test_native_mcm.py +++ b/mpitests/test_native_mcm.py @@ -12,11 +12,11 @@ # See the License for the 
specific language governing permissions and # limitations under the License. """Tests for default qubit preprocessing.""" -from mpi4py import MPI import numpy as np import pennylane as qml import pytest from conftest import LightningDevice, device_name +from mpi4py import MPI if device_name not in ("lightning.qubit", "lightning.kokkos", "lightning.gpu"): pytest.skip("Native MCM not supported. Skipping.", allow_module_level=True) @@ -45,4 +45,3 @@ def func(x, y): match=f"LightningGPU-MPI does not support Mid-circuit measurements.", ): func(*params) - diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py index 2794e126d7..4a0f48fb5a 100644 --- a/pennylane_lightning/lightning_gpu/lightning_gpu.py +++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py @@ -510,7 +510,7 @@ def simulate( Note that this function can return measurements for non-commuting observables simultaneously. """ if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)): - if self._use_mpi : + if self._use_mpi: raise qml.DeviceError("LightningGPU-MPI does not support Mid-circuit measurements.") results = [] From 410d694cff05f79f5bfaffaf62286eee60eee159 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Thu, 10 Oct 2024 13:51:43 +0000 Subject: [PATCH 17/23] test mpi support --- mpitests/conftest.py | 84 +++++ mpitests/test_native_mcm.py | 324 +++++++++++++++++- .../bindings/LGPUBindingsMPI.hpp | 2 + .../lightning_gpu/lightning_gpu.py | 4 - 4 files changed, 402 insertions(+), 12 deletions(-) diff --git a/mpitests/conftest.py b/mpitests/conftest.py index 552cf9f330..6c079c86de 100644 --- a/mpitests/conftest.py +++ b/mpitests/conftest.py @@ -18,6 +18,8 @@ import itertools import os +from functools import reduce +from typing import Sequence import pennylane as qml import pytest @@ -125,3 +127,85 @@ def _device(wires): ) return _device + + 
+####################################################################### + + +def validate_counts(shots, results1, results2): + """Compares two counts. + + If the results are ``Sequence``s, loop over entries. + + Fails if a key of ``results1`` is not found in ``results2``. + Passes if counts are too low, chosen as ``100``. + Otherwise, fails if counts differ by more than ``20`` plus 20 percent. + """ + if isinstance(results1, Sequence): + assert isinstance(results2, Sequence) + assert len(results1) == len(results2) + for r1, r2 in zip(results1, results2): + validate_counts(shots, r1, r2) + return + for key1, val1 in results1.items(): + val2 = results2[key1] + if abs(val1 + val2) > 100: + assert np.allclose(val1, val2, rtol=20, atol=0.2) + + +def validate_samples(shots, results1, results2): + """Compares two samples. + + If the results are ``Sequence``s, loop over entries. + + Fails if the results do not have the same shape, within ``20`` entries plus 20 percent. + This is to handle cases when post-selection yields variable shapes. + Otherwise, fails if the sums of samples differ by more than ``20`` plus 20 percent. + """ + if isinstance(shots, Sequence): + assert isinstance(results1, Sequence) + assert isinstance(results2, Sequence) + assert len(results1) == len(results2) + for s, r1, r2 in zip(shots, results1, results2): + validate_samples(s, r1, r2) + else: + sh1, sh2 = results1.shape[0], results2.shape[0] + assert np.allclose(sh1, sh2, rtol=20, atol=0.2) + assert results1.ndim == results2.ndim + if results2.ndim > 1: + assert results1.shape[1] == results2.shape[1] + np.allclose(np.sum(results1), np.sum(results2), rtol=20, atol=0.2) + + +def validate_others(shots, results1, results2): + """Compares two expval, probs or var. + + If the results are ``Sequence``s, validate the average of items. + + If ``shots is None``, validate using ``np.allclose``'s default parameters. + Otherwise, fails if the results do not match within ``0.01`` plus 20 percent. 
+ """ + if isinstance(results1, Sequence): + assert isinstance(results2, Sequence) + assert len(results1) == len(results2) + results1 = reduce(lambda x, y: x + y, results1) / len(results1) + results2 = reduce(lambda x, y: x + y, results2) / len(results2) + validate_others(shots, results1, results2) + return + if shots is None: + assert np.allclose(results1, results2) + return + assert np.allclose(results1, results2, atol=0.01, rtol=0.2) + + +def validate_measurements(func, shots, results1, results2): + """Calls the correct validation function based on measurement type.""" + if func is qml.counts: + validate_counts(shots, results1, results2) + return + + if func is qml.sample: + validate_samples(shots, results1, results2) + return + + validate_others(shots, results1, results2) diff --git a/mpitests/test_native_mcm.py b/mpitests/test_native_mcm.py index af489f7160..563de142be 100644 --- a/mpitests/test_native_mcm.py +++ b/mpitests/test_native_mcm.py @@ -12,10 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tests for default qubit preprocessing.""" +from functools import reduce +from typing import Sequence + import numpy as np import pennylane as qml import pytest -from conftest import LightningDevice, device_name +from conftest import LightningDevice, device_name, validate_measurements +from flaky import flaky from mpi4py import MPI if device_name not in ("lightning.qubit", "lightning.kokkos", "lightning.gpu"): @@ -25,8 +29,53 @@ pytest.skip("No binary module found. 
Skipping.", allow_module_level=True) -def test_unspported_mid_measurement(): - """Test unsupported mid_measurement for lightning.gpu-mpi.""" +def get_device(wires, **kwargs): + kwargs.setdefault("shots", None) + return qml.device(device_name, wires=wires, mpi=True, **kwargs) + + +def test_all_invalid_shots_circuit(): + """Test all invalid cases: expval, probs, var measurements.""" + comm = MPI.COMM_WORLD + dev = qml.device(device_name, wires=2) + dq = qml.device("default.qubit", wires=2) + + def circuit_op(): + m = qml.measure(0, postselect=1) + qml.cond(m, qml.PauliX)(1) + return ( + qml.expval(op=qml.PauliZ(1)), + qml.probs(op=qml.PauliY(0) @ qml.PauliZ(1)), + qml.var(op=qml.PauliZ(1)), + ) + + comm.Barrier() + res1 = qml.QNode(circuit_op, dq)() + res2 = qml.QNode(circuit_op, dev)(shots=10) + for r1, r2 in zip(res1, res2): + if isinstance(r1, Sequence): + assert len(r1) == len(r2) + assert np.all(np.isnan(r1)) + assert np.all(np.isnan(r2)) + + def circuit_mcm(): + m = qml.measure(0, postselect=1) + qml.cond(m, qml.PauliX)(1) + return qml.expval(op=m), qml.probs(op=m), qml.var(op=m) + + res1 = qml.QNode(circuit_mcm, dq)() + res2 = qml.QNode(circuit_mcm, dev)(shots=10) + + comm.Barrier() + for r1, r2 in zip(res1, res2): + if isinstance(r1, Sequence): + assert len(r1) == len(r2) + assert np.all(np.isnan(r1)) + assert np.all(np.isnan(r2)) + + +def test_unsupported_measurement(): + """Test unsupported ``qml.classical_shadow`` measurement on ``lightning.gpu`` .""" comm = MPI.COMM_WORLD dev = qml.device(device_name, wires=2, mpi=True, shots=1000) params = np.pi / 4 * np.ones(2) @@ -36,12 +85,271 @@ def func(x, y): qml.RX(x, wires=0) m0 = qml.measure(0) qml.cond(m0, qml.RY)(y, wires=1) - return qml.probs(wires=0) + return qml.classical_shadow(wires=0) + + comm.Barrier() + if device_name == "lightning.qubit": + with pytest.raises( + qml.DeviceError, + match=f"not accepted with finite shots on lightning.qubit", + ): + func(*params) + if device_name in ("lightning.kokkos", 
"lightning.gpu"): + with pytest.raises( + qml.DeviceError, + match=r"Measurement shadow\(wires=\[0\]\) not accepted with finite shots on " + + device_name, + ): + func(*params) + + +@pytest.mark.parametrize("mcm_method", ["deferred", "one-shot"]) +def test_qnode_mcm_method(mcm_method, mocker): + """Test that user specified qnode arg for mid-circuit measurements transform are used correctly""" + comm = MPI.COMM_WORLD + spy = ( + mocker.spy(qml.dynamic_one_shot, "_transform") + if mcm_method == "one-shot" + else mocker.spy(qml.defer_measurements, "_transform") + ) + other_spy = ( + mocker.spy(qml.defer_measurements, "_transform") + if mcm_method == "one-shot" + else mocker.spy(qml.dynamic_one_shot, "_transform") + ) + + shots = 10 + device = qml.device(device_name, wires=3, mpi=True, shots=shots) + comm.Barrier() + + @qml.qnode(device, mcm_method=mcm_method) + def f(x): + qml.RX(x, 0) + _ = qml.measure(0) + qml.CNOT([0, 1]) + return qml.sample(wires=[0, 1]) + + _ = f(np.pi / 8) + comm.Barrier() + + spy.assert_called_once() + other_spy.assert_not_called() + + +@pytest.mark.parametrize("postselect_mode", ["hw-like", "fill-shots"]) +def test_qnode_postselect_mode(postselect_mode): + """Test that user specified qnode arg for discarding invalid shots is used correctly""" + comm = MPI.COMM_WORLD + shots = 100 + device = qml.device(device_name, wires=3, mpi=True, shots=shots) + postselect = 1 + + @qml.qnode(device, postselect_mode=postselect_mode) + def f(x): + qml.RX(x, 0) + _ = qml.measure(0, postselect=postselect) + qml.CNOT([0, 1]) + return qml.sample(wires=[1]) + + # Using small-ish rotation angle ensures the number of valid shots will be less than the + # original number of shots. 
This helps avoid stochastic failures for the assertion below + res = f(np.pi / 2) + + comm.Barrier() + + if postselect_mode == "hw-like": + assert len(res) < shots + else: + assert len(res) == shots + assert np.allclose(res, postselect) + + +# pylint: disable=unused-argument +def obs_tape(x, y, z, reset=False, postselect=None): + qml.RX(x, 0) + qml.RZ(np.pi / 4, 0) + m0 = qml.measure(0, reset=reset) + qml.cond(m0 == 0, qml.RX)(np.pi / 4, 0) + qml.cond(m0 == 0, qml.RZ)(np.pi / 4, 0) + qml.cond(m0 == 1, qml.RX)(-np.pi / 4, 0) + qml.cond(m0 == 1, qml.RZ)(-np.pi / 4, 0) + qml.RX(y, 1) + qml.RZ(np.pi / 4, 1) + m1 = qml.measure(1, postselect=postselect) + qml.cond(m1 == 0, qml.RX)(np.pi / 4, 1) + qml.cond(m1 == 0, qml.RZ)(np.pi / 4, 1) + qml.cond(m1 == 1, qml.RX)(-np.pi / 4, 1) + qml.cond(m1 == 1, qml.RZ)(-np.pi / 4, 1) + return m0, m1 + + +@flaky(max_runs=5) +@pytest.mark.parametrize("shots", [5000, [5000, 5001]]) +@pytest.mark.parametrize("postselect", [None, 0, 1]) +@pytest.mark.parametrize("measure_f", [qml.counts, qml.expval, qml.probs, qml.sample, qml.var]) +@pytest.mark.parametrize( + "meas_obj", + [qml.PauliZ(0), qml.PauliY(1), [0], [0, 1], [1, 0], "mcm", "composite_mcm", "mcm_list"], +) +def test_simple_dynamic_circuit(shots, measure_f, postselect, meas_obj): + """Tests that LightningQubit handles a simple dynamic circuit with the following measurements: + * qml.counts with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list + * qml.expval with obs (comp basis or not), MCM, f(MCM), MCM list + * qml.probs with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list + * qml.sample with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list + * qml.var with obs (comp basis or not), MCM, f(MCM), MCM list + + The above combinations should work for finite shots, shot vectors and post-selecting of either the 0 or 1 branch. 
+ """ + comm = MPI.COMM_WORLD + + if measure_f in (qml.expval, qml.var) and ( + isinstance(meas_obj, list) or meas_obj == "mcm_list" + ): + pytest.skip("Can't use wires/mcm lists with var or expval") + + dq = qml.device("default.qubit", shots=shots) + dev = get_device(wires=3, shots=shots) + params = [np.pi / 2.5, np.pi / 3, -np.pi / 3.5] + + def func(x, y, z): + m0, m1 = obs_tape(x, y, z, postselect=postselect) + mid_measure = ( + m0 if meas_obj == "mcm" else (0.5 * m0 if meas_obj == "composite_mcm" else [m0, m1]) + ) + measurement_key = "wires" if isinstance(meas_obj, list) else "op" + measurement_value = mid_measure if isinstance(meas_obj, str) else meas_obj + return measure_f(**{measurement_key: measurement_value}) + + results1 = qml.QNode(func, dev, mcm_method="one-shot")(*params) + results2 = qml.QNode(func, dq, mcm_method="deferred")(*params) + comm.Barrier() + + validate_measurements(measure_f, shots, results1, results2) + + +@pytest.mark.parametrize("postselect", [None, 0, 1]) +@pytest.mark.parametrize("reset", [False, True]) +def test_multiple_measurements_and_reset(postselect, reset): + """Tests that LightningQubit handles a circuit with a single mid-circuit measurement with reset + and a conditional gate. Multiple measurements of the mid-circuit measurement value are + performed. 
This function also tests `reset` parametrizing over the parameter.""" + comm = MPI.COMM_WORLD + shots = 5000 + dq = qml.device("default.qubit", shots=shots) + dev = get_device(wires=3, shots=shots) + params = [np.pi / 2.5, np.pi / 3, -np.pi / 3.5] + obs = qml.PauliY(1) + + def func(x, y, z): + mcms = obs_tape(x, y, z, reset=reset, postselect=postselect) + return ( + qml.counts(op=obs), + qml.expval(op=mcms[0]), + qml.probs(op=obs), + qml.sample(op=mcms[0]), + qml.var(op=obs), + ) + + results1 = qml.QNode(func, dev, mcm_method="one-shot")(*params) + results2 = qml.QNode(func, dq, mcm_method="deferred")(*params) comm.Barrier() - with pytest.raises( - qml.DeviceError, - match=f"LightningGPU-MPI does not support Mid-circuit measurements.", + for measure_f, r1, r2 in zip( + [qml.counts, qml.expval, qml.probs, qml.sample, qml.var], results1, results2 ): - func(*params) + validate_measurements(measure_f, shots, r1, r2) + + +@pytest.mark.parametrize( + "mcm_f", + [ + lambda x: x * -1, + lambda x: x * 1, + lambda x: x * 2, + lambda x: 1 - x, + lambda x: x + 1, + lambda x: x & 3, + "mix", + "list", + ], +) +@pytest.mark.parametrize("measure_f", [qml.counts, qml.expval, qml.probs, qml.sample, qml.var]) +def test_composite_mcms(mcm_f, measure_f): + """Tests that LightningQubit handles a circuit with a composite mid-circuit measurement and a + conditional gate. A single measurement of a composite mid-circuit measurement is performed + at the end.""" + comm = MPI.COMM_WORLD + if measure_f in (qml.expval, qml.var) and (mcm_f in ("list", "mix")): + pytest.skip( + "expval/var does not support measuring sequences of measurements or observables." + ) + + if measure_f == qml.probs and mcm_f == "mix": + pytest.skip( + "Cannot use qml.probs() when measuring multiple mid-circuit measurements collected using arithmetic operators." 
+ ) + + shots = 3000 + + dq = qml.device("default.qubit", shots=shots) + dev = get_device(wires=3, shots=shots) + param = np.pi / 3 + + @qml.qnode(dev) + def func(x): + qml.RX(x, 0) + m0 = qml.measure(0) + qml.RX(0.5 * x, 1) + m1 = qml.measure(1) + qml.cond((m0 + m1) == 2, qml.RY)(2.0 * x, 0) + m2 = qml.measure(0) + obs = ( + (m0 - 2 * m1) * m2 + 7 + if mcm_f == "mix" + else ([m0, m1, m2] if mcm_f == "list" else mcm_f(m2)) + ) + return measure_f(op=obs) + + results1 = qml.QNode(func, dev, mcm_method="one-shot")(param) + results2 = qml.QNode(func, dq, mcm_method="deferred")(param) + + comm.Barrier() + + validate_measurements(measure_f, shots, results1, results2) + + +@pytest.mark.parametrize( + "mcm_f", + [ + lambda x, y: x + y, + lambda x, y: x - 7 * y, + lambda x, y: x & y, + lambda x, y: x == y, + lambda x, y: 4.0 * x + 2.0 * y, + ], +) +def test_counts_return_type(mcm_f): + """Tests that LightningQubit returns the same keys for ``qml.counts`` measurements with ``dynamic_one_shot`` and ``defer_measurements``.""" + comm = MPI.COMM_WORLD + shots = 500 + + dq = qml.device("default.qubit", shots=shots) + dev = get_device(wires=3, shots=shots) + param = np.pi / 3 + + @qml.qnode(dev) + def func(x): + qml.RX(x, 0) + m0 = qml.measure(0) + qml.RX(0.5 * x, 1) + m1 = qml.measure(1) + qml.cond((m0 + m1) == 2, qml.RY)(2.0 * x, 0) + return qml.counts(op=mcm_f(m0, m1)) + + results1 = qml.QNode(func, dev, mcm_method="one-shot")(param) + results2 = qml.QNode(func, dq, mcm_method="deferred")(param) + comm.Barrier() + for r1, r2 in zip(results1.keys(), results2.keys()): + assert r1 == r2 diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp index 2d3313f694..529f5ae75e 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp @@ 
-100,6 +100,8 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) { }, "Set State Vector on GPU with values for the state vector and " "wires on the host memory.") + .def("collapse", &StateVectorT::collapse, + "Collapse the statevector onto the 0 or 1 branch of a given wire.") .def( "DeviceToDevice", [](StateVectorT &sv, const StateVectorT &other, bool async) { diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py index 4a0f48fb5a..3407f5cbf9 100644 --- a/pennylane_lightning/lightning_gpu/lightning_gpu.py +++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py @@ -350,7 +350,6 @@ def __init__( # pylint: disable=too-many-arguments # Creating the state vector self._mpi_handler = MPIHandler(mpi, mpi_buf_size, len(self.wires), c_dtype) - self._use_mpi = mpi self._statevector = self.LightningStateVector( num_wires=len(self.wires), @@ -510,9 +509,6 @@ def simulate( Note that this function can return measurements for non-commuting observables simultaneously. 
""" if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)): - if self._use_mpi: - raise qml.DeviceError("LightningGPU-MPI does not support Mid-circuit measurements.") - results = [] aux_circ = qml.tape.QuantumScript( circuit.operations, From 6a010f15cc30a96c4c4e76108931f44e7f741881 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Thu, 10 Oct 2024 17:50:43 +0000 Subject: [PATCH 18/23] tidy up the code --- .../src/simulators/lightning_gpu/StateVectorCudaMPI.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp index 389da68b47..cca7ad4894 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp @@ -355,7 +355,7 @@ class StateVectorCudaMPI final collapse_local_(wireInt, branch); } else { // global wire - int local_wire = 0; + constexpr int local_wire = 0; std::vector wirePairs{make_int2(wireInt, local_wire)}; applyMPI_Dispatcher(wirePairs, &StateVectorCudaMPI::collapse_local_, local_wire, branch); @@ -1680,8 +1680,8 @@ class StateVectorCudaMPI final /* const int32_t * */ basisBits.data(), /* const uint32_t nBasisBits */ basisBits.size())); - auto abs2sum0 = mpi_manager_.allreduce(abs2sum0_local, "sum"); - auto abs2sum1 = mpi_manager_.allreduce(abs2sum1_local, "sum"); + auto abs2sum0 = mpi_manager_.allreduce(abs2sum0_local, "sum"); + auto abs2sum1 = mpi_manager_.allreduce(abs2sum1_local, "sum"); double norm = (branch == 0) ? 
abs2sum0 : abs2sum1; From 7f4c97fb4b3d811bff050b1f2baf39d7b01914e4 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Thu, 10 Oct 2024 17:59:30 +0000 Subject: [PATCH 19/23] revert mpi support --- mpitests/conftest.py | 84 ----- mpitests/test_native_mcm.py | 326 +----------------- .../bindings/LGPUBindingsMPI.hpp | 2 - .../lightning_gpu/lightning_gpu.py | 4 + 4 files changed, 11 insertions(+), 405 deletions(-) diff --git a/mpitests/conftest.py b/mpitests/conftest.py index 6c079c86de..552cf9f330 100644 --- a/mpitests/conftest.py +++ b/mpitests/conftest.py @@ -18,8 +18,6 @@ import itertools import os -from functools import reduce -from typing import Sequence import pennylane as qml import pytest @@ -127,85 +125,3 @@ def _device(wires): ) return _device - - -####################################################################### - - -def validate_counts(shots, results1, results2): - """Compares two counts. - - If the results are ``Sequence``s, loop over entries. - - Fails if a key of ``results1`` is not found in ``results2``. - Passes if counts are too low, chosen as ``100``. - Otherwise, fails if counts differ by more than ``20`` plus 20 percent. - """ - if isinstance(results1, Sequence): - assert isinstance(results2, Sequence) - assert len(results1) == len(results2) - for r1, r2 in zip(results1, results2): - validate_counts(shots, r1, r2) - return - for key1, val1 in results1.items(): - val2 = results2[key1] - if abs(val1 + val2) > 100: - assert np.allclose(val1, val2, rtol=20, atol=0.2) - - -def validate_samples(shots, results1, results2): - """Compares two samples. - - If the results are ``Sequence``s, loop over entries. - - Fails if the results do not have the same shape, within ``20`` entries plus 20 percent. - This is to handle cases when post-selection yields variable shapes. - Otherwise, fails if the sums of samples differ by more than ``20`` plus 20 percent. 
- """ - if isinstance(shots, Sequence): - assert isinstance(results1, Sequence) - assert isinstance(results2, Sequence) - assert len(results1) == len(results2) - for s, r1, r2 in zip(shots, results1, results2): - validate_samples(s, r1, r2) - else: - sh1, sh2 = results1.shape[0], results2.shape[0] - assert np.allclose(sh1, sh2, rtol=20, atol=0.2) - assert results1.ndim == results2.ndim - if results2.ndim > 1: - assert results1.shape[1] == results2.shape[1] - np.allclose(np.sum(results1), np.sum(results2), rtol=20, atol=0.2) - - -def validate_others(shots, results1, results2): - """Compares two expval, probs or var. - - If the results are ``Sequence``s, validate the average of items. - - If ``shots is None``, validate using ``np.allclose``'s default parameters. - Otherwise, fails if the results do not match within ``0.01`` plus 20 percent. - """ - if isinstance(results1, Sequence): - assert isinstance(results2, Sequence) - assert len(results1) == len(results2) - results1 = reduce(lambda x, y: x + y, results1) / len(results1) - results2 = reduce(lambda x, y: x + y, results2) / len(results2) - validate_others(shots, results1, results2) - return - if shots is None: - assert np.allclose(results1, results2) - return - assert np.allclose(results1, results2, atol=0.01, rtol=0.2) - - -def validate_measurements(func, shots, results1, results2): - """Calls the correct validation function based on measurement type.""" - if func is qml.counts: - validate_counts(shots, results1, results2) - return - - if func is qml.sample: - validate_samples(shots, results1, results2) - return - - validate_others(shots, results1, results2) diff --git a/mpitests/test_native_mcm.py b/mpitests/test_native_mcm.py index 563de142be..4ca607e343 100644 --- a/mpitests/test_native_mcm.py +++ b/mpitests/test_native_mcm.py @@ -12,70 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Tests for default qubit preprocessing.""" -from functools import reduce -from typing import Sequence - import numpy as np import pennylane as qml import pytest -from conftest import LightningDevice, device_name, validate_measurements -from flaky import flaky +from conftest import LightningDevice, device_name from mpi4py import MPI -if device_name not in ("lightning.qubit", "lightning.kokkos", "lightning.gpu"): - pytest.skip("Native MCM not supported. Skipping.", allow_module_level=True) - if not LightningDevice._CPP_BINARY_AVAILABLE: # pylint: disable=protected-access pytest.skip("No binary module found. Skipping.", allow_module_level=True) -def get_device(wires, **kwargs): - kwargs.setdefault("shots", None) - return qml.device(device_name, wires=wires, mpi=True, **kwargs) - - -def test_all_invalid_shots_circuit(): - """Test all invalid cases: expval, probs, var measurements.""" - comm = MPI.COMM_WORLD - dev = qml.device(device_name, wires=2) - dq = qml.device("default.qubit", wires=2) - - def circuit_op(): - m = qml.measure(0, postselect=1) - qml.cond(m, qml.PauliX)(1) - return ( - qml.expval(op=qml.PauliZ(1)), - qml.probs(op=qml.PauliY(0) @ qml.PauliZ(1)), - qml.var(op=qml.PauliZ(1)), - ) - - comm.Barrier() - res1 = qml.QNode(circuit_op, dq)() - res2 = qml.QNode(circuit_op, dev)(shots=10) - for r1, r2 in zip(res1, res2): - if isinstance(r1, Sequence): - assert len(r1) == len(r2) - assert np.all(np.isnan(r1)) - assert np.all(np.isnan(r2)) - - def circuit_mcm(): - m = qml.measure(0, postselect=1) - qml.cond(m, qml.PauliX)(1) - return qml.expval(op=m), qml.probs(op=m), qml.var(op=m) - - res1 = qml.QNode(circuit_mcm, dq)() - res2 = qml.QNode(circuit_mcm, dev)(shots=10) - - comm.Barrier() - for r1, r2 in zip(res1, res2): - if isinstance(r1, Sequence): - assert len(r1) == len(r2) - assert np.all(np.isnan(r1)) - assert np.all(np.isnan(r2)) - - -def test_unsupported_measurement(): - """Test unsupported ``qml.classical_shadow`` measurement on ``lightning.gpu`` .""" 
+def test_unspported_mid_measurement(): + """Test unsupported mid_measurement for lightning.gpu-mpi.""" comm = MPI.COMM_WORLD dev = qml.device(device_name, wires=2, mpi=True, shots=1000) params = np.pi / 4 * np.ones(2) @@ -85,271 +33,11 @@ def func(x, y): qml.RX(x, wires=0) m0 = qml.measure(0) qml.cond(m0, qml.RY)(y, wires=1) - return qml.classical_shadow(wires=0) - - comm.Barrier() - if device_name == "lightning.qubit": - with pytest.raises( - qml.DeviceError, - match=f"not accepted with finite shots on lightning.qubit", - ): - func(*params) - if device_name in ("lightning.kokkos", "lightning.gpu"): - with pytest.raises( - qml.DeviceError, - match=r"Measurement shadow\(wires=\[0\]\) not accepted with finite shots on " - + device_name, - ): - func(*params) - - -@pytest.mark.parametrize("mcm_method", ["deferred", "one-shot"]) -def test_qnode_mcm_method(mcm_method, mocker): - """Test that user specified qnode arg for mid-circuit measurements transform are used correctly""" - comm = MPI.COMM_WORLD - spy = ( - mocker.spy(qml.dynamic_one_shot, "_transform") - if mcm_method == "one-shot" - else mocker.spy(qml.defer_measurements, "_transform") - ) - other_spy = ( - mocker.spy(qml.defer_measurements, "_transform") - if mcm_method == "one-shot" - else mocker.spy(qml.dynamic_one_shot, "_transform") - ) + return qml.probs(wires=0) - shots = 10 - device = qml.device(device_name, wires=3, mpi=True, shots=shots) comm.Barrier() - @qml.qnode(device, mcm_method=mcm_method) - def f(x): - qml.RX(x, 0) - _ = qml.measure(0) - qml.CNOT([0, 1]) - return qml.sample(wires=[0, 1]) - - _ = f(np.pi / 8) - comm.Barrier() - - spy.assert_called_once() - other_spy.assert_not_called() - - -@pytest.mark.parametrize("postselect_mode", ["hw-like", "fill-shots"]) -def test_qnode_postselect_mode(postselect_mode): - """Test that user specified qnode arg for discarding invalid shots is used correctly""" - comm = MPI.COMM_WORLD - shots = 100 - device = qml.device(device_name, wires=3, mpi=True, 
shots=shots) - postselect = 1 - - @qml.qnode(device, postselect_mode=postselect_mode) - def f(x): - qml.RX(x, 0) - _ = qml.measure(0, postselect=postselect) - qml.CNOT([0, 1]) - return qml.sample(wires=[1]) - - # Using small-ish rotation angle ensures the number of valid shots will be less than the - # original number of shots. This helps avoid stochastic failures for the assertion below - res = f(np.pi / 2) - - comm.Barrier() - - if postselect_mode == "hw-like": - assert len(res) < shots - else: - assert len(res) == shots - assert np.allclose(res, postselect) - - -# pylint: disable=unused-argument -def obs_tape(x, y, z, reset=False, postselect=None): - qml.RX(x, 0) - qml.RZ(np.pi / 4, 0) - m0 = qml.measure(0, reset=reset) - qml.cond(m0 == 0, qml.RX)(np.pi / 4, 0) - qml.cond(m0 == 0, qml.RZ)(np.pi / 4, 0) - qml.cond(m0 == 1, qml.RX)(-np.pi / 4, 0) - qml.cond(m0 == 1, qml.RZ)(-np.pi / 4, 0) - qml.RX(y, 1) - qml.RZ(np.pi / 4, 1) - m1 = qml.measure(1, postselect=postselect) - qml.cond(m1 == 0, qml.RX)(np.pi / 4, 1) - qml.cond(m1 == 0, qml.RZ)(np.pi / 4, 1) - qml.cond(m1 == 1, qml.RX)(-np.pi / 4, 1) - qml.cond(m1 == 1, qml.RZ)(-np.pi / 4, 1) - return m0, m1 - - -@flaky(max_runs=5) -@pytest.mark.parametrize("shots", [5000, [5000, 5001]]) -@pytest.mark.parametrize("postselect", [None, 0, 1]) -@pytest.mark.parametrize("measure_f", [qml.counts, qml.expval, qml.probs, qml.sample, qml.var]) -@pytest.mark.parametrize( - "meas_obj", - [qml.PauliZ(0), qml.PauliY(1), [0], [0, 1], [1, 0], "mcm", "composite_mcm", "mcm_list"], -) -def test_simple_dynamic_circuit(shots, measure_f, postselect, meas_obj): - """Tests that LightningQubit handles a simple dynamic circuit with the following measurements: - - * qml.counts with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list - * qml.expval with obs (comp basis or not), MCM, f(MCM), MCM list - * qml.probs with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, 
f(MCM), MCM list - * qml.sample with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list - * qml.var with obs (comp basis or not), MCM, f(MCM), MCM list - - The above combinations should work for finite shots, shot vectors and post-selecting of either the 0 or 1 branch. - """ - comm = MPI.COMM_WORLD - - if measure_f in (qml.expval, qml.var) and ( - isinstance(meas_obj, list) or meas_obj == "mcm_list" - ): - pytest.skip("Can't use wires/mcm lists with var or expval") - - dq = qml.device("default.qubit", shots=shots) - dev = get_device(wires=3, shots=shots) - params = [np.pi / 2.5, np.pi / 3, -np.pi / 3.5] - - def func(x, y, z): - m0, m1 = obs_tape(x, y, z, postselect=postselect) - mid_measure = ( - m0 if meas_obj == "mcm" else (0.5 * m0 if meas_obj == "composite_mcm" else [m0, m1]) - ) - measurement_key = "wires" if isinstance(meas_obj, list) else "op" - measurement_value = mid_measure if isinstance(meas_obj, str) else meas_obj - return measure_f(**{measurement_key: measurement_value}) - - results1 = qml.QNode(func, dev, mcm_method="one-shot")(*params) - results2 = qml.QNode(func, dq, mcm_method="deferred")(*params) - comm.Barrier() - - validate_measurements(measure_f, shots, results1, results2) - - -@pytest.mark.parametrize("postselect", [None, 0, 1]) -@pytest.mark.parametrize("reset", [False, True]) -def test_multiple_measurements_and_reset(postselect, reset): - """Tests that LightningQubit handles a circuit with a single mid-circuit measurement with reset - and a conditional gate. Multiple measurements of the mid-circuit measurement value are - performed. 
This function also tests `reset` parametrizing over the parameter.""" - comm = MPI.COMM_WORLD - shots = 5000 - dq = qml.device("default.qubit", shots=shots) - dev = get_device(wires=3, shots=shots) - params = [np.pi / 2.5, np.pi / 3, -np.pi / 3.5] - obs = qml.PauliY(1) - - def func(x, y, z): - mcms = obs_tape(x, y, z, reset=reset, postselect=postselect) - return ( - qml.counts(op=obs), - qml.expval(op=mcms[0]), - qml.probs(op=obs), - qml.sample(op=mcms[0]), - qml.var(op=obs), - ) - - results1 = qml.QNode(func, dev, mcm_method="one-shot")(*params) - results2 = qml.QNode(func, dq, mcm_method="deferred")(*params) - comm.Barrier() - - for measure_f, r1, r2 in zip( - [qml.counts, qml.expval, qml.probs, qml.sample, qml.var], results1, results2 + with pytest.raises( + qml.DeviceError, match="LightningGPU-MPI does not support Mid-circuit measurements." ): - validate_measurements(measure_f, shots, r1, r2) - - -@pytest.mark.parametrize( - "mcm_f", - [ - lambda x: x * -1, - lambda x: x * 1, - lambda x: x * 2, - lambda x: 1 - x, - lambda x: x + 1, - lambda x: x & 3, - "mix", - "list", - ], -) -@pytest.mark.parametrize("measure_f", [qml.counts, qml.expval, qml.probs, qml.sample, qml.var]) -def test_composite_mcms(mcm_f, measure_f): - """Tests that LightningQubit handles a circuit with a composite mid-circuit measurement and a - conditional gate. A single measurement of a composite mid-circuit measurement is performed - at the end.""" - comm = MPI.COMM_WORLD - if measure_f in (qml.expval, qml.var) and (mcm_f in ("list", "mix")): - pytest.skip( - "expval/var does not support measuring sequences of measurements or observables." - ) - - if measure_f == qml.probs and mcm_f == "mix": - pytest.skip( - "Cannot use qml.probs() when measuring multiple mid-circuit measurements collected using arithmetic operators." 
- ) - - shots = 3000 - - dq = qml.device("default.qubit", shots=shots) - dev = get_device(wires=3, shots=shots) - param = np.pi / 3 - - @qml.qnode(dev) - def func(x): - qml.RX(x, 0) - m0 = qml.measure(0) - qml.RX(0.5 * x, 1) - m1 = qml.measure(1) - qml.cond((m0 + m1) == 2, qml.RY)(2.0 * x, 0) - m2 = qml.measure(0) - obs = ( - (m0 - 2 * m1) * m2 + 7 - if mcm_f == "mix" - else ([m0, m1, m2] if mcm_f == "list" else mcm_f(m2)) - ) - return measure_f(op=obs) - - results1 = qml.QNode(func, dev, mcm_method="one-shot")(param) - results2 = qml.QNode(func, dq, mcm_method="deferred")(param) - - comm.Barrier() - - validate_measurements(measure_f, shots, results1, results2) - - -@pytest.mark.parametrize( - "mcm_f", - [ - lambda x, y: x + y, - lambda x, y: x - 7 * y, - lambda x, y: x & y, - lambda x, y: x == y, - lambda x, y: 4.0 * x + 2.0 * y, - ], -) -def test_counts_return_type(mcm_f): - """Tests that LightningQubit returns the same keys for ``qml.counts`` measurements with ``dynamic_one_shot`` and ``defer_measurements``.""" - comm = MPI.COMM_WORLD - shots = 500 - - dq = qml.device("default.qubit", shots=shots) - dev = get_device(wires=3, shots=shots) - param = np.pi / 3 - - @qml.qnode(dev) - def func(x): - qml.RX(x, 0) - m0 = qml.measure(0) - qml.RX(0.5 * x, 1) - m1 = qml.measure(1) - qml.cond((m0 + m1) == 2, qml.RY)(2.0 * x, 0) - return qml.counts(op=mcm_f(m0, m1)) - - results1 = qml.QNode(func, dev, mcm_method="one-shot")(param) - results2 = qml.QNode(func, dq, mcm_method="deferred")(param) - comm.Barrier() - for r1, r2 in zip(results1.keys(), results2.keys()): - assert r1 == r2 + func(*params) diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp index 529f5ae75e..2d3313f694 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp +++ 
b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp @@ -100,8 +100,6 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) { }, "Set State Vector on GPU with values for the state vector and " "wires on the host memory.") - .def("collapse", &StateVectorT::collapse, - "Collapse the statevector onto the 0 or 1 branch of a given wire.") .def( "DeviceToDevice", [](StateVectorT &sv, const StateVectorT &other, bool async) { diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py index 3407f5cbf9..4a0f48fb5a 100644 --- a/pennylane_lightning/lightning_gpu/lightning_gpu.py +++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py @@ -350,6 +350,7 @@ def __init__( # pylint: disable=too-many-arguments # Creating the state vector self._mpi_handler = MPIHandler(mpi, mpi_buf_size, len(self.wires), c_dtype) + self._use_mpi = mpi self._statevector = self.LightningStateVector( num_wires=len(self.wires), @@ -509,6 +510,9 @@ def simulate( Note that this function can return measurements for non-commuting observables simultaneously. 
""" if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)): + if self._use_mpi: + raise qml.DeviceError("LightningGPU-MPI does not support Mid-circuit measurements.") + results = [] aux_circ = qml.tape.QuantumScript( circuit.operations, From 85dc575b55fbc5f864f822df708b626e3442221b Mon Sep 17 00:00:00 2001 From: ringo-but-quantum Date: Thu, 10 Oct 2024 17:59:54 +0000 Subject: [PATCH 20/23] Auto update version from '0.39.0-dev42' to '0.39.0-dev43' --- pennylane_lightning/core/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index 59d75bd653..cff4ff5e0b 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.39.0-dev42" +__version__ = "0.39.0-dev43" From 0e9f1db466f662e6432451e4a5f51da5b3f7da57 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Thu, 10 Oct 2024 18:11:51 +0000 Subject: [PATCH 21/23] tidy up code --- .../core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp | 2 +- .../src/simulators/lightning_gpu/StateVectorCudaManaged.hpp | 2 +- pennylane_lightning/lightning_gpu/lightning_gpu.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp index cca7ad4894..c05f8160fa 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp @@ -1685,7 +1685,7 @@ class StateVectorCudaMPI final double norm = (branch == 0) ? 
abs2sum0 : abs2sum1; - int parity = branch; + const int parity = static_cast(branch); PL_CUSTATEVEC_IS_SUCCESS(custatevecCollapseOnZBasis( /* custatevecHandle_t */ handle_.get(), diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp index 377b6d729d..7ade4ab2b9 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp @@ -522,7 +522,7 @@ class StateVectorCudaManaged double norm = (branch == 0) ? abs2sum0 : abs2sum1; - int parity = branch; + const int parity = static_cast(branch); PL_CUSTATEVEC_IS_SUCCESS(custatevecCollapseOnZBasis( /* custatevecHandle_t */ handle_.get(), diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py index 4a0f48fb5a..9c91b87a52 100644 --- a/pennylane_lightning/lightning_gpu/lightning_gpu.py +++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py @@ -514,7 +514,7 @@ def simulate( raise qml.DeviceError("LightningGPU-MPI does not support Mid-circuit measurements.") results = [] - aux_circ = qml.tape.QuantumScript( + aux_circ = QuantumScript( circuit.operations, circuit.measurements, shots=[1], From 3b809faf98278fd44a1187b73bb5f32677aab66e Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 15 Oct 2024 13:36:48 +0000 Subject: [PATCH 22/23] initial commit --- mpitests/test_native_mcm.py | 325 +++++++++++++++++- .../lightning_gpu/StateVectorCudaMPI.hpp | 58 +++- .../bindings/LGPUBindingsMPI.hpp | 2 + .../measurements/MeasurementsGPUMPI.hpp | 10 +- .../core/src/utils/cuda_utils/LinearAlg.hpp | 55 +++ .../lightning_gpu/_measurements.py | 6 +- .../lightning_gpu/lightning_gpu.py | 4 - 7 files changed, 438 insertions(+), 22 deletions(-) diff --git a/mpitests/test_native_mcm.py b/mpitests/test_native_mcm.py index 
4ca607e343..eaafe0ff03 100644 --- a/mpitests/test_native_mcm.py +++ b/mpitests/test_native_mcm.py @@ -1,29 +1,75 @@ # Copyright 2024 Xanadu Quantum Technologies Inc. - # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at - # http://www.apache.org/licenses/LICENSE-2.0 - # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tests for default qubit preprocessing.""" +from functools import reduce +from typing import Sequence + import numpy as np import pennylane as qml import pytest -from conftest import LightningDevice, device_name +from conftest import LightningDevice, device_name, validate_measurements +from flaky import flaky from mpi4py import MPI if not LightningDevice._CPP_BINARY_AVAILABLE: # pylint: disable=protected-access pytest.skip("No binary module found. 
Skipping.", allow_module_level=True) -def test_unspported_mid_measurement(): - """Test unsupported mid_measurement for lightning.gpu-mpi.""" +def get_device(wires, **kwargs): + kwargs.setdefault("shots", None) + return qml.device(device_name, wires=wires, mpi=True, **kwargs) + + +def test_all_invalid_shots_circuit(): + """Test all invalid cases: expval, probs, var measurements.""" + comm = MPI.COMM_WORLD + dev = qml.device(device_name, wires=2) + dq = qml.device("default.qubit", wires=2) + + def circuit_op(): + m = qml.measure(0, postselect=1) + qml.cond(m, qml.PauliX)(1) + return ( + qml.expval(op=qml.PauliZ(1)), + qml.probs(op=qml.PauliY(0) @ qml.PauliZ(1)), + qml.var(op=qml.PauliZ(1)), + ) + + comm.Barrier() + res1 = qml.QNode(circuit_op, dq)() + res2 = qml.QNode(circuit_op, dev)(shots=10) + for r1, r2 in zip(res1, res2): + if isinstance(r1, Sequence): + assert len(r1) == len(r2) + assert np.all(np.isnan(r1)) + assert np.all(np.isnan(r2)) + + def circuit_mcm(): + m = qml.measure(0, postselect=1) + qml.cond(m, qml.PauliX)(1) + return qml.expval(op=m), qml.probs(op=m), qml.var(op=m) + + res1 = qml.QNode(circuit_mcm, dq)() + res2 = qml.QNode(circuit_mcm, dev)(shots=10) + + comm.Barrier() + for r1, r2 in zip(res1, res2): + if isinstance(r1, Sequence): + assert len(r1) == len(r2) + assert np.all(np.isnan(r1)) + assert np.all(np.isnan(r2)) + + +def test_unsupported_measurement(): + """Test unsupported ``qml.classical_shadow`` measurement on ``lightning.gpu`` .""" comm = MPI.COMM_WORLD dev = qml.device(device_name, wires=2, mpi=True, shots=1000) params = np.pi / 4 * np.ones(2) @@ -33,11 +79,270 @@ def func(x, y): qml.RX(x, wires=0) m0 = qml.measure(0) qml.cond(m0, qml.RY)(y, wires=1) - return qml.probs(wires=0) + return qml.classical_shadow(wires=0) + + comm.Barrier() + if device_name == "lightning.qubit": + with pytest.raises( + qml.DeviceError, + match=f"not accepted with finite shots on lightning.qubit", + ): + func(*params) + if device_name in ("lightning.kokkos", 
"lightning.gpu"): + with pytest.raises( + qml.DeviceError, + match=r"Measurement shadow\(wires=\[0\]\) not accepted with finite shots on " + + device_name, + ): + func(*params) + + +@pytest.mark.parametrize("mcm_method", ["deferred", "one-shot"]) +def test_qnode_mcm_method(mcm_method, mocker): + """Test that user specified qnode arg for mid-circuit measurements transform are used correctly""" + comm = MPI.COMM_WORLD + spy = ( + mocker.spy(qml.dynamic_one_shot, "_transform") + if mcm_method == "one-shot" + else mocker.spy(qml.defer_measurements, "_transform") + ) + other_spy = ( + mocker.spy(qml.defer_measurements, "_transform") + if mcm_method == "one-shot" + else mocker.spy(qml.dynamic_one_shot, "_transform") + ) + + shots = 10 + device = qml.device(device_name, wires=3, mpi=True, shots=shots) + comm.Barrier() + + @qml.qnode(device, mcm_method=mcm_method) + def f(x): + qml.RX(x, 0) + _ = qml.measure(0) + qml.CNOT([0, 1]) + return qml.sample(wires=[0, 1]) + + _ = f(np.pi / 8) + comm.Barrier() + + spy.assert_called_once() + other_spy.assert_not_called() + + +@pytest.mark.parametrize("postselect_mode", ["hw-like", "fill-shots"]) +def test_qnode_postselect_mode(postselect_mode): + """Test that user specified qnode arg for discarding invalid shots is used correctly""" + comm = MPI.COMM_WORLD + shots = 100 + device = qml.device(device_name, wires=3, mpi=True, shots=shots) + postselect = 1 + + @qml.qnode(device, postselect_mode=postselect_mode) + def f(x): + qml.RX(x, 0) + _ = qml.measure(0, postselect=postselect) + qml.CNOT([0, 1]) + return qml.sample(qml.Identity(1)) + + comm.Barrier() + # Using small-ish rotation angle ensures the number of valid shots will be less than the + # original number of shots. 
This helps avoid stochastic failures for the assertion below + res = f(np.pi / 2) + + comm.Barrier() + + if postselect_mode == "hw-like": + assert res.size < shots + else: + assert len(res) == shots + assert np.allclose(res, postselect) + +# pylint: disable=unused-argument +def obs_tape(x, y, z, reset=False, postselect=None): + qml.RX(x, 0) + qml.RZ(np.pi / 4, 0) + m0 = qml.measure(0, reset=reset) + qml.cond(m0 == 0, qml.RX)(np.pi / 4, 0) + qml.cond(m0 == 0, qml.RZ)(np.pi / 4, 0) + qml.cond(m0 == 1, qml.RX)(-np.pi / 4, 0) + qml.cond(m0 == 1, qml.RZ)(-np.pi / 4, 0) + qml.RX(y, 1) + qml.RZ(np.pi / 4, 1) + m1 = qml.measure(1, postselect=postselect) + qml.cond(m1 == 0, qml.RX)(np.pi / 4, 1) + qml.cond(m1 == 0, qml.RZ)(np.pi / 4, 1) + qml.cond(m1 == 1, qml.RX)(-np.pi / 4, 1) + qml.cond(m1 == 1, qml.RZ)(-np.pi / 4, 1) + return m0, m1 + + +@flaky(max_runs=5) +@pytest.mark.parametrize("shots", [5000, [5000, 5001]]) +@pytest.mark.parametrize("postselect", [None, 0, 1]) +@pytest.mark.parametrize("measure_f", [qml.counts, qml.expval, qml.probs, qml.sample, qml.var]) +@pytest.mark.parametrize( + "meas_obj", + [qml.PauliZ(0), qml.PauliY(1), [0], [0, 1], [1, 0], "mcm", "composite_mcm", "mcm_list"], +) +def test_simple_dynamic_circuit(shots, measure_f, postselect, meas_obj): + """Tests that LightningQubit handles a simple dynamic circuit with the following measurements: + * qml.counts with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list + * qml.expval with obs (comp basis or not), MCM, f(MCM), MCM list + * qml.probs with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list + * qml.sample with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list + * qml.var with obs (comp basis or not), MCM, f(MCM), MCM list + The above combinations should work for finite shots, shot vectors and post-selecting of either the 0 or 1 branch. 
+ """ + comm = MPI.COMM_WORLD + + if measure_f in (qml.expval, qml.var) and ( + isinstance(meas_obj, list) or meas_obj == "mcm_list" + ): + pytest.skip("Can't use wires/mcm lists with var or expval") + + dq = qml.device("default.qubit", shots=shots) + dev = get_device(wires=3, shots=shots) + params = [np.pi / 2.5, np.pi / 3, -np.pi / 3.5] + + def func(x, y, z): + m0, m1 = obs_tape(x, y, z, postselect=postselect) + mid_measure = ( + m0 if meas_obj == "mcm" else (0.5 * m0 if meas_obj == "composite_mcm" else [m0, m1]) + ) + measurement_key = "wires" if isinstance(meas_obj, list) else "op" + measurement_value = mid_measure if isinstance(meas_obj, str) else meas_obj + return measure_f(**{measurement_key: measurement_value}) + + results1 = qml.QNode(func, dev, mcm_method="one-shot")(*params) + results2 = qml.QNode(func, dq, mcm_method="deferred")(*params) comm.Barrier() - with pytest.raises( - qml.DeviceError, match="LightningGPU-MPI does not support Mid-circuit measurements." + validate_measurements(measure_f, shots, results1, results2) + + +@pytest.mark.parametrize("postselect", [None, 0, 1]) +@pytest.mark.parametrize("reset", [False, True]) +def test_multiple_measurements_and_reset(postselect, reset): + """Tests that LightningQubit handles a circuit with a single mid-circuit measurement with reset + and a conditional gate. Multiple measurements of the mid-circuit measurement value are + performed. 
This function also tests `reset` parametrizing over the parameter.""" + comm = MPI.COMM_WORLD + shots = 5000 + dq = qml.device("default.qubit", shots=shots) + dev = get_device(wires=3, shots=shots) + params = [np.pi / 2.5, np.pi / 3, -np.pi / 3.5] + obs = qml.PauliY(1) + + def func(x, y, z): + mcms = obs_tape(x, y, z, reset=reset, postselect=postselect) + return ( + qml.counts(op=obs), + qml.expval(op=mcms[0]), + qml.probs(op=obs), + qml.sample(op=mcms[0]), + qml.var(op=obs), + ) + + results1 = qml.QNode(func, dev, mcm_method="one-shot")(*params) + results2 = qml.QNode(func, dq, mcm_method="deferred")(*params) + comm.Barrier() + + for measure_f, r1, r2 in zip( + [qml.counts, qml.expval, qml.probs, qml.sample, qml.var], results1, results2 ): - func(*params) + validate_measurements(measure_f, shots, r1, r2) + + +@pytest.mark.parametrize( + "mcm_f", + [ + lambda x: x * -1, + lambda x: x * 1, + lambda x: x * 2, + lambda x: 1 - x, + lambda x: x + 1, + lambda x: x & 3, + "mix", + "list", + ], +) +@pytest.mark.parametrize("measure_f", [qml.counts, qml.expval, qml.probs, qml.sample, qml.var]) +def test_composite_mcms(mcm_f, measure_f): + """Tests that LightningQubit handles a circuit with a composite mid-circuit measurement and a + conditional gate. A single measurement of a composite mid-circuit measurement is performed + at the end.""" + comm = MPI.COMM_WORLD + if measure_f in (qml.expval, qml.var) and (mcm_f in ("list", "mix")): + pytest.skip( + "expval/var does not support measuring sequences of measurements or observables." + ) + + if measure_f == qml.probs and mcm_f == "mix": + pytest.skip( + "Cannot use qml.probs() when measuring multiple mid-circuit measurements collected using arithmetic operators." 
+ ) + + shots = 3000 + + dq = qml.device("default.qubit", shots=shots) + dev = get_device(wires=3, shots=shots) + param = np.pi / 3 + + @qml.qnode(dev) + def func(x): + qml.RX(x, 0) + m0 = qml.measure(0) + qml.RX(0.5 * x, 1) + m1 = qml.measure(1) + qml.cond((m0 + m1) == 2, qml.RY)(2.0 * x, 0) + m2 = qml.measure(0) + obs = ( + (m0 - 2 * m1) * m2 + 7 + if mcm_f == "mix" + else ([m0, m1, m2] if mcm_f == "list" else mcm_f(m2)) + ) + return measure_f(op=obs) + + results1 = qml.QNode(func, dev, mcm_method="one-shot")(param) + results2 = qml.QNode(func, dq, mcm_method="deferred")(param) + + comm.Barrier() + + validate_measurements(measure_f, shots, results1, results2) + + +@pytest.mark.parametrize( + "mcm_f", + [ + lambda x, y: x + y, + lambda x, y: x - 7 * y, + lambda x, y: x & y, + lambda x, y: x == y, + lambda x, y: 4.0 * x + 2.0 * y, + ], +) +def test_counts_return_type(mcm_f): + """Tests that LightningQubit returns the same keys for ``qml.counts`` measurements with ``dynamic_one_shot`` and ``defer_measurements``.""" + comm = MPI.COMM_WORLD + shots = 500 + + dq = qml.device("default.qubit", shots=shots) + dev = get_device(wires=3, shots=shots) + param = np.pi / 3 + + @qml.qnode(dev) + def func(x): + qml.RX(x, 0) + m0 = qml.measure(0) + qml.RX(0.5 * x, 1) + m1 = qml.measure(1) + qml.cond((m0 + m1) == 2, qml.RY)(2.0 * x, 0) + return qml.counts(op=mcm_f(m0, m1)) + + results1 = qml.QNode(func, dev, mcm_method="one-shot")(param) + results2 = qml.QNode(func, dq, mcm_method="deferred")(param) + comm.Barrier() + for r1, r2 in zip(results1.keys(), results2.keys()): + assert r1 == r2 diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp index c05f8160fa..794c026de5 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp @@ -344,13 +344,16 @@ class 
StateVectorCudaMPI final * @param branch Branch 0 or 1. */ void collapse(const std::size_t wire, const bool branch) { + /* PL_ABORT_IF_NOT(wire < this->getTotalNumQubits(), "Invalid wire index."); + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); const int wireInt = static_cast(this->getTotalNumQubits() - 1 - wire); - if (static_cast(wireInt) < BaseType::getNumQubits()) { + if (static_cast(wireInt) < getNumLocalQubits()) { // local wire collapse_local_(wireInt, branch); } else { @@ -364,6 +367,43 @@ class StateVectorCudaMPI final PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); } + mpi_manager_.Barrier(); + */ + PL_ABORT_IF_NOT(wire < this->getTotalNumQubits(), + "Invalid wire index."); + + std::vector matrix(4, ComplexT(0.0, 0.0)); + + for (std::size_t i = 0; i < matrix.size(); i++) { + matrix[i] = ((i == 0 && branch == 0) || (i == 3 && branch == 1)) + ? ComplexT{1.0, 0.0} + : ComplexT{0.0, 0.0}; + } + + mpi_manager_.Barrier(); + + applyMatrix(matrix, {wire}, false); + + auto local_norm2 = norm2_CUDA( + BaseType::getData(), BaseType::getLength(), + BaseType::getDataBuffer().getDevTag().getDeviceID(), + BaseType::getDataBuffer().getDevTag().getStreamID(), + this->getCublasCaller()); + + local_norm2 *= local_norm2; + + mpi_manager_.Barrier(); + + auto norm2 = mpi_manager_.allreduce(local_norm2, "sum"); + + norm2 = std::sqrt(norm2); + + normalize_CUDA( + norm2, BaseType::getData(), BaseType::getLength(), + BaseType::getDataBuffer().getDevTag().getDeviceID(), + BaseType::getDataBuffer().getDevTag().getStreamID(), + this->getCublasCaller()); + mpi_manager_.Barrier(); } @@ -383,6 +423,8 @@ class StateVectorCudaMPI final const std::vector &wires, bool adjoint, const std::vector ¶ms, [[maybe_unused]] const std::vector &matrix) { + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); std::vector matrix_cu(matrix.size()); std::transform(matrix.begin(), matrix.end(), matrix_cu.begin(), [](const std::complex &x) { @@ -408,6 +450,8 @@ class 
StateVectorCudaMPI final const std::string &opName, const std::vector &wires, bool adjoint = false, const std::vector ¶ms = {0.0}, [[maybe_unused]] const std::vector &gate_matrix = {}) { + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); const auto ctrl_offset = (BaseType::getCtrlMap().find(opName) != BaseType::getCtrlMap().end()) ? BaseType::getCtrlMap().at(opName) @@ -467,6 +511,8 @@ class StateVectorCudaMPI final gate_cache_.get_gate_device_ptr(opName, par[0]), ctrls_local, tgts_local, adjoint); } + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); } /** @@ -524,6 +570,8 @@ class StateVectorCudaMPI final const std::vector &wires, bool adjoint = false) { PL_ABORT_IF(wires.empty(), "Number of wires must be larger than 0"); + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); const std::string opName = "Matrix"; std::size_t n = std::size_t{1} << wires.size(); const std::vector> matrix(gate_matrix, @@ -535,6 +583,8 @@ class StateVectorCudaMPI final x); }); applyOperation(opName, wires, adjoint, {}, matrix_cu); + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); } /** @@ -1674,7 +1724,7 @@ class StateVectorCudaMPI final /* custatevecHandle_t */ handle_.get(), /* void *sv */ BaseType::getData(), /* cudaDataType_t */ data_type, - /* const uint32_t nIndexBits */ BaseType::getNumQubits(), + /* const uint32_t nIndexBits */ getNumLocalQubits(), /* double * */ &abs2sum0_local, /* double * */ &abs2sum1_local, /* const int32_t * */ basisBits.data(), @@ -1691,7 +1741,7 @@ class StateVectorCudaMPI final /* custatevecHandle_t */ handle_.get(), /* void *sv */ BaseType::getData(), /* cudaDataType_t */ data_type, - /* const uint32_t nIndexBits */ BaseType::getNumQubits(), + /* const uint32_t nIndexBits */ getNumLocalQubits(), /* const int32_t parity */ parity, /* const int32_t *basisBits */ basisBits.data(), /* const uint32_t nBasisBits */ basisBits.size(), @@ -1980,6 +2030,8 @@ class 
StateVectorCudaMPI final const std::vector &ctrls, const std::vector &tgts, bool use_adjoint = false) { + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); std::vector ctrlsInt(ctrls.size()); std::vector tgtsInt(tgts.size()); diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp index 2d3313f694..529f5ae75e 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp @@ -100,6 +100,8 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) { }, "Set State Vector on GPU with values for the state vector and " "wires on the host memory.") + .def("collapse", &StateVectorT::collapse, + "Collapse the statevector onto the 0 or 1 branch of a given wire.") .def( "DeviceToDevice", [](StateVectorT &sv, const StateVectorT &other, bool async) { diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp index 6fee1711d2..710930ba54 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp @@ -265,7 +265,7 @@ class MeasurementsMPI final * number between 0 and num_samples-1. 
*/ auto generate_samples(std::size_t num_samples) -> std::vector { - double epsilon = 1e-15; + double epsilon = std::numeric_limits::epsilon() * 1.0e2; std::size_t nSubSvs = 1UL << (this->_statevector.getNumGlobalQubits()); std::vector rand_nums(num_samples); std::vector samples( @@ -280,8 +280,8 @@ class MeasurementsMPI final bitOrdering[i] = i; } - std::vector localBitStrings(num_samples); - std::vector globalBitStrings(num_samples); + std::vector localBitStrings(num_samples, 0); + std::vector globalBitStrings(num_samples, 0); if (mpi_manager_.getRank() == 0) { for (std::size_t n = 0; n < num_samples; n++) { @@ -320,6 +320,8 @@ class MeasurementsMPI final /* custatevecHandle_t */ this->_statevector.getCusvHandle(), /* custatevecSamplerDescriptor_t */ sampler, /* double * */ &subNorm)); + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); int source = (mpi_manager_.getRank() - 1 + mpi_manager_.getSize()) % mpi_manager_.getSize(); @@ -354,6 +356,8 @@ class MeasurementsMPI final /* double */ precumulative, /* double */ norm)); + norm = (norm < epsilon) ? epsilon : norm; + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); auto low = std::lower_bound(rand_nums.begin(), rand_nums.end(), cumulative / norm); diff --git a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp index cd422899b5..984a9d2358 100644 --- a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp +++ b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp @@ -274,6 +274,61 @@ inline auto scaleC_CUDA(const CFP_t a, T *v1, const int data_size, data_type); } +/** + * @brief cuBLAS backed GPU data normalization. + * + * @tparam CFP_t Complex float data-type. Accepts cuDoubleComplex and cuComplex + * @tparam DevTypeID Integer type of device id. + * + * @param v1 Device data pointer + * @param data_size Length of device data. + * @param dev_id the device on which the function should be executed. 
+ * @param stream_id the CUDA stream on which the operation should be executed. + * @param cublas the CublasCaller object that manages the cuBLAS handle. + */ +template +inline auto norm2_CUDA(CFP_t *v1, const int data_size, DevTypeID dev_id, + cudaStream_t stream_id, const CublasCaller &cublas) { + if constexpr (std::is_same_v || + std::is_same_v) { + double norm{0.0}; + cublas.call(cublasDznrm2, dev_id, stream_id, data_size, v1, 1, &norm); + return norm; + } else { + float norm{0.0}; + cublas.call(cublasScnrm2, dev_id, stream_id, data_size, v1, 1, &norm); + return norm; + } +} + +/** + * @brief cuBLAS backed GPU data normalization. + * + * @tparam T Float data-type. Accepts float and double + * @tparam CFP_t Complex float data-type. Accepts cuDoubleComplex and cuComplex + * + * @param norm2 Norm of the vector + * @param v1 Device data pointer + * @param data_size Length of device data. + * @param dev_id the device on which the function should be executed. + * @param stream_id the CUDA stream on which the operation should be executed. + * @param cublas the CublasCaller object that manages the cuBLAS handle. + */ +template +inline auto normalize_CUDA(T norm2, CFP_t *v1, const int data_size, + DevTypeID dev_id, cudaStream_t stream_id, + const CublasCaller &cublas) { + if constexpr (std::is_same_v || + std::is_same_v) { + const double alpha = 1.0 / norm2; + cublas.call(cublasZdscal, dev_id, stream_id, data_size, &alpha, v1, 1); + } else { + const float alpha = 1.0 / norm2; + cublas.call(cublasCsscal, dev_id, stream_id, data_size, &alpha, v1, 1); + } +} + /** @brief `%CudaScopedDevice` uses RAII to select a CUDA device context. 
* * @see https://taskflow.github.io/taskflow/classtf_1_1cudaScopedDevice.html diff --git a/pennylane_lightning/lightning_gpu/_measurements.py b/pennylane_lightning/lightning_gpu/_measurements.py index 4b95762ccc..337f6273e3 100644 --- a/pennylane_lightning/lightning_gpu/_measurements.py +++ b/pennylane_lightning/lightning_gpu/_measurements.py @@ -34,6 +34,7 @@ except ImportError as error_import: warn(str(error_import), UserWarning) +from functools import reduce from typing import List import numpy as np @@ -105,8 +106,9 @@ def _measure_with_samples_diagonalizing_gates( self._apply_diagonalizing_gates(mps) # Specific for LGPU: - total_indices = self._qubit_state.num_wires - wires = qml.wires.Wires(range(total_indices)) + # total_indices = self._qubit_state.num_wires + # wires = qml.wires.Wires(range(total_indices)) + wires = reduce(sum, (mp.wires for mp in mps)) def _process_single_shot(samples): processed = [] diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py index 9c91b87a52..d6f75b2c5e 100644 --- a/pennylane_lightning/lightning_gpu/lightning_gpu.py +++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py @@ -350,7 +350,6 @@ def __init__( # pylint: disable=too-many-arguments # Creating the state vector self._mpi_handler = MPIHandler(mpi, mpi_buf_size, len(self.wires), c_dtype) - self._use_mpi = mpi self._statevector = self.LightningStateVector( num_wires=len(self.wires), @@ -510,9 +509,6 @@ def simulate( Note that this function can return measurements for non-commuting observables simultaneously. 
""" if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)): - if self._use_mpi: - raise qml.DeviceError("LightningGPU-MPI does not support Mid-circuit measurements.") - results = [] aux_circ = QuantumScript( circuit.operations, From ddc74f2b8d845eaf1eb27250ecd47f370a0a5e13 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 15 Oct 2024 15:19:41 +0000 Subject: [PATCH 23/23] update conftest --- mpitests/conftest.py | 79 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/mpitests/conftest.py b/mpitests/conftest.py index 552cf9f330..11be17824a 100644 --- a/mpitests/conftest.py +++ b/mpitests/conftest.py @@ -15,9 +15,10 @@ Pytest configuration file for PennyLane-Lightning-GPU test suite. """ # pylint: disable=missing-function-docstring,wrong-import-order,unused-import - import itertools import os +from functools import reduce +from typing import Sequence import pennylane as qml import pytest @@ -125,3 +126,79 @@ def _device(wires): ) return _device + + +####################################################################### + + +def validate_counts(shots, results1, results2): + """Compares two counts. + If the results are ``Sequence``s, loop over entries. + Fails if a key of ``results1`` is not found in ``results2``. + Passes if counts are too low, chosen as ``100``. + Otherwise, fails if counts differ by more than ``20`` plus 20 percent. + """ + if isinstance(results1, Sequence): + assert isinstance(results2, Sequence) + assert len(results1) == len(results2) + for r1, r2 in zip(results1, results2): + validate_counts(shots, r1, r2) + return + for key1, val1 in results1.items(): + val2 = results2[key1] + if abs(val1 + val2) > 100: + assert np.allclose(val1, val2, rtol=20, atol=0.2) + + +def validate_samples(shots, results1, results2): + """Compares two samples. + If the results are ``Sequence``s, loop over entries. 
+ Fails if the results do not have the same shape, within ``20`` entries plus 20 percent. + This is to handle cases when post-selection yields variable shapes. + Otherwise, fails if the sums of samples differ by more than ``20`` plus 20 percent. + """ + if isinstance(shots, Sequence): + assert isinstance(results1, Sequence) + assert isinstance(results2, Sequence) + assert len(results1) == len(results2) + for s, r1, r2 in zip(shots, results1, results2): + validate_samples(s, r1, r2) + else: + sh1, sh2 = results1.shape[0], results2.shape[0] + assert np.allclose(sh1, sh2, rtol=20, atol=0.2) + assert results1.ndim == results2.ndim + if results2.ndim > 1: + assert results1.shape[1] == results2.shape[1] + np.allclose(np.sum(results1), np.sum(results2), rtol=20, atol=0.2) + + +def validate_others(shots, results1, results2): + """Compares two expval, probs or var. + If the results are ``Sequence``s, validate the average of items. + If ``shots is None``, validate using ``np.allclose``'s default parameters. + Otherwise, fails if the results do not match within ``0.01`` plus 20 percent. + """ + if isinstance(results1, Sequence): + assert isinstance(results2, Sequence) + assert len(results1) == len(results2) + results1 = reduce(lambda x, y: x + y, results1) / len(results1) + results2 = reduce(lambda x, y: x + y, results2) / len(results2) + validate_others(shots, results1, results2) + return + if shots is None: + assert np.allclose(results1, results2) + return + assert np.allclose(results1, results2, atol=0.01, rtol=0.2) + + +def validate_measurements(func, shots, results1, results2): + """Calls the correct validation function based on measurement type.""" + if func is qml.counts: + validate_counts(shots, results1, results2) + return + + if func is qml.sample: + validate_samples(shots, results1, results2) + return + + validate_others(shots, results1, results2)