From 29cf4f0a3677377420c0cf8c9a5916870d35863d Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Wed, 2 Oct 2024 20:09:57 +0000
Subject: [PATCH 01/23] Initial commit

---
 .github/CHANGELOG.md                          |  3 +
 .../lightning_gpu/StateVectorCudaManaged.hpp  | 45 +++++++++++
 .../src/simulators/lightning_gpu/initSV.cu    | 78 ++++++++++++++++++-
 .../tests/Test_StateVectorCudaManaged.cpp     | 42 ++++++++++
 .../core/src/utils/cuda_utils/LinearAlg.hpp   | 28 +++++++
 5 files changed, 195 insertions(+), 1 deletion(-)

diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index 6a9b55cf8f..affc06c261 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 ### New features since last release
 
+* Add `collapse()` support to `lightning.gpu` C++ layer.
+  [(#931)](https://github.com/PennyLaneAI/pennylane-lightning/pull/931)
+
 * Add Matrix Product Operator (MPO) for all gates support to `lightning.tensor`. Note current C++ implementation only works for MPO sites data provided by users.
   [(#859)](https://github.com/PennyLaneAI/pennylane-lightning/pull/859)
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
index 716d95c89f..dc9c3ffcd9 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -67,6 +67,17 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value,
                                const std::size_t index, bool async,
                                cudaStream_t stream_id);
 
+extern void collapseStateVector_CUDA(cuComplex *sv, const std::size_t num_sv,
+                                     const std::size_t stride, const bool k,
+                                     std::size_t thread_per_block,
+                                     cudaStream_t stream_id);
+
+extern void collapseStateVector_CUDA(cuDoubleComplex *sv,
+                                     const std::size_t num_sv,
+                                     const std::size_t stride, const bool k,
+                                     std::size_t thread_per_block,
+                                     cudaStream_t stream_id);
+
 extern void globalPhaseStateVector_CUDA(cuComplex *sv, std::size_t num_sv,
                                         cuComplex phase,
                                         std::size_t thread_per_block,
@@ -434,6 +445,40 @@ class StateVectorCudaManaged
         applyMatrix(gate_matrix.data(), wires, adjoint);
     }
 
+    /**
+     * @brief Collapse the state vector after having measured one of the qubits.
+     *
+     * Note: The branch parameter imposes the measurement result on the given
+     * wire.
+     *
+     * @tparam thread_per_block Number of threads per block. Default is 256.
+     * @param wire Wire to measure.
+     * @param branch Branch 0 or 1.
+     */
+    template <std::size_t thread_per_block = 256>
+    void collapse(const std::size_t wire, const bool branch) {
+        PL_ABORT_IF_NOT(wire < BaseType::getNumQubits(), "Invalid wire index.");
+
+        const std::size_t stride = std::size_t{1U}
+                                   << (BaseType::getNumQubits() - (1 + wire));
+        // zero half the entries
+        // the "half" entries depend on the stride
+        // *_*_*_*_ for stride 1
+        // **__**__ for stride 2
+        // ****____ for stride 4
+        const bool k = branch ? 0 : 1;
+
+        collapseStateVector_CUDA(
+            BaseType::getData(), BaseType::getLength(), stride, k,
+            thread_per_block,
+            BaseType::getDataBuffer().getDevTag().getStreamID());
+
+        normalize_CUDA<CFP_t>(
+            BaseType::getData(), BaseType::getLength(),
+            BaseType::getDataBuffer().getDevTag().getDeviceID(),
+            BaseType::getDataBuffer().getDevTag().getStreamID(),
+            this->getCublasCaller());
+    }
     //****************************************************************************//
     // Explicit gate calls for bindings
     //****************************************************************************//
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu b/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu
index 4e3e93ea79..6d0aee75a4 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu
@@ -59,7 +59,7 @@ void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value,
                         cudaStream_t stream_id);
 
 /**
- * @brief The CUDA kernel that setS state vector data on GPU device from the
+ * @brief The CUDA kernel that sets state vector data on GPU device from the
  * input values (on device) and their corresponding indices (on device)
  * information.
  *
@@ -106,6 +106,66 @@ void setStateVector_CUDA_call(GPUDataT *sv, index_type &num_indices,
     PL_CUDA_IS_SUCCESS(cudaGetLastError());
 }
 
+/**
+ * @brief The CUDA kernel that collapses the state vector data on GPU device
+ * based on the input values (on device) and their corresponding indices (on
+ * device) information.
+ *
+ * @param sv Complex data pointer of state vector on device.
+ * @param half_num_sv Number of state vector elements.
+ * @param stride Number of elements in the input values array.
+ * @param k Boolean flag to indicate whether to collapse or not.
+ */
+template <class GPUDataT>
+__global__ void
+collapseStateVectorKernel(GPUDataT *sv, const std::size_t half_num_sv,
+                          const std::size_t stride, const bool k) {
+    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= half_num_sv) {
+        return;
+    }
+
+    const unsigned int id = i / stride;
+    const unsigned int ids = i % stride;
+    const unsigned int offset = (2 * id + k) * stride;
+    const unsigned int idx = offset + ids;
+
+    sv[idx].x = 0.0;
+    sv[idx].y = 0.0;
+}
+
+/**
+ * @brief The CUDA kernel call wrapper that collapses the state vector data on
+ * GPU device based on the input values (on device) and their corresponding
+ * indices (on device) information.
+ *
+ * @param sv Complex data pointer of state vector on device.
+ * @param num_sv Number of state vector elements.
+ * @param stride Number of elements in the input values array.
+ * @param k Boolean flag to indicate whether to collapse or not.
+ * @param thread_per_block Number of threads set per block.
+ * @param stream_id Stream id of CUDA calls
+ */
+template <class GPUDataT>
+void collapseStateVector_CUDA_call(GPUDataT *sv, std::size_t num_sv,
+                                   const std::size_t stride, const bool k,
+                                   const std::size_t thread_per_block,
+                                   cudaStream_t stream_id) {
+    const std::size_t half_num_sv = num_sv / 2;
+    auto dv = std::div(static_cast<long long>(half_num_sv),
+                       static_cast<long long>(thread_per_block));
+
+    const std::size_t num_blocks = dv.quot + (dv.rem == 0 ? 0 : 1);
+    const std::size_t block_per_grid = (num_blocks == 0 ? 1 : num_blocks);
+    dim3 blockSize(thread_per_block, 1, 1);
+    dim3 gridSize(block_per_grid, 1);
+
+    collapseStateVectorKernel<GPUDataT>
+        <<<gridSize, blockSize, 0, stream_id>>>(sv, half_num_sv, stride, k);
+
+    PL_CUDA_IS_SUCCESS(cudaGetLastError());
+}
+
 /**
  * @brief The CUDA kernel that multiplies the state vector data on GPU device
  * by a global phase.
@@ -243,6 +303,22 @@ void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value,
     setBasisState_CUDA_call(sv, value, index, async, stream_id);
 }
 
+void collapseStateVector_CUDA(cuComplex *sv, const std::size_t num_sv,
+                              const std::size_t stride, const bool k,
+                              std::size_t thread_per_block,
+                              cudaStream_t stream_id) {
+    collapseStateVector_CUDA_call(sv, num_sv, stride, k, thread_per_block,
+                                  stream_id);
+}
+
+void collapseStateVector_CUDA(cuDoubleComplex *sv, std::size_t num_sv,
+                              const std::size_t stride, const bool k,
+                              std::size_t thread_per_block,
+                              cudaStream_t stream_id) {
+    collapseStateVector_CUDA_call(sv, num_sv, stride, k, thread_per_block,
+                                  stream_id);
+}
+
 void globalPhaseStateVector_CUDA(cuComplex *sv, std::size_t num_sv,
                                  cuComplex phase, std::size_t thread_per_block,
                                  cudaStream_t stream_id) {
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp
index 4003395b53..0301970390 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp
@@ -266,3 +266,45 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::StateVectorCudaManaged",
         REQUIRE(std::is_constructible_v<StateVectorT, const StateVectorT &>);
     }
 }
+
+TEMPLATE_TEST_CASE("StateVectorCudaManaged::collapse",
+                   "[StateVectorCudaManaged]", float, double) {
+    using PrecisionT = TestType;
+    using ComplexT = typename StateVectorCudaManaged<PrecisionT>::ComplexT;
+    using CFP_t = typename StateVectorCudaManaged<PrecisionT>::CFP_t;
+    using TestVectorT = TestVector<ComplexT>;
+
+    std::size_t wire = GENERATE(0, 1, 2);
+    std::size_t branch = GENERATE(0, 1);
+    const std::size_t num_qubits = 3;
+
+    // TODO @tomlqc use same template for testing all Lightning flavours?
+
+    SECTION("Collapse the state vector after having measured one of the "
+            "qubits.") {
+        TestVectorT init_state = createPlusState_<ComplexT>(num_qubits);
+
+        const ComplexT coef{0.5, PrecisionT{0.0}};
+        const ComplexT zero{PrecisionT{0.0}, PrecisionT{0.0}};
+
+        std::vector<std::vector<std::vector<ComplexT>>> expected_state = {
+            {{coef, coef, coef, coef, zero, zero, zero, zero},
+             {coef, coef, zero, zero, coef, coef, zero, zero},
+             {coef, zero, coef, zero, coef, zero, coef, zero}},
+            {{zero, zero, zero, zero, coef, coef, coef, coef},
+             {zero, zero, coef, coef, zero, zero, coef, coef},
+             {zero, coef, zero, coef, zero, coef, zero, coef}},
+        };
+
+        StateVectorCudaManaged<PrecisionT> sv(
+            reinterpret_cast<CFP_t *>(init_state.data()), init_state.size());
+
+        sv.collapse(wire, branch);
+
+        PrecisionT eps = std::numeric_limits<PrecisionT>::epsilon() * 1e2;
+        REQUIRE(isApproxEqual(sv.getDataVector().data(),
+                              sv.getDataVector().size(),
+                              expected_state[branch][wire].data(),
+                              expected_state[branch][wire].size(), eps));
+    }
+}
diff --git a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
index cd422899b5..d8e189ead4 100644
--- a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
+++ b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
@@ -274,6 +274,34 @@ inline auto scaleC_CUDA(const CFP_t a, T *v1, const int data_size,
                 data_type);
 }
 
+/**
+ * @brief cuBLAS backed GPU data normalization.
+ *
+ * @tparam T Float data-type. Accepts float and double
+ * @param a scaling factor
+ * @param v1 Device data pointer
+ * @param data_size Length of device data.
+ * @param dev_id the device on which the function should be executed.
+ * @param stream_id the CUDA stream on which the operation should be executed.
+ * @param cublas the CublasCaller object that manages the cuBLAS handle.
+ */
+template <class CFP_t = cuDoubleComplex, class DevTypeID = int>
+inline auto normalize_CUDA(CFP_t *v1, const int data_size, DevTypeID dev_id,
+                           cudaStream_t stream_id, const CublasCaller &cublas) {
+    if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
+                  std::is_same_v<CFP_t, double2>) {
+        double norm{0.0};
+        cublas.call(cublasDznrm2, dev_id, stream_id, data_size, v1, 1, &norm);
+        const double alpha = 1.0 / norm;
+        cublas.call(cublasZdscal, dev_id, stream_id, data_size, &alpha, v1, 1);
+    } else {
+        float norm{0.0};
+        cublas.call(cublasScnrm2, dev_id, stream_id, data_size, v1, 1, &norm);
+        const float alpha = 1.0 / norm;
+        cublas.call(cublasCsscal, dev_id, stream_id, data_size, &alpha, v1, 1);
+    }
+}
+
 /** @brief `%CudaScopedDevice` uses RAII to select a CUDA device context.
  *
  * @see https://taskflow.github.io/taskflow/classtf_1_1cudaScopedDevice.html

From 0c449bf18550c042c41e50148f8e313a2c259cd9 Mon Sep 17 00:00:00 2001
From: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Date: Wed, 2 Oct 2024 20:27:43 +0000
Subject: [PATCH 02/23] Auto update version from '0.39.0-dev34' to
 '0.39.0-dev36'

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 05a5ab3841..9b3cc2f16a 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.39.0-dev34"
+__version__ = "0.39.0-dev36"

From 6c36e5cfa8458bfbabdcdb03ccc20e6c38ded4c6 Mon Sep 17 00:00:00 2001
From: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Date: Mon, 7 Oct 2024 20:46:19 +0000
Subject: [PATCH 03/23] Auto update version from '0.39.0-dev39' to
 '0.39.0-dev40'

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 4f9f650221..c6e0b5050a 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.39.0-dev39"
+__version__ = "0.39.0-dev40"

From 80c3a62fc4f48155058b4e104f3883162ce36df8 Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Tue, 8 Oct 2024 00:03:00 +0000
Subject: [PATCH 04/23] tidy up code and add mpi support

---
 .../lightning_gpu/StateVectorCudaMPI.hpp      | 48 ++++++++++++
 .../lightning_gpu/StateVectorCudaManaged.hpp  | 46 +++++------
 .../src/simulators/lightning_gpu/initSV.cu    | 76 -------------------
 .../tests/mpi/Test_StateVectorCudaMPI.cpp     | 74 +++++++++++++++++-
 .../core/src/utils/cuda_utils/LinearAlg.hpp   | 41 ++++++++--
 5 files changed, 173 insertions(+), 112 deletions(-)

diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
index 577c510f9e..97a05d546d 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
@@ -347,6 +347,54 @@ class StateVectorCudaMPI final
         mpi_manager_.Barrier();
     }
 
+    /**
+     * @brief Collapse the state vector after having measured one of the qubits.
+     *
+     * Note: The branch parameter imposes the measurement result on the given
+     * wire.
+     *
+     * @param wire Wire to measure.
+     * @param branch Branch 0 or 1.
+     */
+    void collapse(const std::size_t wire, const bool branch) {
+        PL_ABORT_IF_NOT(wire < this->getTotalNumQubits(),
+                        "Invalid wire index.");
+
+        std::vector<ComplexT> matrix(4, ComplexT(0.0, 0.0));
+
+        for (std::size_t i = 0; i < matrix.size(); i++) {
+            matrix[i] = ((i == 0 && branch == 0) || (i == 3 && branch == 1))
+                            ? ComplexT{1.0, 0.0}
+                            : ComplexT{0.0, 0.0};
+        }
+
+        mpi_manager_.Barrier();
+
+        applyMatrix(matrix, {wire}, false);
+
+        auto local_norm2 = norm2_CUDA<CFP_t>(
+            BaseType::getData(), BaseType::getLength(),
+            BaseType::getDataBuffer().getDevTag().getDeviceID(),
+            BaseType::getDataBuffer().getDevTag().getStreamID(),
+            this->getCublasCaller());
+
+        local_norm2 *= local_norm2;
+
+        mpi_manager_.Barrier();
+
+        auto norm2 = mpi_manager_.allreduce(local_norm2, "sum");
+
+        norm2 = std::sqrt(norm2);
+
+        normalize_CUDA<PrecisionT, CFP_t>(
+            norm2, BaseType::getData(), BaseType::getLength(),
+            BaseType::getDataBuffer().getDevTag().getDeviceID(),
+            BaseType::getDataBuffer().getDevTag().getStreamID(),
+            this->getCublasCaller());
+
+        mpi_manager_.Barrier();
+    }
+
     /**
      * @brief Apply a single gate to the state-vector. Offloads to custatevec
      * specific API calls if available. If unable, attempts to use prior cached
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
index 7b61c7a473..aa3429ceca 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -67,17 +67,6 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value,
                                const std::size_t index, bool async,
                                cudaStream_t stream_id);
 
-extern void collapseStateVector_CUDA(cuComplex *sv, const std::size_t num_sv,
-                                     const std::size_t stride, const bool k,
-                                     std::size_t thread_per_block,
-                                     cudaStream_t stream_id);
-
-extern void collapseStateVector_CUDA(cuDoubleComplex *sv,
-                                     const std::size_t num_sv,
-                                     const std::size_t stride, const bool k,
-                                     std::size_t thread_per_block,
-                                     cudaStream_t stream_id);
-
 extern void globalPhaseStateVector_CUDA(cuComplex *sv, std::size_t num_sv,
                                         cuComplex phase,
                                         std::size_t thread_per_block,
@@ -494,34 +483,35 @@ class StateVectorCudaManaged
      * Note: The branch parameter imposes the measurement result on the given
      * wire.
      *
-     * @tparam thread_per_block Number of threads per block. Default is 256.
      * @param wire Wire to measure.
      * @param branch Branch 0 or 1.
      */
-    template <std::size_t thread_per_block = 256>
     void collapse(const std::size_t wire, const bool branch) {
         PL_ABORT_IF_NOT(wire < BaseType::getNumQubits(), "Invalid wire index.");
 
-        const std::size_t stride = std::size_t{1U}
-                                   << (BaseType::getNumQubits() - (1 + wire));
-        // zero half the entries
-        // the "half" entries depend on the stride
-        // *_*_*_*_ for stride 1
-        // **__**__ for stride 2
-        // ****____ for stride 4
-        const bool k = branch ? 0 : 1;
-
-        collapseStateVector_CUDA(
-            BaseType::getData(), BaseType::getLength(), stride, k,
-            thread_per_block,
-            BaseType::getDataBuffer().getDevTag().getStreamID());
-
-        normalize_CUDA<CFP_t>(
+        std::vector<ComplexT> matrix(4, ComplexT(0.0, 0.0));
+
+        for (std::size_t i = 0; i < matrix.size(); i++) {
+            matrix[i] = ((i == 0 && branch == 0) || (i == 3 && branch == 1))
+                            ? ComplexT{1.0, 0.0}
+                            : ComplexT{0.0, 0.0};
+        }
+
+        applyMatrix(matrix, {wire}, false);
+
+        auto norm2 = norm2_CUDA<CFP_t>(
             BaseType::getData(), BaseType::getLength(),
             BaseType::getDataBuffer().getDevTag().getDeviceID(),
             BaseType::getDataBuffer().getDevTag().getStreamID(),
             this->getCublasCaller());
+
+        normalize_CUDA<PrecisionT, CFP_t>(
+            norm2, BaseType::getData(), BaseType::getLength(),
+            BaseType::getDataBuffer().getDevTag().getDeviceID(),
+            BaseType::getDataBuffer().getDevTag().getStreamID(),
+            this->getCublasCaller());
     }
+
     //****************************************************************************//
     // Explicit gate calls for bindings
     //****************************************************************************//
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu b/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu
index 6d0aee75a4..8a62e89e84 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu
@@ -106,66 +106,6 @@ void setStateVector_CUDA_call(GPUDataT *sv, index_type &num_indices,
     PL_CUDA_IS_SUCCESS(cudaGetLastError());
 }
 
-/**
- * @brief The CUDA kernel that collapses the state vector data on GPU device
- * based on the input values (on device) and their corresponding indices (on
- * device) information.
- *
- * @param sv Complex data pointer of state vector on device.
- * @param half_num_sv Number of state vector elements.
- * @param stride Number of elements in the input values array.
- * @param k Boolean flag to indicate whether to collapse or not.
- */
-template <class GPUDataT>
-__global__ void
-collapseStateVectorKernel(GPUDataT *sv, const std::size_t half_num_sv,
-                          const std::size_t stride, const bool k) {
-    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= half_num_sv) {
-        return;
-    }
-
-    const unsigned int id = i / stride;
-    const unsigned int ids = i % stride;
-    const unsigned int offset = (2 * id + k) * stride;
-    const unsigned int idx = offset + ids;
-
-    sv[idx].x = 0.0;
-    sv[idx].y = 0.0;
-}
-
-/**
- * @brief The CUDA kernel call wrapper that collapses the state vector data on
- * GPU device based on the input values (on device) and their corresponding
- * indices (on device) information.
- *
- * @param sv Complex data pointer of state vector on device.
- * @param num_sv Number of state vector elements.
- * @param stride Number of elements in the input values array.
- * @param k Boolean flag to indicate whether to collapse or not.
- * @param thread_per_block Number of threads set per block.
- * @param stream_id Stream id of CUDA calls
- */
-template <class GPUDataT>
-void collapseStateVector_CUDA_call(GPUDataT *sv, std::size_t num_sv,
-                                   const std::size_t stride, const bool k,
-                                   const std::size_t thread_per_block,
-                                   cudaStream_t stream_id) {
-    const std::size_t half_num_sv = num_sv / 2;
-    auto dv = std::div(static_cast<long long>(half_num_sv),
-                       static_cast<long long>(thread_per_block));
-
-    const std::size_t num_blocks = dv.quot + (dv.rem == 0 ? 0 : 1);
-    const std::size_t block_per_grid = (num_blocks == 0 ? 1 : num_blocks);
-    dim3 blockSize(thread_per_block, 1, 1);
-    dim3 gridSize(block_per_grid, 1);
-
-    collapseStateVectorKernel<GPUDataT>
-        <<<gridSize, blockSize, 0, stream_id>>>(sv, half_num_sv, stride, k);
-
-    PL_CUDA_IS_SUCCESS(cudaGetLastError());
-}
-
 /**
  * @brief The CUDA kernel that multiplies the state vector data on GPU device
  * by a global phase.
@@ -303,22 +243,6 @@ void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value,
     setBasisState_CUDA_call(sv, value, index, async, stream_id);
 }
 
-void collapseStateVector_CUDA(cuComplex *sv, const std::size_t num_sv,
-                              const std::size_t stride, const bool k,
-                              std::size_t thread_per_block,
-                              cudaStream_t stream_id) {
-    collapseStateVector_CUDA_call(sv, num_sv, stride, k, thread_per_block,
-                                  stream_id);
-}
-
-void collapseStateVector_CUDA(cuDoubleComplex *sv, std::size_t num_sv,
-                              const std::size_t stride, const bool k,
-                              std::size_t thread_per_block,
-                              cudaStream_t stream_id) {
-    collapseStateVector_CUDA_call(sv, num_sv, stride, k, thread_per_block,
-                                  stream_id);
-}
-
 void globalPhaseStateVector_CUDA(cuComplex *sv, std::size_t num_sv,
                                  cuComplex phase, std::size_t thread_per_block,
                                  cudaStream_t stream_id) {
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp
index 4b5a2dd349..401f6056b3 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp
@@ -317,4 +317,76 @@ TEMPLATE_PRODUCT_TEST_CASE("StateVectorCudaMPI::applyOperations",
                                          {false, false}, {{0.0}}),
             LightningException, "must all be equal"); // invalid parameters
     }
-}
\ No newline at end of file
+}
+
+TEMPLATE_TEST_CASE("StateVectorCudaMPI::collapse",
+                   "[StateVectorCudaMPI]", float, double) {
+    using PrecisionT = TestType;
+    using ComplexT = typename StateVectorCudaMPI<PrecisionT>::ComplexT;
+    using TestVectorT = TestVector<ComplexT>;
+
+    std::size_t wire = GENERATE(0, 1, 2);
+    std::size_t branch = GENERATE(0, 1);
+    const std::size_t num_qubits = 3;
+
+    using StateVectorT = StateVectorCudaMPI<TestType>;
+    MPIManager mpi_manager(MPI_COMM_WORLD);
+    REQUIRE(mpi_manager.getSize() == 2);
+
+    std::size_t mpi_buffersize = 1;
+
+    int nGlobalIndexBits =
+        std::bit_width(static_cast<unsigned int>(mpi_manager.getSize())) - 1;
+    int nLocalIndexBits = num_qubits - nGlobalIndexBits;
+    mpi_manager.Barrier();
+
+    int nDevices = 0; // Number of GPU devices per node
+    cudaGetDeviceCount(&nDevices);
+    REQUIRE(nDevices >= 2);
+    int deviceId = mpi_manager.getRank() % nDevices;
+    cudaSetDevice(deviceId);
+    DevTag<int> dt_local(deviceId, 0);
+    // Each rank builds the full state, then keeps only its scattered slice.
+    TestVectorT init_state = createPlusState_<ComplexT>(num_qubits);
+
+    std::size_t subSvLength = std::size_t{1U} << nLocalIndexBits;
+
+    mpi_manager.Barrier();
+
+    std::vector<ComplexT> local_state(subSvLength);
+
+    mpi_manager.Scatter(init_state.data(), local_state.data(), subSvLength, 0);
+    mpi_manager.Barrier();
+
+    // TODO @tomlqc use same template for testing all Lightning flavours?
+
+    SECTION("Collapse the state vector after having measured one of the "
+            "qubits.") {
+        const ComplexT coef{0.5, PrecisionT{0.0}};
+        const ComplexT zero{PrecisionT{0.0}, PrecisionT{0.0}};
+
+        std::vector<std::vector<std::vector<ComplexT>>> expected_state = {
+            {{coef, coef, coef, coef, zero, zero, zero, zero},
+             {coef, coef, zero, zero, coef, coef, zero, zero},
+             {coef, zero, coef, zero, coef, zero, coef, zero}},
+            {{zero, zero, zero, zero, coef, coef, coef, coef},
+             {zero, zero, coef, coef, zero, zero, coef, coef},
+             {zero, coef, zero, coef, zero, coef, zero, coef}},
+        };
+
+        StateVectorT sv(mpi_manager, dt_local, mpi_buffersize, nGlobalIndexBits,
+                        nLocalIndexBits);
+
+        sv.CopyHostDataToGpu(local_state.data(), local_state.size(), false);
+
+        sv.collapse(wire, branch);
+        // Compare against this rank's slice of the expected global state.
+        auto expected_local_state =
+            mpi_manager.scatter(expected_state[branch][wire], 0);
+
+        PrecisionT eps = std::numeric_limits<PrecisionT>::epsilon() * 1e2;
+        REQUIRE(isApproxEqual(
+            sv.getDataVector().data(), sv.getDataVector().size(),
+            expected_local_state.data(), expected_local_state.size(), eps));
+    }
+}
diff --git a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
index d8e189ead4..984a9d2358 100644
--- a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
+++ b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
@@ -277,8 +277,9 @@ inline auto scaleC_CUDA(const CFP_t a, T *v1, const int data_size,
 /**
  * @brief cuBLAS backed GPU data normalization.
  *
- * @tparam T Float data-type. Accepts float and double
- * @param a scaling factor
+ * @tparam CFP_t Complex float data-type. Accepts cuDoubleComplex and cuComplex
+ * @tparam DevTypeID Integer type of device id.
+ *
  * @param v1 Device data pointer
  * @param data_size Length of device data.
  * @param dev_id the device on which the function should be executed.
@@ -286,18 +287,44 @@ inline auto scaleC_CUDA(const CFP_t a, T *v1, const int data_size,
  * @param cublas the CublasCaller object that manages the cuBLAS handle.
  */
 template <class CFP_t = cuDoubleComplex, class DevTypeID = int>
-inline auto normalize_CUDA(CFP_t *v1, const int data_size, DevTypeID dev_id,
-                           cudaStream_t stream_id, const CublasCaller &cublas) {
+inline auto norm2_CUDA(CFP_t *v1, const int data_size, DevTypeID dev_id,
+                       cudaStream_t stream_id, const CublasCaller &cublas) {
     if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
                   std::is_same_v<CFP_t, double2>) {
         double norm{0.0};
         cublas.call(cublasDznrm2, dev_id, stream_id, data_size, v1, 1, &norm);
-        const double alpha = 1.0 / norm;
-        cublas.call(cublasZdscal, dev_id, stream_id, data_size, &alpha, v1, 1);
+        return norm;
     } else {
         float norm{0.0};
         cublas.call(cublasScnrm2, dev_id, stream_id, data_size, v1, 1, &norm);
-        const float alpha = 1.0 / norm;
+        return norm;
+    }
+}
+
+/**
+ * @brief cuBLAS backed GPU data normalization.
+ *
+ * @tparam T Float data-type. Accepts float and double
+ * @tparam CFP_t Complex float data-type. Accepts cuDoubleComplex and cuComplex
+ *
+ * @param norm2 Norm of the vector; used as the divisor, so must be non-zero.
+ * @param v1 Device data pointer
+ * @param data_size Length of device data.
+ * @param dev_id the device on which the function should be executed.
+ * @param stream_id the CUDA stream on which the operation should be executed.
+ * @param cublas the CublasCaller object that manages the cuBLAS handle.
+ */
+template <class T = double, class CFP_t = cuDoubleComplex,
+          class DevTypeID = int>
+inline auto normalize_CUDA(T norm2, CFP_t *v1, const int data_size,
+                           DevTypeID dev_id, cudaStream_t stream_id,
+                           const CublasCaller &cublas) {
+    if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
+                  std::is_same_v<CFP_t, double2>) {
+        const double alpha = 1.0 / norm2;
+        cublas.call(cublasZdscal, dev_id, stream_id, data_size, &alpha, v1, 1);
+    } else {
+        const float alpha = 1.0 / norm2;
         cublas.call(cublasCsscal, dev_id, stream_id, data_size, &alpha, v1, 1);
     }
 }

From b2a3758b10d1cd8d4da2373b5c305fc4b3f6a7ae Mon Sep 17 00:00:00 2001
From: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Date: Tue, 8 Oct 2024 00:03:23 +0000
Subject: [PATCH 05/23] Auto update version from '0.39.0-dev40' to
 '0.39.0-dev41'

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index c6e0b5050a..ab5a1f0f5d 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.39.0-dev40"
+__version__ = "0.39.0-dev41"

From 960dc842ecaaca82ab8672d291b96acfa5b6714e Mon Sep 17 00:00:00 2001
From: Shuli Shu <31480676+multiphaseCFD@users.noreply.github.com>
Date: Mon, 7 Oct 2024 20:03:08 -0400
Subject: [PATCH 06/23] Add native `setStateVector` support to `lightning.gpu`
 (#930)

Please complete the following checklist when submitting a PR:

- [ ] All new features must include a unit test.
If you've fixed a bug or added code that should be tested, add a test to
the
      [`tests`](../tests) directory!

- [ ] All new functions and code must be clearly commented and
documented.
If you do make documentation changes, make sure that the docs build and
      render correctly by running `make docs`.

- [x] Ensure that the test suite passes, by running `make test`.

- [x] Add a new entry to the `.github/CHANGELOG.md` file, summarizing
the
      change, and including a link back to the PR.

- [x] Ensure that code is properly formatted by running `make format`.

When all the above are checked, delete everything above the dashed
line and fill in the pull request template.

------------------------------------------------------------------------------------------------------------

**Context:**
[SC-74668]
`setStateVector` via state and wires is now supported natively in the C++
layer of `lightning.gpu`.

**Description of the Change:**

**Benefits:**

**Possible Drawbacks:**

**Related GitHub Issues:**

---------

Co-authored-by: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
---
 .github/CHANGELOG.md                          |   3 +
 pennylane_lightning/core/_version.py          |   2 +-
 .../lightning_gpu/StateVectorCudaMPI.hpp      | 150 ++++++++++++------
 .../lightning_gpu/StateVectorCudaManaged.hpp  | 101 ++++++++----
 .../lightning_gpu/bindings/LGPUBindings.hpp   |  25 +--
 .../bindings/LGPUBindingsMPI.hpp              |  25 +--
 .../Test_StateVectorCudaManaged_NonParam.cpp  |  63 +-------
 .../mpi/Test_StateVectorCudaMPI_NonParam.cpp  |  21 ++-
 .../lightning_gpu/_state_vector.py            |  29 +---
 .../lightning_gpu/lightning_gpu.py            |  11 +-
 10 files changed, 219 insertions(+), 211 deletions(-)

diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index b921117f4d..7fa9decd0c 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -46,6 +46,9 @@
 
 ### Improvements
 
+* Add `setStateVector(state, wire)` support to the `lightning.gpu` C++ layer.
+  [(#930)](https://github.com/PennyLaneAI/pennylane-lightning/pull/930)
+
 * Add zero-state initialization to both `StateVectorCudaManaged` and `StateVectorCudaMPI` constructors to remove the `reset_state` in the python layer ctor and refactor `setBasisState(state, wires)` in the C++ layer.
   [(#933)](https://github.com/PennyLaneAI/pennylane-lightning/pull/933)
   
diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index ab5a1f0f5d..c6e0b5050a 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.39.0-dev41"
+__version__ = "0.39.0-dev40"
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
index 97a05d546d..f985d1b19e 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
@@ -289,61 +289,48 @@ class StateVectorCudaMPI final
         BaseType::getDataBuffer().zeroInit();
         setBasisState_(value, index, use_async);
     }
+
     /**
-     * @brief Set values for a batch of elements of the state-vector. This
-     * method is implemented by the customized CUDA kernel defined in the
-     * DataBuffer class.
+     * @brief Set values for a batch of elements of the state-vector.
      *
-     * @param num_indices Number of elements to be passed to the state vector.
-     * @param values Pointer to values to be set for the target elements.
-     * @param indices Pointer to indices of the target elements.
-     * @param async Use an asynchronous memory copy.
+     * @param state_ptr Pointer to initial state data.
+     * @param num_states Length of initial state data.
+     * @param wires Wires.
+     * @param use_async Use an asynchronous memory copy. Default is false.
      */
-    template <class index_type, std::size_t thread_per_block = 256>
-    void setStateVector(const index_type num_indices,
-                        const std::complex<Precision> *values,
-                        const index_type *indices, const bool async = false) {
-        BaseType::getDataBuffer().zeroInit();
+    void setStateVector(const ComplexT *state_ptr, const std::size_t num_states,
+                        const std::vector<std::size_t> &wires,
+                        bool use_async = false) {
+        PL_ABORT_IF_NOT(num_states == Pennylane::Util::exp2(wires.size()),
+                        "Inconsistent state and wires dimensions.");
 
-        std::vector<index_type> indices_local;
-        std::vector<std::complex<Precision>> values_local;
+        const auto num_qubits = this->getTotalNumQubits();
 
-        for (std::size_t i = 0; i < static_cast<std::size_t>(num_indices);
-             i++) {
-            int index = indices[i];
-            PL_ASSERT(index >= 0);
-            std::size_t rankId =
-                static_cast<std::size_t>(index) >> BaseType::getNumQubits();
+        PL_ABORT_IF_NOT(std::find_if(wires.begin(), wires.end(),
+                                     [&num_qubits](const auto i) {
+                                         return i >= num_qubits;
+                                     }) == wires.end(),
+                        "Invalid wire index.");
 
-            if (rankId == mpi_manager_.getRank()) {
-                int local_index = static_cast<int>(
-                    compute_local_index(static_cast<std::size_t>(index),
-                                        this->getNumLocalQubits()));
-                indices_local.push_back(local_index);
-                values_local.push_back(values[i]);
+        using index_type =
+            typename std::conditional<std::is_same<PrecisionT, float>::value,
+                                      int32_t, int64_t>::type;
+
+        // Calculate the indices of the state-vector to be set.
+        // TODO: Could move to GPU/MPI calculation if the state size is large.
+        std::vector<index_type> indices(num_states);
+        const std::size_t num_wires = wires.size();
+        constexpr std::size_t one{1U};
+        for (std::size_t i = 0; i < num_states; i++) {
+            std::size_t index{0U};
+            for (std::size_t j = 0; j < num_wires; j++) {
+                const std::size_t bit = (i & (one << j)) >> j;
+                index |= bit << (num_qubits - 1 - wires[num_wires - 1 - j]);
             }
+            indices[i] = static_cast<index_type>(index);
         }
-
-        auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
-        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
-
-        index_type num_elements = indices_local.size();
-
-        DataBuffer<index_type, int> d_indices{
-            static_cast<std::size_t>(num_elements), device_id, stream_id, true};
-
-        DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
-                                        device_id, stream_id, true};
-
-        d_indices.CopyHostDataToGpu(indices_local.data(), d_indices.getLength(),
-                                    async);
-        d_values.CopyHostDataToGpu(values_local.data(), d_values.getLength(),
-                                   async);
-
-        setStateVector_CUDA(BaseType::getData(), num_elements,
-                            d_values.getData(), d_indices.getData(),
-                            thread_per_block, stream_id);
-        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        setStateVector_<index_type>(num_states, state_ptr, indices.data(),
+                                    use_async);
         mpi_manager_.Barrier();
     }
 
@@ -1596,6 +1583,62 @@ class StateVectorCudaMPI final
         return t_indices;
     }
 
+    /**
+     * @brief Set values for a batch of elements of the state-vector. This
+     * method is implemented by the customized CUDA kernel defined in the
+     * DataBuffer class.
+     *
+     * @param num_indices Number of elements to be passed to the state vector.
+     * @param values Pointer to values to be set for the target elements.
+     * @param indices Pointer to indices of the target elements.
+     * @param async Use an asynchronous memory copy.
+     */
+    template <class index_type, std::size_t thread_per_block = 256>
+    void setStateVector_(const index_type num_indices,
+                         const std::complex<Precision> *values,
+                         const index_type *indices, const bool async = false) {
+        BaseType::getDataBuffer().zeroInit();
+
+        std::vector<index_type> indices_local;
+        std::vector<std::complex<Precision>> values_local;
+
+        for (std::size_t i = 0; i < static_cast<std::size_t>(num_indices);
+             i++) {
+            int index = indices[i];
+            PL_ASSERT(index >= 0);
+            std::size_t rankId =
+                static_cast<std::size_t>(index) >> BaseType::getNumQubits();
+
+            if (rankId == mpi_manager_.getRank()) {
+                int local_index = static_cast<int>(
+                    compute_local_index(static_cast<std::size_t>(index),
+                                        this->getNumLocalQubits()));
+                indices_local.push_back(local_index);
+                values_local.push_back(values[i]);
+            }
+        }
+
+        auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
+        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
+
+        index_type num_elements = indices_local.size();
+
+        DataBuffer<index_type, int> d_indices{
+            static_cast<std::size_t>(num_elements), device_id, stream_id, true};
+
+        DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
+                                        device_id, stream_id, true};
+
+        d_indices.CopyHostDataToGpu(indices_local.data(), d_indices.getLength(),
+                                    async);
+        d_values.CopyHostDataToGpu(values_local.data(), d_values.getLength(),
+                                   async);
+
+        setStateVector_CUDA(BaseType::getData(), num_elements,
+                            d_values.getData(), d_indices.getData(),
+                            thread_per_block, stream_id);
+    }
+
     /**
      * @brief Set value for a single element of the state-vector on device. This
      * method is implemented by cudaMemcpy.
@@ -1685,8 +1728,8 @@ class StateVectorCudaMPI final
     }
 
     /**
-     * @brief Apply parametric Pauli gates to local statevector using custateVec
-     * calls.
+     * @brief Apply parametric Pauli gates to local statevector using
+     * custateVec calls.
      *
      * @param pauli_words List of Pauli words representing operation.
      * @param ctrls Control wires
@@ -1756,7 +1799,8 @@ class StateVectorCudaMPI final
             });
 
         // Initialize a vector to store the status of wires and default its
-        // elements as zeros, which assumes there is no target and control wire.
+        // elements as zeros, which assumes there is no target and control
+        // wire.
         std::vector<int> statusWires(this->getTotalNumQubits(),
                                      WireStatus::Default);
 
@@ -1916,7 +1960,8 @@ class StateVectorCudaMPI final
             });
 
         // Initialize a vector to store the status of wires and default its
-        // elements as zeros, which assumes there is no target and control wire.
+        // elements as zeros, which assumes there is no target and control
+        // wire.
         std::vector<int> statusWires(this->getTotalNumQubits(),
                                      WireStatus::Default);
 
@@ -2057,7 +2102,8 @@ class StateVectorCudaMPI final
             });
 
         // Initialize a vector to store the status of wires and default its
-        // elements as zeros, which assumes there is no target and control wire.
+        // elements as zeros, which assumes there is no target and control
+        // wire.
         std::vector<int> statusWires(this->getTotalNumQubits(),
                                      WireStatus::Default);
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
index aa3429ceca..4d31b03e3e 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -215,36 +215,46 @@ class StateVectorCudaManaged
     }
 
     /**
-     * @brief Set values for a batch of elements of the state-vector. This
-     * method is implemented by the customized CUDA kernel defined in the
-     * DataBuffer class.
+     * @brief Set values for a batch of elements of the state-vector.
      *
-     * @param num_indices Number of elements to be passed to the state vector.
-     * @param values Pointer to values to be set for the target elements.
-     * @param indices Pointer to indices of the target elements.
-     * @param async Use an asynchronous memory copy.
+     * @param state_ptr Pointer to the initial state data.
+     * @param num_states Length of the initial state data.
+     * @param wires Wires.
+     * @param use_async Use an asynchronous memory copy. Default is false.
      */
-    template <class index_type, std::size_t thread_per_block = 256>
-    void setStateVector(const index_type num_indices,
-                        const std::complex<Precision> *values,
-                        const index_type *indices, const bool async = false) {
-        BaseType::getDataBuffer().zeroInit();
-
-        auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
-        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
-
-        index_type num_elements = num_indices;
-        DataBuffer<index_type, int> d_indices{
-            static_cast<std::size_t>(num_elements), device_id, stream_id, true};
-        DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
-                                        device_id, stream_id, true};
+    void setStateVector(const ComplexT *state_ptr, const std::size_t num_states,
+                        const std::vector<std::size_t> &wires,
+                        bool use_async = false) {
+        PL_ABORT_IF_NOT(num_states == Pennylane::Util::exp2(wires.size()),
+                        "Inconsistent state and wires dimensions.");
 
-        d_indices.CopyHostDataToGpu(indices, d_indices.getLength(), async);
-        d_values.CopyHostDataToGpu(values, d_values.getLength(), async);
+        const auto num_qubits = BaseType::getNumQubits();
 
-        setStateVector_CUDA(BaseType::getData(), num_elements,
-                            d_values.getData(), d_indices.getData(),
-                            thread_per_block, stream_id);
+        PL_ABORT_IF_NOT(std::find_if(wires.begin(), wires.end(),
+                                     [&num_qubits](const auto i) {
+                                         return i >= num_qubits;
+                                     }) == wires.end(),
+                        "Invalid wire index.");
+
+        using index_type =
+            typename std::conditional<std::is_same<PrecisionT, float>::value,
+                                      int32_t, int64_t>::type;
+
+        // Calculate the indices of the state-vector to be set.
+        // TODO: Could move to GPU calculation if the state size is large.
+        std::vector<index_type> indices(num_states);
+        const std::size_t num_wires = wires.size();
+        constexpr std::size_t one{1U};
+        for (std::size_t i = 0; i < num_states; i++) {
+            std::size_t index{0U};
+            for (std::size_t j = 0; j < num_wires; j++) {
+                const std::size_t bit = (i & (one << j)) >> j;
+                index |= bit << (num_qubits - 1 - wires[num_wires - 1 - j]);
+            }
+            indices[i] = static_cast<index_type>(index);
+        }
+        setStateVector_<index_type>(num_states, state_ptr, indices.data(),
+                                    use_async);
     }
 
     /**
@@ -1381,9 +1391,8 @@ class StateVectorCudaManaged
         return t_indices;
     }
 
-    /**
-     * @brief Set value for a single element of the state-vector on device. This
-     * method is implemented by cudaMemcpy.
+    /** @brief Set value for a single element of the state-vector on device.
+     * This method is implemented by cudaMemcpy.
      *
      * @param value Value to be set for the target element.
      * @param index Index of the target element.
@@ -1397,6 +1406,40 @@ class StateVectorCudaManaged
                            stream_id);
     }
 
+    /**
+     * @brief Set values for a batch of elements of the state-vector. This
+     * method is implemented by the customized CUDA kernel defined in the
+     * DataBuffer class.
+     *
+     * @param num_indices Number of elements to be passed to the state vector.
+     * @param values Pointer to values to be set for the target elements.
+     * @param indices Pointer to indices of the target elements.
+     * @param async Use an asynchronous memory copy.
+     */
+    template <class index_type, std::size_t thread_per_block = 256>
+    void setStateVector_(const index_type num_indices,
+                         const std::complex<Precision> *values,
+                         const index_type *indices, const bool async = false) {
+        BaseType::getDataBuffer().zeroInit();
+
+        auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
+        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
+
+        index_type num_elements = num_indices;
+        DataBuffer<index_type, int> d_indices{
+            static_cast<std::size_t>(num_elements), device_id, stream_id, true};
+        DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
+                                        device_id, stream_id, true};
+
+        d_indices.CopyHostDataToGpu(indices, d_indices.getLength(), async);
+        d_values.CopyHostDataToGpu(values, d_values.getLength(), async);
+
+        setStateVector_CUDA(BaseType::getData(), num_elements,
+                            d_values.getData(), d_indices.getData(),
+                            thread_per_block, stream_id);
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+    }
+
     /**
      * @brief Apply parametric Pauli gates using custateVec calls.
      *
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
index b2d03eba5a..c361bd6ed9 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
@@ -63,10 +63,6 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
     using ParamT = PrecisionT;        // Parameter's data precision
     using np_arr_c = py::array_t<std::complex<ParamT>,
                                  py::array::c_style | py::array::forcecast>;
-    using np_arr_sparse_ind = typename std::conditional<
-        std::is_same<ParamT, float>::value,
-        py::array_t<int32_t, py::array::c_style | py::array::forcecast>,
-        py::array_t<int64_t, py::array::c_style | py::array::forcecast>>::type;
 
     registerGatesForStateVector<StateVectorT>(pyclass);
 
@@ -91,20 +87,15 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
             "Set the state vector to a basis state on GPU.")
         .def(
             "setStateVector",
-            [](StateVectorT &sv, const np_arr_sparse_ind &indices,
-               const np_arr_c &state, const bool use_async) {
-                using index_type = typename std::conditional<
-                    std::is_same<ParamT, float>::value, int32_t, int64_t>::type;
-
-                sv.template setStateVector<index_type>(
-                    static_cast<index_type>(indices.request().size),
-                    static_cast<std::complex<PrecisionT> *>(
-                        state.request().ptr),
-                    static_cast<index_type *>(indices.request().ptr),
-                    use_async);
+            [](StateVectorT &sv, const np_arr_c &state,
+               const std::vector<std::size_t> &wires, const bool async) {
+                const auto state_buffer = state.request();
+                const auto state_ptr =
+                    static_cast<const std::complex<ParamT> *>(state_buffer.ptr);
+                sv.setStateVector(state_ptr, state_buffer.size, wires, async);
             },
-            "Set State Vector on GPU with values and their corresponding "
-            "indices for the state vector on device")
+            "Set State Vector on GPU with values for the state vector and "
+            "wires on the host memory.")
         .def(
             "DeviceToDevice",
             [](StateVectorT &sv, const StateVectorT &other, bool async) {
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
index f6a933aca3..2d3313f694 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
@@ -63,10 +63,6 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) {
     using ParamT = PrecisionT;        // Parameter's data precision
     using np_arr_c = py::array_t<std::complex<ParamT>,
                                  py::array::c_style | py::array::forcecast>;
-    using np_arr_sparse_ind = typename std::conditional<
-        std::is_same<ParamT, float>::value,
-        py::array_t<int32_t, py::array::c_style | py::array::forcecast>,
-        py::array_t<int64_t, py::array::c_style | py::array::forcecast>>::type;
 
     registerGatesForStateVector<StateVectorT>(pyclass);
 
@@ -95,20 +91,15 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) {
             "Set the state vector to a basis state on GPU.")
         .def(
             "setStateVector",
-            [](StateVectorT &sv, const np_arr_sparse_ind &indices,
-               const np_arr_c &state, const bool use_async) {
-                using index_type = typename std::conditional<
-                    std::is_same<ParamT, float>::value, int32_t, int64_t>::type;
-
-                sv.template setStateVector<index_type>(
-                    static_cast<index_type>(indices.request().size),
-                    static_cast<std::complex<PrecisionT> *>(
-                        state.request().ptr),
-                    static_cast<index_type *>(indices.request().ptr),
-                    use_async);
+            [](StateVectorT &sv, const np_arr_c &state,
+               const std::vector<std::size_t> &wires, const bool async) {
+                const auto state_buffer = state.request();
+                const auto state_ptr =
+                    static_cast<const std::complex<ParamT> *>(state_buffer.ptr);
+                sv.setStateVector(state_ptr, state_buffer.size, wires, async);
             },
-            "Set State Vector on GPU with values and their corresponding "
-            "indices for the state vector on device")
+            "Set State Vector on GPU with values for the state vector and "
+            "wires on the host memory.")
         .def(
             "DeviceToDevice",
             [](StateVectorT &sv, const StateVectorT &other, bool async) {
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp
index dc0c1a7128..af864d8b01 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp
@@ -1069,68 +1069,15 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::SetStateVector",
         }
 
         StateVectorCudaManaged<TestType> sv{num_qubits};
-        sv.CopyHostDataToGpu(init_state.data(), init_state.size());
-
-        using index_type =
-            typename std::conditional<std::is_same<PrecisionT, float>::value,
-                                      int32_t, int64_t>::type;
-        // The setStates will shuffle the state vector values on the device with
-        // the following indices and values setting on host. For example, the
-        // values[i] is used to set the indices[i] th element of state vector on
-        // the device. For example, values[2] (init_state[5]) will be copied to
-        // indices[2]th or (4th) element of the state vector.
-        std::vector<index_type> indices = {0, 2, 4, 6, 1, 3, 5, 7};
-
-        std::vector<std::complex<PrecisionT>> values = {
-            init_state[1], init_state[3], init_state[5], init_state[7],
-            init_state[0], init_state[2], init_state[4], init_state[6]};
-
-        sv.template setStateVector<index_type>(values.size(), values.data(),
-                                               indices.data(), false);
 
-        CHECK(expected_state == Pennylane::Util::approx(sv.getDataVector()));
-    }
-}
-// LCOV_EXCL_START
-TEMPLATE_TEST_CASE("StateVectorCudaManaged::SetStateVectorwith_thread_setting",
-                   "[StateVectorCudaManaged_Nonparam]", float, double) {
-    using PrecisionT = TestType;
-    const std::size_t num_qubits = 3;
-    std::mt19937 re{1337};
-
-    SECTION("SetStates with a non-default GPU thread setting") {
-        auto init_state =
-            createRandomStateVectorData<PrecisionT>(re, num_qubits);
-        auto expected_state = init_state;
+        std::vector<std::complex<PrecisionT>> values(init_state.begin(),
+                                                     init_state.end());
 
-        for (std::size_t i = 0; i < Pennylane::Util::exp2(num_qubits - 1);
-             i++) {
-            std::swap(expected_state[i * 2], expected_state[i * 2 + 1]);
-        }
-
-        StateVectorCudaManaged<TestType> sv{num_qubits};
-        sv.CopyHostDataToGpu(init_state.data(), init_state.size());
-
-        using index_type =
-            typename std::conditional<std::is_same<PrecisionT, float>::value,
-                                      int32_t, int64_t>::type;
-
-        std::vector<index_type> indices = {0, 2, 4, 6, 1, 3, 5, 7};
-
-        std::vector<std::complex<PrecisionT>> values = {
-            init_state[1], init_state[3], init_state[5], init_state[7],
-            init_state[0], init_state[2], init_state[4], init_state[6]};
-
-        // default setting of the number of threads in a block is 256.
-        const std::size_t threads_per_block = 1024;
-
-        sv.template setStateVector<index_type, threads_per_block>(
-            values.size(), values.data(), indices.data(), false);
-
-        CHECK(expected_state == Pennylane::Util::approx(sv.getDataVector()));
+        sv.setStateVector(values.data(), values.size(),
+                          std::vector<std::size_t>{0, 1, 2});
+        CHECK(init_state == Pennylane::Util::approx(sv.getDataVector()));
     }
 }
-// LCOV_EXCL_STOP
 
 TEMPLATE_TEST_CASE("StateVectorCudaManaged::SetIthStates",
                    "[StateVectorCudaManaged_Nonparam]", float, double) {
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp
index b9ed7fcbef..968badd4dc 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp
@@ -15,6 +15,7 @@
 #include <complex>
 #include <iostream>
 #include <limits>
+#include <numeric>
 #include <type_traits>
 #include <utility>
 #include <vector>
@@ -158,21 +159,17 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::SetStateVector",
             "the host") {
         StateVectorCudaMPI<PrecisionT> sv(mpi_manager, dt_local, mpi_buffersize,
                                           nGlobalIndexBits, nLocalIndexBits);
-        // The setStates will shuffle the state vector values on the device with
-        // the following indices and values setting on host. For example, the
-        // values[i] is used to set the indices[i] th element of state vector on
-        // the device. For example, values[2] (init_state[5]) will be copied to
-        // indices[2]th or (4th) element of the state vector.
 
-        sv.template setStateVector<index_type>(
-            init_state.size(), init_state.data(), indices.data(), false);
+        std::vector<std::complex<PrecisionT>> values(init_state.begin(),
+                                                     init_state.end());
+        std::vector<std::size_t> wires(num_qubits);
+        std::iota(wires.begin(), wires.end(), 0);
+        sv.setStateVector(values.data(), values.size(), wires);
 
-        mpi_manager.Barrier();
-        sv.CopyGpuDataToHost(local_state.data(),
-                             static_cast<std::size_t>(subSvLength));
-        mpi_manager.Barrier();
+        auto expected_local_state_vector = mpi_manager.scatter<cp_t>(values, 0);
 
-        CHECK(expected_local_state == Pennylane::Util::approx(local_state));
+        CHECK(expected_local_state_vector ==
+              Pennylane::Util::approx(sv.getDataVector()));
     }
 }
 
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index faca301e8c..a000443563 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -31,7 +31,6 @@
 except ImportError as ex:
     warn(str(ex), UserWarning)
 
-from itertools import product
 from typing import Union
 
 import numpy as np
@@ -69,7 +68,7 @@ class LightningGPUStateVector(LightningBaseStateVector):
         device_name(string): state vector device name. Options: ["lightning.gpu"]
         mpi_handler(MPIHandler): MPI handler for PennyLane Lightning GPU device.
             Provides functionality to distribute the state-vector to multiple devices.
-        sync (bool): is host-device data copy synchronized or not.
+        use_async (bool): whether the host-device data copy is asynchronous.
     """
 
     def __init__(
@@ -77,7 +76,7 @@ def __init__(
         num_wires: int,
         dtype: Union[np.complex128, np.complex64] = np.complex128,
         mpi_handler: MPIHandler = None,
-        sync: bool = True,
+        use_async: bool = False,
     ):
 
         super().__init__(num_wires, dtype)
@@ -92,7 +91,7 @@ def __init__(
         self._num_local_wires = mpi_handler.num_local_wires
 
         self._mpi_handler = mpi_handler
-        self._sync = sync
+        self._use_async = use_async
 
         # Initialize the state vector
         if self._mpi_handler.use_mpi:  # using MPI
@@ -120,7 +119,7 @@ def _state_dtype(self):
         # without MPI
         return StateVectorC128 if self.dtype == np.complex128 else StateVectorC64
 
-    def syncD2H(self, state_vector, use_async=False):
+    def syncD2H(self, state_vector, use_async: bool = False):
         """Copy the state vector data on device to a state vector on the host provided by the user.
         Args:
             state_vector(array[complex]): the state vector array on host.
@@ -155,7 +154,7 @@ def state(self):
         self.syncD2H(state)
         return state
 
-    def syncH2D(self, state_vector, use_async=False):
+    def syncH2D(self, state_vector, use_async: bool = False):
         """Copy the state vector data on host provided by the user to the state vector on the device
         Args:
             state_vector(array[complex]): the state vector array on host.
@@ -189,7 +188,7 @@ def _asarray(arr, dtype=None):
 
         return arr
 
-    def _apply_state_vector(self, state, device_wires, use_async=False):
+    def _apply_state_vector(self, state, device_wires, use_async: bool = False):
         """Initialize the state vector on GPU with a specified state on host.
         Note that any use of this method will introduce host-overheads.
         Args:
@@ -224,20 +223,8 @@ def _apply_state_vector(self, state, device_wires, use_async=False):
             self.syncH2D(np.reshape(local_state, output_shape))
             return
 
-        # generate basis states on subset of qubits via the cartesian product
-        basis_states = np.array(list(product([0, 1], repeat=len(device_wires))))
-
-        # get basis states to alter on full set of qubits
-        unravelled_indices = np.zeros((2 ** len(device_wires), self.num_wires), dtype=int)
-        unravelled_indices[:, device_wires] = basis_states
-
-        # get indices for which the state is changed to input state vector elements
-        ravelled_indices = np.ravel_multi_index(unravelled_indices.T, [2] * self.num_wires)
-
-        # set the state vector on GPU with the unravelled_indices and their corresponding values
-        self._qubit_state.setStateVector(
-            ravelled_indices, state, use_async
-        )  # this operation on device
+        # set the state vector on GPU with provided state and their corresponding wires
+        self._qubit_state.setStateVector(state, list(device_wires), use_async)
 
     def _apply_lightning_controlled(self, operation):
         """Apply an arbitrary controlled operation to the state tensor.
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 84d7dd31e6..2b295c4990 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -296,7 +296,7 @@ class LightningGPU(LightningBase):
             is built with MPI. Default is False.
         mpi (bool): declare if the device will use the MPI support.
         mpi_buf_size (int): size of GPU memory (in MiB) set for MPI operation and its default value is 64 MiB.
-        sync (bool): is host-device data copy synchronized or not.
+        use_async (bool): whether host-device data copies are asynchronous.
     """
 
     # General device options
@@ -326,7 +326,7 @@ def __init__(  # pylint: disable=too-many-arguments
         # GPU and MPI arguments
         mpi: bool = False,
         mpi_buf_size: int = 0,
-        sync: bool = False,
+        use_async: bool = False,
     ):
         if not self._CPP_BINARY_AVAILABLE:
             raise ImportError(
@@ -349,13 +349,16 @@ def __init__(  # pylint: disable=too-many-arguments
 
         # GPU specific options
         self._dp = DevPool()
-        self._sync = sync
+        self._use_async = use_async
 
         # Creating the state vector
         self._mpi_handler = MPIHandler(mpi, mpi_buf_size, len(self.wires), c_dtype)
 
         self._statevector = self.LightningStateVector(
-            num_wires=len(self.wires), dtype=c_dtype, mpi_handler=self._mpi_handler, sync=self._sync
+            num_wires=len(self.wires),
+            dtype=c_dtype,
+            mpi_handler=self._mpi_handler,
+            use_async=self._use_async,
         )
 
     @property

From c7ea1a81112c0db6b4591dda49202d740a90b688 Mon Sep 17 00:00:00 2001
From: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Date: Tue, 8 Oct 2024 00:04:44 +0000
Subject: [PATCH 07/23] Auto update version from '0.39.0-dev40' to
 '0.39.0-dev41'

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index c6e0b5050a..ab5a1f0f5d 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.39.0-dev40"
+__version__ = "0.39.0-dev41"

From 829f8eb748b31882b163d9249a28ffc1e726df7f Mon Sep 17 00:00:00 2001
From: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Date: Tue, 8 Oct 2024 00:13:31 +0000
Subject: [PATCH 08/23] Auto update version from '0.39.0-dev40' to
 '0.39.0-dev41'

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index c6e0b5050a..ab5a1f0f5d 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.39.0-dev40"
+__version__ = "0.39.0-dev41"

From 2493260f20ffdf6018f1ecc8c26b43efc470cfee Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Tue, 8 Oct 2024 22:30:37 +0000
Subject: [PATCH 09/23] update collapse with custatevec apis

---
 .../lightning_gpu/StateVectorCudaMPI.hpp      | 91 +++++++++++++------
 .../lightning_gpu/StateVectorCudaManaged.hpp  | 47 ++++++----
 .../core/src/utils/cuda_utils/LinearAlg.hpp   | 55 -----------
 3 files changed, 92 insertions(+), 101 deletions(-)

diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
index f985d1b19e..389da68b47 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
@@ -347,37 +347,22 @@ class StateVectorCudaMPI final
         PL_ABORT_IF_NOT(wire < this->getTotalNumQubits(),
                         "Invalid wire index.");
 
-        std::vector<ComplexT> matrix(4, ComplexT(0.0, 0.0));
+        const int wireInt =
+            static_cast<int>(this->getTotalNumQubits() - 1 - wire);
 
-        for (std::size_t i = 0; i < matrix.size(); i++) {
-            matrix[i] = ((i == 0 && branch == 0) || (i == 3 && branch == 1))
-                            ? ComplexT{1.0, 0.0}
-                            : ComplexT{0.0, 0.0};
-        }
-
-        mpi_manager_.Barrier();
-
-        applyMatrix(matrix, {wire}, false);
-
-        auto local_norm2 = norm2_CUDA<CFP_t>(
-            BaseType::getData(), BaseType::getLength(),
-            BaseType::getDataBuffer().getDevTag().getDeviceID(),
-            BaseType::getDataBuffer().getDevTag().getStreamID(),
-            this->getCublasCaller());
-
-        local_norm2 *= local_norm2;
-
-        mpi_manager_.Barrier();
-
-        auto norm2 = mpi_manager_.allreduce(local_norm2, "sum");
-
-        norm2 = std::sqrt(norm2);
+        if (static_cast<std::size_t>(wireInt) < BaseType::getNumQubits()) {
+            // local wire
+            collapse_local_(wireInt, branch);
+        } else {
+            // global wire
+            int local_wire = 0;
+            std::vector<int2> wirePairs{make_int2(wireInt, local_wire)};
+            applyMPI_Dispatcher(wirePairs, &StateVectorCudaMPI::collapse_local_,
+                                local_wire, branch);
 
-        normalize_CUDA<PrecisionT, CFP_t>(
-            norm2, BaseType::getData(), BaseType::getLength(),
-            BaseType::getDataBuffer().getDevTag().getDeviceID(),
-            BaseType::getDataBuffer().getDevTag().getStreamID(),
-            this->getCublasCaller());
+            PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(localStream_.get()));
+            PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        }
 
         mpi_manager_.Barrier();
     }
@@ -1665,6 +1650,54 @@ class StateVectorCudaMPI final
         mpi_manager_.Barrier();
     }
 
+    /**
+     * @brief Collapse the local state vector onto one branch of a local wire.
+     *
+     * @param wire_local Local wire index, passed to cuStateVec as basis bit.
+     * @param branch Branch to keep; also used as the collapse parity.
+     */
+    void collapse_local_(const int wire_local, const bool branch) {
+        cudaDataType_t data_type;
+        // Select the cuStateVec data type matching the precision of CFP_t.
+        if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
+                      std::is_same_v<CFP_t, double2>) {
+            data_type = CUDA_C_64F;
+        } else {
+            data_type = CUDA_C_32F;
+        }
+
+        std::vector<int> basisBits(1, wire_local);
+
+        double abs2sum0_local, abs2sum1_local;
+        // Per-rank sums for the two values of the basis bit (local data only).
+        PL_CUSTATEVEC_IS_SUCCESS(custatevecAbs2SumOnZBasis(
+            /* custatevecHandle_t */ handle_.get(),
+            /* void *sv */ BaseType::getData(),
+            /* cudaDataType_t */ data_type,
+            /* const uint32_t nIndexBits */ BaseType::getNumQubits(),
+            /* double * */ &abs2sum0_local,
+            /* double * */ &abs2sum1_local,
+            /* const int32_t * */ basisBits.data(),
+            /* const uint32_t nBasisBits */ basisBits.size()));
+        // Sum the per-rank contributions across all MPI ranks.
+        auto abs2sum0 = mpi_manager_.allreduce(abs2sum0_local, "sum");
+        auto abs2sum1 = mpi_manager_.allreduce(abs2sum1_local, "sum");
+        // The global sum for the selected branch is the normalization factor.
+        double norm = (branch == 0) ? abs2sum0 : abs2sum1;
+
+        int parity = branch;
+
+        PL_CUSTATEVEC_IS_SUCCESS(custatevecCollapseOnZBasis(
+            /* custatevecHandle_t */ handle_.get(),
+            /* void *sv */ BaseType::getData(),
+            /* cudaDataType_t */ data_type,
+            /* const uint32_t nIndexBits */ BaseType::getNumQubits(),
+            /* const int32_t parity */ parity,
+            /* const int32_t *basisBits */ basisBits.data(),
+            /* const uint32_t nBasisBits */ basisBits.size(),
+            /* double norm */ norm));
+    }
+
     /**
      * @brief Get expectation value for a sum of Pauli words.
      *
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
index 4d31b03e3e..377b6d729d 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -498,28 +498,41 @@ class StateVectorCudaManaged
      */
     void collapse(const std::size_t wire, const bool branch) {
         PL_ABORT_IF_NOT(wire < BaseType::getNumQubits(), "Invalid wire index.");
+        cudaDataType_t data_type;
 
-        std::vector<ComplexT> matrix(4, ComplexT(0.0, 0.0));
-
-        for (std::size_t i = 0; i < matrix.size(); i++) {
-            matrix[i] = ((i == 0 && branch == 0) || (i == 3 && branch == 1))
-                            ? ComplexT{1.0, 0.0}
-                            : ComplexT{0.0, 0.0};
+        if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
+                      std::is_same_v<CFP_t, double2>) {
+            data_type = CUDA_C_64F;
+        } else {
+            data_type = CUDA_C_32F;
         }
 
-        applyMatrix(matrix, {wire}, false);
+        std::vector<int> basisBits(1, BaseType::getNumQubits() - 1 - wire);
 
-        auto norm2 = norm2_CUDA<CFP_t>(
-            BaseType::getData(), BaseType::getLength(),
-            BaseType::getDataBuffer().getDevTag().getDeviceID(),
-            BaseType::getDataBuffer().getDevTag().getStreamID(),
-            this->getCublasCaller());
+        double abs2sum0, abs2sum1;
+        PL_CUSTATEVEC_IS_SUCCESS(custatevecAbs2SumOnZBasis(
+            /* custatevecHandle_t */ handle_.get(),
+            /* void *sv */ BaseType::getData(),
+            /* cudaDataType_t */ data_type,
+            /* const uint32_t nIndexBits */ BaseType::getNumQubits(),
+            /* double * */ &abs2sum0,
+            /* double * */ &abs2sum1,
+            /* const int32_t * */ basisBits.data(),
+            /* const uint32_t nBasisBits */ basisBits.size()));
+
+        double norm = (branch == 0) ? abs2sum0 : abs2sum1;
 
-        normalize_CUDA<PrecisionT, CFP_t>(
-            norm2, BaseType::getData(), BaseType::getLength(),
-            BaseType::getDataBuffer().getDevTag().getDeviceID(),
-            BaseType::getDataBuffer().getDevTag().getStreamID(),
-            this->getCublasCaller());
+        int parity = branch;
+
+        PL_CUSTATEVEC_IS_SUCCESS(custatevecCollapseOnZBasis(
+            /* custatevecHandle_t */ handle_.get(),
+            /* void *sv */ BaseType::getData(),
+            /* cudaDataType_t */ data_type,
+            /* const uint32_t nIndexBits */ BaseType::getNumQubits(),
+            /* const int32_t parity */ parity,
+            /* const int32_t *basisBits */ basisBits.data(),
+            /* const uint32_t nBasisBits */ basisBits.size(),
+            /* double norm */ norm));
     }
 
     //****************************************************************************//
diff --git a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
index 984a9d2358..cd422899b5 100644
--- a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
+++ b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
@@ -274,61 +274,6 @@ inline auto scaleC_CUDA(const CFP_t a, T *v1, const int data_size,
                 data_type);
 }
 
-/**
- * @brief cuBLAS backed GPU data normalization.
- *
- * @tparam CFP_t Complex float data-type. Accepts cuDoubleComplex and cuComplex
- * @tparam DevTypeID Integer type of device id.
- *
- * @param v1 Device data pointer
- * @param data_size Length of device data.
- * @param dev_id the device on which the function should be executed.
- * @param stream_id the CUDA stream on which the operation should be executed.
- * @param cublas the CublasCaller object that manages the cuBLAS handle.
- */
-template <class CFP_t = cuDoubleComplex, class DevTypeID = int>
-inline auto norm2_CUDA(CFP_t *v1, const int data_size, DevTypeID dev_id,
-                       cudaStream_t stream_id, const CublasCaller &cublas) {
-    if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
-                  std::is_same_v<CFP_t, double2>) {
-        double norm{0.0};
-        cublas.call(cublasDznrm2, dev_id, stream_id, data_size, v1, 1, &norm);
-        return norm;
-    } else {
-        float norm{0.0};
-        cublas.call(cublasScnrm2, dev_id, stream_id, data_size, v1, 1, &norm);
-        return norm;
-    }
-}
-
-/**
- * @brief cuBLAS backed GPU data normalization.
- *
- * @tparam T Float data-type. Accepts float and double
- * @tparam CFP_t Complex float data-type. Accepts cuDoubleComplex and cuComplex
- *
- * @param norm2 Norm of the vector
- * @param v1 Device data pointer
- * @param data_size Length of device data.
- * @param dev_id the device on which the function should be executed.
- * @param stream_id the CUDA stream on which the operation should be executed.
- * @param cublas the CublasCaller object that manages the cuBLAS handle.
- */
-template <class T = double, class CFP_t = cuDoubleComplex,
-          class DevTypeID = int>
-inline auto normalize_CUDA(T norm2, CFP_t *v1, const int data_size,
-                           DevTypeID dev_id, cudaStream_t stream_id,
-                           const CublasCaller &cublas) {
-    if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
-                  std::is_same_v<CFP_t, double2>) {
-        const double alpha = 1.0 / norm2;
-        cublas.call(cublasZdscal, dev_id, stream_id, data_size, &alpha, v1, 1);
-    } else {
-        const float alpha = 1.0 / norm2;
-        cublas.call(cublasCsscal, dev_id, stream_id, data_size, &alpha, v1, 1);
-    }
-}
-
 /** @brief `%CudaScopedDevice` uses RAII to select a CUDA device context.
  *
  * @see https://taskflow.github.io/taskflow/classtf_1_1cudaScopedDevice.html

From 4238aeb3fe10dcdbe96a7deadf2e612948424a32 Mon Sep 17 00:00:00 2001
From: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Date: Tue, 8 Oct 2024 22:31:01 +0000
Subject: [PATCH 10/23] Auto update version from '0.39.0-dev41' to
 '0.39.0-dev42'

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index ab5a1f0f5d..59d75bd653 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.39.0-dev41"
+__version__ = "0.39.0-dev42"

From 15c7e2f6c374224c13b3cd742491a2d4447ed3ba Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Tue, 8 Oct 2024 23:14:55 +0000
Subject: [PATCH 11/23] add python layer

---
 .../lightning_gpu/bindings/LGPUBindings.hpp   |  2 ++
 .../bindings/LGPUBindingsMPI.hpp              |  2 ++
 .../lightning_gpu/_state_vector.py            | 35 +++++++++++++++++--
 .../lightning_gpu/lightning_gpu.py            | 32 ++++++++++++++---
 tests/test_native_mcm.py                      |  2 +-
 5 files changed, 64 insertions(+), 9 deletions(-)

diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
index c361bd6ed9..145097b30e 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
@@ -150,6 +150,8 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
             },
             py::arg("async") = false,
             "Initialize the statevector data to the |0...0> state")
+        .def("collapse", &StateVectorT::collapse,
+             "Collapse the statevector onto the 0 or 1 branch of a given wire.")
         .def(
             "apply",
             [](StateVectorT &sv, const std::string &str,
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
index 2d3313f694..360af02fe9 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
@@ -154,6 +154,8 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) {
             },
             py::arg("async") = false,
             "Initialize the statevector data to the |0...0> state")
+        .def("collapse", &StateVectorT::collapse,
+             "Collapse the statevector onto the 0 or 1 branch of a given wire.")
         .def(
             "apply",
             [](StateVectorT &sv, const std::string &str,
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index a000443563..77e453778b 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -36,13 +36,17 @@
 import numpy as np
 import pennylane as qml
 from pennylane import DeviceError
+from pennylane.measurements import MidMeasureMP
+from pennylane.ops import Conditional
 from pennylane.ops.op_math import Adjoint
+from pennylane.tape import QuantumScript
 from pennylane.wires import Wires
 
 # pylint: disable=ungrouped-imports
 from pennylane_lightning.core._serialize import global_phase_diagonal
 from pennylane_lightning.core._state_vector_base import LightningBaseStateVector
 
+from ._measurements import LightningGPUMeasurements
 from ._mpi_handler import MPIHandler
 
 gate_cache_needs_hash = (
@@ -247,15 +251,33 @@ def _apply_lightning_controlled(self, operation):
         matrix = global_phase_diagonal(param, self.wires, control_wires, control_values)
         state.apply(name, wires, inv, [[param]], matrix)
 
-    def _apply_lightning_midmeasure(self):
+    def _apply_lightning_midmeasure(
+        self, operation: MidMeasureMP, mid_measurements: dict, postselect_mode: str
+    ):
         """Execute a MidMeasureMP operation and return the sample in mid_measurements.
 
         Args:
+            operation (~pennylane.operation.Operation): mid-circuit measurement
+            mid_measurements (None, dict): Dictionary of mid-circuit measurements
+            postselect_mode (str): Configuration for handling shots with mid-circuit measurement
+                postselection. Use ``"hw-like"`` to discard invalid shots and ``"fill-shots"`` to
+                keep the same number of shots.
 
         Returns:
             None
         """
-        raise DeviceError("LightningGPU does not support Mid-circuit measurements.")
+        wires = self.wires.indices(operation.wires)
+        wire = list(wires)[0]
+        if postselect_mode == "fill-shots" and operation.postselect is not None:
+            sample = operation.postselect
+        else:
+            circuit = QuantumScript([], [qml.sample(wires=operation.wires)], shots=1)
+            sample = LightningGPUMeasurements(self).measure_final_state(circuit)
+            sample = np.squeeze(sample)
+        mid_measurements[operation] = sample
+        getattr(self.state_vector, "collapse")(wire, bool(sample))
+        if operation.reset and bool(sample):
+            self.apply_operations([qml.PauliX(operation.wires)], mid_measurements=mid_measurements)
 
     # pylint: disable=unused-argument
     def _apply_lightning(
@@ -289,7 +311,14 @@ def _apply_lightning(
             method = getattr(state, name, None)
             wires = list(operation.wires)
 
-            if method is not None:  # apply specialized gate
+            if isinstance(operation, Conditional):
+                if operation.meas_val.concretize(mid_measurements):
+                    self._apply_lightning([operation.base])
+            elif isinstance(operation, MidMeasureMP):
+                self._apply_lightning_midmeasure(
+                    operation, mid_measurements, postselect_mode=postselect_mode
+                )
+            elif method is not None:  # apply specialized gate
                 param = operation.parameters
                 method(wires, invert_param, param)
             elif isinstance(operation, qml.ops.Controlled) and isinstance(
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 2b295c4990..c568caceb2 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -173,10 +173,7 @@ def stopping_condition(op: Operator) -> bool:
 def stopping_condition_shots(op: Operator) -> bool:
     """A function that determines whether or not an operation is supported by ``lightning.gpu``
     with finite shots."""
-    if isinstance(op, (MidMeasureMP, qml.ops.op_math.Conditional)):
-        # LightningGPU does not support Mid-circuit measurements.
-        return False
-    return stopping_condition(op)
+    return stopping_condition(op) or isinstance(op, (MidMeasureMP, qml.ops.op_math.Conditional))
 
 
 def accepted_observables(obs: Operator) -> bool:
@@ -460,6 +457,7 @@ def execute(
                 self.simulate(
                     circuit,
                     self._statevector,
+                    postselect_mode=execution_config.mcm_config.postselect_mode,
                 )
             )
 
@@ -494,20 +492,44 @@ def simulate(
         self,
         circuit: QuantumScript,
         state: LightningGPUStateVector,
+        postselect_mode: Optional[str] = None,
     ) -> Result:
         """Simulate a single quantum script.
 
         Args:
             circuit (QuantumTape): The single circuit to simulate
             state (LightningGPUStateVector): handle to Lightning state vector
+            postselect_mode (str): Configuration for handling shots with mid-circuit measurement
+                postselection. Use ``"hw-like"`` to discard invalid shots and ``"fill-shots"`` to
+                keep the same number of shots. Default is ``None``.
 
         Returns:
             Tuple[TensorLike]: The results of the simulation
 
         Note that this function can return measurements for non-commuting observables simultaneously.
         """
+        #if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)):
+        #    raise qml.DeviceError("LightningGPU does not support Mid-circuit measurements.")
         if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)):
-            raise qml.DeviceError("LightningGPU does not support Mid-circuit measurements.")
+            results = []
+            aux_circ = qml.tape.QuantumScript(
+                circuit.operations,
+                circuit.measurements,
+                shots=[1],
+                trainable_params=circuit.trainable_params,
+            )
+            for _ in range(circuit.shots.total_shots):
+                state.reset_state()
+                mid_measurements = {}
+                final_state = state.get_final_state(
+                    aux_circ, mid_measurements=mid_measurements, postselect_mode=postselect_mode
+                )
+                results.append(
+                    self.LightningMeasurements(final_state).measure_final_state(
+                        aux_circ, mid_measurements=mid_measurements
+                    )
+                )
+            return tuple(results)
 
         state.reset_state()
         final_state = state.get_final_state(circuit)
diff --git a/tests/test_native_mcm.py b/tests/test_native_mcm.py
index 07281fb48a..516edab0cc 100644
--- a/tests/test_native_mcm.py
+++ b/tests/test_native_mcm.py
@@ -21,7 +21,7 @@
 from conftest import LightningDevice, device_name, validate_measurements
 from flaky import flaky
 
-if device_name not in ("lightning.qubit", "lightning.kokkos"):
+if device_name not in ("lightning.qubit", "lightning.kokkos", "lightning.gpu"):
     pytest.skip("Native MCM not supported. Skipping.", allow_module_level=True)
 
 if not LightningDevice._CPP_BINARY_AVAILABLE:  # pylint: disable=protected-access

From 6dc7c03223d90f57d2508cbb978a423a964d1e57 Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Tue, 8 Oct 2024 23:16:33 +0000
Subject: [PATCH 12/23] update changelog

---
 .github/CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index 7fa9decd0c..4efd8f948b 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -2,7 +2,7 @@
 
 ### New features since last release
 
-* Add `collapse()` support to `lightning.gpu` C++ layer.
+* Add `mid-circuit measurements` support to `lightning.gpu`.
   [(#931)](https://github.com/PennyLaneAI/pennylane-lightning/pull/931)
 
 * Add Matrix Product Operator (MPO) for all gates support to `lightning.tensor`. Note current C++ implementation only works for MPO sites data provided by users.

From a57b5eeed2e873a698d3f14ca14220571d9da4c6 Mon Sep 17 00:00:00 2001
From: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Date: Tue, 8 Oct 2024 23:21:21 +0000
Subject: [PATCH 13/23] Auto update version from '0.39.0-dev41' to
 '0.39.0-dev42'

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index ab5a1f0f5d..59d75bd653 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.39.0-dev41"
+__version__ = "0.39.0-dev42"

From d608f06e064bcc2ba412f71308bf18a2a4f67249 Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Wed, 9 Oct 2024 00:08:30 +0000
Subject: [PATCH 14/23] make format

---
 pennylane_lightning/core/src/utils/Util.hpp        | 2 +-
 pennylane_lightning/lightning_gpu/lightning_gpu.py | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/pennylane_lightning/core/src/utils/Util.hpp b/pennylane_lightning/core/src/utils/Util.hpp
index 5544c96ba9..5478cdbdcb 100644
--- a/pennylane_lightning/core/src/utils/Util.hpp
+++ b/pennylane_lightning/core/src/utils/Util.hpp
@@ -21,12 +21,12 @@
 #include <algorithm>
 #include <cmath>
 #include <complex>
+#include <concepts> // integral, floating_point
 #include <numbers>
 #include <numeric> // transform_reduce
 #include <set>
 #include <type_traits> // is_same_v
 #include <vector>
-#include <concepts> // integral, floating_point
 
 #include "Error.hpp"
 #include "TypeTraits.hpp" // remove_complex_t
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index c568caceb2..3407f5cbf9 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -508,8 +508,6 @@ def simulate(
 
         Note that this function can return measurements for non-commuting observables simultaneously.
         """
-        #if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)):
-        #    raise qml.DeviceError("LightningGPU does not support Mid-circuit measurements.")
         if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)):
             results = []
             aux_circ = qml.tape.QuantumScript(

From 63facdd28e131742a44c17734ea93cfe2dc2640f Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Wed, 9 Oct 2024 15:26:55 +0000
Subject: [PATCH 15/23] drop mpi support for mid-measurement

---
 .github/CHANGELOG.md                          |  2 +-
 mpitests/test_native_mcm.py                   | 48 +++++++++++++++++++
 .../bindings/LGPUBindingsMPI.hpp              |  2 -
 .../lightning_gpu/lightning_gpu.py            |  4 ++
 .../lightning_gpu/lightning_gpu.toml          |  2 +-
 tests/test_native_mcm.py                      |  2 +-
 6 files changed, 55 insertions(+), 5 deletions(-)
 create mode 100644 mpitests/test_native_mcm.py

diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index f20bf9324e..6cfe74e276 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -2,7 +2,7 @@
 
 ### New features since last release
 
-* Add `mid-circuit measurements` support to `lightning.gpu`.
+* Add `mid-circuit measurements` support to `lightning.gpu`'s single-GPU backend.
   [(#931)](https://github.com/PennyLaneAI/pennylane-lightning/pull/931)
 
 * Add Matrix Product Operator (MPO) for all gates support to `lightning.tensor`. Note current C++ implementation only works for MPO sites data provided by users.
diff --git a/mpitests/test_native_mcm.py b/mpitests/test_native_mcm.py
new file mode 100644
index 0000000000..758376af35
--- /dev/null
+++ b/mpitests/test_native_mcm.py
@@ -0,0 +1,48 @@
+# Copyright 2024 Xanadu Quantum Technologies Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for default qubit preprocessing."""
+from mpi4py import MPI
+import numpy as np
+import pennylane as qml
+import pytest
+from conftest import LightningDevice, device_name
+
+if device_name not in ("lightning.qubit", "lightning.kokkos", "lightning.gpu"):
+    pytest.skip("Native MCM not supported. Skipping.", allow_module_level=True)
+
+if not LightningDevice._CPP_BINARY_AVAILABLE:  # pylint: disable=protected-access
+    pytest.skip("No binary module found. Skipping.", allow_module_level=True)
+
+
+def test_unspported_mid_measurement():
+    """Test unsupported mid_measurement for lightning.gpu-mpi."""
+    comm = MPI.COMM_WORLD
+    dev = qml.device(device_name, wires=2, mpi=True, shots=1000)
+    params = np.pi / 4 * np.ones(2)
+
+    @qml.qnode(dev)
+    def func(x, y):
+        qml.RX(x, wires=0)
+        m0 = qml.measure(0)
+        qml.cond(m0, qml.RY)(y, wires=1)
+        return qml.probs(wires=0)
+
+    comm.Barrier()
+
+    with pytest.raises(
+        qml.DeviceError,
+        match=f"LightningGPU-MPI does not support Mid-circuit measurements.",
+    ):
+        func(*params)
+
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
index 360af02fe9..2d3313f694 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
@@ -154,8 +154,6 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) {
             },
             py::arg("async") = false,
             "Initialize the statevector data to the |0...0> state")
-        .def("collapse", &StateVectorT::collapse,
-             "Collapse the statevector onto the 0 or 1 branch of a given wire.")
         .def(
             "apply",
             [](StateVectorT &sv, const std::string &str,
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 3407f5cbf9..2794e126d7 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -350,6 +350,7 @@ def __init__(  # pylint: disable=too-many-arguments
 
         # Creating the state vector
         self._mpi_handler = MPIHandler(mpi, mpi_buf_size, len(self.wires), c_dtype)
+        self._use_mpi = mpi
 
         self._statevector = self.LightningStateVector(
             num_wires=len(self.wires),
@@ -509,6 +510,9 @@ def simulate(
         Note that this function can return measurements for non-commuting observables simultaneously.
         """
         if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)):
+            if self._use_mpi :
+                raise qml.DeviceError("LightningGPU-MPI does not support Mid-circuit measurements.")
+
             results = []
             aux_circ = qml.tape.QuantumScript(
                 circuit.operations,
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.toml b/pennylane_lightning/lightning_gpu/lightning_gpu.toml
index 518315de09..b18470da6b 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.toml
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.toml
@@ -98,7 +98,7 @@ qjit_compatible = false
 # If the device requires run time generation of the quantum circuit.
 runtime_code_generation = false
 # If the device supports mid circuit measurements natively
-mid_circuit_measurement = false
+mid_circuit_measurement = true
 
 # This field is currently unchecked but it is reserved for the purpose of
 # determining if the device supports dynamic qubit allocation/deallocation.
diff --git a/tests/test_native_mcm.py b/tests/test_native_mcm.py
index 516edab0cc..050e1d27c6 100644
--- a/tests/test_native_mcm.py
+++ b/tests/test_native_mcm.py
@@ -89,7 +89,7 @@ def func(x, y):
             match=f"not accepted with finite shots on lightning.qubit",
         ):
             func(*params)
-    if device_name == "lightning.kokkos":
+    if device_name in ("lightning.kokkos", "lightning.gpu"):
         with pytest.raises(
             qml.DeviceError,
             match=r"Measurement shadow\(wires=\[0\]\) not accepted with finite shots on "

From 53ec72b5dc992df5115f7e834c89c2d0850f8b8e Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Wed, 9 Oct 2024 15:27:26 +0000
Subject: [PATCH 16/23] make format

---
 mpitests/test_native_mcm.py                        | 3 +--
 pennylane_lightning/lightning_gpu/lightning_gpu.py | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/mpitests/test_native_mcm.py b/mpitests/test_native_mcm.py
index 758376af35..af489f7160 100644
--- a/mpitests/test_native_mcm.py
+++ b/mpitests/test_native_mcm.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tests for default qubit preprocessing."""
-from mpi4py import MPI
 import numpy as np
 import pennylane as qml
 import pytest
 from conftest import LightningDevice, device_name
+from mpi4py import MPI
 
 if device_name not in ("lightning.qubit", "lightning.kokkos", "lightning.gpu"):
     pytest.skip("Native MCM not supported. Skipping.", allow_module_level=True)
@@ -45,4 +45,3 @@ def func(x, y):
         match=f"LightningGPU-MPI does not support Mid-circuit measurements.",
     ):
         func(*params)
-
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 2794e126d7..4a0f48fb5a 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -510,7 +510,7 @@ def simulate(
         Note that this function can return measurements for non-commuting observables simultaneously.
         """
         if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)):
-            if self._use_mpi :
+            if self._use_mpi:
                 raise qml.DeviceError("LightningGPU-MPI does not support Mid-circuit measurements.")
 
             results = []

From 410d694cff05f79f5bfaffaf62286eee60eee159 Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Thu, 10 Oct 2024 13:51:43 +0000
Subject: [PATCH 17/23] test mpi support

---
 mpitests/conftest.py                          |  84 +++++
 mpitests/test_native_mcm.py                   | 324 +++++++++++++++++-
 .../bindings/LGPUBindingsMPI.hpp              |   2 +
 .../lightning_gpu/lightning_gpu.py            |   4 -
 4 files changed, 402 insertions(+), 12 deletions(-)

diff --git a/mpitests/conftest.py b/mpitests/conftest.py
index 552cf9f330..6c079c86de 100644
--- a/mpitests/conftest.py
+++ b/mpitests/conftest.py
@@ -18,6 +18,8 @@
 
 import itertools
 import os
+from functools import reduce
+from typing import Sequence
 
 import pennylane as qml
 import pytest
@@ -125,3 +127,85 @@ def _device(wires):
         )
 
     return _device
+
+
+#######################################################################
+
+
+def validate_counts(shots, results1, results2):
+    """Compares two counts.
+
+    If the results are ``Sequence``s, loop over entries.
+
+    Fails if a key of ``results1`` is not found in ``results2``.
+    Passes if counts are too low, chosen as ``100``.
+    Otherwise, fails if counts differ by more than ``20`` plus 20 percent.
+    """
+    if isinstance(results1, Sequence):
+        assert isinstance(results2, Sequence)
+        assert len(results1) == len(results2)
+        for r1, r2 in zip(results1, results2):
+            validate_counts(shots, r1, r2)
+        return
+    for key1, val1 in results1.items():
+        val2 = results2[key1]
+        if abs(val1 + val2) > 100:
+            assert np.allclose(val1, val2, rtol=20, atol=0.2)
+
+
+def validate_samples(shots, results1, results2):
+    """Compares two samples.
+
+    If the results are ``Sequence``s, loop over entries.
+
+    Fails if the results do not have the same shape, within ``20`` entries plus 20 percent.
+    This is to handle cases when post-selection yields variable shapes.
+    Otherwise, fails if the sums of samples differ by more than ``20`` plus 20 percent.
+    """
+    if isinstance(shots, Sequence):
+        assert isinstance(results1, Sequence)
+        assert isinstance(results2, Sequence)
+        assert len(results1) == len(results2)
+        for s, r1, r2 in zip(shots, results1, results2):
+            validate_samples(s, r1, r2)
+    else:
+        sh1, sh2 = results1.shape[0], results2.shape[0]
+        assert np.allclose(sh1, sh2, rtol=20, atol=0.2)
+        assert results1.ndim == results2.ndim
+        if results2.ndim > 1:
+            assert results1.shape[1] == results2.shape[1]
+        np.allclose(np.sum(results1), np.sum(results2), rtol=20, atol=0.2)
+
+
+def validate_others(shots, results1, results2):
+    """Compares two expval, probs or var.
+
+    If the results are ``Sequence``s, validate the average of items.
+
+    If ``shots is None``, validate using ``np.allclose``'s default parameters.
+    Otherwise, fails if the results do not match within ``0.01`` plus 20 percent.
+    """
+    if isinstance(results1, Sequence):
+        assert isinstance(results2, Sequence)
+        assert len(results1) == len(results2)
+        results1 = reduce(lambda x, y: x + y, results1) / len(results1)
+        results2 = reduce(lambda x, y: x + y, results2) / len(results2)
+        validate_others(shots, results1, results2)
+        return
+    if shots is None:
+        assert np.allclose(results1, results2)
+        return
+    assert np.allclose(results1, results2, atol=0.01, rtol=0.2)
+
+
+def validate_measurements(func, shots, results1, results2):
+    """Calls the correct validation function based on measurement type."""
+    if func is qml.counts:
+        validate_counts(shots, results1, results2)
+        return
+
+    if func is qml.sample:
+        validate_samples(shots, results1, results2)
+        return
+
+    validate_others(shots, results1, results2)
diff --git a/mpitests/test_native_mcm.py b/mpitests/test_native_mcm.py
index af489f7160..563de142be 100644
--- a/mpitests/test_native_mcm.py
+++ b/mpitests/test_native_mcm.py
@@ -12,10 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tests for default qubit preprocessing."""
+from functools import reduce
+from typing import Sequence
+
 import numpy as np
 import pennylane as qml
 import pytest
-from conftest import LightningDevice, device_name
+from conftest import LightningDevice, device_name, validate_measurements
+from flaky import flaky
 from mpi4py import MPI
 
 if device_name not in ("lightning.qubit", "lightning.kokkos", "lightning.gpu"):
@@ -25,8 +29,53 @@
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
 
 
-def test_unspported_mid_measurement():
-    """Test unsupported mid_measurement for lightning.gpu-mpi."""
+def get_device(wires, **kwargs):
+    kwargs.setdefault("shots", None)
+    return qml.device(device_name, wires=wires, mpi=True, **kwargs)
+
+
+def test_all_invalid_shots_circuit():
+    """Test all invalid cases: expval, probs, var measurements."""
+    comm = MPI.COMM_WORLD
+    dev = qml.device(device_name, wires=2)
+    dq = qml.device("default.qubit", wires=2)
+
+    def circuit_op():
+        m = qml.measure(0, postselect=1)
+        qml.cond(m, qml.PauliX)(1)
+        return (
+            qml.expval(op=qml.PauliZ(1)),
+            qml.probs(op=qml.PauliY(0) @ qml.PauliZ(1)),
+            qml.var(op=qml.PauliZ(1)),
+        )
+
+    comm.Barrier()
+    res1 = qml.QNode(circuit_op, dq)()
+    res2 = qml.QNode(circuit_op, dev)(shots=10)
+    for r1, r2 in zip(res1, res2):
+        if isinstance(r1, Sequence):
+            assert len(r1) == len(r2)
+        assert np.all(np.isnan(r1))
+        assert np.all(np.isnan(r2))
+
+    def circuit_mcm():
+        m = qml.measure(0, postselect=1)
+        qml.cond(m, qml.PauliX)(1)
+        return qml.expval(op=m), qml.probs(op=m), qml.var(op=m)
+
+    res1 = qml.QNode(circuit_mcm, dq)()
+    res2 = qml.QNode(circuit_mcm, dev)(shots=10)
+
+    comm.Barrier()
+    for r1, r2 in zip(res1, res2):
+        if isinstance(r1, Sequence):
+            assert len(r1) == len(r2)
+        assert np.all(np.isnan(r1))
+        assert np.all(np.isnan(r2))
+
+
+def test_unsupported_measurement():
+    """Test unsupported ``qml.classical_shadow`` measurement on ``lightning.gpu`` ."""
     comm = MPI.COMM_WORLD
     dev = qml.device(device_name, wires=2, mpi=True, shots=1000)
     params = np.pi / 4 * np.ones(2)
@@ -36,12 +85,271 @@ def func(x, y):
         qml.RX(x, wires=0)
         m0 = qml.measure(0)
         qml.cond(m0, qml.RY)(y, wires=1)
-        return qml.probs(wires=0)
+        return qml.classical_shadow(wires=0)
+
+    comm.Barrier()
+    if device_name == "lightning.qubit":
+        with pytest.raises(
+            qml.DeviceError,
+            match=f"not accepted with finite shots on lightning.qubit",
+        ):
+            func(*params)
+    if device_name in ("lightning.kokkos", "lightning.gpu"):
+        with pytest.raises(
+            qml.DeviceError,
+            match=r"Measurement shadow\(wires=\[0\]\) not accepted with finite shots on "
+            + device_name,
+        ):
+            func(*params)
+
+
+@pytest.mark.parametrize("mcm_method", ["deferred", "one-shot"])
+def test_qnode_mcm_method(mcm_method, mocker):
+    """Test that user specified qnode arg for mid-circuit measurements transform are used correctly"""
+    comm = MPI.COMM_WORLD
+    spy = (
+        mocker.spy(qml.dynamic_one_shot, "_transform")
+        if mcm_method == "one-shot"
+        else mocker.spy(qml.defer_measurements, "_transform")
+    )
+    other_spy = (
+        mocker.spy(qml.defer_measurements, "_transform")
+        if mcm_method == "one-shot"
+        else mocker.spy(qml.dynamic_one_shot, "_transform")
+    )
+
+    shots = 10
+    device = qml.device(device_name, wires=3, mpi=True, shots=shots)
+    comm.Barrier()
+
+    @qml.qnode(device, mcm_method=mcm_method)
+    def f(x):
+        qml.RX(x, 0)
+        _ = qml.measure(0)
+        qml.CNOT([0, 1])
+        return qml.sample(wires=[0, 1])
+
+    _ = f(np.pi / 8)
+    comm.Barrier()
+
+    spy.assert_called_once()
+    other_spy.assert_not_called()
+
+
+@pytest.mark.parametrize("postselect_mode", ["hw-like", "fill-shots"])
+def test_qnode_postselect_mode(postselect_mode):
+    """Test that user specified qnode arg for discarding invalid shots is used correctly"""
+    comm = MPI.COMM_WORLD
+    shots = 100
+    device = qml.device(device_name, wires=3, mpi=True, shots=shots)
+    postselect = 1
+
+    @qml.qnode(device, postselect_mode=postselect_mode)
+    def f(x):
+        qml.RX(x, 0)
+        _ = qml.measure(0, postselect=postselect)
+        qml.CNOT([0, 1])
+        return qml.sample(wires=[1])
+
+    # Using small-ish rotation angle ensures the number of valid shots will be less than the
+    # original number of shots. This helps avoid stochastic failures for the assertion below
+    res = f(np.pi / 2)
+
+    comm.Barrier()
+
+    if postselect_mode == "hw-like":
+        assert len(res) < shots
+    else:
+        assert len(res) == shots
+    assert np.allclose(res, postselect)
+
+
+# pylint: disable=unused-argument
+def obs_tape(x, y, z, reset=False, postselect=None):
+    qml.RX(x, 0)
+    qml.RZ(np.pi / 4, 0)
+    m0 = qml.measure(0, reset=reset)
+    qml.cond(m0 == 0, qml.RX)(np.pi / 4, 0)
+    qml.cond(m0 == 0, qml.RZ)(np.pi / 4, 0)
+    qml.cond(m0 == 1, qml.RX)(-np.pi / 4, 0)
+    qml.cond(m0 == 1, qml.RZ)(-np.pi / 4, 0)
+    qml.RX(y, 1)
+    qml.RZ(np.pi / 4, 1)
+    m1 = qml.measure(1, postselect=postselect)
+    qml.cond(m1 == 0, qml.RX)(np.pi / 4, 1)
+    qml.cond(m1 == 0, qml.RZ)(np.pi / 4, 1)
+    qml.cond(m1 == 1, qml.RX)(-np.pi / 4, 1)
+    qml.cond(m1 == 1, qml.RZ)(-np.pi / 4, 1)
+    return m0, m1
+
+
+@flaky(max_runs=5)
+@pytest.mark.parametrize("shots", [5000, [5000, 5001]])
+@pytest.mark.parametrize("postselect", [None, 0, 1])
+@pytest.mark.parametrize("measure_f", [qml.counts, qml.expval, qml.probs, qml.sample, qml.var])
+@pytest.mark.parametrize(
+    "meas_obj",
+    [qml.PauliZ(0), qml.PauliY(1), [0], [0, 1], [1, 0], "mcm", "composite_mcm", "mcm_list"],
+)
+def test_simple_dynamic_circuit(shots, measure_f, postselect, meas_obj):
+    """Tests that LightningQubit handles a simple dynamic circuit with the following measurements:
 
+        * qml.counts with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list
+        * qml.expval with obs (comp basis or not), MCM, f(MCM), MCM list
+        * qml.probs with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list
+        * qml.sample with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list
+        * qml.var with obs (comp basis or not), MCM, f(MCM), MCM list
+
+    The above combinations should work for finite shots, shot vectors and post-selecting of either the 0 or 1 branch.
+    """
+    comm = MPI.COMM_WORLD
+
+    if measure_f in (qml.expval, qml.var) and (
+        isinstance(meas_obj, list) or meas_obj == "mcm_list"
+    ):
+        pytest.skip("Can't use wires/mcm lists with var or expval")
+
+    dq = qml.device("default.qubit", shots=shots)
+    dev = get_device(wires=3, shots=shots)
+    params = [np.pi / 2.5, np.pi / 3, -np.pi / 3.5]
+
+    def func(x, y, z):
+        m0, m1 = obs_tape(x, y, z, postselect=postselect)
+        mid_measure = (
+            m0 if meas_obj == "mcm" else (0.5 * m0 if meas_obj == "composite_mcm" else [m0, m1])
+        )
+        measurement_key = "wires" if isinstance(meas_obj, list) else "op"
+        measurement_value = mid_measure if isinstance(meas_obj, str) else meas_obj
+        return measure_f(**{measurement_key: measurement_value})
+
+    results1 = qml.QNode(func, dev, mcm_method="one-shot")(*params)
+    results2 = qml.QNode(func, dq, mcm_method="deferred")(*params)
+    comm.Barrier()
+
+    validate_measurements(measure_f, shots, results1, results2)
+
+
+@pytest.mark.parametrize("postselect", [None, 0, 1])
+@pytest.mark.parametrize("reset", [False, True])
+def test_multiple_measurements_and_reset(postselect, reset):
+    """Tests that LightningQubit handles a circuit with a single mid-circuit measurement with reset
+    and a conditional gate. Multiple measurements of the mid-circuit measurement value are
+    performed. This function also tests `reset` parametrizing over the parameter."""
+    comm = MPI.COMM_WORLD
+    shots = 5000
+    dq = qml.device("default.qubit", shots=shots)
+    dev = get_device(wires=3, shots=shots)
+    params = [np.pi / 2.5, np.pi / 3, -np.pi / 3.5]
+    obs = qml.PauliY(1)
+
+    def func(x, y, z):
+        mcms = obs_tape(x, y, z, reset=reset, postselect=postselect)
+        return (
+            qml.counts(op=obs),
+            qml.expval(op=mcms[0]),
+            qml.probs(op=obs),
+            qml.sample(op=mcms[0]),
+            qml.var(op=obs),
+        )
+
+    results1 = qml.QNode(func, dev, mcm_method="one-shot")(*params)
+    results2 = qml.QNode(func, dq, mcm_method="deferred")(*params)
     comm.Barrier()
 
-    with pytest.raises(
-        qml.DeviceError,
-        match=f"LightningGPU-MPI does not support Mid-circuit measurements.",
+    for measure_f, r1, r2 in zip(
+        [qml.counts, qml.expval, qml.probs, qml.sample, qml.var], results1, results2
     ):
-        func(*params)
+        validate_measurements(measure_f, shots, r1, r2)
+
+
+@pytest.mark.parametrize(
+    "mcm_f",
+    [
+        lambda x: x * -1,
+        lambda x: x * 1,
+        lambda x: x * 2,
+        lambda x: 1 - x,
+        lambda x: x + 1,
+        lambda x: x & 3,
+        "mix",
+        "list",
+    ],
+)
+@pytest.mark.parametrize("measure_f", [qml.counts, qml.expval, qml.probs, qml.sample, qml.var])
+def test_composite_mcms(mcm_f, measure_f):
+    """Tests that LightningQubit handles a circuit with a composite mid-circuit measurement and a
+    conditional gate. A single measurement of a composite mid-circuit measurement is performed
+    at the end."""
+    comm = MPI.COMM_WORLD
+    if measure_f in (qml.expval, qml.var) and (mcm_f in ("list", "mix")):
+        pytest.skip(
+            "expval/var does not support measuring sequences of measurements or observables."
+        )
+
+    if measure_f == qml.probs and mcm_f == "mix":
+        pytest.skip(
+            "Cannot use qml.probs() when measuring multiple mid-circuit measurements collected using arithmetic operators."
+        )
+
+    shots = 3000
+
+    dq = qml.device("default.qubit", shots=shots)
+    dev = get_device(wires=3, shots=shots)
+    param = np.pi / 3
+
+    @qml.qnode(dev)
+    def func(x):
+        qml.RX(x, 0)
+        m0 = qml.measure(0)
+        qml.RX(0.5 * x, 1)
+        m1 = qml.measure(1)
+        qml.cond((m0 + m1) == 2, qml.RY)(2.0 * x, 0)
+        m2 = qml.measure(0)
+        obs = (
+            (m0 - 2 * m1) * m2 + 7
+            if mcm_f == "mix"
+            else ([m0, m1, m2] if mcm_f == "list" else mcm_f(m2))
+        )
+        return measure_f(op=obs)
+
+    results1 = qml.QNode(func, dev, mcm_method="one-shot")(param)
+    results2 = qml.QNode(func, dq, mcm_method="deferred")(param)
+
+    comm.Barrier()
+
+    validate_measurements(measure_f, shots, results1, results2)
+
+
+@pytest.mark.parametrize(
+    "mcm_f",
+    [
+        lambda x, y: x + y,
+        lambda x, y: x - 7 * y,
+        lambda x, y: x & y,
+        lambda x, y: x == y,
+        lambda x, y: 4.0 * x + 2.0 * y,
+    ],
+)
+def test_counts_return_type(mcm_f):
+    """Tests that LightningQubit returns the same keys for ``qml.counts`` measurements with ``dynamic_one_shot`` and ``defer_measurements``."""
+    comm = MPI.COMM_WORLD
+    shots = 500
+
+    dq = qml.device("default.qubit", shots=shots)
+    dev = get_device(wires=3, shots=shots)
+    param = np.pi / 3
+
+    @qml.qnode(dev)
+    def func(x):
+        qml.RX(x, 0)
+        m0 = qml.measure(0)
+        qml.RX(0.5 * x, 1)
+        m1 = qml.measure(1)
+        qml.cond((m0 + m1) == 2, qml.RY)(2.0 * x, 0)
+        return qml.counts(op=mcm_f(m0, m1))
+
+    results1 = qml.QNode(func, dev, mcm_method="one-shot")(param)
+    results2 = qml.QNode(func, dq, mcm_method="deferred")(param)
+    comm.Barrier()
+    for r1, r2 in zip(results1.keys(), results2.keys()):
+        assert r1 == r2
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
index 2d3313f694..529f5ae75e 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
@@ -100,6 +100,8 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) {
             },
             "Set State Vector on GPU with values for the state vector and "
             "wires on the host memory.")
+        .def("collapse", &StateVectorT::collapse,
+             "Collapse the statevector onto the 0 or 1 branch of a given wire.")
         .def(
             "DeviceToDevice",
             [](StateVectorT &sv, const StateVectorT &other, bool async) {
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 4a0f48fb5a..3407f5cbf9 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -350,7 +350,6 @@ def __init__(  # pylint: disable=too-many-arguments
 
         # Creating the state vector
         self._mpi_handler = MPIHandler(mpi, mpi_buf_size, len(self.wires), c_dtype)
-        self._use_mpi = mpi
 
         self._statevector = self.LightningStateVector(
             num_wires=len(self.wires),
@@ -510,9 +509,6 @@ def simulate(
         Note that this function can return measurements for non-commuting observables simultaneously.
         """
         if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)):
-            if self._use_mpi:
-                raise qml.DeviceError("LightningGPU-MPI does not support Mid-circuit measurements.")
-
             results = []
             aux_circ = qml.tape.QuantumScript(
                 circuit.operations,

From 6a010f15cc30a96c4c4e76108931f44e7f741881 Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Thu, 10 Oct 2024 17:50:43 +0000
Subject: [PATCH 18/23] tidy up the code

---
 .../src/simulators/lightning_gpu/StateVectorCudaMPI.hpp     | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
index 389da68b47..cca7ad4894 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
@@ -355,7 +355,7 @@ class StateVectorCudaMPI final
             collapse_local_(wireInt, branch);
         } else {
             // global wire
-            int local_wire = 0;
+            constexpr int local_wire = 0;
             std::vector<int2> wirePairs{make_int2(wireInt, local_wire)};
             applyMPI_Dispatcher(wirePairs, &StateVectorCudaMPI::collapse_local_,
                                 local_wire, branch);
@@ -1680,8 +1680,8 @@ class StateVectorCudaMPI final
             /* const int32_t * */ basisBits.data(),
             /* const uint32_t nBasisBits */ basisBits.size()));
 
-        auto abs2sum0 = mpi_manager_.allreduce(abs2sum0_local, "sum");
-        auto abs2sum1 = mpi_manager_.allreduce(abs2sum1_local, "sum");
+        auto abs2sum0 = mpi_manager_.allreduce<double>(abs2sum0_local, "sum");
+        auto abs2sum1 = mpi_manager_.allreduce<double>(abs2sum1_local, "sum");
 
         double norm = (branch == 0) ? abs2sum0 : abs2sum1;
 

From 7f4c97fb4b3d811bff050b1f2baf39d7b01914e4 Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Thu, 10 Oct 2024 17:59:30 +0000
Subject: [PATCH 19/23] revert mpi support

---
 mpitests/conftest.py                          |  84 -----
 mpitests/test_native_mcm.py                   | 326 +-----------------
 .../bindings/LGPUBindingsMPI.hpp              |   2 -
 .../lightning_gpu/lightning_gpu.py            |   4 +
 4 files changed, 11 insertions(+), 405 deletions(-)

diff --git a/mpitests/conftest.py b/mpitests/conftest.py
index 6c079c86de..552cf9f330 100644
--- a/mpitests/conftest.py
+++ b/mpitests/conftest.py
@@ -18,8 +18,6 @@
 
 import itertools
 import os
-from functools import reduce
-from typing import Sequence
 
 import pennylane as qml
 import pytest
@@ -127,85 +125,3 @@ def _device(wires):
         )
 
     return _device
-
-
-#######################################################################
-
-
-def validate_counts(shots, results1, results2):
-    """Compares two counts.
-
-    If the results are ``Sequence``s, loop over entries.
-
-    Fails if a key of ``results1`` is not found in ``results2``.
-    Passes if counts are too low, chosen as ``100``.
-    Otherwise, fails if counts differ by more than ``20`` plus 20 percent.
-    """
-    if isinstance(results1, Sequence):
-        assert isinstance(results2, Sequence)
-        assert len(results1) == len(results2)
-        for r1, r2 in zip(results1, results2):
-            validate_counts(shots, r1, r2)
-        return
-    for key1, val1 in results1.items():
-        val2 = results2[key1]
-        if abs(val1 + val2) > 100:
-            assert np.allclose(val1, val2, rtol=20, atol=0.2)
-
-
-def validate_samples(shots, results1, results2):
-    """Compares two samples.
-
-    If the results are ``Sequence``s, loop over entries.
-
-    Fails if the results do not have the same shape, within ``20`` entries plus 20 percent.
-    This is to handle cases when post-selection yields variable shapes.
-    Otherwise, fails if the sums of samples differ by more than ``20`` plus 20 percent.
-    """
-    if isinstance(shots, Sequence):
-        assert isinstance(results1, Sequence)
-        assert isinstance(results2, Sequence)
-        assert len(results1) == len(results2)
-        for s, r1, r2 in zip(shots, results1, results2):
-            validate_samples(s, r1, r2)
-    else:
-        sh1, sh2 = results1.shape[0], results2.shape[0]
-        assert np.allclose(sh1, sh2, rtol=20, atol=0.2)
-        assert results1.ndim == results2.ndim
-        if results2.ndim > 1:
-            assert results1.shape[1] == results2.shape[1]
-        np.allclose(np.sum(results1), np.sum(results2), rtol=20, atol=0.2)
-
-
-def validate_others(shots, results1, results2):
-    """Compares two expval, probs or var.
-
-    If the results are ``Sequence``s, validate the average of items.
-
-    If ``shots is None``, validate using ``np.allclose``'s default parameters.
-    Otherwise, fails if the results do not match within ``0.01`` plus 20 percent.
-    """
-    if isinstance(results1, Sequence):
-        assert isinstance(results2, Sequence)
-        assert len(results1) == len(results2)
-        results1 = reduce(lambda x, y: x + y, results1) / len(results1)
-        results2 = reduce(lambda x, y: x + y, results2) / len(results2)
-        validate_others(shots, results1, results2)
-        return
-    if shots is None:
-        assert np.allclose(results1, results2)
-        return
-    assert np.allclose(results1, results2, atol=0.01, rtol=0.2)
-
-
-def validate_measurements(func, shots, results1, results2):
-    """Calls the correct validation function based on measurement type."""
-    if func is qml.counts:
-        validate_counts(shots, results1, results2)
-        return
-
-    if func is qml.sample:
-        validate_samples(shots, results1, results2)
-        return
-
-    validate_others(shots, results1, results2)
diff --git a/mpitests/test_native_mcm.py b/mpitests/test_native_mcm.py
index 563de142be..4ca607e343 100644
--- a/mpitests/test_native_mcm.py
+++ b/mpitests/test_native_mcm.py
@@ -12,70 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tests for default qubit preprocessing."""
-from functools import reduce
-from typing import Sequence
-
 import numpy as np
 import pennylane as qml
 import pytest
-from conftest import LightningDevice, device_name, validate_measurements
-from flaky import flaky
+from conftest import LightningDevice, device_name
 from mpi4py import MPI
 
-if device_name not in ("lightning.qubit", "lightning.kokkos", "lightning.gpu"):
-    pytest.skip("Native MCM not supported. Skipping.", allow_module_level=True)
-
 if not LightningDevice._CPP_BINARY_AVAILABLE:  # pylint: disable=protected-access
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
 
 
-def get_device(wires, **kwargs):
-    kwargs.setdefault("shots", None)
-    return qml.device(device_name, wires=wires, mpi=True, **kwargs)
-
-
-def test_all_invalid_shots_circuit():
-    """Test all invalid cases: expval, probs, var measurements."""
-    comm = MPI.COMM_WORLD
-    dev = qml.device(device_name, wires=2)
-    dq = qml.device("default.qubit", wires=2)
-
-    def circuit_op():
-        m = qml.measure(0, postselect=1)
-        qml.cond(m, qml.PauliX)(1)
-        return (
-            qml.expval(op=qml.PauliZ(1)),
-            qml.probs(op=qml.PauliY(0) @ qml.PauliZ(1)),
-            qml.var(op=qml.PauliZ(1)),
-        )
-
-    comm.Barrier()
-    res1 = qml.QNode(circuit_op, dq)()
-    res2 = qml.QNode(circuit_op, dev)(shots=10)
-    for r1, r2 in zip(res1, res2):
-        if isinstance(r1, Sequence):
-            assert len(r1) == len(r2)
-        assert np.all(np.isnan(r1))
-        assert np.all(np.isnan(r2))
-
-    def circuit_mcm():
-        m = qml.measure(0, postselect=1)
-        qml.cond(m, qml.PauliX)(1)
-        return qml.expval(op=m), qml.probs(op=m), qml.var(op=m)
-
-    res1 = qml.QNode(circuit_mcm, dq)()
-    res2 = qml.QNode(circuit_mcm, dev)(shots=10)
-
-    comm.Barrier()
-    for r1, r2 in zip(res1, res2):
-        if isinstance(r1, Sequence):
-            assert len(r1) == len(r2)
-        assert np.all(np.isnan(r1))
-        assert np.all(np.isnan(r2))
-
-
-def test_unsupported_measurement():
-    """Test unsupported ``qml.classical_shadow`` measurement on ``lightning.gpu`` ."""
+def test_unspported_mid_measurement():
+    """Test unsupported mid_measurement for lightning.gpu-mpi."""
     comm = MPI.COMM_WORLD
     dev = qml.device(device_name, wires=2, mpi=True, shots=1000)
     params = np.pi / 4 * np.ones(2)
@@ -85,271 +33,11 @@ def func(x, y):
         qml.RX(x, wires=0)
         m0 = qml.measure(0)
         qml.cond(m0, qml.RY)(y, wires=1)
-        return qml.classical_shadow(wires=0)
-
-    comm.Barrier()
-    if device_name == "lightning.qubit":
-        with pytest.raises(
-            qml.DeviceError,
-            match=f"not accepted with finite shots on lightning.qubit",
-        ):
-            func(*params)
-    if device_name in ("lightning.kokkos", "lightning.gpu"):
-        with pytest.raises(
-            qml.DeviceError,
-            match=r"Measurement shadow\(wires=\[0\]\) not accepted with finite shots on "
-            + device_name,
-        ):
-            func(*params)
-
-
-@pytest.mark.parametrize("mcm_method", ["deferred", "one-shot"])
-def test_qnode_mcm_method(mcm_method, mocker):
-    """Test that user specified qnode arg for mid-circuit measurements transform are used correctly"""
-    comm = MPI.COMM_WORLD
-    spy = (
-        mocker.spy(qml.dynamic_one_shot, "_transform")
-        if mcm_method == "one-shot"
-        else mocker.spy(qml.defer_measurements, "_transform")
-    )
-    other_spy = (
-        mocker.spy(qml.defer_measurements, "_transform")
-        if mcm_method == "one-shot"
-        else mocker.spy(qml.dynamic_one_shot, "_transform")
-    )
+        return qml.probs(wires=0)
 
-    shots = 10
-    device = qml.device(device_name, wires=3, mpi=True, shots=shots)
     comm.Barrier()
 
-    @qml.qnode(device, mcm_method=mcm_method)
-    def f(x):
-        qml.RX(x, 0)
-        _ = qml.measure(0)
-        qml.CNOT([0, 1])
-        return qml.sample(wires=[0, 1])
-
-    _ = f(np.pi / 8)
-    comm.Barrier()
-
-    spy.assert_called_once()
-    other_spy.assert_not_called()
-
-
-@pytest.mark.parametrize("postselect_mode", ["hw-like", "fill-shots"])
-def test_qnode_postselect_mode(postselect_mode):
-    """Test that user specified qnode arg for discarding invalid shots is used correctly"""
-    comm = MPI.COMM_WORLD
-    shots = 100
-    device = qml.device(device_name, wires=3, mpi=True, shots=shots)
-    postselect = 1
-
-    @qml.qnode(device, postselect_mode=postselect_mode)
-    def f(x):
-        qml.RX(x, 0)
-        _ = qml.measure(0, postselect=postselect)
-        qml.CNOT([0, 1])
-        return qml.sample(wires=[1])
-
-    # Using small-ish rotation angle ensures the number of valid shots will be less than the
-    # original number of shots. This helps avoid stochastic failures for the assertion below
-    res = f(np.pi / 2)
-
-    comm.Barrier()
-
-    if postselect_mode == "hw-like":
-        assert len(res) < shots
-    else:
-        assert len(res) == shots
-    assert np.allclose(res, postselect)
-
-
-# pylint: disable=unused-argument
-def obs_tape(x, y, z, reset=False, postselect=None):
-    qml.RX(x, 0)
-    qml.RZ(np.pi / 4, 0)
-    m0 = qml.measure(0, reset=reset)
-    qml.cond(m0 == 0, qml.RX)(np.pi / 4, 0)
-    qml.cond(m0 == 0, qml.RZ)(np.pi / 4, 0)
-    qml.cond(m0 == 1, qml.RX)(-np.pi / 4, 0)
-    qml.cond(m0 == 1, qml.RZ)(-np.pi / 4, 0)
-    qml.RX(y, 1)
-    qml.RZ(np.pi / 4, 1)
-    m1 = qml.measure(1, postselect=postselect)
-    qml.cond(m1 == 0, qml.RX)(np.pi / 4, 1)
-    qml.cond(m1 == 0, qml.RZ)(np.pi / 4, 1)
-    qml.cond(m1 == 1, qml.RX)(-np.pi / 4, 1)
-    qml.cond(m1 == 1, qml.RZ)(-np.pi / 4, 1)
-    return m0, m1
-
-
-@flaky(max_runs=5)
-@pytest.mark.parametrize("shots", [5000, [5000, 5001]])
-@pytest.mark.parametrize("postselect", [None, 0, 1])
-@pytest.mark.parametrize("measure_f", [qml.counts, qml.expval, qml.probs, qml.sample, qml.var])
-@pytest.mark.parametrize(
-    "meas_obj",
-    [qml.PauliZ(0), qml.PauliY(1), [0], [0, 1], [1, 0], "mcm", "composite_mcm", "mcm_list"],
-)
-def test_simple_dynamic_circuit(shots, measure_f, postselect, meas_obj):
-    """Tests that LightningQubit handles a simple dynamic circuit with the following measurements:
-
-        * qml.counts with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list
-        * qml.expval with obs (comp basis or not), MCM, f(MCM), MCM list
-        * qml.probs with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list
-        * qml.sample with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list
-        * qml.var with obs (comp basis or not), MCM, f(MCM), MCM list
-
-    The above combinations should work for finite shots, shot vectors and post-selecting of either the 0 or 1 branch.
-    """
-    comm = MPI.COMM_WORLD
-
-    if measure_f in (qml.expval, qml.var) and (
-        isinstance(meas_obj, list) or meas_obj == "mcm_list"
-    ):
-        pytest.skip("Can't use wires/mcm lists with var or expval")
-
-    dq = qml.device("default.qubit", shots=shots)
-    dev = get_device(wires=3, shots=shots)
-    params = [np.pi / 2.5, np.pi / 3, -np.pi / 3.5]
-
-    def func(x, y, z):
-        m0, m1 = obs_tape(x, y, z, postselect=postselect)
-        mid_measure = (
-            m0 if meas_obj == "mcm" else (0.5 * m0 if meas_obj == "composite_mcm" else [m0, m1])
-        )
-        measurement_key = "wires" if isinstance(meas_obj, list) else "op"
-        measurement_value = mid_measure if isinstance(meas_obj, str) else meas_obj
-        return measure_f(**{measurement_key: measurement_value})
-
-    results1 = qml.QNode(func, dev, mcm_method="one-shot")(*params)
-    results2 = qml.QNode(func, dq, mcm_method="deferred")(*params)
-    comm.Barrier()
-
-    validate_measurements(measure_f, shots, results1, results2)
-
-
-@pytest.mark.parametrize("postselect", [None, 0, 1])
-@pytest.mark.parametrize("reset", [False, True])
-def test_multiple_measurements_and_reset(postselect, reset):
-    """Tests that LightningQubit handles a circuit with a single mid-circuit measurement with reset
-    and a conditional gate. Multiple measurements of the mid-circuit measurement value are
-    performed. This function also tests `reset` parametrizing over the parameter."""
-    comm = MPI.COMM_WORLD
-    shots = 5000
-    dq = qml.device("default.qubit", shots=shots)
-    dev = get_device(wires=3, shots=shots)
-    params = [np.pi / 2.5, np.pi / 3, -np.pi / 3.5]
-    obs = qml.PauliY(1)
-
-    def func(x, y, z):
-        mcms = obs_tape(x, y, z, reset=reset, postselect=postselect)
-        return (
-            qml.counts(op=obs),
-            qml.expval(op=mcms[0]),
-            qml.probs(op=obs),
-            qml.sample(op=mcms[0]),
-            qml.var(op=obs),
-        )
-
-    results1 = qml.QNode(func, dev, mcm_method="one-shot")(*params)
-    results2 = qml.QNode(func, dq, mcm_method="deferred")(*params)
-    comm.Barrier()
-
-    for measure_f, r1, r2 in zip(
-        [qml.counts, qml.expval, qml.probs, qml.sample, qml.var], results1, results2
+    with pytest.raises(
+        qml.DeviceError, match="LightningGPU-MPI does not support Mid-circuit measurements."
     ):
-        validate_measurements(measure_f, shots, r1, r2)
-
-
-@pytest.mark.parametrize(
-    "mcm_f",
-    [
-        lambda x: x * -1,
-        lambda x: x * 1,
-        lambda x: x * 2,
-        lambda x: 1 - x,
-        lambda x: x + 1,
-        lambda x: x & 3,
-        "mix",
-        "list",
-    ],
-)
-@pytest.mark.parametrize("measure_f", [qml.counts, qml.expval, qml.probs, qml.sample, qml.var])
-def test_composite_mcms(mcm_f, measure_f):
-    """Tests that LightningQubit handles a circuit with a composite mid-circuit measurement and a
-    conditional gate. A single measurement of a composite mid-circuit measurement is performed
-    at the end."""
-    comm = MPI.COMM_WORLD
-    if measure_f in (qml.expval, qml.var) and (mcm_f in ("list", "mix")):
-        pytest.skip(
-            "expval/var does not support measuring sequences of measurements or observables."
-        )
-
-    if measure_f == qml.probs and mcm_f == "mix":
-        pytest.skip(
-            "Cannot use qml.probs() when measuring multiple mid-circuit measurements collected using arithmetic operators."
-        )
-
-    shots = 3000
-
-    dq = qml.device("default.qubit", shots=shots)
-    dev = get_device(wires=3, shots=shots)
-    param = np.pi / 3
-
-    @qml.qnode(dev)
-    def func(x):
-        qml.RX(x, 0)
-        m0 = qml.measure(0)
-        qml.RX(0.5 * x, 1)
-        m1 = qml.measure(1)
-        qml.cond((m0 + m1) == 2, qml.RY)(2.0 * x, 0)
-        m2 = qml.measure(0)
-        obs = (
-            (m0 - 2 * m1) * m2 + 7
-            if mcm_f == "mix"
-            else ([m0, m1, m2] if mcm_f == "list" else mcm_f(m2))
-        )
-        return measure_f(op=obs)
-
-    results1 = qml.QNode(func, dev, mcm_method="one-shot")(param)
-    results2 = qml.QNode(func, dq, mcm_method="deferred")(param)
-
-    comm.Barrier()
-
-    validate_measurements(measure_f, shots, results1, results2)
-
-
-@pytest.mark.parametrize(
-    "mcm_f",
-    [
-        lambda x, y: x + y,
-        lambda x, y: x - 7 * y,
-        lambda x, y: x & y,
-        lambda x, y: x == y,
-        lambda x, y: 4.0 * x + 2.0 * y,
-    ],
-)
-def test_counts_return_type(mcm_f):
-    """Tests that LightningQubit returns the same keys for ``qml.counts`` measurements with ``dynamic_one_shot`` and ``defer_measurements``."""
-    comm = MPI.COMM_WORLD
-    shots = 500
-
-    dq = qml.device("default.qubit", shots=shots)
-    dev = get_device(wires=3, shots=shots)
-    param = np.pi / 3
-
-    @qml.qnode(dev)
-    def func(x):
-        qml.RX(x, 0)
-        m0 = qml.measure(0)
-        qml.RX(0.5 * x, 1)
-        m1 = qml.measure(1)
-        qml.cond((m0 + m1) == 2, qml.RY)(2.0 * x, 0)
-        return qml.counts(op=mcm_f(m0, m1))
-
-    results1 = qml.QNode(func, dev, mcm_method="one-shot")(param)
-    results2 = qml.QNode(func, dq, mcm_method="deferred")(param)
-    comm.Barrier()
-    for r1, r2 in zip(results1.keys(), results2.keys()):
-        assert r1 == r2
+        func(*params)
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
index 529f5ae75e..2d3313f694 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
@@ -100,8 +100,6 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) {
             },
             "Set State Vector on GPU with values for the state vector and "
             "wires on the host memory.")
-        .def("collapse", &StateVectorT::collapse,
-             "Collapse the statevector onto the 0 or 1 branch of a given wire.")
         .def(
             "DeviceToDevice",
             [](StateVectorT &sv, const StateVectorT &other, bool async) {
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 3407f5cbf9..4a0f48fb5a 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -350,6 +350,7 @@ def __init__(  # pylint: disable=too-many-arguments
 
         # Creating the state vector
         self._mpi_handler = MPIHandler(mpi, mpi_buf_size, len(self.wires), c_dtype)
+        self._use_mpi = mpi
 
         self._statevector = self.LightningStateVector(
             num_wires=len(self.wires),
@@ -509,6 +510,9 @@ def simulate(
         Note that this function can return measurements for non-commuting observables simultaneously.
         """
         if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)):
+            if self._use_mpi:
+                raise qml.DeviceError("LightningGPU-MPI does not support Mid-circuit measurements.")
+
             results = []
             aux_circ = qml.tape.QuantumScript(
                 circuit.operations,

From 85dc575b55fbc5f864f822df708b626e3442221b Mon Sep 17 00:00:00 2001
From: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Date: Thu, 10 Oct 2024 17:59:54 +0000
Subject: [PATCH 20/23] Auto update version from '0.39.0-dev42' to
 '0.39.0-dev43'

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 59d75bd653..cff4ff5e0b 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.39.0-dev42"
+__version__ = "0.39.0-dev43"

From 0e9f1db466f662e6432451e4a5f51da5b3f7da57 Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Thu, 10 Oct 2024 18:11:51 +0000
Subject: [PATCH 21/23] tidy up code

---
 .../core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp    | 2 +-
 .../src/simulators/lightning_gpu/StateVectorCudaManaged.hpp     | 2 +-
 pennylane_lightning/lightning_gpu/lightning_gpu.py              | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
index cca7ad4894..c05f8160fa 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
@@ -1685,7 +1685,7 @@ class StateVectorCudaMPI final
 
         double norm = (branch == 0) ? abs2sum0 : abs2sum1;
 
-        int parity = branch;
+        const int parity = static_cast<int>(branch);
 
         PL_CUSTATEVEC_IS_SUCCESS(custatevecCollapseOnZBasis(
             /* custatevecHandle_t */ handle_.get(),
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
index 377b6d729d..7ade4ab2b9 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -522,7 +522,7 @@ class StateVectorCudaManaged
 
         double norm = (branch == 0) ? abs2sum0 : abs2sum1;
 
-        int parity = branch;
+        const int parity = static_cast<int>(branch);
 
         PL_CUSTATEVEC_IS_SUCCESS(custatevecCollapseOnZBasis(
             /* custatevecHandle_t */ handle_.get(),
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 4a0f48fb5a..9c91b87a52 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -514,7 +514,7 @@ def simulate(
                 raise qml.DeviceError("LightningGPU-MPI does not support Mid-circuit measurements.")
 
             results = []
-            aux_circ = qml.tape.QuantumScript(
+            aux_circ = QuantumScript(
                 circuit.operations,
                 circuit.measurements,
                 shots=[1],

From 3b809faf98278fd44a1187b73bb5f32677aab66e Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Tue, 15 Oct 2024 13:36:48 +0000
Subject: [PATCH 22/23] initial commit

---
 mpitests/test_native_mcm.py                   | 325 +++++++++++++++++-
 .../lightning_gpu/StateVectorCudaMPI.hpp      |  58 +++-
 .../bindings/LGPUBindingsMPI.hpp              |   2 +
 .../measurements/MeasurementsGPUMPI.hpp       |  10 +-
 .../core/src/utils/cuda_utils/LinearAlg.hpp   |  55 +++
 .../lightning_gpu/_measurements.py            |   6 +-
 .../lightning_gpu/lightning_gpu.py            |   4 -
 7 files changed, 438 insertions(+), 22 deletions(-)

diff --git a/mpitests/test_native_mcm.py b/mpitests/test_native_mcm.py
index 4ca607e343..eaafe0ff03 100644
--- a/mpitests/test_native_mcm.py
+++ b/mpitests/test_native_mcm.py
@@ -1,29 +1,75 @@
 # Copyright 2024 Xanadu Quantum Technologies Inc.
-
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-
 #     http://www.apache.org/licenses/LICENSE-2.0
-
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tests for default qubit preprocessing."""
+from functools import reduce
+from typing import Sequence
+
 import numpy as np
 import pennylane as qml
 import pytest
-from conftest import LightningDevice, device_name
+from conftest import LightningDevice, device_name, validate_measurements
+from flaky import flaky
 from mpi4py import MPI
 
 if not LightningDevice._CPP_BINARY_AVAILABLE:  # pylint: disable=protected-access
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
 
 
-def test_unspported_mid_measurement():
-    """Test unsupported mid_measurement for lightning.gpu-mpi."""
+def get_device(wires, **kwargs):
+    kwargs.setdefault("shots", None)
+    return qml.device(device_name, wires=wires, mpi=True, **kwargs)
+
+
+def test_all_invalid_shots_circuit():
+    """Test all invalid cases: expval, probs, var measurements."""
+    comm = MPI.COMM_WORLD
+    dev = qml.device(device_name, wires=2)
+    dq = qml.device("default.qubit", wires=2)
+
+    def circuit_op():
+        m = qml.measure(0, postselect=1)
+        qml.cond(m, qml.PauliX)(1)
+        return (
+            qml.expval(op=qml.PauliZ(1)),
+            qml.probs(op=qml.PauliY(0) @ qml.PauliZ(1)),
+            qml.var(op=qml.PauliZ(1)),
+        )
+
+    comm.Barrier()
+    res1 = qml.QNode(circuit_op, dq)()
+    res2 = qml.QNode(circuit_op, dev)(shots=10)
+    for r1, r2 in zip(res1, res2):
+        if isinstance(r1, Sequence):
+            assert len(r1) == len(r2)
+        assert np.all(np.isnan(r1))
+        assert np.all(np.isnan(r2))
+
+    def circuit_mcm():
+        m = qml.measure(0, postselect=1)
+        qml.cond(m, qml.PauliX)(1)
+        return qml.expval(op=m), qml.probs(op=m), qml.var(op=m)
+
+    res1 = qml.QNode(circuit_mcm, dq)()
+    res2 = qml.QNode(circuit_mcm, dev)(shots=10)
+
+    comm.Barrier()
+    for r1, r2 in zip(res1, res2):
+        if isinstance(r1, Sequence):
+            assert len(r1) == len(r2)
+        assert np.all(np.isnan(r1))
+        assert np.all(np.isnan(r2))
+
+
+def test_unsupported_measurement():
+    """Test unsupported ``qml.classical_shadow`` measurement on ``lightning.gpu``."""
     comm = MPI.COMM_WORLD
     dev = qml.device(device_name, wires=2, mpi=True, shots=1000)
     params = np.pi / 4 * np.ones(2)
@@ -33,11 +79,270 @@ def func(x, y):
         qml.RX(x, wires=0)
         m0 = qml.measure(0)
         qml.cond(m0, qml.RY)(y, wires=1)
-        return qml.probs(wires=0)
+        return qml.classical_shadow(wires=0)
+
+    comm.Barrier()
+    if device_name == "lightning.qubit":
+        with pytest.raises(
+            qml.DeviceError,
+            match=f"not accepted with finite shots on lightning.qubit",
+        ):
+            func(*params)
+    if device_name in ("lightning.kokkos", "lightning.gpu"):
+        with pytest.raises(
+            qml.DeviceError,
+            match=r"Measurement shadow\(wires=\[0\]\) not accepted with finite shots on "
+            + device_name,
+        ):
+            func(*params)
+
+
+@pytest.mark.parametrize("mcm_method", ["deferred", "one-shot"])
+def test_qnode_mcm_method(mcm_method, mocker):
+    """Test that the user-specified qnode arg for the mid-circuit measurement transform is used correctly"""
+    comm = MPI.COMM_WORLD
+    spy = (
+        mocker.spy(qml.dynamic_one_shot, "_transform")
+        if mcm_method == "one-shot"
+        else mocker.spy(qml.defer_measurements, "_transform")
+    )
+    other_spy = (
+        mocker.spy(qml.defer_measurements, "_transform")
+        if mcm_method == "one-shot"
+        else mocker.spy(qml.dynamic_one_shot, "_transform")
+    )
+
+    shots = 10
+    device = qml.device(device_name, wires=3, mpi=True, shots=shots)
+    comm.Barrier()
+
+    @qml.qnode(device, mcm_method=mcm_method)
+    def f(x):
+        qml.RX(x, 0)
+        _ = qml.measure(0)
+        qml.CNOT([0, 1])
+        return qml.sample(wires=[0, 1])
+
+    _ = f(np.pi / 8)
+    comm.Barrier()
+
+    spy.assert_called_once()
+    other_spy.assert_not_called()
+
+
+@pytest.mark.parametrize("postselect_mode", ["hw-like", "fill-shots"])
+def test_qnode_postselect_mode(postselect_mode):
+    """Test that user specified qnode arg for discarding invalid shots is used correctly"""
+    comm = MPI.COMM_WORLD
+    shots = 100
+    device = qml.device(device_name, wires=3, mpi=True, shots=shots)
+    postselect = 1
+
+    @qml.qnode(device, postselect_mode=postselect_mode)
+    def f(x):
+        qml.RX(x, 0)
+        _ = qml.measure(0, postselect=postselect)
+        qml.CNOT([0, 1])
+        return qml.sample(qml.Identity(1))
+
+    comm.Barrier()
+    # Using small-ish rotation angle ensures the number of valid shots will be less than the
+    # original number of shots. This helps avoid stochastic failures for the assertion below
+    res = f(np.pi / 2)
+
+    comm.Barrier()
+
+    if postselect_mode == "hw-like":
+        assert res.size < shots
+    else:
+        assert len(res) == shots
+    assert np.allclose(res, postselect)
+
 
+# pylint: disable=unused-argument
+def obs_tape(x, y, z, reset=False, postselect=None):
+    qml.RX(x, 0)
+    qml.RZ(np.pi / 4, 0)
+    m0 = qml.measure(0, reset=reset)
+    qml.cond(m0 == 0, qml.RX)(np.pi / 4, 0)
+    qml.cond(m0 == 0, qml.RZ)(np.pi / 4, 0)
+    qml.cond(m0 == 1, qml.RX)(-np.pi / 4, 0)
+    qml.cond(m0 == 1, qml.RZ)(-np.pi / 4, 0)
+    qml.RX(y, 1)
+    qml.RZ(np.pi / 4, 1)
+    m1 = qml.measure(1, postselect=postselect)
+    qml.cond(m1 == 0, qml.RX)(np.pi / 4, 1)
+    qml.cond(m1 == 0, qml.RZ)(np.pi / 4, 1)
+    qml.cond(m1 == 1, qml.RX)(-np.pi / 4, 1)
+    qml.cond(m1 == 1, qml.RZ)(-np.pi / 4, 1)
+    return m0, m1
+
+
+@flaky(max_runs=5)
+@pytest.mark.parametrize("shots", [5000, [5000, 5001]])
+@pytest.mark.parametrize("postselect", [None, 0, 1])
+@pytest.mark.parametrize("measure_f", [qml.counts, qml.expval, qml.probs, qml.sample, qml.var])
+@pytest.mark.parametrize(
+    "meas_obj",
+    [qml.PauliZ(0), qml.PauliY(1), [0], [0, 1], [1, 0], "mcm", "composite_mcm", "mcm_list"],
+)
+def test_simple_dynamic_circuit(shots, measure_f, postselect, meas_obj):
+    """Tests that the MPI-enabled Lightning device handles a simple dynamic circuit with the following measurements:
+        * qml.counts with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list
+        * qml.expval with obs (comp basis or not), MCM, f(MCM), MCM list
+        * qml.probs with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list
+        * qml.sample with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list
+        * qml.var with obs (comp basis or not), MCM, f(MCM), MCM list
+    The above combinations should work for finite shots, shot vectors and post-selecting of either the 0 or 1 branch.
+    """
+    comm = MPI.COMM_WORLD
+
+    if measure_f in (qml.expval, qml.var) and (
+        isinstance(meas_obj, list) or meas_obj == "mcm_list"
+    ):
+        pytest.skip("Can't use wires/mcm lists with var or expval")
+
+    dq = qml.device("default.qubit", shots=shots)
+    dev = get_device(wires=3, shots=shots)
+    params = [np.pi / 2.5, np.pi / 3, -np.pi / 3.5]
+
+    def func(x, y, z):
+        m0, m1 = obs_tape(x, y, z, postselect=postselect)
+        mid_measure = (
+            m0 if meas_obj == "mcm" else (0.5 * m0 if meas_obj == "composite_mcm" else [m0, m1])
+        )
+        measurement_key = "wires" if isinstance(meas_obj, list) else "op"
+        measurement_value = mid_measure if isinstance(meas_obj, str) else meas_obj
+        return measure_f(**{measurement_key: measurement_value})
+
+    results1 = qml.QNode(func, dev, mcm_method="one-shot")(*params)
+    results2 = qml.QNode(func, dq, mcm_method="deferred")(*params)
     comm.Barrier()
 
-    with pytest.raises(
-        qml.DeviceError, match="LightningGPU-MPI does not support Mid-circuit measurements."
+    validate_measurements(measure_f, shots, results1, results2)
+
+
+@pytest.mark.parametrize("postselect", [None, 0, 1])
+@pytest.mark.parametrize("reset", [False, True])
+def test_multiple_measurements_and_reset(postselect, reset):
+    """Tests that LightningQubit handles a circuit with a single mid-circuit measurement with reset
+    and a conditional gate. Multiple measurements of the mid-circuit measurement value are
+    performed. This test also exercises ``reset`` by parametrizing over it."""
+    comm = MPI.COMM_WORLD
+    shots = 5000
+    dq = qml.device("default.qubit", shots=shots)
+    dev = get_device(wires=3, shots=shots)
+    params = [np.pi / 2.5, np.pi / 3, -np.pi / 3.5]
+    obs = qml.PauliY(1)
+
+    def func(x, y, z):
+        mcms = obs_tape(x, y, z, reset=reset, postselect=postselect)
+        return (
+            qml.counts(op=obs),
+            qml.expval(op=mcms[0]),
+            qml.probs(op=obs),
+            qml.sample(op=mcms[0]),
+            qml.var(op=obs),
+        )
+
+    results1 = qml.QNode(func, dev, mcm_method="one-shot")(*params)
+    results2 = qml.QNode(func, dq, mcm_method="deferred")(*params)
+    comm.Barrier()
+
+    for measure_f, r1, r2 in zip(
+        [qml.counts, qml.expval, qml.probs, qml.sample, qml.var], results1, results2
     ):
-        func(*params)
+        validate_measurements(measure_f, shots, r1, r2)
+
+
+@pytest.mark.parametrize(
+    "mcm_f",
+    [
+        lambda x: x * -1,
+        lambda x: x * 1,
+        lambda x: x * 2,
+        lambda x: 1 - x,
+        lambda x: x + 1,
+        lambda x: x & 3,
+        "mix",
+        "list",
+    ],
+)
+@pytest.mark.parametrize("measure_f", [qml.counts, qml.expval, qml.probs, qml.sample, qml.var])
+def test_composite_mcms(mcm_f, measure_f):
+    """Tests that LightningQubit handles a circuit with a composite mid-circuit measurement and a
+    conditional gate. A single measurement of a composite mid-circuit measurement is performed
+    at the end."""
+    comm = MPI.COMM_WORLD
+    if measure_f in (qml.expval, qml.var) and (mcm_f in ("list", "mix")):
+        pytest.skip(
+            "expval/var does not support measuring sequences of measurements or observables."
+        )
+
+    if measure_f == qml.probs and mcm_f == "mix":
+        pytest.skip(
+            "Cannot use qml.probs() when measuring multiple mid-circuit measurements collected using arithmetic operators."
+        )
+
+    shots = 3000
+
+    dq = qml.device("default.qubit", shots=shots)
+    dev = get_device(wires=3, shots=shots)
+    param = np.pi / 3
+
+    @qml.qnode(dev)
+    def func(x):
+        qml.RX(x, 0)
+        m0 = qml.measure(0)
+        qml.RX(0.5 * x, 1)
+        m1 = qml.measure(1)
+        qml.cond((m0 + m1) == 2, qml.RY)(2.0 * x, 0)
+        m2 = qml.measure(0)
+        obs = (
+            (m0 - 2 * m1) * m2 + 7
+            if mcm_f == "mix"
+            else ([m0, m1, m2] if mcm_f == "list" else mcm_f(m2))
+        )
+        return measure_f(op=obs)
+
+    results1 = qml.QNode(func, dev, mcm_method="one-shot")(param)
+    results2 = qml.QNode(func, dq, mcm_method="deferred")(param)
+
+    comm.Barrier()
+
+    validate_measurements(measure_f, shots, results1, results2)
+
+
+@pytest.mark.parametrize(
+    "mcm_f",
+    [
+        lambda x, y: x + y,
+        lambda x, y: x - 7 * y,
+        lambda x, y: x & y,
+        lambda x, y: x == y,
+        lambda x, y: 4.0 * x + 2.0 * y,
+    ],
+)
+def test_counts_return_type(mcm_f):
+    """Tests that LightningQubit returns the same keys for ``qml.counts`` measurements with ``dynamic_one_shot`` and ``defer_measurements``."""
+    comm = MPI.COMM_WORLD
+    shots = 500
+
+    dq = qml.device("default.qubit", shots=shots)
+    dev = get_device(wires=3, shots=shots)
+    param = np.pi / 3
+
+    @qml.qnode(dev)
+    def func(x):
+        qml.RX(x, 0)
+        m0 = qml.measure(0)
+        qml.RX(0.5 * x, 1)
+        m1 = qml.measure(1)
+        qml.cond((m0 + m1) == 2, qml.RY)(2.0 * x, 0)
+        return qml.counts(op=mcm_f(m0, m1))
+
+    results1 = qml.QNode(func, dev, mcm_method="one-shot")(param)
+    results2 = qml.QNode(func, dq, mcm_method="deferred")(param)
+    comm.Barrier()
+    for r1, r2 in zip(results1.keys(), results2.keys()):
+        assert r1 == r2
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
index c05f8160fa..794c026de5 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
@@ -344,13 +344,16 @@ class StateVectorCudaMPI final
      * @param branch Branch 0 or 1.
      */
     void collapse(const std::size_t wire, const bool branch) {
+        /*
         PL_ABORT_IF_NOT(wire < this->getTotalNumQubits(),
                         "Invalid wire index.");
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        mpi_manager_.Barrier();
 
         const int wireInt =
             static_cast<int>(this->getTotalNumQubits() - 1 - wire);
 
-        if (static_cast<std::size_t>(wireInt) < BaseType::getNumQubits()) {
+        if (static_cast<std::size_t>(wireInt) < getNumLocalQubits()) {
             // local wire
             collapse_local_(wireInt, branch);
         } else {
@@ -364,6 +367,43 @@ class StateVectorCudaMPI final
             PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
         }
 
+        mpi_manager_.Barrier();
+        */
+        PL_ABORT_IF_NOT(wire < this->getTotalNumQubits(),
+                        "Invalid wire index.");
+
+        std::vector<ComplexT> matrix(4, ComplexT(0.0, 0.0));
+
+        for (std::size_t i = 0; i < matrix.size(); i++) {
+            matrix[i] = ((i == 0 && branch == 0) || (i == 3 && branch == 1))
+                            ? ComplexT{1.0, 0.0}
+                            : ComplexT{0.0, 0.0};
+        }
+
+        mpi_manager_.Barrier();
+
+        applyMatrix(matrix, {wire}, false);
+
+        auto local_norm2 = norm2_CUDA<CFP_t>(
+            BaseType::getData(), BaseType::getLength(),
+            BaseType::getDataBuffer().getDevTag().getDeviceID(),
+            BaseType::getDataBuffer().getDevTag().getStreamID(),
+            this->getCublasCaller());
+
+        local_norm2 *= local_norm2;
+
+        mpi_manager_.Barrier();
+
+        auto norm2 = mpi_manager_.allreduce<PrecisionT>(local_norm2, "sum");
+
+        norm2 = std::sqrt(norm2);
+
+        normalize_CUDA<PrecisionT, CFP_t>(
+            norm2, BaseType::getData(), BaseType::getLength(),
+            BaseType::getDataBuffer().getDevTag().getDeviceID(),
+            BaseType::getDataBuffer().getDevTag().getStreamID(),
+            this->getCublasCaller());
+
         mpi_manager_.Barrier();
     }
 
@@ -383,6 +423,8 @@ class StateVectorCudaMPI final
                         const std::vector<std::size_t> &wires, bool adjoint,
                         const std::vector<Precision> &params,
                         [[maybe_unused]] const std::vector<ComplexT> &matrix) {
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        mpi_manager_.Barrier();
         std::vector<CFP_t> matrix_cu(matrix.size());
         std::transform(matrix.begin(), matrix.end(), matrix_cu.begin(),
                        [](const std::complex<Precision> &x) {
@@ -408,6 +450,8 @@ class StateVectorCudaMPI final
         const std::string &opName, const std::vector<std::size_t> &wires,
         bool adjoint = false, const std::vector<Precision> &params = {0.0},
         [[maybe_unused]] const std::vector<CFP_t> &gate_matrix = {}) {
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        mpi_manager_.Barrier();
         const auto ctrl_offset = (BaseType::getCtrlMap().find(opName) !=
                                   BaseType::getCtrlMap().end())
                                      ? BaseType::getCtrlMap().at(opName)
@@ -467,6 +511,8 @@ class StateVectorCudaMPI final
                 gate_cache_.get_gate_device_ptr(opName, par[0]), ctrls_local,
                 tgts_local, adjoint);
         }
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        mpi_manager_.Barrier();
     }
 
     /**
@@ -524,6 +570,8 @@ class StateVectorCudaMPI final
                      const std::vector<std::size_t> &wires,
                      bool adjoint = false) {
         PL_ABORT_IF(wires.empty(), "Number of wires must be larger than 0");
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        mpi_manager_.Barrier();
         const std::string opName = "Matrix";
         std::size_t n = std::size_t{1} << wires.size();
         const std::vector<std::complex<PrecisionT>> matrix(gate_matrix,
@@ -535,6 +583,8 @@ class StateVectorCudaMPI final
                                x);
                        });
         applyOperation(opName, wires, adjoint, {}, matrix_cu);
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        mpi_manager_.Barrier();
     }
 
     /**
@@ -1674,7 +1724,7 @@ class StateVectorCudaMPI final
             /* custatevecHandle_t */ handle_.get(),
             /* void *sv */ BaseType::getData(),
             /* cudaDataType_t */ data_type,
-            /* const uint32_t nIndexBits */ BaseType::getNumQubits(),
+            /* const uint32_t nIndexBits */ getNumLocalQubits(),
             /* double * */ &abs2sum0_local,
             /* double * */ &abs2sum1_local,
             /* const int32_t * */ basisBits.data(),
@@ -1691,7 +1741,7 @@ class StateVectorCudaMPI final
             /* custatevecHandle_t */ handle_.get(),
             /* void *sv */ BaseType::getData(),
             /* cudaDataType_t */ data_type,
-            /* const uint32_t nIndexBits */ BaseType::getNumQubits(),
+            /* const uint32_t nIndexBits */ getNumLocalQubits(),
             /* const int32_t parity */ parity,
             /* const int32_t *basisBits */ basisBits.data(),
             /* const uint32_t nBasisBits */ basisBits.size(),
@@ -1980,6 +2030,8 @@ class StateVectorCudaMPI final
                                const std::vector<std::size_t> &ctrls,
                                const std::vector<std::size_t> &tgts,
                                bool use_adjoint = false) {
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        mpi_manager_.Barrier();
         std::vector<int> ctrlsInt(ctrls.size());
         std::vector<int> tgtsInt(tgts.size());
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
index 2d3313f694..529f5ae75e 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
@@ -100,6 +100,8 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) {
             },
             "Set State Vector on GPU with values for the state vector and "
             "wires on the host memory.")
+        .def("collapse", &StateVectorT::collapse,
+             "Collapse the statevector onto the 0 or 1 branch of a given wire.")
         .def(
             "DeviceToDevice",
             [](StateVectorT &sv, const StateVectorT &other, bool async) {
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp
index 6fee1711d2..710930ba54 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp
@@ -265,7 +265,7 @@ class MeasurementsMPI final
      * number between 0 and num_samples-1.
      */
     auto generate_samples(std::size_t num_samples) -> std::vector<std::size_t> {
-        double epsilon = 1e-15;
+        double epsilon = std::numeric_limits<double>::epsilon() * 1.0e2;
         std::size_t nSubSvs = 1UL << (this->_statevector.getNumGlobalQubits());
         std::vector<double> rand_nums(num_samples);
         std::vector<std::size_t> samples(
@@ -280,8 +280,8 @@ class MeasurementsMPI final
             bitOrdering[i] = i;
         }
 
-        std::vector<custatevecIndex_t> localBitStrings(num_samples);
-        std::vector<custatevecIndex_t> globalBitStrings(num_samples);
+        std::vector<custatevecIndex_t> localBitStrings(num_samples, 0);
+        std::vector<custatevecIndex_t> globalBitStrings(num_samples, 0);
 
         if (mpi_manager_.getRank() == 0) {
             for (std::size_t n = 0; n < num_samples; n++) {
@@ -320,6 +320,8 @@ class MeasurementsMPI final
             /* custatevecHandle_t */ this->_statevector.getCusvHandle(),
             /* custatevecSamplerDescriptor_t */ sampler,
             /* double * */ &subNorm));
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        mpi_manager_.Barrier();
 
         int source = (mpi_manager_.getRank() - 1 + mpi_manager_.getSize()) %
                      mpi_manager_.getSize();
@@ -354,6 +356,8 @@ class MeasurementsMPI final
             /* double */ precumulative,
             /* double */ norm));
 
+        norm = (norm < epsilon) ? epsilon : norm;
+
         PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
         auto low = std::lower_bound(rand_nums.begin(), rand_nums.end(),
                                     cumulative / norm);
diff --git a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
index cd422899b5..984a9d2358 100644
--- a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
+++ b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
@@ -274,6 +274,61 @@ inline auto scaleC_CUDA(const CFP_t a, T *v1, const int data_size,
                 data_type);
 }
 
+/**
+ * @brief cuBLAS backed Euclidean norm (nrm2) of complex device data.
+ *
+ * @tparam CFP_t Complex float data-type. Accepts cuDoubleComplex and cuComplex
+ * @tparam DevTypeID Integer type of device id.
+ * @return Norm as double for cuDoubleComplex/double2 data, float otherwise.
+ * @param v1 Device data pointer
+ * @param data_size Length of device data.
+ * @param dev_id the device on which the function should be executed.
+ * @param stream_id the CUDA stream on which the operation should be executed.
+ * @param cublas the CublasCaller object that manages the cuBLAS handle.
+ */
+template <class CFP_t = cuDoubleComplex, class DevTypeID = int>
+inline auto norm2_CUDA(CFP_t *v1, const int data_size, DevTypeID dev_id,
+                       cudaStream_t stream_id, const CublasCaller &cublas) {
+    if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
+                  std::is_same_v<CFP_t, double2>) {
+        double norm{0.0};
+        cublas.call(cublasDznrm2, dev_id, stream_id, data_size, v1, 1, &norm);
+        return norm;
+    } else {
+        float norm{0.0};
+        cublas.call(cublasScnrm2, dev_id, stream_id, data_size, v1, 1, &norm);
+        return norm;
+    }
+}
+
+/**
+ * @brief cuBLAS backed scaling of complex device data by `1 / norm2`.
+ *
+ * @tparam T Float data-type. Accepts float and double
+ * @tparam CFP_t Complex float data-type. Accepts cuDoubleComplex and cuComplex
+ * @tparam DevTypeID Integer type of device id.
+ * @param norm2 Norm of the vector. NOTE(review): a zero value is not guarded.
+ * @param v1 Device data pointer
+ * @param data_size Length of device data.
+ * @param dev_id the device on which the function should be executed.
+ * @param stream_id the CUDA stream on which the operation should be executed.
+ * @param cublas the CublasCaller object that manages the cuBLAS handle.
+ */
+template <class T = double, class CFP_t = cuDoubleComplex,
+          class DevTypeID = int>
+inline auto normalize_CUDA(T norm2, CFP_t *v1, const int data_size,
+                           DevTypeID dev_id, cudaStream_t stream_id,
+                           const CublasCaller &cublas) {
+    if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
+                  std::is_same_v<CFP_t, double2>) {
+        const double alpha = 1.0 / norm2;
+        cublas.call(cublasZdscal, dev_id, stream_id, data_size, &alpha, v1, 1);
+    } else {
+        const float alpha = 1.0 / norm2;
+        cublas.call(cublasCsscal, dev_id, stream_id, data_size, &alpha, v1, 1);
+    }
+}
+
 /** @brief `%CudaScopedDevice` uses RAII to select a CUDA device context.
  *
  * @see https://taskflow.github.io/taskflow/classtf_1_1cudaScopedDevice.html
diff --git a/pennylane_lightning/lightning_gpu/_measurements.py b/pennylane_lightning/lightning_gpu/_measurements.py
index 4b95762ccc..337f6273e3 100644
--- a/pennylane_lightning/lightning_gpu/_measurements.py
+++ b/pennylane_lightning/lightning_gpu/_measurements.py
@@ -34,6 +34,7 @@
 except ImportError as error_import:
     warn(str(error_import), UserWarning)
 
+from functools import reduce
 from typing import List
 
 import numpy as np
@@ -105,8 +106,9 @@ def _measure_with_samples_diagonalizing_gates(
         self._apply_diagonalizing_gates(mps)
 
         # Specific for LGPU:
-        total_indices = self._qubit_state.num_wires
-        wires = qml.wires.Wires(range(total_indices))
+        # total_indices = self._qubit_state.num_wires
+        # wires = qml.wires.Wires(range(total_indices))
+        wires = reduce(sum, (mp.wires for mp in mps))
 
         def _process_single_shot(samples):
             processed = []
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 9c91b87a52..d6f75b2c5e 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -350,7 +350,6 @@ def __init__(  # pylint: disable=too-many-arguments
 
         # Creating the state vector
         self._mpi_handler = MPIHandler(mpi, mpi_buf_size, len(self.wires), c_dtype)
-        self._use_mpi = mpi
 
         self._statevector = self.LightningStateVector(
             num_wires=len(self.wires),
@@ -510,9 +509,6 @@ def simulate(
         Note that this function can return measurements for non-commuting observables simultaneously.
         """
         if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)):
-            if self._use_mpi:
-                raise qml.DeviceError("LightningGPU-MPI does not support Mid-circuit measurements.")
-
             results = []
             aux_circ = QuantumScript(
                 circuit.operations,

From ddc74f2b8d845eaf1eb27250ecd47f370a0a5e13 Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Tue, 15 Oct 2024 15:19:41 +0000
Subject: [PATCH 23/23] update conftest

---
 mpitests/conftest.py | 79 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 1 deletion(-)

diff --git a/mpitests/conftest.py b/mpitests/conftest.py
index 552cf9f330..11be17824a 100644
--- a/mpitests/conftest.py
+++ b/mpitests/conftest.py
@@ -15,9 +15,10 @@
 Pytest configuration file for PennyLane-Lightning-GPU test suite.
 """
 # pylint: disable=missing-function-docstring,wrong-import-order,unused-import
-
 import itertools
 import os
+from functools import reduce
+from typing import Sequence
 
 import pennylane as qml
 import pytest
@@ -125,3 +126,79 @@ def _device(wires):
         )
 
     return _device
+
+
+#######################################################################
+
+
+def validate_counts(shots, results1, results2):
+    """Compares two counts.
+    If ``results1`` is a ``Sequence``, both results are compared entry by entry.
+    Fails if a key of ``results1`` is not found in ``results2``.
+    Keys whose two counts sum to ``100`` or less are not compared.
+    Otherwise, fails if counts differ by more than ``20`` plus 20 percent.
+    """
+    if isinstance(results1, Sequence):
+        assert isinstance(results2, Sequence)
+        assert len(results1) == len(results2)
+        for r1, r2 in zip(results1, results2):
+            validate_counts(shots, r1, r2)
+        return
+    for key1, val1 in results1.items():
+        val2 = results2[key1]
+        if abs(val1 + val2) > 100:
+            assert np.allclose(val1, val2, atol=20, rtol=0.2)  # 20 counts + 20 % of val2
+
+
+def validate_samples(shots, results1, results2):
+    """Compares two samples.
+    If ``shots`` is a ``Sequence``, each ``(shots, results1, results2)`` entry is validated.
+    Fails if the numbers of samples differ by more than ``20`` entries plus 20 percent;
+    the slack is to handle cases when post-selection yields variable shapes.
+    Also fails if the sums of samples differ by more than ``20`` plus 20 percent.
+    """
+    if isinstance(shots, Sequence):
+        assert isinstance(results1, Sequence)
+        assert isinstance(results2, Sequence)
+        assert len(results1) == len(results2)
+        for s, r1, r2 in zip(shots, results1, results2):
+            validate_samples(s, r1, r2)
+    else:
+        sh1, sh2 = results1.shape[0], results2.shape[0]
+        assert np.allclose(sh1, sh2, atol=20, rtol=0.2)
+        assert results1.ndim == results2.ndim
+        if results2.ndim > 1:
+            assert results1.shape[1] == results2.shape[1]
+        assert np.allclose(np.sum(results1), np.sum(results2), atol=20, rtol=0.2)
+
+
+def validate_others(shots, results1, results2):
+    """Check that two expval, probs or var results agree.
+    ``Sequence`` inputs must have equal lengths; the averages of their items are compared.
+    With ``shots is None`` the comparison uses ``np.allclose``'s default tolerances.
+    Otherwise the results must match within ``0.01`` plus 20 percent.
+    """
+    if not isinstance(results1, Sequence):
+        if shots is None:
+            assert np.allclose(results1, results2)
+        else:
+            assert np.allclose(results1, results2, atol=0.01, rtol=0.2)
+        return
+    assert isinstance(results2, Sequence)
+    assert len(results1) == len(results2)
+    mean1 = reduce(lambda acc, item: acc + item, results1) / len(results1)
+    mean2 = reduce(lambda acc, item: acc + item, results2) / len(results2)
+    validate_others(shots, mean1, mean2)
+
+
+def validate_measurements(func, shots, results1, results2):
+    """Dispatch to the validator matching the measurement function ``func``."""
+    # Identity checks: anything other than counts/sample uses validate_others.
+    if func is qml.counts:
+        validator = validate_counts
+    elif func is qml.sample:
+        validator = validate_samples
+    else:
+        validator = validate_others
+
+    validator(shots, results1, results2)