diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index ab53d3e8e6..2999d78cdf 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 ### New features since last release
 
+* Add `mid-circuit measurements` support to `lightning.gpu`'s single-GPU backend.
+  [(#931)](https://github.com/PennyLaneAI/pennylane-lightning/pull/931)
+
 * Add Matrix Product Operator (MPO) for all gates support to `lightning.tensor`. Note current C++ implementation only works for MPO sites data provided by users.
   [(#859)](https://github.com/PennyLaneAI/pennylane-lightning/pull/859)
 
diff --git a/mpitests/conftest.py b/mpitests/conftest.py
index 552cf9f330..11be17824a 100644
--- a/mpitests/conftest.py
+++ b/mpitests/conftest.py
@@ -15,9 +15,10 @@
 Pytest configuration file for PennyLane-Lightning-GPU test suite.
 """
 # pylint: disable=missing-function-docstring,wrong-import-order,unused-import
-
 import itertools
 import os
+from functools import reduce
+from typing import Sequence
 
 import pennylane as qml
 import pytest
@@ -125,3 +126,79 @@ def _device(wires):
         )
 
     return _device
+
+
+#######################################################################
+
+
+def validate_counts(shots, results1, results2):
+    """Compares two counts.
+    If the results are ``Sequence``s, loop over entries.
+    Fails if a key of ``results1`` is not found in ``results2``.
+    Passes if counts are too low, chosen as ``100``.
+    Otherwise, fails if counts differ by more than ``20`` plus 20 percent.
+    """
+    if isinstance(results1, Sequence):
+        assert isinstance(results2, Sequence)
+        assert len(results1) == len(results2)
+        for r1, r2 in zip(results1, results2):
+            validate_counts(shots, r1, r2)
+        return
+    for key1, val1 in results1.items():
+        val2 = results2[key1]
+        if abs(val1 + val2) > 100:
+            assert np.allclose(val1, val2, rtol=20, atol=0.2)
+
+
+def validate_samples(shots, results1, results2):
+    """Compares two samples.
+    If the results are ``Sequence``s, loop over entries.
+    Fails if the results do not have the same shape, within ``20`` entries plus 20 percent.
+    This is to handle cases when post-selection yields variable shapes.
+    Otherwise, fails if the sums of samples differ by more than ``20`` plus 20 percent.
+    """
+    if isinstance(shots, Sequence):
+        assert isinstance(results1, Sequence)
+        assert isinstance(results2, Sequence)
+        assert len(results1) == len(results2)
+        for s, r1, r2 in zip(shots, results1, results2):
+            validate_samples(s, r1, r2)
+    else:
+        sh1, sh2 = results1.shape[0], results2.shape[0]
+        assert np.allclose(sh1, sh2, rtol=20, atol=0.2)
+        assert results1.ndim == results2.ndim
+        if results2.ndim > 1:
+            assert results1.shape[1] == results2.shape[1]
+        np.allclose(np.sum(results1), np.sum(results2), rtol=20, atol=0.2)
+
+
+def validate_others(shots, results1, results2):
+    """Compares two expval, probs or var.
+    If the results are ``Sequence``s, validate the average of items.
+    If ``shots is None``, validate using ``np.allclose``'s default parameters.
+    Otherwise, fails if the results do not match within ``0.01`` plus 20 percent.
+    """
+    if isinstance(results1, Sequence):
+        assert isinstance(results2, Sequence)
+        assert len(results1) == len(results2)
+        results1 = reduce(lambda x, y: x + y, results1) / len(results1)
+        results2 = reduce(lambda x, y: x + y, results2) / len(results2)
+        validate_others(shots, results1, results2)
+        return
+    if shots is None:
+        assert np.allclose(results1, results2)
+        return
+    assert np.allclose(results1, results2, atol=0.01, rtol=0.2)
+
+
+def validate_measurements(func, shots, results1, results2):
+    """Calls the correct validation function based on measurement type."""
+    if func is qml.counts:
+        validate_counts(shots, results1, results2)
+        return
+
+    if func is qml.sample:
+        validate_samples(shots, results1, results2)
+        return
+
+    validate_others(shots, results1, results2)
diff --git a/mpitests/test_native_mcm.py b/mpitests/test_native_mcm.py
new file mode 100644
index 0000000000..eaafe0ff03
--- /dev/null
+++ b/mpitests/test_native_mcm.py
@@ -0,0 +1,348 @@
+# Copyright 2024 Xanadu Quantum Technologies Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for default qubit preprocessing."""
+from functools import reduce
+from typing import Sequence
+
+import numpy as np
+import pennylane as qml
+import pytest
+from conftest import LightningDevice, device_name, validate_measurements
+from flaky import flaky
+from mpi4py import MPI
+
+if not LightningDevice._CPP_BINARY_AVAILABLE:  # pylint: disable=protected-access
+    pytest.skip("No binary module found. Skipping.", allow_module_level=True)
+
+
+def get_device(wires, **kwargs):
+    kwargs.setdefault("shots", None)
+    return qml.device(device_name, wires=wires, mpi=True, **kwargs)
+
+
+def test_all_invalid_shots_circuit():
+    """Test all invalid cases: expval, probs, var measurements."""
+    comm = MPI.COMM_WORLD
+    dev = qml.device(device_name, wires=2)
+    dq = qml.device("default.qubit", wires=2)
+
+    def circuit_op():
+        m = qml.measure(0, postselect=1)
+        qml.cond(m, qml.PauliX)(1)
+        return (
+            qml.expval(op=qml.PauliZ(1)),
+            qml.probs(op=qml.PauliY(0) @ qml.PauliZ(1)),
+            qml.var(op=qml.PauliZ(1)),
+        )
+
+    comm.Barrier()
+    res1 = qml.QNode(circuit_op, dq)()
+    res2 = qml.QNode(circuit_op, dev)(shots=10)
+    for r1, r2 in zip(res1, res2):
+        if isinstance(r1, Sequence):
+            assert len(r1) == len(r2)
+        assert np.all(np.isnan(r1))
+        assert np.all(np.isnan(r2))
+
+    def circuit_mcm():
+        m = qml.measure(0, postselect=1)
+        qml.cond(m, qml.PauliX)(1)
+        return qml.expval(op=m), qml.probs(op=m), qml.var(op=m)
+
+    res1 = qml.QNode(circuit_mcm, dq)()
+    res2 = qml.QNode(circuit_mcm, dev)(shots=10)
+
+    comm.Barrier()
+    for r1, r2 in zip(res1, res2):
+        if isinstance(r1, Sequence):
+            assert len(r1) == len(r2)
+        assert np.all(np.isnan(r1))
+        assert np.all(np.isnan(r2))
+
+
+def test_unsupported_measurement():
+    """Test unsupported ``qml.classical_shadow`` measurement on ``lightning.gpu`` ."""
+    comm = MPI.COMM_WORLD
+    dev = qml.device(device_name, wires=2, mpi=True, shots=1000)
+    params = np.pi / 4 * np.ones(2)
+
+    @qml.qnode(dev)
+    def func(x, y):
+        qml.RX(x, wires=0)
+        m0 = qml.measure(0)
+        qml.cond(m0, qml.RY)(y, wires=1)
+        return qml.classical_shadow(wires=0)
+
+    comm.Barrier()
+    if device_name == "lightning.qubit":
+        with pytest.raises(
+            qml.DeviceError,
+            match=f"not accepted with finite shots on lightning.qubit",
+        ):
+            func(*params)
+    if device_name in ("lightning.kokkos", "lightning.gpu"):
+        with pytest.raises(
+            qml.DeviceError,
+            match=r"Measurement shadow\(wires=\[0\]\) not accepted with finite shots on "
+            + device_name,
+        ):
+            func(*params)
+
+
+@pytest.mark.parametrize("mcm_method", ["deferred", "one-shot"])
+def test_qnode_mcm_method(mcm_method, mocker):
+    """Test that user specified qnode arg for mid-circuit measurements transform are used correctly"""
+    comm = MPI.COMM_WORLD
+    spy = (
+        mocker.spy(qml.dynamic_one_shot, "_transform")
+        if mcm_method == "one-shot"
+        else mocker.spy(qml.defer_measurements, "_transform")
+    )
+    other_spy = (
+        mocker.spy(qml.defer_measurements, "_transform")
+        if mcm_method == "one-shot"
+        else mocker.spy(qml.dynamic_one_shot, "_transform")
+    )
+
+    shots = 10
+    device = qml.device(device_name, wires=3, mpi=True, shots=shots)
+    comm.Barrier()
+
+    @qml.qnode(device, mcm_method=mcm_method)
+    def f(x):
+        qml.RX(x, 0)
+        _ = qml.measure(0)
+        qml.CNOT([0, 1])
+        return qml.sample(wires=[0, 1])
+
+    _ = f(np.pi / 8)
+    comm.Barrier()
+
+    spy.assert_called_once()
+    other_spy.assert_not_called()
+
+
+@pytest.mark.parametrize("postselect_mode", ["hw-like", "fill-shots"])
+def test_qnode_postselect_mode(postselect_mode):
+    """Test that user specified qnode arg for discarding invalid shots is used correctly"""
+    comm = MPI.COMM_WORLD
+    shots = 100
+    device = qml.device(device_name, wires=3, mpi=True, shots=shots)
+    postselect = 1
+
+    @qml.qnode(device, postselect_mode=postselect_mode)
+    def f(x):
+        qml.RX(x, 0)
+        _ = qml.measure(0, postselect=postselect)
+        qml.CNOT([0, 1])
+        return qml.sample(qml.Identity(1))
+
+    comm.Barrier()
+    # Using small-ish rotation angle ensures the number of valid shots will be less than the
+    # original number of shots. This helps avoid stochastic failures for the assertion below
+    res = f(np.pi / 2)
+
+    comm.Barrier()
+
+    if postselect_mode == "hw-like":
+        assert res.size < shots
+    else:
+        assert len(res) == shots
+    assert np.allclose(res, postselect)
+
+
+# pylint: disable=unused-argument
+def obs_tape(x, y, z, reset=False, postselect=None):
+    qml.RX(x, 0)
+    qml.RZ(np.pi / 4, 0)
+    m0 = qml.measure(0, reset=reset)
+    qml.cond(m0 == 0, qml.RX)(np.pi / 4, 0)
+    qml.cond(m0 == 0, qml.RZ)(np.pi / 4, 0)
+    qml.cond(m0 == 1, qml.RX)(-np.pi / 4, 0)
+    qml.cond(m0 == 1, qml.RZ)(-np.pi / 4, 0)
+    qml.RX(y, 1)
+    qml.RZ(np.pi / 4, 1)
+    m1 = qml.measure(1, postselect=postselect)
+    qml.cond(m1 == 0, qml.RX)(np.pi / 4, 1)
+    qml.cond(m1 == 0, qml.RZ)(np.pi / 4, 1)
+    qml.cond(m1 == 1, qml.RX)(-np.pi / 4, 1)
+    qml.cond(m1 == 1, qml.RZ)(-np.pi / 4, 1)
+    return m0, m1
+
+
+@flaky(max_runs=5)
+@pytest.mark.parametrize("shots", [5000, [5000, 5001]])
+@pytest.mark.parametrize("postselect", [None, 0, 1])
+@pytest.mark.parametrize("measure_f", [qml.counts, qml.expval, qml.probs, qml.sample, qml.var])
+@pytest.mark.parametrize(
+    "meas_obj",
+    [qml.PauliZ(0), qml.PauliY(1), [0], [0, 1], [1, 0], "mcm", "composite_mcm", "mcm_list"],
+)
+def test_simple_dynamic_circuit(shots, measure_f, postselect, meas_obj):
+    """Tests that LightningQubit handles a simple dynamic circuit with the following measurements:
+        * qml.counts with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list
+        * qml.expval with obs (comp basis or not), MCM, f(MCM), MCM list
+        * qml.probs with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list
+        * qml.sample with obs (comp basis or not), single wire, multiple wires (ordered/unordered), MCM, f(MCM), MCM list
+        * qml.var with obs (comp basis or not), MCM, f(MCM), MCM list
+    The above combinations should work for finite shots, shot vectors and post-selecting of either the 0 or 1 branch.
+    """
+    comm = MPI.COMM_WORLD
+
+    if measure_f in (qml.expval, qml.var) and (
+        isinstance(meas_obj, list) or meas_obj == "mcm_list"
+    ):
+        pytest.skip("Can't use wires/mcm lists with var or expval")
+
+    dq = qml.device("default.qubit", shots=shots)
+    dev = get_device(wires=3, shots=shots)
+    params = [np.pi / 2.5, np.pi / 3, -np.pi / 3.5]
+
+    def func(x, y, z):
+        m0, m1 = obs_tape(x, y, z, postselect=postselect)
+        mid_measure = (
+            m0 if meas_obj == "mcm" else (0.5 * m0 if meas_obj == "composite_mcm" else [m0, m1])
+        )
+        measurement_key = "wires" if isinstance(meas_obj, list) else "op"
+        measurement_value = mid_measure if isinstance(meas_obj, str) else meas_obj
+        return measure_f(**{measurement_key: measurement_value})
+
+    results1 = qml.QNode(func, dev, mcm_method="one-shot")(*params)
+    results2 = qml.QNode(func, dq, mcm_method="deferred")(*params)
+    comm.Barrier()
+
+    validate_measurements(measure_f, shots, results1, results2)
+
+
+@pytest.mark.parametrize("postselect", [None, 0, 1])
+@pytest.mark.parametrize("reset", [False, True])
+def test_multiple_measurements_and_reset(postselect, reset):
+    """Tests that LightningQubit handles a circuit with a single mid-circuit measurement with reset
+    and a conditional gate. Multiple measurements of the mid-circuit measurement value are
+    performed. This function also tests `reset` parametrizing over the parameter."""
+    comm = MPI.COMM_WORLD
+    shots = 5000
+    dq = qml.device("default.qubit", shots=shots)
+    dev = get_device(wires=3, shots=shots)
+    params = [np.pi / 2.5, np.pi / 3, -np.pi / 3.5]
+    obs = qml.PauliY(1)
+
+    def func(x, y, z):
+        mcms = obs_tape(x, y, z, reset=reset, postselect=postselect)
+        return (
+            qml.counts(op=obs),
+            qml.expval(op=mcms[0]),
+            qml.probs(op=obs),
+            qml.sample(op=mcms[0]),
+            qml.var(op=obs),
+        )
+
+    results1 = qml.QNode(func, dev, mcm_method="one-shot")(*params)
+    results2 = qml.QNode(func, dq, mcm_method="deferred")(*params)
+    comm.Barrier()
+
+    for measure_f, r1, r2 in zip(
+        [qml.counts, qml.expval, qml.probs, qml.sample, qml.var], results1, results2
+    ):
+        validate_measurements(measure_f, shots, r1, r2)
+
+
+@pytest.mark.parametrize(
+    "mcm_f",
+    [
+        lambda x: x * -1,
+        lambda x: x * 1,
+        lambda x: x * 2,
+        lambda x: 1 - x,
+        lambda x: x + 1,
+        lambda x: x & 3,
+        "mix",
+        "list",
+    ],
+)
+@pytest.mark.parametrize("measure_f", [qml.counts, qml.expval, qml.probs, qml.sample, qml.var])
+def test_composite_mcms(mcm_f, measure_f):
+    """Tests that LightningQubit handles a circuit with a composite mid-circuit measurement and a
+    conditional gate. A single measurement of a composite mid-circuit measurement is performed
+    at the end."""
+    comm = MPI.COMM_WORLD
+    if measure_f in (qml.expval, qml.var) and (mcm_f in ("list", "mix")):
+        pytest.skip(
+            "expval/var does not support measuring sequences of measurements or observables."
+        )
+
+    if measure_f == qml.probs and mcm_f == "mix":
+        pytest.skip(
+            "Cannot use qml.probs() when measuring multiple mid-circuit measurements collected using arithmetic operators."
+        )
+
+    shots = 3000
+
+    dq = qml.device("default.qubit", shots=shots)
+    dev = get_device(wires=3, shots=shots)
+    param = np.pi / 3
+
+    @qml.qnode(dev)
+    def func(x):
+        qml.RX(x, 0)
+        m0 = qml.measure(0)
+        qml.RX(0.5 * x, 1)
+        m1 = qml.measure(1)
+        qml.cond((m0 + m1) == 2, qml.RY)(2.0 * x, 0)
+        m2 = qml.measure(0)
+        obs = (
+            (m0 - 2 * m1) * m2 + 7
+            if mcm_f == "mix"
+            else ([m0, m1, m2] if mcm_f == "list" else mcm_f(m2))
+        )
+        return measure_f(op=obs)
+
+    results1 = qml.QNode(func, dev, mcm_method="one-shot")(param)
+    results2 = qml.QNode(func, dq, mcm_method="deferred")(param)
+
+    comm.Barrier()
+
+    validate_measurements(measure_f, shots, results1, results2)
+
+
+@pytest.mark.parametrize(
+    "mcm_f",
+    [
+        lambda x, y: x + y,
+        lambda x, y: x - 7 * y,
+        lambda x, y: x & y,
+        lambda x, y: x == y,
+        lambda x, y: 4.0 * x + 2.0 * y,
+    ],
+)
+def test_counts_return_type(mcm_f):
+    """Tests that LightningQubit returns the same keys for ``qml.counts`` measurements with ``dynamic_one_shot`` and ``defer_measurements``."""
+    comm = MPI.COMM_WORLD
+    shots = 500
+
+    dq = qml.device("default.qubit", shots=shots)
+    dev = get_device(wires=3, shots=shots)
+    param = np.pi / 3
+
+    @qml.qnode(dev)
+    def func(x):
+        qml.RX(x, 0)
+        m0 = qml.measure(0)
+        qml.RX(0.5 * x, 1)
+        m1 = qml.measure(1)
+        qml.cond((m0 + m1) == 2, qml.RY)(2.0 * x, 0)
+        return qml.counts(op=mcm_f(m0, m1))
+
+    results1 = qml.QNode(func, dev, mcm_method="one-shot")(param)
+    results2 = qml.QNode(func, dq, mcm_method="deferred")(param)
+    comm.Barrier()
+    for r1, r2 in zip(results1.keys(), results2.keys()):
+        assert r1 == r2
diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 59d75bd653..cff4ff5e0b 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.39.0-dev42"
+__version__ = "0.39.0-dev43"
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
index 964c5e69ce..794c026de5 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
@@ -334,6 +334,79 @@ class StateVectorCudaMPI final
         mpi_manager_.Barrier();
     }
 
+    /**
+     * @brief Collapse the state vector after having measured one of the qubit.
+     *
+     * Note: The branch parameter imposes the measurement result on the given
+     * wire.
+     *
+     * @param wire Wire to measure.
+     * @param branch Branch 0 or 1.
+     */
+    void collapse(const std::size_t wire, const bool branch) {
+        /*
+        PL_ABORT_IF_NOT(wire < this->getTotalNumQubits(),
+                        "Invalid wire index.");
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        mpi_manager_.Barrier();
+
+        const int wireInt =
+            static_cast<int>(this->getTotalNumQubits() - 1 - wire);
+
+        if (static_cast<std::size_t>(wireInt) < getNumLocalQubits()) {
+            // local wire
+            collapse_local_(wireInt, branch);
+        } else {
+            // global wire
+            constexpr int local_wire = 0;
+            std::vector<int2> wirePairs{make_int2(wireInt, local_wire)};
+            applyMPI_Dispatcher(wirePairs, &StateVectorCudaMPI::collapse_local_,
+                                local_wire, branch);
+
+            PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(localStream_.get()));
+            PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        }
+
+        mpi_manager_.Barrier();
+        */
+        PL_ABORT_IF_NOT(wire < this->getTotalNumQubits(),
+                        "Invalid wire index.");
+
+        std::vector<ComplexT> matrix(4, ComplexT(0.0, 0.0));
+
+        for (std::size_t i = 0; i < matrix.size(); i++) {
+            matrix[i] = ((i == 0 && branch == 0) || (i == 3 && branch == 1))
+                            ? ComplexT{1.0, 0.0}
+                            : ComplexT{0.0, 0.0};
+        }
+
+        mpi_manager_.Barrier();
+
+        applyMatrix(matrix, {wire}, false);
+
+        auto local_norm2 = norm2_CUDA<CFP_t>(
+            BaseType::getData(), BaseType::getLength(),
+            BaseType::getDataBuffer().getDevTag().getDeviceID(),
+            BaseType::getDataBuffer().getDevTag().getStreamID(),
+            this->getCublasCaller());
+
+        local_norm2 *= local_norm2;
+
+        mpi_manager_.Barrier();
+
+        auto norm2 = mpi_manager_.allreduce<PrecisionT>(local_norm2, "sum");
+
+        norm2 = std::sqrt(norm2);
+
+        normalize_CUDA<PrecisionT, CFP_t>(
+            norm2, BaseType::getData(), BaseType::getLength(),
+            BaseType::getDataBuffer().getDevTag().getDeviceID(),
+            BaseType::getDataBuffer().getDevTag().getStreamID(),
+            this->getCublasCaller());
+
+        mpi_manager_.Barrier();
+    }
+
     /**
      * @brief Apply a single gate to the state-vector. Offloads to custatevec
      * specific API calls if available. If unable, attempts to use prior cached
@@ -350,6 +423,8 @@ class StateVectorCudaMPI final
                         const std::vector<std::size_t> &wires, bool adjoint,
                         const std::vector<Precision> &params,
                         [[maybe_unused]] const std::vector<ComplexT> &matrix) {
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        mpi_manager_.Barrier();
         std::vector<CFP_t> matrix_cu(matrix.size());
         std::transform(matrix.begin(), matrix.end(), matrix_cu.begin(),
                        [](const std::complex<Precision> &x) {
@@ -375,6 +450,8 @@ class StateVectorCudaMPI final
         const std::string &opName, const std::vector<std::size_t> &wires,
         bool adjoint = false, const std::vector<Precision> &params = {0.0},
         [[maybe_unused]] const std::vector<CFP_t> &gate_matrix = {}) {
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        mpi_manager_.Barrier();
         const auto ctrl_offset = (BaseType::getCtrlMap().find(opName) !=
                                   BaseType::getCtrlMap().end())
                                      ? BaseType::getCtrlMap().at(opName)
@@ -434,6 +511,8 @@ class StateVectorCudaMPI final
                 gate_cache_.get_gate_device_ptr(opName, par[0]), ctrls_local,
                 tgts_local, adjoint);
         }
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        mpi_manager_.Barrier();
     }
 
     /**
@@ -491,6 +570,8 @@ class StateVectorCudaMPI final
                      const std::vector<std::size_t> &wires,
                      bool adjoint = false) {
         PL_ABORT_IF(wires.empty(), "Number of wires must be larger than 0");
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        mpi_manager_.Barrier();
         const std::string opName = "Matrix";
         std::size_t n = std::size_t{1} << wires.size();
         const std::vector<std::complex<PrecisionT>> matrix(gate_matrix,
@@ -502,6 +583,8 @@ class StateVectorCudaMPI final
                                x);
                        });
         applyOperation(opName, wires, adjoint, {}, matrix_cu);
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        mpi_manager_.Barrier();
     }
 
     /**
@@ -1617,6 +1700,54 @@ class StateVectorCudaMPI final
         mpi_manager_.Barrier();
     }
 
+    /**
+     * @brief collapse the state vector to a given basis state.
+     *
+     * @param wire_local Local wire index.
+     * @param branch Branch index.
+     */
+    void collapse_local_(const int wire_local, const bool branch) {
+        cudaDataType_t data_type;
+
+        if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
+                      std::is_same_v<CFP_t, double2>) {
+            data_type = CUDA_C_64F;
+        } else {
+            data_type = CUDA_C_32F;
+        }
+
+        std::vector<int> basisBits(1, wire_local);
+
+        double abs2sum0_local, abs2sum1_local;
+
+        PL_CUSTATEVEC_IS_SUCCESS(custatevecAbs2SumOnZBasis(
+            /* custatevecHandle_t */ handle_.get(),
+            /* void *sv */ BaseType::getData(),
+            /* cudaDataType_t */ data_type,
+            /* const uint32_t nIndexBits */ getNumLocalQubits(),
+            /* double * */ &abs2sum0_local,
+            /* double * */ &abs2sum1_local,
+            /* const int32_t * */ basisBits.data(),
+            /* const uint32_t nBasisBits */ basisBits.size()));
+
+        auto abs2sum0 = mpi_manager_.allreduce<double>(abs2sum0_local, "sum");
+        auto abs2sum1 = mpi_manager_.allreduce<double>(abs2sum1_local, "sum");
+
+        double norm = (branch == 0) ? abs2sum0 : abs2sum1;
+
+        const int parity = static_cast<int>(branch);
+
+        PL_CUSTATEVEC_IS_SUCCESS(custatevecCollapseOnZBasis(
+            /* custatevecHandle_t */ handle_.get(),
+            /* void *sv */ BaseType::getData(),
+            /* cudaDataType_t */ data_type,
+            /* const uint32_t nIndexBits */ getNumLocalQubits(),
+            /* const int32_t parity */ parity,
+            /* const int32_t *basisBits */ basisBits.data(),
+            /* const uint32_t nBasisBits */ basisBits.size(),
+            /* double norm */ norm));
+    }
+
     /**
      * @brief Get expectation value for a sum of Pauli words.
      *
@@ -1899,6 +2030,8 @@ class StateVectorCudaMPI final
                                const std::vector<std::size_t> &ctrls,
                                const std::vector<std::size_t> &tgts,
                                bool use_adjoint = false) {
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        mpi_manager_.Barrier();
         std::vector<int> ctrlsInt(ctrls.size());
         std::vector<int> tgtsInt(tgts.size());
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
index 9e68592dfd..7ade4ab2b9 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -487,6 +487,54 @@ class StateVectorCudaManaged
         applyMatrix(gate_matrix.data(), wires, adjoint);
     }
 
+    /**
+     * @brief Collapse the state vector after having measured one of the qubit.
+     *
+     * Note: The branch parameter imposes the measurement result on the given
+     * wire.
+     *
+     * @param wire Wire to measure.
+     * @param branch Branch 0 or 1.
+     */
+    void collapse(const std::size_t wire, const bool branch) {
+        PL_ABORT_IF_NOT(wire < BaseType::getNumQubits(), "Invalid wire index.");
+        cudaDataType_t data_type;
+
+        if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
+                      std::is_same_v<CFP_t, double2>) {
+            data_type = CUDA_C_64F;
+        } else {
+            data_type = CUDA_C_32F;
+        }
+
+        std::vector<int> basisBits(1, BaseType::getNumQubits() - 1 - wire);
+
+        double abs2sum0, abs2sum1;
+        PL_CUSTATEVEC_IS_SUCCESS(custatevecAbs2SumOnZBasis(
+            /* custatevecHandle_t */ handle_.get(),
+            /* void *sv */ BaseType::getData(),
+            /* cudaDataType_t */ data_type,
+            /* const uint32_t nIndexBits */ BaseType::getNumQubits(),
+            /* double * */ &abs2sum0,
+            /* double * */ &abs2sum1,
+            /* const int32_t * */ basisBits.data(),
+            /* const uint32_t nBasisBits */ basisBits.size()));
+
+        double norm = (branch == 0) ? abs2sum0 : abs2sum1;
+
+        const int parity = static_cast<int>(branch);
+
+        PL_CUSTATEVEC_IS_SUCCESS(custatevecCollapseOnZBasis(
+            /* custatevecHandle_t */ handle_.get(),
+            /* void *sv */ BaseType::getData(),
+            /* cudaDataType_t */ data_type,
+            /* const uint32_t nIndexBits */ BaseType::getNumQubits(),
+            /* const int32_t parity */ parity,
+            /* const int32_t *basisBits */ basisBits.data(),
+            /* const uint32_t nBasisBits */ basisBits.size(),
+            /* double norm */ norm));
+    }
+
     //****************************************************************************//
     // Explicit gate calls for bindings
     //****************************************************************************//
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
index c361bd6ed9..145097b30e 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
@@ -150,6 +150,8 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
             },
             py::arg("async") = false,
             "Initialize the statevector data to the |0...0> state")
+        .def("collapse", &StateVectorT::collapse,
+             "Collapse the statevector onto the 0 or 1 branch of a given wire.")
         .def(
             "apply",
             [](StateVectorT &sv, const std::string &str,
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
index 2d3313f694..529f5ae75e 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
@@ -100,6 +100,8 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) {
             },
             "Set State Vector on GPU with values for the state vector and "
             "wires on the host memory.")
+        .def("collapse", &StateVectorT::collapse,
+             "Collapse the statevector onto the 0 or 1 branch of a given wire.")
         .def(
             "DeviceToDevice",
             [](StateVectorT &sv, const StateVectorT &other, bool async) {
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu b/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu
index 4e3e93ea79..8a62e89e84 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/initSV.cu
@@ -59,7 +59,7 @@ void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value,
                         cudaStream_t stream_id);
 
 /**
- * @brief The CUDA kernel that setS state vector data on GPU device from the
+ * @brief The CUDA kernel that sets state vector data on GPU device from the
  * input values (on device) and their corresponding indices (on device)
  * information.
  *
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp
index 6fee1711d2..710930ba54 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp
@@ -265,7 +265,7 @@ class MeasurementsMPI final
      * number between 0 and num_samples-1.
      */
     auto generate_samples(std::size_t num_samples) -> std::vector<std::size_t> {
-        double epsilon = 1e-15;
+        double epsilon = std::numeric_limits<double>::epsilon() * 1.0e2;
         std::size_t nSubSvs = 1UL << (this->_statevector.getNumGlobalQubits());
         std::vector<double> rand_nums(num_samples);
         std::vector<std::size_t> samples(
@@ -280,8 +280,8 @@ class MeasurementsMPI final
             bitOrdering[i] = i;
         }
 
-        std::vector<custatevecIndex_t> localBitStrings(num_samples);
-        std::vector<custatevecIndex_t> globalBitStrings(num_samples);
+        std::vector<custatevecIndex_t> localBitStrings(num_samples, 0);
+        std::vector<custatevecIndex_t> globalBitStrings(num_samples, 0);
 
         if (mpi_manager_.getRank() == 0) {
             for (std::size_t n = 0; n < num_samples; n++) {
@@ -320,6 +320,8 @@ class MeasurementsMPI final
             /* custatevecHandle_t */ this->_statevector.getCusvHandle(),
             /* custatevecSamplerDescriptor_t */ sampler,
             /* double * */ &subNorm));
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        mpi_manager_.Barrier();
 
         int source = (mpi_manager_.getRank() - 1 + mpi_manager_.getSize()) %
                      mpi_manager_.getSize();
@@ -354,6 +356,8 @@ class MeasurementsMPI final
             /* double */ precumulative,
             /* double */ norm));
 
+        norm = (norm < epsilon) ? epsilon : norm;
+
         PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
         auto low = std::lower_bound(rand_nums.begin(), rand_nums.end(),
                                     cumulative / norm);
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp
index 4003395b53..0301970390 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/Test_StateVectorCudaManaged.cpp
@@ -266,3 +266,45 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::StateVectorCudaManaged",
         REQUIRE(std::is_constructible_v<StateVectorT, const StateVectorT &>);
     }
 }
+
+TEMPLATE_TEST_CASE("StateVectorCudaManaged::collapse",
+                   "[StateVectorCudaManaged]", float, double) {
+    using PrecisionT = TestType;
+    using ComplexT = typename StateVectorCudaManaged<PrecisionT>::ComplexT;
+    using CFP_t = typename StateVectorCudaManaged<PrecisionT>::CFP_t;
+    using TestVectorT = TestVector<ComplexT>;
+
+    std::size_t wire = GENERATE(0, 1, 2);
+    std::size_t branch = GENERATE(0, 1);
+    const std::size_t num_qubits = 3;
+
+    // TODO @tomlqc use same template for testing all Lightning flavours?
+
+    SECTION("Collapse the state vector after having measured one of the "
+            "qubits.") {
+        TestVectorT init_state = createPlusState_<ComplexT>(num_qubits);
+
+        const ComplexT coef{0.5, PrecisionT{0.0}};
+        const ComplexT zero{PrecisionT{0.0}, PrecisionT{0.0}};
+
+        std::vector<std::vector<std::vector<ComplexT>>> expected_state = {
+            {{coef, coef, coef, coef, zero, zero, zero, zero},
+             {coef, coef, zero, zero, coef, coef, zero, zero},
+             {coef, zero, coef, zero, coef, zero, coef, zero}},
+            {{zero, zero, zero, zero, coef, coef, coef, coef},
+             {zero, zero, coef, coef, zero, zero, coef, coef},
+             {zero, coef, zero, coef, zero, coef, zero, coef}},
+        };
+
+        StateVectorCudaManaged<PrecisionT> sv(
+            reinterpret_cast<CFP_t *>(init_state.data()), init_state.size());
+
+        sv.collapse(wire, branch);
+
+        PrecisionT eps = std::numeric_limits<PrecisionT>::epsilon() * 1e2;
+        REQUIRE(isApproxEqual(sv.getDataVector().data(),
+                              sv.getDataVector().size(),
+                              expected_state[branch][wire].data(),
+                              expected_state[branch][wire].size(), eps));
+    }
+}
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp
index 4b5a2dd349..401f6056b3 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp
@@ -317,4 +317,76 @@ TEMPLATE_PRODUCT_TEST_CASE("StateVectorCudaMPI::applyOperations",
                                          {false, false}, {{0.0}}),
             LightningException, "must all be equal"); // invalid parameters
     }
-}
\ No newline at end of file
+}
+
+TEMPLATE_TEST_CASE("StateVectorCudaManaged::collapse",
+                   "[StateVectorCudaManaged]", float, double) {
+    using PrecisionT = TestType;
+    using ComplexT = typename StateVectorCudaMPI<PrecisionT>::ComplexT;
+    using TestVectorT = TestVector<ComplexT>;
+
+    std::size_t wire = GENERATE(0, 1, 2);
+    std::size_t branch = GENERATE(0, 1);
+    const std::size_t num_qubits = 3;
+
+    using StateVectorT = StateVectorCudaMPI<TestType>;
+    MPIManager mpi_manager(MPI_COMM_WORLD);
+    REQUIRE(mpi_manager.getSize() == 2);
+
+    std::size_t mpi_buffersize = 1;
+
+    int nGlobalIndexBits =
+        std::bit_width(static_cast<unsigned int>(mpi_manager.getSize())) - 1;
+    int nLocalIndexBits = num_qubits - nGlobalIndexBits;
+    mpi_manager.Barrier();
+
+    int nDevices = 0; // Number of GPU devices per node
+    cudaGetDeviceCount(&nDevices);
+    REQUIRE(nDevices >= 2);
+    int deviceId = mpi_manager.getRank() % nDevices;
+    cudaSetDevice(deviceId);
+    DevTag<int> dt_local(deviceId, 0);
+
+    TestVectorT init_state = createPlusState_<ComplexT>(num_qubits);
+
+    std::size_t subSvLength = 1 << nLocalIndexBits;
+
+    mpi_manager.Barrier();
+
+    std::vector<ComplexT> local_state(subSvLength);
+
+    mpi_manager.Scatter(init_state.data(), local_state.data(), subSvLength, 0);
+    mpi_manager.Barrier();
+
+    // TODO @tomlqc use same template for testing all Lightning flavours?
+
+    SECTION("Collapse the state vector after having measured one of the "
+            "qubits.") {
+        const ComplexT coef{0.5, PrecisionT{0.0}};
+        const ComplexT zero{PrecisionT{0.0}, PrecisionT{0.0}};
+
+        std::vector<std::vector<std::vector<ComplexT>>> expected_state = {
+            {{coef, coef, coef, coef, zero, zero, zero, zero},
+             {coef, coef, zero, zero, coef, coef, zero, zero},
+             {coef, zero, coef, zero, coef, zero, coef, zero}},
+            {{zero, zero, zero, zero, coef, coef, coef, coef},
+             {zero, zero, coef, coef, zero, zero, coef, coef},
+             {zero, coef, zero, coef, zero, coef, zero, coef}},
+        };
+
+        StateVectorT sv(mpi_manager, dt_local, mpi_buffersize, nGlobalIndexBits,
+                        nLocalIndexBits);
+
+        sv.CopyHostDataToGpu(local_state.data(), local_state.size(), false);
+
+        sv.collapse(wire, branch);
+
+        auto expected_local_state =
+            mpi_manager.scatter(expected_state[branch][wire], 0);
+
+        PrecisionT eps = std::numeric_limits<PrecisionT>::epsilon() * 1e2;
+        REQUIRE(isApproxEqual(
+            sv.getDataVector().data(), sv.getDataVector().size(),
+            expected_local_state.data(), expected_local_state.size(), eps));
+    }
+}
diff --git a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
index cd422899b5..984a9d2358 100644
--- a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
+++ b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
@@ -274,6 +274,61 @@ inline auto scaleC_CUDA(const CFP_t a, T *v1, const int data_size,
                 data_type);
 }
 
+/**
+ * @brief cuBLAS backed GPU data normalization.
+ *
+ * @tparam CFP_t Complex float data-type. Accepts cuDoubleComplex and cuComplex
+ * @tparam DevTypeID Integer type of device id.
+ *
+ * @param v1 Device data pointer
+ * @param data_size Length of device data.
+ * @param dev_id the device on which the function should be executed.
+ * @param stream_id the CUDA stream on which the operation should be executed.
+ * @param cublas the CublasCaller object that manages the cuBLAS handle.
+ */
+template <class CFP_t = cuDoubleComplex, class DevTypeID = int>
+inline auto norm2_CUDA(CFP_t *v1, const int data_size, DevTypeID dev_id,
+                       cudaStream_t stream_id, const CublasCaller &cublas) {
+    if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
+                  std::is_same_v<CFP_t, double2>) {
+        double norm{0.0};
+        cublas.call(cublasDznrm2, dev_id, stream_id, data_size, v1, 1, &norm);
+        return norm;
+    } else {
+        float norm{0.0};
+        cublas.call(cublasScnrm2, dev_id, stream_id, data_size, v1, 1, &norm);
+        return norm;
+    }
+}
+
+/**
+ * @brief cuBLAS backed GPU data normalization.
+ *
+ * @tparam T Float data-type. Accepts float and double
+ * @tparam CFP_t Complex float data-type. Accepts cuDoubleComplex and cuComplex
+ *
+ * @param norm2 Norm of the vector
+ * @param v1 Device data pointer
+ * @param data_size Length of device data.
+ * @param dev_id the device on which the function should be executed.
+ * @param stream_id the CUDA stream on which the operation should be executed.
+ * @param cublas the CublasCaller object that manages the cuBLAS handle.
+ */
+template <class T = double, class CFP_t = cuDoubleComplex,
+          class DevTypeID = int>
+inline auto normalize_CUDA(T norm2, CFP_t *v1, const int data_size,
+                           DevTypeID dev_id, cudaStream_t stream_id,
+                           const CublasCaller &cublas) {
+    if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
+                  std::is_same_v<CFP_t, double2>) {
+        const double alpha = 1.0 / norm2;
+        cublas.call(cublasZdscal, dev_id, stream_id, data_size, &alpha, v1, 1);
+    } else {
+        const float alpha = 1.0 / norm2;
+        cublas.call(cublasCsscal, dev_id, stream_id, data_size, &alpha, v1, 1);
+    }
+}
+
 /** @brief `%CudaScopedDevice` uses RAII to select a CUDA device context.
  *
  * @see https://taskflow.github.io/taskflow/classtf_1_1cudaScopedDevice.html
diff --git a/pennylane_lightning/lightning_gpu/_measurements.py b/pennylane_lightning/lightning_gpu/_measurements.py
index 4b95762ccc..337f6273e3 100644
--- a/pennylane_lightning/lightning_gpu/_measurements.py
+++ b/pennylane_lightning/lightning_gpu/_measurements.py
@@ -34,6 +34,7 @@
 except ImportError as error_import:
     warn(str(error_import), UserWarning)
 
+from functools import reduce
 from typing import List
 
 import numpy as np
@@ -105,8 +106,9 @@ def _measure_with_samples_diagonalizing_gates(
         self._apply_diagonalizing_gates(mps)
 
         # Specific for LGPU:
-        total_indices = self._qubit_state.num_wires
-        wires = qml.wires.Wires(range(total_indices))
+        # total_indices = self._qubit_state.num_wires
+        # wires = qml.wires.Wires(range(total_indices))
+        wires = reduce(sum, (mp.wires for mp in mps))
 
         def _process_single_shot(samples):
             processed = []
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index a000443563..77e453778b 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -36,13 +36,17 @@
 import numpy as np
 import pennylane as qml
 from pennylane import DeviceError
+from pennylane.measurements import MidMeasureMP
+from pennylane.ops import Conditional
 from pennylane.ops.op_math import Adjoint
+from pennylane.tape import QuantumScript
 from pennylane.wires import Wires
 
 # pylint: disable=ungrouped-imports
 from pennylane_lightning.core._serialize import global_phase_diagonal
 from pennylane_lightning.core._state_vector_base import LightningBaseStateVector
 
+from ._measurements import LightningGPUMeasurements
 from ._mpi_handler import MPIHandler
 
 gate_cache_needs_hash = (
@@ -247,15 +251,33 @@ def _apply_lightning_controlled(self, operation):
         matrix = global_phase_diagonal(param, self.wires, control_wires, control_values)
         state.apply(name, wires, inv, [[param]], matrix)
 
-    def _apply_lightning_midmeasure(self):
+    def _apply_lightning_midmeasure(
+        self, operation: MidMeasureMP, mid_measurements: dict, postselect_mode: str
+    ):
         """Execute a MidMeasureMP operation and return the sample in mid_measurements.
 
         Args:
+            operation (~pennylane.operation.Operation): mid-circuit measurement
+            mid_measurements (None, dict): Dictionary of mid-circuit measurements
+            postselect_mode (str): Configuration for handling shots with mid-circuit measurement
+                postselection. Use ``"hw-like"`` to discard invalid shots and ``"fill-shots"`` to
+                keep the same number of shots.
 
         Returns:
             None
         """
-        raise DeviceError("LightningGPU does not support Mid-circuit measurements.")
+        wires = self.wires.indices(operation.wires)
+        wire = list(wires)[0]
+        if postselect_mode == "fill-shots" and operation.postselect is not None:
+            sample = operation.postselect
+        else:
+            circuit = QuantumScript([], [qml.sample(wires=operation.wires)], shots=1)
+            sample = LightningGPUMeasurements(self).measure_final_state(circuit)
+            sample = np.squeeze(sample)
+        mid_measurements[operation] = sample
+        getattr(self.state_vector, "collapse")(wire, bool(sample))
+        if operation.reset and bool(sample):
+            self.apply_operations([qml.PauliX(operation.wires)], mid_measurements=mid_measurements)
 
     # pylint: disable=unused-argument
     def _apply_lightning(
@@ -289,7 +311,14 @@ def _apply_lightning(
             method = getattr(state, name, None)
             wires = list(operation.wires)
 
-            if method is not None:  # apply specialized gate
+            if isinstance(operation, Conditional):
+                if operation.meas_val.concretize(mid_measurements):
+                    self._apply_lightning([operation.base])
+            elif isinstance(operation, MidMeasureMP):
+                self._apply_lightning_midmeasure(
+                    operation, mid_measurements, postselect_mode=postselect_mode
+                )
+            elif method is not None:  # apply specialized gate
                 param = operation.parameters
                 method(wires, invert_param, param)
             elif isinstance(operation, qml.ops.Controlled) and isinstance(
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 2b295c4990..d6f75b2c5e 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -173,10 +173,7 @@ def stopping_condition(op: Operator) -> bool:
 def stopping_condition_shots(op: Operator) -> bool:
     """A function that determines whether or not an operation is supported by ``lightning.gpu``
     with finite shots."""
-    if isinstance(op, (MidMeasureMP, qml.ops.op_math.Conditional)):
-        # LightningGPU does not support Mid-circuit measurements.
-        return False
-    return stopping_condition(op)
+    return stopping_condition(op) or isinstance(op, (MidMeasureMP, qml.ops.op_math.Conditional))
 
 
 def accepted_observables(obs: Operator) -> bool:
@@ -460,6 +457,7 @@ def execute(
                 self.simulate(
                     circuit,
                     self._statevector,
+                    postselect_mode=execution_config.mcm_config.postselect_mode,
                 )
             )
 
@@ -494,12 +492,16 @@ def simulate(
         self,
         circuit: QuantumScript,
         state: LightningGPUStateVector,
+        postselect_mode: Optional[str] = None,
     ) -> Result:
         """Simulate a single quantum script.
 
         Args:
             circuit (QuantumTape): The single circuit to simulate
             state (LightningGPUStateVector): handle to Lightning state vector
+            postselect_mode (str): Configuration for handling shots with mid-circuit measurement
+                postselection. Use ``"hw-like"`` to discard invalid shots and ``"fill-shots"`` to
+                keep the same number of shots. Default is ``None``.
 
         Returns:
             Tuple[TensorLike]: The results of the simulation
@@ -507,7 +509,25 @@ def simulate(
         Note that this function can return measurements for non-commuting observables simultaneously.
         """
         if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)):
-            raise qml.DeviceError("LightningGPU does not support Mid-circuit measurements.")
+            results = []
+            aux_circ = QuantumScript(
+                circuit.operations,
+                circuit.measurements,
+                shots=[1],
+                trainable_params=circuit.trainable_params,
+            )
+            for _ in range(circuit.shots.total_shots):
+                state.reset_state()
+                mid_measurements = {}
+                final_state = state.get_final_state(
+                    aux_circ, mid_measurements=mid_measurements, postselect_mode=postselect_mode
+                )
+                results.append(
+                    self.LightningMeasurements(final_state).measure_final_state(
+                        aux_circ, mid_measurements=mid_measurements
+                    )
+                )
+            return tuple(results)
 
         state.reset_state()
         final_state = state.get_final_state(circuit)
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.toml b/pennylane_lightning/lightning_gpu/lightning_gpu.toml
index 518315de09..b18470da6b 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.toml
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.toml
@@ -98,7 +98,7 @@ qjit_compatible = false
 # If the device requires run time generation of the quantum circuit.
 runtime_code_generation = false
 # If the device supports mid circuit measurements natively
-mid_circuit_measurement = false
+mid_circuit_measurement = true
 
 # This field is currently unchecked but it is reserved for the purpose of
 # determining if the device supports dynamic qubit allocation/deallocation.
diff --git a/tests/test_native_mcm.py b/tests/test_native_mcm.py
index 07281fb48a..050e1d27c6 100644
--- a/tests/test_native_mcm.py
+++ b/tests/test_native_mcm.py
@@ -21,7 +21,7 @@
 from conftest import LightningDevice, device_name, validate_measurements
 from flaky import flaky
 
-if device_name not in ("lightning.qubit", "lightning.kokkos"):
+if device_name not in ("lightning.qubit", "lightning.kokkos", "lightning.gpu"):
     pytest.skip("Native MCM not supported. Skipping.", allow_module_level=True)
 
 if not LightningDevice._CPP_BINARY_AVAILABLE:  # pylint: disable=protected-access
@@ -89,7 +89,7 @@ def func(x, y):
             match=f"not accepted with finite shots on lightning.qubit",
         ):
             func(*params)
-    if device_name == "lightning.kokkos":
+    if device_name in ("lightning.kokkos", "lightning.gpu"):
         with pytest.raises(
             qml.DeviceError,
             match=r"Measurement shadow\(wires=\[0\]\) not accepted with finite shots on "