From 87619f9cdb5d3864bb0d290e6716dcded846d85d Mon Sep 17 00:00:00 2001
From: Shuli Shu <31480676+multiphaseCFD@users.noreply.github.com>
Date: Wed, 8 Jan 2025 15:24:00 -0500
Subject: [PATCH] Automate aarch64-cuda-wheels tests (#1031)

### Before submitting

Please complete the following checklist when submitting a PR:

- [ ] All new features must include a unit test.
If you've fixed a bug or added code that should be tested, add a test to
the
      [`tests`](../tests) directory!

- [ ] All new functions and code must be clearly commented and
documented.
If you do make documentation changes, make sure that the docs build and
      render correctly by running `make docs`.

- [ ] Ensure that the test suite passes, by running `make test`.

- [ ] Add a new entry to the `.github/CHANGELOG.md` file, summarizing
the
      change, and including a link back to the PR.

- [ ] Ensure that code is properly formatted by running `make format`.

When all the above are checked, delete everything above the dashed
line and fill in the pull request template.


------------------------------------------------------------------------------------------------------------

**Context:**

[sc-81544] & [sc-81555]

This PR fixes the Python CUDA dependencies bug and automates the arm64 LT
and LGPU wheel tests.

**Description of the Change:**

**Benefits:**

**Possible Drawbacks:**

**Related GitHub Issues:**

---------

Co-authored-by: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
---
 .github/CHANGELOG.md                          | 14 +++++--
 .../workflows/wheel_linux_aarch64_cuda.yml    | 37 ++++++++++++++++---
 CMakeLists.txt                                |  2 +-
 pennylane_lightning/core/_version.py          |  2 +-
 .../test_measurements_class.py                |  4 +-
 5 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index bfb8d3bdbc..18325cc4fc 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -29,17 +29,20 @@
 
 ### Improvements
 
+* Add CI wheels checks for `aarch64` wheels of Lightning-GPU and Lightning-Tensor.
+  [(#1031)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1031)
+
 * Replace the `dummy_tensor_update` method with the `cutensornetStateCaptureMPS`API to ensure that further gates apply is allowed after the `cutensornetStateCompute` call.
-  [(#1028)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1028/)
+  [(#1028)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1028)
 
 * Add unit test for measurement with shots for Lightning Tensor with `tn` method.
   [(#1027)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1027)
 
 * Add CUDA dependencies to Lightning GPU and Lightning Tensor Python wheels.
-  [(#1025)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1025/)
+  [(#1025)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1025)
 
 * Update the python layer UI of Lightning Tensor.
-  [(#1022)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1022/)
+  [(#1022)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1022)
 
 * Catalyst device interfaces support dynamic shots, and no longer parses the device init op's attribute dictionary for a static shots literal.
   [(#1017)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1017)
@@ -51,7 +54,7 @@
   [(#1015)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1015)
 
 * Add Exact Tensor Network cpp binding.
-  [(#1014)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1014/)
+  [(#1014)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1014)
 
 * Reverse Lightning Qubit generators vector insertion order.
   [(#1009)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1009)
@@ -104,6 +107,9 @@
 
 ### Bug fixes
 
+* Fix Python CUDA dependencies by adding path to `nvidia/nvjitlink/lib` to RPATH.
+  [(#1031)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1031)
+
 * Add `RTLD_NODELETE` flag to `dlopen` in order to mitigate the segfault issues for arm64-macos Catalyst support.
   [(#1030)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1030)
 
diff --git a/.github/workflows/wheel_linux_aarch64_cuda.yml b/.github/workflows/wheel_linux_aarch64_cuda.yml
index 922dbb7116..4839dd426f 100644
--- a/.github/workflows/wheel_linux_aarch64_cuda.yml
+++ b/.github/workflows/wheel_linux_aarch64_cuda.yml
@@ -34,7 +34,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [pl-4-core-large-runner]
+        os: [arm-gpu]
         arch: [aarch64]
         pl_backend: ["lightning_gpu", "lightning_tensor"]
         cuda_version: ["12"]
@@ -42,7 +42,9 @@ jobs:
         container_img: ["quay.io/pypa/manylinux_2_28_aarch64"]
     timeout-minutes: 45
     name: ${{ matrix.os }}::${{ matrix.arch }} - ${{ matrix.pl_backend }} (Python ${{ fromJson('{ "cp310-*":"3.10","cp311-*":"3.11", "cp312-*":"3.12" }')[matrix.cibw_build] }})
-    runs-on: ${{ matrix.os }}
+    runs-on: 
+      - self-hosted
+      - ${{ matrix.os }}
 
     steps:
       - name: Checkout PennyLane-Lightning
@@ -59,9 +61,6 @@ jobs:
       - name: Configure pyproject.toml file
         run: PL_BACKEND="${{ matrix.pl_backend }}" python scripts/configure_pyproject_toml.py
 
-      - uses: docker/setup-qemu-action@v3
-        name: Set up QEMU
-
       - name: Build wheels
         env:
           CIBW_ARCHS_LINUX: ${{matrix.arch}}
@@ -94,6 +93,34 @@ jobs:
 
         run: python3 -m cibuildwheel --output-dir wheelhouse
 
+      - name: Determine Python version
+        id: pyvs
+        shell: bash
+        run: |
+          echo "version=$(echo  ${{ matrix.cibw_build }} | tr -cd '[:digit:].' | sed 's/./&./1')" >> $GITHUB_OUTPUT
+
+      - uses: actions/setup-python@v5
+        name: Install Python
+        with:
+          python-version: ${{ steps.pyvs.outputs.version }}
+
+      - name: Test wheels
+        run: |
+          python -m ensurepip --upgrade
+          python -m pip install -r requirements-tests.txt
+          PL_BACKEND="lightning_qubit" python scripts/configure_pyproject_toml.py
+          SKIP_COMPILATION=True python -m pip install . -vv
+          python -m pip install ./wheelhouse/*.whl
+          DEVICENAME=`echo ${{ matrix.pl_backend }} | sed "s/_/./g"`
+          if (${{ matrix.pl_backend == 'lightning_tensor' }}) then
+            PL_DEVICE=${DEVICENAME} python -m pytest tests/
+          else
+            pl-device-test --device=${DEVICENAME} --skip-ops -x --tb=short --no-flaky-report
+            # MCM tests are slow and skipped. get_c_interface() API is not supported with current test setup and skipped.
+            PL_DEVICE=${DEVICENAME} python -m pytest tests/ -k "not test_supported_linux_platform_gpu and not test_native_mcm"
+          fi
+          
+
       - name: Validate wheels
         run: |
          python3 -m pip install twine
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 030de959a8..cb6ea5c93a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -153,7 +153,7 @@ if(ENABLE_PYTHON)
     if("${PL_BACKEND}" STREQUAL "lightning_gpu" OR "${PL_BACKEND}" STREQUAL "lightning_tensor")
         # Allow pip installation of cuQuantum & CUDA 12 libs to be accessible without setting LD_LIBRARY_PATH for lightning_gpu
         # BUILD_RPATH only works for the last call
-        set_target_properties("${PL_BACKEND}_ops" PROPERTIES BUILD_RPATH "$ORIGIN/../cuquantum/lib:$ORIGIN/../nvidia/cuda_runtime/lib:$ORIGIN/../nvidia/cublas/lib:$ORIGIN/../nvidia/cusparse/lib:${SCIPY_OPENBLAS32_RUNTIME_LIB_PATH}:$ORIGIN")
+        set_target_properties("${PL_BACKEND}_ops" PROPERTIES BUILD_RPATH "$ORIGIN/../cuquantum/lib:$ORIGIN/../nvidia/cuda_runtime/lib:$ORIGIN/../nvidia/nvjitlink/lib:$ORIGIN/../nvidia/cublas/lib:$ORIGIN/../nvidia/cusparse/lib:${SCIPY_OPENBLAS32_RUNTIME_LIB_PATH}:$ORIGIN")
     else()
         set_target_properties("${PL_BACKEND}_ops" PROPERTIES BUILD_RPATH "${SCIPY_OPENBLAS32_RUNTIME_LIB_PATH}")
     endif()
diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 9b9e9b997d..fb54db9a0c 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.40.0-rc0"
+__version__ = "0.40.0-rc1"
diff --git a/tests/lightning_tensor/test_measurements_class.py b/tests/lightning_tensor/test_measurements_class.py
index 8cd48f23b3..d3c9c96648 100644
--- a/tests/lightning_tensor/test_measurements_class.py
+++ b/tests/lightning_tensor/test_measurements_class.py
@@ -140,7 +140,7 @@ def test_probs_many_wires(self, method, n_qubits, n_targets, tol):
             pytest.skip("Number of targets cannot exceed the number of wires.")
 
         dev = qml.device(device_name, wires=n_qubits, **method)
-        dq = qml.device("lightning.qubit", wires=n_qubits)
+        dq = qml.device("default.qubit", wires=n_qubits)
 
         init_state = np.random.rand(2**n_qubits) + 1.0j * np.random.rand(2**n_qubits)
         init_state /= np.linalg.norm(init_state)
@@ -168,7 +168,7 @@ def test_state_many_wires(self, method, n_qubits, n_targets, tol):
             pytest.skip("Number of targets cannot exceed the number of wires.")
 
         dev = qml.device(device_name, wires=n_qubits, **method)
-        dq = qml.device("lightning.qubit", wires=n_qubits)
+        dq = qml.device("default.qubit", wires=n_qubits)
 
         init_state = np.random.rand(2**n_qubits) + 1.0j * np.random.rand(2**n_qubits)
         init_state /= np.linalg.norm(init_state)