From 72ce285190eb0a1bc54c4b2169a6663581b4e54a Mon Sep 17 00:00:00 2001 From: Prashanth Kanduri Date: Mon, 29 Apr 2024 13:22:20 +0200 Subject: [PATCH 01/28] Add CI config files --- ci/docker/Dockerfile.base | 6 ++++++ ci/pipeline.yml | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 ci/docker/Dockerfile.base create mode 100644 ci/pipeline.yml diff --git a/ci/docker/Dockerfile.base b/ci/docker/Dockerfile.base new file mode 100644 index 0000000..f1c85e8 --- /dev/null +++ b/ci/docker/Dockerfile.base @@ -0,0 +1,6 @@ +FROM nvcr.io/nvidia/pytorch:23.10-py3 + +RUN apt-get update + +# install boost test framework +RUN apt-get install -y libboost-test-dev diff --git a/ci/pipeline.yml b/ci/pipeline.yml new file mode 100644 index 0000000..61af310 --- /dev/null +++ b/ci/pipeline.yml @@ -0,0 +1,33 @@ +include: + - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml' + +stages: + - build + - test + +build_base_image_job: + stage: build + extends: .container-builder-dynamic-name + timeout: 2h + variables: + DOCKERFILE: ci/docker/Dockerfile.base + WATCH_FILECHANGES: $DOCKERFILE + PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/base/public/mops + +test_job: + stage: test + extends: .container-runner-daint-gpu + image: $BASE_IMAGE + timeout: 2h + script: + - export CUDA_HOME="/usr/local/cuda" + - python3 -m pip install --upgrade pip + - python3 -m pip install tox + - tox + + variables: + SLURM_JOB_NUM_NODES: 1 + SLURM_PARTITION: normal + SLURM_NTASKS: 1 + SLURM_TIMELIMIT: '00:40:00' + GIT_STRATEGY: fetch From d463d2161b2eacb923854f5c0e159e6ecd496875 Mon Sep 17 00:00:00 2001 From: "Nick J. Browning" Date: Thu, 2 May 2024 10:44:55 +0200 Subject: [PATCH 02/28] added stream management to OPSA --- mops-torch/src/opsa.cpp | 19 +++++++++++++++++-- mops/CMakeLists.txt | 2 +- mops/include/mops/opsa.h | 18 ++++++++++++------ mops/include/mops/opsa.hpp | 33 +++++++++++++++++++++++--------- mops/src/opsa/capi.cpp | 36 +++++++++++++++++++++++------------ mops/src/opsa/opsa.cpp | 32 ++++++++++++++++++++++--------- mops/src/opsa/opsa.cu | 39 ++++++++++++++++++++++++++++---------- 7 files changed, 130 insertions(+), 49 deletions(-) diff --git a/mops-torch/src/opsa.cpp b/mops-torch/src/opsa.cpp index 0b782dd..cdccedf 100644 --- a/mops-torch/src/opsa.cpp +++ b/mops-torch/src/opsa.cpp @@ -1,3 +1,8 @@ +#ifdef MOPS_CUDA_ENABLED +#include +#include +#endif + #include "mops/torch/opsa.hpp" #include "mops/torch/utils.hpp" @@ -48,6 +53,10 @@ torch::Tensor OuterProductScatterAdd::forward( #ifndef MOPS_CUDA_ENABLED C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str()); #else + c10::cuda::CUDAGuard deviceGuard{A.device()}; + cudaStream_t currstream = c10::cuda::getCurrentCUDAStream(); + void* stream = reinterpret_cast(currstream); + output = torch::empty( {output_size, A.size(1), B.size(1)}, torch::TensorOptions().dtype(A.scalar_type()).device(A.device()) @@ -58,7 +67,8 @@ torch::Tensor OuterProductScatterAdd::forward( details::torch_to_mops_3d(output), details::torch_to_mops_2d(A), details::torch_to_mops_2d(B), - details::torch_to_mops_1d(indices_output) + details::torch_to_mops_1d(indices_output), + stream ); }); @@ -130,6 +140,10 @@ std::vector OuterProductScatterAddBackward::forward( #ifndef MOPS_CUDA_ENABLED C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str()); #else + c10::cuda::CUDAGuard deviceGuard{A.device()}; + cudaStream_t currstream = c10::cuda::getCurrentCUDAStream(); + void* stream = 
reinterpret_cast(currstream); + AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "outer_product_scatter_add_vjp", [&]() { auto mops_grad_A = mops::Tensor{nullptr, {0, 0}}; @@ -150,7 +164,8 @@ std::vector OuterProductScatterAddBackward::forward( details::torch_to_mops_3d(grad_output), details::torch_to_mops_2d(A), details::torch_to_mops_2d(B), - details::torch_to_mops_1d(indices_output) + details::torch_to_mops_1d(indices_output), + stream ); }); #endif diff --git a/mops/CMakeLists.txt b/mops/CMakeLists.txt index 846eb47..efc00b7 100644 --- a/mops/CMakeLists.txt +++ b/mops/CMakeLists.txt @@ -110,7 +110,7 @@ add_library(mops if(CMAKE_CUDA_COMPILER AND MOPS_CUDA) target_compile_definitions(mops PUBLIC MOPS_CUDA_ENABLED) - set_target_properties(mops PROPERTIES CUDA_ARCHITECTURES native) + set_target_properties(mops PROPERTIES CUDA_ARCHITECTURES all) set_target_properties(mops PROPERTIES CUDA_NVCC_FLAGS "-lineinfo") target_sources(mops diff --git a/mops/include/mops/opsa.h b/mops/include/mops/opsa.h index 7f608ed..e1ade1a 100644 --- a/mops/include/mops/opsa.h +++ b/mops/include/mops/opsa.h @@ -75,7 +75,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_f32( mops_tensor_3d_f32_t output, mops_tensor_2d_f32_t A, mops_tensor_2d_f32_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add for 64-bit floats @@ -83,7 +84,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_f64( mops_tensor_3d_f64_t output, mops_tensor_2d_f64_t A, mops_tensor_2d_f64_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_vjp for 32-bit floats @@ -93,7 +95,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_vjp_f32( mops_tensor_3d_f32_t grad_output, mops_tensor_2d_f32_t A, mops_tensor_2d_f32_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_vjp for 64-bit floats @@ -103,7 +106,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_vjp_f64( mops_tensor_3d_f64_t grad_output, mops_tensor_2d_f64_t A, mops_tensor_2d_f64_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_vjp_vjp for 32-bit floats @@ -116,7 +120,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_vjp_vjp_f32( mops_tensor_3d_f32_t grad_output, mops_tensor_2d_f32_t A, mops_tensor_2d_f32_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_vjp_vjp for 64-bit floats @@ -129,7 +134,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_vjp_vjp_f64( mops_tensor_3d_f64_t grad_output, mops_tensor_2d_f64_t A, mops_tensor_2d_f64_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); #ifdef __cplusplus diff --git a/mops/include/mops/opsa.hpp b/mops/include/mops/opsa.hpp index 9e7543f..eb6fbe9 100644 --- a/mops/include/mops/opsa.hpp +++ b/mops/include/mops/opsa.hpp @@ -149,15 +149,24 @@ void MOPS_EXPORT outer_product_scatter_add( Tensor output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream = nullptr ); extern template void outer_product_scatter_add( - Tensor output, Tensor A, Tensor B, Tensor indices_output + Tensor output, + Tensor A, + Tensor B, + 
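
For context, the pattern this patch introduces on the mops-torch side is: pin the current device with c10::cuda::CUDAGuard, query the stream PyTorch is currently recording work on, and hand it through the mops API as an opaque void* so that the core library does not have to link against torch. A minimal self-contained sketch of that hand-off follows; launch_on_stream is a hypothetical stand-in for the stream-aware mops entry points, not a function from this patch.

    #include <c10/cuda/CUDAGuard.h>
    #include <c10/cuda/CUDAStream.h>
    #include <cuda_runtime.h>
    #include <torch/torch.h>

    // Hypothetical stream-aware entry point, standing in for e.g.
    // mops::cuda::outer_product_scatter_add(..., void* cuda_stream).
    void launch_on_stream(void* cuda_stream) {
        // The callee recovers the typed stream from the opaque pointer.
        cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
        // ... enqueue kernels on `stream` ...
        (void)stream;
    }

    void forward_like(const torch::Tensor& A) {
        // Make CUDA calls below target the device that owns A.
        c10::cuda::CUDAGuard device_guard{A.device()};
        // Stream PyTorch is currently using on that device; work queued here
        // stays ordered with respect to the surrounding torch operations.
        cudaStream_t current = c10::cuda::getCurrentCUDAStream();
        launch_on_stream(reinterpret_cast<void*>(current));
    }

Passing the stream as void* keeps cuda_runtime.h and the c10 headers out of the public mops headers, which is why every C and C++ signature below only grows a `void* cuda_stream` parameter.
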
Tensor indices_output, + void* cuda_stream ); extern template void outer_product_scatter_add( - Tensor output, Tensor A, Tensor B, Tensor indices_output + Tensor output, + Tensor A, + Tensor B, + Tensor indices_output, + void* cuda_stream ); template @@ -167,7 +176,8 @@ void MOPS_EXPORT outer_product_scatter_add_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream = nullptr ); // these templates will be precompiled and provided in the mops library @@ -177,7 +187,8 @@ extern template void outer_product_scatter_add_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); extern template void outer_product_scatter_add_vjp( @@ -186,7 +197,8 @@ extern template void outer_product_scatter_add_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); /// TODO @@ -200,7 +212,8 @@ void MOPS_EXPORT outer_product_scatter_add_vjp_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream = nullptr ); // these templates will be precompiled and provided in the mops library @@ -213,7 +226,8 @@ extern template void outer_product_scatter_add_vjp_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); extern template void outer_product_scatter_add_vjp_vjp( @@ -225,7 +239,8 @@ extern template void outer_product_scatter_add_vjp_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); } // namespace cuda diff --git a/mops/src/opsa/capi.cpp b/mops/src/opsa/capi.cpp index 2be0a2b..2098a6e 100644 --- a/mops/src/opsa/capi.cpp +++ b/mops/src/opsa/capi.cpp @@ -172,7 +172,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_f32( mops_tensor_3d_f32_t output, mops_tensor_2d_f32_t A, mops_tensor_2d_f32_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add( @@ -182,7 +183,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_f32( checked_cast(output.shape[2])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -191,7 +193,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_f64( mops_tensor_3d_f64_t output, mops_tensor_2d_f64_t A, mops_tensor_2d_f64_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add( @@ -201,7 +204,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_f64( checked_cast(output.shape[2])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -212,7 +216,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_vjp_f32( mops_tensor_3d_f32_t grad_output, mops_tensor_2d_f32_t A, mops_tensor_2d_f32_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { 
MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_vjp( @@ -224,7 +229,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_vjp_f32( checked_cast(grad_output.shape[2])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -235,7 +241,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_vjp_f64( mops_tensor_3d_f64_t grad_output, mops_tensor_2d_f64_t A, mops_tensor_2d_f64_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_vjp( @@ -247,7 +254,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_vjp_f64( checked_cast(grad_output.shape[2])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -261,7 +269,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_vjp_vjp_f32( mops_tensor_3d_f32_t grad_output, mops_tensor_2d_f32_t A, mops_tensor_2d_f32_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_vjp_vjp( @@ -281,7 +290,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_vjp_vjp_f32( checked_cast(grad_output.shape[2])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -295,7 +305,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_vjp_vjp_f64( mops_tensor_3d_f64_t grad_output, mops_tensor_2d_f64_t A, mops_tensor_2d_f64_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_vjp_vjp( @@ -315,7 +326,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_vjp_vjp_f64( checked_cast(grad_output.shape[2])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } diff --git a/mops/src/opsa/opsa.cpp b/mops/src/opsa/opsa.cpp index 7a82088..7eab3ad 100644 --- a/mops/src/opsa/opsa.cpp +++ b/mops/src/opsa/opsa.cpp @@ -54,17 +54,25 @@ template void mops::outer_product_scatter_add_vjp_vjp( #ifndef MOPS_CUDA_ENABLED template void mops::cuda:: - outer_product_scatter_add(Tensor, Tensor, Tensor, Tensor) { + outer_product_scatter_add(Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } // explicit instantiations of CUDA templates template void mops::cuda::outer_product_scatter_add( - Tensor output, Tensor A, Tensor B, Tensor indices_output + Tensor output, + Tensor A, + Tensor B, + Tensor indices_output, + void* cuda_stream ); template void 
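
On the C side the new parameter is simply appended to every signature. A rough usage sketch against the f32 OPSA entry point is shown below, with hypothetical buffer names and sizes; the umbrella header name and the assumption that a zero return value means success are not stated in this patch and are only assumptions here. Passing nullptr instead of a created stream selects the legacy default stream, which is also what the C++ default argument `void* cuda_stream = nullptr` resolves to.

    #include <cuda_runtime.h>
    #include <mops.h>  // assumed umbrella header exposing the C API

    int run_opsa_f32(float* d_A, float* d_B, float* d_out, int32_t* d_idx,
                     int64_t n_edges, int64_t n_A, int64_t n_B, int64_t n_out) {
        // Fill the C tensor descriptors (device pointers plus shapes).
        mops_tensor_2d_f32_t A;      A.data = d_A;  A.shape[0] = n_edges; A.shape[1] = n_A;
        mops_tensor_2d_f32_t B;      B.data = d_B;  B.shape[0] = n_edges; B.shape[1] = n_B;
        mops_tensor_3d_f32_t output; output.data = d_out;
        output.shape[0] = n_out; output.shape[1] = n_A; output.shape[2] = n_B;
        mops_tensor_1d_i32_t indices_output; indices_output.data = d_idx;
        indices_output.shape[0] = n_edges;

        cudaStream_t stream = nullptr;
        cudaStreamCreate(&stream);

        // New trailing argument: the stream to enqueue the kernels on.
        // Passing nullptr would use the legacy default stream instead.
        int status = mops_cuda_outer_product_scatter_add_f32(
            output, A, B, indices_output, static_cast<void*>(stream));

        cudaStreamSynchronize(stream);
        cudaStreamDestroy(stream);
        return status;  // assumed: 0 on success, non-zero if an exception was caught
    }
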
mops::cuda::outer_product_scatter_add( - Tensor output, Tensor A, Tensor B, Tensor indices_output + Tensor output, + Tensor A, + Tensor B, + Tensor indices_output, + void* cuda_stream ); template @@ -74,7 +82,8 @@ void mops::cuda::outer_product_scatter_add_vjp( Tensor /*grad_output*/, Tensor /*A*/, Tensor /*B*/, - Tensor /*indices_output*/ + Tensor /*indices_output*/, + void* /*cudaStream_t*/ ) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } @@ -85,7 +94,8 @@ template void mops::cuda::outer_product_scatter_add_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add_vjp( @@ -94,7 +104,8 @@ template void mops::cuda::outer_product_scatter_add_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template @@ -107,7 +118,8 @@ void mops::cuda::outer_product_scatter_add_vjp_vjp( Tensor /*grad_output*/, Tensor /*A*/, Tensor /*B*/, - Tensor /*indices_output*/ + Tensor /*indices_output*/, + void* /*cudaStream_t*/ ) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } @@ -121,7 +133,8 @@ template void mops::cuda::outer_product_scatter_add_vjp_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add_vjp_vjp( @@ -133,7 +146,8 @@ template void mops::cuda::outer_product_scatter_add_vjp_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); #endif diff --git a/mops/src/opsa/opsa.cu b/mops/src/opsa/opsa.cu index a831335..b05b227 100644 --- a/mops/src/opsa/opsa.cu +++ b/mops/src/opsa/opsa.cu @@ -66,10 +66,13 @@ void mops::cuda::outer_product_scatter_add( Tensor output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ) { check_opsa(output, A, B, indices_output, "cuda_outer_product_scatter_add"); + cudaStream_t cstream = reinterpret_cast(cuda_stream); + int32_t* first_occurences = calculate_first_occurences_cuda( indices_output.data, indices_output.shape[0], output.shape[0] ); @@ -78,7 +81,7 @@ void mops::cuda::outer_product_scatter_add( dim3 blockDim(WARP_SIZE * NWARPS_PER_BLOCK, 1, 1); - outer_product_scatter_add_kernel<<>>( + outer_product_scatter_add_kernel<<>>( A, B, mops::Tensor{first_occurences, {output.shape[0] * 2}}, indices_output, output ); @@ -89,11 +92,19 @@ void mops::cuda::outer_product_scatter_add( // explicit instantiations of CUDA templates template void mops::cuda::outer_product_scatter_add( - Tensor output, Tensor A, Tensor B, Tensor indices_output + Tensor output, + Tensor A, + Tensor B, + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add( - Tensor output, Tensor A, Tensor B, Tensor indices_output + Tensor output, + Tensor A, + Tensor B, + Tensor indices_output, + void* cuda_stream ); template @@ -240,12 +251,15 @@ void mops::cuda::outer_product_scatter_add_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ) { check_opsa_vjp( grad_A, grad_B, grad_output, A, B, indices_output, "cuda_outer_product_scatter_add_vjp" ); + cudaStream_t cstream = reinterpret_cast(cuda_stream); + int32_t* first_occurences = calculate_first_occurences_cuda( indices_output.data, indices_output.shape[0], grad_output.shape[0] ); @@ -290,7 +304,8 @@ template void 
mops::cuda::outer_product_scatter_add_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add_vjp( @@ -299,7 +314,8 @@ template void mops::cuda::outer_product_scatter_add_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template @@ -312,7 +328,8 @@ void mops::cuda::outer_product_scatter_add_vjp_vjp( Tensor /*grad_output*/, Tensor /*A*/, Tensor /*B*/, - Tensor /*indices_output*/ + Tensor /*indices_output*/, + void* /*cudaStream_t*/ ) { throw std::runtime_error("Not implemented"); } @@ -327,7 +344,8 @@ template void mops::cuda::outer_product_scatter_add_vjp_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add_vjp_vjp( @@ -339,5 +357,6 @@ template void mops::cuda::outer_product_scatter_add_vjp_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); From d81ec9cadeb44653b0a7bd7acffa10701bda3b8d Mon Sep 17 00:00:00 2001 From: "Nick J. Browning" Date: Thu, 2 May 2024 11:07:07 +0200 Subject: [PATCH 03/28] x --- mops/src/opsa/opsa.cu | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/mops/src/opsa/opsa.cu b/mops/src/opsa/opsa.cu index b05b227..f500f44 100644 --- a/mops/src/opsa/opsa.cu +++ b/mops/src/opsa/opsa.cu @@ -71,6 +71,14 @@ void mops::cuda::outer_product_scatter_add( ) { check_opsa(output, A, B, indices_output, "cuda_outer_product_scatter_add"); + cudaPointerAttributes attributes; + CUDA_CHECK_ERROR(cudaPointerGetAttributes(&attributes, A.data)); + int current_device; + CUDA_CHECK_ERROR(cudaGetDevice(¤t_device)); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(attributes.device)); + } + cudaStream_t cstream = reinterpret_cast(cuda_stream); int32_t* first_occurences = calculate_first_occurences_cuda( @@ -86,8 +94,11 @@ void mops::cuda::outer_product_scatter_add( ); CUDA_CHECK_ERROR(cudaGetLastError()); - CUDA_CHECK_ERROR(cudaDeviceSynchronize()); + + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(current_device)); + } } // explicit instantiations of CUDA templates @@ -258,6 +269,14 @@ void mops::cuda::outer_product_scatter_add_vjp( grad_A, grad_B, grad_output, A, B, indices_output, "cuda_outer_product_scatter_add_vjp" ); + cudaPointerAttributes attributes; + CUDA_CHECK_ERROR(cudaPointerGetAttributes(&attributes, A.data)); + int current_device; + CUDA_CHECK_ERROR(cudaGetDevice(¤t_device)); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(attributes.device)); + } + cudaStream_t cstream = reinterpret_cast(cuda_stream); int32_t* first_occurences = calculate_first_occurences_cuda( @@ -282,7 +301,7 @@ void mops::cuda::outer_product_scatter_add_vjp( shared_array(NWARPS_PER_BLOCK * B.shape[1], sptr, &space); } - outer_product_scatter_add_vjp_kernel<<>>( + outer_product_scatter_add_vjp_kernel<<>>( A, B, mops::Tensor{first_occurences, {grad_output.shape[0]}}, @@ -293,8 +312,11 @@ void mops::cuda::outer_product_scatter_add_vjp( ); CUDA_CHECK_ERROR(cudaGetLastError()); - CUDA_CHECK_ERROR(cudaDeviceSynchronize()); + + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(current_device)); + } } // these templates will be precompiled and provided in the mops library From 
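
The implementation-side pattern that patches 02 and 03 establish here, and that patch 04 below copies into the other kernels, is: recover the typed stream from the opaque pointer, switch to the device that owns the input pointer (remembering the caller's device), launch with the stream as the fourth launch-configuration argument, and synchronize only that stream rather than the whole device. A condensed, self-contained sketch of the same flow with a dummy kernel follows; scale_kernel and the CHECK_CUDA macro are illustrative, not the mops ones.

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    #define CHECK_CUDA(expr)                                              \
        do {                                                              \
            cudaError_t err_ = (expr);                                    \
            if (err_ != cudaSuccess) {                                    \
                std::fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__,   \
                             cudaGetErrorString(err_));                   \
                std::abort();                                             \
            }                                                             \
        } while (0)

    __global__ void scale_kernel(float* data, float factor, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            data[i] *= factor;
        }
    }

    void scale_on_stream(float* d_data, int n, void* cuda_stream) {
        // Same cast used by the mops kernels to recover the stream.
        cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);

        // Switch to the device that owns d_data, remembering the current one.
        cudaPointerAttributes attributes;
        CHECK_CUDA(cudaPointerGetAttributes(&attributes, d_data));
        int current_device = 0;
        CHECK_CUDA(cudaGetDevice(&current_device));
        if (current_device != attributes.device) {
            CHECK_CUDA(cudaSetDevice(attributes.device));
        }

        dim3 block(256);
        dim3 grid((n + block.x - 1) / block.x);
        // Fourth launch-config argument: enqueue on the caller's stream.
        scale_kernel<<<grid, block, 0, stream>>>(d_data, 2.0f, n);
        CHECK_CUDA(cudaGetLastError());
        // Wait for this stream only, not for every stream on the device.
        CHECK_CUDA(cudaStreamSynchronize(stream));

        // Restore the device the caller had active.
        if (current_device != attributes.device) {
            CHECK_CUDA(cudaSetDevice(current_device));
        }
    }
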
c8d87768b5ebd07332322864d6fd045d1f1bbd3c Mon Sep 17 00:00:00 2001 From: "Nick J. Browning" Date: Thu, 2 May 2024 11:51:40 +0200 Subject: [PATCH 04/28] added support for other ops. --- mops-torch/src/hpe.cpp | 27 ++++++++++++-- mops-torch/src/opsaw.cpp | 5 +++ mops-torch/src/sap.cpp | 5 +++ mops-torch/src/sasaw.cpp | 5 +++ mops/CMakeLists.txt | 2 +- mops/include/mops/hpe.h | 18 ++++++---- mops/include/mops/hpe.hpp | 24 ++++++++----- mops/include/mops/opsaw.h | 18 ++++++---- mops/include/mops/opsaw.hpp | 27 +++++++++----- mops/include/mops/sap.h | 18 ++++++---- mops/include/mops/sap.hpp | 27 +++++++++----- mops/include/mops/sasaw.h | 18 ++++++---- mops/include/mops/sasaw.hpp | 27 +++++++++----- mops/src/hpe/capi.cpp | 36 ++++++++++++------- mops/src/hpe/hpe.cpp | 30 +++++++++++----- mops/src/hpe/hpe.cu | 54 ++++++++++++++++++++++------ mops/src/opsa/opsa.cu | 4 +-- mops/src/opsaw/capi.cpp | 36 ++++++++++++------- mops/src/sap/capi.cpp | 36 ++++++++++++------- mops/src/sap/sap.cpp | 25 ++++++++----- mops/src/sap/sap.cu | 72 ++++++++++++++++++++++++++++++------- mops/src/sasaw/capi.cpp | 36 ++++++++++++------- mops/src/sasaw/sasaw.cpp | 33 +++++++++-------- 23 files changed, 415 insertions(+), 168 deletions(-) diff --git a/mops-torch/src/hpe.cpp b/mops-torch/src/hpe.cpp index dc74031..08f5664 100644 --- a/mops-torch/src/hpe.cpp +++ b/mops-torch/src/hpe.cpp @@ -1,3 +1,8 @@ +#ifdef MOPS_CUDA_ENABLED +#include +#include +#endif + #include "mops/torch/hpe.hpp" #include "mops/torch/utils.hpp" @@ -38,15 +43,25 @@ torch::Tensor HomogeneousPolynomialEvaluation::forward( }); } else if (A.device().is_cuda()) { +#ifndef MOPS_CUDA_ENABLED + C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str()); +#else + c10::cuda::CUDAGuard deviceGuard{A.device()}; + cudaStream_t currstream = c10::cuda::getCurrentCUDAStream(); + void* stream = reinterpret_cast(currstream); + AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "homogeneous_polynomial_evaluation", [&]() { mops::cuda::homogeneous_polynomial_evaluation( details::torch_to_mops_1d(output), details::torch_to_mops_2d(A), details::torch_to_mops_1d(C), - details::torch_to_mops_2d(indices_A) + details::torch_to_mops_2d(indices_A), + stream ); }); +#endif + } else { C10_THROW_ERROR( ValueError, @@ -108,6 +123,12 @@ torch::Tensor HomogeneousPolynomialEvaluationBackward::forward( ); }); } else if (A.device().is_cuda()) { +#ifndef MOPS_CUDA_ENABLED + C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str()); +#else + c10::cuda::CUDAGuard deviceGuard{A.device()}; + cudaStream_t currstream = c10::cuda::getCurrentCUDAStream(); + void* stream = reinterpret_cast(currstream); AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "homogeneous_polynomial_evaluation_vjp", [&]() { auto mops_grad_A = mops::Tensor{nullptr, {0, 0}}; @@ -121,9 +142,11 @@ torch::Tensor HomogeneousPolynomialEvaluationBackward::forward( details::torch_to_mops_1d(grad_output), details::torch_to_mops_2d(A), details::torch_to_mops_1d(C), - details::torch_to_mops_2d(indices_A) + details::torch_to_mops_2d(indices_A), + stream ); }); +#endif } else { C10_THROW_ERROR( ValueError, diff --git a/mops-torch/src/opsaw.cpp b/mops-torch/src/opsaw.cpp index 608598f..a8f313d 100644 --- a/mops-torch/src/opsaw.cpp +++ b/mops-torch/src/opsaw.cpp @@ -1,3 +1,8 @@ +#ifdef MOPS_CUDA_ENABLED +#include +#include +#endif + #include "mops/torch/opsaw.hpp" #include "mops/torch/utils.hpp" diff --git a/mops-torch/src/sap.cpp b/mops-torch/src/sap.cpp index a68cca5..aca6d49 100644 
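
Each of the torch bindings touched by this patch follows the same shape: an MOPS_CUDA_ENABLED compile-time gate, a CUDAGuard plus current-stream lookup, and an AT_DISPATCH_FLOATING_TYPES block that instantiates the templated mops call for the tensor's scalar type. A stripped-down sketch of that dispatch structure is given below; my_cuda_op is a hypothetical templated backend rather than a mops function, and TORCH_CHECK is used for brevity where the bindings use C10_THROW_ERROR.

    #include <ATen/ATen.h>
    #include <ATen/Dispatch.h>

    // Hypothetical templated backend, standing in for the mops::cuda:: calls.
    template <typename scalar_t>
    void my_cuda_op(scalar_t* data, int64_t n, void* cuda_stream);

    void dispatch_like(const at::Tensor& A, void* cuda_stream) {
    #ifndef MOPS_CUDA_ENABLED
        TORCH_CHECK(false, "MOPS was not compiled with CUDA support");
    #else
        // Expands to a switch over float/double; inside the lambda the alias
        // `scalar_t` names the concrete element type of A.
        AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "my_cuda_op", [&]() {
            my_cuda_op<scalar_t>(A.data_ptr<scalar_t>(), A.numel(), cuda_stream);
        });
    #endif
    }
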
--- a/mops-torch/src/sap.cpp +++ b/mops-torch/src/sap.cpp @@ -1,3 +1,8 @@ +#ifdef MOPS_CUDA_ENABLED +#include +#include +#endif + #include "mops/torch/sap.hpp" #include "mops/torch/utils.hpp" diff --git a/mops-torch/src/sasaw.cpp b/mops-torch/src/sasaw.cpp index 2868e23..ee8ea76 100644 --- a/mops-torch/src/sasaw.cpp +++ b/mops-torch/src/sasaw.cpp @@ -1,3 +1,8 @@ +#ifdef MOPS_CUDA_ENABLED +#include +#include +#endif + #include "mops/torch/sasaw.hpp" #include "mops/torch/utils.hpp" diff --git a/mops/CMakeLists.txt b/mops/CMakeLists.txt index efc00b7..846eb47 100644 --- a/mops/CMakeLists.txt +++ b/mops/CMakeLists.txt @@ -110,7 +110,7 @@ add_library(mops if(CMAKE_CUDA_COMPILER AND MOPS_CUDA) target_compile_definitions(mops PUBLIC MOPS_CUDA_ENABLED) - set_target_properties(mops PROPERTIES CUDA_ARCHITECTURES all) + set_target_properties(mops PROPERTIES CUDA_ARCHITECTURES native) set_target_properties(mops PROPERTIES CUDA_NVCC_FLAGS "-lineinfo") target_sources(mops diff --git a/mops/include/mops/hpe.h b/mops/include/mops/hpe.h index ca18489..d1cf15b 100644 --- a/mops/include/mops/hpe.h +++ b/mops/include/mops/hpe.h @@ -69,7 +69,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_f32( mops_tensor_1d_f32_t output, mops_tensor_2d_f32_t A, mops_tensor_1d_f32_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ); /// CUDA version of mops::homogeneous_polynomial_evaluation for 64-bit floats @@ -77,7 +78,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_f64( mops_tensor_1d_f64_t output, mops_tensor_2d_f64_t A, mops_tensor_1d_f64_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ); /// CUDA version of mops::homogeneous_polynomial_evaluation_vjp for 32-bit floats @@ -86,7 +88,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_vjp_f32( mops_tensor_1d_f32_t grad_output, mops_tensor_2d_f32_t A, mops_tensor_1d_f32_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ); /// CUDA version of mops::homogeneous_polynomial_evaluation_vjp for 64-bit floats @@ -95,7 +98,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_vjp_f64( mops_tensor_1d_f64_t grad_output, mops_tensor_2d_f64_t A, mops_tensor_1d_f64_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ); /// CUDA version of mops::homogeneous_polynomial_evaluation_vjp_vjp for 32-bit floats @@ -106,7 +110,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_vjp_vjp_f32( mops_tensor_1d_f32_t grad_output, mops_tensor_2d_f32_t A, mops_tensor_1d_f32_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ); /// CUDA version of mops::homogeneous_polynomial_evaluation_vjp_vjp for 64-bit floats @@ -117,7 +122,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_vjp_vjp_f64( mops_tensor_1d_f64_t grad_output, mops_tensor_2d_f64_t A, mops_tensor_1d_f64_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ); #ifdef __cplusplus diff --git a/mops/include/mops/hpe.hpp b/mops/include/mops/hpe.hpp index 472417f..c3f6425 100644 --- a/mops/include/mops/hpe.hpp +++ b/mops/include/mops/hpe.hpp @@ -122,15 +122,15 @@ namespace cuda { /// CUDA version of mops::homogeneous_polynomial_evaluation template void MOPS_EXPORT homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A + Tensor output, Tensor A, Tensor C, Tensor indices_A, void* cuda_stream 
= nullptr ); extern template void homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A + Tensor output, Tensor A, Tensor C, Tensor indices_A, void* cuda_stream ); extern template void homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A + Tensor output, Tensor A, Tensor C, Tensor indices_A, void* cuda_stream ); template @@ -139,7 +139,8 @@ void MOPS_EXPORT homogeneous_polynomial_evaluation_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* cuda_stream = nullptr ); extern template void homogeneous_polynomial_evaluation_vjp( @@ -147,7 +148,8 @@ extern template void homogeneous_polynomial_evaluation_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* cuda_stream ); extern template void homogeneous_polynomial_evaluation_vjp( @@ -155,7 +157,8 @@ extern template void homogeneous_polynomial_evaluation_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* cuda_stream ); template @@ -166,7 +169,8 @@ void MOPS_EXPORT homogeneous_polynomial_evaluation_vjp_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* cuda_stream = nullptr ); extern template void homogeneous_polynomial_evaluation_vjp_vjp( @@ -176,7 +180,8 @@ extern template void homogeneous_polynomial_evaluation_vjp_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* cuda_stream ); extern template void homogeneous_polynomial_evaluation_vjp_vjp( @@ -186,7 +191,8 @@ extern template void homogeneous_polynomial_evaluation_vjp_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* cuda_stream ); } // namespace cuda diff --git a/mops/include/mops/opsaw.h b/mops/include/mops/opsaw.h index feafb11..ffa8566 100644 --- a/mops/include/mops/opsaw.h +++ b/mops/include/mops/opsaw.h @@ -95,7 +95,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_with_weights_f32( mops_tensor_2d_f32_t B, mops_tensor_2d_f32_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_with_weights for 64-bit floats @@ -105,7 +106,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_with_weights_f64( mops_tensor_2d_f64_t B, mops_tensor_2d_f64_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_with_weights_vjp for 32-bit floats @@ -118,7 +120,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_with_weights_vjp_f32( mops_tensor_2d_f32_t B, mops_tensor_2d_f32_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_with_weights_vjp for 64-bit floats @@ -131,7 +134,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_with_weights_vjp_f64( mops_tensor_2d_f64_t B, mops_tensor_2d_f64_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_with_weights_vjp_vjp for 32-bit floats @@ -148,7 +152,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_with_weights_vjp_vjp_f32( mops_tensor_2d_f32_t B, mops_tensor_2d_f32_t W, mops_tensor_1d_i32_t 
indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_with_weights_vjp_vjp for 64-bit floats @@ -165,7 +170,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_with_weights_vjp_vjp_f64( mops_tensor_2d_f64_t B, mops_tensor_2d_f64_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); #ifdef __cplusplus diff --git a/mops/include/mops/opsaw.hpp b/mops/include/mops/opsaw.hpp index 656bdde..3f71db6 100644 --- a/mops/include/mops/opsaw.hpp +++ b/mops/include/mops/opsaw.hpp @@ -194,7 +194,8 @@ void MOPS_EXPORT outer_product_scatter_add_with_weights( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void* cuda_stream = nullptr ); extern template void outer_product_scatter_add_with_weights( @@ -203,7 +204,8 @@ extern template void outer_product_scatter_add_with_weights( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); extern template void outer_product_scatter_add_with_weights( @@ -212,7 +214,8 @@ extern template void outer_product_scatter_add_with_weights( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_with_weights_vjp @@ -226,7 +229,8 @@ void MOPS_EXPORT outer_product_scatter_add_with_weights_vjp( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void* cuda_stream = nullptr ); extern template void outer_product_scatter_add_with_weights_vjp( @@ -238,7 +242,8 @@ extern template void outer_product_scatter_add_with_weights_vjp( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); extern template void outer_product_scatter_add_with_weights_vjp( @@ -250,7 +255,8 @@ extern template void outer_product_scatter_add_with_weights_vjp( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); /// TODO @@ -268,7 +274,8 @@ void MOPS_EXPORT outer_product_scatter_add_with_weights_vjp_vjp( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void* cuda_stream = nullptr ); // these templates will be precompiled and provided in the mops library @@ -285,7 +292,8 @@ extern template void outer_product_scatter_add_with_weights_vjp_vjp( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); extern template void outer_product_scatter_add_with_weights_vjp_vjp( @@ -301,7 +309,8 @@ extern template void outer_product_scatter_add_with_weights_vjp_vjp( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); } // namespace cuda diff --git a/mops/include/mops/sap.h b/mops/include/mops/sap.h index 5990e57..210b9bf 100644 --- a/mops/include/mops/sap.h +++ b/mops/include/mops/sap.h @@ -96,7 +96,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_of_products_f32( mops_tensor_1d_f32_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_of_products for 64-bit floats @@ -107,7 +108,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_of_products_f64( mops_tensor_1d_f64_t C, 
mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_of_products_vjp for 32-bit floats @@ -120,7 +122,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_of_products_vjp_f32( mops_tensor_1d_f32_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_of_products_vjp for 64-bit floats @@ -133,7 +136,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_of_products_vjp_f64( mops_tensor_1d_f64_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_of_products_vjp_vjp for 32-bit floats @@ -149,7 +153,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_of_products_vjp_vjp_f32( mops_tensor_1d_f32_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_of_products_vjp_vjp for 64-bit floats @@ -165,7 +170,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_of_products_vjp_vjp_f64( mops_tensor_1d_f64_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); #ifdef __cplusplus diff --git a/mops/include/mops/sap.hpp b/mops/include/mops/sap.hpp index a27090a..4568297 100644 --- a/mops/include/mops/sap.hpp +++ b/mops/include/mops/sap.hpp @@ -191,7 +191,8 @@ void MOPS_EXPORT sparse_accumulation_of_products( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream = nullptr ); extern template void sparse_accumulation_of_products( @@ -201,7 +202,8 @@ extern template void sparse_accumulation_of_products( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); extern template void sparse_accumulation_of_products( @@ -211,7 +213,8 @@ extern template void sparse_accumulation_of_products( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_of_products_vjp @@ -225,7 +228,8 @@ void MOPS_EXPORT sparse_accumulation_of_products_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream = nullptr ); extern template void sparse_accumulation_of_products_vjp( @@ -237,7 +241,8 @@ extern template void sparse_accumulation_of_products_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); extern template void sparse_accumulation_of_products_vjp( @@ -249,7 +254,8 @@ extern template void sparse_accumulation_of_products_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); /// TODO @@ -266,7 +272,8 @@ void MOPS_EXPORT sparse_accumulation_of_products_vjp_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream = nullptr ); // these templates will be precompiled and provided in the mops library @@ -282,7 +289,8 @@ 
extern template void sparse_accumulation_of_products_vjp_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); extern template void sparse_accumulation_of_products_vjp_vjp( @@ -297,7 +305,8 @@ extern template void sparse_accumulation_of_products_vjp_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); } // namespace cuda diff --git a/mops/include/mops/sasaw.h b/mops/include/mops/sasaw.h index 75170d7..1c4b88c 100644 --- a/mops/include/mops/sasaw.h +++ b/mops/include/mops/sasaw.h @@ -127,7 +127,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_f32( mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void * cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with for 64-bit floats @@ -141,7 +142,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_f64( mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void * cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights_vjp for @@ -159,7 +161,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_f32( mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void * cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights_vjp for @@ -177,7 +180,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_f64( mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void * cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights_vjp_vjp for @@ -199,7 +203,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_vjp_f mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void * cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights_vjp_vjp for @@ -221,7 +226,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_vjp_f mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void * cuda_stream ); #ifdef __cplusplus diff --git a/mops/include/mops/sasaw.hpp b/mops/include/mops/sasaw.hpp index 075db69..8654eb9 100644 --- a/mops/include/mops/sasaw.hpp +++ b/mops/include/mops/sasaw.hpp @@ -253,7 +253,8 @@ void MOPS_EXPORT sparse_accumulation_scatter_add_with_weights( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream = nullptr ); extern template void sparse_accumulation_scatter_add_with_weights( @@ -266,7 +267,8 @@ extern template void sparse_accumulation_scatter_add_with_weights( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor 
indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); extern template void sparse_accumulation_scatter_add_with_weights( @@ -279,7 +281,8 @@ extern template void sparse_accumulation_scatter_add_with_weights( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights_vjp @@ -297,7 +300,8 @@ void MOPS_EXPORT sparse_accumulation_scatter_add_with_weights_vjp( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream = nullptr ); extern template void sparse_accumulation_scatter_add_with_weights_vjp( @@ -313,7 +317,8 @@ extern template void sparse_accumulation_scatter_add_with_weights_vjp( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); extern template void sparse_accumulation_scatter_add_with_weights_vjp( @@ -329,7 +334,8 @@ extern template void sparse_accumulation_scatter_add_with_weights_vjp( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); /// TODO @@ -351,7 +357,8 @@ void MOPS_EXPORT sparse_accumulation_scatter_add_with_weights_vjp_vjp( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream = nullptr ); // these templates will be precompiled and provided in the mops library @@ -372,7 +379,8 @@ extern template void sparse_accumulation_scatter_add_with_weights_vjp_vjp( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); extern template void sparse_accumulation_scatter_add_with_weights_vjp_vjp( @@ -392,7 +400,8 @@ extern template void sparse_accumulation_scatter_add_with_weights_vjp_vjp( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); } // namespace cuda diff --git a/mops/src/hpe/capi.cpp b/mops/src/hpe/capi.cpp index 7dd5e92..01e3056 100644 --- a/mops/src/hpe/capi.cpp +++ b/mops/src/hpe/capi.cpp @@ -132,14 +132,16 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_f32( mops_tensor_1d_f32_t output, mops_tensor_2d_f32_t A, mops_tensor_1d_f32_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::homogeneous_polynomial_evaluation( {output.data, {checked_cast(output.shape[0])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {C.data, {checked_cast(C.shape[0])}}, - {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}} + {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -148,14 +150,16 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_f64( mops_tensor_1d_f64_t output, mops_tensor_2d_f64_t A, mops_tensor_1d_f64_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::homogeneous_polynomial_evaluation( {output.data, {checked_cast(output.shape[0])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {C.data, {checked_cast(C.shape[0])}}, - {indices_A.data, {checked_cast(indices_A.shape[0]), 
checked_cast(indices_A.shape[1])}} + {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -165,7 +169,8 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_vjp_f32( mops_tensor_1d_f32_t grad_output, mops_tensor_2d_f32_t A, mops_tensor_1d_f32_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::homogeneous_polynomial_evaluation_vjp( @@ -173,7 +178,8 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_vjp_f32( {grad_output.data, {checked_cast(grad_output.shape[0])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {C.data, {checked_cast(C.shape[0])}}, - {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}} + {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -183,7 +189,8 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_vjp_f64( mops_tensor_1d_f64_t grad_output, mops_tensor_2d_f64_t A, mops_tensor_1d_f64_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::homogeneous_polynomial_evaluation_vjp( @@ -191,7 +198,8 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_vjp_f64( {grad_output.data, {checked_cast(grad_output.shape[0])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {C.data, {checked_cast(C.shape[0])}}, - {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}} + {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -203,7 +211,8 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_vjp_vjp_f32( mops_tensor_1d_f32_t grad_output, mops_tensor_2d_f32_t A, mops_tensor_1d_f32_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( @@ -214,7 +223,8 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_vjp_vjp_f32( {grad_output.data, {checked_cast(grad_output.shape[0])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {C.data, {checked_cast(C.shape[0])}}, - {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}} + {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -226,7 +236,8 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_vjp_vjp_f64( mops_tensor_1d_f64_t grad_output, mops_tensor_2d_f64_t A, mops_tensor_1d_f64_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( @@ -237,7 +248,8 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_vjp_vjp_f64( {grad_output.data, {checked_cast(grad_output.shape[0])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {C.data, {checked_cast(C.shape[0])}}, - {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}} + {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } diff --git a/mops/src/hpe/hpe.cpp b/mops/src/hpe/hpe.cpp index 1fcf0ba..52602ae 100644 --- 
a/mops/src/hpe/hpe.cpp +++ b/mops/src/hpe/hpe.cpp @@ -48,29 +48,37 @@ template void mops::homogeneous_polynomial_evaluation_vjp_vjp( #ifndef MOPS_CUDA_ENABLED template void mops::cuda:: - homogeneous_polynomial_evaluation(Tensor, Tensor, Tensor, Tensor) { + homogeneous_polynomial_evaluation(Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } template void mops::cuda:: - homogeneous_polynomial_evaluation_vjp(Tensor, Tensor, Tensor, Tensor, Tensor) { + homogeneous_polynomial_evaluation_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } template void mops::cuda:: - homogeneous_polynomial_evaluation_vjp_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) { + homogeneous_polynomial_evaluation_vjp_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } // explicit instantiations of CUDA templates template void mops::cuda::homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A + Tensor output, + Tensor A, + Tensor C, + Tensor indices_A, + void* stream ); template void mops::cuda::homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A + Tensor output, + Tensor A, + Tensor C, + Tensor indices_A, + void* stream ); template void mops::cuda::homogeneous_polynomial_evaluation_vjp( @@ -78,7 +86,8 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* stream ); template void mops::cuda::homogeneous_polynomial_evaluation_vjp( @@ -86,7 +95,8 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* stream ); template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( @@ -96,7 +106,8 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* stream ); template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( @@ -106,7 +117,8 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* stream ); #endif diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index 964c4b4..7e36468 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -108,11 +108,21 @@ __global__ void homogeneous_polynomial_evaluation_kernel( template void mops::cuda::homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A + Tensor output, Tensor A, Tensor C, Tensor indices_A, void * cuda_stream ) { check_hpe(output, A, C, indices_A, "cuda_homogeneous_polynomial_evaluation"); + cudaPointerAttributes attributes; + CUDA_CHECK_ERROR(cudaPointerGetAttributes(&attributes, A.data)); + int current_device; + CUDA_CHECK_ERROR(cudaGetDevice(¤t_device)); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(attributes.device)); + } + + cudaStream_t cstream = reinterpret_cast(cuda_stream); + int32_t nbatch = output.shape[0]; int32_t nnu1 = A.shape[1]; size_t polynomial_order = indices_A.shape[1]; @@ -180,17 +190,20 @@ void mops::cuda::homogeneous_polynomial_evaluation( } CUDA_CHECK_ERROR(cudaGetLastError()); + CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); - 
CUDA_CHECK_ERROR(cudaDeviceSynchronize()); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(current_device)); + } } // explicit instanciations of CUDA templates template void mops::cuda::homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A + Tensor output, Tensor A, Tensor C, Tensor indices_A, void * cuda_stream ); template void mops::cuda::homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A + Tensor output, Tensor A, Tensor C, Tensor indices_A, void * cuda_stream ); template @@ -293,10 +306,21 @@ void mops::cuda::homogeneous_polynomial_evaluation_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void * cuda_stream ) { check_hpe_vjp(grad_A, grad_output, A, C, indices_A, "cuda_homogeneous_polynomial_evaluation_vjp"); + cudaPointerAttributes attributes; + CUDA_CHECK_ERROR(cudaPointerGetAttributes(&attributes, A.data)); + int current_device; + CUDA_CHECK_ERROR(cudaGetDevice(¤t_device)); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(attributes.device)); + } + + cudaStream_t cstream = reinterpret_cast(cuda_stream); + int32_t nbatch = grad_output.shape[0]; int32_t nnu1 = A.shape[1]; size_t polynomial_order = indices_A.shape[1]; @@ -363,8 +387,11 @@ void mops::cuda::homogeneous_polynomial_evaluation_vjp( } CUDA_CHECK_ERROR(cudaGetLastError()); + CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); - CUDA_CHECK_ERROR(cudaDeviceSynchronize()); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(current_device)); + } } // explicit instanciations of CUDA templates @@ -373,7 +400,8 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void * cuda_stream ); template void mops::cuda::homogeneous_polynomial_evaluation_vjp( @@ -381,7 +409,8 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void * cuda_stream ); template @@ -392,7 +421,8 @@ void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void * cuda_stream ) { throw std::runtime_error("Not implemented"); } @@ -405,7 +435,8 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void * cuda_stream ); template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( @@ -415,5 +446,6 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void * cuda_stream ); diff --git a/mops/src/opsa/opsa.cu b/mops/src/opsa/opsa.cu index f500f44..73a2306 100644 --- a/mops/src/opsa/opsa.cu +++ b/mops/src/opsa/opsa.cu @@ -94,7 +94,7 @@ void mops::cuda::outer_product_scatter_add( ); CUDA_CHECK_ERROR(cudaGetLastError()); - CUDA_CHECK_ERROR(cudaDeviceSynchronize()); + CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); if (current_device != attributes.device) { CUDA_CHECK_ERROR(cudaSetDevice(current_device)); @@ -312,7 +312,7 @@ void mops::cuda::outer_product_scatter_add_vjp( ); CUDA_CHECK_ERROR(cudaGetLastError()); - CUDA_CHECK_ERROR(cudaDeviceSynchronize()); + CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); if (current_device != attributes.device) { CUDA_CHECK_ERROR(cudaSetDevice(current_device)); diff --git 
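
Swapping cudaDeviceSynchronize() for cudaStreamSynchronize(cstream), as the hunks above do, matters once several streams are in flight: the device-wide call blocks until every stream on the GPU has drained, while the per-stream call only waits for the work this library actually enqueued. A toy illustration with two independent streams follows; busy_kernel and the iteration counts are purely illustrative.

    #include <cuda_runtime.h>

    __global__ void busy_kernel(float* x, int iters) {
        float v = *x;
        for (int i = 0; i < iters; ++i) {
            v = v * 1.0000001f + 1.0f;  // keep the kernel busy
        }
        *x = v;
    }

    int main() {
        float* a = nullptr;
        float* b = nullptr;
        cudaMalloc(&a, sizeof(float));
        cudaMalloc(&b, sizeof(float));

        cudaStream_t long_stream, short_stream;
        cudaStreamCreate(&long_stream);
        cudaStreamCreate(&short_stream);

        busy_kernel<<<1, 1, 0, long_stream>>>(a, 1 << 24);  // long-running
        busy_kernel<<<1, 1, 0, short_stream>>>(b, 1);       // short

        // Waits only for the short kernel; the long one keeps running.
        cudaStreamSynchronize(short_stream);

        // A cudaDeviceSynchronize() here would additionally wait for long_stream.
        cudaDeviceSynchronize();

        cudaStreamDestroy(long_stream);
        cudaStreamDestroy(short_stream);
        cudaFree(a);
        cudaFree(b);
        return 0;
    }
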
a/mops/src/opsaw/capi.cpp b/mops/src/opsaw/capi.cpp index 04e0bc4..dacb6ac 100644 --- a/mops/src/opsaw/capi.cpp +++ b/mops/src/opsaw/capi.cpp @@ -210,7 +210,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_f32( mops_tensor_2d_f32_t B, mops_tensor_2d_f32_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void * cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights( @@ -222,7 +223,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_f32( {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, {W.data, {checked_cast(W.shape[0]), checked_cast(W.shape[1])}}, {indices_W.data, {checked_cast(indices_W.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -233,7 +235,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_f64( mops_tensor_2d_f64_t B, mops_tensor_2d_f64_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void * cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights( @@ -245,7 +248,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_f64( {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, {W.data, {checked_cast(W.shape[0]), checked_cast(W.shape[1])}}, {indices_W.data, {checked_cast(indices_W.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -259,7 +263,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_f32( mops_tensor_2d_f32_t B, mops_tensor_2d_f32_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void * cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights_vjp( @@ -274,7 +279,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_f32( {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, {W.data, {checked_cast(W.shape[0]), checked_cast(W.shape[1])}}, {indices_W.data, {checked_cast(indices_W.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -288,7 +294,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_f64( mops_tensor_2d_f64_t B, mops_tensor_2d_f64_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void * cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights_vjp( @@ -303,7 +310,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_f64( {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, {W.data, {checked_cast(W.shape[0]), checked_cast(W.shape[1])}}, {indices_W.data, {checked_cast(indices_W.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -321,7 +329,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_vjp_f32( mops_tensor_2d_f32_t B, mops_tensor_2d_f32_t W, mops_tensor_1d_i32_t indices_W, - 
mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void * cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( @@ -346,7 +355,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_vjp_f32( {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, {W.data, {checked_cast(W.shape[0]), checked_cast(W.shape[1])}}, {indices_W.data, {checked_cast(indices_W.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -364,7 +374,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_vjp_f64( mops_tensor_2d_f64_t B, mops_tensor_2d_f64_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void * cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( @@ -389,7 +400,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_vjp_f64( {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, {W.data, {checked_cast(W.shape[0]), checked_cast(W.shape[1])}}, {indices_W.data, {checked_cast(indices_W.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } diff --git a/mops/src/sap/capi.cpp b/mops/src/sap/capi.cpp index e5c79d8..e79a256 100644 --- a/mops/src/sap/capi.cpp +++ b/mops/src/sap/capi.cpp @@ -191,7 +191,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_f32( mops_tensor_1d_f32_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_of_products( @@ -201,7 +202,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_f32( {C.data, {checked_cast(C.shape[0])}}, {indices_A.data, {checked_cast(indices_A.shape[0])}}, {indices_B.data, {checked_cast(indices_B.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -213,7 +215,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_f64( mops_tensor_1d_f64_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_of_products( @@ -223,7 +226,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_f64( {C.data, {checked_cast(C.shape[0])}}, {indices_A.data, {checked_cast(indices_A.shape[0])}}, {indices_B.data, {checked_cast(indices_B.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -237,7 +241,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_vjp_f32( mops_tensor_1d_f32_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_of_products_vjp( @@ -250,7 +255,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_vjp_f32( {C.data, 
{checked_cast(C.shape[0])}}, {indices_A.data, {checked_cast(indices_A.shape[0])}}, {indices_B.data, {checked_cast(indices_B.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -264,7 +270,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_vjp_f64( mops_tensor_1d_f64_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_of_products_vjp( @@ -277,7 +284,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_vjp_f64( {C.data, {checked_cast(C.shape[0])}}, {indices_A.data, {checked_cast(indices_A.shape[0])}}, {indices_B.data, {checked_cast(indices_B.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -294,7 +302,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_vjp_vjp_f32( mops_tensor_1d_f32_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_of_products_vjp_vjp( @@ -313,7 +322,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_vjp_vjp_f32( {C.data, {checked_cast(C.shape[0])}}, {indices_A.data, {checked_cast(indices_A.shape[0])}}, {indices_B.data, {checked_cast(indices_B.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -330,7 +340,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_vjp_vjp_f64( mops_tensor_1d_f64_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_of_products_vjp_vjp( @@ -349,7 +360,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_vjp_vjp_f64( {C.data, {checked_cast(C.shape[0])}}, {indices_A.data, {checked_cast(indices_A.shape[0])}}, {indices_B.data, {checked_cast(indices_B.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } diff --git a/mops/src/sap/sap.cpp b/mops/src/sap/sap.cpp index 545c429..b91b41a 100644 --- a/mops/src/sap/sap.cpp +++ b/mops/src/sap/sap.cpp @@ -78,13 +78,13 @@ template void mops::sparse_accumulation_of_products_vjp_vjp( #ifndef MOPS_CUDA_ENABLED template void mops::cuda:: - sparse_accumulation_of_products(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) { + sparse_accumulation_of_products(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } template void mops::cuda:: - sparse_accumulation_of_products_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) { + sparse_accumulation_of_products_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } @@ -101,7 +101,8 @@ void 
mops::cuda::sparse_accumulation_of_products_vjp_vjp( Tensor /*C*/, Tensor /*indices_A*/, Tensor /*indices_B*/, - Tensor /*indices_output*/ + Tensor /*indices_output*/, + void* /*cuda_stream*/ ) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } @@ -114,7 +115,8 @@ template void mops::cuda::sparse_accumulation_of_products( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_of_products( @@ -124,7 +126,8 @@ template void mops::cuda::sparse_accumulation_of_products( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_of_products_vjp( @@ -136,7 +139,8 @@ template void mops::cuda::sparse_accumulation_of_products_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_of_products_vjp( @@ -148,7 +152,8 @@ template void mops::cuda::sparse_accumulation_of_products_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_of_products_vjp_vjp( @@ -163,7 +168,8 @@ template void mops::cuda::sparse_accumulation_of_products_vjp_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_of_products_vjp_vjp( @@ -178,7 +184,8 @@ template void mops::cuda::sparse_accumulation_of_products_vjp_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); #endif diff --git a/mops/src/sap/sap.cu b/mops/src/sap/sap.cu index be72bc8..7d8a2d5 100644 --- a/mops/src/sap/sap.cu +++ b/mops/src/sap/sap.cu @@ -95,12 +95,23 @@ void mops::cuda::sparse_accumulation_of_products( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ) { check_sap( output, A, B, C, indices_A, indices_B, indices_output, "cuda_sparse_accumulation_of_products" ); + cudaPointerAttributes attributes; + CUDA_CHECK_ERROR(cudaPointerGetAttributes(&attributes, A.data)); + int current_device; + CUDA_CHECK_ERROR(cudaGetDevice(¤t_device)); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(attributes.device)); + } + + cudaStream_t cstream = reinterpret_cast(cuda_stream); + dim3 block_dim(find_integer_divisor(A.shape[0], WARP_SIZE)); dim3 thread_block(WARP_SIZE * NWARPS_PER_BLOCK, 1, 1); @@ -117,8 +128,11 @@ void mops::cuda::sparse_accumulation_of_products( <<>>(output, A, B, C, indices_A, indices_B, indices_output); CUDA_CHECK_ERROR(cudaGetLastError()); + CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); - CUDA_CHECK_ERROR(cudaDeviceSynchronize()); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(current_device)); + } } // explicit instanciations of CUDA templates @@ -129,7 +143,8 @@ template void mops::cuda::sparse_accumulation_of_products( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_of_products( @@ -139,7 +154,8 @@ template void mops::cuda::sparse_accumulation_of_products( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); 
template @@ -282,7 +298,8 @@ void mops::cuda::sparse_accumulation_of_products_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ) { check_sap_vjp( grad_A, @@ -297,6 +314,16 @@ void mops::cuda::sparse_accumulation_of_products_vjp( "cuda_sparse_accumulation_of_products_vjp" ); + cudaPointerAttributes attributes; + CUDA_CHECK_ERROR(cudaPointerGetAttributes(&attributes, A.data)); + int current_device; + CUDA_CHECK_ERROR(cudaGetDevice(¤t_device)); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(attributes.device)); + } + + cudaStream_t cstream = reinterpret_cast(cuda_stream); + dim3 block_dim(find_integer_divisor(grad_A.shape[0], WARP_SIZE)); dim3 thread_block(WARP_SIZE * NWARPS_PER_BLOCK, 1, 1); @@ -322,8 +349,11 @@ void mops::cuda::sparse_accumulation_of_products_vjp( ); CUDA_CHECK_ERROR(cudaGetLastError()); + CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); - CUDA_CHECK_ERROR(cudaDeviceSynchronize()); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(current_device)); + } } template void mops::cuda::sparse_accumulation_of_products_vjp( @@ -335,7 +365,8 @@ template void mops::cuda::sparse_accumulation_of_products_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_of_products_vjp( @@ -347,7 +378,8 @@ template void mops::cuda::sparse_accumulation_of_products_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template @@ -586,7 +618,8 @@ void mops::cuda::sparse_accumulation_of_products_vjp_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ) { check_sap_vjp_vjp( grad_grad_output, @@ -604,6 +637,16 @@ void mops::cuda::sparse_accumulation_of_products_vjp_vjp( "cuda_sparse_accumulation_of_products_vjp_vjp" ); + cudaPointerAttributes attributes; + CUDA_CHECK_ERROR(cudaPointerGetAttributes(&attributes, A.data)); + int current_device; + CUDA_CHECK_ERROR(cudaGetDevice(¤t_device)); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(attributes.device)); + } + + cudaStream_t cstream = reinterpret_cast(cuda_stream); + dim3 block_dim(find_integer_divisor(grad_A_2.shape[0], WARP_SIZE)); dim3 thread_block(WARP_SIZE * NWARPS_PER_BLOCK, 1, 1); @@ -674,8 +717,11 @@ void mops::cuda::sparse_accumulation_of_products_vjp_vjp( ); CUDA_CHECK_ERROR(cudaGetLastError()); + CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); - CUDA_CHECK_ERROR(cudaDeviceSynchronize()); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(current_device)); + } } // explicit instanciations of CUDA templates @@ -691,7 +737,8 @@ template void mops::cuda::sparse_accumulation_of_products_vjp_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_of_products_vjp_vjp( @@ -706,5 +753,6 @@ template void mops::cuda::sparse_accumulation_of_products_vjp_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); diff --git a/mops/src/sasaw/capi.cpp b/mops/src/sasaw/capi.cpp index 5dfb933..57f5620 100644 --- a/mops/src/sasaw/capi.cpp +++ b/mops/src/sasaw/capi.cpp @@ -276,7 +276,8 @@ extern "C" int 
mops_cuda_sparse_accumulation_scatter_add_with_weights_f32( mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_scatter_add_with_weights( @@ -292,7 +293,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_f32( {indices_W_1.data, {checked_cast(indices_W_1.shape[0])}}, {indices_W_2.data, {checked_cast(indices_W_2.shape[0])}}, {indices_output_1.data, {checked_cast(indices_output_1.shape[0])}}, - {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}} + {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -307,7 +309,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_f64( mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_scatter_add_with_weights( @@ -323,7 +326,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_f64( {indices_W_1.data, {checked_cast(indices_W_1.shape[0])}}, {indices_W_2.data, {checked_cast(indices_W_2.shape[0])}}, {indices_output_1.data, {checked_cast(indices_output_1.shape[0])}}, - {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}} + {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -341,7 +345,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_f32( mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp( @@ -362,7 +367,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_f32( {indices_W_1.data, {checked_cast(indices_W_1.shape[0])}}, {indices_W_2.data, {checked_cast(indices_W_2.shape[0])}}, {indices_output_1.data, {checked_cast(indices_output_1.shape[0])}}, - {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}} + {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -380,7 +386,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_f64( mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp( @@ -401,7 +408,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_f64( {indices_W_1.data, {checked_cast(indices_W_1.shape[0])}}, {indices_W_2.data, {checked_cast(indices_W_2.shape[0])}}, {indices_output_1.data, {checked_cast(indices_output_1.shape[0])}}, - {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}} + {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -423,7 +431,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_vjp_f3 mops_tensor_1d_i32_t indices_W_1, 
mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp( @@ -457,7 +466,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_vjp_f3 {indices_W_1.data, {checked_cast(indices_W_1.shape[0])}}, {indices_W_2.data, {checked_cast(indices_W_2.shape[0])}}, {indices_output_1.data, {checked_cast(indices_output_1.shape[0])}}, - {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}} + {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -479,7 +489,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_vjp_f6 mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp( @@ -513,7 +524,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_vjp_f6 {indices_W_1.data, {checked_cast(indices_W_1.shape[0])}}, {indices_W_2.data, {checked_cast(indices_W_2.shape[0])}}, {indices_output_1.data, {checked_cast(indices_output_1.shape[0])}}, - {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}} + {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } diff --git a/mops/src/sasaw/sasaw.cpp b/mops/src/sasaw/sasaw.cpp index 9bc4a68..50525aa 100644 --- a/mops/src/sasaw/sasaw.cpp +++ b/mops/src/sasaw/sasaw.cpp @@ -99,18 +99,16 @@ template void mops::sparse_accumulation_scatter_add_with_weights_vjp_vjp Tensor indices_output_2 ); -#ifdef MOPS_CUDA_ENABLED -#include "cuda.tpp" -#else +#ifndef MOPS_CUDA_ENABLED template void mops::cuda:: - sparse_accumulation_scatter_add_with_weights(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) { + sparse_accumulation_scatter_add_with_weights(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } template void mops::cuda:: - sparse_accumulation_scatter_add_with_weights_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) { + sparse_accumulation_scatter_add_with_weights_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } @@ -132,13 +130,12 @@ void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp( Tensor /*indices_W_1*/, Tensor /*indices_W_2*/, Tensor /*indices_output_1*/, - Tensor /*indices_output_2*/ + Tensor /*indices_output_2*/, + void* /*cuda_stream*/ ) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } -#endif - // explicit instantiations of CUDA templates template void mops::cuda::sparse_accumulation_scatter_add_with_weights( Tensor output, @@ -150,7 +147,8 @@ template void mops::cuda::sparse_accumulation_scatter_add_with_weights( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_scatter_add_with_weights( @@ -163,7 +161,8 @@ 
template void mops::cuda::sparse_accumulation_scatter_add_with_weights( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp( @@ -179,7 +178,8 @@ template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp( @@ -195,7 +195,8 @@ template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp( @@ -215,7 +216,8 @@ template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp( @@ -235,5 +237,8 @@ template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); + +#endif \ No newline at end of file From 0b06e796cdc7ee5632c89f742db58fa0baf52e43 Mon Sep 17 00:00:00 2001 From: "Nick J. Browning" Date: Thu, 2 May 2024 11:52:42 +0200 Subject: [PATCH 05/28] formatting. --- mops/include/mops/hpe.hpp | 18 +++++++++++++++--- mops/include/mops/sasaw.h | 12 ++++++------ mops/src/hpe/hpe.cu | 30 +++++++++++++++++++++--------- mops/src/opsaw/capi.cpp | 12 ++++++------ 4 files changed, 48 insertions(+), 24 deletions(-) diff --git a/mops/include/mops/hpe.hpp b/mops/include/mops/hpe.hpp index c3f6425..1a66edd 100644 --- a/mops/include/mops/hpe.hpp +++ b/mops/include/mops/hpe.hpp @@ -122,15 +122,27 @@ namespace cuda { /// CUDA version of mops::homogeneous_polynomial_evaluation template void MOPS_EXPORT homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A, void* cuda_stream = nullptr + Tensor output, + Tensor A, + Tensor C, + Tensor indices_A, + void* cuda_stream = nullptr ); extern template void homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A, void* cuda_stream + Tensor output, + Tensor A, + Tensor C, + Tensor indices_A, + void* cuda_stream ); extern template void homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A, void* cuda_stream + Tensor output, + Tensor A, + Tensor C, + Tensor indices_A, + void* cuda_stream ); template diff --git a/mops/include/mops/sasaw.h b/mops/include/mops/sasaw.h index 1c4b88c..d87560d 100644 --- a/mops/include/mops/sasaw.h +++ b/mops/include/mops/sasaw.h @@ -128,7 +128,7 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_f32( mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, mops_tensor_1d_i32_t indices_output_2, - void * cuda_stream + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with for 64-bit floats @@ -143,7 +143,7 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_f64( mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, mops_tensor_1d_i32_t indices_output_2, - void * 
cuda_stream + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights_vjp for @@ -162,7 +162,7 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_f32( mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, mops_tensor_1d_i32_t indices_output_2, - void * cuda_stream + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights_vjp for @@ -181,7 +181,7 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_f64( mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, mops_tensor_1d_i32_t indices_output_2, - void * cuda_stream + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights_vjp_vjp for @@ -204,7 +204,7 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_vjp_f mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, mops_tensor_1d_i32_t indices_output_2, - void * cuda_stream + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights_vjp_vjp for @@ -227,7 +227,7 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_vjp_f mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, mops_tensor_1d_i32_t indices_output_2, - void * cuda_stream + void* cuda_stream ); #ifdef __cplusplus diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index 7e36468..59a6af1 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -108,7 +108,11 @@ __global__ void homogeneous_polynomial_evaluation_kernel( template void mops::cuda::homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A, void * cuda_stream + Tensor output, + Tensor A, + Tensor C, + Tensor indices_A, + void* cuda_stream ) { check_hpe(output, A, C, indices_A, "cuda_homogeneous_polynomial_evaluation"); @@ -199,11 +203,19 @@ void mops::cuda::homogeneous_polynomial_evaluation( // explicit instanciations of CUDA templates template void mops::cuda::homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A, void * cuda_stream + Tensor output, + Tensor A, + Tensor C, + Tensor indices_A, + void* cuda_stream ); template void mops::cuda::homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A, void * cuda_stream + Tensor output, + Tensor A, + Tensor C, + Tensor indices_A, + void* cuda_stream ); template @@ -307,7 +319,7 @@ void mops::cuda::homogeneous_polynomial_evaluation_vjp( Tensor A, Tensor C, Tensor indices_A, - void * cuda_stream + void* cuda_stream ) { check_hpe_vjp(grad_A, grad_output, A, C, indices_A, "cuda_homogeneous_polynomial_evaluation_vjp"); @@ -401,7 +413,7 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp( Tensor A, Tensor C, Tensor indices_A, - void * cuda_stream + void* cuda_stream ); template void mops::cuda::homogeneous_polynomial_evaluation_vjp( @@ -410,7 +422,7 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp( Tensor A, Tensor C, Tensor indices_A, - void * cuda_stream + void* cuda_stream ); template @@ -422,7 +434,7 @@ void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( Tensor A, Tensor C, Tensor indices_A, - void * cuda_stream + void* cuda_stream ) { throw std::runtime_error("Not implemented"); } @@ -436,7 +448,7 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( Tensor A, Tensor C, Tensor indices_A, - void * cuda_stream + void* cuda_stream ); template void 
mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( @@ -447,5 +459,5 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( Tensor A, Tensor C, Tensor indices_A, - void * cuda_stream + void* cuda_stream ); diff --git a/mops/src/opsaw/capi.cpp b/mops/src/opsaw/capi.cpp index dacb6ac..3585d70 100644 --- a/mops/src/opsaw/capi.cpp +++ b/mops/src/opsaw/capi.cpp @@ -211,7 +211,7 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_f32( mops_tensor_2d_f32_t W, mops_tensor_1d_i32_t indices_W, mops_tensor_1d_i32_t indices_output, - void * cuda_stream + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights( @@ -236,7 +236,7 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_f64( mops_tensor_2d_f64_t W, mops_tensor_1d_i32_t indices_W, mops_tensor_1d_i32_t indices_output, - void * cuda_stream + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights( @@ -264,7 +264,7 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_f32( mops_tensor_2d_f32_t W, mops_tensor_1d_i32_t indices_W, mops_tensor_1d_i32_t indices_output, - void * cuda_stream + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights_vjp( @@ -295,7 +295,7 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_f64( mops_tensor_2d_f64_t W, mops_tensor_1d_i32_t indices_W, mops_tensor_1d_i32_t indices_output, - void * cuda_stream + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights_vjp( @@ -330,7 +330,7 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_vjp_f32( mops_tensor_2d_f32_t W, mops_tensor_1d_i32_t indices_W, mops_tensor_1d_i32_t indices_output, - void * cuda_stream + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( @@ -375,7 +375,7 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_vjp_f64( mops_tensor_2d_f64_t W, mops_tensor_1d_i32_t indices_W, mops_tensor_1d_i32_t indices_output, - void * cuda_stream + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( From b2c170a9c5ee1a6e61fdbfb2ead3f6ba3ec897df Mon Sep 17 00:00:00 2001 From: "Nick J. Browning" Date: Fri, 3 May 2024 13:57:18 +0200 Subject: [PATCH 06/28] fixed build issues. 
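
Illustration (not part of this patch series): the pattern that the preceding patches thread through every CUDA entry point is to accept an opaque void* cuda_stream in the public C/C++ API, and only inside the .cu translation unit switch to the device that owns the input data, cast the handle back to a cudaStream_t, launch on that stream, and synchronize the stream instead of the whole device. The kernel, the helper function and the error-checking macro below are illustrative stand-ins, not mops symbols.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort on any CUDA runtime error (stand-in for the CUDA_CHECK_ERROR used in mops).
#define CHECK_CUDA(expr)                                                        \
    do {                                                                        \
        cudaError_t err_ = (expr);                                              \
        if (err_ != cudaSuccess) {                                              \
            std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); \
            std::abort();                                                       \
        }                                                                       \
    } while (0)

// Trivial kernel standing in for the real mops kernels.
__global__ void scale_kernel(float* data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        data[i] *= 2.0f;
    }
}

// cuda_stream is the opaque handle received through the C API
// (nullptr means the default stream).
void launch_on_stream(float* device_data, int n, void* cuda_stream) {
    // Run on the device that owns the data, as the patches do via
    // cudaPointerGetAttributes before each launch.
    cudaPointerAttributes attributes;
    CHECK_CUDA(cudaPointerGetAttributes(&attributes, device_data));
    int current_device = 0;
    CHECK_CUDA(cudaGetDevice(&current_device));
    if (current_device != attributes.device) {
        CHECK_CUDA(cudaSetDevice(attributes.device));
    }

    // Recover the typed stream from the opaque pointer.
    cudaStream_t cstream = reinterpret_cast<cudaStream_t>(cuda_stream);

    scale_kernel<<<(n + 255) / 256, 256, 0, cstream>>>(device_data, n);

    CHECK_CUDA(cudaGetLastError());
    // Synchronize only this stream, not the whole device.
    CHECK_CUDA(cudaStreamSynchronize(cstream));

    // Restore the caller's current device.
    if (current_device != attributes.device) {
        CHECK_CUDA(cudaSetDevice(current_device));
    }
}

Synchronizing the stream rather than calling cudaDeviceSynchronize leaves work that the caller has queued on other streams unaffected, which is presumably the motivation for replacing the device-wide synchronization in these kernels.
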
--- mops/CMakeLists.txt | 2 + mops/src/opsaw/cuda.tpp | 49 ------------- mops/src/opsaw/opsaw.cpp | 8 +-- mops/src/opsaw/opsaw.cu | 107 +++++++++++++++++++++++++++- mops/src/sasaw/cuda.tpp | 61 ---------------- mops/src/sasaw/sasaw.cu | 149 ++++++++++++++++++++++++++++++++++++++- 6 files changed, 259 insertions(+), 117 deletions(-) delete mode 100644 mops/src/opsaw/cuda.tpp delete mode 100644 mops/src/sasaw/cuda.tpp diff --git a/mops/CMakeLists.txt b/mops/CMakeLists.txt index 846eb47..f75a638 100644 --- a/mops/CMakeLists.txt +++ b/mops/CMakeLists.txt @@ -124,6 +124,8 @@ if(CMAKE_CUDA_COMPILER AND MOPS_CUDA) "src/opsa/opsa.cu" "src/hpe/hpe.cu" "src/sap/sap.cu" + "src/sasaw/sasaw.cu" + "src/opsaw/opsaw.cu" ) endif() diff --git a/mops/src/opsaw/cuda.tpp b/mops/src/opsaw/cuda.tpp deleted file mode 100644 index 204e7db..0000000 --- a/mops/src/opsaw/cuda.tpp +++ /dev/null @@ -1,49 +0,0 @@ -#include - -#include "mops/opsaw.hpp" - -template -void mops::cuda::outer_product_scatter_add_with_weights( - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor -) { - throw std::runtime_error("CUDA implementation does not exist yet"); -} - -template -void mops::cuda::outer_product_scatter_add_with_weights_vjp( - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor -) { - throw std::runtime_error("CUDA implementation does not exist yet"); -} - -template -void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( - Tensor /*grad_grad_output*/, - Tensor /*grad_A_2*/, - Tensor /*grad_B_2*/, - Tensor /*grad_W_2*/, - Tensor /*grad_grad_A*/, - Tensor /*grad_grad_B*/, - Tensor /*grad_grad_W*/, - Tensor /*grad_output*/, - Tensor /*A*/, - Tensor /*B*/, - Tensor /*W*/, - Tensor /*indices_W*/, - Tensor /*indices_output*/ -) { - throw std::runtime_error("CUDA implementation does not exist yet"); -} diff --git a/mops/src/opsaw/opsaw.cpp b/mops/src/opsaw/opsaw.cpp index b3b30b8..b7eb889 100644 --- a/mops/src/opsaw/opsaw.cpp +++ b/mops/src/opsaw/opsaw.cpp @@ -75,9 +75,7 @@ template void mops::outer_product_scatter_add_with_weights_vjp_vjp( Tensor indices_output ); -#ifdef MOPS_CUDA_ENABLED -#include "cuda.tpp" -#else +#ifndef MOPS_CUDA_ENABLED template void mops::cuda:: outer_product_scatter_add_with_weights(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) { @@ -109,8 +107,6 @@ void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( throw std::runtime_error("MOPS was not compiled with CUDA support"); } -#endif - // explicit instantiations of CUDA templates template void mops::cuda::outer_product_scatter_add_with_weights( Tensor output, @@ -185,3 +181,5 @@ template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp Tensor indices_W, Tensor indices_output ); + +#endif \ No newline at end of file diff --git a/mops/src/opsaw/opsaw.cu b/mops/src/opsaw/opsaw.cu index 8421bac..28c48fe 100644 --- a/mops/src/opsaw/opsaw.cu +++ b/mops/src/opsaw/opsaw.cu @@ -1 +1,106 @@ -// todo: cuda device code +#include + +#include "mops/opsaw.hpp" + +using namespace mops; +using namespace mops::cuda; +using namespace std; + +template +void mops::cuda:: + outer_product_scatter_add_with_weights(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { + throw std::runtime_error("CUDA implementation does not exist yet"); +} + +template +void mops::cuda:: + outer_product_scatter_add_with_weights_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { + throw std::runtime_error("CUDA implementation does not exist yet"); +} + +template +void mops::cuda:: + 
outer_product_scatter_add_with_weights_vjp_vjp(Tensor /*grad_grad_output*/, Tensor /*grad_A_2*/, Tensor /*grad_B_2*/, Tensor /*grad_W_2*/, Tensor /*grad_grad_A*/, Tensor /*grad_grad_B*/, Tensor /*grad_grad_W*/, Tensor /*grad_output*/, Tensor /*A*/, Tensor /*B*/, Tensor /*W*/, Tensor /*indices_W*/, Tensor /*indices_output*/, void*) { + throw std::runtime_error("CUDA implementation does not exist yet"); +} + +// explicit instantiations of CUDA templates +template void mops::cuda::outer_product_scatter_add_with_weights( + Tensor output, + Tensor A, + Tensor B, + Tensor W, + Tensor indices_W, + Tensor indices_output, + void* cuda_stream +); + +template void mops::cuda::outer_product_scatter_add_with_weights( + Tensor output, + Tensor A, + Tensor B, + Tensor W, + Tensor indices_W, + Tensor indices_output, + void* cuda_stream +); + +template void mops::cuda::outer_product_scatter_add_with_weights_vjp( + Tensor grad_A, + Tensor grad_B, + Tensor grad_W, + Tensor grad_output, + Tensor A, + Tensor B, + Tensor W, + Tensor indices_W, + Tensor indices_output, + void* cuda_stream +); + +template void mops::cuda::outer_product_scatter_add_with_weights_vjp( + Tensor grad_A, + Tensor grad_B, + Tensor grad_W, + Tensor grad_output, + Tensor A, + Tensor B, + Tensor W, + Tensor indices_W, + Tensor indices_output, + void* cuda_stream +); + +template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( + Tensor grad_grad_output, + Tensor grad_A_2, + Tensor grad_B_2, + Tensor grad_W_2, + Tensor grad_grad_A, + Tensor grad_grad_B, + Tensor grad_grad_W, + Tensor grad_output, + Tensor A, + Tensor B, + Tensor W, + Tensor indices_W, + Tensor indices_output, + void* cuda_stream +); + +template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( + Tensor grad_grad_output, + Tensor grad_A_2, + Tensor grad_B_2, + Tensor grad_W_2, + Tensor grad_grad_A, + Tensor grad_grad_B, + Tensor grad_grad_W, + Tensor grad_output, + Tensor A, + Tensor B, + Tensor W, + Tensor indices_W, + Tensor indices_output, + void* cuda_stream +); \ No newline at end of file diff --git a/mops/src/sasaw/cuda.tpp b/mops/src/sasaw/cuda.tpp deleted file mode 100644 index c2f8558..0000000 --- a/mops/src/sasaw/cuda.tpp +++ /dev/null @@ -1,61 +0,0 @@ -#include - -#include "mops/sasaw.hpp" - -template -void mops::cuda::sparse_accumulation_scatter_add_with_weights( - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor -) { - throw std::runtime_error("CUDA implementation does not exist yet"); -} - -template -void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp( - Tensor, - Tensor , - Tensor, - Tensor, - Tensor , - Tensor , - Tensor , - Tensor , - Tensor, - Tensor, - Tensor, - Tensor, - Tensor -) { - throw std::runtime_error("CUDA implementation does not exist yet"); -} - -template -void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp( - Tensor /*grad_grad_output*/, - Tensor /*grad_A_2*/, - Tensor /*grad_B_2*/, - Tensor /*grad_W_2*/, - Tensor /*grad_grad_A*/, - Tensor /*grad_grad_B*/, - Tensor /*grad_grad_W*/, - Tensor /*grad_output*/, - Tensor /*A*/, - Tensor /*B*/, - Tensor /*C*/, - Tensor /*W*/, - Tensor /*indices_A*/, - Tensor /*indices_W_1*/, - Tensor /*indices_W_2*/, - Tensor /*indices_output_1*/, - Tensor /*indices_output_2*/ -) { - throw std::runtime_error("CUDA implementation does not exist yet"); -} diff --git a/mops/src/sasaw/sasaw.cu b/mops/src/sasaw/sasaw.cu index 8421bac..06b94b1 100644 --- a/mops/src/sasaw/sasaw.cu +++ b/mops/src/sasaw/sasaw.cu @@ 
-1 +1,148 @@ -// todo: cuda device code +#include + +#include "mops/sasaw.hpp" + +using namespace mops; +using namespace mops::cuda; +using namespace std; + +template +void mops::cuda:: + sparse_accumulation_scatter_add_with_weights(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { + throw std::runtime_error("MOPS was not compiled with CUDA support"); +} + +template +void mops::cuda:: + sparse_accumulation_scatter_add_with_weights_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { + throw std::runtime_error("MOPS was not compiled with CUDA support"); +} + +template +void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp( + Tensor /*grad_grad_output*/, + Tensor /*grad_A_2*/, + Tensor /*grad_B_2*/, + Tensor /*grad_W_2*/, + Tensor /*grad_grad_A*/, + Tensor /*grad_grad_B*/, + Tensor /*grad_grad_W*/, + Tensor /*grad_output*/, + Tensor /*A*/, + Tensor /*B*/, + Tensor /*C*/, + Tensor /*W*/, + Tensor /*indices_A*/, + Tensor /*indices_W_1*/, + Tensor /*indices_W_2*/, + Tensor /*indices_output_1*/, + Tensor /*indices_output_2*/, + void* /*cuda_stream*/ +) { + throw std::runtime_error("MOPS was not compiled with CUDA support"); +} + +// explicit instantiations of CUDA templates +template void mops::cuda::sparse_accumulation_scatter_add_with_weights( + Tensor output, + Tensor A, + Tensor B, + Tensor C, + Tensor W, + Tensor indices_A, + Tensor indices_W_1, + Tensor indices_W_2, + Tensor indices_output_1, + Tensor indices_output_2, + void* cuda_stream +); + +template void mops::cuda::sparse_accumulation_scatter_add_with_weights( + Tensor output, + Tensor A, + Tensor B, + Tensor C, + Tensor W, + Tensor indices_A, + Tensor indices_W_1, + Tensor indices_W_2, + Tensor indices_output_1, + Tensor indices_output_2, + void* cuda_stream +); + +template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp( + Tensor grad_A, + Tensor grad_B, + Tensor grad_W, + Tensor grad_output, + Tensor A, + Tensor B, + Tensor C, + Tensor W, + Tensor indices_A, + Tensor indices_W_1, + Tensor indices_W_2, + Tensor indices_output_1, + Tensor indices_output_2, + void* cuda_stream +); + +template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp( + Tensor grad_A, + Tensor grad_B, + Tensor grad_W, + Tensor grad_output, + Tensor A, + Tensor B, + Tensor C, + Tensor W, + Tensor indices_A, + Tensor indices_W_1, + Tensor indices_W_2, + Tensor indices_output_1, + Tensor indices_output_2, + void* cuda_stream +); + +template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp( + Tensor grad_grad_output, + Tensor grad_A_2, + Tensor grad_B_2, + Tensor grad_W_2, + Tensor grad_grad_A, + Tensor grad_grad_B, + Tensor grad_grad_W, + Tensor grad_output, + Tensor A, + Tensor B, + Tensor C, + Tensor W, + Tensor indices_A, + Tensor indices_W_1, + Tensor indices_W_2, + Tensor indices_output_1, + Tensor indices_output_2, + void* cuda_stream +); + +template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp( + Tensor grad_grad_output, + Tensor grad_A_2, + Tensor grad_B_2, + Tensor grad_W_2, + Tensor grad_grad_A, + Tensor grad_grad_B, + Tensor grad_grad_W, + Tensor grad_output, + Tensor A, + Tensor B, + Tensor C, + Tensor W, + Tensor indices_A, + Tensor indices_W_1, + Tensor indices_W_2, + Tensor indices_output_1, + Tensor indices_output_2, + void* cuda_stream +); \ No newline at end of file From f78161cd0991637fe921d2fd56e0d883aa511016 Mon Sep 17 00:00:00 2001 From: "Nick 
J. Browning" Date: Fri, 3 May 2024 14:02:58 +0200 Subject: [PATCH 07/28] fixed issue with opsaw instantiation --- mops/src/opsaw/opsaw.cpp | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/mops/src/opsaw/opsaw.cpp b/mops/src/opsaw/opsaw.cpp index b7eb889..4764b14 100644 --- a/mops/src/opsaw/opsaw.cpp +++ b/mops/src/opsaw/opsaw.cpp @@ -78,13 +78,13 @@ template void mops::outer_product_scatter_add_with_weights_vjp_vjp( #ifndef MOPS_CUDA_ENABLED template void mops::cuda:: - outer_product_scatter_add_with_weights(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) { + outer_product_scatter_add_with_weights(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void *) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } template void mops::cuda:: - outer_product_scatter_add_with_weights_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) { + outer_product_scatter_add_with_weights_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void *) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } @@ -102,7 +102,8 @@ void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( Tensor /*B*/, Tensor /*W*/, Tensor /*indices_W*/, - Tensor /*indices_output*/ + Tensor /*indices_output*/, + void * ) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } @@ -114,7 +115,8 @@ template void mops::cuda::outer_product_scatter_add_with_weights( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void * cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights( @@ -123,7 +125,8 @@ template void mops::cuda::outer_product_scatter_add_with_weights( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void * cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights_vjp( @@ -135,7 +138,8 @@ template void mops::cuda::outer_product_scatter_add_with_weights_vjp( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void * cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights_vjp( @@ -147,7 +151,8 @@ template void mops::cuda::outer_product_scatter_add_with_weights_vjp( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void * cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( @@ -163,7 +168,8 @@ template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void * cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( @@ -179,7 +185,8 @@ template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void * cuda_stream ); #endif \ No newline at end of file From e89350b3e9be5ce149a7651b3b8299ea27c89fb6 Mon Sep 17 00:00:00 2001 From: "Nick J. Browning" Date: Fri, 3 May 2024 14:03:29 +0200 Subject: [PATCH 08/28] formatting. 
--- mops/src/opsaw/opsaw.cpp | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/mops/src/opsaw/opsaw.cpp b/mops/src/opsaw/opsaw.cpp index 4764b14..6e30c41 100644 --- a/mops/src/opsaw/opsaw.cpp +++ b/mops/src/opsaw/opsaw.cpp @@ -78,33 +78,19 @@ template void mops::outer_product_scatter_add_with_weights_vjp_vjp( #ifndef MOPS_CUDA_ENABLED template void mops::cuda:: - outer_product_scatter_add_with_weights(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void *) { + outer_product_scatter_add_with_weights(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } template void mops::cuda:: - outer_product_scatter_add_with_weights_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void *) { + outer_product_scatter_add_with_weights_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } template -void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( - Tensor /*grad_grad_output*/, - Tensor /*grad_A_2*/, - Tensor /*grad_B_2*/, - Tensor /*grad_W_2*/, - Tensor /*grad_grad_A*/, - Tensor /*grad_grad_B*/, - Tensor /*grad_grad_W*/, - Tensor /*grad_output*/, - Tensor /*A*/, - Tensor /*B*/, - Tensor /*W*/, - Tensor /*indices_W*/, - Tensor /*indices_output*/, - void * -) { +void mops::cuda:: + outer_product_scatter_add_with_weights_vjp_vjp(Tensor /*grad_grad_output*/, Tensor /*grad_A_2*/, Tensor /*grad_B_2*/, Tensor /*grad_W_2*/, Tensor /*grad_grad_A*/, Tensor /*grad_grad_B*/, Tensor /*grad_grad_W*/, Tensor /*grad_output*/, Tensor /*A*/, Tensor /*B*/, Tensor /*W*/, Tensor /*indices_W*/, Tensor /*indices_output*/, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } @@ -116,7 +102,7 @@ template void mops::cuda::outer_product_scatter_add_with_weights( Tensor W, Tensor indices_W, Tensor indices_output, - void * cuda_stream + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights( @@ -126,7 +112,7 @@ template void mops::cuda::outer_product_scatter_add_with_weights( Tensor W, Tensor indices_W, Tensor indices_output, - void * cuda_stream + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights_vjp( @@ -139,7 +125,7 @@ template void mops::cuda::outer_product_scatter_add_with_weights_vjp( Tensor W, Tensor indices_W, Tensor indices_output, - void * cuda_stream + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights_vjp( @@ -152,7 +138,7 @@ template void mops::cuda::outer_product_scatter_add_with_weights_vjp( Tensor W, Tensor indices_W, Tensor indices_output, - void * cuda_stream + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( @@ -169,7 +155,7 @@ template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( Tensor W, Tensor indices_W, Tensor indices_output, - void * cuda_stream + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( @@ -186,7 +172,7 @@ template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp Tensor W, Tensor indices_W, Tensor indices_output, - void * cuda_stream + void* cuda_stream ); #endif \ No newline at end of file From ea54e9afa333d487558c4533b661cf081bbc0526 Mon Sep 17 00:00:00 2001 From: Prashanth Kanduri Date: Fri, 3 May 2024 15:28:33 +0200 Subject: [PATCH 09/28] add comments on the test job script --- 
ci/pipeline.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/pipeline.yml b/ci/pipeline.yml index 61af310..0d34917 100644 --- a/ci/pipeline.yml +++ b/ci/pipeline.yml @@ -22,7 +22,9 @@ test_job: script: - export CUDA_HOME="/usr/local/cuda" - python3 -m pip install --upgrade pip + - echo "Install Tox" - python3 -m pip install tox + - echo "Run the Tox Script" - tox variables: From 1bb35aab6c1c8d5fe07c9855079d8020d8527344 Mon Sep 17 00:00:00 2001 From: Prashanth Kanduri Date: Fri, 3 May 2024 15:32:26 +0200 Subject: [PATCH 10/28] insignificant commit for retriggering pipeline --- ci/pipeline.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/pipeline.yml b/ci/pipeline.yml index 0d34917..87cabd9 100644 --- a/ci/pipeline.yml +++ b/ci/pipeline.yml @@ -23,9 +23,10 @@ test_job: - export CUDA_HOME="/usr/local/cuda" - python3 -m pip install --upgrade pip - echo "Install Tox" - - python3 -m pip install tox + - python3 -m pip install too - echo "Run the Tox Script" - tox + - echo "Too script completed" variables: SLURM_JOB_NUM_NODES: 1 From b9be358b5a3a17f75ec2bd63df614feeb3096268 Mon Sep 17 00:00:00 2001 From: Filippo Bigi <98903385+frostedoyster@users.noreply.github.com> Date: Sat, 4 May 2024 08:19:54 +0200 Subject: [PATCH 11/28] Install tox --- ci/pipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/pipeline.yml b/ci/pipeline.yml index 87cabd9..21625eb 100644 --- a/ci/pipeline.yml +++ b/ci/pipeline.yml @@ -23,10 +23,10 @@ test_job: - export CUDA_HOME="/usr/local/cuda" - python3 -m pip install --upgrade pip - echo "Install Tox" - - python3 -m pip install too + - python3 -m pip install tox - echo "Run the Tox Script" - tox - - echo "Too script completed" + - echo "Tox script completed" variables: SLURM_JOB_NUM_NODES: 1 From d9fc283cc636f46a0be26114ace25a97cc6923c1 Mon Sep 17 00:00:00 2001 From: Prashanth Kanduri Date: Mon, 6 May 2024 13:29:41 +0200 Subject: [PATCH 12/28] attempt addressing CI compiler errors and warnings --- mops/src/opsa/cpu.tpp | 2 +- mops/src/sap/sap.cu | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mops/src/opsa/cpu.tpp b/mops/src/opsa/cpu.tpp index da347ad..570b16a 100644 --- a/mops/src/opsa/cpu.tpp +++ b/mops/src/opsa/cpu.tpp @@ -93,7 +93,7 @@ void mops::outer_product_scatter_add_vjp( scalar_t *grad_output_ptr = grad_output.data; scalar_t *a_ptr = A.data; scalar_t *b_ptr = B.data; - int32_t *indices_output_ptr = indices_output.data; + [[maybe_unused]] int32_t *indices_output_ptr = indices_output.data; #pragma omp parallel for for (size_t i = 0; i < size_ab; i++) { diff --git a/mops/src/sap/sap.cu b/mops/src/sap/sap.cu index f6adea6..27c246a 100644 --- a/mops/src/sap/sap.cu +++ b/mops/src/sap/sap.cu @@ -71,7 +71,7 @@ __global__ void sparse_accumulation_of_products_kernel( int a_idx = (packed_indices[k] >> 16) & 0xFF; atomicAdd( - buffer_out + out_idx * WARP_SIZE + laneID, + &buffer_out + out_idx * WARP_SIZE + laneID, C.data[k] * buffer_A[a_idx * WARP_SIZE + laneID] * buffer_B[b_idx * WARP_SIZE + laneID] ); } @@ -233,14 +233,14 @@ __global__ void sparse_accumulation_of_products_vjp_kernel( if (grad_A.data != nullptr) { atomicAdd( - buffer_gradA + a_idx * WARP_SIZE + laneID, + &buffer_gradA + a_idx * WARP_SIZE + laneID, C.data[k] * buffer_B[b_idx * WARP_SIZE + laneID] * buffer_gradout[out_idx * WARP_SIZE + laneID] ); } if (grad_B.data != nullptr) { atomicAdd( - buffer_gradB + b_idx * WARP_SIZE + laneID, + &buffer_gradB + b_idx * WARP_SIZE + laneID, C.data[k] * buffer_A[a_idx * WARP_SIZE + 
laneID] * buffer_gradout[out_idx * WARP_SIZE + laneID] ); From af2ec607ea6f584cbdda731743f0a7d20a3c9a64 Mon Sep 17 00:00:00 2001 From: Prashanth Kanduri Date: Mon, 6 May 2024 15:02:51 +0200 Subject: [PATCH 13/28] remove polynomial order zero case to avoid divide by zero issue --- mops/src/hpe/hpe.cu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index 964c4b4..c4d1735 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -313,10 +313,6 @@ void mops::cuda::homogeneous_polynomial_evaluation_vjp( if (polynomial_order <= 10) { switch (polynomial_order) { - case 0: - homogeneous_polynomial_evaluation_vjp_kernel - <<>>(grad_A, grad_output, A, C, indices_A); - break; case 1: homogeneous_polynomial_evaluation_vjp_kernel <<>>(grad_A, grad_output, A, C, indices_A); From 9fee2a1741104b3ce80b1fc172ded90e4f1c4d2a Mon Sep 17 00:00:00 2001 From: Prashanth Kanduri Date: Mon, 6 May 2024 15:04:47 +0200 Subject: [PATCH 14/28] correct address of the atomic add operations in sap --- mops/src/sap/sap.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mops/src/sap/sap.cu b/mops/src/sap/sap.cu index f15fb69..1a82982 100644 --- a/mops/src/sap/sap.cu +++ b/mops/src/sap/sap.cu @@ -508,14 +508,14 @@ __global__ void sparse_accumulation_of_products_vjp_vjp_kernel( if (grad_grad_output.data != nullptr) { atomicAdd( - buffer_grad_grad_output + out_idx * WARP_SIZE + laneID, + &buffer_grad_grad_output + out_idx * WARP_SIZE + laneID, grad_grad_A_k * buffer_B[b_idx * WARP_SIZE + laneID] * c ); } if (grad_B_2.data != nullptr) { atomicAdd( - buffer_grad_B2 + b_idx * WARP_SIZE + laneID, + &buffer_grad_B2 + b_idx * WARP_SIZE + laneID, grad_grad_A_k * buffer_grad_output[out_idx * WARP_SIZE + laneID] * c ); } @@ -526,14 +526,14 @@ __global__ void sparse_accumulation_of_products_vjp_vjp_kernel( if (grad_grad_output.data != nullptr) { atomicAdd( - buffer_grad_grad_output + out_idx * WARP_SIZE + laneID, + &buffer_grad_grad_output + out_idx * WARP_SIZE + laneID, grad_grad_B_k * buffer_A[a_idx * WARP_SIZE + laneID] * c ); } if (grad_A_2.data != nullptr) { atomicAdd( - buffer_grad_A2 + a_idx * WARP_SIZE + laneID, + &buffer_grad_A2 + a_idx * WARP_SIZE + laneID, grad_grad_B_k * buffer_grad_output[out_idx * WARP_SIZE + laneID] * c ); } From fc80766cb65d5a0bf72c8f8bbbf31f14b975846a Mon Sep 17 00:00:00 2001 From: Prashanth Kanduri Date: Mon, 6 May 2024 15:30:54 +0200 Subject: [PATCH 15/28] specify addresses with indices for atomic add operations to address CI errors --- mops/src/hpe/hpe.cu | 2 +- mops/src/opsa/cpu.tpp | 2 +- mops/src/sap/sap.cu | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index c4d1735..de22035 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -311,7 +311,7 @@ void mops::cuda::homogeneous_polynomial_evaluation_vjp( shared_array(2 * nnu1, sptr, &space); shared_array(NWARPS_PER_BLOCK * WARP_SIZE * polynomial_order, sptr, &space); - if (polynomial_order <= 10) { + if (polynomial_order > 0 && polynomial_order <= 10) { switch (polynomial_order) { case 1: homogeneous_polynomial_evaluation_vjp_kernel diff --git a/mops/src/opsa/cpu.tpp b/mops/src/opsa/cpu.tpp index 1bc9d68..e628e2e 100644 --- a/mops/src/opsa/cpu.tpp +++ b/mops/src/opsa/cpu.tpp @@ -167,7 +167,7 @@ void mops::outer_product_scatter_add_vjp_vjp( scalar_t *grad_output_ptr = grad_output.data; scalar_t *a_ptr = A.data; scalar_t *b_ptr = B.data; - int32_t *indices_output_ptr = indices_output.data; + 
[[maybe_unused]] int32_t *indices_output_ptr = indices_output.data; scalar_t *grad_output_ptr_i = nullptr; scalar_t *a_ptr_i = nullptr; diff --git a/mops/src/sap/sap.cu b/mops/src/sap/sap.cu index 1a82982..48dbe84 100644 --- a/mops/src/sap/sap.cu +++ b/mops/src/sap/sap.cu @@ -71,7 +71,7 @@ __global__ void sparse_accumulation_of_products_kernel( int a_idx = (packed_indices[k] >> 16) & 0xFF; atomicAdd( - &buffer_out + out_idx * WARP_SIZE + laneID, + &buffer_out[out_idx * WARP_SIZE + laneID], C.data[k] * buffer_A[a_idx * WARP_SIZE + laneID] * buffer_B[b_idx * WARP_SIZE + laneID] ); } @@ -236,14 +236,14 @@ __global__ void sparse_accumulation_of_products_vjp_kernel( if (grad_A.data != nullptr) { atomicAdd( - &buffer_gradA + a_idx * WARP_SIZE + laneID, + &buffer_gradA[a_idx * WARP_SIZE + laneID], C.data[k] * buffer_B[b_idx * WARP_SIZE + laneID] * buffer_gradout[out_idx * WARP_SIZE + laneID] ); } if (grad_B.data != nullptr) { atomicAdd( - &buffer_gradB + b_idx * WARP_SIZE + laneID, + &buffer_gradB[b_idx * WARP_SIZE + laneID], C.data[k] * buffer_A[a_idx * WARP_SIZE + laneID] * buffer_gradout[out_idx * WARP_SIZE + laneID] ); @@ -508,14 +508,14 @@ __global__ void sparse_accumulation_of_products_vjp_vjp_kernel( if (grad_grad_output.data != nullptr) { atomicAdd( - &buffer_grad_grad_output + out_idx * WARP_SIZE + laneID, + &buffer_grad_grad_output[out_idx * WARP_SIZE + laneID], grad_grad_A_k * buffer_B[b_idx * WARP_SIZE + laneID] * c ); } if (grad_B_2.data != nullptr) { atomicAdd( - &buffer_grad_B2 + b_idx * WARP_SIZE + laneID, + &buffer_grad_B2[b_idx * WARP_SIZE + laneID], grad_grad_A_k * buffer_grad_output[out_idx * WARP_SIZE + laneID] * c ); } @@ -526,14 +526,14 @@ __global__ void sparse_accumulation_of_products_vjp_vjp_kernel( if (grad_grad_output.data != nullptr) { atomicAdd( - &buffer_grad_grad_output + out_idx * WARP_SIZE + laneID, + &buffer_grad_grad_output[out_idx * WARP_SIZE + laneID], grad_grad_B_k * buffer_A[a_idx * WARP_SIZE + laneID] * c ); } if (grad_A_2.data != nullptr) { atomicAdd( - &buffer_grad_A2 + a_idx * WARP_SIZE + laneID, + &buffer_grad_A2[a_idx * WARP_SIZE + laneID], grad_grad_B_k * buffer_grad_output[out_idx * WARP_SIZE + laneID] * c ); } From 2ed5771124a7eb02ec8efd787082d154ae0aec92 Mon Sep 17 00:00:00 2001 From: Prashanth Kanduri Date: Tue, 7 May 2024 11:06:29 +0200 Subject: [PATCH 16/28] dummy implementation for polynomial order zero --- mops/src/hpe/hpe.cu | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index de22035..8d6e1d8 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -106,6 +106,13 @@ __global__ void homogeneous_polynomial_evaluation_kernel( } } +template +__global__ void homogeneous_polynomial_evaluation_kernel( + Tensor output, Tensor A, Tensor C, Tensor indices_A +) { + // dummy implementation for polynomial order zero +} + template void mops::cuda::homogeneous_polynomial_evaluation( Tensor output, Tensor A, Tensor C, Tensor indices_A @@ -128,12 +135,8 @@ void mops::cuda::homogeneous_polynomial_evaluation( shared_array(NWARPS_PER_BLOCK, sptr, &space); shared_array(WARP_SIZE * NWARPS_PER_BLOCK * polynomial_order, sptr, &space); - if (polynomial_order <= 10) { + if (polynomial_order > 0 && polynomial_order <= 10) { switch (polynomial_order) { - case 0: - homogeneous_polynomial_evaluation_kernel - <<>>(output, A, C, indices_A); - break; case 1: homogeneous_polynomial_evaluation_kernel <<>>(output, A, C, indices_A); From 62b73d8df162c957a0e3151fa4a6e86f4bd724b4 Mon Sep 17 00:00:00 
2001 From: "Nicholas J. Browning" Date: Fri, 10 May 2024 15:07:08 +0200 Subject: [PATCH 17/28] removed bug in CUDA HPE. --- mops/src/hpe/hpe.cu | 7 ------- 1 file changed, 7 deletions(-) diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index 8d6e1d8..1d9e2ea 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -106,13 +106,6 @@ __global__ void homogeneous_polynomial_evaluation_kernel( } } -template -__global__ void homogeneous_polynomial_evaluation_kernel( - Tensor output, Tensor A, Tensor C, Tensor indices_A -) { - // dummy implementation for polynomial order zero -} - template void mops::cuda::homogeneous_polynomial_evaluation( Tensor output, Tensor A, Tensor C, Tensor indices_A From b6d479529b3896681548054175a90f45ae3dc4fa Mon Sep 17 00:00:00 2001 From: "Nicholas J. Browning" Date: Fri, 10 May 2024 15:45:03 +0200 Subject: [PATCH 18/28] header change --- mops/src/hpe/hpe.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index 1d9e2ea..d2238c9 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -1,3 +1,5 @@ +#include + #include "mops/hpe.hpp" #include "internal/checks/hpe.hpp" From 68223e25ed837a90e8c4bca3a1c35939206c6220 Mon Sep 17 00:00:00 2001 From: "Nicholas J. Browning" Date: Fri, 10 May 2024 16:10:28 +0200 Subject: [PATCH 19/28] HPE divide by zero fix. --- mops/src/hpe/hpe.cu | 103 ++++++++++++++++++++++++++------------------ 1 file changed, 62 insertions(+), 41 deletions(-) diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index d2238c9..2c75e65 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -1,5 +1,3 @@ -#include - #include "mops/hpe.hpp" #include "internal/checks/hpe.hpp" @@ -55,14 +53,20 @@ __global__ void homogeneous_polynomial_evaluation_kernel( __syncthreads(); - int32_t i_monomial = threadIdx.x % polynomial_order; - int32_t x = threadIdx.x / polynomial_order; - int32_t nx = blockDim.x / polynomial_order; + int32_t i_monomial; + int32_t x; + int32_t nx; + + if (polynomial_order > 0) { + i_monomial = threadIdx.x % polynomial_order; + x = threadIdx.x / polynomial_order; + nx = blockDim.x / polynomial_order; - for (int lbasis = x; lbasis < blockDim.x; lbasis += nx) { - if (i_monomial * blockDim.x + lbasis < polynomial_order * blockDim.x) { - buffer_indices_A[i_monomial * blockDim.x + lbasis] = - indices_A.data[(i + lbasis) * polynomial_order + i_monomial]; + for (int lbasis = x; lbasis < blockDim.x; lbasis += nx) { + if (i_monomial * blockDim.x + lbasis < polynomial_order * blockDim.x) { + buffer_indices_A[i_monomial * blockDim.x + lbasis] = + indices_A.data[(i + lbasis) * polynomial_order + i_monomial]; + } } } @@ -130,8 +134,12 @@ void mops::cuda::homogeneous_polynomial_evaluation( shared_array(NWARPS_PER_BLOCK, sptr, &space); shared_array(WARP_SIZE * NWARPS_PER_BLOCK * polynomial_order, sptr, &space); - if (polynomial_order > 0 && polynomial_order <= 10) { + if (polynomial_order <= 10) { switch (polynomial_order) { + case 0: + homogeneous_polynomial_evaluation_kernel + <<>>(output, A, C, indices_A); + break; case 1: homogeneous_polynomial_evaluation_kernel <<>>(output, A, C, indices_A); @@ -229,51 +237,56 @@ __global__ void homogeneous_polynomial_evaluation_vjp_kernel( __syncthreads(); scalar_t gout = grad_output.data[batch_id]; + if (polynomial_order > 0) { + // indices_A : nbasis, polynomial_order + for (int32_t i = 0; i < nbasis; i += blockDim.x) { - // indices_A : nbasis, polynomial_order - for (int32_t i = 0; i < nbasis; i += blockDim.x) { + __syncthreads(); - __syncthreads(); + 
int32_t basis = i + threadIdx.x; + + int32_t i_monomial; + int32_t x; + int32_t nx; - int32_t i_monomial = threadIdx.x % polynomial_order; - int32_t x = threadIdx.x / polynomial_order; - int32_t nx = blockDim.x / polynomial_order; + i_monomial = threadIdx.x % polynomial_order; + x = threadIdx.x / polynomial_order; + nx = blockDim.x / polynomial_order; - for (int lbasis = x; lbasis < blockDim.x; lbasis += nx) { - if (i_monomial * blockDim.x + lbasis < polynomial_order * blockDim.x) { - buffer_indices_A[i_monomial * blockDim.x + lbasis] = - indices_A.data[(i + lbasis) * polynomial_order + i_monomial]; + for (int lbasis = x; lbasis < blockDim.x; lbasis += nx) { + if (i_monomial * blockDim.x + lbasis < polynomial_order * blockDim.x) { + buffer_indices_A[i_monomial * blockDim.x + lbasis] = + indices_A.data[(i + lbasis) * polynomial_order + i_monomial]; + } } - } - __syncthreads(); + __syncthreads(); - int32_t basis = i + threadIdx.x; + if (basis < nbasis) { - if (basis < nbasis) { + scalar_t c = C.data[basis] * gout; - scalar_t c = C.data[basis] * gout; + for (int32_t i_monomial = 0; i_monomial < polynomial_order; i_monomial++) { - for (int32_t i_monomial = 0; i_monomial < polynomial_order; i_monomial++) { + scalar_t tmp_i = c; + + for (int32_t j_monomial = 0; j_monomial < polynomial_order; j_monomial++) { - scalar_t tmp_i = c; + if (i_monomial == j_monomial) { + continue; + } - for (int32_t j_monomial = 0; j_monomial < polynomial_order; j_monomial++) { + int32_t idx_j = buffer_indices_A + [j_monomial * blockDim.x + threadIdx.x]; // indices_A.data[j_monomial + // * indices_A.shape[0] + basis]; - if (i_monomial == j_monomial) { - continue; + tmp_i *= buffer_nu1[idx_j]; } - int32_t idx_j = - buffer_indices_A[j_monomial * blockDim.x + threadIdx.x]; // indices_A.data[j_monomial - // * indices_A.shape[0] + basis]; + int32_t idx_i = buffer_indices_A[i_monomial * blockDim.x + threadIdx.x]; - tmp_i *= buffer_nu1[idx_j]; + atomicAdd(&buffer_gradA[idx_i], tmp_i); } - - int32_t idx_i = buffer_indices_A[i_monomial * blockDim.x + threadIdx.x]; - - atomicAdd(&buffer_gradA[idx_i], tmp_i); } } } @@ -281,7 +294,11 @@ __global__ void homogeneous_polynomial_evaluation_vjp_kernel( __syncthreads(); for (int32_t i = threadIdx.x; i < nnu1; i += blockDim.x) { - grad_A.data[batch_id * nnu1 + i] = buffer_gradA[i]; + if (polynomial_order > 0) { + grad_A.data[batch_id * nnu1 + i] = buffer_gradA[i]; + } else { + grad_A.data[batch_id * nnu1 + i] = 0.0; + } } } @@ -309,8 +326,12 @@ void mops::cuda::homogeneous_polynomial_evaluation_vjp( shared_array(2 * nnu1, sptr, &space); shared_array(NWARPS_PER_BLOCK * WARP_SIZE * polynomial_order, sptr, &space); - if (polynomial_order > 0 && polynomial_order <= 10) { + if (polynomial_order <= 10) { switch (polynomial_order) { + case 0: + homogeneous_polynomial_evaluation_vjp_kernel + <<>>(grad_A, grad_output, A, C, indices_A); + break; case 1: homogeneous_polynomial_evaluation_vjp_kernel <<>>(grad_A, grad_output, A, C, indices_A); @@ -410,4 +431,4 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( Tensor A, Tensor C, Tensor indices_A -); +); \ No newline at end of file From 3b4527f4a9e4e22d541267d3b303a1132e2cc8f1 Mon Sep 17 00:00:00 2001 From: "Nicholas J. 
Browning" Date: Fri, 10 May 2024 16:24:09 +0200 Subject: [PATCH 20/28] added in code for pre-sm60 atomicAdd(doubles) --- mops/src/hpe/hpe.cu | 5 ++++- mops/src/internal/cuda_utils.cu | 16 ++++++++++++++++ mops/src/internal/cuda_utils.cuh | 2 ++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index 2c75e65..e599480 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -284,8 +284,11 @@ __global__ void homogeneous_polynomial_evaluation_vjp_kernel( } int32_t idx_i = buffer_indices_A[i_monomial * blockDim.x + threadIdx.x]; - +#if __CUDA_ARCH__ < 600 + atomicAdd_presm60(&buffer_gradA[idx_i], tmp_i); +#else atomicAdd(&buffer_gradA[idx_i], tmp_i); +#endif } } } diff --git a/mops/src/internal/cuda_utils.cu b/mops/src/internal/cuda_utils.cu index 3af533e..878c556 100644 --- a/mops/src/internal/cuda_utils.cu +++ b/mops/src/internal/cuda_utils.cu @@ -11,6 +11,22 @@ __host__ __device__ int32_t find_integer_divisor(int32_t x, int32_t bdim) { return (x + bdim - 1) / bdim; } +__device__ double atomicAdd_presm60(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + old = atomicCAS( + address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)) + ); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __longlong_as_double(old); +} + template __host__ __device__ T* shared_array(std::size_t n_elements, void*& ptr, std::size_t* space) noexcept { const std::uintptr_t inptr = reinterpret_cast(ptr); diff --git a/mops/src/internal/cuda_utils.cuh b/mops/src/internal/cuda_utils.cuh index b8c25be..a58b514 100644 --- a/mops/src/internal/cuda_utils.cuh +++ b/mops/src/internal/cuda_utils.cuh @@ -23,6 +23,8 @@ using namespace std; } \ } while (0) +__device__ double atomicAdd_presm60(double* address, double val); + __host__ __device__ int32_t find_integer_divisor(int32_t x, int32_t bdim); /* From e3c70c2c53ad8df6677b3e9d9b4f5f76c08af4aa Mon Sep 17 00:00:00 2001 From: "Nicholas J. 
Browning" Date: Fri, 10 May 2024 16:27:42 +0200 Subject: [PATCH 21/28] macro to switch out the atomicAdds depending on ARCH --- mops/src/hpe/hpe.cu | 7 ++----- mops/src/internal/cuda_utils.cuh | 6 ++++++ mops/src/sap/sap.cu | 14 +++++++------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index e599480..100cbce 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -284,11 +284,8 @@ __global__ void homogeneous_polynomial_evaluation_vjp_kernel( } int32_t idx_i = buffer_indices_A[i_monomial * blockDim.x + threadIdx.x]; -#if __CUDA_ARCH__ < 600 - atomicAdd_presm60(&buffer_gradA[idx_i], tmp_i); -#else - atomicAdd(&buffer_gradA[idx_i], tmp_i); -#endif + + ATOMIC_ADD(&buffer_gradA[idx_i], tmp_i); } } } diff --git a/mops/src/internal/cuda_utils.cuh b/mops/src/internal/cuda_utils.cuh index a58b514..416417f 100644 --- a/mops/src/internal/cuda_utils.cuh +++ b/mops/src/internal/cuda_utils.cuh @@ -23,6 +23,12 @@ using namespace std; } \ } while (0) +#if __CUDA_ARCH__ < 600 +#define ATOMIC_ADD(address, val) atomicAdd_presm60(address, val) +#else +#define ATOMIC_ADD(address, val) atomicAdd(address, val) +#endif + __device__ double atomicAdd_presm60(double* address, double val); __host__ __device__ int32_t find_integer_divisor(int32_t x, int32_t bdim); diff --git a/mops/src/sap/sap.cu b/mops/src/sap/sap.cu index 48dbe84..5e17295 100644 --- a/mops/src/sap/sap.cu +++ b/mops/src/sap/sap.cu @@ -70,7 +70,7 @@ __global__ void sparse_accumulation_of_products_kernel( int b_idx = (packed_indices[k] >> 8) & 0xFF; int a_idx = (packed_indices[k] >> 16) & 0xFF; - atomicAdd( + ATOMIC_ADD( &buffer_out[out_idx * WARP_SIZE + laneID], C.data[k] * buffer_A[a_idx * WARP_SIZE + laneID] * buffer_B[b_idx * WARP_SIZE + laneID] ); @@ -235,14 +235,14 @@ __global__ void sparse_accumulation_of_products_vjp_kernel( int a_idx = (packed_indices[k] >> 16) & 0xFF; if (grad_A.data != nullptr) { - atomicAdd( + ATOMIC_ADD( &buffer_gradA[a_idx * WARP_SIZE + laneID], C.data[k] * buffer_B[b_idx * WARP_SIZE + laneID] * buffer_gradout[out_idx * WARP_SIZE + laneID] ); } if (grad_B.data != nullptr) { - atomicAdd( + ATOMIC_ADD( &buffer_gradB[b_idx * WARP_SIZE + laneID], C.data[k] * buffer_A[a_idx * WARP_SIZE + laneID] * buffer_gradout[out_idx * WARP_SIZE + laneID] @@ -507,14 +507,14 @@ __global__ void sparse_accumulation_of_products_vjp_vjp_kernel( scalar_t grad_grad_A_k = buffer_grad_grad_A[a_idx * WARP_SIZE + laneID]; if (grad_grad_output.data != nullptr) { - atomicAdd( + ATOMIC_ADD( &buffer_grad_grad_output[out_idx * WARP_SIZE + laneID], grad_grad_A_k * buffer_B[b_idx * WARP_SIZE + laneID] * c ); } if (grad_B_2.data != nullptr) { - atomicAdd( + ATOMIC_ADD( &buffer_grad_B2[b_idx * WARP_SIZE + laneID], grad_grad_A_k * buffer_grad_output[out_idx * WARP_SIZE + laneID] * c ); @@ -525,14 +525,14 @@ __global__ void sparse_accumulation_of_products_vjp_vjp_kernel( scalar_t grad_grad_B_k = buffer_grad_grad_B[b_idx * WARP_SIZE + laneID]; if (grad_grad_output.data != nullptr) { - atomicAdd( + ATOMIC_ADD( &buffer_grad_grad_output[out_idx * WARP_SIZE + laneID], grad_grad_B_k * buffer_A[a_idx * WARP_SIZE + laneID] * c ); } if (grad_A_2.data != nullptr) { - atomicAdd( + ATOMIC_ADD( &buffer_grad_A2[a_idx * WARP_SIZE + laneID], grad_grad_B_k * buffer_grad_output[out_idx * WARP_SIZE + laneID] * c ); From ade1cd3ee84e097c986f5be4df813281353b5471 Mon Sep 17 00:00:00 2001 From: "Nicholas J. Browning" Date: Fri, 10 May 2024 16:33:18 +0200 Subject: [PATCH 22/28] documentation. 
--- mops/src/internal/cuda_utils.cuh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mops/src/internal/cuda_utils.cuh b/mops/src/internal/cuda_utils.cuh index 416417f..7aec588 100644 --- a/mops/src/internal/cuda_utils.cuh +++ b/mops/src/internal/cuda_utils.cuh @@ -23,12 +23,19 @@ using namespace std; } \ } while (0) +/* + * Macro to select the right version of atomicAdd for the archcode being compiled. + */ #if __CUDA_ARCH__ < 600 #define ATOMIC_ADD(address, val) atomicAdd_presm60(address, val) #else #define ATOMIC_ADD(address, val) atomicAdd(address, val) #endif +/* + * Pre SM60 cards do not support atomicAdd(double *, double). This function implements and atomicCAS + * to lock update the address. + */ __device__ double atomicAdd_presm60(double* address, double val); __host__ __device__ int32_t find_integer_divisor(int32_t x, int32_t bdim); From 263984d4be133ad8f78767ed311f7ceee5e10583 Mon Sep 17 00:00:00 2001 From: "Nicholas J. Browning" Date: Fri, 10 May 2024 17:02:33 +0200 Subject: [PATCH 23/28] changed macro to device function. --- mops/src/internal/cuda_utils.cu | 15 +++++++++++++++ mops/src/internal/cuda_utils.cuh | 14 +++++--------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/mops/src/internal/cuda_utils.cu b/mops/src/internal/cuda_utils.cu index 878c556..532d168 100644 --- a/mops/src/internal/cuda_utils.cu +++ b/mops/src/internal/cuda_utils.cu @@ -27,6 +27,21 @@ __device__ double atomicAdd_presm60(double* address, double val) { return __longlong_as_double(old); } +template __device__ scalar_t ATOMIC_ADD(scalar_t* address, scalar_t val) { +#if __CUDA_ARCH__ < 600 + if constexpr (sizeof(scalar_t) == 4) { + return atomicAdd(address, val); + } else if constexpr (sizeof(scalar_t) == 8) { + return atomicAdd_presm60(address, val); + } +#else + return atomicAdd(address, val); +#endif +} + +template float ATOMIC_ADD(float* address, float val); +template double ATOMIC_ADD(double* address, double val); + template __host__ __device__ T* shared_array(std::size_t n_elements, void*& ptr, std::size_t* space) noexcept { const std::uintptr_t inptr = reinterpret_cast(ptr); diff --git a/mops/src/internal/cuda_utils.cuh b/mops/src/internal/cuda_utils.cuh index 7aec588..83645cf 100644 --- a/mops/src/internal/cuda_utils.cuh +++ b/mops/src/internal/cuda_utils.cuh @@ -23,21 +23,17 @@ using namespace std; } \ } while (0) -/* - * Macro to select the right version of atomicAdd for the archcode being compiled. - */ -#if __CUDA_ARCH__ < 600 -#define ATOMIC_ADD(address, val) atomicAdd_presm60(address, val) -#else -#define ATOMIC_ADD(address, val) atomicAdd(address, val) -#endif - /* * Pre SM60 cards do not support atomicAdd(double *, double). This function implements and atomicCAS * to lock update the address. */ __device__ double atomicAdd_presm60(double* address, double val); +/* + * function to select the right version of atomicAdd for the archcode being compiled. + */ +template __device__ scalar_t ATOMIC_ADD(scalar_t* address, scalar_t val); + __host__ __device__ int32_t find_integer_divisor(int32_t x, int32_t bdim); /* From 899ea941095dcf0d0337d3516ede7a4b6b606df4 Mon Sep 17 00:00:00 2001 From: "Nicholas J. 
Browning" Date: Fri, 10 May 2024 17:40:12 +0200 Subject: [PATCH 24/28] fixed sap cstream --- mops/src/sap/sap.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mops/src/sap/sap.cu b/mops/src/sap/sap.cu index f370265..305f9d8 100644 --- a/mops/src/sap/sap.cu +++ b/mops/src/sap/sap.cu @@ -701,7 +701,7 @@ void mops::cuda::sparse_accumulation_of_products_vjp_vjp( int32_t* packed_indices = shared_array(indices_A.shape[0], sptr, &space); - sparse_accumulation_of_products_vjp_vjp_kernel<<>>( + sparse_accumulation_of_products_vjp_vjp_kernel<<>>( grad_grad_output, grad_A_2, grad_B_2, From 99cedf5e4afc75169688eeb1e8b7a471d545bebc Mon Sep 17 00:00:00 2001 From: "Nicholas J. Browning" Date: Fri, 10 May 2024 17:43:01 +0200 Subject: [PATCH 25/28] formatting. --- mops/src/sap/sap.cu | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/mops/src/sap/sap.cu b/mops/src/sap/sap.cu index 305f9d8..55d7716 100644 --- a/mops/src/sap/sap.cu +++ b/mops/src/sap/sap.cu @@ -124,8 +124,9 @@ void mops::cuda::sparse_accumulation_of_products( shared_array(WARP_SIZE * B.shape[1], sptr, &space); shared_array(indices_A.shape[0], sptr, &space); - sparse_accumulation_of_products_kernel - <<>>(output, A, B, C, indices_A, indices_B, indices_output); + sparse_accumulation_of_products_kernel<<>>( + output, A, B, C, indices_A, indices_B, indices_output + ); CUDA_CHECK_ERROR(cudaGetLastError()); CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); @@ -344,9 +345,10 @@ void mops::cuda::sparse_accumulation_of_products_vjp( shared_array(WARP_SIZE * grad_A.shape[1], sptr, &space); } - sparse_accumulation_of_products_vjp_kernel<<>>( - grad_A, grad_B, grad_output, A, B, C, indices_A, indices_B, indices_output - ); + sparse_accumulation_of_products_vjp_kernel + <<>>( + grad_A, grad_B, grad_output, A, B, C, indices_A, indices_B, indices_output + ); CUDA_CHECK_ERROR(cudaGetLastError()); CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); @@ -701,20 +703,21 @@ void mops::cuda::sparse_accumulation_of_products_vjp_vjp( int32_t* packed_indices = shared_array(indices_A.shape[0], sptr, &space); - sparse_accumulation_of_products_vjp_vjp_kernel<<>>( - grad_grad_output, - grad_A_2, - grad_B_2, - grad_grad_A, - grad_grad_B, - grad_output, - A, - B, - C, - indices_A, - indices_B, - indices_output - ); + sparse_accumulation_of_products_vjp_vjp_kernel + <<>>( + grad_grad_output, + grad_A_2, + grad_B_2, + grad_grad_A, + grad_grad_B, + grad_output, + A, + B, + C, + indices_A, + indices_B, + indices_output + ); CUDA_CHECK_ERROR(cudaGetLastError()); CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); From 21b6235d3caf863e0323c598b113138459c92a6d Mon Sep 17 00:00:00 2001 From: "Nicholas J. 
Browning" Date: Fri, 10 May 2024 17:49:12 +0200 Subject: [PATCH 26/28] missing device guard in SAP --- mops-torch/src/sap.cpp | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/mops-torch/src/sap.cpp b/mops-torch/src/sap.cpp index aca6d49..83c904f 100644 --- a/mops-torch/src/sap.cpp +++ b/mops-torch/src/sap.cpp @@ -64,6 +64,14 @@ torch::Tensor SparseAccumulationOfProducts::forward( ); }); } else if (A.device().is_cuda()) { + +#ifndef MOPS_CUDA_ENABLED + C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str()); +#else + c10::cuda::CUDAGuard deviceGuard{A.device()}; + cudaStream_t currstream = c10::cuda::getCurrentCUDAStream(); + void* stream = reinterpret_cast(currstream); + output = torch::empty( {A.size(0), output_size}, torch::TensorOptions().dtype(A.scalar_type()).device(A.device()) @@ -77,9 +85,11 @@ torch::Tensor SparseAccumulationOfProducts::forward( details::torch_to_mops_1d(C), details::torch_to_mops_1d(indices_A), details::torch_to_mops_1d(indices_B), - details::torch_to_mops_1d(indices_output) + details::torch_to_mops_1d(indices_output), + stream ); }); +#endif } else { C10_THROW_ERROR( ValueError, @@ -175,6 +185,14 @@ std::vector SparseAccumulationOfProductsBackward::forward( ); }); } else if (A.device().is_cuda()) { + +#ifndef MOPS_CUDA_ENABLED + C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str()); +#else + c10::cuda::CUDAGuard deviceGuard{A.device()}; + cudaStream_t currstream = c10::cuda::getCurrentCUDAStream(); + void* stream = reinterpret_cast(currstream); + AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "sparse_accumulation_of_products_vjp", [&]() { auto mops_grad_A = mops::Tensor{nullptr, {0, 0}}; if (A.requires_grad()) { @@ -197,9 +215,11 @@ std::vector SparseAccumulationOfProductsBackward::forward( details::torch_to_mops_1d(C), details::torch_to_mops_1d(indices_A), details::torch_to_mops_1d(indices_B), - details::torch_to_mops_1d(indices_output) + details::torch_to_mops_1d(indices_output), + stream ); }); +#endif } else { C10_THROW_ERROR( ValueError, @@ -281,6 +301,10 @@ std::vector SparseAccumulationOfProductsBackward::backward( #ifndef MOPS_CUDA_ENABLED C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str()); #else + c10::cuda::CUDAGuard deviceGuard{A.device()}; + cudaStream_t currstream = c10::cuda::getCurrentCUDAStream(); + void* stream = reinterpret_cast(currstream); + AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "sparse_accumulation_of_products_vjp_vjp", [&]() { auto mops_grad_grad_output = mops::Tensor{nullptr, {0, 0}}; if (grad_output.requires_grad()) { From dd29ac0ca88cf545e2ef42b6306dc1c33988e650 Mon Sep 17 00:00:00 2001 From: "Nicholas J. Browning" Date: Fri, 10 May 2024 17:51:54 +0200 Subject: [PATCH 27/28] missing stream. 
--- mops-torch/src/sap.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mops-torch/src/sap.cpp b/mops-torch/src/sap.cpp index 83c904f..c7ae354 100644 --- a/mops-torch/src/sap.cpp +++ b/mops-torch/src/sap.cpp @@ -346,7 +346,8 @@ std::vector SparseAccumulationOfProductsBackward::backward( details::torch_to_mops_1d(C), details::torch_to_mops_1d(indices_A), details::torch_to_mops_1d(indices_B), - details::torch_to_mops_1d(indices_output) + details::torch_to_mops_1d(indices_output), + stream ); }); #endif From 7f92034a9e161c9bd2cb99e4fa32c02569185d2c Mon Sep 17 00:00:00 2001 From: frostedoyster Date: Sun, 12 May 2024 19:56:02 +0200 Subject: [PATCH 28/28] Add stream benchmark --- python/mops-torch/benchmarks/stream.py | 52 ++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 python/mops-torch/benchmarks/stream.py diff --git a/python/mops-torch/benchmarks/stream.py b/python/mops-torch/benchmarks/stream.py new file mode 100644 index 0000000..ffaad7c --- /dev/null +++ b/python/mops-torch/benchmarks/stream.py @@ -0,0 +1,52 @@ +import mops.torch +import torch +from benchmark import benchmark, format_mean_std, initialize +from typing import List + +initialize() + +A = torch.rand(100, 1000, requires_grad=True) +C = torch.rand(2000) +indices_A = torch.randint(1000, size=(2000, 3), dtype=torch.int32) + +@torch.jit.script +def hpe_10(A, C, indices_A): + results = [] + for _ in range(10): + results.append(mops.torch.homogeneous_polynomial_evaluation(A, C, indices_A)) + return torch.sum(torch.concatenate(results)) + +mean_fwd, std_fwd, mean_bwd, std_bwd = benchmark( + lambda: hpe_10(A, C, indices_A) +) + + +@torch.jit.script +def hpe_10_stream(streams: List[torch.classes.cuda.Stream], A, C, indices_A): + results = [] + + for stream in streams: + with torch.cuda.stream(stream): + result = mops.torch.homogeneous_polynomial_evaluation(A, C, indices_A) + results.append(result) + + for stream in streams: + stream.synchronize() + + return torch.sum(torch.concatenate(results)) + +streams = [torch.classes.cuda.Stream() for _ in range(10)] +mean_fwd_stream, std_fwd_stream, mean_bwd_stream, std_bwd_stream = benchmark( + lambda: hpe_10_stream(streams, A, C, indices_A) +) + + +print() +print("Without CUDA streams:") +print("Forward pass:", format_mean_std(mean_fwd, std_fwd)) +print("Backward pass:", format_mean_std(mean_bwd, std_bwd)) + +print() +print("With CUDA streams:") +print("Forward pass:", format_mean_std(mean_fwd_stream, std_fwd_stream)) +print("Backward pass:", format_mean_std(mean_bwd_stream, std_bwd_stream))
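
Note on the stream benchmark above: CUDA streams only affect work submitted to a GPU, so the comparison is only meaningful when the inputs live on a CUDA device and mops was built with CUDA support; with CPU tensors the torch.cuda.stream context leaves execution unchanged and both scripted functions run the same serial path. The following is a minimal sketch of feeding the same scripted functions CUDA tensors, assuming a GPU is available and that the local benchmark / format_mean_std helpers behave as they are used in the file above (their implementation is not shown in this patch series):

    # Sketch only: run the same comparison with the inputs on the GPU, where the
    # per-stream launches in hpe_10_stream have a chance to overlap.
    # Assumes a CUDA-enabled build of mops and reuses `streams`, `hpe_10`,
    # `hpe_10_stream`, `benchmark` and `format_mean_std` from the file above.
    import torch

    if torch.cuda.is_available():
        device = torch.device("cuda")
        A_cuda = torch.rand(100, 1000, device=device, requires_grad=True)
        C_cuda = torch.rand(2000, device=device)
        indices_A_cuda = torch.randint(
            1000, size=(2000, 3), dtype=torch.int32, device=device
        )

        mean_fwd_gpu, std_fwd_gpu, mean_bwd_gpu, std_bwd_gpu = benchmark(
            lambda: hpe_10(A_cuda, C_cuda, indices_A_cuda)
        )
        mean_fwd_gpu_s, std_fwd_gpu_s, mean_bwd_gpu_s, std_bwd_gpu_s = benchmark(
            lambda: hpe_10_stream(streams, A_cuda, C_cuda, indices_A_cuda)
        )

        print()
        print("On CUDA, without streams:", format_mean_std(mean_fwd_gpu, std_fwd_gpu))
        print("On CUDA, with streams:", format_mean_std(mean_fwd_gpu_s, std_fwd_gpu_s))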