From 72ce285190eb0a1bc54c4b2169a6663581b4e54a Mon Sep 17 00:00:00 2001 From: Prashanth Kanduri Date: Mon, 29 Apr 2024 13:22:20 +0200 Subject: [PATCH 01/28] Add CI config files --- ci/docker/Dockerfile.base | 6 ++++++ ci/pipeline.yml | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 ci/docker/Dockerfile.base create mode 100644 ci/pipeline.yml diff --git a/ci/docker/Dockerfile.base b/ci/docker/Dockerfile.base new file mode 100644 index 0000000..f1c85e8 --- /dev/null +++ b/ci/docker/Dockerfile.base @@ -0,0 +1,6 @@ +FROM nvcr.io/nvidia/pytorch:23.10-py3 + +RUN apt-get update + +# install boost test framework +RUN apt-get install -y libboost-test-dev diff --git a/ci/pipeline.yml b/ci/pipeline.yml new file mode 100644 index 0000000..61af310 --- /dev/null +++ b/ci/pipeline.yml @@ -0,0 +1,33 @@ +include: + - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml' + +stages: + - build + - test + +build_base_image_job: + stage: build + extends: .container-builder-dynamic-name + timeout: 2h + variables: + DOCKERFILE: ci/docker/Dockerfile.base + WATCH_FILECHANGES: $DOCKERFILE + PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/base/public/mops + +test_job: + stage: test + extends: .container-runner-daint-gpu + image: $BASE_IMAGE + timeout: 2h + script: + - export CUDA_HOME="/usr/local/cuda" + - python3 -m pip install --upgrade pip + - python3 -m pip install tox + - tox + + variables: + SLURM_JOB_NUM_NODES: 1 + SLURM_PARTITION: normal + SLURM_NTASKS: 1 + SLURM_TIMELIMIT: '00:40:00' + GIT_STRATEGY: fetch From d463d2161b2eacb923854f5c0e159e6ecd496875 Mon Sep 17 00:00:00 2001 From: "Nick J. Browning" Date: Thu, 2 May 2024 10:44:55 +0200 Subject: [PATCH 02/28] added stream management to OPSA --- mops-torch/src/opsa.cpp | 19 +++++++++++++++++-- mops/CMakeLists.txt | 2 +- mops/include/mops/opsa.h | 18 ++++++++++++------ mops/include/mops/opsa.hpp | 33 +++++++++++++++++++++++--------- mops/src/opsa/capi.cpp | 36 +++++++++++++++++++++++------------ mops/src/opsa/opsa.cpp | 32 ++++++++++++++++++++++--------- mops/src/opsa/opsa.cu | 39 ++++++++++++++++++++++++++++---------- 7 files changed, 130 insertions(+), 49 deletions(-) diff --git a/mops-torch/src/opsa.cpp b/mops-torch/src/opsa.cpp index 0b782dd..cdccedf 100644 --- a/mops-torch/src/opsa.cpp +++ b/mops-torch/src/opsa.cpp @@ -1,3 +1,8 @@ +#ifdef MOPS_CUDA_ENABLED +#include +#include +#endif + #include "mops/torch/opsa.hpp" #include "mops/torch/utils.hpp" @@ -48,6 +53,10 @@ torch::Tensor OuterProductScatterAdd::forward( #ifndef MOPS_CUDA_ENABLED C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str()); #else + c10::cuda::CUDAGuard deviceGuard{A.device()}; + cudaStream_t currstream = c10::cuda::getCurrentCUDAStream(); + void* stream = reinterpret_cast(currstream); + output = torch::empty( {output_size, A.size(1), B.size(1)}, torch::TensorOptions().dtype(A.scalar_type()).device(A.device()) @@ -58,7 +67,8 @@ torch::Tensor OuterProductScatterAdd::forward( details::torch_to_mops_3d(output), details::torch_to_mops_2d(A), details::torch_to_mops_2d(B), - details::torch_to_mops_1d(indices_output) + details::torch_to_mops_1d(indices_output), + stream ); }); @@ -130,6 +140,10 @@ std::vector OuterProductScatterAddBackward::forward( #ifndef MOPS_CUDA_ENABLED C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str()); #else + c10::cuda::CUDAGuard deviceGuard{A.device()}; + cudaStream_t currstream = c10::cuda::getCurrentCUDAStream(); + void* stream = 
reinterpret_cast(currstream); + AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "outer_product_scatter_add_vjp", [&]() { auto mops_grad_A = mops::Tensor{nullptr, {0, 0}}; @@ -150,7 +164,8 @@ std::vector OuterProductScatterAddBackward::forward( details::torch_to_mops_3d(grad_output), details::torch_to_mops_2d(A), details::torch_to_mops_2d(B), - details::torch_to_mops_1d(indices_output) + details::torch_to_mops_1d(indices_output), + stream ); }); #endif diff --git a/mops/CMakeLists.txt b/mops/CMakeLists.txt index 846eb47..efc00b7 100644 --- a/mops/CMakeLists.txt +++ b/mops/CMakeLists.txt @@ -110,7 +110,7 @@ add_library(mops if(CMAKE_CUDA_COMPILER AND MOPS_CUDA) target_compile_definitions(mops PUBLIC MOPS_CUDA_ENABLED) - set_target_properties(mops PROPERTIES CUDA_ARCHITECTURES native) + set_target_properties(mops PROPERTIES CUDA_ARCHITECTURES all) set_target_properties(mops PROPERTIES CUDA_NVCC_FLAGS "-lineinfo") target_sources(mops diff --git a/mops/include/mops/opsa.h b/mops/include/mops/opsa.h index 7f608ed..e1ade1a 100644 --- a/mops/include/mops/opsa.h +++ b/mops/include/mops/opsa.h @@ -75,7 +75,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_f32( mops_tensor_3d_f32_t output, mops_tensor_2d_f32_t A, mops_tensor_2d_f32_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add for 64-bit floats @@ -83,7 +84,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_f64( mops_tensor_3d_f64_t output, mops_tensor_2d_f64_t A, mops_tensor_2d_f64_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_vjp for 32-bit floats @@ -93,7 +95,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_vjp_f32( mops_tensor_3d_f32_t grad_output, mops_tensor_2d_f32_t A, mops_tensor_2d_f32_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_vjp for 64-bit floats @@ -103,7 +106,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_vjp_f64( mops_tensor_3d_f64_t grad_output, mops_tensor_2d_f64_t A, mops_tensor_2d_f64_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_vjp_vjp for 32-bit floats @@ -116,7 +120,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_vjp_vjp_f32( mops_tensor_3d_f32_t grad_output, mops_tensor_2d_f32_t A, mops_tensor_2d_f32_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_vjp_vjp for 64-bit floats @@ -129,7 +134,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_vjp_vjp_f64( mops_tensor_3d_f64_t grad_output, mops_tensor_2d_f64_t A, mops_tensor_2d_f64_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); #ifdef __cplusplus diff --git a/mops/include/mops/opsa.hpp b/mops/include/mops/opsa.hpp index 9e7543f..eb6fbe9 100644 --- a/mops/include/mops/opsa.hpp +++ b/mops/include/mops/opsa.hpp @@ -149,15 +149,24 @@ void MOPS_EXPORT outer_product_scatter_add( Tensor output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream = nullptr ); extern template void outer_product_scatter_add( - Tensor output, Tensor A, Tensor B, Tensor indices_output + Tensor output, + Tensor A, + Tensor B, + 
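
For context, the pattern this patch introduces on the mops-torch side is: pin the current device with c10::cuda::CUDAGuard, query the stream PyTorch is currently recording work on, and hand it through the mops API as an opaque void* so that the core library does not have to link against torch. A minimal self-contained sketch of that hand-off follows; launch_on_stream is a hypothetical stand-in for the stream-aware mops entry points, not a function from this patch.

    #include <c10/cuda/CUDAGuard.h>
    #include <c10/cuda/CUDAStream.h>
    #include <cuda_runtime.h>
    #include <torch/torch.h>

    // Hypothetical stream-aware entry point, standing in for e.g.
    // mops::cuda::outer_product_scatter_add(..., void* cuda_stream).
    void launch_on_stream(void* cuda_stream) {
        // The callee recovers the typed stream from the opaque pointer.
        cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
        // ... enqueue kernels on `stream` ...
        (void)stream;
    }

    void forward_like(const torch::Tensor& A) {
        // Make CUDA calls below target the device that owns A.
        c10::cuda::CUDAGuard device_guard{A.device()};
        // Stream PyTorch is currently using on that device; work queued here
        // stays ordered with respect to the surrounding torch operations.
        cudaStream_t current = c10::cuda::getCurrentCUDAStream();
        launch_on_stream(reinterpret_cast<void*>(current));
    }

Passing the stream as void* keeps cuda_runtime.h and the c10 headers out of the public mops headers, which is why every C and C++ signature below only grows a `void* cuda_stream` parameter.
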
Tensor indices_output, + void* cuda_stream ); extern template void outer_product_scatter_add( - Tensor output, Tensor A, Tensor B, Tensor indices_output + Tensor output, + Tensor A, + Tensor B, + Tensor indices_output, + void* cuda_stream ); template @@ -167,7 +176,8 @@ void MOPS_EXPORT outer_product_scatter_add_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream = nullptr ); // these templates will be precompiled and provided in the mops library @@ -177,7 +187,8 @@ extern template void outer_product_scatter_add_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); extern template void outer_product_scatter_add_vjp( @@ -186,7 +197,8 @@ extern template void outer_product_scatter_add_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); /// TODO @@ -200,7 +212,8 @@ void MOPS_EXPORT outer_product_scatter_add_vjp_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream = nullptr ); // these templates will be precompiled and provided in the mops library @@ -213,7 +226,8 @@ extern template void outer_product_scatter_add_vjp_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); extern template void outer_product_scatter_add_vjp_vjp( @@ -225,7 +239,8 @@ extern template void outer_product_scatter_add_vjp_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); } // namespace cuda diff --git a/mops/src/opsa/capi.cpp b/mops/src/opsa/capi.cpp index 2be0a2b..2098a6e 100644 --- a/mops/src/opsa/capi.cpp +++ b/mops/src/opsa/capi.cpp @@ -172,7 +172,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_f32( mops_tensor_3d_f32_t output, mops_tensor_2d_f32_t A, mops_tensor_2d_f32_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add( @@ -182,7 +183,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_f32( checked_cast(output.shape[2])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -191,7 +193,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_f64( mops_tensor_3d_f64_t output, mops_tensor_2d_f64_t A, mops_tensor_2d_f64_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add( @@ -201,7 +204,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_f64( checked_cast(output.shape[2])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -212,7 +216,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_vjp_f32( mops_tensor_3d_f32_t grad_output, mops_tensor_2d_f32_t A, mops_tensor_2d_f32_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { 
MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_vjp( @@ -224,7 +229,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_vjp_f32( checked_cast(grad_output.shape[2])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -235,7 +241,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_vjp_f64( mops_tensor_3d_f64_t grad_output, mops_tensor_2d_f64_t A, mops_tensor_2d_f64_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_vjp( @@ -247,7 +254,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_vjp_f64( checked_cast(grad_output.shape[2])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -261,7 +269,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_vjp_vjp_f32( mops_tensor_3d_f32_t grad_output, mops_tensor_2d_f32_t A, mops_tensor_2d_f32_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_vjp_vjp( @@ -281,7 +290,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_vjp_vjp_f32( checked_cast(grad_output.shape[2])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -295,7 +305,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_vjp_vjp_f64( mops_tensor_3d_f64_t grad_output, mops_tensor_2d_f64_t A, mops_tensor_2d_f64_t B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_vjp_vjp( @@ -315,7 +326,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_vjp_vjp_f64( checked_cast(grad_output.shape[2])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } diff --git a/mops/src/opsa/opsa.cpp b/mops/src/opsa/opsa.cpp index 7a82088..7eab3ad 100644 --- a/mops/src/opsa/opsa.cpp +++ b/mops/src/opsa/opsa.cpp @@ -54,17 +54,25 @@ template void mops::outer_product_scatter_add_vjp_vjp( #ifndef MOPS_CUDA_ENABLED template void mops::cuda:: - outer_product_scatter_add(Tensor, Tensor, Tensor, Tensor) { + outer_product_scatter_add(Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } // explicit instantiations of CUDA templates template void mops::cuda::outer_product_scatter_add( - Tensor output, Tensor A, Tensor B, Tensor indices_output + Tensor output, + Tensor A, + Tensor B, + Tensor indices_output, + void* cuda_stream ); template void 
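
On the C side the new parameter is simply appended to every signature. A rough usage sketch against the f32 OPSA entry point is shown below, with hypothetical buffer names and sizes; the umbrella header name and the assumption that a zero return value means success are not stated in this patch and are only assumptions here. Passing nullptr instead of a created stream selects the legacy default stream, which is also what the C++ default argument `void* cuda_stream = nullptr` resolves to.

    #include <cuda_runtime.h>
    #include <mops.h>  // assumed umbrella header exposing the C API

    int run_opsa_f32(float* d_A, float* d_B, float* d_out, int32_t* d_idx,
                     int64_t n_edges, int64_t n_A, int64_t n_B, int64_t n_out) {
        // Fill the C tensor descriptors (device pointers plus shapes).
        mops_tensor_2d_f32_t A;      A.data = d_A;  A.shape[0] = n_edges; A.shape[1] = n_A;
        mops_tensor_2d_f32_t B;      B.data = d_B;  B.shape[0] = n_edges; B.shape[1] = n_B;
        mops_tensor_3d_f32_t output; output.data = d_out;
        output.shape[0] = n_out; output.shape[1] = n_A; output.shape[2] = n_B;
        mops_tensor_1d_i32_t indices_output; indices_output.data = d_idx;
        indices_output.shape[0] = n_edges;

        cudaStream_t stream = nullptr;
        cudaStreamCreate(&stream);

        // New trailing argument: the stream to enqueue the kernels on.
        // Passing nullptr would use the legacy default stream instead.
        int status = mops_cuda_outer_product_scatter_add_f32(
            output, A, B, indices_output, static_cast<void*>(stream));

        cudaStreamSynchronize(stream);
        cudaStreamDestroy(stream);
        return status;  // assumed: 0 on success, non-zero if an exception was caught
    }
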
mops::cuda::outer_product_scatter_add( - Tensor output, Tensor A, Tensor B, Tensor indices_output + Tensor output, + Tensor A, + Tensor B, + Tensor indices_output, + void* cuda_stream ); template @@ -74,7 +82,8 @@ void mops::cuda::outer_product_scatter_add_vjp( Tensor /*grad_output*/, Tensor /*A*/, Tensor /*B*/, - Tensor /*indices_output*/ + Tensor /*indices_output*/, + void* /*cudaStream_t*/ ) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } @@ -85,7 +94,8 @@ template void mops::cuda::outer_product_scatter_add_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add_vjp( @@ -94,7 +104,8 @@ template void mops::cuda::outer_product_scatter_add_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template @@ -107,7 +118,8 @@ void mops::cuda::outer_product_scatter_add_vjp_vjp( Tensor /*grad_output*/, Tensor /*A*/, Tensor /*B*/, - Tensor /*indices_output*/ + Tensor /*indices_output*/, + void* /*cudaStream_t*/ ) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } @@ -121,7 +133,8 @@ template void mops::cuda::outer_product_scatter_add_vjp_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add_vjp_vjp( @@ -133,7 +146,8 @@ template void mops::cuda::outer_product_scatter_add_vjp_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); #endif diff --git a/mops/src/opsa/opsa.cu b/mops/src/opsa/opsa.cu index a831335..b05b227 100644 --- a/mops/src/opsa/opsa.cu +++ b/mops/src/opsa/opsa.cu @@ -66,10 +66,13 @@ void mops::cuda::outer_product_scatter_add( Tensor output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ) { check_opsa(output, A, B, indices_output, "cuda_outer_product_scatter_add"); + cudaStream_t cstream = reinterpret_cast(cuda_stream); + int32_t* first_occurences = calculate_first_occurences_cuda( indices_output.data, indices_output.shape[0], output.shape[0] ); @@ -78,7 +81,7 @@ void mops::cuda::outer_product_scatter_add( dim3 blockDim(WARP_SIZE * NWARPS_PER_BLOCK, 1, 1); - outer_product_scatter_add_kernel<<>>( + outer_product_scatter_add_kernel<<>>( A, B, mops::Tensor{first_occurences, {output.shape[0] * 2}}, indices_output, output ); @@ -89,11 +92,19 @@ void mops::cuda::outer_product_scatter_add( // explicit instantiations of CUDA templates template void mops::cuda::outer_product_scatter_add( - Tensor output, Tensor A, Tensor B, Tensor indices_output + Tensor output, + Tensor A, + Tensor B, + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add( - Tensor output, Tensor A, Tensor B, Tensor indices_output + Tensor output, + Tensor A, + Tensor B, + Tensor indices_output, + void* cuda_stream ); template @@ -240,12 +251,15 @@ void mops::cuda::outer_product_scatter_add_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ) { check_opsa_vjp( grad_A, grad_B, grad_output, A, B, indices_output, "cuda_outer_product_scatter_add_vjp" ); + cudaStream_t cstream = reinterpret_cast(cuda_stream); + int32_t* first_occurences = calculate_first_occurences_cuda( indices_output.data, indices_output.shape[0], grad_output.shape[0] ); @@ -290,7 +304,8 @@ template void 
mops::cuda::outer_product_scatter_add_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add_vjp( @@ -299,7 +314,8 @@ template void mops::cuda::outer_product_scatter_add_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template @@ -312,7 +328,8 @@ void mops::cuda::outer_product_scatter_add_vjp_vjp( Tensor /*grad_output*/, Tensor /*A*/, Tensor /*B*/, - Tensor /*indices_output*/ + Tensor /*indices_output*/, + void* /*cudaStream_t*/ ) { throw std::runtime_error("Not implemented"); } @@ -327,7 +344,8 @@ template void mops::cuda::outer_product_scatter_add_vjp_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add_vjp_vjp( @@ -339,5 +357,6 @@ template void mops::cuda::outer_product_scatter_add_vjp_vjp( Tensor grad_output, Tensor A, Tensor B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); From d81ec9cadeb44653b0a7bd7acffa10701bda3b8d Mon Sep 17 00:00:00 2001 From: "Nick J. Browning" Date: Thu, 2 May 2024 11:07:07 +0200 Subject: [PATCH 03/28] x --- mops/src/opsa/opsa.cu | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/mops/src/opsa/opsa.cu b/mops/src/opsa/opsa.cu index b05b227..f500f44 100644 --- a/mops/src/opsa/opsa.cu +++ b/mops/src/opsa/opsa.cu @@ -71,6 +71,14 @@ void mops::cuda::outer_product_scatter_add( ) { check_opsa(output, A, B, indices_output, "cuda_outer_product_scatter_add"); + cudaPointerAttributes attributes; + CUDA_CHECK_ERROR(cudaPointerGetAttributes(&attributes, A.data)); + int current_device; + CUDA_CHECK_ERROR(cudaGetDevice(¤t_device)); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(attributes.device)); + } + cudaStream_t cstream = reinterpret_cast(cuda_stream); int32_t* first_occurences = calculate_first_occurences_cuda( @@ -86,8 +94,11 @@ void mops::cuda::outer_product_scatter_add( ); CUDA_CHECK_ERROR(cudaGetLastError()); - CUDA_CHECK_ERROR(cudaDeviceSynchronize()); + + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(current_device)); + } } // explicit instantiations of CUDA templates @@ -258,6 +269,14 @@ void mops::cuda::outer_product_scatter_add_vjp( grad_A, grad_B, grad_output, A, B, indices_output, "cuda_outer_product_scatter_add_vjp" ); + cudaPointerAttributes attributes; + CUDA_CHECK_ERROR(cudaPointerGetAttributes(&attributes, A.data)); + int current_device; + CUDA_CHECK_ERROR(cudaGetDevice(¤t_device)); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(attributes.device)); + } + cudaStream_t cstream = reinterpret_cast(cuda_stream); int32_t* first_occurences = calculate_first_occurences_cuda( @@ -282,7 +301,7 @@ void mops::cuda::outer_product_scatter_add_vjp( shared_array(NWARPS_PER_BLOCK * B.shape[1], sptr, &space); } - outer_product_scatter_add_vjp_kernel<<>>( + outer_product_scatter_add_vjp_kernel<<>>( A, B, mops::Tensor{first_occurences, {grad_output.shape[0]}}, @@ -293,8 +312,11 @@ void mops::cuda::outer_product_scatter_add_vjp( ); CUDA_CHECK_ERROR(cudaGetLastError()); - CUDA_CHECK_ERROR(cudaDeviceSynchronize()); + + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(current_device)); + } } // these templates will be precompiled and provided in the mops library From 
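
The implementation-side pattern that patches 02 and 03 establish here, and that patch 04 below copies into the other kernels, is: recover the typed stream from the opaque pointer, switch to the device that owns the input pointer (remembering the caller's device), launch with the stream as the fourth launch-configuration argument, and synchronize only that stream rather than the whole device. A condensed, self-contained sketch of the same flow with a dummy kernel follows; scale_kernel and the CHECK_CUDA macro are illustrative, not the mops ones.

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    #define CHECK_CUDA(expr)                                              \
        do {                                                              \
            cudaError_t err_ = (expr);                                    \
            if (err_ != cudaSuccess) {                                    \
                std::fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__,   \
                             cudaGetErrorString(err_));                   \
                std::abort();                                             \
            }                                                             \
        } while (0)

    __global__ void scale_kernel(float* data, float factor, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            data[i] *= factor;
        }
    }

    void scale_on_stream(float* d_data, int n, void* cuda_stream) {
        // Same cast used by the mops kernels to recover the stream.
        cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);

        // Switch to the device that owns d_data, remembering the current one.
        cudaPointerAttributes attributes;
        CHECK_CUDA(cudaPointerGetAttributes(&attributes, d_data));
        int current_device = 0;
        CHECK_CUDA(cudaGetDevice(&current_device));
        if (current_device != attributes.device) {
            CHECK_CUDA(cudaSetDevice(attributes.device));
        }

        dim3 block(256);
        dim3 grid((n + block.x - 1) / block.x);
        // Fourth launch-config argument: enqueue on the caller's stream.
        scale_kernel<<<grid, block, 0, stream>>>(d_data, 2.0f, n);
        CHECK_CUDA(cudaGetLastError());
        // Wait for this stream only, not for every stream on the device.
        CHECK_CUDA(cudaStreamSynchronize(stream));

        // Restore the device the caller had active.
        if (current_device != attributes.device) {
            CHECK_CUDA(cudaSetDevice(current_device));
        }
    }
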
c8d87768b5ebd07332322864d6fd045d1f1bbd3c Mon Sep 17 00:00:00 2001 From: "Nick J. Browning" Date: Thu, 2 May 2024 11:51:40 +0200 Subject: [PATCH 04/28] added support for other ops. --- mops-torch/src/hpe.cpp | 27 ++++++++++++-- mops-torch/src/opsaw.cpp | 5 +++ mops-torch/src/sap.cpp | 5 +++ mops-torch/src/sasaw.cpp | 5 +++ mops/CMakeLists.txt | 2 +- mops/include/mops/hpe.h | 18 ++++++---- mops/include/mops/hpe.hpp | 24 ++++++++----- mops/include/mops/opsaw.h | 18 ++++++---- mops/include/mops/opsaw.hpp | 27 +++++++++----- mops/include/mops/sap.h | 18 ++++++---- mops/include/mops/sap.hpp | 27 +++++++++----- mops/include/mops/sasaw.h | 18 ++++++---- mops/include/mops/sasaw.hpp | 27 +++++++++----- mops/src/hpe/capi.cpp | 36 ++++++++++++------- mops/src/hpe/hpe.cpp | 30 +++++++++++----- mops/src/hpe/hpe.cu | 54 ++++++++++++++++++++++------ mops/src/opsa/opsa.cu | 4 +-- mops/src/opsaw/capi.cpp | 36 ++++++++++++------- mops/src/sap/capi.cpp | 36 ++++++++++++------- mops/src/sap/sap.cpp | 25 ++++++++----- mops/src/sap/sap.cu | 72 ++++++++++++++++++++++++++++++------- mops/src/sasaw/capi.cpp | 36 ++++++++++++------- mops/src/sasaw/sasaw.cpp | 33 +++++++++-------- 23 files changed, 415 insertions(+), 168 deletions(-) diff --git a/mops-torch/src/hpe.cpp b/mops-torch/src/hpe.cpp index dc74031..08f5664 100644 --- a/mops-torch/src/hpe.cpp +++ b/mops-torch/src/hpe.cpp @@ -1,3 +1,8 @@ +#ifdef MOPS_CUDA_ENABLED +#include +#include +#endif + #include "mops/torch/hpe.hpp" #include "mops/torch/utils.hpp" @@ -38,15 +43,25 @@ torch::Tensor HomogeneousPolynomialEvaluation::forward( }); } else if (A.device().is_cuda()) { +#ifndef MOPS_CUDA_ENABLED + C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str()); +#else + c10::cuda::CUDAGuard deviceGuard{A.device()}; + cudaStream_t currstream = c10::cuda::getCurrentCUDAStream(); + void* stream = reinterpret_cast(currstream); + AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "homogeneous_polynomial_evaluation", [&]() { mops::cuda::homogeneous_polynomial_evaluation( details::torch_to_mops_1d(output), details::torch_to_mops_2d(A), details::torch_to_mops_1d(C), - details::torch_to_mops_2d(indices_A) + details::torch_to_mops_2d(indices_A), + stream ); }); +#endif + } else { C10_THROW_ERROR( ValueError, @@ -108,6 +123,12 @@ torch::Tensor HomogeneousPolynomialEvaluationBackward::forward( ); }); } else if (A.device().is_cuda()) { +#ifndef MOPS_CUDA_ENABLED + C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str()); +#else + c10::cuda::CUDAGuard deviceGuard{A.device()}; + cudaStream_t currstream = c10::cuda::getCurrentCUDAStream(); + void* stream = reinterpret_cast(currstream); AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "homogeneous_polynomial_evaluation_vjp", [&]() { auto mops_grad_A = mops::Tensor{nullptr, {0, 0}}; @@ -121,9 +142,11 @@ torch::Tensor HomogeneousPolynomialEvaluationBackward::forward( details::torch_to_mops_1d(grad_output), details::torch_to_mops_2d(A), details::torch_to_mops_1d(C), - details::torch_to_mops_2d(indices_A) + details::torch_to_mops_2d(indices_A), + stream ); }); +#endif } else { C10_THROW_ERROR( ValueError, diff --git a/mops-torch/src/opsaw.cpp b/mops-torch/src/opsaw.cpp index 608598f..a8f313d 100644 --- a/mops-torch/src/opsaw.cpp +++ b/mops-torch/src/opsaw.cpp @@ -1,3 +1,8 @@ +#ifdef MOPS_CUDA_ENABLED +#include +#include +#endif + #include "mops/torch/opsaw.hpp" #include "mops/torch/utils.hpp" diff --git a/mops-torch/src/sap.cpp b/mops-torch/src/sap.cpp index a68cca5..aca6d49 100644 
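
Each of the torch bindings touched by this patch follows the same shape: an MOPS_CUDA_ENABLED compile-time gate, a CUDAGuard plus current-stream lookup, and an AT_DISPATCH_FLOATING_TYPES block that instantiates the templated mops call for the tensor's scalar type. A stripped-down sketch of that dispatch structure is given below; my_cuda_op is a hypothetical templated backend rather than a mops function, and TORCH_CHECK is used for brevity where the bindings use C10_THROW_ERROR.

    #include <ATen/ATen.h>
    #include <ATen/Dispatch.h>

    // Hypothetical templated backend, standing in for the mops::cuda:: calls.
    template <typename scalar_t>
    void my_cuda_op(scalar_t* data, int64_t n, void* cuda_stream);

    void dispatch_like(const at::Tensor& A, void* cuda_stream) {
    #ifndef MOPS_CUDA_ENABLED
        TORCH_CHECK(false, "MOPS was not compiled with CUDA support");
    #else
        // Expands to a switch over float/double; inside the lambda the alias
        // `scalar_t` names the concrete element type of A.
        AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "my_cuda_op", [&]() {
            my_cuda_op<scalar_t>(A.data_ptr<scalar_t>(), A.numel(), cuda_stream);
        });
    #endif
    }
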
--- a/mops-torch/src/sap.cpp +++ b/mops-torch/src/sap.cpp @@ -1,3 +1,8 @@ +#ifdef MOPS_CUDA_ENABLED +#include +#include +#endif + #include "mops/torch/sap.hpp" #include "mops/torch/utils.hpp" diff --git a/mops-torch/src/sasaw.cpp b/mops-torch/src/sasaw.cpp index 2868e23..ee8ea76 100644 --- a/mops-torch/src/sasaw.cpp +++ b/mops-torch/src/sasaw.cpp @@ -1,3 +1,8 @@ +#ifdef MOPS_CUDA_ENABLED +#include +#include +#endif + #include "mops/torch/sasaw.hpp" #include "mops/torch/utils.hpp" diff --git a/mops/CMakeLists.txt b/mops/CMakeLists.txt index efc00b7..846eb47 100644 --- a/mops/CMakeLists.txt +++ b/mops/CMakeLists.txt @@ -110,7 +110,7 @@ add_library(mops if(CMAKE_CUDA_COMPILER AND MOPS_CUDA) target_compile_definitions(mops PUBLIC MOPS_CUDA_ENABLED) - set_target_properties(mops PROPERTIES CUDA_ARCHITECTURES all) + set_target_properties(mops PROPERTIES CUDA_ARCHITECTURES native) set_target_properties(mops PROPERTIES CUDA_NVCC_FLAGS "-lineinfo") target_sources(mops diff --git a/mops/include/mops/hpe.h b/mops/include/mops/hpe.h index ca18489..d1cf15b 100644 --- a/mops/include/mops/hpe.h +++ b/mops/include/mops/hpe.h @@ -69,7 +69,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_f32( mops_tensor_1d_f32_t output, mops_tensor_2d_f32_t A, mops_tensor_1d_f32_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ); /// CUDA version of mops::homogeneous_polynomial_evaluation for 64-bit floats @@ -77,7 +78,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_f64( mops_tensor_1d_f64_t output, mops_tensor_2d_f64_t A, mops_tensor_1d_f64_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ); /// CUDA version of mops::homogeneous_polynomial_evaluation_vjp for 32-bit floats @@ -86,7 +88,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_vjp_f32( mops_tensor_1d_f32_t grad_output, mops_tensor_2d_f32_t A, mops_tensor_1d_f32_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ); /// CUDA version of mops::homogeneous_polynomial_evaluation_vjp for 64-bit floats @@ -95,7 +98,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_vjp_f64( mops_tensor_1d_f64_t grad_output, mops_tensor_2d_f64_t A, mops_tensor_1d_f64_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ); /// CUDA version of mops::homogeneous_polynomial_evaluation_vjp_vjp for 32-bit floats @@ -106,7 +110,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_vjp_vjp_f32( mops_tensor_1d_f32_t grad_output, mops_tensor_2d_f32_t A, mops_tensor_1d_f32_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ); /// CUDA version of mops::homogeneous_polynomial_evaluation_vjp_vjp for 64-bit floats @@ -117,7 +122,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_vjp_vjp_f64( mops_tensor_1d_f64_t grad_output, mops_tensor_2d_f64_t A, mops_tensor_1d_f64_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ); #ifdef __cplusplus diff --git a/mops/include/mops/hpe.hpp b/mops/include/mops/hpe.hpp index 472417f..c3f6425 100644 --- a/mops/include/mops/hpe.hpp +++ b/mops/include/mops/hpe.hpp @@ -122,15 +122,15 @@ namespace cuda { /// CUDA version of mops::homogeneous_polynomial_evaluation template void MOPS_EXPORT homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A + Tensor output, Tensor A, Tensor C, Tensor indices_A, void* cuda_stream 
= nullptr ); extern template void homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A + Tensor output, Tensor A, Tensor C, Tensor indices_A, void* cuda_stream ); extern template void homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A + Tensor output, Tensor A, Tensor C, Tensor indices_A, void* cuda_stream ); template @@ -139,7 +139,8 @@ void MOPS_EXPORT homogeneous_polynomial_evaluation_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* cuda_stream = nullptr ); extern template void homogeneous_polynomial_evaluation_vjp( @@ -147,7 +148,8 @@ extern template void homogeneous_polynomial_evaluation_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* cuda_stream ); extern template void homogeneous_polynomial_evaluation_vjp( @@ -155,7 +157,8 @@ extern template void homogeneous_polynomial_evaluation_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* cuda_stream ); template @@ -166,7 +169,8 @@ void MOPS_EXPORT homogeneous_polynomial_evaluation_vjp_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* cuda_stream = nullptr ); extern template void homogeneous_polynomial_evaluation_vjp_vjp( @@ -176,7 +180,8 @@ extern template void homogeneous_polynomial_evaluation_vjp_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* cuda_stream ); extern template void homogeneous_polynomial_evaluation_vjp_vjp( @@ -186,7 +191,8 @@ extern template void homogeneous_polynomial_evaluation_vjp_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* cuda_stream ); } // namespace cuda diff --git a/mops/include/mops/opsaw.h b/mops/include/mops/opsaw.h index feafb11..ffa8566 100644 --- a/mops/include/mops/opsaw.h +++ b/mops/include/mops/opsaw.h @@ -95,7 +95,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_with_weights_f32( mops_tensor_2d_f32_t B, mops_tensor_2d_f32_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_with_weights for 64-bit floats @@ -105,7 +106,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_with_weights_f64( mops_tensor_2d_f64_t B, mops_tensor_2d_f64_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_with_weights_vjp for 32-bit floats @@ -118,7 +120,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_with_weights_vjp_f32( mops_tensor_2d_f32_t B, mops_tensor_2d_f32_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_with_weights_vjp for 64-bit floats @@ -131,7 +134,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_with_weights_vjp_f64( mops_tensor_2d_f64_t B, mops_tensor_2d_f64_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_with_weights_vjp_vjp for 32-bit floats @@ -148,7 +152,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_with_weights_vjp_vjp_f32( mops_tensor_2d_f32_t B, mops_tensor_2d_f32_t W, mops_tensor_1d_i32_t 
indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_with_weights_vjp_vjp for 64-bit floats @@ -165,7 +170,8 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_with_weights_vjp_vjp_f64( mops_tensor_2d_f64_t B, mops_tensor_2d_f64_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); #ifdef __cplusplus diff --git a/mops/include/mops/opsaw.hpp b/mops/include/mops/opsaw.hpp index 656bdde..3f71db6 100644 --- a/mops/include/mops/opsaw.hpp +++ b/mops/include/mops/opsaw.hpp @@ -194,7 +194,8 @@ void MOPS_EXPORT outer_product_scatter_add_with_weights( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void* cuda_stream = nullptr ); extern template void outer_product_scatter_add_with_weights( @@ -203,7 +204,8 @@ extern template void outer_product_scatter_add_with_weights( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); extern template void outer_product_scatter_add_with_weights( @@ -212,7 +214,8 @@ extern template void outer_product_scatter_add_with_weights( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); /// CUDA version of mops::outer_product_scatter_add_with_weights_vjp @@ -226,7 +229,8 @@ void MOPS_EXPORT outer_product_scatter_add_with_weights_vjp( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void* cuda_stream = nullptr ); extern template void outer_product_scatter_add_with_weights_vjp( @@ -238,7 +242,8 @@ extern template void outer_product_scatter_add_with_weights_vjp( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); extern template void outer_product_scatter_add_with_weights_vjp( @@ -250,7 +255,8 @@ extern template void outer_product_scatter_add_with_weights_vjp( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); /// TODO @@ -268,7 +274,8 @@ void MOPS_EXPORT outer_product_scatter_add_with_weights_vjp_vjp( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void* cuda_stream = nullptr ); // these templates will be precompiled and provided in the mops library @@ -285,7 +292,8 @@ extern template void outer_product_scatter_add_with_weights_vjp_vjp( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); extern template void outer_product_scatter_add_with_weights_vjp_vjp( @@ -301,7 +309,8 @@ extern template void outer_product_scatter_add_with_weights_vjp_vjp( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); } // namespace cuda diff --git a/mops/include/mops/sap.h b/mops/include/mops/sap.h index 5990e57..210b9bf 100644 --- a/mops/include/mops/sap.h +++ b/mops/include/mops/sap.h @@ -96,7 +96,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_of_products_f32( mops_tensor_1d_f32_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_of_products for 64-bit floats @@ -107,7 +108,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_of_products_f64( mops_tensor_1d_f64_t C, 
mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_of_products_vjp for 32-bit floats @@ -120,7 +122,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_of_products_vjp_f32( mops_tensor_1d_f32_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_of_products_vjp for 64-bit floats @@ -133,7 +136,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_of_products_vjp_f64( mops_tensor_1d_f64_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_of_products_vjp_vjp for 32-bit floats @@ -149,7 +153,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_of_products_vjp_vjp_f32( mops_tensor_1d_f32_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_of_products_vjp_vjp for 64-bit floats @@ -165,7 +170,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_of_products_vjp_vjp_f64( mops_tensor_1d_f64_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ); #ifdef __cplusplus diff --git a/mops/include/mops/sap.hpp b/mops/include/mops/sap.hpp index a27090a..4568297 100644 --- a/mops/include/mops/sap.hpp +++ b/mops/include/mops/sap.hpp @@ -191,7 +191,8 @@ void MOPS_EXPORT sparse_accumulation_of_products( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream = nullptr ); extern template void sparse_accumulation_of_products( @@ -201,7 +202,8 @@ extern template void sparse_accumulation_of_products( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); extern template void sparse_accumulation_of_products( @@ -211,7 +213,8 @@ extern template void sparse_accumulation_of_products( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_of_products_vjp @@ -225,7 +228,8 @@ void MOPS_EXPORT sparse_accumulation_of_products_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream = nullptr ); extern template void sparse_accumulation_of_products_vjp( @@ -237,7 +241,8 @@ extern template void sparse_accumulation_of_products_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); extern template void sparse_accumulation_of_products_vjp( @@ -249,7 +254,8 @@ extern template void sparse_accumulation_of_products_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); /// TODO @@ -266,7 +272,8 @@ void MOPS_EXPORT sparse_accumulation_of_products_vjp_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream = nullptr ); // these templates will be precompiled and provided in the mops library @@ -282,7 +289,8 @@ 
extern template void sparse_accumulation_of_products_vjp_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); extern template void sparse_accumulation_of_products_vjp_vjp( @@ -297,7 +305,8 @@ extern template void sparse_accumulation_of_products_vjp_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); } // namespace cuda diff --git a/mops/include/mops/sasaw.h b/mops/include/mops/sasaw.h index 75170d7..1c4b88c 100644 --- a/mops/include/mops/sasaw.h +++ b/mops/include/mops/sasaw.h @@ -127,7 +127,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_f32( mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void * cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with for 64-bit floats @@ -141,7 +142,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_f64( mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void * cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights_vjp for @@ -159,7 +161,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_f32( mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void * cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights_vjp for @@ -177,7 +180,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_f64( mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void * cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights_vjp_vjp for @@ -199,7 +203,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_vjp_f mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void * cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights_vjp_vjp for @@ -221,7 +226,8 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_vjp_f mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void * cuda_stream ); #ifdef __cplusplus diff --git a/mops/include/mops/sasaw.hpp b/mops/include/mops/sasaw.hpp index 075db69..8654eb9 100644 --- a/mops/include/mops/sasaw.hpp +++ b/mops/include/mops/sasaw.hpp @@ -253,7 +253,8 @@ void MOPS_EXPORT sparse_accumulation_scatter_add_with_weights( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream = nullptr ); extern template void sparse_accumulation_scatter_add_with_weights( @@ -266,7 +267,8 @@ extern template void sparse_accumulation_scatter_add_with_weights( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor 
indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); extern template void sparse_accumulation_scatter_add_with_weights( @@ -279,7 +281,8 @@ extern template void sparse_accumulation_scatter_add_with_weights( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights_vjp @@ -297,7 +300,8 @@ void MOPS_EXPORT sparse_accumulation_scatter_add_with_weights_vjp( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream = nullptr ); extern template void sparse_accumulation_scatter_add_with_weights_vjp( @@ -313,7 +317,8 @@ extern template void sparse_accumulation_scatter_add_with_weights_vjp( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); extern template void sparse_accumulation_scatter_add_with_weights_vjp( @@ -329,7 +334,8 @@ extern template void sparse_accumulation_scatter_add_with_weights_vjp( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); /// TODO @@ -351,7 +357,8 @@ void MOPS_EXPORT sparse_accumulation_scatter_add_with_weights_vjp_vjp( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream = nullptr ); // these templates will be precompiled and provided in the mops library @@ -372,7 +379,8 @@ extern template void sparse_accumulation_scatter_add_with_weights_vjp_vjp( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); extern template void sparse_accumulation_scatter_add_with_weights_vjp_vjp( @@ -392,7 +400,8 @@ extern template void sparse_accumulation_scatter_add_with_weights_vjp_vjp( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); } // namespace cuda diff --git a/mops/src/hpe/capi.cpp b/mops/src/hpe/capi.cpp index 7dd5e92..01e3056 100644 --- a/mops/src/hpe/capi.cpp +++ b/mops/src/hpe/capi.cpp @@ -132,14 +132,16 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_f32( mops_tensor_1d_f32_t output, mops_tensor_2d_f32_t A, mops_tensor_1d_f32_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::homogeneous_polynomial_evaluation( {output.data, {checked_cast(output.shape[0])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {C.data, {checked_cast(C.shape[0])}}, - {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}} + {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -148,14 +150,16 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_f64( mops_tensor_1d_f64_t output, mops_tensor_2d_f64_t A, mops_tensor_1d_f64_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::homogeneous_polynomial_evaluation( {output.data, {checked_cast(output.shape[0])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {C.data, {checked_cast(C.shape[0])}}, - {indices_A.data, {checked_cast(indices_A.shape[0]), 
checked_cast(indices_A.shape[1])}} + {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -165,7 +169,8 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_vjp_f32( mops_tensor_1d_f32_t grad_output, mops_tensor_2d_f32_t A, mops_tensor_1d_f32_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::homogeneous_polynomial_evaluation_vjp( @@ -173,7 +178,8 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_vjp_f32( {grad_output.data, {checked_cast(grad_output.shape[0])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {C.data, {checked_cast(C.shape[0])}}, - {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}} + {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -183,7 +189,8 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_vjp_f64( mops_tensor_1d_f64_t grad_output, mops_tensor_2d_f64_t A, mops_tensor_1d_f64_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::homogeneous_polynomial_evaluation_vjp( @@ -191,7 +198,8 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_vjp_f64( {grad_output.data, {checked_cast(grad_output.shape[0])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {C.data, {checked_cast(C.shape[0])}}, - {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}} + {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -203,7 +211,8 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_vjp_vjp_f32( mops_tensor_1d_f32_t grad_output, mops_tensor_2d_f32_t A, mops_tensor_1d_f32_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( @@ -214,7 +223,8 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_vjp_vjp_f32( {grad_output.data, {checked_cast(grad_output.shape[0])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {C.data, {checked_cast(C.shape[0])}}, - {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}} + {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -226,7 +236,8 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_vjp_vjp_f64( mops_tensor_1d_f64_t grad_output, mops_tensor_2d_f64_t A, mops_tensor_1d_f64_t C, - mops_tensor_2d_i32_t indices_A + mops_tensor_2d_i32_t indices_A, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( @@ -237,7 +248,8 @@ extern "C" int mops_cuda_homogeneous_polynomial_evaluation_vjp_vjp_f64( {grad_output.data, {checked_cast(grad_output.shape[0])}}, {A.data, {checked_cast(A.shape[0]), checked_cast(A.shape[1])}}, {C.data, {checked_cast(C.shape[0])}}, - {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}} + {indices_A.data, {checked_cast(indices_A.shape[0]), checked_cast(indices_A.shape[1])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } diff --git a/mops/src/hpe/hpe.cpp b/mops/src/hpe/hpe.cpp index 1fcf0ba..52602ae 100644 --- 
a/mops/src/hpe/hpe.cpp +++ b/mops/src/hpe/hpe.cpp @@ -48,29 +48,37 @@ template void mops::homogeneous_polynomial_evaluation_vjp_vjp( #ifndef MOPS_CUDA_ENABLED template void mops::cuda:: - homogeneous_polynomial_evaluation(Tensor, Tensor, Tensor, Tensor) { + homogeneous_polynomial_evaluation(Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } template void mops::cuda:: - homogeneous_polynomial_evaluation_vjp(Tensor, Tensor, Tensor, Tensor, Tensor) { + homogeneous_polynomial_evaluation_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } template void mops::cuda:: - homogeneous_polynomial_evaluation_vjp_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) { + homogeneous_polynomial_evaluation_vjp_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } // explicit instantiations of CUDA templates template void mops::cuda::homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A + Tensor output, + Tensor A, + Tensor C, + Tensor indices_A, + void* stream ); template void mops::cuda::homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A + Tensor output, + Tensor A, + Tensor C, + Tensor indices_A, + void* stream ); template void mops::cuda::homogeneous_polynomial_evaluation_vjp( @@ -78,7 +86,8 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* stream ); template void mops::cuda::homogeneous_polynomial_evaluation_vjp( @@ -86,7 +95,8 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* stream ); template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( @@ -96,7 +106,8 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* stream ); template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( @@ -106,7 +117,8 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void* stream ); #endif diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index 964c4b4..7e36468 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -108,11 +108,21 @@ __global__ void homogeneous_polynomial_evaluation_kernel( template void mops::cuda::homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A + Tensor output, Tensor A, Tensor C, Tensor indices_A, void * cuda_stream ) { check_hpe(output, A, C, indices_A, "cuda_homogeneous_polynomial_evaluation"); + cudaPointerAttributes attributes; + CUDA_CHECK_ERROR(cudaPointerGetAttributes(&attributes, A.data)); + int current_device; + CUDA_CHECK_ERROR(cudaGetDevice(¤t_device)); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(attributes.device)); + } + + cudaStream_t cstream = reinterpret_cast(cuda_stream); + int32_t nbatch = output.shape[0]; int32_t nnu1 = A.shape[1]; size_t polynomial_order = indices_A.shape[1]; @@ -180,17 +190,20 @@ void mops::cuda::homogeneous_polynomial_evaluation( } CUDA_CHECK_ERROR(cudaGetLastError()); + CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); - 
CUDA_CHECK_ERROR(cudaDeviceSynchronize()); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(current_device)); + } } // explicit instanciations of CUDA templates template void mops::cuda::homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A + Tensor output, Tensor A, Tensor C, Tensor indices_A, void * cuda_stream ); template void mops::cuda::homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A + Tensor output, Tensor A, Tensor C, Tensor indices_A, void * cuda_stream ); template @@ -293,10 +306,21 @@ void mops::cuda::homogeneous_polynomial_evaluation_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void * cuda_stream ) { check_hpe_vjp(grad_A, grad_output, A, C, indices_A, "cuda_homogeneous_polynomial_evaluation_vjp"); + cudaPointerAttributes attributes; + CUDA_CHECK_ERROR(cudaPointerGetAttributes(&attributes, A.data)); + int current_device; + CUDA_CHECK_ERROR(cudaGetDevice(¤t_device)); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(attributes.device)); + } + + cudaStream_t cstream = reinterpret_cast(cuda_stream); + int32_t nbatch = grad_output.shape[0]; int32_t nnu1 = A.shape[1]; size_t polynomial_order = indices_A.shape[1]; @@ -363,8 +387,11 @@ void mops::cuda::homogeneous_polynomial_evaluation_vjp( } CUDA_CHECK_ERROR(cudaGetLastError()); + CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); - CUDA_CHECK_ERROR(cudaDeviceSynchronize()); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(current_device)); + } } // explicit instanciations of CUDA templates @@ -373,7 +400,8 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void * cuda_stream ); template void mops::cuda::homogeneous_polynomial_evaluation_vjp( @@ -381,7 +409,8 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void * cuda_stream ); template @@ -392,7 +421,8 @@ void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void * cuda_stream ) { throw std::runtime_error("Not implemented"); } @@ -405,7 +435,8 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void * cuda_stream ); template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( @@ -415,5 +446,6 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( Tensor grad_output, Tensor A, Tensor C, - Tensor indices_A + Tensor indices_A, + void * cuda_stream ); diff --git a/mops/src/opsa/opsa.cu b/mops/src/opsa/opsa.cu index f500f44..73a2306 100644 --- a/mops/src/opsa/opsa.cu +++ b/mops/src/opsa/opsa.cu @@ -94,7 +94,7 @@ void mops::cuda::outer_product_scatter_add( ); CUDA_CHECK_ERROR(cudaGetLastError()); - CUDA_CHECK_ERROR(cudaDeviceSynchronize()); + CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); if (current_device != attributes.device) { CUDA_CHECK_ERROR(cudaSetDevice(current_device)); @@ -312,7 +312,7 @@ void mops::cuda::outer_product_scatter_add_vjp( ); CUDA_CHECK_ERROR(cudaGetLastError()); - CUDA_CHECK_ERROR(cudaDeviceSynchronize()); + CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); if (current_device != attributes.device) { CUDA_CHECK_ERROR(cudaSetDevice(current_device)); diff --git 
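
Swapping cudaDeviceSynchronize() for cudaStreamSynchronize(cstream), as the hunks above do, matters once several streams are in flight: the device-wide call blocks until every stream on the GPU has drained, while the per-stream call only waits for the work this library actually enqueued. A toy illustration with two independent streams follows; busy_kernel and the iteration counts are purely illustrative.

    #include <cuda_runtime.h>

    __global__ void busy_kernel(float* x, int iters) {
        float v = *x;
        for (int i = 0; i < iters; ++i) {
            v = v * 1.0000001f + 1.0f;  // keep the kernel busy
        }
        *x = v;
    }

    int main() {
        float* a = nullptr;
        float* b = nullptr;
        cudaMalloc(&a, sizeof(float));
        cudaMalloc(&b, sizeof(float));

        cudaStream_t long_stream, short_stream;
        cudaStreamCreate(&long_stream);
        cudaStreamCreate(&short_stream);

        busy_kernel<<<1, 1, 0, long_stream>>>(a, 1 << 24);  // long-running
        busy_kernel<<<1, 1, 0, short_stream>>>(b, 1);       // short

        // Waits only for the short kernel; the long one keeps running.
        cudaStreamSynchronize(short_stream);

        // A cudaDeviceSynchronize() here would additionally wait for long_stream.
        cudaDeviceSynchronize();

        cudaStreamDestroy(long_stream);
        cudaStreamDestroy(short_stream);
        cudaFree(a);
        cudaFree(b);
        return 0;
    }
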
a/mops/src/opsaw/capi.cpp b/mops/src/opsaw/capi.cpp index 04e0bc4..dacb6ac 100644 --- a/mops/src/opsaw/capi.cpp +++ b/mops/src/opsaw/capi.cpp @@ -210,7 +210,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_f32( mops_tensor_2d_f32_t B, mops_tensor_2d_f32_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void * cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights( @@ -222,7 +223,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_f32( {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, {W.data, {checked_cast(W.shape[0]), checked_cast(W.shape[1])}}, {indices_W.data, {checked_cast(indices_W.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -233,7 +235,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_f64( mops_tensor_2d_f64_t B, mops_tensor_2d_f64_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void * cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights( @@ -245,7 +248,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_f64( {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, {W.data, {checked_cast(W.shape[0]), checked_cast(W.shape[1])}}, {indices_W.data, {checked_cast(indices_W.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -259,7 +263,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_f32( mops_tensor_2d_f32_t B, mops_tensor_2d_f32_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void * cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights_vjp( @@ -274,7 +279,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_f32( {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, {W.data, {checked_cast(W.shape[0]), checked_cast(W.shape[1])}}, {indices_W.data, {checked_cast(indices_W.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -288,7 +294,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_f64( mops_tensor_2d_f64_t B, mops_tensor_2d_f64_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void * cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights_vjp( @@ -303,7 +310,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_f64( {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, {W.data, {checked_cast(W.shape[0]), checked_cast(W.shape[1])}}, {indices_W.data, {checked_cast(indices_W.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -321,7 +329,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_vjp_f32( mops_tensor_2d_f32_t B, mops_tensor_2d_f32_t W, mops_tensor_1d_i32_t indices_W, - 
mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void * cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( @@ -346,7 +355,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_vjp_f32( {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, {W.data, {checked_cast(W.shape[0]), checked_cast(W.shape[1])}}, {indices_W.data, {checked_cast(indices_W.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -364,7 +374,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_vjp_f64( mops_tensor_2d_f64_t B, mops_tensor_2d_f64_t W, mops_tensor_1d_i32_t indices_W, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void * cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( @@ -389,7 +400,8 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_vjp_f64( {B.data, {checked_cast(B.shape[0]), checked_cast(B.shape[1])}}, {W.data, {checked_cast(W.shape[0]), checked_cast(W.shape[1])}}, {indices_W.data, {checked_cast(indices_W.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } diff --git a/mops/src/sap/capi.cpp b/mops/src/sap/capi.cpp index e5c79d8..e79a256 100644 --- a/mops/src/sap/capi.cpp +++ b/mops/src/sap/capi.cpp @@ -191,7 +191,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_f32( mops_tensor_1d_f32_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_of_products( @@ -201,7 +202,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_f32( {C.data, {checked_cast(C.shape[0])}}, {indices_A.data, {checked_cast(indices_A.shape[0])}}, {indices_B.data, {checked_cast(indices_B.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -213,7 +215,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_f64( mops_tensor_1d_f64_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_of_products( @@ -223,7 +226,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_f64( {C.data, {checked_cast(C.shape[0])}}, {indices_A.data, {checked_cast(indices_A.shape[0])}}, {indices_B.data, {checked_cast(indices_B.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -237,7 +241,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_vjp_f32( mops_tensor_1d_f32_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_of_products_vjp( @@ -250,7 +255,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_vjp_f32( {C.data, 
{checked_cast(C.shape[0])}}, {indices_A.data, {checked_cast(indices_A.shape[0])}}, {indices_B.data, {checked_cast(indices_B.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -264,7 +270,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_vjp_f64( mops_tensor_1d_f64_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_of_products_vjp( @@ -277,7 +284,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_vjp_f64( {C.data, {checked_cast(C.shape[0])}}, {indices_A.data, {checked_cast(indices_A.shape[0])}}, {indices_B.data, {checked_cast(indices_B.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -294,7 +302,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_vjp_vjp_f32( mops_tensor_1d_f32_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_of_products_vjp_vjp( @@ -313,7 +322,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_vjp_vjp_f32( {C.data, {checked_cast(C.shape[0])}}, {indices_A.data, {checked_cast(indices_A.shape[0])}}, {indices_B.data, {checked_cast(indices_B.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -330,7 +340,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_vjp_vjp_f64( mops_tensor_1d_f64_t C, mops_tensor_1d_i32_t indices_A, mops_tensor_1d_i32_t indices_B, - mops_tensor_1d_i32_t indices_output + mops_tensor_1d_i32_t indices_output, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_of_products_vjp_vjp( @@ -349,7 +360,8 @@ extern "C" int mops_cuda_sparse_accumulation_of_products_vjp_vjp_f64( {C.data, {checked_cast(C.shape[0])}}, {indices_A.data, {checked_cast(indices_A.shape[0])}}, {indices_B.data, {checked_cast(indices_B.shape[0])}}, - {indices_output.data, {checked_cast(indices_output.shape[0])}} + {indices_output.data, {checked_cast(indices_output.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } diff --git a/mops/src/sap/sap.cpp b/mops/src/sap/sap.cpp index 545c429..b91b41a 100644 --- a/mops/src/sap/sap.cpp +++ b/mops/src/sap/sap.cpp @@ -78,13 +78,13 @@ template void mops::sparse_accumulation_of_products_vjp_vjp( #ifndef MOPS_CUDA_ENABLED template void mops::cuda:: - sparse_accumulation_of_products(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) { + sparse_accumulation_of_products(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } template void mops::cuda:: - sparse_accumulation_of_products_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) { + sparse_accumulation_of_products_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } @@ -101,7 +101,8 @@ void 
mops::cuda::sparse_accumulation_of_products_vjp_vjp( Tensor /*C*/, Tensor /*indices_A*/, Tensor /*indices_B*/, - Tensor /*indices_output*/ + Tensor /*indices_output*/, + void* /*cuda_stream*/ ) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } @@ -114,7 +115,8 @@ template void mops::cuda::sparse_accumulation_of_products( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_of_products( @@ -124,7 +126,8 @@ template void mops::cuda::sparse_accumulation_of_products( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_of_products_vjp( @@ -136,7 +139,8 @@ template void mops::cuda::sparse_accumulation_of_products_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_of_products_vjp( @@ -148,7 +152,8 @@ template void mops::cuda::sparse_accumulation_of_products_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_of_products_vjp_vjp( @@ -163,7 +168,8 @@ template void mops::cuda::sparse_accumulation_of_products_vjp_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_of_products_vjp_vjp( @@ -178,7 +184,8 @@ template void mops::cuda::sparse_accumulation_of_products_vjp_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); #endif diff --git a/mops/src/sap/sap.cu b/mops/src/sap/sap.cu index be72bc8..7d8a2d5 100644 --- a/mops/src/sap/sap.cu +++ b/mops/src/sap/sap.cu @@ -95,12 +95,23 @@ void mops::cuda::sparse_accumulation_of_products( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ) { check_sap( output, A, B, C, indices_A, indices_B, indices_output, "cuda_sparse_accumulation_of_products" ); + cudaPointerAttributes attributes; + CUDA_CHECK_ERROR(cudaPointerGetAttributes(&attributes, A.data)); + int current_device; + CUDA_CHECK_ERROR(cudaGetDevice(¤t_device)); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(attributes.device)); + } + + cudaStream_t cstream = reinterpret_cast(cuda_stream); + dim3 block_dim(find_integer_divisor(A.shape[0], WARP_SIZE)); dim3 thread_block(WARP_SIZE * NWARPS_PER_BLOCK, 1, 1); @@ -117,8 +128,11 @@ void mops::cuda::sparse_accumulation_of_products( <<>>(output, A, B, C, indices_A, indices_B, indices_output); CUDA_CHECK_ERROR(cudaGetLastError()); + CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); - CUDA_CHECK_ERROR(cudaDeviceSynchronize()); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(current_device)); + } } // explicit instanciations of CUDA templates @@ -129,7 +143,8 @@ template void mops::cuda::sparse_accumulation_of_products( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_of_products( @@ -139,7 +154,8 @@ template void mops::cuda::sparse_accumulation_of_products( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); 
template @@ -282,7 +298,8 @@ void mops::cuda::sparse_accumulation_of_products_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ) { check_sap_vjp( grad_A, @@ -297,6 +314,16 @@ void mops::cuda::sparse_accumulation_of_products_vjp( "cuda_sparse_accumulation_of_products_vjp" ); + cudaPointerAttributes attributes; + CUDA_CHECK_ERROR(cudaPointerGetAttributes(&attributes, A.data)); + int current_device; + CUDA_CHECK_ERROR(cudaGetDevice(¤t_device)); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(attributes.device)); + } + + cudaStream_t cstream = reinterpret_cast(cuda_stream); + dim3 block_dim(find_integer_divisor(grad_A.shape[0], WARP_SIZE)); dim3 thread_block(WARP_SIZE * NWARPS_PER_BLOCK, 1, 1); @@ -322,8 +349,11 @@ void mops::cuda::sparse_accumulation_of_products_vjp( ); CUDA_CHECK_ERROR(cudaGetLastError()); + CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); - CUDA_CHECK_ERROR(cudaDeviceSynchronize()); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(current_device)); + } } template void mops::cuda::sparse_accumulation_of_products_vjp( @@ -335,7 +365,8 @@ template void mops::cuda::sparse_accumulation_of_products_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_of_products_vjp( @@ -347,7 +378,8 @@ template void mops::cuda::sparse_accumulation_of_products_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template @@ -586,7 +618,8 @@ void mops::cuda::sparse_accumulation_of_products_vjp_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ) { check_sap_vjp_vjp( grad_grad_output, @@ -604,6 +637,16 @@ void mops::cuda::sparse_accumulation_of_products_vjp_vjp( "cuda_sparse_accumulation_of_products_vjp_vjp" ); + cudaPointerAttributes attributes; + CUDA_CHECK_ERROR(cudaPointerGetAttributes(&attributes, A.data)); + int current_device; + CUDA_CHECK_ERROR(cudaGetDevice(¤t_device)); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(attributes.device)); + } + + cudaStream_t cstream = reinterpret_cast(cuda_stream); + dim3 block_dim(find_integer_divisor(grad_A_2.shape[0], WARP_SIZE)); dim3 thread_block(WARP_SIZE * NWARPS_PER_BLOCK, 1, 1); @@ -674,8 +717,11 @@ void mops::cuda::sparse_accumulation_of_products_vjp_vjp( ); CUDA_CHECK_ERROR(cudaGetLastError()); + CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); - CUDA_CHECK_ERROR(cudaDeviceSynchronize()); + if (current_device != attributes.device) { + CUDA_CHECK_ERROR(cudaSetDevice(current_device)); + } } // explicit instanciations of CUDA templates @@ -691,7 +737,8 @@ template void mops::cuda::sparse_accumulation_of_products_vjp_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_of_products_vjp_vjp( @@ -706,5 +753,6 @@ template void mops::cuda::sparse_accumulation_of_products_vjp_vjp( Tensor C, Tensor indices_A, Tensor indices_B, - Tensor indices_output + Tensor indices_output, + void* cuda_stream ); diff --git a/mops/src/sasaw/capi.cpp b/mops/src/sasaw/capi.cpp index 5dfb933..57f5620 100644 --- a/mops/src/sasaw/capi.cpp +++ b/mops/src/sasaw/capi.cpp @@ -276,7 +276,8 @@ extern "C" int 
mops_cuda_sparse_accumulation_scatter_add_with_weights_f32( mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_scatter_add_with_weights( @@ -292,7 +293,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_f32( {indices_W_1.data, {checked_cast(indices_W_1.shape[0])}}, {indices_W_2.data, {checked_cast(indices_W_2.shape[0])}}, {indices_output_1.data, {checked_cast(indices_output_1.shape[0])}}, - {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}} + {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -307,7 +309,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_f64( mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_scatter_add_with_weights( @@ -323,7 +326,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_f64( {indices_W_1.data, {checked_cast(indices_W_1.shape[0])}}, {indices_W_2.data, {checked_cast(indices_W_2.shape[0])}}, {indices_output_1.data, {checked_cast(indices_output_1.shape[0])}}, - {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}} + {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -341,7 +345,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_f32( mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp( @@ -362,7 +367,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_f32( {indices_W_1.data, {checked_cast(indices_W_1.shape[0])}}, {indices_W_2.data, {checked_cast(indices_W_2.shape[0])}}, {indices_output_1.data, {checked_cast(indices_output_1.shape[0])}}, - {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}} + {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -380,7 +386,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_f64( mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp( @@ -401,7 +408,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_f64( {indices_W_1.data, {checked_cast(indices_W_1.shape[0])}}, {indices_W_2.data, {checked_cast(indices_W_2.shape[0])}}, {indices_output_1.data, {checked_cast(indices_output_1.shape[0])}}, - {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}} + {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -423,7 +431,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_vjp_f3 mops_tensor_1d_i32_t indices_W_1, 
mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp( @@ -457,7 +466,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_vjp_f3 {indices_W_1.data, {checked_cast(indices_W_1.shape[0])}}, {indices_W_2.data, {checked_cast(indices_W_2.shape[0])}}, {indices_output_1.data, {checked_cast(indices_output_1.shape[0])}}, - {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}} + {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } @@ -479,7 +489,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_vjp_f6 mops_tensor_1d_i32_t indices_W_1, mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, - mops_tensor_1d_i32_t indices_output_2 + mops_tensor_1d_i32_t indices_output_2, + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp( @@ -513,7 +524,8 @@ extern "C" int mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_vjp_f6 {indices_W_1.data, {checked_cast(indices_W_1.shape[0])}}, {indices_W_2.data, {checked_cast(indices_W_2.shape[0])}}, {indices_output_1.data, {checked_cast(indices_output_1.shape[0])}}, - {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}} + {indices_output_2.data, {checked_cast(indices_output_2.shape[0])}}, + cuda_stream ); MOPS_CATCH_EXCEPTIONS_END } diff --git a/mops/src/sasaw/sasaw.cpp b/mops/src/sasaw/sasaw.cpp index 9bc4a68..50525aa 100644 --- a/mops/src/sasaw/sasaw.cpp +++ b/mops/src/sasaw/sasaw.cpp @@ -99,18 +99,16 @@ template void mops::sparse_accumulation_scatter_add_with_weights_vjp_vjp Tensor indices_output_2 ); -#ifdef MOPS_CUDA_ENABLED -#include "cuda.tpp" -#else +#ifndef MOPS_CUDA_ENABLED template void mops::cuda:: - sparse_accumulation_scatter_add_with_weights(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) { + sparse_accumulation_scatter_add_with_weights(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } template void mops::cuda:: - sparse_accumulation_scatter_add_with_weights_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) { + sparse_accumulation_scatter_add_with_weights_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } @@ -132,13 +130,12 @@ void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp( Tensor /*indices_W_1*/, Tensor /*indices_W_2*/, Tensor /*indices_output_1*/, - Tensor /*indices_output_2*/ + Tensor /*indices_output_2*/, + void* /*cuda_stream*/ ) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } -#endif - // explicit instantiations of CUDA templates template void mops::cuda::sparse_accumulation_scatter_add_with_weights( Tensor output, @@ -150,7 +147,8 @@ template void mops::cuda::sparse_accumulation_scatter_add_with_weights( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_scatter_add_with_weights( @@ -163,7 +161,8 @@ 
template void mops::cuda::sparse_accumulation_scatter_add_with_weights( Tensor indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp( @@ -179,7 +178,8 @@ template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp( @@ -195,7 +195,8 @@ template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp( @@ -215,7 +216,8 @@ template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp( @@ -235,5 +237,8 @@ template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp indices_W_1, Tensor indices_W_2, Tensor indices_output_1, - Tensor indices_output_2 + Tensor indices_output_2, + void* cuda_stream ); + +#endif \ No newline at end of file From 0b06e796cdc7ee5632c89f742db58fa0baf52e43 Mon Sep 17 00:00:00 2001 From: "Nick J. Browning" Date: Thu, 2 May 2024 11:52:42 +0200 Subject: [PATCH 05/28] formatting. --- mops/include/mops/hpe.hpp | 18 +++++++++++++++--- mops/include/mops/sasaw.h | 12 ++++++------ mops/src/hpe/hpe.cu | 30 +++++++++++++++++++++--------- mops/src/opsaw/capi.cpp | 12 ++++++------ 4 files changed, 48 insertions(+), 24 deletions(-) diff --git a/mops/include/mops/hpe.hpp b/mops/include/mops/hpe.hpp index c3f6425..1a66edd 100644 --- a/mops/include/mops/hpe.hpp +++ b/mops/include/mops/hpe.hpp @@ -122,15 +122,27 @@ namespace cuda { /// CUDA version of mops::homogeneous_polynomial_evaluation template void MOPS_EXPORT homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A, void* cuda_stream = nullptr + Tensor output, + Tensor A, + Tensor C, + Tensor indices_A, + void* cuda_stream = nullptr ); extern template void homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A, void* cuda_stream + Tensor output, + Tensor A, + Tensor C, + Tensor indices_A, + void* cuda_stream ); extern template void homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A, void* cuda_stream + Tensor output, + Tensor A, + Tensor C, + Tensor indices_A, + void* cuda_stream ); template diff --git a/mops/include/mops/sasaw.h b/mops/include/mops/sasaw.h index 1c4b88c..d87560d 100644 --- a/mops/include/mops/sasaw.h +++ b/mops/include/mops/sasaw.h @@ -128,7 +128,7 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_f32( mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, mops_tensor_1d_i32_t indices_output_2, - void * cuda_stream + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with for 64-bit floats @@ -143,7 +143,7 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_f64( mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, mops_tensor_1d_i32_t indices_output_2, - void * 
cuda_stream + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights_vjp for @@ -162,7 +162,7 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_f32( mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, mops_tensor_1d_i32_t indices_output_2, - void * cuda_stream + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights_vjp for @@ -181,7 +181,7 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_f64( mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, mops_tensor_1d_i32_t indices_output_2, - void * cuda_stream + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights_vjp_vjp for @@ -204,7 +204,7 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_vjp_f mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, mops_tensor_1d_i32_t indices_output_2, - void * cuda_stream + void* cuda_stream ); /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights_vjp_vjp for @@ -227,7 +227,7 @@ int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_vjp_vjp_f mops_tensor_1d_i32_t indices_W_2, mops_tensor_1d_i32_t indices_output_1, mops_tensor_1d_i32_t indices_output_2, - void * cuda_stream + void* cuda_stream ); #ifdef __cplusplus diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index 7e36468..59a6af1 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -108,7 +108,11 @@ __global__ void homogeneous_polynomial_evaluation_kernel( template void mops::cuda::homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A, void * cuda_stream + Tensor output, + Tensor A, + Tensor C, + Tensor indices_A, + void* cuda_stream ) { check_hpe(output, A, C, indices_A, "cuda_homogeneous_polynomial_evaluation"); @@ -199,11 +203,19 @@ void mops::cuda::homogeneous_polynomial_evaluation( // explicit instanciations of CUDA templates template void mops::cuda::homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A, void * cuda_stream + Tensor output, + Tensor A, + Tensor C, + Tensor indices_A, + void* cuda_stream ); template void mops::cuda::homogeneous_polynomial_evaluation( - Tensor output, Tensor A, Tensor C, Tensor indices_A, void * cuda_stream + Tensor output, + Tensor A, + Tensor C, + Tensor indices_A, + void* cuda_stream ); template @@ -307,7 +319,7 @@ void mops::cuda::homogeneous_polynomial_evaluation_vjp( Tensor A, Tensor C, Tensor indices_A, - void * cuda_stream + void* cuda_stream ) { check_hpe_vjp(grad_A, grad_output, A, C, indices_A, "cuda_homogeneous_polynomial_evaluation_vjp"); @@ -401,7 +413,7 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp( Tensor A, Tensor C, Tensor indices_A, - void * cuda_stream + void* cuda_stream ); template void mops::cuda::homogeneous_polynomial_evaluation_vjp( @@ -410,7 +422,7 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp( Tensor A, Tensor C, Tensor indices_A, - void * cuda_stream + void* cuda_stream ); template @@ -422,7 +434,7 @@ void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( Tensor A, Tensor C, Tensor indices_A, - void * cuda_stream + void* cuda_stream ) { throw std::runtime_error("Not implemented"); } @@ -436,7 +448,7 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( Tensor A, Tensor C, Tensor indices_A, - void * cuda_stream + void* cuda_stream ); template void 
mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( @@ -447,5 +459,5 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( Tensor A, Tensor C, Tensor indices_A, - void * cuda_stream + void* cuda_stream ); diff --git a/mops/src/opsaw/capi.cpp b/mops/src/opsaw/capi.cpp index dacb6ac..3585d70 100644 --- a/mops/src/opsaw/capi.cpp +++ b/mops/src/opsaw/capi.cpp @@ -211,7 +211,7 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_f32( mops_tensor_2d_f32_t W, mops_tensor_1d_i32_t indices_W, mops_tensor_1d_i32_t indices_output, - void * cuda_stream + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights( @@ -236,7 +236,7 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_f64( mops_tensor_2d_f64_t W, mops_tensor_1d_i32_t indices_W, mops_tensor_1d_i32_t indices_output, - void * cuda_stream + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights( @@ -264,7 +264,7 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_f32( mops_tensor_2d_f32_t W, mops_tensor_1d_i32_t indices_W, mops_tensor_1d_i32_t indices_output, - void * cuda_stream + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights_vjp( @@ -295,7 +295,7 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_f64( mops_tensor_2d_f64_t W, mops_tensor_1d_i32_t indices_W, mops_tensor_1d_i32_t indices_output, - void * cuda_stream + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights_vjp( @@ -330,7 +330,7 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_vjp_f32( mops_tensor_2d_f32_t W, mops_tensor_1d_i32_t indices_W, mops_tensor_1d_i32_t indices_output, - void * cuda_stream + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( @@ -375,7 +375,7 @@ extern "C" int mops_cuda_outer_product_scatter_add_with_weights_vjp_vjp_f64( mops_tensor_2d_f64_t W, mops_tensor_1d_i32_t indices_W, mops_tensor_1d_i32_t indices_output, - void * cuda_stream + void* cuda_stream ) { MOPS_CATCH_EXCEPTIONS_BEGIN mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( From b2c170a9c5ee1a6e61fdbfb2ead3f6ba3ec897df Mon Sep 17 00:00:00 2001 From: "Nick J. Browning" Date: Fri, 3 May 2024 13:57:18 +0200 Subject: [PATCH 06/28] fixed build issues. 
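
Illustration (not part of this patch series): the pattern that the preceding patches thread through every CUDA entry point is to accept an opaque void* cuda_stream in the public C/C++ API, and only inside the .cu translation unit switch to the device that owns the input data, cast the handle back to a cudaStream_t, launch on that stream, and synchronize the stream instead of the whole device. The kernel, the helper function and the error-checking macro below are illustrative stand-ins, not mops symbols.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort on any CUDA runtime error (stand-in for the CUDA_CHECK_ERROR used in mops).
#define CHECK_CUDA(expr)                                                        \
    do {                                                                        \
        cudaError_t err_ = (expr);                                              \
        if (err_ != cudaSuccess) {                                              \
            std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); \
            std::abort();                                                       \
        }                                                                       \
    } while (0)

// Trivial kernel standing in for the real mops kernels.
__global__ void scale_kernel(float* data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        data[i] *= 2.0f;
    }
}

// cuda_stream is the opaque handle received through the C API
// (nullptr means the default stream).
void launch_on_stream(float* device_data, int n, void* cuda_stream) {
    // Run on the device that owns the data, as the patches do via
    // cudaPointerGetAttributes before each launch.
    cudaPointerAttributes attributes;
    CHECK_CUDA(cudaPointerGetAttributes(&attributes, device_data));
    int current_device = 0;
    CHECK_CUDA(cudaGetDevice(&current_device));
    if (current_device != attributes.device) {
        CHECK_CUDA(cudaSetDevice(attributes.device));
    }

    // Recover the typed stream from the opaque pointer.
    cudaStream_t cstream = reinterpret_cast<cudaStream_t>(cuda_stream);

    scale_kernel<<<(n + 255) / 256, 256, 0, cstream>>>(device_data, n);

    CHECK_CUDA(cudaGetLastError());
    // Synchronize only this stream, not the whole device.
    CHECK_CUDA(cudaStreamSynchronize(cstream));

    // Restore the caller's current device.
    if (current_device != attributes.device) {
        CHECK_CUDA(cudaSetDevice(current_device));
    }
}

Synchronizing the stream rather than calling cudaDeviceSynchronize leaves work that the caller has queued on other streams unaffected, which is presumably the motivation for replacing the device-wide synchronization in these kernels.
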
--- mops/CMakeLists.txt | 2 + mops/src/opsaw/cuda.tpp | 49 ------------- mops/src/opsaw/opsaw.cpp | 8 +-- mops/src/opsaw/opsaw.cu | 107 +++++++++++++++++++++++++++- mops/src/sasaw/cuda.tpp | 61 ---------------- mops/src/sasaw/sasaw.cu | 149 ++++++++++++++++++++++++++++++++++++++- 6 files changed, 259 insertions(+), 117 deletions(-) delete mode 100644 mops/src/opsaw/cuda.tpp delete mode 100644 mops/src/sasaw/cuda.tpp diff --git a/mops/CMakeLists.txt b/mops/CMakeLists.txt index 846eb47..f75a638 100644 --- a/mops/CMakeLists.txt +++ b/mops/CMakeLists.txt @@ -124,6 +124,8 @@ if(CMAKE_CUDA_COMPILER AND MOPS_CUDA) "src/opsa/opsa.cu" "src/hpe/hpe.cu" "src/sap/sap.cu" + "src/sasaw/sasaw.cu" + "src/opsaw/opsaw.cu" ) endif() diff --git a/mops/src/opsaw/cuda.tpp b/mops/src/opsaw/cuda.tpp deleted file mode 100644 index 204e7db..0000000 --- a/mops/src/opsaw/cuda.tpp +++ /dev/null @@ -1,49 +0,0 @@ -#include - -#include "mops/opsaw.hpp" - -template -void mops::cuda::outer_product_scatter_add_with_weights( - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor -) { - throw std::runtime_error("CUDA implementation does not exist yet"); -} - -template -void mops::cuda::outer_product_scatter_add_with_weights_vjp( - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor -) { - throw std::runtime_error("CUDA implementation does not exist yet"); -} - -template -void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( - Tensor /*grad_grad_output*/, - Tensor /*grad_A_2*/, - Tensor /*grad_B_2*/, - Tensor /*grad_W_2*/, - Tensor /*grad_grad_A*/, - Tensor /*grad_grad_B*/, - Tensor /*grad_grad_W*/, - Tensor /*grad_output*/, - Tensor /*A*/, - Tensor /*B*/, - Tensor /*W*/, - Tensor /*indices_W*/, - Tensor /*indices_output*/ -) { - throw std::runtime_error("CUDA implementation does not exist yet"); -} diff --git a/mops/src/opsaw/opsaw.cpp b/mops/src/opsaw/opsaw.cpp index b3b30b8..b7eb889 100644 --- a/mops/src/opsaw/opsaw.cpp +++ b/mops/src/opsaw/opsaw.cpp @@ -75,9 +75,7 @@ template void mops::outer_product_scatter_add_with_weights_vjp_vjp( Tensor indices_output ); -#ifdef MOPS_CUDA_ENABLED -#include "cuda.tpp" -#else +#ifndef MOPS_CUDA_ENABLED template void mops::cuda:: outer_product_scatter_add_with_weights(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) { @@ -109,8 +107,6 @@ void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( throw std::runtime_error("MOPS was not compiled with CUDA support"); } -#endif - // explicit instantiations of CUDA templates template void mops::cuda::outer_product_scatter_add_with_weights( Tensor output, @@ -185,3 +181,5 @@ template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp Tensor indices_W, Tensor indices_output ); + +#endif \ No newline at end of file diff --git a/mops/src/opsaw/opsaw.cu b/mops/src/opsaw/opsaw.cu index 8421bac..28c48fe 100644 --- a/mops/src/opsaw/opsaw.cu +++ b/mops/src/opsaw/opsaw.cu @@ -1 +1,106 @@ -// todo: cuda device code +#include + +#include "mops/opsaw.hpp" + +using namespace mops; +using namespace mops::cuda; +using namespace std; + +template +void mops::cuda:: + outer_product_scatter_add_with_weights(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { + throw std::runtime_error("CUDA implementation does not exist yet"); +} + +template +void mops::cuda:: + outer_product_scatter_add_with_weights_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { + throw std::runtime_error("CUDA implementation does not exist yet"); +} + +template +void mops::cuda:: + 
outer_product_scatter_add_with_weights_vjp_vjp(Tensor /*grad_grad_output*/, Tensor /*grad_A_2*/, Tensor /*grad_B_2*/, Tensor /*grad_W_2*/, Tensor /*grad_grad_A*/, Tensor /*grad_grad_B*/, Tensor /*grad_grad_W*/, Tensor /*grad_output*/, Tensor /*A*/, Tensor /*B*/, Tensor /*W*/, Tensor /*indices_W*/, Tensor /*indices_output*/, void*) { + throw std::runtime_error("CUDA implementation does not exist yet"); +} + +// explicit instantiations of CUDA templates +template void mops::cuda::outer_product_scatter_add_with_weights( + Tensor output, + Tensor A, + Tensor B, + Tensor W, + Tensor indices_W, + Tensor indices_output, + void* cuda_stream +); + +template void mops::cuda::outer_product_scatter_add_with_weights( + Tensor output, + Tensor A, + Tensor B, + Tensor W, + Tensor indices_W, + Tensor indices_output, + void* cuda_stream +); + +template void mops::cuda::outer_product_scatter_add_with_weights_vjp( + Tensor grad_A, + Tensor grad_B, + Tensor grad_W, + Tensor grad_output, + Tensor A, + Tensor B, + Tensor W, + Tensor indices_W, + Tensor indices_output, + void* cuda_stream +); + +template void mops::cuda::outer_product_scatter_add_with_weights_vjp( + Tensor grad_A, + Tensor grad_B, + Tensor grad_W, + Tensor grad_output, + Tensor A, + Tensor B, + Tensor W, + Tensor indices_W, + Tensor indices_output, + void* cuda_stream +); + +template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( + Tensor grad_grad_output, + Tensor grad_A_2, + Tensor grad_B_2, + Tensor grad_W_2, + Tensor grad_grad_A, + Tensor grad_grad_B, + Tensor grad_grad_W, + Tensor grad_output, + Tensor A, + Tensor B, + Tensor W, + Tensor indices_W, + Tensor indices_output, + void* cuda_stream +); + +template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( + Tensor grad_grad_output, + Tensor grad_A_2, + Tensor grad_B_2, + Tensor grad_W_2, + Tensor grad_grad_A, + Tensor grad_grad_B, + Tensor grad_grad_W, + Tensor grad_output, + Tensor A, + Tensor B, + Tensor W, + Tensor indices_W, + Tensor indices_output, + void* cuda_stream +); \ No newline at end of file diff --git a/mops/src/sasaw/cuda.tpp b/mops/src/sasaw/cuda.tpp deleted file mode 100644 index c2f8558..0000000 --- a/mops/src/sasaw/cuda.tpp +++ /dev/null @@ -1,61 +0,0 @@ -#include - -#include "mops/sasaw.hpp" - -template -void mops::cuda::sparse_accumulation_scatter_add_with_weights( - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor, - Tensor -) { - throw std::runtime_error("CUDA implementation does not exist yet"); -} - -template -void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp( - Tensor, - Tensor , - Tensor, - Tensor, - Tensor , - Tensor , - Tensor , - Tensor , - Tensor, - Tensor, - Tensor, - Tensor, - Tensor -) { - throw std::runtime_error("CUDA implementation does not exist yet"); -} - -template -void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp( - Tensor /*grad_grad_output*/, - Tensor /*grad_A_2*/, - Tensor /*grad_B_2*/, - Tensor /*grad_W_2*/, - Tensor /*grad_grad_A*/, - Tensor /*grad_grad_B*/, - Tensor /*grad_grad_W*/, - Tensor /*grad_output*/, - Tensor /*A*/, - Tensor /*B*/, - Tensor /*C*/, - Tensor /*W*/, - Tensor /*indices_A*/, - Tensor /*indices_W_1*/, - Tensor /*indices_W_2*/, - Tensor /*indices_output_1*/, - Tensor /*indices_output_2*/ -) { - throw std::runtime_error("CUDA implementation does not exist yet"); -} diff --git a/mops/src/sasaw/sasaw.cu b/mops/src/sasaw/sasaw.cu index 8421bac..06b94b1 100644 --- a/mops/src/sasaw/sasaw.cu +++ b/mops/src/sasaw/sasaw.cu @@ 
-1 +1,148 @@ -// todo: cuda device code +#include + +#include "mops/sasaw.hpp" + +using namespace mops; +using namespace mops::cuda; +using namespace std; + +template +void mops::cuda:: + sparse_accumulation_scatter_add_with_weights(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { + throw std::runtime_error("MOPS was not compiled with CUDA support"); +} + +template +void mops::cuda:: + sparse_accumulation_scatter_add_with_weights_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { + throw std::runtime_error("MOPS was not compiled with CUDA support"); +} + +template +void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp( + Tensor /*grad_grad_output*/, + Tensor /*grad_A_2*/, + Tensor /*grad_B_2*/, + Tensor /*grad_W_2*/, + Tensor /*grad_grad_A*/, + Tensor /*grad_grad_B*/, + Tensor /*grad_grad_W*/, + Tensor /*grad_output*/, + Tensor /*A*/, + Tensor /*B*/, + Tensor /*C*/, + Tensor /*W*/, + Tensor /*indices_A*/, + Tensor /*indices_W_1*/, + Tensor /*indices_W_2*/, + Tensor /*indices_output_1*/, + Tensor /*indices_output_2*/, + void* /*cuda_stream*/ +) { + throw std::runtime_error("MOPS was not compiled with CUDA support"); +} + +// explicit instantiations of CUDA templates +template void mops::cuda::sparse_accumulation_scatter_add_with_weights( + Tensor output, + Tensor A, + Tensor B, + Tensor C, + Tensor W, + Tensor indices_A, + Tensor indices_W_1, + Tensor indices_W_2, + Tensor indices_output_1, + Tensor indices_output_2, + void* cuda_stream +); + +template void mops::cuda::sparse_accumulation_scatter_add_with_weights( + Tensor output, + Tensor A, + Tensor B, + Tensor C, + Tensor W, + Tensor indices_A, + Tensor indices_W_1, + Tensor indices_W_2, + Tensor indices_output_1, + Tensor indices_output_2, + void* cuda_stream +); + +template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp( + Tensor grad_A, + Tensor grad_B, + Tensor grad_W, + Tensor grad_output, + Tensor A, + Tensor B, + Tensor C, + Tensor W, + Tensor indices_A, + Tensor indices_W_1, + Tensor indices_W_2, + Tensor indices_output_1, + Tensor indices_output_2, + void* cuda_stream +); + +template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp( + Tensor grad_A, + Tensor grad_B, + Tensor grad_W, + Tensor grad_output, + Tensor A, + Tensor B, + Tensor C, + Tensor W, + Tensor indices_A, + Tensor indices_W_1, + Tensor indices_W_2, + Tensor indices_output_1, + Tensor indices_output_2, + void* cuda_stream +); + +template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp( + Tensor grad_grad_output, + Tensor grad_A_2, + Tensor grad_B_2, + Tensor grad_W_2, + Tensor grad_grad_A, + Tensor grad_grad_B, + Tensor grad_grad_W, + Tensor grad_output, + Tensor A, + Tensor B, + Tensor C, + Tensor W, + Tensor indices_A, + Tensor indices_W_1, + Tensor indices_W_2, + Tensor indices_output_1, + Tensor indices_output_2, + void* cuda_stream +); + +template void mops::cuda::sparse_accumulation_scatter_add_with_weights_vjp_vjp( + Tensor grad_grad_output, + Tensor grad_A_2, + Tensor grad_B_2, + Tensor grad_W_2, + Tensor grad_grad_A, + Tensor grad_grad_B, + Tensor grad_grad_W, + Tensor grad_output, + Tensor A, + Tensor B, + Tensor C, + Tensor W, + Tensor indices_A, + Tensor indices_W_1, + Tensor indices_W_2, + Tensor indices_output_1, + Tensor indices_output_2, + void* cuda_stream +); \ No newline at end of file From f78161cd0991637fe921d2fd56e0d883aa511016 Mon Sep 17 00:00:00 2001 From: "Nick 
J. Browning" Date: Fri, 3 May 2024 14:02:58 +0200 Subject: [PATCH 07/28] fixed issue with opsaw instantiation --- mops/src/opsaw/opsaw.cpp | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/mops/src/opsaw/opsaw.cpp b/mops/src/opsaw/opsaw.cpp index b7eb889..4764b14 100644 --- a/mops/src/opsaw/opsaw.cpp +++ b/mops/src/opsaw/opsaw.cpp @@ -78,13 +78,13 @@ template void mops::outer_product_scatter_add_with_weights_vjp_vjp( #ifndef MOPS_CUDA_ENABLED template void mops::cuda:: - outer_product_scatter_add_with_weights(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) { + outer_product_scatter_add_with_weights(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void *) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } template void mops::cuda:: - outer_product_scatter_add_with_weights_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) { + outer_product_scatter_add_with_weights_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void *) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } @@ -102,7 +102,8 @@ void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( Tensor /*B*/, Tensor /*W*/, Tensor /*indices_W*/, - Tensor /*indices_output*/ + Tensor /*indices_output*/, + void * ) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } @@ -114,7 +115,8 @@ template void mops::cuda::outer_product_scatter_add_with_weights( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void * cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights( @@ -123,7 +125,8 @@ template void mops::cuda::outer_product_scatter_add_with_weights( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void * cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights_vjp( @@ -135,7 +138,8 @@ template void mops::cuda::outer_product_scatter_add_with_weights_vjp( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void * cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights_vjp( @@ -147,7 +151,8 @@ template void mops::cuda::outer_product_scatter_add_with_weights_vjp( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void * cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( @@ -163,7 +168,8 @@ template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void * cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( @@ -179,7 +185,8 @@ template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp Tensor B, Tensor W, Tensor indices_W, - Tensor indices_output + Tensor indices_output, + void * cuda_stream ); #endif \ No newline at end of file From e89350b3e9be5ce149a7651b3b8299ea27c89fb6 Mon Sep 17 00:00:00 2001 From: "Nick J. Browning" Date: Fri, 3 May 2024 14:03:29 +0200 Subject: [PATCH 08/28] formatting. 
--- mops/src/opsaw/opsaw.cpp | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/mops/src/opsaw/opsaw.cpp b/mops/src/opsaw/opsaw.cpp index 4764b14..6e30c41 100644 --- a/mops/src/opsaw/opsaw.cpp +++ b/mops/src/opsaw/opsaw.cpp @@ -78,33 +78,19 @@ template void mops::outer_product_scatter_add_with_weights_vjp_vjp( #ifndef MOPS_CUDA_ENABLED template void mops::cuda:: - outer_product_scatter_add_with_weights(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void *) { + outer_product_scatter_add_with_weights(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } template void mops::cuda:: - outer_product_scatter_add_with_weights_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void *) { + outer_product_scatter_add_with_weights_vjp(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } template -void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( - Tensor /*grad_grad_output*/, - Tensor /*grad_A_2*/, - Tensor /*grad_B_2*/, - Tensor /*grad_W_2*/, - Tensor /*grad_grad_A*/, - Tensor /*grad_grad_B*/, - Tensor /*grad_grad_W*/, - Tensor /*grad_output*/, - Tensor /*A*/, - Tensor /*B*/, - Tensor /*W*/, - Tensor /*indices_W*/, - Tensor /*indices_output*/, - void * -) { +void mops::cuda:: + outer_product_scatter_add_with_weights_vjp_vjp(Tensor /*grad_grad_output*/, Tensor /*grad_A_2*/, Tensor /*grad_B_2*/, Tensor /*grad_W_2*/, Tensor /*grad_grad_A*/, Tensor /*grad_grad_B*/, Tensor /*grad_grad_W*/, Tensor /*grad_output*/, Tensor /*A*/, Tensor /*B*/, Tensor /*W*/, Tensor /*indices_W*/, Tensor /*indices_output*/, void*) { throw std::runtime_error("MOPS was not compiled with CUDA support"); } @@ -116,7 +102,7 @@ template void mops::cuda::outer_product_scatter_add_with_weights( Tensor W, Tensor indices_W, Tensor indices_output, - void * cuda_stream + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights( @@ -126,7 +112,7 @@ template void mops::cuda::outer_product_scatter_add_with_weights( Tensor W, Tensor indices_W, Tensor indices_output, - void * cuda_stream + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights_vjp( @@ -139,7 +125,7 @@ template void mops::cuda::outer_product_scatter_add_with_weights_vjp( Tensor W, Tensor indices_W, Tensor indices_output, - void * cuda_stream + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights_vjp( @@ -152,7 +138,7 @@ template void mops::cuda::outer_product_scatter_add_with_weights_vjp( Tensor W, Tensor indices_W, Tensor indices_output, - void * cuda_stream + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( @@ -169,7 +155,7 @@ template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( Tensor W, Tensor indices_W, Tensor indices_output, - void * cuda_stream + void* cuda_stream ); template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp( @@ -186,7 +172,7 @@ template void mops::cuda::outer_product_scatter_add_with_weights_vjp_vjp Tensor W, Tensor indices_W, Tensor indices_output, - void * cuda_stream + void* cuda_stream ); #endif \ No newline at end of file From ea54e9afa333d487558c4533b661cf081bbc0526 Mon Sep 17 00:00:00 2001 From: Prashanth Kanduri Date: Fri, 3 May 2024 15:28:33 +0200 Subject: [PATCH 09/28] add comments on the test job script --- 
ci/pipeline.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/pipeline.yml b/ci/pipeline.yml index 61af310..0d34917 100644 --- a/ci/pipeline.yml +++ b/ci/pipeline.yml @@ -22,7 +22,9 @@ test_job: script: - export CUDA_HOME="/usr/local/cuda" - python3 -m pip install --upgrade pip + - echo "Install Tox" - python3 -m pip install tox + - echo "Run the Tox Script" - tox variables: From 1bb35aab6c1c8d5fe07c9855079d8020d8527344 Mon Sep 17 00:00:00 2001 From: Prashanth Kanduri Date: Fri, 3 May 2024 15:32:26 +0200 Subject: [PATCH 10/28] insignificant commit for retriggering pipeline --- ci/pipeline.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/pipeline.yml b/ci/pipeline.yml index 0d34917..87cabd9 100644 --- a/ci/pipeline.yml +++ b/ci/pipeline.yml @@ -23,9 +23,10 @@ test_job: - export CUDA_HOME="/usr/local/cuda" - python3 -m pip install --upgrade pip - echo "Install Tox" - - python3 -m pip install tox + - python3 -m pip install too - echo "Run the Tox Script" - tox + - echo "Too script completed" variables: SLURM_JOB_NUM_NODES: 1 From b9be358b5a3a17f75ec2bd63df614feeb3096268 Mon Sep 17 00:00:00 2001 From: Filippo Bigi <98903385+frostedoyster@users.noreply.github.com> Date: Sat, 4 May 2024 08:19:54 +0200 Subject: [PATCH 11/28] Install tox --- ci/pipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/pipeline.yml b/ci/pipeline.yml index 87cabd9..21625eb 100644 --- a/ci/pipeline.yml +++ b/ci/pipeline.yml @@ -23,10 +23,10 @@ test_job: - export CUDA_HOME="/usr/local/cuda" - python3 -m pip install --upgrade pip - echo "Install Tox" - - python3 -m pip install too + - python3 -m pip install tox - echo "Run the Tox Script" - tox - - echo "Too script completed" + - echo "Tox script completed" variables: SLURM_JOB_NUM_NODES: 1 From d9fc283cc636f46a0be26114ace25a97cc6923c1 Mon Sep 17 00:00:00 2001 From: Prashanth Kanduri Date: Mon, 6 May 2024 13:29:41 +0200 Subject: [PATCH 12/28] attempt addressing CI compiler errors and warnings --- mops/src/opsa/cpu.tpp | 2 +- mops/src/sap/sap.cu | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mops/src/opsa/cpu.tpp b/mops/src/opsa/cpu.tpp index da347ad..570b16a 100644 --- a/mops/src/opsa/cpu.tpp +++ b/mops/src/opsa/cpu.tpp @@ -93,7 +93,7 @@ void mops::outer_product_scatter_add_vjp( scalar_t *grad_output_ptr = grad_output.data; scalar_t *a_ptr = A.data; scalar_t *b_ptr = B.data; - int32_t *indices_output_ptr = indices_output.data; + [[maybe_unused]] int32_t *indices_output_ptr = indices_output.data; #pragma omp parallel for for (size_t i = 0; i < size_ab; i++) { diff --git a/mops/src/sap/sap.cu b/mops/src/sap/sap.cu index f6adea6..27c246a 100644 --- a/mops/src/sap/sap.cu +++ b/mops/src/sap/sap.cu @@ -71,7 +71,7 @@ __global__ void sparse_accumulation_of_products_kernel( int a_idx = (packed_indices[k] >> 16) & 0xFF; atomicAdd( - buffer_out + out_idx * WARP_SIZE + laneID, + &buffer_out + out_idx * WARP_SIZE + laneID, C.data[k] * buffer_A[a_idx * WARP_SIZE + laneID] * buffer_B[b_idx * WARP_SIZE + laneID] ); } @@ -233,14 +233,14 @@ __global__ void sparse_accumulation_of_products_vjp_kernel( if (grad_A.data != nullptr) { atomicAdd( - buffer_gradA + a_idx * WARP_SIZE + laneID, + &buffer_gradA + a_idx * WARP_SIZE + laneID, C.data[k] * buffer_B[b_idx * WARP_SIZE + laneID] * buffer_gradout[out_idx * WARP_SIZE + laneID] ); } if (grad_B.data != nullptr) { atomicAdd( - buffer_gradB + b_idx * WARP_SIZE + laneID, + &buffer_gradB + b_idx * WARP_SIZE + laneID, C.data[k] * buffer_A[a_idx * WARP_SIZE + 
laneID] * buffer_gradout[out_idx * WARP_SIZE + laneID] ); From af2ec607ea6f584cbdda731743f0a7d20a3c9a64 Mon Sep 17 00:00:00 2001 From: Prashanth Kanduri Date: Mon, 6 May 2024 15:02:51 +0200 Subject: [PATCH 13/28] remove polynomial order zero case to avoid divide by zero issue --- mops/src/hpe/hpe.cu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index 964c4b4..c4d1735 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -313,10 +313,6 @@ void mops::cuda::homogeneous_polynomial_evaluation_vjp( if (polynomial_order <= 10) { switch (polynomial_order) { - case 0: - homogeneous_polynomial_evaluation_vjp_kernel - <<>>(grad_A, grad_output, A, C, indices_A); - break; case 1: homogeneous_polynomial_evaluation_vjp_kernel <<>>(grad_A, grad_output, A, C, indices_A); From 9fee2a1741104b3ce80b1fc172ded90e4f1c4d2a Mon Sep 17 00:00:00 2001 From: Prashanth Kanduri Date: Mon, 6 May 2024 15:04:47 +0200 Subject: [PATCH 14/28] correct address of the atomic add operations in sap --- mops/src/sap/sap.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mops/src/sap/sap.cu b/mops/src/sap/sap.cu index f15fb69..1a82982 100644 --- a/mops/src/sap/sap.cu +++ b/mops/src/sap/sap.cu @@ -508,14 +508,14 @@ __global__ void sparse_accumulation_of_products_vjp_vjp_kernel( if (grad_grad_output.data != nullptr) { atomicAdd( - buffer_grad_grad_output + out_idx * WARP_SIZE + laneID, + &buffer_grad_grad_output + out_idx * WARP_SIZE + laneID, grad_grad_A_k * buffer_B[b_idx * WARP_SIZE + laneID] * c ); } if (grad_B_2.data != nullptr) { atomicAdd( - buffer_grad_B2 + b_idx * WARP_SIZE + laneID, + &buffer_grad_B2 + b_idx * WARP_SIZE + laneID, grad_grad_A_k * buffer_grad_output[out_idx * WARP_SIZE + laneID] * c ); } @@ -526,14 +526,14 @@ __global__ void sparse_accumulation_of_products_vjp_vjp_kernel( if (grad_grad_output.data != nullptr) { atomicAdd( - buffer_grad_grad_output + out_idx * WARP_SIZE + laneID, + &buffer_grad_grad_output + out_idx * WARP_SIZE + laneID, grad_grad_B_k * buffer_A[a_idx * WARP_SIZE + laneID] * c ); } if (grad_A_2.data != nullptr) { atomicAdd( - buffer_grad_A2 + a_idx * WARP_SIZE + laneID, + &buffer_grad_A2 + a_idx * WARP_SIZE + laneID, grad_grad_B_k * buffer_grad_output[out_idx * WARP_SIZE + laneID] * c ); } From fc80766cb65d5a0bf72c8f8bbbf31f14b975846a Mon Sep 17 00:00:00 2001 From: Prashanth Kanduri Date: Mon, 6 May 2024 15:30:54 +0200 Subject: [PATCH 15/28] specify addresses with indices for atomic add operations to address CI errors --- mops/src/hpe/hpe.cu | 2 +- mops/src/opsa/cpu.tpp | 2 +- mops/src/sap/sap.cu | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index c4d1735..de22035 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -311,7 +311,7 @@ void mops::cuda::homogeneous_polynomial_evaluation_vjp( shared_array(2 * nnu1, sptr, &space); shared_array(NWARPS_PER_BLOCK * WARP_SIZE * polynomial_order, sptr, &space); - if (polynomial_order <= 10) { + if (polynomial_order > 0 && polynomial_order <= 10) { switch (polynomial_order) { case 1: homogeneous_polynomial_evaluation_vjp_kernel diff --git a/mops/src/opsa/cpu.tpp b/mops/src/opsa/cpu.tpp index 1bc9d68..e628e2e 100644 --- a/mops/src/opsa/cpu.tpp +++ b/mops/src/opsa/cpu.tpp @@ -167,7 +167,7 @@ void mops::outer_product_scatter_add_vjp_vjp( scalar_t *grad_output_ptr = grad_output.data; scalar_t *a_ptr = A.data; scalar_t *b_ptr = B.data; - int32_t *indices_output_ptr = indices_output.data; + 
[[maybe_unused]] int32_t *indices_output_ptr = indices_output.data; scalar_t *grad_output_ptr_i = nullptr; scalar_t *a_ptr_i = nullptr; diff --git a/mops/src/sap/sap.cu b/mops/src/sap/sap.cu index 1a82982..48dbe84 100644 --- a/mops/src/sap/sap.cu +++ b/mops/src/sap/sap.cu @@ -71,7 +71,7 @@ __global__ void sparse_accumulation_of_products_kernel( int a_idx = (packed_indices[k] >> 16) & 0xFF; atomicAdd( - &buffer_out + out_idx * WARP_SIZE + laneID, + &buffer_out[out_idx * WARP_SIZE + laneID], C.data[k] * buffer_A[a_idx * WARP_SIZE + laneID] * buffer_B[b_idx * WARP_SIZE + laneID] ); } @@ -236,14 +236,14 @@ __global__ void sparse_accumulation_of_products_vjp_kernel( if (grad_A.data != nullptr) { atomicAdd( - &buffer_gradA + a_idx * WARP_SIZE + laneID, + &buffer_gradA[a_idx * WARP_SIZE + laneID], C.data[k] * buffer_B[b_idx * WARP_SIZE + laneID] * buffer_gradout[out_idx * WARP_SIZE + laneID] ); } if (grad_B.data != nullptr) { atomicAdd( - &buffer_gradB + b_idx * WARP_SIZE + laneID, + &buffer_gradB[b_idx * WARP_SIZE + laneID], C.data[k] * buffer_A[a_idx * WARP_SIZE + laneID] * buffer_gradout[out_idx * WARP_SIZE + laneID] ); @@ -508,14 +508,14 @@ __global__ void sparse_accumulation_of_products_vjp_vjp_kernel( if (grad_grad_output.data != nullptr) { atomicAdd( - &buffer_grad_grad_output + out_idx * WARP_SIZE + laneID, + &buffer_grad_grad_output[out_idx * WARP_SIZE + laneID], grad_grad_A_k * buffer_B[b_idx * WARP_SIZE + laneID] * c ); } if (grad_B_2.data != nullptr) { atomicAdd( - &buffer_grad_B2 + b_idx * WARP_SIZE + laneID, + &buffer_grad_B2[b_idx * WARP_SIZE + laneID], grad_grad_A_k * buffer_grad_output[out_idx * WARP_SIZE + laneID] * c ); } @@ -526,14 +526,14 @@ __global__ void sparse_accumulation_of_products_vjp_vjp_kernel( if (grad_grad_output.data != nullptr) { atomicAdd( - &buffer_grad_grad_output + out_idx * WARP_SIZE + laneID, + &buffer_grad_grad_output[out_idx * WARP_SIZE + laneID], grad_grad_B_k * buffer_A[a_idx * WARP_SIZE + laneID] * c ); } if (grad_A_2.data != nullptr) { atomicAdd( - &buffer_grad_A2 + a_idx * WARP_SIZE + laneID, + &buffer_grad_A2[a_idx * WARP_SIZE + laneID], grad_grad_B_k * buffer_grad_output[out_idx * WARP_SIZE + laneID] * c ); } From 2ed5771124a7eb02ec8efd787082d154ae0aec92 Mon Sep 17 00:00:00 2001 From: Prashanth Kanduri Date: Tue, 7 May 2024 11:06:29 +0200 Subject: [PATCH 16/28] dummy implementation for polynomial order zero --- mops/src/hpe/hpe.cu | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index de22035..8d6e1d8 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -106,6 +106,13 @@ __global__ void homogeneous_polynomial_evaluation_kernel( } } +template +__global__ void homogeneous_polynomial_evaluation_kernel( + Tensor output, Tensor A, Tensor C, Tensor indices_A +) { + // dummy implementation for polynomial order zero +} + template void mops::cuda::homogeneous_polynomial_evaluation( Tensor output, Tensor A, Tensor C, Tensor indices_A @@ -128,12 +135,8 @@ void mops::cuda::homogeneous_polynomial_evaluation( shared_array(NWARPS_PER_BLOCK, sptr, &space); shared_array(WARP_SIZE * NWARPS_PER_BLOCK * polynomial_order, sptr, &space); - if (polynomial_order <= 10) { + if (polynomial_order > 0 && polynomial_order <= 10) { switch (polynomial_order) { - case 0: - homogeneous_polynomial_evaluation_kernel - <<>>(output, A, C, indices_A); - break; case 1: homogeneous_polynomial_evaluation_kernel <<>>(output, A, C, indices_A); From 62b73d8df162c957a0e3151fa4a6e86f4bd724b4 Mon Sep 17 00:00:00 
2001 From: "Nicholas J. Browning" Date: Fri, 10 May 2024 15:07:08 +0200 Subject: [PATCH 17/28] removed bug in CUDA HPE. --- mops/src/hpe/hpe.cu | 7 ------- 1 file changed, 7 deletions(-) diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index 8d6e1d8..1d9e2ea 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -106,13 +106,6 @@ __global__ void homogeneous_polynomial_evaluation_kernel( } } -template -__global__ void homogeneous_polynomial_evaluation_kernel( - Tensor output, Tensor A, Tensor C, Tensor indices_A -) { - // dummy implementation for polynomial order zero -} - template void mops::cuda::homogeneous_polynomial_evaluation( Tensor output, Tensor A, Tensor C, Tensor indices_A From b6d479529b3896681548054175a90f45ae3dc4fa Mon Sep 17 00:00:00 2001 From: "Nicholas J. Browning" Date: Fri, 10 May 2024 15:45:03 +0200 Subject: [PATCH 18/28] header change --- mops/src/hpe/hpe.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index 1d9e2ea..d2238c9 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -1,3 +1,5 @@ +#include + #include "mops/hpe.hpp" #include "internal/checks/hpe.hpp" From 68223e25ed837a90e8c4bca3a1c35939206c6220 Mon Sep 17 00:00:00 2001 From: "Nicholas J. Browning" Date: Fri, 10 May 2024 16:10:28 +0200 Subject: [PATCH 19/28] HPE divide by zero fix. --- mops/src/hpe/hpe.cu | 103 ++++++++++++++++++++++++++------------------ 1 file changed, 62 insertions(+), 41 deletions(-) diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index d2238c9..2c75e65 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -1,5 +1,3 @@ -#include - #include "mops/hpe.hpp" #include "internal/checks/hpe.hpp" @@ -55,14 +53,20 @@ __global__ void homogeneous_polynomial_evaluation_kernel( __syncthreads(); - int32_t i_monomial = threadIdx.x % polynomial_order; - int32_t x = threadIdx.x / polynomial_order; - int32_t nx = blockDim.x / polynomial_order; + int32_t i_monomial; + int32_t x; + int32_t nx; + + if (polynomial_order > 0) { + i_monomial = threadIdx.x % polynomial_order; + x = threadIdx.x / polynomial_order; + nx = blockDim.x / polynomial_order; - for (int lbasis = x; lbasis < blockDim.x; lbasis += nx) { - if (i_monomial * blockDim.x + lbasis < polynomial_order * blockDim.x) { - buffer_indices_A[i_monomial * blockDim.x + lbasis] = - indices_A.data[(i + lbasis) * polynomial_order + i_monomial]; + for (int lbasis = x; lbasis < blockDim.x; lbasis += nx) { + if (i_monomial * blockDim.x + lbasis < polynomial_order * blockDim.x) { + buffer_indices_A[i_monomial * blockDim.x + lbasis] = + indices_A.data[(i + lbasis) * polynomial_order + i_monomial]; + } } } @@ -130,8 +134,12 @@ void mops::cuda::homogeneous_polynomial_evaluation( shared_array(NWARPS_PER_BLOCK, sptr, &space); shared_array(WARP_SIZE * NWARPS_PER_BLOCK * polynomial_order, sptr, &space); - if (polynomial_order > 0 && polynomial_order <= 10) { + if (polynomial_order <= 10) { switch (polynomial_order) { + case 0: + homogeneous_polynomial_evaluation_kernel + <<>>(output, A, C, indices_A); + break; case 1: homogeneous_polynomial_evaluation_kernel <<>>(output, A, C, indices_A); @@ -229,51 +237,56 @@ __global__ void homogeneous_polynomial_evaluation_vjp_kernel( __syncthreads(); scalar_t gout = grad_output.data[batch_id]; + if (polynomial_order > 0) { + // indices_A : nbasis, polynomial_order + for (int32_t i = 0; i < nbasis; i += blockDim.x) { - // indices_A : nbasis, polynomial_order - for (int32_t i = 0; i < nbasis; i += blockDim.x) { + __syncthreads(); - __syncthreads(); + 
int32_t basis = i + threadIdx.x; + + int32_t i_monomial; + int32_t x; + int32_t nx; - int32_t i_monomial = threadIdx.x % polynomial_order; - int32_t x = threadIdx.x / polynomial_order; - int32_t nx = blockDim.x / polynomial_order; + i_monomial = threadIdx.x % polynomial_order; + x = threadIdx.x / polynomial_order; + nx = blockDim.x / polynomial_order; - for (int lbasis = x; lbasis < blockDim.x; lbasis += nx) { - if (i_monomial * blockDim.x + lbasis < polynomial_order * blockDim.x) { - buffer_indices_A[i_monomial * blockDim.x + lbasis] = - indices_A.data[(i + lbasis) * polynomial_order + i_monomial]; + for (int lbasis = x; lbasis < blockDim.x; lbasis += nx) { + if (i_monomial * blockDim.x + lbasis < polynomial_order * blockDim.x) { + buffer_indices_A[i_monomial * blockDim.x + lbasis] = + indices_A.data[(i + lbasis) * polynomial_order + i_monomial]; + } } - } - __syncthreads(); + __syncthreads(); - int32_t basis = i + threadIdx.x; + if (basis < nbasis) { - if (basis < nbasis) { + scalar_t c = C.data[basis] * gout; - scalar_t c = C.data[basis] * gout; + for (int32_t i_monomial = 0; i_monomial < polynomial_order; i_monomial++) { - for (int32_t i_monomial = 0; i_monomial < polynomial_order; i_monomial++) { + scalar_t tmp_i = c; + + for (int32_t j_monomial = 0; j_monomial < polynomial_order; j_monomial++) { - scalar_t tmp_i = c; + if (i_monomial == j_monomial) { + continue; + } - for (int32_t j_monomial = 0; j_monomial < polynomial_order; j_monomial++) { + int32_t idx_j = buffer_indices_A + [j_monomial * blockDim.x + threadIdx.x]; // indices_A.data[j_monomial + // * indices_A.shape[0] + basis]; - if (i_monomial == j_monomial) { - continue; + tmp_i *= buffer_nu1[idx_j]; } - int32_t idx_j = - buffer_indices_A[j_monomial * blockDim.x + threadIdx.x]; // indices_A.data[j_monomial - // * indices_A.shape[0] + basis]; + int32_t idx_i = buffer_indices_A[i_monomial * blockDim.x + threadIdx.x]; - tmp_i *= buffer_nu1[idx_j]; + atomicAdd(&buffer_gradA[idx_i], tmp_i); } - - int32_t idx_i = buffer_indices_A[i_monomial * blockDim.x + threadIdx.x]; - - atomicAdd(&buffer_gradA[idx_i], tmp_i); } } } @@ -281,7 +294,11 @@ __global__ void homogeneous_polynomial_evaluation_vjp_kernel( __syncthreads(); for (int32_t i = threadIdx.x; i < nnu1; i += blockDim.x) { - grad_A.data[batch_id * nnu1 + i] = buffer_gradA[i]; + if (polynomial_order > 0) { + grad_A.data[batch_id * nnu1 + i] = buffer_gradA[i]; + } else { + grad_A.data[batch_id * nnu1 + i] = 0.0; + } } } @@ -309,8 +326,12 @@ void mops::cuda::homogeneous_polynomial_evaluation_vjp( shared_array(2 * nnu1, sptr, &space); shared_array(NWARPS_PER_BLOCK * WARP_SIZE * polynomial_order, sptr, &space); - if (polynomial_order > 0 && polynomial_order <= 10) { + if (polynomial_order <= 10) { switch (polynomial_order) { + case 0: + homogeneous_polynomial_evaluation_vjp_kernel + <<>>(grad_A, grad_output, A, C, indices_A); + break; case 1: homogeneous_polynomial_evaluation_vjp_kernel <<>>(grad_A, grad_output, A, C, indices_A); @@ -410,4 +431,4 @@ template void mops::cuda::homogeneous_polynomial_evaluation_vjp_vjp( Tensor A, Tensor C, Tensor indices_A -); +); \ No newline at end of file From 3b4527f4a9e4e22d541267d3b303a1132e2cc8f1 Mon Sep 17 00:00:00 2001 From: "Nicholas J. 
Browning" Date: Fri, 10 May 2024 16:24:09 +0200 Subject: [PATCH 20/28] added in code for pre-sm60 atomicAdd(doubles) --- mops/src/hpe/hpe.cu | 5 ++++- mops/src/internal/cuda_utils.cu | 16 ++++++++++++++++ mops/src/internal/cuda_utils.cuh | 2 ++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index 2c75e65..e599480 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -284,8 +284,11 @@ __global__ void homogeneous_polynomial_evaluation_vjp_kernel( } int32_t idx_i = buffer_indices_A[i_monomial * blockDim.x + threadIdx.x]; - +#if __CUDA_ARCH__ < 600 + atomicAdd_presm60(&buffer_gradA[idx_i], tmp_i); +#else atomicAdd(&buffer_gradA[idx_i], tmp_i); +#endif } } } diff --git a/mops/src/internal/cuda_utils.cu b/mops/src/internal/cuda_utils.cu index 3af533e..878c556 100644 --- a/mops/src/internal/cuda_utils.cu +++ b/mops/src/internal/cuda_utils.cu @@ -11,6 +11,22 @@ __host__ __device__ int32_t find_integer_divisor(int32_t x, int32_t bdim) { return (x + bdim - 1) / bdim; } +__device__ double atomicAdd_presm60(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + old = atomicCAS( + address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)) + ); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __longlong_as_double(old); +} + template __host__ __device__ T* shared_array(std::size_t n_elements, void*& ptr, std::size_t* space) noexcept { const std::uintptr_t inptr = reinterpret_cast(ptr); diff --git a/mops/src/internal/cuda_utils.cuh b/mops/src/internal/cuda_utils.cuh index b8c25be..a58b514 100644 --- a/mops/src/internal/cuda_utils.cuh +++ b/mops/src/internal/cuda_utils.cuh @@ -23,6 +23,8 @@ using namespace std; } \ } while (0) +__device__ double atomicAdd_presm60(double* address, double val); + __host__ __device__ int32_t find_integer_divisor(int32_t x, int32_t bdim); /* From e3c70c2c53ad8df6677b3e9d9b4f5f76c08af4aa Mon Sep 17 00:00:00 2001 From: "Nicholas J. 
Browning" Date: Fri, 10 May 2024 16:27:42 +0200 Subject: [PATCH 21/28] macro to switch out the atomicAdds depending on ARCH --- mops/src/hpe/hpe.cu | 7 ++----- mops/src/internal/cuda_utils.cuh | 6 ++++++ mops/src/sap/sap.cu | 14 +++++++------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/mops/src/hpe/hpe.cu b/mops/src/hpe/hpe.cu index e599480..100cbce 100644 --- a/mops/src/hpe/hpe.cu +++ b/mops/src/hpe/hpe.cu @@ -284,11 +284,8 @@ __global__ void homogeneous_polynomial_evaluation_vjp_kernel( } int32_t idx_i = buffer_indices_A[i_monomial * blockDim.x + threadIdx.x]; -#if __CUDA_ARCH__ < 600 - atomicAdd_presm60(&buffer_gradA[idx_i], tmp_i); -#else - atomicAdd(&buffer_gradA[idx_i], tmp_i); -#endif + + ATOMIC_ADD(&buffer_gradA[idx_i], tmp_i); } } } diff --git a/mops/src/internal/cuda_utils.cuh b/mops/src/internal/cuda_utils.cuh index a58b514..416417f 100644 --- a/mops/src/internal/cuda_utils.cuh +++ b/mops/src/internal/cuda_utils.cuh @@ -23,6 +23,12 @@ using namespace std; } \ } while (0) +#if __CUDA_ARCH__ < 600 +#define ATOMIC_ADD(address, val) atomicAdd_presm60(address, val) +#else +#define ATOMIC_ADD(address, val) atomicAdd(address, val) +#endif + __device__ double atomicAdd_presm60(double* address, double val); __host__ __device__ int32_t find_integer_divisor(int32_t x, int32_t bdim); diff --git a/mops/src/sap/sap.cu b/mops/src/sap/sap.cu index 48dbe84..5e17295 100644 --- a/mops/src/sap/sap.cu +++ b/mops/src/sap/sap.cu @@ -70,7 +70,7 @@ __global__ void sparse_accumulation_of_products_kernel( int b_idx = (packed_indices[k] >> 8) & 0xFF; int a_idx = (packed_indices[k] >> 16) & 0xFF; - atomicAdd( + ATOMIC_ADD( &buffer_out[out_idx * WARP_SIZE + laneID], C.data[k] * buffer_A[a_idx * WARP_SIZE + laneID] * buffer_B[b_idx * WARP_SIZE + laneID] ); @@ -235,14 +235,14 @@ __global__ void sparse_accumulation_of_products_vjp_kernel( int a_idx = (packed_indices[k] >> 16) & 0xFF; if (grad_A.data != nullptr) { - atomicAdd( + ATOMIC_ADD( &buffer_gradA[a_idx * WARP_SIZE + laneID], C.data[k] * buffer_B[b_idx * WARP_SIZE + laneID] * buffer_gradout[out_idx * WARP_SIZE + laneID] ); } if (grad_B.data != nullptr) { - atomicAdd( + ATOMIC_ADD( &buffer_gradB[b_idx * WARP_SIZE + laneID], C.data[k] * buffer_A[a_idx * WARP_SIZE + laneID] * buffer_gradout[out_idx * WARP_SIZE + laneID] @@ -507,14 +507,14 @@ __global__ void sparse_accumulation_of_products_vjp_vjp_kernel( scalar_t grad_grad_A_k = buffer_grad_grad_A[a_idx * WARP_SIZE + laneID]; if (grad_grad_output.data != nullptr) { - atomicAdd( + ATOMIC_ADD( &buffer_grad_grad_output[out_idx * WARP_SIZE + laneID], grad_grad_A_k * buffer_B[b_idx * WARP_SIZE + laneID] * c ); } if (grad_B_2.data != nullptr) { - atomicAdd( + ATOMIC_ADD( &buffer_grad_B2[b_idx * WARP_SIZE + laneID], grad_grad_A_k * buffer_grad_output[out_idx * WARP_SIZE + laneID] * c ); @@ -525,14 +525,14 @@ __global__ void sparse_accumulation_of_products_vjp_vjp_kernel( scalar_t grad_grad_B_k = buffer_grad_grad_B[b_idx * WARP_SIZE + laneID]; if (grad_grad_output.data != nullptr) { - atomicAdd( + ATOMIC_ADD( &buffer_grad_grad_output[out_idx * WARP_SIZE + laneID], grad_grad_B_k * buffer_A[a_idx * WARP_SIZE + laneID] * c ); } if (grad_A_2.data != nullptr) { - atomicAdd( + ATOMIC_ADD( &buffer_grad_A2[a_idx * WARP_SIZE + laneID], grad_grad_B_k * buffer_grad_output[out_idx * WARP_SIZE + laneID] * c ); From ade1cd3ee84e097c986f5be4df813281353b5471 Mon Sep 17 00:00:00 2001 From: "Nicholas J. Browning" Date: Fri, 10 May 2024 16:33:18 +0200 Subject: [PATCH 22/28] documentation. 
--- mops/src/internal/cuda_utils.cuh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mops/src/internal/cuda_utils.cuh b/mops/src/internal/cuda_utils.cuh index 416417f..7aec588 100644 --- a/mops/src/internal/cuda_utils.cuh +++ b/mops/src/internal/cuda_utils.cuh @@ -23,12 +23,19 @@ using namespace std; } \ } while (0) +/* + * Macro to select the right version of atomicAdd for the archcode being compiled. + */ #if __CUDA_ARCH__ < 600 #define ATOMIC_ADD(address, val) atomicAdd_presm60(address, val) #else #define ATOMIC_ADD(address, val) atomicAdd(address, val) #endif +/* + * Pre SM60 cards do not support atomicAdd(double *, double). This function implements and atomicCAS + * to lock update the address. + */ __device__ double atomicAdd_presm60(double* address, double val); __host__ __device__ int32_t find_integer_divisor(int32_t x, int32_t bdim); From 263984d4be133ad8f78767ed311f7ceee5e10583 Mon Sep 17 00:00:00 2001 From: "Nicholas J. Browning" Date: Fri, 10 May 2024 17:02:33 +0200 Subject: [PATCH 23/28] changed macro to device function. --- mops/src/internal/cuda_utils.cu | 15 +++++++++++++++ mops/src/internal/cuda_utils.cuh | 14 +++++--------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/mops/src/internal/cuda_utils.cu b/mops/src/internal/cuda_utils.cu index 878c556..532d168 100644 --- a/mops/src/internal/cuda_utils.cu +++ b/mops/src/internal/cuda_utils.cu @@ -27,6 +27,21 @@ __device__ double atomicAdd_presm60(double* address, double val) { return __longlong_as_double(old); } +template __device__ scalar_t ATOMIC_ADD(scalar_t* address, scalar_t val) { +#if __CUDA_ARCH__ < 600 + if constexpr (sizeof(scalar_t) == 4) { + return atomicAdd(address, val); + } else if constexpr (sizeof(scalar_t) == 8) { + return atomicAdd_presm60(address, val); + } +#else + return atomicAdd(address, val); +#endif +} + +template float ATOMIC_ADD(float* address, float val); +template double ATOMIC_ADD(double* address, double val); + template __host__ __device__ T* shared_array(std::size_t n_elements, void*& ptr, std::size_t* space) noexcept { const std::uintptr_t inptr = reinterpret_cast(ptr); diff --git a/mops/src/internal/cuda_utils.cuh b/mops/src/internal/cuda_utils.cuh index 7aec588..83645cf 100644 --- a/mops/src/internal/cuda_utils.cuh +++ b/mops/src/internal/cuda_utils.cuh @@ -23,21 +23,17 @@ using namespace std; } \ } while (0) -/* - * Macro to select the right version of atomicAdd for the archcode being compiled. - */ -#if __CUDA_ARCH__ < 600 -#define ATOMIC_ADD(address, val) atomicAdd_presm60(address, val) -#else -#define ATOMIC_ADD(address, val) atomicAdd(address, val) -#endif - /* * Pre SM60 cards do not support atomicAdd(double *, double). This function implements and atomicCAS * to lock update the address. */ __device__ double atomicAdd_presm60(double* address, double val); +/* + * function to select the right version of atomicAdd for the archcode being compiled. + */ +template __device__ scalar_t ATOMIC_ADD(scalar_t* address, scalar_t val); + __host__ __device__ int32_t find_integer_divisor(int32_t x, int32_t bdim); /* From 899ea941095dcf0d0337d3516ede7a4b6b606df4 Mon Sep 17 00:00:00 2001 From: "Nicholas J. 
Browning" Date: Fri, 10 May 2024 17:40:12 +0200 Subject: [PATCH 24/28] fixed sap cstream --- mops/src/sap/sap.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mops/src/sap/sap.cu b/mops/src/sap/sap.cu index f370265..305f9d8 100644 --- a/mops/src/sap/sap.cu +++ b/mops/src/sap/sap.cu @@ -701,7 +701,7 @@ void mops::cuda::sparse_accumulation_of_products_vjp_vjp( int32_t* packed_indices = shared_array(indices_A.shape[0], sptr, &space); - sparse_accumulation_of_products_vjp_vjp_kernel<<>>( + sparse_accumulation_of_products_vjp_vjp_kernel<<>>( grad_grad_output, grad_A_2, grad_B_2, From 99cedf5e4afc75169688eeb1e8b7a471d545bebc Mon Sep 17 00:00:00 2001 From: "Nicholas J. Browning" Date: Fri, 10 May 2024 17:43:01 +0200 Subject: [PATCH 25/28] formatting. --- mops/src/sap/sap.cu | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/mops/src/sap/sap.cu b/mops/src/sap/sap.cu index 305f9d8..55d7716 100644 --- a/mops/src/sap/sap.cu +++ b/mops/src/sap/sap.cu @@ -124,8 +124,9 @@ void mops::cuda::sparse_accumulation_of_products( shared_array(WARP_SIZE * B.shape[1], sptr, &space); shared_array(indices_A.shape[0], sptr, &space); - sparse_accumulation_of_products_kernel - <<>>(output, A, B, C, indices_A, indices_B, indices_output); + sparse_accumulation_of_products_kernel<<>>( + output, A, B, C, indices_A, indices_B, indices_output + ); CUDA_CHECK_ERROR(cudaGetLastError()); CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); @@ -344,9 +345,10 @@ void mops::cuda::sparse_accumulation_of_products_vjp( shared_array(WARP_SIZE * grad_A.shape[1], sptr, &space); } - sparse_accumulation_of_products_vjp_kernel<<>>( - grad_A, grad_B, grad_output, A, B, C, indices_A, indices_B, indices_output - ); + sparse_accumulation_of_products_vjp_kernel + <<>>( + grad_A, grad_B, grad_output, A, B, C, indices_A, indices_B, indices_output + ); CUDA_CHECK_ERROR(cudaGetLastError()); CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); @@ -701,20 +703,21 @@ void mops::cuda::sparse_accumulation_of_products_vjp_vjp( int32_t* packed_indices = shared_array(indices_A.shape[0], sptr, &space); - sparse_accumulation_of_products_vjp_vjp_kernel<<>>( - grad_grad_output, - grad_A_2, - grad_B_2, - grad_grad_A, - grad_grad_B, - grad_output, - A, - B, - C, - indices_A, - indices_B, - indices_output - ); + sparse_accumulation_of_products_vjp_vjp_kernel + <<>>( + grad_grad_output, + grad_A_2, + grad_B_2, + grad_grad_A, + grad_grad_B, + grad_output, + A, + B, + C, + indices_A, + indices_B, + indices_output + ); CUDA_CHECK_ERROR(cudaGetLastError()); CUDA_CHECK_ERROR(cudaStreamSynchronize(cstream)); From 21b6235d3caf863e0323c598b113138459c92a6d Mon Sep 17 00:00:00 2001 From: "Nicholas J. 
Browning" Date: Fri, 10 May 2024 17:49:12 +0200 Subject: [PATCH 26/28] missing device guard in SAP --- mops-torch/src/sap.cpp | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/mops-torch/src/sap.cpp b/mops-torch/src/sap.cpp index aca6d49..83c904f 100644 --- a/mops-torch/src/sap.cpp +++ b/mops-torch/src/sap.cpp @@ -64,6 +64,14 @@ torch::Tensor SparseAccumulationOfProducts::forward( ); }); } else if (A.device().is_cuda()) { + +#ifndef MOPS_CUDA_ENABLED + C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str()); +#else + c10::cuda::CUDAGuard deviceGuard{A.device()}; + cudaStream_t currstream = c10::cuda::getCurrentCUDAStream(); + void* stream = reinterpret_cast(currstream); + output = torch::empty( {A.size(0), output_size}, torch::TensorOptions().dtype(A.scalar_type()).device(A.device()) @@ -77,9 +85,11 @@ torch::Tensor SparseAccumulationOfProducts::forward( details::torch_to_mops_1d(C), details::torch_to_mops_1d(indices_A), details::torch_to_mops_1d(indices_B), - details::torch_to_mops_1d(indices_output) + details::torch_to_mops_1d(indices_output), + stream ); }); +#endif } else { C10_THROW_ERROR( ValueError, @@ -175,6 +185,14 @@ std::vector SparseAccumulationOfProductsBackward::forward( ); }); } else if (A.device().is_cuda()) { + +#ifndef MOPS_CUDA_ENABLED + C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str()); +#else + c10::cuda::CUDAGuard deviceGuard{A.device()}; + cudaStream_t currstream = c10::cuda::getCurrentCUDAStream(); + void* stream = reinterpret_cast(currstream); + AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "sparse_accumulation_of_products_vjp", [&]() { auto mops_grad_A = mops::Tensor{nullptr, {0, 0}}; if (A.requires_grad()) { @@ -197,9 +215,11 @@ std::vector SparseAccumulationOfProductsBackward::forward( details::torch_to_mops_1d(C), details::torch_to_mops_1d(indices_A), details::torch_to_mops_1d(indices_B), - details::torch_to_mops_1d(indices_output) + details::torch_to_mops_1d(indices_output), + stream ); }); +#endif } else { C10_THROW_ERROR( ValueError, @@ -281,6 +301,10 @@ std::vector SparseAccumulationOfProductsBackward::backward( #ifndef MOPS_CUDA_ENABLED C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str()); #else + c10::cuda::CUDAGuard deviceGuard{A.device()}; + cudaStream_t currstream = c10::cuda::getCurrentCUDAStream(); + void* stream = reinterpret_cast(currstream); + AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "sparse_accumulation_of_products_vjp_vjp", [&]() { auto mops_grad_grad_output = mops::Tensor{nullptr, {0, 0}}; if (grad_output.requires_grad()) { From dd29ac0ca88cf545e2ef42b6306dc1c33988e650 Mon Sep 17 00:00:00 2001 From: "Nicholas J. Browning" Date: Fri, 10 May 2024 17:51:54 +0200 Subject: [PATCH 27/28] missing stream. 
--- mops-torch/src/sap.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mops-torch/src/sap.cpp b/mops-torch/src/sap.cpp index 83c904f..c7ae354 100644 --- a/mops-torch/src/sap.cpp +++ b/mops-torch/src/sap.cpp @@ -346,7 +346,8 @@ std::vector SparseAccumulationOfProductsBackward::backward( details::torch_to_mops_1d(C), details::torch_to_mops_1d(indices_A), details::torch_to_mops_1d(indices_B), - details::torch_to_mops_1d(indices_output) + details::torch_to_mops_1d(indices_output), + stream ); }); #endif From 7f92034a9e161c9bd2cb99e4fa32c02569185d2c Mon Sep 17 00:00:00 2001 From: frostedoyster Date: Sun, 12 May 2024 19:56:02 +0200 Subject: [PATCH 28/28] Add stream benchmark --- python/mops-torch/benchmarks/stream.py | 52 ++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 python/mops-torch/benchmarks/stream.py diff --git a/python/mops-torch/benchmarks/stream.py b/python/mops-torch/benchmarks/stream.py new file mode 100644 index 0000000..ffaad7c --- /dev/null +++ b/python/mops-torch/benchmarks/stream.py @@ -0,0 +1,52 @@ +import mops.torch +import torch +from benchmark import benchmark, format_mean_std, initialize +from typing import List + +initialize() + +A = torch.rand(100, 1000, requires_grad=True) +C = torch.rand(2000) +indices_A = torch.randint(1000, size=(2000, 3), dtype=torch.int32) + +@torch.jit.script +def hpe_10(A, C, indices_A): + results = [] + for _ in range(10): + results.append(mops.torch.homogeneous_polynomial_evaluation(A, C, indices_A)) + return torch.sum(torch.concatenate(results)) + +mean_fwd, std_fwd, mean_bwd, std_bwd = benchmark( + lambda: hpe_10(A, C, indices_A) +) + + +@torch.jit.script +def hpe_10_stream(streams: List[torch.classes.cuda.Stream], A, C, indices_A): + results = [] + + for stream in streams: + with torch.cuda.stream(stream): + result = mops.torch.homogeneous_polynomial_evaluation(A, C, indices_A) + results.append(result) + + for stream in streams: + stream.synchronize() + + return torch.sum(torch.concatenate(results)) + +streams = [torch.classes.cuda.Stream() for _ in range(10)] +mean_fwd_stream, std_fwd_stream, mean_bwd_stream, std_bwd_stream = benchmark( + lambda: hpe_10_stream(streams, A, C, indices_A) +) + + +print() +print("Without CUDA streams:") +print("Forward pass:", format_mean_std(mean_fwd, std_fwd)) +print("Backward pass:", format_mean_std(mean_bwd, std_bwd)) + +print() +print("With CUDA streams:") +print("Forward pass:", format_mean_std(mean_fwd_stream, std_fwd_stream)) +print("Backward pass:", format_mean_std(mean_bwd_stream, std_bwd_stream))
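
Note on the stream benchmark above: CUDA streams only affect work submitted to a GPU, so the comparison is only meaningful when the inputs live on a CUDA device and mops was built with CUDA support; with CPU tensors the torch.cuda.stream context leaves execution unchanged and both scripted functions run the same serial path. The following is a minimal sketch of feeding the same scripted functions CUDA tensors, assuming a GPU is available and that the local benchmark / format_mean_std helpers behave as they are used in the file above (their implementation is not shown in this patch series):

    # Sketch only: run the same comparison with the inputs on the GPU, where the
    # per-stream launches in hpe_10_stream have a chance to overlap.
    # Assumes a CUDA-enabled build of mops and reuses `streams`, `hpe_10`,
    # `hpe_10_stream`, `benchmark` and `format_mean_std` from the file above.
    import torch

    if torch.cuda.is_available():
        device = torch.device("cuda")
        A_cuda = torch.rand(100, 1000, device=device, requires_grad=True)
        C_cuda = torch.rand(2000, device=device)
        indices_A_cuda = torch.randint(
            1000, size=(2000, 3), dtype=torch.int32, device=device
        )

        mean_fwd_gpu, std_fwd_gpu, mean_bwd_gpu, std_bwd_gpu = benchmark(
            lambda: hpe_10(A_cuda, C_cuda, indices_A_cuda)
        )
        mean_fwd_gpu_s, std_fwd_gpu_s, mean_bwd_gpu_s, std_bwd_gpu_s = benchmark(
            lambda: hpe_10_stream(streams, A_cuda, C_cuda, indices_A_cuda)
        )

        print()
        print("On CUDA, without streams:", format_mean_std(mean_fwd_gpu, std_fwd_gpu))
        print("On CUDA, with streams:", format_mean_std(mean_fwd_gpu_s, std_fwd_gpu_s))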