Skip to content

Commit

Permalink
Setup cudax examples. (NVIDIA#2697)
Browse files Browse the repository at this point in the history
  • Loading branch information
alliepiper authored Nov 4, 2024
1 parent 8244738 commit 71c6652
Show file tree
Hide file tree
Showing 5 changed files with 337 additions and 10 deletions.
60 changes: 57 additions & 3 deletions cudax/examples/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,15 +1,69 @@
foreach(cn_target IN LISTS cudax_TARGETS)
cudax_get_target_property(config_prefix ${cn_target} PREFIX)
# Re-find cudax to (re)load its version variables for the EXACT match below.
find_package(cudax) # already found, bring in version info.
# Locate the in-repo Thrust package, pinned to the exact CCCL version and
# restricted to the source tree so a system Thrust can never be picked up:
find_package(Thrust ${cudax_VERSION} EXACT CONFIG
  NO_DEFAULT_PATH # Only check the explicit path in HINTS:
  HINTS "${CCCL_SOURCE_DIR}/lib/cmake/thrust/"
)
# Interface target shared by all cudax examples that need Thrust:
thrust_create_target(cudax.examples.thrust)

# Create an executable target for a single example source file under the given
# cudax configuration, wire it into the per-configuration and cross-configuration
# meta targets, and register it with CTest.
#
# target_name_var - out: variable in the caller's scope receiving the target name
# example_src     - path to the example's .cu source file
# cudax_target    - the cudax configuration target to build against
function(cudax_add_example target_name_var example_src cudax_target)
  cudax_get_target_property(prefix ${cudax_target} PREFIX)
  cudax_get_target_property(dialect ${cudax_target} DIALECT)

  get_filename_component(example_name ${example_src} NAME_WE)

  # Name of the executable for this (example, configuration) pair; report it
  # back to the caller:
  set(example_target ${prefix}.example.${example_name})
  set(${target_name_var} ${example_target} PARENT_SCOPE)

  add_executable(${example_target} "${example_src}")
  cccl_configure_target(${example_target} DIALECT ${dialect})
  target_link_libraries(${example_target} PRIVATE
    ${cudax_target}
    cudax.examples.thrust
  )
  cudax_clone_target_properties(${example_target} ${cudax_target})
  target_include_directories(${example_target} PRIVATE "${CUB_SOURCE_DIR}/examples")

  # Hook the example into the current configuration's meta target:
  add_dependencies(${prefix}.examples ${example_target})

  # Meta target that builds this example for every configuration; created
  # lazily the first time any configuration adds this example:
  set(cross_config_target cudax.all.example.${example_name})
  if (NOT TARGET ${cross_config_target})
    add_custom_target(${cross_config_target})
  endif()
  add_dependencies(${cross_config_target} ${example_target})

  # Running the example is the test:
  add_test(NAME ${example_target}
    COMMAND "$<TARGET_FILE:${example_target}>"
  )
endfunction()

# Collect every example source in this directory. CONFIGURE_DEPENDS makes
# CMake re-run the glob at build time when files are added or removed.
file(GLOB example_srcs
  RELATIVE "${cudax_SOURCE_DIR}/examples"
  CONFIGURE_DEPENDS
  *.cu
)

# Build each example once per enabled cudax configuration.
foreach(cudax_target IN LISTS cudax_TARGETS)
  cudax_get_target_property(config_prefix ${cudax_target} PREFIX)

  # Metatarget for the current configuration's tests:
  set(config_meta_target ${config_prefix}.examples)
  add_custom_target(${config_meta_target})
  # Ensure the configuration's "all" target also builds its examples:
  add_dependencies(${config_prefix}.all ${config_meta_target})

  foreach (example_src IN LISTS example_srcs)
    cudax_add_example(example_target "${example_src}" ${cudax_target})
  endforeach()
endforeach()

# The STF examples do not yet build with MSVC or NVHPC host compilers.
# FIXME: Enable MSVC
if (NOT "MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND
    NOT "NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
  # STF examples are handled separately:
  add_subdirectory(stf)
endif()
8 changes: 1 addition & 7 deletions cudax/examples/stf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,6 @@ find_package(CUB ${cudax_VERSION} EXACT CONFIG
HINTS "${CCCL_SOURCE_DIR}/lib/cmake/cub/"
)

find_package(Thrust ${cudax_VERSION} EXACT CONFIG
NO_DEFAULT_PATH # Only check the explicit path in HINTS:
HINTS "${CCCL_SOURCE_DIR}/lib/cmake/thrust/"
)
thrust_create_target(cudax.stf.examples.thrust)

find_package(CUDAToolkit REQUIRED)

## cudax_add_stf_example
Expand Down Expand Up @@ -90,7 +84,7 @@ function(cudax_add_stf_example target_name_var source cn_target)
cudax_clone_target_properties(${example_target} ${cn_target})
cudax_stf_configure_target(${example_target} ${ARGN})
target_link_libraries(${example_target} PRIVATE
cudax.stf.examples.thrust
cudax.examples.thrust
CUB::CUB
)

Expand Down
151 changes: 151 additions & 0 deletions cudax/examples/vector.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDAX__CONTAINER_VECTOR
#define _CUDAX__CONTAINER_VECTOR

#include <cuda/__cccl_config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

#include <cuda/std/__type_traits/maybe_const.h>
#include <cuda/std/span>
#include <cuda/stream_ref>

#include <cuda/experimental/__detail/utility.cuh>
#include <cuda/experimental/__launch/param_kind.cuh>

#if _CCCL_STD_VER >= 2017
namespace cuda::experimental
{
using ::cuda::std::span;
using ::thrust::device_vector;
using ::thrust::host_vector;

// Example container that keeps a host copy (`__h_`) and a device copy (`__d_`)
// of its elements and synchronizes the two lazily around kernel launches:
//
//  - Mutable host access (non-const operator[]) marks the host data dirty.
//  - When the vector is passed to a launch, the `__cudax_launch_transform`
//    hooks below wrap it in an `__action`, which copies host data to the
//    device (or, for write-only parameters, merely allocates device storage)
//    before the launch, and copies device results back to the host when the
//    action is destroyed.
template <typename _Ty>
class vector
{
public:
  vector() = default;

  // Creates a vector with `__n` default-initialized elements on the host.
  // Device storage is only allocated on the first host-to-device sync.
  explicit vector(size_t __n)
      : __h_(__n)
  {}

  // Mutable element access. Conservatively marks the host copy dirty so the
  // next launch refreshes the device buffer, even if the caller only reads
  // through the returned reference.
  _Ty& operator[](size_t __i) noexcept
  {
    __dirty_ = true;
    return __h_[__i];
  }

  // Read-only element access; does not invalidate the device copy.
  const _Ty& operator[](size_t __i) const noexcept
  {
    return __h_[__i];
  }

private:
  // Ensures the device buffer is ready before a kernel launch. `const` with
  // mutable members: synchronization is treated as logically non-modifying.
  void sync_host_to_device(::cuda::stream_ref __str, detail::__param_kind __p) const
  {
    (void) __str; // unused for now; the thrust copy below is synchronous
    if (__dirty_)
    {
      if (__p == detail::__param_kind::_out)
      {
        // There's no need to copy the data from host to device if the data is
        // only going to be written to. We can just allocate the device memory.
        __d_.resize(__h_.size());
      }
      else
      {
        // TODO: use a memcpy async here
        __d_ = __h_;
      }
      __dirty_ = false;
    }
  }

  // Copies device results back to the host after a launch. Skipped for
  // input-only (`_in`) parameters, which the kernel cannot have modified.
  void sync_device_to_host(::cuda::stream_ref __str, detail::__param_kind __p) const
  {
    if (__p != detail::__param_kind::_in)
    {
      // TODO: use a memcpy async here
      __str.wait(); // wait for the kernel to finish executing
      __h_ = __d_;
    }
  }

  // RAII helper bracketing a launch: host->device sync on construction,
  // device->host sync on destruction. For input-only parameters the wrapped
  // vector is const-qualified via __maybe_const.
  template <detail::__param_kind _Kind>
  class __action //: private detail::__immovable
  {
    using __cv_vector = ::cuda::std::__maybe_const<_Kind == detail::__param_kind::_in, vector>;

  public:
    explicit __action(::cuda::stream_ref __str, __cv_vector& __v) noexcept
        : __str_(__str)
        , __v_(__v)
    {
      __v_.sync_host_to_device(__str_, _Kind);
    }

    // Non-movable: the destructor's device->host sync must run exactly once.
    __action(__action&&) = delete;

    ~__action()
    {
      __v_.sync_device_to_host(__str_, _Kind);
    }

    // The kernel itself receives a span over the device buffer:
    using __as_kernel_arg = ::cuda::std::span<_Ty>;

    operator ::cuda::std::span<_Ty>()
    {
      return {__v_.__d_.data().get(), __v_.__d_.size()};
    }

  private:
    ::cuda::stream_ref __str_;
    __cv_vector& __v_;
  };

  // Launch-transform hooks (found via ADL by the launch machinery — see
  // vector_add.cu's use of cudax::launch). A mutable lvalue is treated as
  // an in/out parameter:
  _CCCL_NODISCARD_FRIEND __action<detail::__param_kind::_inout>
  __cudax_launch_transform(::cuda::stream_ref __str, vector& __v) noexcept
  {
    return __action<detail::__param_kind::_inout>{__str, __v};
  }

  // A const lvalue is input-only:
  _CCCL_NODISCARD_FRIEND __action<detail::__param_kind::_in>
  __cudax_launch_transform(::cuda::stream_ref __str, const vector& __v) noexcept
  {
    return __action<detail::__param_kind::_in>{__str, __v};
  }

  // An explicitly wrapped argument (e.g. `in(v)` / `out(v)`) carries its
  // parameter kind in the `__box`:
  template <detail::__param_kind _Kind>
  _CCCL_NODISCARD_FRIEND __action<_Kind>
  __cudax_launch_transform(::cuda::stream_ref __str, detail::__box<vector, _Kind> __b) noexcept
  {
    return __action<_Kind>{__str, __b.__val};
  }

  mutable host_vector<_Ty> __h_;     // host-side copy of the elements
  mutable device_vector<_Ty> __d_{}; // device-side copy, lazily refreshed
  mutable bool __dirty_ = true;      // host copy modified since last sync?
};

} // namespace cuda::experimental

#endif
#endif
127 changes: 127 additions & 0 deletions cudax/examples/vector_add.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/**
* Vector addition: C = A + B.
*
* This sample is a very basic sample that implements element by element
* vector addition. It is the same as the sample illustrating Chapter 2
* of the programming guide with some additions like error checking.
*/

// Standard library:
#include <math.h> // fabs
#include <stdio.h>
#include <stdlib.h> // exit, EXIT_FAILURE, rand, RAND_MAX

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

#include <cuda/std/span>

#include <cuda/experimental/launch.cuh>
#include <cuda/experimental/stream.cuh>

#include "vector.cuh"

namespace cudax = cuda::experimental;
using cudax::in;
using cudax::out;

/**
* CUDA Kernel Device code
*
* Computes the vector addition of A and B into C. The 3 vectors have the same
* number of elements numElements.
*/
/**
 * CUDA Kernel Device code
 *
 * Element-wise vector addition: C = A + B. All three spans are expected to
 * hold the same number of elements; surplus threads in the final block exit
 * via the guard below.
 */
__global__ void vectorAdd(cudax::span<const float> A, cudax::span<const float> B, cudax::span<float> C)
{
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= A.size())
  {
    return; // tail thread past the end of the data
  }
  // Note: the `+ 0.0f` canonicalizes a -0.0f sum to +0.0f (kept from the
  // original CUDA sample).
  C[idx] = A[idx] + B[idx] + 0.0f;
}

/**
* Host main routine
*/
/**
 * Host main routine.
 *
 * Allocates three cudax::vector<float>s, launches the vectorAdd kernel on a
 * dedicated stream, and verifies the results on the host.
 *
 * Returns 0 on success. Exits with EXIT_FAILURE on a verification mismatch,
 * and returns EXIT_FAILURE if an exception escapes (previously the handlers
 * fell off the end, which for main() is equivalent to `return 0` — the
 * program reported success even on failure).
 */
int main(void)
try
{
  // A CUDA stream on which to execute the vector addition kernel
  cudax::stream stream(cudax::devices[0]);

  // Print the vector length to be used, and compute its size
  int numElements = 50000;
  printf("[Vector addition of %d elements]\n", numElements);

  // Allocate the host vectors
  cudax::vector<float> A(numElements); // input
  cudax::vector<float> B(numElements); // input
  cudax::vector<float> C(numElements); // output

  // Initialize the host input vectors
  for (int i = 0; i < numElements; ++i)
  {
    A[i] = rand() / (float) RAND_MAX;
    B[i] = rand() / (float) RAND_MAX;
  }

  // Define the kernel launch parameters
  constexpr int threadsPerBlock = 256;
  auto dims = cudax::distribute<threadsPerBlock>(numElements);

  // Launch the vectorAdd kernel. The in/out wrappers tell the launch
  // machinery which direction each vector must be synchronized.
  printf("CUDA kernel launch with %d blocks of %d threads\n", dims.count(cudax::block, cudax::grid), threadsPerBlock);
  cudax::launch(stream, dims, vectorAdd, in(A), in(B), out(C));

  printf("waiting for the stream to finish\n");
  stream.wait();

  printf("verifying the results\n");
  // Verify that the result vector is correct
  for (int i = 0; i < numElements; ++i)
  {
    if (fabs(A[i] + B[i] - C[i]) > 1e-5)
    {
      fprintf(stderr, "Result verification failed at element %d!\n", i);
      exit(EXIT_FAILURE);
    }
  }

  printf("Test PASSED\n");

  printf("Done\n");
  return 0;
}
catch (const std::exception& e)
{
  printf("caught an exception: \"%s\"\n", e.what());
  return EXIT_FAILURE;
}
catch (...)
{
  printf("caught an unknown exception\n");
  return EXIT_FAILURE;
}
1 change: 1 addition & 0 deletions examples/cudax/vector_add/vector.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ public:
private:
void sync_host_to_device(::cuda::stream_ref __str, detail::__param_kind __p) const
{
(void) __str;
if (__dirty_)
{
if (__p == detail::__param_kind::_out)
Expand Down

0 comments on commit 71c6652

Please sign in to comment.