Skip to content

Commit

Permalink
Setup cudax examples. (NVIDIA#2697)
Browse files Browse the repository at this point in the history
  • Loading branch information
alliepiper authored Nov 4, 2024
1 parent 8244738 commit 71c6652
Show file tree
Hide file tree
Showing 5 changed files with 337 additions and 10 deletions.
60 changes: 57 additions & 3 deletions cudax/examples/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,15 +1,69 @@
foreach(cn_target IN LISTS cudax_TARGETS)
cudax_get_target_property(config_prefix ${cn_target} PREFIX)
# Re-find cudax to (re)load its version variables for the EXACT match below.
find_package(cudax) # already found, bring in version info.
# Locate the in-repo Thrust package, pinned to the exact CCCL version and
# restricted to the source tree so a system Thrust can never be picked up:
find_package(Thrust ${cudax_VERSION} EXACT CONFIG
  NO_DEFAULT_PATH # Only check the explicit path in HINTS:
  HINTS "${CCCL_SOURCE_DIR}/lib/cmake/thrust/"
)
# Interface target shared by all cudax examples that need Thrust:
thrust_create_target(cudax.examples.thrust)

# Create an executable target for a single example source file under the given
# cudax configuration, wire it into the per-configuration and cross-configuration
# meta targets, and register it with CTest.
#
# target_name_var - out: variable in the caller's scope receiving the target name
# example_src     - path to the example's .cu source file
# cudax_target    - the cudax configuration target to build against
function(cudax_add_example target_name_var example_src cudax_target)
  cudax_get_target_property(prefix ${cudax_target} PREFIX)
  cudax_get_target_property(dialect ${cudax_target} DIALECT)

  get_filename_component(example_name ${example_src} NAME_WE)

  # Name of the executable for this (example, configuration) pair; report it
  # back to the caller:
  set(example_target ${prefix}.example.${example_name})
  set(${target_name_var} ${example_target} PARENT_SCOPE)

  add_executable(${example_target} "${example_src}")
  cccl_configure_target(${example_target} DIALECT ${dialect})
  target_link_libraries(${example_target} PRIVATE
    ${cudax_target}
    cudax.examples.thrust
  )
  cudax_clone_target_properties(${example_target} ${cudax_target})
  target_include_directories(${example_target} PRIVATE "${CUB_SOURCE_DIR}/examples")

  # Hook the example into the current configuration's meta target:
  add_dependencies(${prefix}.examples ${example_target})

  # Meta target that builds this example for every configuration; created
  # lazily the first time any configuration adds this example:
  set(cross_config_target cudax.all.example.${example_name})
  if (NOT TARGET ${cross_config_target})
    add_custom_target(${cross_config_target})
  endif()
  add_dependencies(${cross_config_target} ${example_target})

  # Running the example is the test:
  add_test(NAME ${example_target}
    COMMAND "$<TARGET_FILE:${example_target}>"
  )
endfunction()

# Collect every example source in this directory. CONFIGURE_DEPENDS makes
# CMake re-run the glob at build time when files are added or removed.
file(GLOB example_srcs
  RELATIVE "${cudax_SOURCE_DIR}/examples"
  CONFIGURE_DEPENDS
  *.cu
)

# Build each example once per enabled cudax configuration.
foreach(cudax_target IN LISTS cudax_TARGETS)
  cudax_get_target_property(config_prefix ${cudax_target} PREFIX)

  # Metatarget for the current configuration's tests:
  set(config_meta_target ${config_prefix}.examples)
  add_custom_target(${config_meta_target})
  # Ensure the configuration's "all" target also builds its examples:
  add_dependencies(${config_prefix}.all ${config_meta_target})

  foreach (example_src IN LISTS example_srcs)
    cudax_add_example(example_target "${example_src}" ${cudax_target})
  endforeach()
endforeach()

# The STF examples do not yet build with MSVC or NVHPC host compilers.
# FIXME: Enable MSVC
if (NOT "MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND
    NOT "NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
  # STF examples are handled separately:
  add_subdirectory(stf)
endif()
8 changes: 1 addition & 7 deletions cudax/examples/stf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,6 @@ find_package(CUB ${cudax_VERSION} EXACT CONFIG
HINTS "${CCCL_SOURCE_DIR}/lib/cmake/cub/"
)

find_package(Thrust ${cudax_VERSION} EXACT CONFIG
NO_DEFAULT_PATH # Only check the explicit path in HINTS:
HINTS "${CCCL_SOURCE_DIR}/lib/cmake/thrust/"
)
thrust_create_target(cudax.stf.examples.thrust)

find_package(CUDAToolkit REQUIRED)

## cudax_add_stf_example
Expand Down Expand Up @@ -90,7 +84,7 @@ function(cudax_add_stf_example target_name_var source cn_target)
cudax_clone_target_properties(${example_target} ${cn_target})
cudax_stf_configure_target(${example_target} ${ARGN})
target_link_libraries(${example_target} PRIVATE
cudax.stf.examples.thrust
cudax.examples.thrust
CUB::CUB
)

Expand Down
151 changes: 151 additions & 0 deletions cudax/examples/vector.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDAX__CONTAINER_VECTOR
#define _CUDAX__CONTAINER_VECTOR

#include <cuda/__cccl_config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

#include <cuda/std/__type_traits/maybe_const.h>
#include <cuda/std/span>
#include <cuda/stream_ref>

#include <cuda/experimental/__detail/utility.cuh>
#include <cuda/experimental/__launch/param_kind.cuh>

#if _CCCL_STD_VER >= 2017
namespace cuda::experimental
{
using ::cuda::std::span;
using ::thrust::device_vector;
using ::thrust::host_vector;

// Example container that keeps a host copy (`__h_`) and a device copy (`__d_`)
// of its elements and synchronizes the two lazily around kernel launches:
//
//  - Mutable host access (non-const operator[]) marks the host data dirty.
//  - When the vector is passed to a launch, the `__cudax_launch_transform`
//    hooks below wrap it in an `__action`, which copies host data to the
//    device (or, for write-only parameters, merely allocates device storage)
//    before the launch, and copies device results back to the host when the
//    action is destroyed.
template <typename _Ty>
class vector
{
public:
  vector() = default;

  // Creates a vector with `__n` default-initialized elements on the host.
  // Device storage is only allocated on the first host-to-device sync.
  explicit vector(size_t __n)
      : __h_(__n)
  {}

  // Mutable element access. Conservatively marks the host copy dirty so the
  // next launch refreshes the device buffer, even if the caller only reads
  // through the returned reference.
  _Ty& operator[](size_t __i) noexcept
  {
    __dirty_ = true;
    return __h_[__i];
  }

  // Read-only element access; does not invalidate the device copy.
  const _Ty& operator[](size_t __i) const noexcept
  {
    return __h_[__i];
  }

private:
  // Ensures the device buffer is ready before a kernel launch. `const` with
  // mutable members: synchronization is treated as logically non-modifying.
  void sync_host_to_device(::cuda::stream_ref __str, detail::__param_kind __p) const
  {
    (void) __str; // unused for now; the thrust copy below is synchronous
    if (__dirty_)
    {
      if (__p == detail::__param_kind::_out)
      {
        // There's no need to copy the data from host to device if the data is
        // only going to be written to. We can just allocate the device memory.
        __d_.resize(__h_.size());
      }
      else
      {
        // TODO: use a memcpy async here
        __d_ = __h_;
      }
      __dirty_ = false;
    }
  }

  // Copies device results back to the host after a launch. Skipped for
  // input-only (`_in`) parameters, which the kernel cannot have modified.
  void sync_device_to_host(::cuda::stream_ref __str, detail::__param_kind __p) const
  {
    if (__p != detail::__param_kind::_in)
    {
      // TODO: use a memcpy async here
      __str.wait(); // wait for the kernel to finish executing
      __h_ = __d_;
    }
  }

  // RAII helper bracketing a launch: host->device sync on construction,
  // device->host sync on destruction. For input-only parameters the wrapped
  // vector is const-qualified via __maybe_const.
  template <detail::__param_kind _Kind>
  class __action //: private detail::__immovable
  {
    using __cv_vector = ::cuda::std::__maybe_const<_Kind == detail::__param_kind::_in, vector>;

  public:
    explicit __action(::cuda::stream_ref __str, __cv_vector& __v) noexcept
        : __str_(__str)
        , __v_(__v)
    {
      __v_.sync_host_to_device(__str_, _Kind);
    }

    // Non-movable: the destructor's device->host sync must run exactly once.
    __action(__action&&) = delete;

    ~__action()
    {
      __v_.sync_device_to_host(__str_, _Kind);
    }

    // The kernel itself receives a span over the device buffer:
    using __as_kernel_arg = ::cuda::std::span<_Ty>;

    operator ::cuda::std::span<_Ty>()
    {
      return {__v_.__d_.data().get(), __v_.__d_.size()};
    }

  private:
    ::cuda::stream_ref __str_;
    __cv_vector& __v_;
  };

  // Launch-transform hooks (found via ADL by the launch machinery — see
  // vector_add.cu's use of cudax::launch). A mutable lvalue is treated as
  // an in/out parameter:
  _CCCL_NODISCARD_FRIEND __action<detail::__param_kind::_inout>
  __cudax_launch_transform(::cuda::stream_ref __str, vector& __v) noexcept
  {
    return __action<detail::__param_kind::_inout>{__str, __v};
  }

  // A const lvalue is input-only:
  _CCCL_NODISCARD_FRIEND __action<detail::__param_kind::_in>
  __cudax_launch_transform(::cuda::stream_ref __str, const vector& __v) noexcept
  {
    return __action<detail::__param_kind::_in>{__str, __v};
  }

  // An explicitly wrapped argument (e.g. `in(v)` / `out(v)`) carries its
  // parameter kind in the `__box`:
  template <detail::__param_kind _Kind>
  _CCCL_NODISCARD_FRIEND __action<_Kind>
  __cudax_launch_transform(::cuda::stream_ref __str, detail::__box<vector, _Kind> __b) noexcept
  {
    return __action<_Kind>{__str, __b.__val};
  }

  mutable host_vector<_Ty> __h_;     // host-side copy of the elements
  mutable device_vector<_Ty> __d_{}; // device-side copy, lazily refreshed
  mutable bool __dirty_ = true;      // host copy modified since last sync?
};

} // namespace cuda::experimental

#endif
#endif
127 changes: 127 additions & 0 deletions cudax/examples/vector_add.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/**
* Vector addition: C = A + B.
*
* This sample is a very basic sample that implements element by element
* vector addition. It is the same as the sample illustrating Chapter 2
* of the programming guide with some additions like error checking.
*/

// Standard library:
#include <math.h> // fabs
#include <stdio.h>
#include <stdlib.h> // exit, EXIT_FAILURE, rand, RAND_MAX

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

#include <cuda/std/span>

#include <cuda/experimental/launch.cuh>
#include <cuda/experimental/stream.cuh>

#include "vector.cuh"

namespace cudax = cuda::experimental;
using cudax::in;
using cudax::out;

/**
* CUDA Kernel Device code
*
* Computes the vector addition of A and B into C. The 3 vectors have the same
* number of elements numElements.
*/
/**
 * CUDA Kernel Device code
 *
 * Element-wise vector addition: C = A + B. All three spans are expected to
 * hold the same number of elements; surplus threads in the final block exit
 * via the guard below.
 */
__global__ void vectorAdd(cudax::span<const float> A, cudax::span<const float> B, cudax::span<float> C)
{
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= A.size())
  {
    return; // tail thread past the end of the data
  }
  // Note: the `+ 0.0f` canonicalizes a -0.0f sum to +0.0f (kept from the
  // original CUDA sample).
  C[idx] = A[idx] + B[idx] + 0.0f;
}

/**
* Host main routine
*/
/**
 * Host main routine.
 *
 * Allocates three cudax::vector<float>s, launches the vectorAdd kernel on a
 * dedicated stream, and verifies the results on the host.
 *
 * Returns 0 on success. Exits with EXIT_FAILURE on a verification mismatch,
 * and returns EXIT_FAILURE if an exception escapes (previously the handlers
 * fell off the end, which for main() is equivalent to `return 0` — the
 * program reported success even on failure).
 */
int main(void)
try
{
  // A CUDA stream on which to execute the vector addition kernel
  cudax::stream stream(cudax::devices[0]);

  // Print the vector length to be used, and compute its size
  int numElements = 50000;
  printf("[Vector addition of %d elements]\n", numElements);

  // Allocate the host vectors
  cudax::vector<float> A(numElements); // input
  cudax::vector<float> B(numElements); // input
  cudax::vector<float> C(numElements); // output

  // Initialize the host input vectors
  for (int i = 0; i < numElements; ++i)
  {
    A[i] = rand() / (float) RAND_MAX;
    B[i] = rand() / (float) RAND_MAX;
  }

  // Define the kernel launch parameters
  constexpr int threadsPerBlock = 256;
  auto dims = cudax::distribute<threadsPerBlock>(numElements);

  // Launch the vectorAdd kernel. The in/out wrappers tell the launch
  // machinery which direction each vector must be synchronized.
  printf("CUDA kernel launch with %d blocks of %d threads\n", dims.count(cudax::block, cudax::grid), threadsPerBlock);
  cudax::launch(stream, dims, vectorAdd, in(A), in(B), out(C));

  printf("waiting for the stream to finish\n");
  stream.wait();

  printf("verifying the results\n");
  // Verify that the result vector is correct
  for (int i = 0; i < numElements; ++i)
  {
    if (fabs(A[i] + B[i] - C[i]) > 1e-5)
    {
      fprintf(stderr, "Result verification failed at element %d!\n", i);
      exit(EXIT_FAILURE);
    }
  }

  printf("Test PASSED\n");

  printf("Done\n");
  return 0;
}
catch (const std::exception& e)
{
  printf("caught an exception: \"%s\"\n", e.what());
  return EXIT_FAILURE;
}
catch (...)
{
  printf("caught an unknown exception\n");
  return EXIT_FAILURE;
}
1 change: 1 addition & 0 deletions examples/cudax/vector_add/vector.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ public:
private:
void sync_host_to_device(::cuda::stream_ref __str, detail::__param_kind __p) const
{
(void) __str;
if (__dirty_)
{
if (__p == detail::__param_kind::_out)
Expand Down

0 comments on commit 71c6652

Please sign in to comment.