diff --git a/include/mscclpp/concurrency_device.hpp b/include/mscclpp/concurrency_device.hpp index c443a73c4..d382acb46 100644 --- a/include/mscclpp/concurrency_device.hpp +++ b/include/mscclpp/concurrency_device.hpp @@ -23,9 +23,9 @@ struct DeviceSyncer { /// @param blockNum The number of blocks that will synchronize. /// @param maxSpinCount The maximum number of spin counts before asserting. Never assert if negative. MSCCLPP_DEVICE_INLINE void sync(int blockNum, int64_t maxSpinCount = 100000000) { - unsigned int maxOldCnt = blockNum - 1; __syncthreads(); if (blockNum == 1) return; + unsigned int maxOldCnt = blockNum - 1; if (threadIdx.x == 0) { // Need a `__threadfence()` before to flip `flag`. __threadfence(); diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index d3d48ce1f..de77c15b3 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -13,6 +13,7 @@ using cudaGraph_t = hipGraph_t; using cudaGraphExec_t = hipGraphExec_t; using cudaDeviceProp = hipDeviceProp_t; using cudaStream_t = hipStream_t; +using cudaEvent_t = hipEvent_t; using cudaStreamCaptureMode = hipStreamCaptureMode; using cudaMemcpyKind = hipMemcpyKind; using cudaIpcMemHandle_t = hipIpcMemHandle_t; @@ -53,6 +54,7 @@ constexpr auto cudaIpcMemLazyEnablePeerAccess = hipIpcMemLazyEnablePeerAccess; #define cudaMemcpy(...) hipMemcpy(__VA_ARGS__) #define cudaMemcpyAsync(...) hipMemcpyAsync(__VA_ARGS__) #define cudaMemcpyToSymbol(...) hipMemcpyToSymbol(__VA_ARGS__) +#define cudaStreamCreate(...) hipStreamCreate(__VA_ARGS__) #define cudaStreamCreateWithFlags(...) hipStreamCreateWithFlags(__VA_ARGS__) #define cudaStreamSynchronize(...) hipStreamSynchronize(__VA_ARGS__) #define cudaStreamBeginCapture(...) hipStreamBeginCapture(__VA_ARGS__) @@ -66,6 +68,10 @@ constexpr auto cudaIpcMemLazyEnablePeerAccess = hipIpcMemLazyEnablePeerAccess; #define cudaIpcGetMemHandle(...) hipIpcGetMemHandle(__VA_ARGS__) #define cudaIpcOpenMemHandle(...) hipIpcOpenMemHandle(__VA_ARGS__) #define cudaIpcCloseMemHandle(...) hipIpcCloseMemHandle(__VA_ARGS__) +#define cudaEventCreate(...) hipEventCreate(__VA_ARGS__) +#define cudaEventDestroy(...) hipEventDestroy(__VA_ARGS__) +#define cudaEventRecord(...) hipEventRecord(__VA_ARGS__) +#define cudaEventElapsedTime(...) hipEventElapsedTime(__VA_ARGS__) #define cuGetErrorString(...) hipDrvGetErrorString(__VA_ARGS__) #define cuMemGetAddressRange(...) hipMemGetAddressRange(__VA_ARGS__) diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index e86bc8253..5c0a0bc65 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -8,6 +8,7 @@ target_sources(unit_tests PRIVATE fifo_tests.cu numa_tests.cc socket_tests.cc + sync_tests.cu utils_tests.cc utils_internal_tests.cc ) diff --git a/test/unit/sync_tests.cu b/test/unit/sync_tests.cu new file mode 100644 index 000000000..7e8708222 --- /dev/null +++ b/test/unit/sync_tests.cu @@ -0,0 +1,103 @@ +/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#include "sync_tests.hpp" + +TEST_F(DeviceSyncerTestFixture, execute_1_1) { + execute(1, 1); +} + +TEST_F(DeviceSyncerTestFixture, execute_1024_1) { + execute(1024, 1); +} + +TEST_F(DeviceSyncerTestFixture, execute_1_2) { + execute(1, 2); +} + +TEST_F(DeviceSyncerTestFixture, execute_1024_2) { + execute(1024, 2); +} + +TEST_F(DeviceSyncerTestFixture, execute_1_4) { + execute(1, 4); +} + +TEST_F(DeviceSyncerTestFixture, execute_1024_4) { + execute(1024, 4); +} + +TEST_F(DeviceSyncerTestFixture, execute_1_8) { + execute(1, 8); +} + +TEST_F(DeviceSyncerTestFixture, execute_1024_8) { + execute(1024, 8); +} + +TEST_F(DeviceSyncerTestFixture, execute_1_16) { + execute(1, 16); +} + +TEST_F(DeviceSyncerTestFixture, execute_1024_16) { + execute(1024, 16); +} + +TEST_F(DeviceSyncerTestFixture, execute_1_32) { + execute(1, 32); +} + +TEST_F(DeviceSyncerTestFixture, execute_1024_32) { + execute(1024, 32); +} + +TEST_F(DeviceSyncerTestFixture, execute_1_64) { + execute(1, 64); +} + +TEST_F(DeviceSyncerTestFixture, execute_1024_64) { + execute(1024, 64); +} + +TEST_F(DeviceSyncerTestFixture, execute_1_104) { + execute(1, 104); +} + +TEST_F(DeviceSyncerTestFixture, execute_1024_104) { + execute(1024, 104); +} + +TEST_F(DeviceSyncerTestFixture, execute_1_128) { + execute(1, 128); +} + +TEST_F(DeviceSyncerTestFixture, execute_1_256) { + execute(1, 256); +} + +TEST_F(DeviceSyncerTestFixture, execute_1_512) { + execute(1, 512); +} + +TEST_F(DeviceSyncerTestFixture, execute_1_1024) { + execute(1, 1024); +} diff --git a/test/unit/sync_tests.hpp b/test/unit/sync_tests.hpp new file mode 100644 index 000000000..1537bceaa --- /dev/null +++ b/test/unit/sync_tests.hpp @@ -0,0 +1,130 @@ +/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#include +#include +#include +#include + +using TYPE = mscclpp::DeviceSyncer; + +MSCCLPP_DEVICE_INLINE void +timestamp(uint64_t& clk) { + asm volatile("s_memrealtime %0\n" + "s_waitcnt lgkmcnt(0)\n" + : "=s" (clk)); +} + +__global__ void synchronize(TYPE *syncer, uint64_t *timer) { + uint64_t start {0}; + uint64_t end {0}; + timestamp(start); + syncer->sync(gridDim.x); + timestamp(end); + if (threadIdx.x == 0) { + timer[blockIdx.x] = end - start; + } +} + +class DeviceSyncerTestFixture : public ::testing::Test { + public: + DeviceSyncerTestFixture() { + MSCCLPP_CUDATHROW(cudaMalloc((void**)&syncer_d, sizeof(TYPE))); + syncer_h = (TYPE*)malloc(sizeof(TYPE)); + MSCCLPP_CUDATHROW(cudaMalloc((void**)&timer_d, sizeof(uint64_t) * MAX_BLOCKS)); + timer_h = (uint64_t*)malloc(sizeof(uint64_t) * MAX_BLOCKS); + } + + ~DeviceSyncerTestFixture() { + if (timer_h) { free(timer_h); } + if (timer_d) { (void)cudaFree(timer_d); } + if (syncer_h) { free(syncer_h); } + if (syncer_d) { (void)cudaFree(syncer_d); } + } + + void execute(uint32_t x_block_dim, uint32_t x_grid_dim) { + assert(x_grid_dim <= MAX_BLOCKS); + + const dim3 blocksize(x_block_dim, 1, 1); + const dim3 gridsize(x_grid_dim, 1, 1); + + cudaStream_t stream; + MSCCLPP_CUDATHROW(cudaStreamCreate(&stream)); + + cudaEvent_t start_event; + cudaEvent_t stop_event; + MSCCLPP_CUDATHROW(cudaEventCreate(&start_event)); + MSCCLPP_CUDATHROW(cudaEventCreate(&stop_event)); + + memset(syncer_h, 0, sizeof(TYPE)); + MSCCLPP_CUDATHROW(cudaMemcpyAsync(syncer_d, syncer_h, sizeof(TYPE), cudaMemcpyHostToDevice, stream)); + MSCCLPP_CUDATHROW(cudaEventRecord(start_event, stream)); + synchronize<<>>(syncer_d, timer_d); + MSCCLPP_CUDATHROW(cudaEventRecord(stop_event, stream)); + MSCCLPP_CUDATHROW(cudaMemcpyAsync(timer_h, timer_d, sizeof(uint64_t) * MAX_BLOCKS, cudaMemcpyDeviceToHost, stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); + + float event_time; + MSCCLPP_CUDATHROW(cudaEventElapsedTime(&event_time, start_event, stop_event)); + + MSCCLPP_CUDATHROW(cudaEventDestroy(stop_event)); + MSCCLPP_CUDATHROW(cudaEventDestroy(start_event)); + MSCCLPP_CUDATHROW(cudaStreamDestroy(stream)); + + printf("event time: %f ms\n", event_time); + timer_us(x_grid_dim); + } + + protected: + uint64_t memrealtime_freq_mhz() { + cudaDeviceProp deviceProp{}; + MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&deviceProp, 0)); +#if defined(__HIP_PLATFORM_AMD__) && (__HIP_PLATFORM_AMD__ == 1) + switch (deviceProp.gcnArch) { + case 900: return 27; + case 906: return 25; + case 908: return 25; + case 910: return 25; + default: + assert(false && "clock data unavailable"); + return 0; + } +#endif + } + + double gpu_cycles_to_us(uint64_t cycles) { + double div {(double)cycles / memrealtime_freq_mhz()}; + return div; + } + + void timer_us(uint32_t num_blocks) { + for (uint32_t i{0}; i < num_blocks; i++) { + printf("block %d : latency %f us\n", i, gpu_cycles_to_us(timer_h[i])); + } + } + + const uint32_t MAX_BLOCKS{1024}; + TYPE *syncer_h{nullptr}; + TYPE *syncer_d{nullptr}; + uint64_t *timer_h{nullptr}; + uint64_t *timer_d{nullptr}; +};