From aa3efcc019731851413c5451326c3a57c5f44216 Mon Sep 17 00:00:00 2001
From: Mehmet Yusufoglu
Date: Mon, 15 Jan 2024 00:00:13 +0100
Subject: [PATCH] Convolution2D filter example using global and shared memory

---
Review notes (placed between "---" and the diffstat, so not part of the commit message):
- The copy under review had lost its line breaks and everything written in angle brackets (author address,
  #include targets, template parameter/argument lists). Line breaks are restored here; the #include targets and
  template arguments are reconstructed from how the names are used and from the alpaka API - TODO confirm
  against the upstream example. The author address could not be recovered and is left out.
- The commit id and the "index" blob ids are carried over unchanged and no longer match the edited content.
- Fixed: elementsPerThread was {1u, 0u}; the result was read on the host without waiting for the non-blocking
  queue; extent/grid vectors were given as (width, height) although the kernels unpack indices as [row, col];
  the output store was repeated once per filter row; the constant-memory array size was hard-coded to 5.
- NOTE(review): the kernels index the 2D device buffer as row * MatrixWidth + col, i.e. they assume rows
  without pitch padding - TODO confirm for the targeted back-ends.

 example/CMakeLists.txt                      |   1 +
 example/convolution2D/CMakeLists.txt        |  47 ++++
 example/convolution2D/src/convolution2D.cpp | 267 ++++++++++++++++++++
 3 files changed, 315 insertions(+)
 create mode 100644 example/convolution2D/CMakeLists.txt
 create mode 100644 example/convolution2D/src/convolution2D.cpp

diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index b74c45b86243..e3eae8aa2b3b 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -17,6 +17,7 @@ project("alpakaExamples" LANGUAGES CXX)
 
 add_subdirectory("bufferCopy/")
 add_subdirectory("complex/")
+add_subdirectory("convolution2D/")
 add_subdirectory("counterBasedRng/")
 add_subdirectory("heatEquation/")
 add_subdirectory("helloWorld/")
diff --git a/example/convolution2D/CMakeLists.txt b/example/convolution2D/CMakeLists.txt
new file mode 100644
index 000000000000..2324f7b8ef2f
--- /dev/null
+++ b/example/convolution2D/CMakeLists.txt
@@ -0,0 +1,47 @@
+#
+# Copyright 2023 Erik Zenker, Benjamin Worpitz, Jan Stephan
+# SPDX-License-Identifier: ISC
+#
+
+################################################################################
+# Required CMake version.
+
+cmake_minimum_required(VERSION 3.22)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project.
+
+set(_TARGET_NAME convolution2D)
+
+project(${_TARGET_NAME} LANGUAGES CXX)
+
+#-------------------------------------------------------------------------------
+# Find alpaka.
+
+if(NOT TARGET alpaka::alpaka)
+    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(alpaka_USE_SOURCE_TREE)
+        # Don't build the examples recursively
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
+
+#-------------------------------------------------------------------------------
+# Add executable.
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    src/convolution2D.cpp)
+target_link_libraries(
+    ${_TARGET_NAME}
+    PUBLIC alpaka::alpaka)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/example/convolution2D/src/convolution2D.cpp b/example/convolution2D/src/convolution2D.cpp
new file mode 100644
index 000000000000..c345d97298f8
--- /dev/null
+++ b/example/convolution2D/src/convolution2D.cpp
@@ -0,0 +1,267 @@
+/* Copyright 2023 Benjamin Worpitz, Erik Zenker, Bernhard Manfred Gruber, Jan Stephan
+ * SPDX-License-Identifier: ISC
+ */
+
+#include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExampleDefaultAcc.hpp>
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <numeric>
+#include <type_traits>
+#include <vector>
+
+//! Convolution Example
+//!
+//! A 2D convolutional filter example with padding.
+
+static constexpr int32_t FILTER_RADIUS = 2;
+static constexpr int32_t TILE_DIM = 2 * FILTER_RADIUS + 1;
+static constexpr int32_t MATRIX_WIDTH = 10;
+static constexpr int32_t MATRIX_HEIGHT = 10;
+
+
+// These declarations about constant memory allocation (in GPU context) are used to silence clang's
+// -Wmissing-variable-declarations warning that forces every non-static variable to be declared with extern before
+// they are defined. These forward declarations are only necessary when you want to access those variables from a
+// different compilation unit and should be moved to a common header.
+extern ALPAKA_STATIC_ACC_MEM_CONSTANT float g_constantMemory2DUninitialized[TILE_DIM][TILE_DIM];
+ALPAKA_STATIC_ACC_MEM_CONSTANT float g_constantMemory2DUninitialized[TILE_DIM][TILE_DIM];
+
+//! 2D convolutional filter using only global memory for the matrix, and constant memory for the filter.
+//!
+//! Each thread computes one output element. The center of the filter is positioned on the item pointed to by the
+//! thread index. An implicit "zero padding" is used: filter items that fall outside the matrix are skipped, in
+//! other words the corresponding items of the matrix are assumed to be zero.
+//!
+//! \param acc The accelerator the kernel is executed on.
+//! \param N Input matrix, read as N[row * MatrixWidth + col].
+//! \param P Output matrix, written as P[row * MatrixWidth + col].
+//! \param MatrixWidth Number of columns of N and P.
+//! \param MatrixHeight Number of rows of N and P.
+//! \param filter Filter items, read as filter[fRow * TILE_DIM + fCol].
+struct ConvolutionKernel2DGlobalMemory
+{
+    template<typename TAcc, typename TElem>
+    ALPAKA_FN_ACC auto operator()(
+        TAcc const& acc,
+        TElem const* const N,
+        TElem* P,
+        const std::size_t MatrixWidth,
+        const std::size_t MatrixHeight,
+        TElem const* const filter) const -> void
+    {
+        auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+
+        // Threads that fall outside the matrix have nothing to compute.
+        if(col < MatrixWidth && row < MatrixHeight)
+        {
+            auto Pvalue = 0.0f;
+            for(int32_t fRow = 0; fRow < 2 * FILTER_RADIUS + 1; fRow++)
+            {
+                for(int32_t fCol = 0; fCol < 2 * FILTER_RADIUS + 1; fCol++)
+                {
+                    auto const exactRow = static_cast<int32_t>(row) - FILTER_RADIUS + fRow;
+                    auto const exactCol = static_cast<int32_t>(col) - FILTER_RADIUS + fCol;
+                    if(exactRow >= 0 && exactRow < MatrixHeight && exactCol >= 0 && exactCol < MatrixWidth)
+                    {
+                        Pvalue += filter[fRow * TILE_DIM + fCol] * N[exactRow * MatrixWidth + exactCol];
+                    }
+                }
+            }
+            // Write the result once, after all filter rows have been accumulated.
+            P[row * MatrixWidth + col] = Pvalue;
+        }
+    }
+};
+
+//! 2D convolutional filter, uses the tiling method. Tiles of the matrix are kept in shared memory. For the filter,
+//! the constant memory is used. Block dimensions are equal to tile dimensions.
+//!
+//! Matrix items inside the block's tile are read from shared memory, items outside the tile are read from N, and
+//! items outside the matrix are skipped (implicit "zero padding").
+//!
+//! \param acc The accelerator the kernel is executed on.
+//! \param N Input matrix, read as N[row * MatrixWidth + col].
+//! \param P Output matrix, written as P[row * MatrixWidth + col].
+//! \param MatrixWidth Number of columns of N and P.
+//! \param MatrixHeight Number of rows of N and P.
+//! \param filter Filter items, read as filter[fRow * TILE_DIM + fCol].
+struct ConvolutionKernel2DSharedMemory
+{
+    template<typename TAcc, typename TElem>
+    ALPAKA_FN_ACC auto operator()(
+        TAcc const& acc,
+        TElem const* const N,
+        TElem* P,
+        const std::size_t MatrixWidth,
+        const std::size_t MatrixHeight,
+        float const* const filter) const -> void
+    {
+        auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        auto const [blockThreadY, blockThreadX] = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
+        auto const [blockThreadExtentY, blockThreadExtentX] = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
+
+        // Allocate shared memory
+        auto& N_s = alpaka::declareSharedVar<float[TILE_DIM * TILE_DIM], __COUNTER__>(acc);
+
+        // Fill shared memory of device so that tile items are accessed from shared memory
+        if(row < MatrixHeight && col < MatrixWidth)
+        {
+            N_s[blockThreadY * blockThreadExtentX + blockThreadX] = N[row * MatrixWidth + col];
+        }
+        else
+        {
+            N_s[blockThreadY * blockThreadExtentX + blockThreadX] = 0.0f;
+        }
+
+        // Wait until the whole block has filled the shared memory with its tile of the main matrix
+        alpaka::syncBlockThreads(acc);
+
+        if(col < MatrixWidth && row < MatrixHeight)
+        {
+            auto Pvalue{0.0f};
+            for(int32_t fRow = 0; fRow < 2 * FILTER_RADIUS + 1; fRow++)
+            {
+                for(int32_t fCol = 0; fCol < 2 * FILTER_RADIUS + 1; fCol++)
+                {
+                    auto const exactRowBlock = static_cast<int32_t>(blockThreadY) - FILTER_RADIUS + fRow;
+                    auto const exactColBlock = static_cast<int32_t>(blockThreadX) - FILTER_RADIUS + fCol;
+                    if(exactColBlock >= 0 && exactColBlock < TILE_DIM && exactRowBlock >= 0
+                       && exactRowBlock < TILE_DIM)
+                    {
+                        // get the item from the shared memory
+                        Pvalue += filter[fRow * TILE_DIM + fCol]
+                                  * N_s[exactRowBlock * blockThreadExtentX + exactColBlock];
+                    }
+                    else
+                    {
+                        auto const exactRow = static_cast<int32_t>(row) - FILTER_RADIUS + fRow;
+                        auto const exactCol = static_cast<int32_t>(col) - FILTER_RADIUS + fCol;
+                        if(exactRow >= 0 && exactRow < MatrixHeight && exactCol >= 0 && exactCol < MatrixWidth)
+                        {
+                            // get the item from the global memory
+                            Pvalue += filter[fRow * TILE_DIM + fCol] * N[exactRow * MatrixWidth + exactCol];
+                        }
+                    }
+                }
+            }
+            // Write the result once, after all filter rows have been accumulated.
+            P[row * MatrixWidth + col] = Pvalue;
+        } // if
+    }
+};
+
+auto main() -> int
+{
+    // Define the index domain
+    using Dim = alpaka::DimInt<2u>;
+    // Index type
+    using Idx = std::uint32_t;
+    using Vec = alpaka::Vec<Dim, Idx>;
+    // Define the accelerator
+    using DevAcc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using QueueProperty = alpaka::NonBlocking;
+    using QueueAcc = alpaka::Queue<DevAcc, QueueProperty>;
+
+    using DataType = float;
+    static_assert(
+        alpaka::Dim<DevAcc>::value == 2u,
+        "The accelerator used for the AlpakaKernel has to be 2 dimensional!");
+    static_assert(
+        MATRIX_WIDTH % TILE_DIM == 0 && MATRIX_HEIGHT % TILE_DIM == 0,
+        "Matrix dimensions are not multiples of Tile dimensions");
+
+    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<DevAcc>() << std::endl;
+
+    auto const devHost = alpaka::getDevByIdx(alpaka::PlatformCpu{}, 0);
+    // Select a device from the accelerator
+    auto const platformAcc = alpaka::Platform<DevAcc>{};
+    auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);
+
+    // Create a queue on the device
+    QueueAcc queueAcc(devAcc);
+    // Define extent (dimensions) as (rows, columns), matching the [row, col] index order used by the kernels.
+    Vec const extent(static_cast<Idx>(MATRIX_HEIGHT), static_cast<Idx>(MATRIX_WIDTH));
+    // Kernel Input
+    std::vector<DataType> bufInputHost(MATRIX_HEIGHT * MATRIX_WIDTH);
+    std::iota(bufInputHost.begin(), bufInputHost.end(), 1.0f);
+    auto bufInputHostView = alpaka::createView(devHost, bufInputHost, extent);
+
+    // Input buffer at device
+    auto bufInputAcc = alpaka::allocBuf<DataType, Idx>(devAcc, extent);
+    auto bufInputAccView = alpaka::createView(devAcc, bufInputAcc, extent);
+    alpaka::memcpy(queueAcc, bufInputAccView, bufInputHostView);
+    alpaka::wait(queueAcc);
+    // Output buffer in device
+    alpaka::Vec<alpaka::DimInt<1u>, Idx> const extent1D(MATRIX_HEIGHT * MATRIX_WIDTH);
+    auto outputDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, extent1D);
+
+    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+
+    // The matrix is tiled. Each block of threads uses one tile, tile dimensions equal to block dims.
+    // The first component is the row (y) direction, matching the [row, col] order the kernels unpack indices in.
+    auto blocksPerGrid = Vec{MATRIX_HEIGHT / TILE_DIM, MATRIX_WIDTH / TILE_DIM};
+    auto const threadsPerBlock = Vec{TILE_DIM, TILE_DIM};
+    // Each thread processes one element per dimension; a zero entry would leave the threads without any element.
+    auto const elementsPerThread = Vec{1u, 1u};
+    auto workDiv = WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread};
+
+    // convolution filter
+    std::vector<DataType> filter = {0.11, 0.12, 0.13, 0.14, 0.15, 0.21, 0.22, 0.23, 0.24, 0.25, 0.31, 0.32, 0.33,
+                                    0.34, 0.35, 0.41, 0.42, 0.43, 0.44, 0.45, 0.51, 0.52, 0.53, 0.54, 0.55};
+    auto bufHostFilter = alpaka::createView(devHost, filter.data(), Vec{TILE_DIM, TILE_DIM});
+    // Use constant memory in device for convolution filter
+    auto viewConstantMemUninitialized
+        = alpaka::createStaticDevMemView(&g_constantMemory2DUninitialized[0u][0u], devAcc, Vec{TILE_DIM, TILE_DIM});
+    // Copy filter matrix to the constant memory
+    alpaka::memcpy(queueAcc, viewConstantMemUninitialized, bufHostFilter);
+    alpaka::wait(queueAcc);
+
+    // Construct kernel object
+    ConvolutionKernel2DSharedMemory convolutionKernel2D;
+
+    // Run the kernel
+    auto const taskKernel = alpaka::createTaskKernel<DevAcc>(
+        workDiv,
+        convolutionKernel2D,
+        alpaka::getPtrNative(bufInputAccView),
+        alpaka::getPtrNative(outputDeviceMemory),
+        MATRIX_WIDTH,
+        MATRIX_HEIGHT,
+        alpaka::getPtrNative(viewConstantMemUninitialized));
+
+    alpaka::enqueue(queueAcc, taskKernel);
+    alpaka::wait(queueAcc);
+    // Allocate memory on host
+    auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, extent1D);
+    // Copy from device memory to host
+    alpaka::memcpy(queueAcc, resultGpuHost, outputDeviceMemory, extent1D);
+    // The queue is non-blocking: wait until the copy has finished before the host reads the result.
+    alpaka::wait(queueAcc);
+
+    // Print results at the host
+    printf(
+        "Convolution filter kernel: %s\n",
+        std::is_same<decltype(convolutionKernel2D), ConvolutionKernel2DGlobalMemory>::value
+            ? "ConvolutionKernel2DGlobalMemory"
+            : "ConvolutionKernel2DSharedMemory");
+    printf(
+        "Matrix Size: %d x %d, Filter Size: %d x %d, Tile Size: %d x %d\n",
+        MATRIX_WIDTH,
+        MATRIX_HEIGHT,
+        TILE_DIM,
+        TILE_DIM,
+        TILE_DIM,
+        TILE_DIM);
+    for(size_t i{0}; i < MATRIX_WIDTH * MATRIX_HEIGHT; i++)
+    {
+        DataType const& val(alpaka::getPtrNative(resultGpuHost)[i]);
+        std::cout << "output[" << i << "]:" << std::setprecision(3) << val << '\n';
+    }
+    return EXIT_SUCCESS;
+}