diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index acdc0dae6ca1..bb6f1061a590 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -18,6 +18,7 @@ project("alpakaExamples" LANGUAGES CXX)
 add_subdirectory("bufferCopy/")
 add_subdirectory("complex/")
 add_subdirectory("convolution1D/")
+add_subdirectory("convolution2D/")
 add_subdirectory("counterBasedRng/")
 add_subdirectory("heatEquation/")
 add_subdirectory("helloWorld/")
diff --git a/example/convolution2D/CMakeLists.txt b/example/convolution2D/CMakeLists.txt
new file mode 100644
index 000000000000..2324f7b8ef2f
--- /dev/null
+++ b/example/convolution2D/CMakeLists.txt
@@ -0,0 +1,47 @@
+#
+# Copyright 2023 Erik Zenker, Benjamin Worpitz, Jan Stephan
+# SPDX-License-Identifier: ISC
+#
+
+################################################################################
+# Required CMake version.
+
+cmake_minimum_required(VERSION 3.22)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project.
+
+set(_TARGET_NAME convolution2D)
+
+project(${_TARGET_NAME} LANGUAGES CXX)
+
+#-------------------------------------------------------------------------------
+# Find alpaka.
+
+if(NOT TARGET alpaka::alpaka)
+    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(alpaka_USE_SOURCE_TREE)
+        # Don't build the examples recursively
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
+
+#-------------------------------------------------------------------------------
+# Add executable.
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    src/convolution2D.cpp)
+target_link_libraries(
+    ${_TARGET_NAME}
+    PUBLIC alpaka::alpaka)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/example/convolution2D/src/convolution2D.cpp b/example/convolution2D/src/convolution2D.cpp
new file mode 100644
index 000000000000..e40f6aff59f0
--- /dev/null
+++ b/example/convolution2D/src/convolution2D.cpp
@@ -0,0 +1,381 @@
+/* Copyright 2023 Mehmet Yusufoglu, Bernhard Manfred Gruber, René Widera
+ * SPDX-License-Identifier: ISC
+ */
+
+#include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExampleDefaultAcc.hpp>
+
+#include <cmath>
+#include <cstdint>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <numeric>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+//! Convolution Example
+//!
+//! A 2D convolutional filter applied to a matrix. The first kernel, ConvolutionKernel2DGlobalMemory, uses only global
+//! memory. The second kernel, ConvolutionKernel2DSharedMemory, uses tiling and shared memory. The block size is
+//! assumed to be equal to the tile size. First, the tile is copied to shared memory, since an element in a tile is
+//! accessed many times; using shared memory for the main matrix data increases performance. Each block works on the
+//! domain of one tile. At the border of a tile, some matrix values from neighbouring tiles are needed; those values
+//! are read from global memory.
+//! Results can be checked against the result of the Matlab call: Y = filter2(FilterMatrix,InputMatrix,'same');
+
+/**
+ * @brief 2D convolutional filter using only global memory for the input-matrix and the filter-matrix.
+ */
+struct ConvolutionKernel2DGlobalMemory
+{
+    //! \tparam TAcc Accelerator type
+    //! \tparam TElem The input-matrix and filter-matrix element type
+    //! \param acc Accelerator
+    //! \param input Input matrix
+    //! \param output Output matrix
+    //! \param matrixWidth Input matrix width
+    //! \param matrixHeight Input matrix height
+    //! \param filter Filter-matrix
+    //! \param filterWidth Filter-matrix width
+    //! \param inputWidthAllocated Input-matrix width allocated (possibly larger than the logical width due to padding)
+    //! \param filterWidthAllocated Filter-matrix width allocated (possibly larger than the logical width due to
+    //! padding)
+    template<typename TAcc, typename TElem>
+    ALPAKA_FN_ACC auto operator()(
+        TAcc const& acc,
+        TElem const* const input,
+        TElem* output,
+        int32_t const matrixWidth,
+        int32_t const matrixHeight,
+        TElem const* const filter,
+        int32_t const filterWidth,
+        int32_t const inputWidthAllocated,
+        int32_t const filterWidthAllocated) const -> void
+    {
+        // Get the grid-thread index; the center of the filter-matrix is positioned on the item at this index.
+        auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+
+        // The convolutional filter-matrix is applied to the input matrix at position (row, col). Filter items that
+        // fall outside the matrix are skipped, i.e. they are treated as zero.
+        if(col < matrixWidth && row < matrixHeight)
+        {
+            TElem pValue{0.0f};
+            for(int32_t fRow = 0; fRow < filterWidth; fRow++)
+            {
+                for(int32_t fCol = 0; fCol < filterWidth; fCol++)
+                {
+                    // Position of the input-matrix element to be multiplied with the corresponding filter element
+                    auto const exactRow = static_cast<int32_t>(row) - filterWidth / 2 + fRow;
+                    auto const exactCol = static_cast<int32_t>(col) - filterWidth / 2 + fCol;
+                    if(exactRow >= 0 && exactRow < matrixHeight && exactCol >= 0 && exactCol < matrixWidth)
+                    {
+                        // Use the padded (allocated) widths for row strides, not the logical widths
+                        pValue += filter[fRow * filterWidthAllocated + fCol]
+                                  * input[exactRow * inputWidthAllocated + exactCol];
+                    }
+                }
+            }
+            // Write the accumulated value once, after both filter loops have finished
+            output[row * matrixWidth + col] = pValue;
+        }
+    }
+};
+
+/**
+ * @brief The kernel for a 2D convolutional filter using the tiling method. Tiles of the matrix are kept in shared
+ * memory. Block dimensions are equal to tile dimensions.
+ */
+struct ConvolutionKernel2DSharedMemory
+{
+    //! \tparam TAcc Accelerator type
+    //! \tparam TElem The input-matrix and filter-matrix element type
+    //! \param acc Accelerator
+    //! \param input Input matrix
+    //! \param output Output matrix
+    //! \param matrixWidth Input matrix width
+    //! \param matrixHeight Input matrix height
+    //! \param filter Filter-matrix
+    //! \param filterWidth Filter-matrix width
+    //! \param inputWidthAllocated Input-matrix width allocated (possibly larger than the logical width due to padding)
+    //! \param filterWidthAllocated Filter-matrix width allocated (possibly larger than the logical width due to
+    //! padding)
+    template<typename TAcc, typename TElem>
+    ALPAKA_FN_ACC auto operator()(
+        TAcc const& acc,
+        TElem const* const input,
+        TElem* output,
+        int32_t const matrixWidth,
+        int32_t const matrixHeight,
+        TElem const* const filter,
+        int32_t const filterWidth,
+        int32_t const inputWidthAllocated,
+        int32_t const filterWidthAllocated) const -> void
+    {
+        // Grid-thread index: position of this thread in the whole matrix
+        auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        // Extent (dimensions) of one block, which equals the tile extent
+        auto const blockThreadExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
+        // Index of this thread within its block, and the linearized form used to address shared memory
+        auto const blockThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
+        auto const blockThreadIdx1D = alpaka::mapIdx<1u>(blockThreadIdx, blockThreadExtent)[0u];
+        // Get scalars from the 2-element index/extent vectors
+        auto const [blockThreadExtentY, blockThreadExtentX] = blockThreadExtent;
+        auto const [blockThreadY, blockThreadX] = blockThreadIdx;
+        // Dynamic shared memory holding one tile (size supplied by the BlockSharedMemDynSizeBytes trait below)
+        auto* const sharedN = alpaka::getDynSharedMem<TElem>(acc);
+
+        // Fill the shared memory so that tile items are later read from shared memory instead of global memory.
+        // Threads mapping outside the matrix store zero so that stale data is never read.
+        if(row < matrixHeight && col < matrixWidth && blockThreadIdx1D < blockThreadExtent.prod())
+        {
+            sharedN[blockThreadIdx1D] = input[row * inputWidthAllocated + col];
+        }
+        else if(blockThreadIdx1D < blockThreadExtent.prod())
+        {
+            sharedN[blockThreadIdx1D] = 0.0f;
+        }
+
+        // Wait until the whole block has filled the shared memory with its tile of the input matrix
+        alpaka::syncBlockThreads(acc);
+
+        if(col < matrixWidth && row < matrixHeight)
+        {
+            TElem pValue{0.0f};
+            for(int32_t fRow = 0; fRow < filterWidth; fRow++)
+            {
+                for(int32_t fCol = 0; fCol < filterWidth; fCol++)
+                {
+                    // Position of the element to be multiplied with the corresponding filter element, relative to
+                    // the tile (block)
+                    auto const exactRowBlock = static_cast<int32_t>(blockThreadY) - filterWidth / 2 + fRow;
+                    auto const exactColBlock = static_cast<int32_t>(blockThreadX) - filterWidth / 2 + fCol;
+                    if(exactColBlock >= 0 && exactColBlock < blockThreadExtentX && exactRowBlock >= 0
+                       && exactRowBlock < blockThreadExtentY)
+                    {
+                        // The element is inside the tile: read it from shared memory
+                        pValue += filter[fRow * filterWidthAllocated + fCol]
+                                  * sharedN[exactRowBlock * blockThreadExtentX + exactColBlock];
+                    }
+                    else
+                    { // The element is not in this tile (block): compute its position with respect to the whole
+                      // input matrix and read it from global memory
+                        auto const exactRow = static_cast<int32_t>(row) - filterWidth / 2 + fRow;
+                        auto const exactCol = static_cast<int32_t>(col) - filterWidth / 2 + fCol;
+                        if(exactRow >= 0 && exactRow < matrixHeight && exactCol >= 0 && exactCol < matrixWidth)
+                        {
+                            // Use the padded (allocated) width for the global-memory row stride!
+                            pValue += filter[fRow * filterWidthAllocated + fCol]
+                                      * input[exactRow * inputWidthAllocated + exactCol];
+                        }
+                    }
+                }
+            }
+            // Write the accumulated value once, after both filter loops have finished
+            output[row * matrixWidth + col] = pValue;
+        }
+    }
+};
+
+// The specialisation used for the calculation of the dynamic shared memory size
+namespace alpaka::trait
+{
+    //! The trait for getting the size of the block shared dynamic memory for a kernel.
+    template<typename TAcc>
+    struct BlockSharedMemDynSizeBytes<ConvolutionKernel2DSharedMemory, TAcc>
+    {
+        //! \tparam TVec Type of the extent vectors
+        //! \tparam TElem Element type of the matrix
+        //! \return The size of the shared memory allocated for a block.
+        template<typename TVec, typename TElem>
+        ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
+            ConvolutionKernel2DSharedMemory const& /* convolutionKernel */,
+            TVec const& blockThreadExtent, // dimensions of threads per block
+            TVec const& threadElemExtent, // dimensions of elements per thread
+            TElem const* const, // input matrix
+            TElem*, // output array
+            int32_t const, // matrixWidth
+            int32_t const, // matrixHeight
+            TElem const* const, // filter
+            int32_t const, // filter width
+            int32_t const, // allocated input width
+            int32_t const) // allocated filter width
+        {
+            // Reserve the buffer; its size is the number of elements in a block (tile)
+            return static_cast<std::size_t>(blockThreadExtent.prod() * threadElemExtent.prod()) * sizeof(TElem);
+        }
+    };
+} // namespace alpaka::trait
+
+//! Tolerant float comparison used to validate sampled results against the reference values.
+auto FuzzyEqual(float a, float b) -> bool
+{
+    return std::fabs(a - b) < std::numeric_limits<float>::epsilon() * 1000.0f;
+}
+
+auto main() -> int
+{
+    // Define the index domain
+    using Dim = alpaka::DimInt<2>;
+    // Index type
+    using Idx = std::uint32_t;
+    using Vec = alpaka::Vec<Dim, Idx>;
+    // Define the accelerator
+    using DevAcc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
+
+    using DataType = float;
+    static constexpr Idx filterWidth = 5;
+    static constexpr Idx matrixWidth = 128;
+    static constexpr Idx matrixHeight = 128;
+
+    static_assert(
+        alpaka::Dim<DevAcc>::value == 2u,
+        "The accelerator used for the alpaka kernel has to be 2 dimensional!");
+
+    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<DevAcc>() << std::endl;
+
+    auto const devHost = alpaka::getDevByIdx(alpaka::PlatformCpu{}, 0);
+    // Select a device from the accelerator
+    auto const platformAcc = alpaka::Platform<DevAcc>{};
+    auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);
+
+    // Create a queue on the device
+    QueueAcc queueAcc(devAcc);
+    // Define the 2D extent (dimensions)
+    Vec const extent(static_cast<Idx>(matrixWidth), static_cast<Idx>(matrixHeight));
+
+    //
+    // Input vector allocation and copy to device buffer
+    //
+    std::vector<DataType> bufInputHost1D(extent.prod(), 1);
+    // Use increasing values as input
+    std::iota(bufInputHost1D.begin(), bufInputHost1D.end(), 1.0f);
+    for(DataType& element : bufInputHost1D)
+    {
+        element /= matrixWidth;
+    }
+    // Create a 2D view onto the host data
+    auto bufInputHostView = alpaka::createView(devHost, bufInputHost1D.data(), extent);
+
+    // Input buffer on the device
+    auto bufInputAcc = alpaka::allocBuf<DataType, Idx>(devAcc, extent);
+    // Copy the input view from host to device
+    alpaka::memcpy(queueAcc, bufInputAcc, bufInputHostView);
+    alpaka::wait(queueAcc);
+
+    // Calculate the allocated width; due to padding it might be larger than the matrix width
+    auto const inputWidthAllocated = [&]() -> Idx
+    {
+        // Calculate the pitch: the size of one line in bytes, including padding
+        auto const rowPitchInput{alpaka::getPitchesInBytes(bufInputAcc)[0]};
+        return static_cast<Idx>(rowPitchInput / sizeof(DataType));
+    }();
+
+    //
+    // Output buffer allocation on the device
+    //
+    alpaka::Vec<alpaka::DimInt<1u>, Idx> const extent1D(matrixHeight * matrixWidth);
+    auto outputDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, extent1D);
+
+    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    WorkDiv const workDiv(alpaka::getValidWorkDiv<DevAcc>(
+        devAcc,
+        extent,
+        Vec::ones(),
+        false,
+        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));
+
+    // Prepare the convolution filter
+    //
+    std::vector<DataType> const filter = {0.11, 0.12, 0.13, 0.14, 0.15, 0.21, 0.22, 0.23, 0.24, 0.25, 0.31, 0.32,
+                                          0.33, 0.34, 0.35, 0.41, 0.42, 0.43, 0.44, 0.45, 0.51, 0.52, 0.53, 0.54,
+                                          0.55};
+
+    Vec const filterExtent(static_cast<Idx>(filterWidth), static_cast<Idx>(filterWidth));
+    // Create a 2D view onto the filter data
+    auto bufFilterHostView = alpaka::createView(devHost, filter.data(), filterExtent);
+
+    // Filter buffer on the device
+    auto bufFilterAcc = alpaka::allocBuf<DataType, Idx>(devAcc, filterExtent);
+    // Copy the filter view from host to device
+    alpaka::memcpy(queueAcc, bufFilterAcc, bufFilterHostView);
+    alpaka::wait(queueAcc);
+
+    // Calculate the allocated width; due to padding it might be larger than the filter width
+    auto const filterWidthAllocated = [&]() -> Idx
+    {
+        // Calculate the pitch: the size of one line in bytes, including padding
+        auto const rowPitchFilter{alpaka::getPitchesInBytes(bufFilterAcc)[0]};
+        return static_cast<Idx>(rowPitchFilter / sizeof(DataType));
+    }();
+
+    // Construct the kernel object; choose one of the kernels provided: ConvolutionKernel2DGlobalMemory or
+    // ConvolutionKernel2DSharedMemory
+    ConvolutionKernel2DSharedMemory convolutionKernel2D;
+
+    // Run the kernel
+    alpaka::exec<DevAcc>(
+        queueAcc,
+        workDiv,
+        convolutionKernel2D,
+        alpaka::getPtrNative(bufInputAcc),
+        alpaka::getPtrNative(outputDeviceMemory),
+        matrixWidth,
+        matrixHeight,
+        alpaka::getPtrNative(bufFilterAcc),
+        filterWidth,
+        inputWidthAllocated,
+        filterWidthAllocated);
+
+    // Allocate memory on the host to receive the resulting matrix as an array
+    auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, extent1D);
+    // Copy the result from device memory to the host
+    alpaka::memcpy(queueAcc, resultGpuHost, outputDeviceMemory, extent1D);
+    alpaka::wait(queueAcc);
+
+    // Print results
+    //
+    std::string const kernelType{
+        std::is_same<decltype(convolutionKernel2D), ConvolutionKernel2DGlobalMemory>::value
+            ? "ConvolutionKernel2DGlobalMemory"
+            : "ConvolutionKernel2DSharedMemory"};
+
+    std::cout << "Convolution filter kernel:" << kernelType << "\n";
+    std::cout << "Matrix Size:" << matrixWidth << "x" << matrixHeight << ", Filter Size:" << filterWidth << "x"
+              << filterWidth << "\n";
+
+    // Print the output
+    for(size_t i{0}; i < matrixWidth * matrixHeight; ++i)
+    {
+        std::cout << "output[" << i << "]:" << std::setprecision(6) << resultGpuHost[i] << std::endl;
+    }
+
+    // Expected array of sampled results
+    std::vector<DataType> const expectedOutput{
+        4.622344e+00,
+        1.106426e+02,
+        2.162168e+02,
+        3.217910e+02,
+        4.273652e+02,
+        4.199258e+02,
+        6.385137e+02,
+        7.440879e+02,
+        8.496621e+02,
+        9.552363e+02,
+        4.390715e+02};
+    // Select samples from the output to check the results
+    size_t const numberOfSamples{10};
+    size_t const samplePeriod{matrixWidth * matrixHeight / numberOfSamples};
+    bool allEqual{true};
+    for(size_t i{0}; i < numberOfSamples; ++i)
+    {
+        // Compare with the reference results, selecting one out of every samplePeriod elements
+        bool fuzzyEqual = FuzzyEqual(resultGpuHost[i * samplePeriod], expectedOutput[i]);
+        if(!fuzzyEqual)
+            std::cout << resultGpuHost[i * samplePeriod] << " " << expectedOutput[i] << std::endl;
+        allEqual = allEqual && fuzzyEqual;
+    }
+    if(!allEqual)
+    {
+        std::cout << "Error: Some 2D convolution results don't match!\n";
+        return EXIT_FAILURE;
+    }
+    std::cout << "Sampled result checks are correct!\n";
+    return EXIT_SUCCESS;
+}