Convolution2D filter example using global and shared memory

alpaka-group · Jan 26, 2024 · 8a7a920 · 8a7a920
1 parent 15a56e9
commit 8a7a920
Show file tree

Hide file tree

Showing 3 changed files with 349 additions and 0 deletions.
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
@@ -18,6 +18,7 @@ project("alpakaExamples" LANGUAGES CXX)
 add_subdirectory("bufferCopy/")
 add_subdirectory("complex/")
 add_subdirectory("convolution1D/")
+add_subdirectory("convolution2D/")
 add_subdirectory("counterBasedRng/")
 add_subdirectory("heatEquation/")
 add_subdirectory("helloWorld/")

diff --git a/example/convolution2D/CMakeLists.txt b/example/convolution2D/CMakeLists.txt
@@ -0,0 +1,47 @@
+#
+# Copyright 2023 Erik Zenker, Benjamin Worpitz, Jan Stephan
+# SPDX-License-Identifier: ISC
+#
+
+################################################################################
+# Required CMake version.
+
+cmake_minimum_required(VERSION 3.22)
+
+# Let IDE generators group targets into folders (used by the FOLDER property set at the end of this file).
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project.
+
+# The same name is used for the project, the executable target and the registered test.
+set(_TARGET_NAME convolution2D)
+
+project(${_TARGET_NAME} LANGUAGES CXX)
+
+#-------------------------------------------------------------------------------
+# Find alpaka.
+
+# Only look for alpaka if the alpaka::alpaka target does not exist yet, e.g. when this example is configured on its
+# own instead of through the parent example/CMakeLists.txt.
+if(NOT TARGET alpaka::alpaka)
+    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(alpaka_USE_SOURCE_TREE)
+        # Don't build the examples recursively
+        set(alpaka_BUILD_EXAMPLES OFF)
+        # The alpaka source root is two directories above this example.
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
+
+#-------------------------------------------------------------------------------
+# Add executable.
+
+# alpaka_add_executable is provided by alpaka; presumably it wraps add_executable with back-end specific handling
+# (TODO confirm against alpaka's CMake documentation).
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    src/convolution2D.cpp)
+target_link_libraries(
+    ${_TARGET_NAME}
+    PUBLIC alpaka::alpaka)
+
+# Show the target inside the "example" folder in IDEs.
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
+
+# Register the example executable as a test that is run without arguments.
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/example/convolution2D/src/convolution2D.cpp b/example/convolution2D/src/convolution2D.cpp
@@ -0,0 +1,301 @@
+/* Copyright 2023 Benjamin Worpitz, Erik Zenker, Bernhard Manfred Gruber, Jan Stephan
+ * SPDX-License-Identifier: ISC
+ */
+
+#include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExampleDefaultAcc.hpp>
+
+#include <iomanip>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+//! Convolution Example
+//!
+//! A 2D convolutional filter applied to a matrix. The filter-matrix values are kept in constant memory in order to
+//! increase performance. Kernel1 (ConvolutionKernel2DGlobalMemory) reads the matrix from global memory, without
+//! tiling. Kernel2 (ConvolutionKernel2DSharedMemory) uses tiling; the block size is assumed to be equal to the tile
+//! size. The tile is first copied to shared memory because each tile element is accessed many times, so reading it
+//! from shared memory increases performance. Each block works on the domain of one tile. At the border of a tile,
+//! matrix values that belong to a neighbouring tile are needed; those values are read from global memory.
+
+// Width (and height) of the square filter-matrix. It also fixes the size of the array declared below.
+#define FILTER_WIDTH 5
+// Statically declared accelerator memory for the filter-matrix. main() copies the filter values into it before the
+// kernel is launched, hence "Uninitialized" in the name.
+ALPAKA_STATIC_ACC_MEM_CONSTANT float g_constantMemory2DUninitialized[FILTER_WIDTH][FILTER_WIDTH];
+
+/**
+ * @brief 2D convolutional filter kernel that reads the input matrix directly from global memory.
+ *
+ * Each thread computes one output element. The center of the filter is positioned on the matrix element addressed by
+ * the thread's grid index and the products of the overlapping filter and matrix elements are accumulated. Filter
+ * elements that fall outside the matrix are skipped, which is equivalent to an implicit zero padding of the matrix.
+ */
+struct ConvolutionKernel2DGlobalMemory
+{
+    /**
+    @tparam TAcc Accelerator type
+    @tparam TElem The matrix and filter-matrix element type
+    @param acc Accelerator
+    @param input Input matrix, row-major, matrixHeight x matrixWidth elements
+    @param output Output matrix, same size and layout as the input matrix
+    @param matrixWidth Input matrix width
+    @param matrixHeight Input matrix height
+    @param filter Filter-matrix, row-major, filterWidth x filterWidth elements
+    @param filterWidth Width (and height) of the square filter-matrix
+    */
+    template<typename TAcc, typename TElem>
+    ALPAKA_FN_ACC auto operator()(
+        TAcc const& acc,
+        TElem const* const input,
+        TElem* output,
+        std::size_t const matrixWidth,
+        std::size_t const matrixHeight,
+        TElem const* const filter,
+        int32_t const filterWidth) const -> void
+    {
+        auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+
+        // The grid may be larger than the matrix; surplus threads have nothing to compute.
+        if(col >= matrixWidth || row >= matrixHeight)
+        {
+            return;
+        }
+
+        // Offset between the filter center and its first row/column.
+        auto const filterRadius = filterWidth / 2;
+
+        auto pValue = static_cast<TElem>(0);
+        for(int32_t fRow = 0; fRow < filterWidth; fRow++)
+        {
+            // Row of the input matrix element that is multiplied with filter row fRow.
+            auto const exactRow = static_cast<int32_t>(row) - filterRadius + fRow;
+            if(exactRow < 0 || static_cast<std::size_t>(exactRow) >= matrixHeight)
+            {
+                // The whole filter row lies outside the matrix (zero padding).
+                continue;
+            }
+
+            for(int32_t fCol = 0; fCol < filterWidth; fCol++)
+            {
+                // Column of the input matrix element that is multiplied with filter column fCol.
+                auto const exactCol = static_cast<int32_t>(col) - filterRadius + fCol;
+                if(exactCol >= 0 && static_cast<std::size_t>(exactCol) < matrixWidth)
+                {
+                    auto const inputIdx
+                        = static_cast<std::size_t>(exactRow) * matrixWidth + static_cast<std::size_t>(exactCol);
+                    pValue += filter[fRow * filterWidth + fCol] * input[inputIdx];
+                }
+            }
+        }
+
+        // Write the result once, after the whole filter has been applied.
+        output[row * matrixWidth + col] = pValue;
+    }
+};
+
+/**
+ * @brief 2D convolutional filter kernel that uses tiling.
+ *
+ * Every block first copies its tile of the input matrix into dynamic block shared memory; the tile dimensions are
+ * equal to the block-thread dimensions. Each thread then computes one output element. Matrix elements inside the
+ * thread's own tile are read from shared memory, elements belonging to neighbouring tiles are read from global
+ * memory, and elements outside the matrix are skipped (implicit zero padding).
+ */
+struct ConvolutionKernel2DSharedMemory
+{
+    /**
+    @tparam TAcc Accelerator type
+    @tparam TElem The matrix element type
+    @param acc Accelerator
+    @param input Input matrix, row-major, matrixHeight x matrixWidth elements
+    @param output Output matrix, same size and layout as the input matrix
+    @param matrixWidth Input matrix width
+    @param matrixHeight Input matrix height
+    @param filter Filter-matrix, row-major, filterWidth x filterWidth elements
+    @param filterWidth Width (and height) of the square filter-matrix
+    */
+    template<typename TAcc, typename TElem>
+    ALPAKA_FN_ACC auto operator()(
+        TAcc const& acc,
+        TElem const* const input,
+        TElem* output,
+        std::size_t const matrixWidth,
+        std::size_t const matrixHeight,
+        float const* const filter,
+        int32_t const filterWidth) const -> void
+    {
+        auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        // Extent of one block in threads; this is also the extent of the tile held in shared memory.
+        auto const blockThreadExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
+        // Position of this thread inside its block, as 2D index and linearized
+        auto const blockThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
+        auto const blockThreadIdx1D = alpaka::mapIdx<1u>(blockThreadIdx, blockThreadExtent)[0u];
+
+        auto const [blockThreadExtentY, blockThreadExtentX] = blockThreadExtent;
+        auto const [blockThreadY, blockThreadX] = blockThreadIdx;
+        auto const tileHeight = static_cast<int32_t>(blockThreadExtentY);
+        auto const tileWidth = static_cast<int32_t>(blockThreadExtentX);
+
+        bool const insideMatrix = row < matrixHeight && col < matrixWidth;
+
+        // Get the dynamic shared memory of the block and let every thread store its own matrix element in it. Threads
+        // outside the matrix store zero, so those tile slots act as zero padding.
+        auto* const sharedN = alpaka::getDynSharedMem<TElem>(acc);
+        sharedN[blockThreadIdx1D] = insideMatrix ? input[row * matrixWidth + col] : static_cast<TElem>(0);
+
+        // Wait until the whole block has filled the tile. Every thread of the block has to reach this barrier, so
+        // threads outside the matrix must not leave the kernel before it.
+        alpaka::syncBlockThreads(acc);
+
+        if(!insideMatrix)
+        {
+            return;
+        }
+
+        // Offset between the filter center and its first row/column.
+        auto const filterRadius = filterWidth / 2;
+
+        auto pValue = static_cast<TElem>(0);
+        for(int32_t fRow = 0; fRow < filterWidth; fRow++)
+        {
+            for(int32_t fCol = 0; fCol < filterWidth; fCol++)
+            {
+                // Position of the needed matrix element relative to the tile (block).
+                auto const tileRow = static_cast<int32_t>(blockThreadY) - filterRadius + fRow;
+                auto const tileCol = static_cast<int32_t>(blockThreadX) - filterRadius + fCol;
+                // The tile has the extent of the block in threads, so the position has to be checked against the
+                // block-thread extent (not against the number of blocks in the grid).
+                bool const insideTile = tileRow >= 0 && tileRow < tileHeight && tileCol >= 0 && tileCol < tileWidth;
+                if(insideTile)
+                {
+                    // The element is inside the tile. Get the element from the shared memory
+                    pValue += filter[fRow * filterWidth + fCol] * sharedN[tileRow * tileWidth + tileCol];
+                    continue;
+                }
+
+                // The element belongs to another tile. Compute its position relative to the input matrix and read it
+                // from global memory, unless it lies outside the matrix.
+                auto const exactRow = static_cast<int32_t>(row) - filterRadius + fRow;
+                auto const exactCol = static_cast<int32_t>(col) - filterRadius + fCol;
+                if(exactRow >= 0 && static_cast<std::size_t>(exactRow) < matrixHeight && exactCol >= 0
+                   && static_cast<std::size_t>(exactCol) < matrixWidth)
+                {
+                    auto const inputIdx
+                        = static_cast<std::size_t>(exactRow) * matrixWidth + static_cast<std::size_t>(exactCol);
+                    pValue += filter[fRow * filterWidth + fCol] * input[inputIdx];
+                }
+            }
+        }
+
+        // Write the result once, after the whole filter has been applied.
+        output[row * matrixWidth + col] = pValue;
+    }
+};
+
+namespace alpaka::trait
+{
+    //! Specialization of the trait that reports the size of the dynamic block shared memory for
+    //! ConvolutionKernel2DSharedMemory. The trailing parameters mirror the kernel arguments; only the work division
+    //! extents and the element type are used.
+    template<typename TAcc>
+    struct BlockSharedMemDynSizeBytes<ConvolutionKernel2DSharedMemory, TAcc>
+    {
+        //! \return The size in bytes of the dynamic shared memory allocated for a block.
+        template<typename TVec, typename TElem>
+        ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
+            ConvolutionKernel2DSharedMemory const& /* kernel */,
+            TVec const& blockThreadExtent,
+            TVec const& threadElemExtent,
+            TElem const* const /* input */,
+            TElem* /* output */,
+            std::size_t const /* matrixWidth */,
+            std::size_t const /* matrixHeight */,
+            float const* const /* filter */,
+            int32_t const /* filterWidth */)
+        {
+            // The kernel fills one TElem per block thread. The factor 2u reserves twice the number of threads in the
+            // block before scaling by the number of elements per thread.
+            auto const doubledThreadCount = 2u * blockThreadExtent.prod();
+            auto const reservedElemCount = doubledThreadCount * threadElemExtent.prod();
+            return static_cast<std::size_t>(reservedElemCount) * sizeof(TElem);
+        }
+    };
+} // namespace alpaka::trait
+
+//! Runs the 2D convolution example: fills a matrix with increasing values, applies the filter-matrix on the
+//! accelerator with one of the kernels defined above and prints the filtered matrix.
+//!
+//! \return EXIT_SUCCESS
+auto main() -> int
+{
+    // Define the index domain
+    using Dim = alpaka::DimInt<2u>;
+    // Index type
+    using Idx = std::uint32_t;
+    using Vec = alpaka::Vec<Dim, Idx>;
+    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+    // Define the accelerator
+    using DevAcc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using QueueProperty = alpaka::NonBlocking;
+    using QueueAcc = alpaka::Queue<DevAcc, QueueProperty>;
+
+    using DataType = float;
+    static constexpr int32_t filterWidth = FILTER_WIDTH;
+    static constexpr int32_t matrixWidth = 10;
+    static constexpr int32_t matrixHeight = 10;
+    // Number of matrix elements, computed once as an unsigned value for buffer sizes and loops
+    static constexpr auto elementCount
+        = static_cast<std::size_t>(matrixHeight) * static_cast<std::size_t>(matrixWidth);
+
+    static_assert(
+        alpaka::Dim<DevAcc>::value == 2u,
+        "The accelerator used for the Alpaka Kernel has to be 2 dimensional!");
+
+    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<DevAcc>() << std::endl;
+
+    auto const devHost = alpaka::getDevByIdx(alpaka::PlatformCpu{}, 0);
+    // Select a device from the accelerator
+    auto const platformAcc = alpaka::Platform<DevAcc>{};
+    auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);
+
+    // Create a queue on the device
+    QueueAcc queueAcc(devAcc);
+    // Define extent (dimensions). The kernels treat the first index component as row and the second one as column,
+    // so the extent is given as (height, width).
+    Vec const extent(static_cast<Idx>(matrixHeight), static_cast<Idx>(matrixWidth));
+    // Kernel Input
+    std::vector<DataType> bufInputHost(elementCount);
+    // Use increasing values as input
+    std::iota(bufInputHost.begin(), bufInputHost.end(), 1.0f);
+    auto bufInputHostView = alpaka::createView(devHost, bufInputHost, extent);
+
+    // Input buffer at device
+    auto bufInputAcc = alpaka::allocBuf<DataType, Idx>(devAcc, extent);
+    auto bufInputAccView = alpaka::createView(devAcc, bufInputAcc, extent);
+    alpaka::memcpy(queueAcc, bufInputAccView, bufInputHostView);
+    alpaka::wait(queueAcc);
+    // Output buffer at device
+    alpaka::Vec<alpaka::DimInt<1u>, Idx> const extent1D(static_cast<Idx>(elementCount));
+    auto outputDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, extent1D);
+
+    //  Let alpaka calculate good block and grid sizes given our full problem extent.
+    WorkDiv const workDiv(alpaka::getValidWorkDiv<DevAcc>(
+        devAcc,
+        extent,
+        Vec::ones(),
+        false,
+        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));
+
+    // convolution filter
+    std::vector<DataType> const filter = {0.11, 0.12, 0.13, 0.14, 0.15, 0.21, 0.22, 0.23, 0.24, 0.25, 0.31, 0.32, 0.33,
+                                          0.34, 0.35, 0.41, 0.42, 0.43, 0.44, 0.45, 0.51, 0.52, 0.53, 0.54, 0.55};
+    auto const bufHostFilter = alpaka::createView(devHost, filter.data(), Vec{filterWidth, filterWidth});
+
+    auto viewConstantMemUninitialized = alpaka::createStaticDevMemView(
+        &g_constantMemory2DUninitialized[0u][0u],
+        devAcc,
+        Vec{filterWidth, filterWidth});
+
+    // Copy the filter-matrix to the constant memory
+    alpaka::memcpy(queueAcc, viewConstantMemUninitialized, bufHostFilter);
+    alpaka::wait(queueAcc);
+
+    // Construct kernel object, choose one of the kernels provided above:
+    // ConvolutionKernel2DGlobalMemory or ConvolutionKernel2DSharedMemory
+    ConvolutionKernel2DSharedMemory convolutionKernel2D;
+
+    // Run the kernel
+    alpaka::exec<DevAcc>(
+        queueAcc,
+        workDiv,
+        convolutionKernel2D,
+        alpaka::getPtrNative(bufInputAccView),
+        alpaka::getPtrNative(outputDeviceMemory),
+        matrixWidth,
+        matrixHeight,
+        alpaka::getPtrNative(viewConstantMemUninitialized),
+        filterWidth);
+
+    alpaka::wait(queueAcc);
+    // Allocate memory on host
+    auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, extent1D);
+    // Copy from device memory to host
+    alpaka::memcpy(queueAcc, resultGpuHost, outputDeviceMemory, extent1D);
+    // The queue is non-blocking: wait until the copy has finished before the host reads the result.
+    alpaka::wait(queueAcc);
+
+    // Print results at the host
+    printf(
+        "Convolution filter kernel: %s\n",
+        std::is_same<decltype(convolutionKernel2D), ConvolutionKernel2DGlobalMemory>::value
+            ? "ConvolutionKernel2DGlobalMemory"
+            : "ConvolutionKernel2DSharedMemory");
+    printf("Matrix Size: %d x %d, Filter Size: %d x %d\n", matrixWidth, matrixHeight, filterWidth, filterWidth);
+    auto const* const results = alpaka::getPtrNative(resultGpuHost);
+    std::cout << std::setprecision(3);
+    for(std::size_t i{0}; i < elementCount; i++)
+    {
+        std::cout << "output[" << i << "]:" << results[i] << '\n';
+    }
+    return EXIT_SUCCESS;
+}