From aa3efcc019731851413c5451326c3a57c5f44216 Mon Sep 17 00:00:00 2001
From: Mehmet Yusufoglu <mehmet@rtr.ai>
Date: Mon, 15 Jan 2024 00:00:13 +0100
Subject: [PATCH] Convolution2D filter example using global and shared memory

---
 example/CMakeLists.txt                      |   1 +
 example/convolution2D/CMakeLists.txt        |  47 ++++
 example/convolution2D/src/convolution2D.cpp | 247 ++++++++++++++++++++
 3 files changed, 295 insertions(+)
 create mode 100644 example/convolution2D/CMakeLists.txt
 create mode 100644 example/convolution2D/src/convolution2D.cpp
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index b74c45b86243..e3eae8aa2b3b 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -17,6 +17,7 @@ project("alpakaExamples" LANGUAGES CXX)
 
 add_subdirectory("bufferCopy/")
 add_subdirectory("complex/")
+add_subdirectory("convolution2D/")
 add_subdirectory("counterBasedRng/")
 add_subdirectory("heatEquation/")
 add_subdirectory("helloWorld/")
diff --git a/example/convolution2D/CMakeLists.txt b/example/convolution2D/CMakeLists.txt
new file mode 100644
index 000000000000..2324f7b8ef2f
--- /dev/null
+++ b/example/convolution2D/CMakeLists.txt
@@ -0,0 +1,47 @@
+#
+# Copyright 2023 Erik Zenker, Benjamin Worpitz, Jan Stephan
+# SPDX-License-Identifier: ISC
+#
+
+################################################################################
+# Required CMake version.
+
+cmake_minimum_required(VERSION 3.22)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project.
+
+set(_TARGET_NAME convolution2D)
+
+project(${_TARGET_NAME} LANGUAGES CXX)
+
+#-------------------------------------------------------------------------------
+# Find alpaka, unless an enclosing project already provides the alpaka::alpaka target.
+
+if(NOT TARGET alpaka::alpaka)
+    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(alpaka_USE_SOURCE_TREE)
+        # Don't build the examples recursively
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
+
+#-------------------------------------------------------------------------------
+# Add the executable, link it against alpaka and register it as a test.
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    src/convolution2D.cpp)
+target_link_libraries(
+    ${_TARGET_NAME}
+    PUBLIC alpaka::alpaka)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/example/convolution2D/src/convolution2D.cpp b/example/convolution2D/src/convolution2D.cpp
new file mode 100644
index 000000000000..c345d97298f8
--- /dev/null
+++ b/example/convolution2D/src/convolution2D.cpp
@@ -0,0 +1,247 @@
+/* Copyright 2023 Benjamin Worpitz, Erik Zenker, Bernhard Manfred Gruber, Jan Stephan
+ * SPDX-License-Identifier: ISC
+ */
+
+#include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExampleDefaultAcc.hpp>
+
+#include <iomanip>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+//! Convolution Example
+//!
+//! A 2D convolutional filter example with padding.
+
+// The filter and each tile are TILE_DIM x TILE_DIM items; the matrix is MATRIX_HEIGHT x MATRIX_WIDTH items.
+static constexpr int32_t FILTER_RADIUS = 2;
+static constexpr int32_t TILE_DIM = 2 * FILTER_RADIUS + 1;
+static constexpr int32_t MATRIX_WIDTH = 10;
+static constexpr int32_t MATRIX_HEIGHT = 10;
+
+// The extern declaration below is used to silence clang's -Wmissing-variable-declarations warning, which requires
+// every non-static variable to be declared with extern before it is defined. Such forward declarations are only
+// needed when the variable is accessed from a different compilation unit, in which case they should be moved to a
+// common header. The array holds the filter, so its extents follow TILE_DIM.
+extern ALPAKA_STATIC_ACC_MEM_CONSTANT float g_constantMemory2DUninitialized[TILE_DIM][TILE_DIM];
+ALPAKA_STATIC_ACC_MEM_CONSTANT float g_constantMemory2DUninitialized[TILE_DIM][TILE_DIM];
+
+// 2D convolutional filter using only global memory for the matrix; the filter items are read through `filter`
+// (main() passes a pointer to constant memory).
+struct ConvolutionKernel2DGlobalMemory
+{
+    //! \param acc the accelerator, must be 2D (indices are unpacked as [row, col])
+    //! \param N input matrix, row-major, MatrixHeight x MatrixWidth items
+    //! \param P output matrix, same layout as N
+    //! \param filter row-major TILE_DIM x TILE_DIM filter coefficients
+    template<typename TAcc, typename TElem>
+    ALPAKA_FN_ACC auto operator()(
+        TAcc const& acc,
+        TElem const* const N,
+        TElem* P,
+        const std::size_t MatrixWidth,
+        const std::size_t MatrixHeight,
+        TElem const* const filter) const -> void
+    {
+        auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+
+        // The filter is centered on the item addressed by the thread index. Implicit zero padding: filter items that
+        // fall outside the matrix contribute nothing to the sum.
+        if(col < MatrixWidth && row < MatrixHeight)
+        {
+            auto Pvalue = 0.0f;
+            for(int32_t fRow = 0; fRow < TILE_DIM; fRow++)
+            {
+                for(int32_t fCol = 0; fCol < TILE_DIM; fCol++)
+                {
+                    auto const exactRow = static_cast<int32_t>(row) - FILTER_RADIUS + fRow;
+                    auto const exactCol = static_cast<int32_t>(col) - FILTER_RADIUS + fCol;
+                    if(exactRow >= 0 && exactRow < MatrixHeight && exactCol >= 0 && exactCol < MatrixWidth)
+                    {
+                        Pvalue += filter[fRow * TILE_DIM + fCol] * N[exactRow * MatrixWidth + exactCol];
+                    }
+                }
+            }
+            // Store once, after the whole filter has been accumulated (not once per filter row).
+            P[row * MatrixWidth + col] = Pvalue;
+        }
+    }
+};
+
+// 2D convolutional filter using the tiling method: each block first stages its tile of the matrix in shared memory.
+// Filter items are read through `filter`. Block dimensions must be equal to the tile dimensions (TILE_DIM x TILE_DIM).
+struct ConvolutionKernel2DSharedMemory
+{
+    //! \param acc the accelerator, must be 2D (indices are unpacked as [row, col])
+    //! \param N,P input / output matrix, both row-major with MatrixHeight x MatrixWidth items
+    //! \param filter row-major TILE_DIM x TILE_DIM filter coefficients
+    template<typename TAcc, typename TElem>
+    ALPAKA_FN_ACC auto operator()(
+        TAcc const& acc,
+        TElem const* const N,
+        TElem* P,
+        const std::size_t MatrixWidth,
+        const std::size_t MatrixHeight,
+        float const* const filter) const -> void
+    {
+        auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        auto const [blockThreadY, blockThreadX] = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
+        auto const [blockThreadExtentY, blockThreadExtentX] = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
+
+        // Allocate shared memory for one tile
+        auto& N_s = alpaka::declareSharedVar<TElem[TILE_DIM * TILE_DIM], __COUNTER__>(acc);
+
+        // Every thread stages its own item, so that tile items are later read from shared memory; zero if outside
+        if(row < MatrixHeight && col < MatrixWidth)
+        {
+            N_s[blockThreadY * blockThreadExtentX + blockThreadX] = N[row * MatrixWidth + col];
+        }
+        else
+        {
+            N_s[blockThreadY * blockThreadExtentX + blockThreadX] = 0.0;
+        }
+
+        // Wait until the block has filled the shared memory with its tile of the main matrix
+        alpaka::syncBlockThreads(acc);
+
+        if(col < MatrixWidth && row < MatrixHeight)
+        {
+            auto Pvalue{0.0f};
+            for(int32_t fRow = 0; fRow < TILE_DIM; fRow++)
+            {
+                for(int32_t fCol = 0; fCol < TILE_DIM; fCol++)
+                {
+                    auto const exactRowBlock = static_cast<int32_t>(blockThreadY) - FILTER_RADIUS + fRow;
+                    auto const exactColBlock = static_cast<int32_t>(blockThreadX) - FILTER_RADIUS + fCol;
+                    if(exactColBlock >= 0 && exactColBlock < TILE_DIM && exactRowBlock >= 0
+                       && exactRowBlock < TILE_DIM)
+                    {
+                        // the item lies inside this block's tile: get it from the shared memory
+                        Pvalue += filter[fRow * TILE_DIM + fCol]
+                                  * N_s[exactRowBlock * blockThreadExtentX + exactColBlock];
+                    }
+                    else
+                    {
+                        auto const exactRow = static_cast<int32_t>(row) - FILTER_RADIUS + fRow;
+                        auto const exactCol = static_cast<int32_t>(col) - FILTER_RADIUS + fCol;
+                        if(exactRow >= 0 && exactRow < MatrixHeight && exactCol >= 0 && exactCol < MatrixWidth)
+                        {
+                            // the item belongs to a neighbouring tile: get it from the global memory
+                            Pvalue += filter[fRow * TILE_DIM + fCol] * N[exactRow * MatrixWidth + exactCol];
+                        }
+                    }
+                }
+            }
+            // Store once, after the whole filter has been accumulated (not once per filter row).
+            P[row * MatrixWidth + col] = Pvalue;
+        } // if
+    }
+};
+
+auto main() -> int
+{
+    // Runs the selected convolution kernel on a MATRIX_HEIGHT x MATRIX_WIDTH matrix and prints the result.
+    // Index domain (2D) and index type
+    using Dim = alpaka::DimInt<2u>;
+    using Idx = std::uint32_t;
+    using Vec = alpaka::Vec<Dim, Idx>;
+    // Define the accelerator
+    using DevAcc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using QueueProperty = alpaka::NonBlocking;
+    using QueueAcc = alpaka::Queue<DevAcc, QueueProperty>;
+
+    using DataType = float;
+    static_assert(
+        alpaka::Dim<DevAcc>::value == 2u,
+        "The accelerator used for the AlpakaKernel has to be 2 dimensional!");
+    static_assert(
+        MATRIX_WIDTH % TILE_DIM == 0 && MATRIX_HEIGHT % TILE_DIM == 0,
+        "Matrix dimensions are not multiples of Tile dimensions");
+
+    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<DevAcc>() << std::endl;
+
+    auto const devHost = alpaka::getDevByIdx(alpaka::PlatformCpu{}, 0);
+    // Select a device from the accelerator
+    auto const platformAcc = alpaka::Platform<DevAcc>{};
+    auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);
+
+    // Create a queue on the device
+    QueueAcc queueAcc(devAcc);
+    // Define extent (dimensions), ordered (rows, columns) like the [row, col] indices used by the kernels
+    Vec const extent(static_cast<Idx>(MATRIX_HEIGHT), static_cast<Idx>(MATRIX_WIDTH));
+    // Kernel Input
+    std::vector<DataType> bufInputHost(MATRIX_HEIGHT * MATRIX_WIDTH);
+    std::iota(bufInputHost.begin(), bufInputHost.end(), 1.0f);
+    auto bufInputHostView = alpaka::createView(devHost, bufInputHost, extent);
+
+    // Input buffer at device
+    auto bufInputAcc = alpaka::allocBuf<DataType, Idx>(devAcc, extent);
+    auto bufInputAccView = alpaka::createView(devAcc, bufInputAcc, extent);
+    alpaka::memcpy(queueAcc, bufInputAccView, bufInputHostView);
+    alpaka::wait(queueAcc);
+    // Output buffer in device
+    alpaka::Vec<alpaka::DimInt<1u>, Idx> const extent1D(MATRIX_HEIGHT * MATRIX_WIDTH);
+    auto outputDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, extent1D);
+
+    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+
+    // The matrix is tiled. Each block of threads uses one tile, tile dimensions equal to block dims.
+    auto blocksPerGrid = alpaka::Vec<Dim, Idx>{MATRIX_HEIGHT / TILE_DIM, MATRIX_WIDTH / TILE_DIM};
+    auto const threadsPerBlock = alpaka::Vec<Dim, Idx>{TILE_DIM, TILE_DIM};
+    auto const elementsPerThread = alpaka::Vec<Dim, Idx>{1u, 1u};
+    auto workDiv = WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread};
+
+    // convolution filter
+    std::vector<DataType> filter = {0.11, 0.12, 0.13, 0.14, 0.15, 0.21, 0.22, 0.23, 0.24, 0.25, 0.31, 0.32, 0.33,
+                                    0.34, 0.35, 0.41, 0.42, 0.43, 0.44, 0.45, 0.51, 0.52, 0.53, 0.54, 0.55};
+    auto bufHostFilter = alpaka::createView(devHost, filter.data(), Vec{TILE_DIM, TILE_DIM});
+    // Use constant memory in device for convolution filter
+    auto viewConstantMemUninitialized
+        = alpaka::createStaticDevMemView(&g_constantMemory2DUninitialized[0u][0u], devAcc, Vec{TILE_DIM, TILE_DIM});
+    // Copy filter matrix to the constant memory
+    alpaka::memcpy(queueAcc, viewConstantMemUninitialized, bufHostFilter);
+    alpaka::wait(queueAcc);
+
+    // Construct kernel object
+    ConvolutionKernel2DSharedMemory convolutionKernel2D;
+
+    // Run the kernel
+    auto const taskKernel = alpaka::createTaskKernel<DevAcc>(
+        workDiv,
+        convolutionKernel2D,
+        alpaka::getPtrNative(bufInputAccView),
+        alpaka::getPtrNative(outputDeviceMemory),
+        MATRIX_WIDTH,
+        MATRIX_HEIGHT,
+        alpaka::getPtrNative(viewConstantMemUninitialized));
+
+    alpaka::enqueue(queueAcc, taskKernel);
+    alpaka::wait(queueAcc);
+    // Allocate memory on host and copy the result from device memory into it
+    auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, extent1D);
+    alpaka::memcpy(queueAcc, resultGpuHost, outputDeviceMemory, extent1D);
+    // The queue is non-blocking: wait for the copy to finish before the host reads the result
+    alpaka::wait(queueAcc);
+
+    // Print results at the host
+    printf(
+        "Convolution filter kernel: %s\n",
+        std::is_same<decltype(convolutionKernel2D), ConvolutionKernel2DGlobalMemory>::value
+            ? "ConvolutionKernel2DGlobalMemory"
+            : "ConvolutionKernel2DSharedMemory");
+    printf(
+        "Matrix Size: %d x %d, Filter Size: %d x %d, Tile Size: %d x %d\n",
+        MATRIX_WIDTH,
+        MATRIX_HEIGHT,
+        TILE_DIM,
+        TILE_DIM,
+        TILE_DIM,
+        TILE_DIM);
+    for(size_t i{0}; i < MATRIX_WIDTH * MATRIX_HEIGHT; i++)
+    {
+        DataType const& val(alpaka::getPtrNative(resultGpuHost)[i]);
+        std::cout << "output[" << i << "]:" << std::setprecision(3) << val << std::endl;
+    }
+    return EXIT_SUCCESS;
+}