Skip to content


Convolution2D filter example using global and shared memory
Browse files Browse the repository at this point in the history
  • Loading branch information
Mehmet Yusufoglu committed Jan 26, 2024
1 parent 15a56e9 commit 8a7a920
Show file tree
Hide file tree
Showing 3 changed files with 349 additions and 0 deletions.
1 change: 1 addition & 0 deletions example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ project("alpakaExamples" LANGUAGES CXX)
Expand Down
47 changes: 47 additions & 0 deletions example/convolution2D/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright 2023 Erik Zenker, Benjamin Worpitz, Jan Stephan
# SPDX-License-Identifier: ISC

# Required CMake version.

cmake_minimum_required(VERSION 3.22)


# Project.

set(_TARGET_NAME convolution2D)

project(${_TARGET_NAME} LANGUAGES CXX)


# Find alpaka.
# Skipped entirely when a parent project has already provided the alpaka::alpaka target.

if(NOT TARGET alpaka::alpaka)
    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)

    if(alpaka_USE_SOURCE_TREE)
        # Don't build the examples recursively
        set(alpaka_BUILD_EXAMPLES OFF)
        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
    else()
        find_package(alpaka REQUIRED)
    endif()
endif()


# Add executable.
# alpaka_add_executable is used instead of add_executable so that the source file is set up for whichever
# accelerator back-end alpaka was configured with.

alpaka_add_executable(
    ${_TARGET_NAME}
    src/convolution2D.cpp)
target_link_libraries(
    ${_TARGET_NAME}
    PUBLIC alpaka::alpaka)

set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)

301 changes: 301 additions & 0 deletions example/convolution2D/src/convolution2D.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,301 @@
/* Copyright 2023 Benjamin Worpitz, Erik Zenker, Bernhard Manfred Gruber, Jan Stephan
 * SPDX-License-Identifier: ISC
 */

#include <alpaka/alpaka.hpp>
#include <alpaka/example/ExampleDefaultAcc.hpp>

#include <iomanip>
#include <iostream>
#include <numeric>
#include <vector>

//! Convolution Example
//! A 2D Convolutional filter applied to a matrix. The values of filter-matrix are kept in constant memory in order to
//! increase performance. Kernel1: Global memory is used, without tiling. Kernel2: Uses tiling. Block size is assumed
//! to be equal to tile size. First, the tile is copied to shared memory, since an element in a tile would be accessed
//! many times; using the shared memory increases performance. Each block works on the domain of one tile. But at the
//! border of the tile, some external matrix values are needed (at the border with another tile) then those matrix
//! values are taken from the global memory.

#define FILTER_WIDTH 5

* @brief 2D Convolutional Filter using only global memory for matrix, and constant memory for the filter
struct ConvolutionKernel2DGlobalMemory
@tparam TAcc Accelerator type
@tparam TElem The matrix and filter-matrix element type *
@param acc Accelerator
@param input Input matrix
@param output Output matrix
@param matrixWidth Input matrix width
@param matrixHeight Input matrix height
@param filter Filter-matrix
@param filter Matrix width
template<typename TAcc, typename TElem>
ALPAKA_FN_ACC auto operator()(
TAcc const& acc,
TElem const* const input,
TElem* output,
std::size_t const matrixWidth,
std::size_t const matrixHeight,
TElem const* const filter,
int32_t const filterWidth) const -> void
auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const [blockThreadY, blockThreadX] = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);

// The convolutional filter-matrix applied to the input matrix. The center of filter is positioned to the item
// pointed by the thread index. An implicit "zero padding" is used. If some of the items of the filter are
// outside the matrix, those are not taken into calculation in other words the corresponding items of the
// matrix are assumed zero.
if(col < matrixWidth && row < matrixHeight)
auto pValue{0.0f};
for(int32_t fRow = 0; fRow < filterWidth; fRow++)
for(int32_t fCol = 0; fCol < filterWidth; fCol++)
// Position of input matrix element to be multiplied with the corresponding element at filter
auto const exactRow = static_cast<int32_t>(row) - filterWidth / 2 + fRow;
auto const exactCol = static_cast<int32_t>(col) - filterWidth / 2 + fCol;
if(exactRow >= 0 && exactRow < matrixHeight && exactCol >= 0 && exactCol < matrixWidth)
pValue += filter[fRow * filterWidth + fCol] * input[exactRow * matrixWidth + exactCol];
output[row * matrixWidth + col] = pValue;

* @brief The ConvolutionKernel2DSharedMemory struct. The operator() is a kernel for 2D Convolutional Filter, uses
tiling method. Tiles of matrix are kept in the shared memory. For the filter, the constant memory is used. Block
dimensions are equal to tile dimensions.
struct ConvolutionKernel2DSharedMemory
@tparam TAcc Accelerator type
@tparam TElem The matrix and filter-matrix element type *
@param acc Accelerator
@param input Input matrix
@param output Output matrix
@param matrixWidth Input matrix width
@param matrixHeight Input matrix height
@param filter Filter-matrix
@param filter Matrix width
template<typename TAcc, typename TElem>
ALPAKA_FN_ACC auto operator()(
TAcc const& acc,
TElem const* const input,
TElem* output,
std::size_t const matrixWidth,
std::size_t const matrixHeight,
float const* const filter,
int32_t const filterWidth) const -> void
auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
// Get extents(dimensions)
auto const gridBlockExtent(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc));
auto const blockThreadExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
// Get indexes
auto const blockThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
auto const blockThreadIdx1D = alpaka::mapIdx<1u>(blockThreadIdx, blockThreadExtent)[0u];

auto const [blockThreadExtentY, blockThreadExtentX] = blockThreadExtent;
auto const [blockThreadY, blockThreadX] = blockThreadIdx;
// Allocate shared memory
auto* const sharedN = alpaka::getDynSharedMem<TElem>(acc);
// Fill shared memory of device so that tile items are accessed from shared memory
if(row < matrixHeight && col < matrixWidth)
sharedN[blockThreadIdx1D] = input[row * matrixWidth + col];
sharedN[blockThreadIdx1D] = 0.0;

// Wait for the block fills the shared memory with the tile of the main matrix

if(col < matrixWidth && row < matrixHeight)
auto pValue{0.0f};
for(int32_t fRow = 0; fRow < filterWidth; fRow++)
for(int32_t fCol = 0; fCol < filterWidth; fCol++)
// Position of input matrix element to be multiplied with the corresponding element at the filter.
// The position is with respect to tile(block)
auto const exactRowBlock = static_cast<int32_t>(blockThreadY) - filterWidth / 2 + fRow;
auto const exactColBlock = static_cast<int32_t>(blockThreadX) - filterWidth / 2 + fCol;
if(exactColBlock >= 0 && exactColBlock < gridBlockExtent[1] && exactRowBlock >= 0
&& exactRowBlock < gridBlockExtent[0])
// The element is inside the tile. Get the element from the shared memory
pValue += filter[fRow * filterWidth + fCol]
* sharedN[exactRowBlock * blockThreadExtentX + exactColBlock];
{ // The element is not in the tile(block)
// Position of input matrix element to be multiplied with the corresponding element at the
// filter. The position is with respect to the input matrix
auto const exactRow = static_cast<int32_t>(row) - filterWidth / 2 + fRow;
auto const exactCol = static_cast<int32_t>(col) - filterWidth / 2 + fCol;
if(exactRow >= 0 && exactRow < matrixHeight && exactCol >= 0 && exactCol < matrixWidth)
// get the item from the global memory
pValue += filter[fRow * filterWidth + fCol] * input[exactRow * matrixWidth + exactCol];
output[row * matrixWidth + col] = pValue;
} // if

namespace alpaka::trait
//! The trait for getting the size of the block shared dynamic memory for a kernel.
template<typename TAcc>
struct BlockSharedMemDynSizeBytes<ConvolutionKernel2DSharedMemory, TAcc>
//! \return The size of the shared memory allocated for a block.
template<typename TVec, typename TElem>
ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
ConvolutionKernel2DSharedMemory const& /* matMulKernel */,
TVec const& blockThreadExtent,
TVec const& threadElemExtent,
TElem const* const, // input Matrix
TElem*, // output array
std::size_t const, // matrixWidth
std::size_t const, // matrixHeight
float const* const, // filter
const int32_t) // filter size
// Reserve the buffer for the two blocks of A and B.
return static_cast<std::size_t>(2u * * * sizeof(TElem);
} // namespace alpaka::trait

// Host-side driver of the example: builds a 10 x 10 input matrix and a 5 x 5 filter, copies both to the
// accelerator device, instantiates one of the convolution kernels and prints the resulting matrix.
// NOTE(review): several statements below are visibly truncated in this copy of the file (they are marked
// individually); restore them from the repository before building.
// NOTE(review): printf and std::is_same are used below, but <cstdio> and <type_traits> are not among the
// includes at the top of the file - they are presumably pulled in transitively; confirm.
auto main() -> int
// Define the index domain
using Dim = alpaka::DimInt<2u>;
// Index type
using Idx = std::uint32_t;
using Vec = alpaka::Vec<Dim, Idx>;
// Define the accelerator
using DevAcc = alpaka::ExampleDefaultAcc<Dim, Idx>;
// The queue is non-blocking, so operations enqueued below may still be running when the call returns.
using QueueProperty = alpaka::NonBlocking;
using QueueAcc = alpaka::Queue<DevAcc, QueueProperty>;

// Element type of the matrix and the filter, and the problem sizes
using DataType = float;
static constexpr int32_t filterWidth = FILTER_WIDTH;
static constexpr int32_t matrixWidth = 10;
static constexpr int32_t matrixHeight = 10;

// NOTE(review): these two lines are the arguments of a compile-time dimensionality check whose opening
// (presumably `static_assert(`) is missing here.
alpaka::Dim<DevAcc>::value == 2u,
"The accelerator used for the Alpaka Kernel has to be 2 dimensional!");

std::cout << "Using alpaka accelerator: " << alpaka::getAccName<DevAcc>() << std::endl;

// Select the host device
auto const devHost = alpaka::getDevByIdx(alpaka::PlatformCpu{}, 0);
// Select a device from the accelerator
auto const platformAcc = alpaka::Platform<DevAcc>{};
auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);

// Create a queue on the device
QueueAcc queueAcc(devAcc);
// Define extent (dimensions)
// NOTE(review): the kernels destructure 2D indices as [row, col], i.e. the first component is the row. Here
// the width is passed first; harmless while width == height, but verify the order for non-square matrices.
Vec const extent(static_cast<Idx>(matrixWidth), static_cast<Idx>(matrixHeight));
// Kernel Input
std::vector<DataType> bufInputHost(matrixHeight * matrixWidth);
// Use increasing values as input
std::iota(bufInputHost.begin(), bufInputHost.end(), 1.0f);
// Non-owning view over the host vector so that it can be used as a copy source
auto bufInputHostView = alpaka::createView(devHost, bufInputHost, extent);

// Input buffer at device
auto bufInputAcc = alpaka::allocBuf<DataType, Idx>(devAcc, extent);
auto bufInputAccView = alpaka::createView(devAcc, bufInputAcc, extent);
alpaka::memcpy(queueAcc, bufInputAccView, bufInputHostView);
// Output buffer at device, allocated as a flat 1D buffer of matrixHeight * matrixWidth elements
alpaka::Vec<alpaka::DimInt<1u>, Idx> const extent1D(matrixHeight * matrixWidth);
auto outputDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, extent1D);

// NOTE(review): Vec repeats the alias declared at the top of main, and WorkDiv is not used in the visible code.
using Vec = alpaka::Vec<Dim, Idx>;
using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;

// Let alpaka calculate good block and grid sizes given our full problem extent.
// NOTE(review): the argument list of getValidWorkDiv is truncated here (only one argument is visible and the
// call is never closed).
alpaka::WorkDivMembers<Dim, Idx> const workDiv(alpaka::getValidWorkDiv<DevAcc>(
alpaka::Vec<Dim, Idx>::ones(),

// convolution filter, filterWidth x filterWidth coefficients stored row by row
std::vector<DataType> const filter = {0.11, 0.12, 0.13, 0.14, 0.15, 0.21, 0.22, 0.23, 0.24, 0.25, 0.31, 0.32, 0.33,
0.34, 0.35, 0.41, 0.42, 0.43, 0.44, 0.45, 0.51, 0.52, 0.53, 0.54, 0.55};
// NOTE(review): the second argument of createView (presumably the host filter data) is missing here.
auto const bufHostFilter = alpaka::createView(devHost,, Vec{filterWidth, filterWidth});

// View over static device memory that will hold the filter.
// NOTE(review): the leading arguments (the static memory symbol and presumably the device) are missing here,
// and no such symbol is declared in the visible part of the file.
auto viewConstantMemUninitialized = alpaka::createStaticDevMemView(
Vec{filterWidth, filterWidth});

// Copy the filter-matrix to the constant memory
alpaka::memcpy(queueAcc, viewConstantMemUninitialized, bufHostFilter);

// Construct kernel object, choose one of the kernels provided above
// ConvolutionKernel2DGlobalMemory and ConvolutionKernel2DSharedMemory
ConvolutionKernel2DSharedMemory convolutionKernel2D;

// Run the kernel
// NOTE(review): the statement that actually launches the kernel on queueAcc is missing here.

// Allocate memory on host
auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, extent1D);
// Copy from device memory to host
// NOTE(review): queueAcc is non-blocking and no wait on the queue is visible before the results are read
// below - confirm that the queue is synchronised before printing.
alpaka::memcpy(queueAcc, resultGpuHost, outputDeviceMemory, extent1D);

// Print results at the host
// NOTE(review): the opening of this call (presumably `printf(`) is missing here. The argument selects the
// kernel name to print from the static type of convolutionKernel2D.
"Convolution filter kernel: %s\n",
std::is_same<decltype(convolutionKernel2D), ConvolutionKernel2DGlobalMemory>::value
? "ConvolutionKernel2DGlobalMemory"
: "ConvolutionKernel2DSharedMemory");
printf("Matrix Size: %d x %d, Filter Size: %d x %d\n", matrixWidth, matrixHeight, filterWidth, filterWidth);
// Print every output element in row-major order with three significant digits
for(size_t i{0}; i < matrixWidth * matrixHeight; i++)
DataType const& val(alpaka::getPtrNative(resultGpuHost)[i]);
std::cout << "output[" << i << "]:" << std::setprecision(3) << val << std::endl;

0 comments on commit 8a7a920

Please sign in to comment.