Skip to content

Commit

Permalink
Convolution2D filter example using global and shared memory
Browse files Browse the repository at this point in the history
  • Loading branch information
Mehmet Yusufoglu committed Jan 24, 2024
1 parent b161b2f commit aa3efcc
Show file tree
Hide file tree
Showing 3 changed files with 295 additions and 0 deletions.
1 change: 1 addition & 0 deletions example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ project("alpakaExamples" LANGUAGES CXX)

add_subdirectory("bufferCopy/")
add_subdirectory("complex/")
add_subdirectory("convolution2D/")
add_subdirectory("counterBasedRng/")
add_subdirectory("heatEquation/")
add_subdirectory("helloWorld/")
Expand Down
47 changes: 47 additions & 0 deletions example/convolution2D/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#
# Copyright 2023 Erik Zenker, Benjamin Worpitz, Jan Stephan
# SPDX-License-Identifier: ISC
#

################################################################################
# Required CMake version.

cmake_minimum_required(VERSION 3.22)

set_property(GLOBAL PROPERTY USE_FOLDERS ON)

################################################################################
# Project.

set(_TARGET_NAME convolution2D)

project(${_TARGET_NAME} LANGUAGES CXX)

#-------------------------------------------------------------------------------
# Find alpaka.
# When this example is built as part of the alpaka source tree the alpaka target
# already exists; otherwise either pull alpaka in from the source tree
# (alpaka_USE_SOURCE_TREE=ON) or locate an installed alpaka package.

if(NOT TARGET alpaka::alpaka)
option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)

if(alpaka_USE_SOURCE_TREE)
# Don't build the examples recursively
set(alpaka_BUILD_EXAMPLES OFF)
add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
else()
find_package(alpaka REQUIRED)
endif()
endif()

#-------------------------------------------------------------------------------
# Add executable.
# alpaka_add_executable compiles the sources with the language/compiler settings
# required by the enabled accelerator back-ends.

alpaka_add_executable(
${_TARGET_NAME}
src/convolution2D.cpp)
target_link_libraries(
${_TARGET_NAME}
PUBLIC alpaka::alpaka)

set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)

# Register the example as a CTest test so it is run by the test suite.
add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
247 changes: 247 additions & 0 deletions example/convolution2D/src/convolution2D.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
/* Copyright 2023 Benjamin Worpitz, Erik Zenker, Bernhard Manfred Gruber, Jan Stephan
* SPDX-License-Identifier: ISC
*/

#include <alpaka/alpaka.hpp>
#include <alpaka/example/ExampleDefaultAcc.hpp>

#include <iomanip>
#include <iostream>
#include <numeric>
#include <vector>

//! Convolution Example
//!
//! A 2D convolutional filter example with padding.

// Filter radius; the filter is a square of side TILE_DIM = 2 * FILTER_RADIUS + 1.
static constexpr int32_t FILTER_RADIUS = 2;
static constexpr int32_t TILE_DIM = 2 * FILTER_RADIUS + 1;
static constexpr int32_t MATRIX_WIDTH = 10;
static constexpr int32_t MATRIX_HEIGHT = 10;


// This forward declaration of the constant-memory variable (in the GPU context) silences clang's
// -Wmissing-variable-declarations warning, which requires every non-static variable to be declared with extern
// before it is defined. Such a forward declaration is only necessary when the variable is accessed from a
// different compilation unit and should then be moved to a common header.
// The extent is TILE_DIM x TILE_DIM so that it stays consistent with the filter size derived from FILTER_RADIUS
// (previously hard-coded as 5, which silently duplicated that relationship).
extern ALPAKA_STATIC_ACC_MEM_CONSTANT float g_constantMemory2DUninitialized[TILE_DIM][TILE_DIM];
ALPAKA_STATIC_ACC_MEM_CONSTANT float g_constantMemory2DUninitialized[TILE_DIM][TILE_DIM];

// 2D Convolutional Filter using only global memory for matrix, and constant memory for the filter
// 2D convolutional filter reading the input matrix from global memory; the filter coefficients are passed as a
// pointer (in this example they reside in constant memory).
struct ConvolutionKernel2DGlobalMemory
{
    //! \param acc The accelerator the kernel is executed on.
    //! \param N Input matrix, row-major, MatrixHeight x MatrixWidth, in global memory.
    //! \param P Output matrix, row-major, same extent as N.
    //! \param MatrixWidth Number of columns of N and P.
    //! \param MatrixHeight Number of rows of N and P.
    //! \param filter Filter coefficients, row-major, TILE_DIM x TILE_DIM.
    template<typename TAcc, typename TElem>
    ALPAKA_FN_ACC auto operator()(
        TAcc const& acc,
        TElem const* const N,
        TElem* P,
        const std::size_t MatrixWidth,
        const std::size_t MatrixHeight,
        TElem const* const filter) const -> void
    {
        // Global thread index; alpaka orders the dimensions as (y, x).
        auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);

        // The center of the filter is positioned on the item pointed to by the thread index. An implicit
        // "zero padding" is used: filter items that fall outside the matrix are skipped, in other words the
        // corresponding matrix items are assumed to be zero.
        if(col < MatrixWidth && row < MatrixHeight)
        {
            auto Pvalue = 0.0f;
            for(int32_t fRow = 0; fRow < TILE_DIM; fRow++)
            {
                for(int32_t fCol = 0; fCol < TILE_DIM; fCol++)
                {
                    auto const exactRow = static_cast<int32_t>(row) - FILTER_RADIUS + fRow;
                    auto const exactCol = static_cast<int32_t>(col) - FILTER_RADIUS + fCol;
                    if(exactRow >= 0 && exactRow < static_cast<int32_t>(MatrixHeight) && exactCol >= 0
                       && exactCol < static_cast<int32_t>(MatrixWidth))
                    {
                        Pvalue += filter[fRow * TILE_DIM + fCol] * N[exactRow * MatrixWidth + exactCol];
                    }
                }
            }
            // Store the result once, after the whole filter has been applied. (Previously the partial sum was
            // written to global memory on every filter-row iteration — redundant global-memory traffic.)
            P[row * MatrixWidth + col] = Pvalue;
        }
    }
};

// 2D Convolutional Filter, uses tiling method. Tiles of matrix are kept in the shared memory. For the filter, the
// constant memory is used. Block dimensions are equal to tile dimensions.
// 2D convolutional filter using the tiling method: each block stages its tile of the matrix in shared memory and
// reads tile items from there; halo items outside the tile are read from global memory. The filter coefficients
// are passed as a pointer (in this example they reside in constant memory). Block dimensions are equal to tile
// dimensions.
struct ConvolutionKernel2DSharedMemory
{
    //! \param acc The accelerator the kernel is executed on.
    //! \param N Input matrix, row-major, MatrixHeight x MatrixWidth, in global memory.
    //! \param P Output matrix, row-major, same extent as N.
    //! \param MatrixWidth Number of columns of N and P.
    //! \param MatrixHeight Number of rows of N and P.
    //! \param filter Filter coefficients, row-major, TILE_DIM x TILE_DIM.
    template<typename TAcc, typename TElem>
    ALPAKA_FN_ACC auto operator()(
        TAcc const& acc,
        TElem const* const N,
        TElem* P,
        const std::size_t MatrixWidth,
        const std::size_t MatrixHeight,
        float const* const filter) const -> void
    {
        // Global and block-local thread indices; alpaka orders the dimensions as (y, x).
        auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
        auto const [blockThreadY, blockThreadX] = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
        auto const [blockThreadExtentY, blockThreadExtentX] = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);

        // Shared memory holding one tile of the input matrix.
        auto& N_s = alpaka::declareSharedVar<TElem[TILE_DIM * TILE_DIM], __COUNTER__>(acc);

        // Each thread loads its item into the shared-memory tile; positions outside the matrix are zero-filled.
        if(row < MatrixHeight && col < MatrixWidth)
        {
            N_s[blockThreadY * blockThreadExtentX + blockThreadX] = N[row * MatrixWidth + col];
        }
        else
        {
            N_s[blockThreadY * blockThreadExtentX + blockThreadX] = 0.0;
        }

        // Wait until every thread of the block has filled its part of the shared-memory tile.
        alpaka::syncBlockThreads(acc);

        if(col < MatrixWidth && row < MatrixHeight)
        {
            auto Pvalue{0.0f};
            for(int32_t fRow = 0; fRow < TILE_DIM; fRow++)
            {
                for(int32_t fCol = 0; fCol < TILE_DIM; fCol++)
                {
                    auto const exactRowBlock = static_cast<int32_t>(blockThreadY) - FILTER_RADIUS + fRow;
                    auto const exactColBlock = static_cast<int32_t>(blockThreadX) - FILTER_RADIUS + fCol;
                    if(exactColBlock >= 0 && exactColBlock < TILE_DIM && exactRowBlock >= 0
                       && exactRowBlock < TILE_DIM)
                    {
                        // The item lies inside this block's tile: read it from shared memory.
                        Pvalue += filter[fRow * TILE_DIM + fCol]
                                  * N_s[exactRowBlock * blockThreadExtentX + exactColBlock];
                    }
                    else
                    {
                        // Halo item outside the tile: fall back to global memory (with zero padding at the
                        // matrix borders).
                        auto const exactRow = static_cast<int32_t>(row) - FILTER_RADIUS + fRow;
                        auto const exactCol = static_cast<int32_t>(col) - FILTER_RADIUS + fCol;
                        if(exactRow >= 0 && exactRow < static_cast<int32_t>(MatrixHeight) && exactCol >= 0
                           && exactCol < static_cast<int32_t>(MatrixWidth))
                        {
                            Pvalue += filter[fRow * TILE_DIM + fCol] * N[exactRow * MatrixWidth + exactCol];
                        }
                    }
                }
            }
            // Store the result once, after the whole filter has been applied. (Previously the partial sum was
            // written to global memory on every filter-row iteration — redundant global-memory traffic.)
            P[row * MatrixWidth + col] = Pvalue;
        }
    }
};

auto main() -> int
{
    // Define the index domain: a 2-dimensional grid of threads.
    using Dim = alpaka::DimInt<2u>;
    // Index type
    using Idx = std::uint32_t;
    using Vec = alpaka::Vec<Dim, Idx>;
    // Define the accelerator and a non-blocking queue on it.
    using DevAcc = alpaka::ExampleDefaultAcc<Dim, Idx>;
    using QueueProperty = alpaka::NonBlocking;
    using QueueAcc = alpaka::Queue<DevAcc, QueueProperty>;
    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;

    using DataType = float;
    static_assert(
        alpaka::Dim<DevAcc>::value == 2u,
        "The accelerator used for the AlpakaKernel has to be 2 dimensional!");
    // The tiling scheme assumes the matrix decomposes into whole tiles.
    static_assert(
        MATRIX_WIDTH % TILE_DIM == 0 && MATRIX_HEIGHT % TILE_DIM == 0,
        "Matrix dimensions are not multiples of Tile dimensions");

    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<DevAcc>() << std::endl;

    auto const devHost = alpaka::getDevByIdx(alpaka::PlatformCpu{}, 0);
    // Select a device from the accelerator
    auto const platformAcc = alpaka::Platform<DevAcc>{};
    auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);

    // Create a queue on the device
    QueueAcc queueAcc(devAcc);
    // Define extent (dimensions).
    // NOTE(review): alpaka vectors order dimensions as (y, x); width and height are equal here so the ordering is
    // immaterial, but it should be confirmed before the two constants are allowed to diverge.
    Vec const extent(static_cast<Idx>(MATRIX_WIDTH), static_cast<Idx>(MATRIX_HEIGHT));
    // Kernel input: the matrix filled with 1, 2, 3, ...
    std::vector<DataType> bufInputHost(MATRIX_HEIGHT * MATRIX_WIDTH);
    std::iota(bufInputHost.begin(), bufInputHost.end(), 1.0f);
    auto bufInputHostView = alpaka::createView(devHost, bufInputHost, extent);

    // Input buffer at device
    auto bufInputAcc = alpaka::allocBuf<DataType, Idx>(devAcc, extent);
    auto bufInputAccView = alpaka::createView(devAcc, bufInputAcc, extent);
    alpaka::memcpy(queueAcc, bufInputAccView, bufInputHostView);
    alpaka::wait(queueAcc);
    // Output buffer in device
    alpaka::Vec<alpaka::DimInt<1u>, Idx> const extent1D(MATRIX_HEIGHT * MATRIX_WIDTH);
    auto outputDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, extent1D);

    // The matrix is tiled. Each block of threads uses one tile, tile dimensions equal to block dims.
    auto blocksPerGrid = alpaka::Vec<Dim, Idx>{MATRIX_WIDTH / TILE_DIM, MATRIX_HEIGHT / TILE_DIM};
    auto const threadsPerBlock = alpaka::Vec<Dim, Idx>{TILE_DIM, TILE_DIM};
    // One element per thread in each dimension. (Previously this was {1u, 0u}; a zero extent per thread is an
    // invalid work division.)
    auto const elementsPerThread = alpaka::Vec<Dim, Idx>{1u, 1u};
    auto workDiv = WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread};

    // Convolution filter, TILE_DIM x TILE_DIM, row-major.
    std::vector<DataType> filter = {0.11, 0.12, 0.13, 0.14, 0.15, 0.21, 0.22, 0.23, 0.24, 0.25, 0.31, 0.32, 0.33,
                                    0.34, 0.35, 0.41, 0.42, 0.43, 0.44, 0.45, 0.51, 0.52, 0.53, 0.54, 0.55};
    auto bufHostFilter = alpaka::createView(devHost, filter.data(), Vec{TILE_DIM, TILE_DIM});
    // Use constant memory in device for the convolution filter
    auto viewConstantMemUninitialized
        = alpaka::createStaticDevMemView(&g_constantMemory2DUninitialized[0u][0u], devAcc, Vec{TILE_DIM, TILE_DIM});
    // Copy filter matrix to the constant memory
    alpaka::memcpy(queueAcc, viewConstantMemUninitialized, bufHostFilter);
    alpaka::wait(queueAcc);

    // Construct kernel object
    ConvolutionKernel2DSharedMemory convolutionKernel2D;

    // Run the kernel
    auto const taskKernel = alpaka::createTaskKernel<DevAcc>(
        workDiv,
        convolutionKernel2D,
        alpaka::getPtrNative(bufInputAccView),
        alpaka::getPtrNative(outputDeviceMemory),
        MATRIX_WIDTH,
        MATRIX_HEIGHT,
        alpaka::getPtrNative(viewConstantMemUninitialized));

    alpaka::enqueue(queueAcc, taskKernel);
    alpaka::wait(queueAcc);
    // Allocate memory on host to receive the results
    auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, extent1D);
    // Copy from device memory to host
    alpaka::memcpy(queueAcc, resultGpuHost, outputDeviceMemory, extent1D);
    // The queue is NonBlocking: wait for the copy to finish before reading the results on the host.
    // (Previously missing — the host could read the buffer while the asynchronous copy was still in flight.)
    alpaka::wait(queueAcc);

    // Print results at the host
    printf(
        "Convolution filter kernel: %s\n",
        std::is_same<decltype(convolutionKernel2D), ConvolutionKernel2DGlobalMemory>::value
            ? "ConvolutionKernel2DGlobalMemory"
            : "ConvolutionKernel2DSharedMemory");
    printf(
        "Matrix Size: %d x %d, Filter Size: %d x %d, Tile Size: %d x %d\n",
        MATRIX_WIDTH,
        MATRIX_HEIGHT,
        TILE_DIM,
        TILE_DIM,
        TILE_DIM,
        TILE_DIM);
    for(size_t i{0}; i < MATRIX_WIDTH * MATRIX_HEIGHT; i++)
    {
        DataType const& val(alpaka::getPtrNative(resultGpuHost)[i]);
        std::cout << "output[" << i << "]:" << std::setprecision(3) << val << std::endl;
    }
    return EXIT_SUCCESS;
}

0 comments on commit aa3efcc

Please sign in to comment.