Convolution2D filter example using global and shared memory
Mehmet Yusufoglu committed on Jan 24, 2024
1 parent: b161b2f, commit: aa3efcc
Showing 3 changed files with 295 additions and 0 deletions.
CMakeLists.txt (new file, 47 lines):

```cmake
#
# Copyright 2023 Erik Zenker, Benjamin Worpitz, Jan Stephan
# SPDX-License-Identifier: ISC
#

################################################################################
# Required CMake version.

cmake_minimum_required(VERSION 3.22)

set_property(GLOBAL PROPERTY USE_FOLDERS ON)

################################################################################
# Project.

set(_TARGET_NAME convolution2D)

project(${_TARGET_NAME} LANGUAGES CXX)

#-------------------------------------------------------------------------------
# Find alpaka.

if(NOT TARGET alpaka::alpaka)
    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)

    if(alpaka_USE_SOURCE_TREE)
        # Don't build the examples recursively.
        set(alpaka_BUILD_EXAMPLES OFF)
        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
    else()
        find_package(alpaka REQUIRED)
    endif()
endif()

#-------------------------------------------------------------------------------
# Add executable.

alpaka_add_executable(
    ${_TARGET_NAME}
    src/convolution2D.cpp)
target_link_libraries(
    ${_TARGET_NAME}
    PUBLIC alpaka::alpaka)

set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)

add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
```
src/convolution2D.cpp (new file, 247 lines):

```c++
/* Copyright 2023 Benjamin Worpitz, Erik Zenker, Bernhard Manfred Gruber, Jan Stephan
 * SPDX-License-Identifier: ISC
 */

#include <alpaka/alpaka.hpp>
#include <alpaka/example/ExampleDefaultAcc.hpp>

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <vector>

//! Convolution Example
//!
//! A 2D convolutional filter example with implicit zero padding.

static constexpr int32_t FILTER_RADIUS = 2;
static constexpr int32_t TILE_DIM = 2 * FILTER_RADIUS + 1;
static constexpr int32_t MATRIX_WIDTH = 10;
static constexpr int32_t MATRIX_HEIGHT = 10;
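
// For reference: with filter radius R = FILTER_RADIUS, the filter has
// (2R + 1) x (2R + 1) elements and each output element is computed as
//
//     P[row][col] = sum_{i=0..2R} sum_{j=0..2R}
//                       filter[i][j] * N[row - R + i][col - R + j],
//
// where elements of N outside the matrix are treated as zero (implicit zero
// padding). Both kernels below implement exactly this and differ only in
// where they read N from. (Strictly speaking this is cross-correlation,
// since the filter is applied without being flipped.)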

// These declarations of constant memory (in the GPU context) silence clang's
// -Wmissing-variable-declarations warning, which requires every non-static
// variable to be declared with extern before it is defined. The forward
// declaration is only necessary when the variable is accessed from a different
// compilation unit, in which case it should be moved to a common header.
extern ALPAKA_STATIC_ACC_MEM_CONSTANT float g_constantMemory2DUninitialized[TILE_DIM][TILE_DIM];
ALPAKA_STATIC_ACC_MEM_CONSTANT float g_constantMemory2DUninitialized[TILE_DIM][TILE_DIM];

// 2D convolutional filter using global memory for the matrix and constant
// memory for the filter.
struct ConvolutionKernel2DGlobalMemory
{
    template<typename TAcc, typename TElem>
    ALPAKA_FN_ACC auto operator()(
        TAcc const& acc,
        TElem const* const N,
        TElem* P,
        const std::size_t MatrixWidth,
        const std::size_t MatrixHeight,
        TElem const* const filter) const -> void
    {
        auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);

        // The center of the filter is positioned over the element selected by
        // the thread index. Implicit "zero padding" is used: filter elements
        // that fall outside the matrix are skipped, i.e. the corresponding
        // matrix elements are treated as zero.
        if(col < MatrixWidth && row < MatrixHeight)
        {
            auto Pvalue = 0.0f;
            for(int32_t fRow = 0; fRow < TILE_DIM; fRow++)
            {
                for(int32_t fCol = 0; fCol < TILE_DIM; fCol++)
                {
                    auto const exactRow = static_cast<int32_t>(row) - FILTER_RADIUS + fRow;
                    auto const exactCol = static_cast<int32_t>(col) - FILTER_RADIUS + fCol;
                    if(exactRow >= 0 && exactRow < MatrixHeight && exactCol >= 0 && exactCol < MatrixWidth)
                    {
                        Pvalue += filter[fRow * TILE_DIM + fCol] * N[exactRow * MatrixWidth + exactCol];
                    }
                }
            }
            // Write the result once, after the whole filter has been applied.
            P[row * MatrixWidth + col] = Pvalue;
        }
    }
};

// 2D convolutional filter using the tiling method: tiles of the matrix are
// kept in shared memory, and the filter is read from constant memory. Block
// dimensions are equal to the tile dimensions.
struct ConvolutionKernel2DSharedMemory
{
    template<typename TAcc, typename TElem>
    ALPAKA_FN_ACC auto operator()(
        TAcc const& acc,
        TElem const* const N,
        TElem* P,
        const std::size_t MatrixWidth,
        const std::size_t MatrixHeight,
        float const* const filter) const -> void
    {
        auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
        auto const [blockThreadY, blockThreadX] = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
        auto const [blockThreadExtentY, blockThreadExtentX] = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);

        // Allocate shared memory for one matrix tile.
        auto& N_s = alpaka::declareSharedVar<TElem[TILE_DIM * TILE_DIM], __COUNTER__>(acc);
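        // The second template argument of declareSharedVar must be a unique
        // compile-time id per shared variable; the __COUNTER__ macro produces
        // a fresh one at each expansion.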

        // Fill the shared-memory tile so that tile elements are later read
        // from shared memory instead of global memory.
        if(row < MatrixHeight && col < MatrixWidth)
        {
            N_s[blockThreadY * blockThreadExtentX + blockThreadX] = N[row * MatrixWidth + col];
        }
        else
        {
            N_s[blockThreadY * blockThreadExtentX + blockThreadX] = 0.0f;
        }

        // Wait until every thread in the block has filled its element of the tile.
        alpaka::syncBlockThreads(acc);

        if(col < MatrixWidth && row < MatrixHeight)
        {
            auto Pvalue{0.0f};
            for(int32_t fRow = 0; fRow < TILE_DIM; fRow++)
            {
                for(int32_t fCol = 0; fCol < TILE_DIM; fCol++)
                {
                    auto const exactRowBlock = static_cast<int32_t>(blockThreadY) - FILTER_RADIUS + fRow;
                    auto const exactColBlock = static_cast<int32_t>(blockThreadX) - FILTER_RADIUS + fCol;
                    if(exactColBlock >= 0 && exactColBlock < TILE_DIM && exactRowBlock >= 0
                       && exactRowBlock < TILE_DIM)
                    {
                        // The element lies inside the tile: read it from shared memory.
                        Pvalue += filter[fRow * TILE_DIM + fCol]
                                  * N_s[exactRowBlock * blockThreadExtentX + exactColBlock];
                    }
                    else
                    {
                        auto const exactRow = static_cast<int32_t>(row) - FILTER_RADIUS + fRow;
                        auto const exactCol = static_cast<int32_t>(col) - FILTER_RADIUS + fCol;
                        if(exactRow >= 0 && exactRow < MatrixHeight && exactCol >= 0 && exactCol < MatrixWidth)
                        {
                            // The element lies outside the tile: read it from global memory.
                            Pvalue += filter[fRow * TILE_DIM + fCol] * N[exactRow * MatrixWidth + exactCol];
                        }
                    }
                }
            }
            // Write the result once, after the whole filter has been applied.
            P[row * MatrixWidth + col] = Pvalue;
        }
    }
};
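
// Design note: only the TILE_DIM x TILE_DIM tile itself is staged in shared
// memory, without halo cells. Filter taps that fall outside the tile fall back
// to global-memory reads (the else branch above). Staging a larger
// (TILE_DIM + 2 * FILTER_RADIUS)^2 region including the halo would avoid those
// reads, at the cost of more shared memory and a more involved load phase.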

auto main() -> int
{
    // Define the index domain.
    using Dim = alpaka::DimInt<2u>;
    // Index type
    using Idx = std::uint32_t;
    using Vec = alpaka::Vec<Dim, Idx>;
    // Define the accelerator.
    using DevAcc = alpaka::ExampleDefaultAcc<Dim, Idx>;
    using QueueProperty = alpaka::NonBlocking;
    using QueueAcc = alpaka::Queue<DevAcc, QueueProperty>;

    using DataType = float;
    static_assert(
        alpaka::Dim<DevAcc>::value == 2u,
        "The accelerator used for this example has to be 2-dimensional!");
    static_assert(
        MATRIX_WIDTH % TILE_DIM == 0 && MATRIX_HEIGHT % TILE_DIM == 0,
        "Matrix dimensions must be multiples of the tile dimensions!");

    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<DevAcc>() << std::endl;

    auto const devHost = alpaka::getDevByIdx(alpaka::PlatformCpu{}, 0);
    // Select a device from the accelerator platform.
    auto const platformAcc = alpaka::Platform<DevAcc>{};
    auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);

    // Create a queue on the device.
    QueueAcc queueAcc(devAcc);
    // Define the extent (dimensions).
    Vec const extent(static_cast<Idx>(MATRIX_WIDTH), static_cast<Idx>(MATRIX_HEIGHT));
    // Kernel input.
    std::vector<DataType> bufInputHost(MATRIX_HEIGHT * MATRIX_WIDTH);
    std::iota(bufInputHost.begin(), bufInputHost.end(), 1.0f);
    auto bufInputHostView = alpaka::createView(devHost, bufInputHost, extent);

    // Input buffer on the device.
    auto bufInputAcc = alpaka::allocBuf<DataType, Idx>(devAcc, extent);
    auto bufInputAccView = alpaka::createView(devAcc, bufInputAcc, extent);
    alpaka::memcpy(queueAcc, bufInputAccView, bufInputHostView);
    alpaka::wait(queueAcc);
    // Output buffer on the device.
    alpaka::Vec<alpaka::DimInt<1u>, Idx> const extent1D(MATRIX_HEIGHT * MATRIX_WIDTH);
    auto outputDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, extent1D);

    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;

    // The matrix is tiled. Each block of threads processes one tile; the tile
    // dimensions equal the block dimensions.
    auto const blocksPerGrid = alpaka::Vec<Dim, Idx>{MATRIX_WIDTH / TILE_DIM, MATRIX_HEIGHT / TILE_DIM};
    auto const threadsPerBlock = alpaka::Vec<Dim, Idx>{TILE_DIM, TILE_DIM};
    auto const elementsPerThread = alpaka::Vec<Dim, Idx>{1u, 1u};
    auto workDiv = WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread};
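    // With the constants above this yields a 2 x 2 grid of 5 x 5 thread
    // blocks: 100 threads in total, one per element of the 10 x 10 matrix.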

    // Convolution filter (TILE_DIM x TILE_DIM), flattened row by row.
    std::vector<DataType> filter = {0.11, 0.12, 0.13, 0.14, 0.15, 0.21, 0.22, 0.23, 0.24, 0.25, 0.31, 0.32, 0.33,
                                    0.34, 0.35, 0.41, 0.42, 0.43, 0.44, 0.45, 0.51, 0.52, 0.53, 0.54, 0.55};
    auto bufHostFilter = alpaka::createView(devHost, filter.data(), Vec{TILE_DIM, TILE_DIM});
    // Use constant memory on the device for the convolution filter.
    auto viewConstantMemUninitialized
        = alpaka::createStaticDevMemView(&g_constantMemory2DUninitialized[0u][0u], devAcc, Vec{TILE_DIM, TILE_DIM});
    // Copy the filter matrix to constant memory.
    alpaka::memcpy(queueAcc, viewConstantMemUninitialized, bufHostFilter);
    alpaka::wait(queueAcc);

    // Construct the kernel object.
    ConvolutionKernel2DSharedMemory convolutionKernel2D;

    // Run the kernel.
    auto const taskKernel = alpaka::createTaskKernel<DevAcc>(
        workDiv,
        convolutionKernel2D,
        alpaka::getPtrNative(bufInputAccView),
        alpaka::getPtrNative(outputDeviceMemory),
        MATRIX_WIDTH,
        MATRIX_HEIGHT,
        alpaka::getPtrNative(viewConstantMemUninitialized));

    alpaka::enqueue(queueAcc, taskKernel);
    alpaka::wait(queueAcc);
    // Allocate memory on the host.
    auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, extent1D);
    // Copy the result from device memory to the host. The queue is
    // non-blocking, so wait for the copy before reading the buffer.
    alpaka::memcpy(queueAcc, resultGpuHost, outputDeviceMemory, extent1D);
    alpaka::wait(queueAcc);

    // Print the results on the host.
    printf(
        "Convolution filter kernel: %s\n",
        std::is_same<decltype(convolutionKernel2D), ConvolutionKernel2DGlobalMemory>::value
            ? "ConvolutionKernel2DGlobalMemory"
            : "ConvolutionKernel2DSharedMemory");
    printf(
        "Matrix Size: %d x %d, Filter Size: %d x %d, Tile Size: %d x %d\n",
        MATRIX_WIDTH,
        MATRIX_HEIGHT,
        TILE_DIM,
        TILE_DIM,
        TILE_DIM,
        TILE_DIM);
    for(size_t i{0}; i < MATRIX_WIDTH * MATRIX_HEIGHT; i++)
    {
        DataType const& val(alpaka::getPtrNative(resultGpuHost)[i]);
        std::cout << "output[" << i << "]:" << std::setprecision(3) << val << std::endl;
    }
    return EXIT_SUCCESS;
}
```
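
To sanity-check either kernel, the device output can be compared against a plain host-side reference convolution. The sketch below is not part of the commit; it mirrors the zero-padding logic of the kernels above, and the helper name `hostReferenceConvolution` is purely illustrative.

```c++
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical host-side reference: straightforward 2D convolution with
// implicit zero padding, computing the same result as the kernels above.
std::vector<float> hostReferenceConvolution(
    std::vector<float> const& input,
    std::vector<float> const& filter,
    std::size_t width,
    std::size_t height,
    int32_t radius)
{
    int32_t const dim = 2 * radius + 1; // filter edge length
    std::vector<float> output(width * height, 0.0f);
    for(std::size_t row = 0; row < height; ++row)
    {
        for(std::size_t col = 0; col < width; ++col)
        {
            float value = 0.0f;
            for(int32_t i = 0; i < dim; ++i)
            {
                for(int32_t j = 0; j < dim; ++j)
                {
                    // Matrix coordinates covered by this filter tap.
                    auto const r = static_cast<int64_t>(row) - radius + i;
                    auto const c = static_cast<int64_t>(col) - radius + j;
                    // Skip taps outside the matrix (implicit zero padding).
                    if(r >= 0 && r < static_cast<int64_t>(height) && c >= 0 && c < static_cast<int64_t>(width))
                        value += filter[i * dim + j] * input[r * width + c];
                }
            }
            output[row * width + col] = value;
        }
    }
    return output;
}
```

Comparing each element of this reference against `resultGpuHost` (within a small epsilon, since floating-point accumulation order differs between host and device) catches indexing mistakes in either kernel.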