-
Notifications
You must be signed in to change notification settings - Fork 75
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Convolution2D filter example using global and shared memory
- Loading branch information
Mehmet Yusufoglu
committed
Jan 26, 2024
1 parent
15a56e9
commit 8a7a920
Showing
3 changed files
with
349 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# | ||
# Copyright 2023 Erik Zenker, Benjamin Worpitz, Jan Stephan | ||
# SPDX-License-Identifier: ISC | ||
# | ||
|
||
################################################################################
# CMake version this example needs in order to configure.

cmake_minimum_required(VERSION 3.22)

# Turn on the global USE_FOLDERS property; the target's FOLDER is set further down.
set_property(GLOBAL PROPERTY USE_FOLDERS ON)

################################################################################
# Project definition. The target name is reused as the project name.

set(_TARGET_NAME convolution2D)
project(${_TARGET_NAME} LANGUAGES CXX)

#-------------------------------------------------------------------------------
# Locate alpaka, unless an enclosing build already provides alpaka::alpaka.

if(NOT TARGET alpaka::alpaka)
    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)

    if(NOT alpaka_USE_SOURCE_TREE)
        # Use an installed alpaka package.
        find_package(alpaka REQUIRED)
    else()
        # Build alpaka from the source tree two directories up, without
        # descending into the examples again.
        set(alpaka_BUILD_EXAMPLES OFF)
        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
    endif()
endif()

#-------------------------------------------------------------------------------
# Executable target, link dependency, IDE folder and test registration.

alpaka_add_executable(${_TARGET_NAME} src/convolution2D.cpp)
target_link_libraries(${_TARGET_NAME} PUBLIC alpaka::alpaka)
set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)

# Register the executable itself as a test of the same name.
add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,301 @@ | ||
/* Copyright 2023 Benjamin Worpitz, Erik Zenker, Bernhard Manfred Gruber, Jan Stephan | ||
* SPDX-License-Identifier: ISC | ||
*/ | ||
|
||
#include <alpaka/alpaka.hpp> | ||
#include <alpaka/example/ExampleDefaultAcc.hpp> | ||
|
||
#include <iomanip> | ||
#include <iostream> | ||
#include <numeric> | ||
#include <vector> | ||
|
||
//! Convolution Example
//!
//! A 2D convolutional filter applied to a matrix. The values of the filter-matrix are kept in constant memory in
//! order to increase performance. Two kernels are provided:
//! - ConvolutionKernel2DGlobalMemory reads the matrix from global memory only, without tiling.
//! - ConvolutionKernel2DSharedMemory uses tiling. The block size is assumed to be equal to the tile size. Each block
//!   first copies its tile into shared memory, because an element of a tile is accessed many times, and then works
//!   on the domain of that one tile. Matrix values needed at the border of a tile that belong to a neighbouring
//!   tile are taken from global memory.
|
||
// Side length of the square filter-matrix. Sizes the array below and is used by main() as filterWidth.
#define FILTER_WIDTH 5 | ||
// Storage for the FILTER_WIDTH x FILTER_WIDTH filter coefficients, declared with ALPAKA_STATIC_ACC_MEM_CONSTANT.
// It has no initializer here; main() copies the filter values into it with alpaka::memcpy before the kernel runs.
ALPAKA_STATIC_ACC_MEM_CONSTANT float g_constantMemory2DUninitialized[FILTER_WIDTH][FILTER_WIDTH]; | ||
|
||
/** | ||
* @brief 2D Convolutional Filter using only global memory for matrix, and constant memory for the filter | ||
*/ | ||
struct ConvolutionKernel2DGlobalMemory | ||
{ | ||
/** | ||
@tparam TAcc Accelerator type | ||
@tparam TElem The matrix and filter-matrix element type * | ||
@param acc Accelerator | ||
@param input Input matrix | ||
@param output Output matrix | ||
@param matrixWidth Input matrix width | ||
@param matrixHeight Input matrix height | ||
@param filter Filter-matrix | ||
@param filter Matrix width | ||
*/ | ||
template<typename TAcc, typename TElem> | ||
ALPAKA_FN_ACC auto operator()( | ||
TAcc const& acc, | ||
TElem const* const input, | ||
TElem* output, | ||
std::size_t const matrixWidth, | ||
std::size_t const matrixHeight, | ||
TElem const* const filter, | ||
int32_t const filterWidth) const -> void | ||
{ | ||
auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc); | ||
auto const [blockThreadY, blockThreadX] = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc); | ||
|
||
// The convolutional filter-matrix applied to the input matrix. The center of filter is positioned to the item | ||
// pointed by the thread index. An implicit "zero padding" is used. If some of the items of the filter are | ||
// outside the matrix, those are not taken into calculation in other words the corresponding items of the | ||
// matrix are assumed zero. | ||
if(col < matrixWidth && row < matrixHeight) | ||
{ | ||
auto pValue{0.0f}; | ||
for(int32_t fRow = 0; fRow < filterWidth; fRow++) | ||
{ | ||
for(int32_t fCol = 0; fCol < filterWidth; fCol++) | ||
{ | ||
// Position of input matrix element to be multiplied with the corresponding element at filter | ||
auto const exactRow = static_cast<int32_t>(row) - filterWidth / 2 + fRow; | ||
auto const exactCol = static_cast<int32_t>(col) - filterWidth / 2 + fCol; | ||
if(exactRow >= 0 && exactRow < matrixHeight && exactCol >= 0 && exactCol < matrixWidth) | ||
{ | ||
pValue += filter[fRow * filterWidth + fCol] * input[exactRow * matrixWidth + exactCol]; | ||
} | ||
} | ||
output[row * matrixWidth + col] = pValue; | ||
} | ||
} | ||
} | ||
}; | ||
|
||
/** | ||
* @brief The ConvolutionKernel2DSharedMemory struct. The operator() is a kernel for 2D Convolutional Filter, uses | ||
tiling method. Tiles of matrix are kept in the shared memory. For the filter, the constant memory is used. Block | ||
dimensions are equal to tile dimensions. | ||
*/ | ||
struct ConvolutionKernel2DSharedMemory | ||
{ | ||
/** | ||
@tparam TAcc Accelerator type | ||
@tparam TElem The matrix and filter-matrix element type * | ||
@param acc Accelerator | ||
@param input Input matrix | ||
@param output Output matrix | ||
@param matrixWidth Input matrix width | ||
@param matrixHeight Input matrix height | ||
@param filter Filter-matrix | ||
@param filter Matrix width | ||
*/ | ||
template<typename TAcc, typename TElem> | ||
ALPAKA_FN_ACC auto operator()( | ||
TAcc const& acc, | ||
TElem const* const input, | ||
TElem* output, | ||
std::size_t const matrixWidth, | ||
std::size_t const matrixHeight, | ||
float const* const filter, | ||
int32_t const filterWidth) const -> void | ||
{ | ||
auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc); | ||
// Get extents(dimensions) | ||
auto const gridBlockExtent(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)); | ||
auto const blockThreadExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc); | ||
// Get indexes | ||
auto const blockThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc); | ||
auto const blockThreadIdx1D = alpaka::mapIdx<1u>(blockThreadIdx, blockThreadExtent)[0u]; | ||
|
||
auto const [blockThreadExtentY, blockThreadExtentX] = blockThreadExtent; | ||
auto const [blockThreadY, blockThreadX] = blockThreadIdx; | ||
// Allocate shared memory | ||
auto* const sharedN = alpaka::getDynSharedMem<TElem>(acc); | ||
// Fill shared memory of device so that tile items are accessed from shared memory | ||
if(row < matrixHeight && col < matrixWidth) | ||
{ | ||
sharedN[blockThreadIdx1D] = input[row * matrixWidth + col]; | ||
} | ||
else | ||
{ | ||
sharedN[blockThreadIdx1D] = 0.0; | ||
} | ||
|
||
// Wait for the block fills the shared memory with the tile of the main matrix | ||
alpaka::syncBlockThreads(acc); | ||
|
||
if(col < matrixWidth && row < matrixHeight) | ||
{ | ||
auto pValue{0.0f}; | ||
for(int32_t fRow = 0; fRow < filterWidth; fRow++) | ||
{ | ||
for(int32_t fCol = 0; fCol < filterWidth; fCol++) | ||
{ | ||
// Position of input matrix element to be multiplied with the corresponding element at the filter. | ||
// The position is with respect to tile(block) | ||
auto const exactRowBlock = static_cast<int32_t>(blockThreadY) - filterWidth / 2 + fRow; | ||
auto const exactColBlock = static_cast<int32_t>(blockThreadX) - filterWidth / 2 + fCol; | ||
if(exactColBlock >= 0 && exactColBlock < gridBlockExtent[1] && exactRowBlock >= 0 | ||
&& exactRowBlock < gridBlockExtent[0]) | ||
{ | ||
// The element is inside the tile. Get the element from the shared memory | ||
pValue += filter[fRow * filterWidth + fCol] | ||
* sharedN[exactRowBlock * blockThreadExtentX + exactColBlock]; | ||
} | ||
else | ||
{ // The element is not in the tile(block) | ||
// Position of input matrix element to be multiplied with the corresponding element at the | ||
// filter. The position is with respect to the input matrix | ||
auto const exactRow = static_cast<int32_t>(row) - filterWidth / 2 + fRow; | ||
auto const exactCol = static_cast<int32_t>(col) - filterWidth / 2 + fCol; | ||
if(exactRow >= 0 && exactRow < matrixHeight && exactCol >= 0 && exactCol < matrixWidth) | ||
{ | ||
// get the item from the global memory | ||
pValue += filter[fRow * filterWidth + fCol] * input[exactRow * matrixWidth + exactCol]; | ||
} | ||
} | ||
} | ||
output[row * matrixWidth + col] = pValue; | ||
} | ||
} // if | ||
} | ||
}; | ||
|
||
namespace alpaka::trait | ||
{ | ||
//! The trait for getting the size of the block shared dynamic memory for a kernel. | ||
template<typename TAcc> | ||
struct BlockSharedMemDynSizeBytes<ConvolutionKernel2DSharedMemory, TAcc> | ||
{ | ||
//! \return The size of the shared memory allocated for a block. | ||
template<typename TVec, typename TElem> | ||
ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes( | ||
ConvolutionKernel2DSharedMemory const& /* matMulKernel */, | ||
TVec const& blockThreadExtent, | ||
TVec const& threadElemExtent, | ||
TElem const* const, // input Matrix | ||
TElem*, // output array | ||
std::size_t const, // matrixWidth | ||
std::size_t const, // matrixHeight | ||
float const* const, // filter | ||
const int32_t) // filter size | ||
{ | ||
// Reserve the buffer for the two blocks of A and B. | ||
return static_cast<std::size_t>(2u * blockThreadExtent.prod() * threadElemExtent.prod()) * sizeof(TElem); | ||
} | ||
}; | ||
} // namespace alpaka::trait | ||
|
||
auto main() -> int | ||
{ | ||
// Define the index domain | ||
using Dim = alpaka::DimInt<2u>; | ||
// Index type | ||
using Idx = std::uint32_t; | ||
using Vec = alpaka::Vec<Dim, Idx>; | ||
// Define the accelerator | ||
using DevAcc = alpaka::ExampleDefaultAcc<Dim, Idx>; | ||
using QueueProperty = alpaka::NonBlocking; | ||
using QueueAcc = alpaka::Queue<DevAcc, QueueProperty>; | ||
|
||
using DataType = float; | ||
static constexpr int32_t filterWidth = FILTER_WIDTH; | ||
static constexpr int32_t matrixWidth = 10; | ||
static constexpr int32_t matrixHeight = 10; | ||
|
||
static_assert( | ||
alpaka::Dim<DevAcc>::value == 2u, | ||
"The accelerator used for the Alpaka Kernel has to be 2 dimensional!"); | ||
|
||
std::cout << "Using alpaka accelerator: " << alpaka::getAccName<DevAcc>() << std::endl; | ||
|
||
auto const devHost = alpaka::getDevByIdx(alpaka::PlatformCpu{}, 0); | ||
// Select a device from the accelerator | ||
auto const platformAcc = alpaka::Platform<DevAcc>{}; | ||
auto const devAcc = alpaka::getDevByIdx(platformAcc, 0); | ||
|
||
// Create a queue on the device | ||
QueueAcc queueAcc(devAcc); | ||
// Define extent (dimensions) | ||
Vec const extent(static_cast<Idx>(matrixWidth), static_cast<Idx>(matrixHeight)); | ||
// Kernel Input | ||
std::vector<DataType> bufInputHost(matrixHeight * matrixWidth); | ||
// Use increasing values as input | ||
std::iota(bufInputHost.begin(), bufInputHost.end(), 1.0f); | ||
auto bufInputHostView = alpaka::createView(devHost, bufInputHost, extent); | ||
|
||
// Input buffer at device | ||
auto bufInputAcc = alpaka::allocBuf<DataType, Idx>(devAcc, extent); | ||
auto bufInputAccView = alpaka::createView(devAcc, bufInputAcc, extent); | ||
alpaka::memcpy(queueAcc, bufInputAccView, bufInputHostView); | ||
alpaka::wait(queueAcc); | ||
// Output buffer at device | ||
alpaka::Vec<alpaka::DimInt<1u>, Idx> const extent1D(matrixHeight * matrixWidth); | ||
auto outputDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, extent1D); | ||
|
||
using Vec = alpaka::Vec<Dim, Idx>; | ||
using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>; | ||
|
||
// Let alpaka calculate good block and grid sizes given our full problem extent. | ||
alpaka::WorkDivMembers<Dim, Idx> const workDiv(alpaka::getValidWorkDiv<DevAcc>( | ||
devAcc, | ||
extent, | ||
alpaka::Vec<Dim, Idx>::ones(), | ||
false, | ||
alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)); | ||
|
||
// convolution filter | ||
std::vector<DataType> const filter = {0.11, 0.12, 0.13, 0.14, 0.15, 0.21, 0.22, 0.23, 0.24, 0.25, 0.31, 0.32, 0.33, | ||
0.34, 0.35, 0.41, 0.42, 0.43, 0.44, 0.45, 0.51, 0.52, 0.53, 0.54, 0.55}; | ||
auto const bufHostFilter = alpaka::createView(devHost, filter.data(), Vec{filterWidth, filterWidth}); | ||
|
||
auto viewConstantMemUninitialized = alpaka::createStaticDevMemView( | ||
&g_constantMemory2DUninitialized[0u][0u], | ||
devAcc, | ||
Vec{filterWidth, filterWidth}); | ||
|
||
// Copy the filter-matrix to the constant memory | ||
alpaka::memcpy(queueAcc, viewConstantMemUninitialized, bufHostFilter); | ||
alpaka::wait(queueAcc); | ||
|
||
// Construct kernel object, choose on of the kernels provided above | ||
// ConvolutionKernel2DGlobalMemory and ConvolutionKernel2DSharedMemory | ||
ConvolutionKernel2DSharedMemory convolutionKernel2D; | ||
|
||
// Run the kernel | ||
alpaka::exec<DevAcc>( | ||
queueAcc, | ||
workDiv, | ||
convolutionKernel2D, | ||
alpaka::getPtrNative(bufInputAccView), | ||
alpaka::getPtrNative(outputDeviceMemory), | ||
matrixWidth, | ||
matrixHeight, | ||
alpaka::getPtrNative(viewConstantMemUninitialized), | ||
filterWidth); | ||
|
||
alpaka::wait(queueAcc); | ||
// Allocate memory on host | ||
auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, extent1D); | ||
// Copy from device memory to host | ||
alpaka::memcpy(queueAcc, resultGpuHost, outputDeviceMemory, extent1D); | ||
|
||
// Print results at the host | ||
printf( | ||
"Convolution filter kernel: %s\n", | ||
std::is_same<decltype(convolutionKernel2D), ConvolutionKernel2DGlobalMemory>::value | ||
? "ConvolutionKernel2DGlobalMemory" | ||
: "ConvolutionKernel2DSharedMemory"); | ||
printf("Matrix Size: %d x %d, Filter Size: %d x %d\n", matrixWidth, matrixHeight, filterWidth, filterWidth); | ||
for(size_t i{0}; i < matrixWidth * matrixHeight; i++) | ||
{ | ||
DataType const& val(alpaka::getPtrNative(resultGpuHost)[i]); | ||
std::cout << "output[" << i << "]:" << std::setprecision(3) << val << std::endl; | ||
} | ||
return EXIT_SUCCESS; | ||
} |