Commit

Merge branch 'main' into release/hip6.0_cuda12.2
RichardGe committed Jan 10, 2025
2 parents 320e003 + ef26098 commit ae9c7ce
Showing 10 changed files with 141 additions and 35 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -25,3 +25,7 @@ build/
result.xml
UnitTest/bitcodes/*.fatbin
Test/SimpleD3D12/cache/**

ParallelPrimitives/cache/KernelArgs.h
ParallelPrimitives/cache/Kernels.h
ParallelPrimitives/cache/oro_compiled_kernels.h
24 changes: 12 additions & 12 deletions Orochi/GpuMemory.h
@@ -28,7 +28,7 @@
namespace Oro
{

/// @brief A helper function that casts an address of a pointer to the device memory to a void pointer to be used as an argument for kernel calls.
/// @tparam T The type of the element stored in the device memory.
/// @param ptr The address of a pointer to the device memory.
/// @return A void pointer.
@@ -44,8 +44,8 @@ class GpuMemory final
public:
GpuMemory() = default;

/// @brief Allocate the device memory with the given size.
/// @param init_size The initial size which represents the number of elements.
/// @brief Allocate the elements on the device memory.
/// @param init_size The initial container size which represents the number of elements.
explicit GpuMemory( const size_t init_size )
{
OrochiUtils::malloc( m_data, init_size );
@@ -61,9 +61,9 @@ class GpuMemory final

GpuMemory& operator=( GpuMemory&& other ) noexcept
{
GpuMemory tmp( std::move( *this ) );
GpuMemory tmp( std::move( other ) );

swap( *this, other );
swap( *this, tmp );

return *this;
}
@@ -79,8 +79,8 @@ class GpuMemory final
m_capacity = 0ULL;
}

/// @brief Get the size of the device memory.
/// @return The size of the device memory.
/// @brief Get the container size which represents the number of elements.
/// @return The container size which represents the number of elements.
size_t size() const noexcept { return m_size; }

/// @brief Get the pointer to the device memory.
@@ -91,9 +91,9 @@ class GpuMemory final
/// @return The address of the pointer to the device memory.
T* const* address() const noexcept { return &m_data; }

/// @brief Resize the device memory. Its capacity is unchanged if the new size is smaller than the current one.
/// @brief Resize the container. Its capacity is unchanged if the new size is smaller than the current one.
/// The old data should be considered invalid to be used after the function is called unless @c copy is set to True.
/// @param new_size The new memory size after the function is called.
/// @param new_size The new container size which represents the number of elements after the function is called.
/// @param copy If true, the function will copy the data to the newly created memory space as well.
void resize( const size_t new_size, const bool copy = false ) noexcept
{
@@ -113,8 +113,8 @@ class GpuMemory final
*this = std::move( tmp );
}

/// @brief Asynchronous version of 'resize' using a given Orochi stream.
/// @param new_size The new memory size after the function is called.
/// @brief Asynchronous version of @c resize using a given Orochi stream.
/// @param new_size The new container size which represents the number of elements after the function is called.
/// @param copy If true, the function will copy the data to the newly created memory space as well.
/// @param stream The Orochi stream used for the underlying operations.
void resizeAsync( const size_t new_size, const bool copy = false, oroStream stream = 0 ) noexcept
@@ -138,7 +138,7 @@ class GpuMemory final
/// @brief Reset the memory space so that all bits inside are cleared to zero.
void reset() noexcept { OrochiUtils::memset( m_data, 0, m_size * sizeof( T ) ); }

/// @brief Asynchronous version of 'reset' using a given Orochi stream.
/// @brief Asynchronous version of @c reset using a given Orochi stream.
/// @param stream The Orochi stream used for the underlying operations.
void resetAsync( oroStream stream = 0 ) noexcept { OrochiUtils::memsetAsync( m_data, 0, m_size * sizeof( T ), stream ); }
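
A minimal usage sketch (not part of this commit) of the GpuMemory<T> interface documented above, using only the members shown in this diff and assuming an Orochi device and context have already been initialized:

// hypothetical example of the container-style API of Oro::GpuMemory<T>
Oro::GpuMemory<int> buffer( 1024 );     // allocate room for 1024 ints on the device
buffer.reset();                         // clear every bit to zero
buffer.resize( 2048, true );            // grow the container, copying the old contents
const size_t n = buffer.size();         // number of elements, now 2048
int* const* arg = buffer.address();     // address of the device pointer, usable as a kernel argument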

35 changes: 35 additions & 0 deletions Orochi/OrochiUtils.cpp
@@ -558,6 +558,41 @@ oroFunction OrochiUtils::getFunctionFromString( oroDevice device, const char* so
return f;
}

oroFunction OrochiUtils::getFunctionFromPrecompiledBinary_asData( const unsigned char* precompData, size_t dataSizeInBytes, const std::string& funcName )
{
std::lock_guard<std::recursive_mutex> lock( m_mutex );

const std::string cacheName = OrochiUtilsImpl::getCacheName( "___BAKED_BIN___", funcName );
if( m_kernelMap.find( cacheName.c_str() ) != m_kernelMap.end() )
{
return m_kernelMap[cacheName].function;
}

oroModule module = nullptr;
oroError e = oroModuleLoadData( &module, precompData );
if ( e != oroSuccess )
{
// add some verbose info to help debugging missing data
printf("oroModuleLoadData FAILED (error = %d) loading baked precomp data: %s\n", e, funcName.c_str());
return nullptr;
}

oroFunction functionOut{};
e = oroModuleGetFunction( &functionOut, module, funcName.c_str() );
if ( e != oroSuccess )
{
// add some verbose info to help debugging missing data
printf("oroModuleGetFunction FAILED (error = %d) loading baked precomp data: %s\n", e, funcName.c_str());
return nullptr;
}
OROASSERT( e == oroSuccess, 0 );

m_kernelMap[cacheName].function = functionOut;
m_kernelMap[cacheName].module = module;

return functionOut;
}

oroFunction OrochiUtils::getFunctionFromPrecompiledBinary( const std::string& path, const std::string& funcName )
{
std::lock_guard<std::recursive_mutex> lock( m_mutex );
4 changes: 4 additions & 0 deletions Orochi/OrochiUtils.h
@@ -69,6 +69,10 @@ class OrochiUtils

oroFunction getFunctionFromPrecompiledBinary( const std::string& path, const std::string& funcName );

// this function is like 'getFunctionFromPrecompiledBinary' but instead of giving a path to a file, we give the data directly.
// ( use the script convert_binary_to_array.py to convert the .hipfb to a C-array. )
oroFunction getFunctionFromPrecompiledBinary_asData( const unsigned char* data, size_t dataSizeInBytes, const std::string& funcName );

oroFunction getFunctionFromFile( oroDevice device, const char* path, const char* funcName, std::vector<const char*>* opts );
oroFunction getFunctionFromString( oroDevice device, const char* source, const char* path, const char* funcName, std::vector<const char*>* opts, int numHeaders, const char** headers, const char** includeNames );
oroFunction getFunction( oroDevice device, const char* code, const char* path, const char* funcName, std::vector<const char*>* opts, int numHeaders = 0, const char** headers = 0, const char** includeNames = 0, oroModule* loadedModule = 0 );
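
The comment above points to convert_binary_to_array.py and the new entry point; here is a minimal sketch (not part of this commit) of how the baked array might be consumed. The array names match the ones introduced in RadixSort.cpp in this same commit; the kernel name "MyKernel" is a hypothetical placeholder:

// hedged sketch, assuming a header generated by scripts/convert_binary_to_array.py
#include <ParallelPrimitives/cache/oro_compiled_kernels.h> // defines oro_compiled_kernels_h / oro_compiled_kernels_h_size

OrochiUtils utils;
oroFunction f = utils.getFunctionFromPrecompiledBinary_asData( oro_compiled_kernels_h, oro_compiled_kernels_h_size, "MyKernel" );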
67 changes: 50 additions & 17 deletions ParallelPrimitives/RadixSort.cpp
@@ -27,37 +27,63 @@
#include <iostream>
#include <numeric>

#if defined( ORO_PP_LOAD_FROM_STRING )

// if ORO_PP_LOAD_FROM_STRING && ORO_PRECOMPILED -> we load the precompiled/baked kernels.
// if ORO_PP_LOAD_FROM_STRING && NOT ORO_PRECOMPILED -> we load the baked source code kernels (from Kernels.h / KernelArgs.h)
#if !defined( ORO_PRECOMPILED ) && defined( ORO_PP_LOAD_FROM_STRING )
// Note: the include order must be in this particular form.
// clang-format off
#include <ParallelPrimitives/cache/Kernels.h>
#include <ParallelPrimitives/cache/KernelArgs.h>
// clang-format on
#else
// if Kernels.h / KernelArgs.h are not included, declare nullptr strings
static const char* hip_RadixSortKernels = nullptr;
namespace hip
{
static const char** RadixSortKernelsArgs = nullptr;
static const char** RadixSortKernelsIncludes = nullptr;
}
#endif

#if defined( __GNUC__ )
#include <dlfcn.h>
#endif

namespace
{
#if defined( ORO_PRECOMPILED )
constexpr auto useBitCode = true;
#if defined( ORO_PRECOMPILED ) && defined( ORO_PP_LOAD_FROM_STRING )
#include <ParallelPrimitives/cache/oro_compiled_kernels.h> // generate this header with 'convert_binary_to_array.py'
#else
constexpr auto useBitCode = false;
const unsigned char oro_compiled_kernels_h[] = "";
const size_t oro_compiled_kernels_h_size = 0;
#endif

#if defined( ORO_PP_LOAD_FROM_STRING )
constexpr auto useBakeKernel = true;
#else
constexpr auto useBakeKernel = false;
static const char* hip_RadixSortKernels = nullptr;
namespace hip
namespace
{
static const char** RadixSortKernelsArgs = nullptr;
static const char** RadixSortKernelsIncludes = nullptr;
} // namespace hip

// if those 2 preprocessors are enabled, this activates the 'usePrecompiledAndBakedKernel' mode.
#if defined( ORO_PRECOMPILED ) && defined( ORO_PP_LOAD_FROM_STRING )

// this flag means that we bake the precompiled kernels
constexpr auto usePrecompiledAndBakedKernel = true;

constexpr auto useBitCode = false;
constexpr auto useBakeKernel = false;

#else

constexpr auto usePrecompiledAndBakedKernel = false;

#if defined( ORO_PRECOMPILED )
constexpr auto useBitCode = true; // this flag means we use the bitcode file
#else
constexpr auto useBitCode = false;
#endif

#if defined( ORO_PP_LOAD_FROM_STRING )
constexpr auto useBakeKernel = true; // this flag means we use the HIP source code embedded in the binary ( as a string )
#else
constexpr auto useBakeKernel = false;
#endif

#endif

static_assert( !( useBitCode && useBakeKernel ), "useBitCode and useBakeKernel cannot coexist" );
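
Taken together, the flag changes above introduce a mode selection driven by two preprocessor switches. The following standalone sketch is not part of the commit; it only restates the resulting truth table using the macro and flag names from the diff:

// standalone sketch of the kernel-loading mode selection introduced by this commit
#if defined( ORO_PRECOMPILED ) && defined( ORO_PP_LOAD_FROM_STRING )
constexpr auto usePrecompiledAndBakedKernel = true;  // precompiled binary baked into the executable
constexpr auto useBitCode = false;
constexpr auto useBakeKernel = false;
#elif defined( ORO_PRECOMPILED )
constexpr auto usePrecompiledAndBakedKernel = false;
constexpr auto useBitCode = true;                    // precompiled bitcode file
constexpr auto useBakeKernel = false;
#elif defined( ORO_PP_LOAD_FROM_STRING )
constexpr auto usePrecompiledAndBakedKernel = false;
constexpr auto useBitCode = false;
constexpr auto useBakeKernel = true;                 // HIP source code embedded in the binary as a string
#else
constexpr auto usePrecompiledAndBakedKernel = false; // neither switch set: none of the embedded/precompiled modes is active
constexpr auto useBitCode = false;
constexpr auto useBakeKernel = false;
#endif

static_assert( !( useBitCode && useBakeKernel ), "useBitCode and useBakeKernel cannot coexist" );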
@@ -211,9 +237,14 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string
opts.push_back( sort_block_size_param.c_str() );
opts.push_back( sort_num_warps_param.c_str() );


for( const auto& record : records )
{
if constexpr( useBakeKernel )
if constexpr( usePrecompiledAndBakedKernel )
{
oroFunctions[record.kernelType] = m_oroutils.getFunctionFromPrecompiledBinary_asData(oro_compiled_kernels_h, oro_compiled_kernels_h_size, record.kernelName.c_str() );
}
else if constexpr( useBakeKernel )
{
oroFunctions[record.kernelType] = m_oroutils.getFunctionFromString( m_device, hip_RadixSortKernels, currentKernelPath.c_str(), record.kernelName.c_str(), &opts, 1, hip::RadixSortKernelsArgs, hip::RadixSortKernelsIncludes );
}
Expand All @@ -231,6 +262,8 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string
printKernelInfo( record.kernelName, oroFunctions[record.kernelType] );
}
}

return;
}

int RadixSort::calculateWGsToExecute( const int blockSize ) const noexcept
3 changes: 3 additions & 0 deletions Test/DeviceEnum/main.cpp
@@ -66,6 +66,9 @@ int main( int argc, char** argv )
e = oroCtxCreate( &ctx, 0, device );
ERROR_CHECK( e );

e = oroCtxSetCurrent( ctx );
ERROR_CHECK( e );

//try kernel execution
oroFunction function;
{
4 changes: 2 additions & 2 deletions contrib/bin/win64/amd_comgr0601.dll
Git LFS file not shown
4 changes: 2 additions & 2 deletions contrib/bin/win64/hiprtc-builtins0601.dll
Git LFS file not shown
4 changes: 2 additions & 2 deletions contrib/bin/win64/hiprtc0601.dll
Git LFS file not shown
27 changes: 27 additions & 0 deletions scripts/convert_binary_to_array.py
@@ -0,0 +1,27 @@
# convert_binary_to_header.py
import sys
from pathlib import Path

def binary_to_c_array(bin_file, array_name):
with open(bin_file, 'rb') as f:
binary_data = f.read()

hex_array = ', '.join(f'0x{b:02x}' for b in binary_data)
c_array = f'const unsigned char {array_name}[] = {{\n {hex_array}\n}};\n'
c_array += f'const size_t {array_name}_size = sizeof({array_name});\n'
return c_array

if __name__ == "__main__":
if len(sys.argv) != 3:
print(f"Usage: {sys.argv[0]} <input_binary_file> <output_header_file>")
sys.exit(1)

bin_file = sys.argv[1]
header_file_path = sys.argv[2]
header_file = Path(header_file_path).name
array_name = header_file.replace('.', '_')

c_array = binary_to_c_array(bin_file, array_name)
with open(header_file_path, 'w') as f:
f.write("// generated by convert_binary_to_header.py\n")
f.write(c_array)
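
As an illustration of the script's output: running it on a hypothetical input such as oro_compiled_kernels.hipfb with the output path ParallelPrimitives/cache/oro_compiled_kernels.h (the path used by RadixSort.cpp above) would produce a header of roughly this shape. The byte values here are invented; the array name is derived from the output file name with dots replaced by underscores:

// generated by convert_binary_to_header.py
const unsigned char oro_compiled_kernels_h[] = {
  0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00
};
const size_t oro_compiled_kernels_h_size = sizeof(oro_compiled_kernels_h);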
