Commit

Merge branch 'main' into release/hip6.0_cuda12.2
RichardGe committed Jan 10, 2025
2 parents 320e003 + ef26098 commit ae9c7ce
Showing 10 changed files with 141 additions and 35 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -25,3 +25,7 @@ build/
result.xml
UnitTest/bitcodes/*.fatbin
Test/SimpleD3D12/cache/**

ParallelPrimitives/cache/KernelArgs.h
ParallelPrimitives/cache/Kernels.h
ParallelPrimitives/cache/oro_compiled_kernels.h
24 changes: 12 additions & 12 deletions Orochi/GpuMemory.h
@@ -28,7 +28,7 @@
namespace Oro
{

/// @brief A helper function that casts an address of a pointer to the device memory to a void pointer to be used as an argument for kernel calls.
/// @tparam T The type of the element stored in the device memory.
/// @param ptr The address of a pointer to the device memory.
/// @return A void pointer.
@@ -44,8 +44,8 @@ class GpuMemory final
public:
GpuMemory() = default;

/// @brief Allocate the device memory with the given size.
/// @param init_size The initial size which represents the number of elements.
/// @brief Allocate the elements on the device memory.
/// @param init_size The initial container size which represents the number of elements.
explicit GpuMemory( const size_t init_size )
{
OrochiUtils::malloc( m_data, init_size );
@@ -61,9 +61,9 @@ class GpuMemory final

GpuMemory& operator=( GpuMemory&& other ) noexcept
{
GpuMemory tmp( std::move( *this ) );
GpuMemory tmp( std::move( other ) );

swap( *this, other );
swap( *this, tmp );

return *this;
}
@@ -79,8 +79,8 @@ class GpuMemory final
m_capacity = 0ULL;
}

/// @brief Get the size of the device memory.
/// @return The size of the device memory.
/// @brief Get the container size which represents the number of elements.
/// @return The container size which represents the number of elements.
size_t size() const noexcept { return m_size; }

/// @brief Get the pointer to the device memory.
@@ -91,9 +91,9 @@ class GpuMemory final
/// @return The address of the pointer to the device memory.
T* const* address() const noexcept { return &m_data; }

/// @brief Resize the device memory. Its capacity is unchanged if the new size is smaller than the current one.
/// @brief Resize the container. Its capacity is unchanged if the new size is smaller than the current one.
/// The old data should be considered invalid to be used after the function is called unless @c copy is set to True.
/// @param new_size The new memory size after the function is called.
/// @param new_size The new container size which represents the number of elements after the function is called.
/// @param copy If true, the function will copy the data to the newly created memory space as well.
void resize( const size_t new_size, const bool copy = false ) noexcept
{
@@ -113,8 +113,8 @@ class GpuMemory final
*this = std::move( tmp );
}

/// @brief Asynchronous version of 'resize' using a given Orochi stream.
/// @param new_size The new memory size after the function is called.
/// @brief Asynchronous version of @c resize using a given Orochi stream.
/// @param new_size The new container size which represents the number of elements after the function is called.
/// @param copy If true, the function will copy the data to the newly created memory space as well.
/// @param stream The Orochi stream used for the underlying operations.
void resizeAsync( const size_t new_size, const bool copy = false, oroStream stream = 0 ) noexcept
@@ -138,7 +138,7 @@ class GpuMemory final
/// @brief Reset the memory space so that all bits inside are cleared to zero.
void reset() noexcept { OrochiUtils::memset( m_data, 0, m_size * sizeof( T ) ); }

/// @brief Asynchronous version of 'reset' using a given Orochi stream.
/// @brief Asynchronous version of @c reset using a given Orochi stream.
/// @param stream The Orochi stream used for the underlying operations.
void resetAsync( oroStream stream = 0 ) noexcept { OrochiUtils::memsetAsync( m_data, 0, m_size * sizeof( T ), stream ); }
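
A minimal usage sketch (not part of this commit) of the GpuMemory<T> interface documented above, using only the members shown in this diff and assuming an Orochi device and context have already been initialized:

// hypothetical example of the container-style API of Oro::GpuMemory<T>
Oro::GpuMemory<int> buffer( 1024 );     // allocate room for 1024 ints on the device
buffer.reset();                         // clear every bit to zero
buffer.resize( 2048, true );            // grow the container, copying the old contents
const size_t n = buffer.size();         // number of elements, now 2048
int* const* arg = buffer.address();     // address of the device pointer, usable as a kernel argument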

35 changes: 35 additions & 0 deletions Orochi/OrochiUtils.cpp
@@ -558,6 +558,41 @@ oroFunction OrochiUtils::getFunctionFromString( oroDevice device, const char* so
return f;
}

oroFunction OrochiUtils::getFunctionFromPrecompiledBinary_asData( const unsigned char* precompData, size_t dataSizeInBytes, const std::string& funcName )
{
std::lock_guard<std::recursive_mutex> lock( m_mutex );

const std::string cacheName = OrochiUtilsImpl::getCacheName( "___BAKED_BIN___", funcName );
if( m_kernelMap.find( cacheName.c_str() ) != m_kernelMap.end() )
{
return m_kernelMap[cacheName].function;
}

oroModule module = nullptr;
oroError e = oroModuleLoadData( &module, precompData );
if ( e != oroSuccess )
{
// add some verbose info to help debugging missing data
printf("oroModuleLoadData FAILED (error = %d) loading baked precomp data: %s\n", e, funcName.c_str());
return nullptr;
}

oroFunction functionOut{};
e = oroModuleGetFunction( &functionOut, module, funcName.c_str() );
if ( e != oroSuccess )
{
// add some verbose info to help debugging missing data
printf("oroModuleGetFunction FAILED (error = %d) loading baked precomp data: %s\n", e, funcName.c_str());
return nullptr;
}
OROASSERT( e == oroSuccess, 0 );

m_kernelMap[cacheName].function = functionOut;
m_kernelMap[cacheName].module = module;

return functionOut;
}

oroFunction OrochiUtils::getFunctionFromPrecompiledBinary( const std::string& path, const std::string& funcName )
{
std::lock_guard<std::recursive_mutex> lock( m_mutex );
4 changes: 4 additions & 0 deletions Orochi/OrochiUtils.h
@@ -69,6 +69,10 @@ class OrochiUtils

oroFunction getFunctionFromPrecompiledBinary( const std::string& path, const std::string& funcName );

// this function is like 'getFunctionFromPrecompiledBinary' but instead of giving a path to a file, we give the data directly.
// ( use the script convert_binary_to_array.py to convert the .hipfb to a C-array. )
oroFunction getFunctionFromPrecompiledBinary_asData( const unsigned char* data, size_t dataSizeInBytes, const std::string& funcName );

oroFunction getFunctionFromFile( oroDevice device, const char* path, const char* funcName, std::vector<const char*>* opts );
oroFunction getFunctionFromString( oroDevice device, const char* source, const char* path, const char* funcName, std::vector<const char*>* opts, int numHeaders, const char** headers, const char** includeNames );
oroFunction getFunction( oroDevice device, const char* code, const char* path, const char* funcName, std::vector<const char*>* opts, int numHeaders = 0, const char** headers = 0, const char** includeNames = 0, oroModule* loadedModule = 0 );
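
The comment above points to convert_binary_to_array.py and the new entry point; here is a minimal sketch (not part of this commit) of how the baked array might be consumed. The array names match the ones introduced in RadixSort.cpp in this same commit; the kernel name "MyKernel" is a hypothetical placeholder:

// hedged sketch, assuming a header generated by scripts/convert_binary_to_array.py
#include <ParallelPrimitives/cache/oro_compiled_kernels.h> // defines oro_compiled_kernels_h / oro_compiled_kernels_h_size

OrochiUtils utils;
oroFunction f = utils.getFunctionFromPrecompiledBinary_asData( oro_compiled_kernels_h, oro_compiled_kernels_h_size, "MyKernel" );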
67 changes: 50 additions & 17 deletions ParallelPrimitives/RadixSort.cpp
@@ -27,37 +27,63 @@
#include <iostream>
#include <numeric>

#if defined( ORO_PP_LOAD_FROM_STRING )

// if ORO_PP_LOAD_FROM_STRING && ORO_PRECOMPILED -> we load the precompiled/baked kernels.
// if ORO_PP_LOAD_FROM_STRING && NOT ORO_PRECOMPILED -> we load the baked source code kernels (from Kernels.h / KernelArgs.h)
#if !defined( ORO_PRECOMPILED ) && defined( ORO_PP_LOAD_FROM_STRING )
// Note: the include order must be in this particular form.
// clang-format off
#include <ParallelPrimitives/cache/Kernels.h>
#include <ParallelPrimitives/cache/KernelArgs.h>
// clang-format on
#else
// if Kernels.h / KernelArgs.h are not included, declare nullptr strings
static const char* hip_RadixSortKernels = nullptr;
namespace hip
{
static const char** RadixSortKernelsArgs = nullptr;
static const char** RadixSortKernelsIncludes = nullptr;
}
#endif

#if defined( __GNUC__ )
#include <dlfcn.h>
#endif

namespace
{
#if defined( ORO_PRECOMPILED )
constexpr auto useBitCode = true;
#if defined( ORO_PRECOMPILED ) && defined( ORO_PP_LOAD_FROM_STRING )
#include <ParallelPrimitives/cache/oro_compiled_kernels.h> // generate this header with 'convert_binary_to_array.py'
#else
constexpr auto useBitCode = false;
const unsigned char oro_compiled_kernels_h[] = "";
const size_t oro_compiled_kernels_h_size = 0;
#endif

#if defined( ORO_PP_LOAD_FROM_STRING )
constexpr auto useBakeKernel = true;
#else
constexpr auto useBakeKernel = false;
static const char* hip_RadixSortKernels = nullptr;
namespace hip
namespace
{
static const char** RadixSortKernelsArgs = nullptr;
static const char** RadixSortKernelsIncludes = nullptr;
} // namespace hip

// if those 2 preprocessors are enabled, this activates the 'usePrecompiledAndBakedKernel' mode.
#if defined( ORO_PRECOMPILED ) && defined( ORO_PP_LOAD_FROM_STRING )

// this flag means that we bake the precompiled kernels
constexpr auto usePrecompiledAndBakedKernel = true;

constexpr auto useBitCode = false;
constexpr auto useBakeKernel = false;

#else

constexpr auto usePrecompiledAndBakedKernel = false;

#if defined( ORO_PRECOMPILED )
constexpr auto useBitCode = true; // this flag means we use the bitcode file
#else
constexpr auto useBitCode = false;
#endif

#if defined( ORO_PP_LOAD_FROM_STRING )
constexpr auto useBakeKernel = true; // this flag means we use the HIP source code embedded in the binary ( as a string )
#else
constexpr auto useBakeKernel = false;
#endif

#endif

static_assert( !( useBitCode && useBakeKernel ), "useBitCode and useBakeKernel cannot coexist" );
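
Taken together, the flag changes above introduce a mode selection driven by two preprocessor switches. The following standalone sketch is not part of the commit; it only restates the resulting truth table using the macro and flag names from the diff:

// standalone sketch of the kernel-loading mode selection introduced by this commit
#if defined( ORO_PRECOMPILED ) && defined( ORO_PP_LOAD_FROM_STRING )
constexpr auto usePrecompiledAndBakedKernel = true;  // precompiled binary baked into the executable
constexpr auto useBitCode = false;
constexpr auto useBakeKernel = false;
#elif defined( ORO_PRECOMPILED )
constexpr auto usePrecompiledAndBakedKernel = false;
constexpr auto useBitCode = true;                    // precompiled bitcode file
constexpr auto useBakeKernel = false;
#elif defined( ORO_PP_LOAD_FROM_STRING )
constexpr auto usePrecompiledAndBakedKernel = false;
constexpr auto useBitCode = false;
constexpr auto useBakeKernel = true;                 // HIP source code embedded in the binary as a string
#else
constexpr auto usePrecompiledAndBakedKernel = false; // neither switch set: none of the embedded/precompiled modes is active
constexpr auto useBitCode = false;
constexpr auto useBakeKernel = false;
#endif

static_assert( !( useBitCode && useBakeKernel ), "useBitCode and useBakeKernel cannot coexist" );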
@@ -211,9 +237,14 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string
opts.push_back( sort_block_size_param.c_str() );
opts.push_back( sort_num_warps_param.c_str() );


for( const auto& record : records )
{
if constexpr( useBakeKernel )
if constexpr( usePrecompiledAndBakedKernel )
{
oroFunctions[record.kernelType] = m_oroutils.getFunctionFromPrecompiledBinary_asData(oro_compiled_kernels_h, oro_compiled_kernels_h_size, record.kernelName.c_str() );
}
else if constexpr( useBakeKernel )
{
oroFunctions[record.kernelType] = m_oroutils.getFunctionFromString( m_device, hip_RadixSortKernels, currentKernelPath.c_str(), record.kernelName.c_str(), &opts, 1, hip::RadixSortKernelsArgs, hip::RadixSortKernelsIncludes );
}
Expand All @@ -231,6 +262,8 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string
printKernelInfo( record.kernelName, oroFunctions[record.kernelType] );
}
}

return;
}

int RadixSort::calculateWGsToExecute( const int blockSize ) const noexcept
3 changes: 3 additions & 0 deletions Test/DeviceEnum/main.cpp
@@ -66,6 +66,9 @@ int main( int argc, char** argv )
e = oroCtxCreate( &ctx, 0, device );
ERROR_CHECK( e );

e = oroCtxSetCurrent( ctx );
ERROR_CHECK( e );

//try kernel execution
oroFunction function;
{
4 changes: 2 additions & 2 deletions contrib/bin/win64/amd_comgr0601.dll
Git LFS file not shown
4 changes: 2 additions & 2 deletions contrib/bin/win64/hiprtc-builtins0601.dll
Git LFS file not shown
4 changes: 2 additions & 2 deletions contrib/bin/win64/hiprtc0601.dll
Git LFS file not shown
27 changes: 27 additions & 0 deletions scripts/convert_binary_to_array.py
@@ -0,0 +1,27 @@
# convert_binary_to_header.py
import sys
from pathlib import Path

def binary_to_c_array(bin_file, array_name):
with open(bin_file, 'rb') as f:
binary_data = f.read()

hex_array = ', '.join(f'0x{b:02x}' for b in binary_data)
c_array = f'const unsigned char {array_name}[] = {{\n {hex_array}\n}};\n'
c_array += f'const size_t {array_name}_size = sizeof({array_name});\n'
return c_array

if __name__ == "__main__":
if len(sys.argv) != 3:
print(f"Usage: {sys.argv[0]} <input_binary_file> <output_header_file>")
sys.exit(1)

bin_file = sys.argv[1]
header_file_path = sys.argv[2]
header_file = Path(header_file_path).name
array_name = header_file.replace('.', '_')

c_array = binary_to_c_array(bin_file, array_name)
with open(header_file_path, 'w') as f:
f.write("// generated by convert_binary_to_header.py\n")
f.write(c_array)
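
As an illustration of the script's output: running it on a hypothetical input such as oro_compiled_kernels.hipfb with the output path ParallelPrimitives/cache/oro_compiled_kernels.h (the path used by RadixSort.cpp above) would produce a header of roughly this shape. The byte values here are invented; the array name is derived from the output file name with dots replaced by underscores:

// generated by convert_binary_to_header.py
const unsigned char oro_compiled_kernels_h[] = {
  0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00
};
const size_t oro_compiled_kernels_h_size = sizeof(oro_compiled_kernels_h);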
