From a6047746e4e7a8cd939c1f6cbfc7e865c9ac1a95 Mon Sep 17 00:00:00 2001 From: James Edwards <JamesAdrian.Edwards@amd.com> Date: Sun, 14 Aug 2016 10:03:45 -0500 Subject: [PATCH] ROCR 1.2 updates --- src/README.md | 12 - src/core/CMakeLists.txt | 17 +- src/core/common/hsa_table_interface.cpp | 423 +++++++-- src/core/common/shared.h | 2 + src/core/hsacore.so.def | 2 + src/core/inc/agent.h | 32 +- src/core/inc/amd_aql_queue.h | 6 +- src/core/inc/amd_blit_kernel.h | 73 +- src/core/inc/amd_blit_kernel_kv.h | 479 ---------- src/core/inc/amd_blit_kernel_vi.h | 490 ---------- src/core/inc/amd_blit_sdma.h | 26 +- src/core/inc/amd_elf_image.hpp | 1 + src/core/inc/amd_gpu_agent.h | 86 +- src/core/inc/amd_gpu_shaders.h | 169 ++++ src/core/inc/amd_hsa_loader.hpp | 13 + src/core/inc/blit.h | 13 +- src/core/inc/hsa_api_trace_int.h | 29 +- src/core/inc/hsa_ext_amd_impl.h | 186 ++++ src/core/inc/hsa_ext_interface.h | 28 +- src/core/inc/hsa_table_interface.h | 4 +- src/core/inc/interrupt_signal.h | 8 - src/core/inc/runtime.h | 5 - src/core/inc/signal.h | 14 +- src/core/runtime/amd_aql_queue.cpp | 182 ++-- src/core/runtime/amd_blit_kernel.cpp | 862 +++++++++++++----- src/core/runtime/amd_blit_sdma.cpp | 230 ++++- src/core/runtime/amd_gpu_agent.cpp | 330 +++++-- src/core/runtime/amd_memory_region.cpp | 91 +- src/core/runtime/amd_topology.cpp | 6 - src/core/runtime/hsa.cpp | 113 +-- src/core/runtime/hsa_api_trace.cpp | 324 ++++--- src/core/runtime/hsa_ext_amd.cpp | 86 +- src/core/runtime/hsa_ext_interface.cpp | 340 ++++--- ...code_object.cpp => hsa_ven_amd_loader.cpp} | 15 +- src/core/runtime/interrupt_signal.cpp | 6 +- src/core/runtime/runtime.cpp | 85 +- src/core/util/win/os_win.cpp | 227 +++++ src/inc/hsa.h | 8 +- src/inc/hsa_api_trace.h | 195 +++- src/inc/hsa_ext_amd.h | 76 +- src/inc/hsa_ven_amd_loaded_code_object.h | 95 -- src/inc/hsa_ven_amd_loader.h | 249 +++++ src/libamdhsacode/amd_elf_image.cpp | 7 +- src/libamdhsacode/amd_hsa_code.cpp | 8 +- src/loader/executable.cpp | 128 ++- src/loader/executable.hpp | 25 +- src/loader/loaders.cpp | 10 + src/loader/loaders.hpp | 4 + 48 files changed, 3680 insertions(+), 2140 deletions(-) delete mode 100644 src/core/inc/amd_blit_kernel_kv.h delete mode 100644 src/core/inc/amd_blit_kernel_vi.h create mode 100644 src/core/inc/amd_gpu_shaders.h create mode 100755 src/core/inc/hsa_ext_amd_impl.h rename src/core/runtime/{hsa_ven_amd_loaded_code_object.cpp => hsa_ven_amd_loader.cpp} (83%) create mode 100644 src/core/util/win/os_win.cpp delete mode 100644 src/inc/hsa_ven_amd_loaded_code_object.h create mode 100644 src/inc/hsa_ven_amd_loader.h diff --git a/src/README.md b/src/README.md index 2b6bea468..1ee7e8425 100644 --- a/src/README.md +++ b/src/README.md @@ -60,18 +60,6 @@ For example, from the top level ROCR repository execute: The name of the core hsa runtime is libhsa-runtime64.so.1. -#### External requirements - -The core runtime requires the sp3.a library to be able to compiler -on x86_64 architechtures. The binaries for the sp3.a librariy can -be found on the amd-codexl-analyzer GitHub repository: - -https://github.com/GPUOpen-Tools/amd-codexl-analyzer - -The x86_64 library and associated header files have been added to -this code base for convenience, but are still subject to the -AMD copyright license. 
- #### Specs http://www.hsafoundation.com/standards/ diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index ec0816ca1..0eb9af686 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -66,18 +66,6 @@ if ( NOT EXISTS ${HSATHK_BUILD_LIB_PATH}/libhsakmt.so.1 ) MESSAGE ( FATAL_ERROR "Environment variable HSATHK_BUILD_LIB_PATH is not set to point to the location where KFD Thunk library libhsakmt.so.1 could be found." ) endif () -if ( EXISTS ${LIBSP3_BUILD_INC_PATH}/sp3.h ) - set ( LIBSP3_BUILD_INC_PATH ${LIBSP3_BUILD_INC_PATH} ) -else () - set ( LIBSP3_BUILD_INC_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../utils/sp3 ) -endif () - -if ( EXISTS ${LIBSP3_BUILD_LIB_PATH}/libsp3.a ) - set ( LIBSP3_BUILD_LIB_PATH ${LIBSP3_BUILD_LIB_PATH} ) -else () - set ( LIBSP3_BUILD_LIB_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../utils/sp3 ) -endif () - MESSAGE ( ------IS64BIT: ${IS64BIT} ) MESSAGE ( ------Compiler: ${CMAKE_CXX_COMPILER} ) MESSAGE ( ------Version: ${CMAKE_CXX_COMPILER_VERSION} ) @@ -132,7 +120,7 @@ set ( CORE_SRCS ${CORE_SRCS} runtime/amd_cpu_agent.cpp ) set ( CORE_SRCS ${CORE_SRCS} runtime/amd_gpu_agent.cpp ) set ( CORE_SRCS ${CORE_SRCS} runtime/amd_aql_queue.cpp ) set ( CORE_SRCS ${CORE_SRCS} runtime/amd_loader_context.cpp ) -set ( CORE_SRCS ${CORE_SRCS} runtime/hsa_ven_amd_loaded_code_object.cpp ) +set ( CORE_SRCS ${CORE_SRCS} runtime/hsa_ven_amd_loader.cpp ) set ( CORE_SRCS ${CORE_SRCS} runtime/amd_memory_region.cpp ) set ( CORE_SRCS ${CORE_SRCS} runtime/amd_topology.cpp ) set ( CORE_SRCS ${CORE_SRCS} runtime/default_signal.cpp ) @@ -153,11 +141,9 @@ include_directories ( ${CMAKE_CURRENT_SOURCE_DIR}/.. ) include_directories ( ${CMAKE_CURRENT_SOURCE_DIR}/../inc ) include_directories ( ${CMAKE_CURRENT_SOURCE_DIR}/inc ) include_directories ( ${HSATHK_BUILD_INC_PATH} ) -include_directories ( ${LIBSP3_BUILD_INC_PATH} ) ## Library path(s). 
link_directories ( ${HSATHK_BUILD_LIB_PATH} ) -link_directories ( ${LIBSP3_BUILD_LIB_PATH} ) add_library ( ${CORE_RUNTIME_TARGET} SHARED ${CORE_SRCS} ) @@ -172,7 +158,6 @@ target_link_libraries ( ${CORE_RUNTIME_TARGET} PRIVATE amdhsaloader PRIVATE amdhsacode PRIVATE hsakmt - PRIVATE sp3 dl pthread rt ) diff --git a/src/core/common/hsa_table_interface.cpp b/src/core/common/hsa_table_interface.cpp index ffbe749a9..13154820f 100644 --- a/src/core/common/hsa_table_interface.cpp +++ b/src/core/common/hsa_table_interface.cpp @@ -41,60 +41,69 @@ //////////////////////////////////////////////////////////////////////////////// #include "hsa_api_trace.h" +#include "core/inc/hsa_api_trace_int.h" -static const ApiTable* HsaApiTable; +static const HsaApiTable* hsaApiTable; +static const CoreApiTable* coreApiTable; +static const AmdExtTable* amdExtTable; -void hsa_table_interface_init(const ApiTable* Table) { HsaApiTable = Table; } +void hsa_table_interface_init(const HsaApiTable* apiTable) { + hsaApiTable = apiTable; + coreApiTable = apiTable->core_; + amdExtTable = apiTable->amd_ext_; +} -const ApiTable* hsa_table_interface_get_table() { return HsaApiTable; } +const HsaApiTable* hsa_table_interface_get_table() { + return hsaApiTable; +} // Pass through stub functions -hsa_status_t HSA_API hsa_init() { return HsaApiTable->hsa_init_fn(); } +hsa_status_t HSA_API hsa_init() { return coreApiTable->hsa_init_fn(); } -hsa_status_t HSA_API hsa_shut_down() { return HsaApiTable->hsa_shut_down_fn(); } +hsa_status_t HSA_API hsa_shut_down() { return coreApiTable->hsa_shut_down_fn(); } hsa_status_t HSA_API hsa_system_get_info(hsa_system_info_t attribute, void* value) { - return HsaApiTable->hsa_system_get_info_fn(attribute, value); + return coreApiTable->hsa_system_get_info_fn(attribute, value); } hsa_status_t HSA_API hsa_system_extension_supported(uint16_t extension, uint16_t version_major, uint16_t version_minor, bool* result) { - return HsaApiTable->hsa_system_extension_supported_fn( + return coreApiTable->hsa_system_extension_supported_fn( extension, version_major, version_minor, result); } hsa_status_t HSA_API hsa_system_get_extension_table(uint16_t extension, uint16_t version_major, uint16_t version_minor, void* table) { - return HsaApiTable->hsa_system_get_extension_table_fn( + return coreApiTable->hsa_system_get_extension_table_fn( extension, version_major, version_minor, table); } hsa_status_t HSA_API hsa_iterate_agents(hsa_status_t (*callback)(hsa_agent_t agent, void* data), void* data) { - return HsaApiTable->hsa_iterate_agents_fn(callback, data); + return coreApiTable->hsa_iterate_agents_fn(callback, data); } hsa_status_t HSA_API hsa_agent_get_info(hsa_agent_t agent, hsa_agent_info_t attribute, void* value) { - return HsaApiTable->hsa_agent_get_info_fn(agent, attribute, value); + return coreApiTable->hsa_agent_get_info_fn(agent, attribute, value); } hsa_status_t HSA_API hsa_agent_get_exception_policies(hsa_agent_t agent, hsa_profile_t profile, uint16_t* mask) { - return HsaApiTable->hsa_agent_get_exception_policies_fn(agent, profile, mask); + return coreApiTable->hsa_agent_get_exception_policies_fn(agent, profile, mask); } hsa_status_t HSA_API hsa_agent_extension_supported(uint16_t extension, hsa_agent_t agent, uint16_t version_major, uint16_t version_minor, bool* result) { - return HsaApiTable->hsa_agent_extension_supported_fn( + return coreApiTable->hsa_agent_extension_supported_fn( extension, agent, version_major, version_minor, result); } @@ -104,7 +113,7 @@ hsa_status_t HSA_API void* data), void* 
data, uint32_t private_segment_size, uint32_t group_segment_size, hsa_queue_t** queue) { - return HsaApiTable->hsa_queue_create_fn(agent, size, type, callback, data, + return coreApiTable->hsa_queue_create_fn(agent, size, type, callback, data, private_segment_size, group_segment_size, queue); } @@ -113,167 +122,167 @@ hsa_status_t HSA_API hsa_soft_queue_create(hsa_region_t region, uint32_t size, hsa_queue_type_t type, uint32_t features, hsa_signal_t completion_signal, hsa_queue_t** queue) { - return HsaApiTable->hsa_soft_queue_create_fn(region, size, type, features, + return coreApiTable->hsa_soft_queue_create_fn(region, size, type, features, completion_signal, queue); } hsa_status_t HSA_API hsa_queue_destroy(hsa_queue_t* queue) { - return HsaApiTable->hsa_queue_destroy_fn(queue); + return coreApiTable->hsa_queue_destroy_fn(queue); } hsa_status_t HSA_API hsa_queue_inactivate(hsa_queue_t* queue) { - return HsaApiTable->hsa_queue_inactivate_fn(queue); + return coreApiTable->hsa_queue_inactivate_fn(queue); } uint64_t HSA_API hsa_queue_load_read_index_acquire(const hsa_queue_t* queue) { - return HsaApiTable->hsa_queue_load_read_index_acquire_fn(queue); + return coreApiTable->hsa_queue_load_read_index_acquire_fn(queue); } uint64_t HSA_API hsa_queue_load_read_index_relaxed(const hsa_queue_t* queue) { - return HsaApiTable->hsa_queue_load_read_index_relaxed_fn(queue); + return coreApiTable->hsa_queue_load_read_index_relaxed_fn(queue); } uint64_t HSA_API hsa_queue_load_write_index_acquire(const hsa_queue_t* queue) { - return HsaApiTable->hsa_queue_load_write_index_acquire_fn(queue); + return coreApiTable->hsa_queue_load_write_index_acquire_fn(queue); } uint64_t HSA_API hsa_queue_load_write_index_relaxed(const hsa_queue_t* queue) { - return HsaApiTable->hsa_queue_load_write_index_relaxed_fn(queue); + return coreApiTable->hsa_queue_load_write_index_relaxed_fn(queue); } void HSA_API hsa_queue_store_write_index_relaxed(const hsa_queue_t* queue, uint64_t value) { - return HsaApiTable->hsa_queue_store_write_index_relaxed_fn(queue, value); + return coreApiTable->hsa_queue_store_write_index_relaxed_fn(queue, value); } void HSA_API hsa_queue_store_write_index_release(const hsa_queue_t* queue, uint64_t value) { - return HsaApiTable->hsa_queue_store_write_index_release_fn(queue, value); + return coreApiTable->hsa_queue_store_write_index_release_fn(queue, value); } uint64_t HSA_API hsa_queue_cas_write_index_acq_rel(const hsa_queue_t* queue, uint64_t expected, uint64_t value) { - return HsaApiTable->hsa_queue_cas_write_index_acq_rel_fn(queue, expected, + return coreApiTable->hsa_queue_cas_write_index_acq_rel_fn(queue, expected, value); } uint64_t HSA_API hsa_queue_cas_write_index_acquire(const hsa_queue_t* queue, uint64_t expected, uint64_t value) { - return HsaApiTable->hsa_queue_cas_write_index_acquire_fn(queue, expected, + return coreApiTable->hsa_queue_cas_write_index_acquire_fn(queue, expected, value); } uint64_t HSA_API hsa_queue_cas_write_index_relaxed(const hsa_queue_t* queue, uint64_t expected, uint64_t value) { - return HsaApiTable->hsa_queue_cas_write_index_relaxed_fn(queue, expected, + return coreApiTable->hsa_queue_cas_write_index_relaxed_fn(queue, expected, value); } uint64_t HSA_API hsa_queue_cas_write_index_release(const hsa_queue_t* queue, uint64_t expected, uint64_t value) { - return HsaApiTable->hsa_queue_cas_write_index_release_fn(queue, expected, + return coreApiTable->hsa_queue_cas_write_index_release_fn(queue, expected, value); } uint64_t HSA_API hsa_queue_add_write_index_acq_rel(const 
hsa_queue_t* queue, uint64_t value) { - return HsaApiTable->hsa_queue_add_write_index_acq_rel_fn(queue, value); + return coreApiTable->hsa_queue_add_write_index_acq_rel_fn(queue, value); } uint64_t HSA_API hsa_queue_add_write_index_acquire(const hsa_queue_t* queue, uint64_t value) { - return HsaApiTable->hsa_queue_add_write_index_acquire_fn(queue, value); + return coreApiTable->hsa_queue_add_write_index_acquire_fn(queue, value); } uint64_t HSA_API hsa_queue_add_write_index_relaxed(const hsa_queue_t* queue, uint64_t value) { - return HsaApiTable->hsa_queue_add_write_index_relaxed_fn(queue, value); + return coreApiTable->hsa_queue_add_write_index_relaxed_fn(queue, value); } uint64_t HSA_API hsa_queue_add_write_index_release(const hsa_queue_t* queue, uint64_t value) { - return HsaApiTable->hsa_queue_add_write_index_release_fn(queue, value); + return coreApiTable->hsa_queue_add_write_index_release_fn(queue, value); } void HSA_API hsa_queue_store_read_index_relaxed(const hsa_queue_t* queue, uint64_t value) { - return HsaApiTable->hsa_queue_store_read_index_relaxed_fn(queue, value); + return coreApiTable->hsa_queue_store_read_index_relaxed_fn(queue, value); } void HSA_API hsa_queue_store_read_index_release(const hsa_queue_t* queue, uint64_t value) { - return HsaApiTable->hsa_queue_store_read_index_release_fn(queue, value); + return coreApiTable->hsa_queue_store_read_index_release_fn(queue, value); } hsa_status_t HSA_API hsa_agent_iterate_regions( hsa_agent_t agent, hsa_status_t (*callback)(hsa_region_t region, void* data), void* data) { - return HsaApiTable->hsa_agent_iterate_regions_fn(agent, callback, data); + return coreApiTable->hsa_agent_iterate_regions_fn(agent, callback, data); } hsa_status_t HSA_API hsa_region_get_info(hsa_region_t region, hsa_region_info_t attribute, void* value) { - return HsaApiTable->hsa_region_get_info_fn(region, attribute, value); + return coreApiTable->hsa_region_get_info_fn(region, attribute, value); } hsa_status_t HSA_API hsa_memory_register(void* address, size_t size) { - return HsaApiTable->hsa_memory_register_fn(address, size); + return coreApiTable->hsa_memory_register_fn(address, size); } hsa_status_t HSA_API hsa_memory_deregister(void* address, size_t size) { - return HsaApiTable->hsa_memory_deregister_fn(address, size); + return coreApiTable->hsa_memory_deregister_fn(address, size); } hsa_status_t HSA_API hsa_memory_allocate(hsa_region_t region, size_t size, void** ptr) { - return HsaApiTable->hsa_memory_allocate_fn(region, size, ptr); + return coreApiTable->hsa_memory_allocate_fn(region, size, ptr); } hsa_status_t HSA_API hsa_memory_free(void* ptr) { - return HsaApiTable->hsa_memory_free_fn(ptr); + return coreApiTable->hsa_memory_free_fn(ptr); } hsa_status_t HSA_API hsa_memory_copy(void* dst, const void* src, size_t size) { - return HsaApiTable->hsa_memory_copy_fn(dst, src, size); + return coreApiTable->hsa_memory_copy_fn(dst, src, size); } hsa_status_t HSA_API hsa_memory_assign_agent(void* ptr, hsa_agent_t agent, hsa_access_permission_t access) { - return HsaApiTable->hsa_memory_assign_agent_fn(ptr, agent, access); + return coreApiTable->hsa_memory_assign_agent_fn(ptr, agent, access); } hsa_status_t HSA_API hsa_signal_create(hsa_signal_value_t initial_value, uint32_t num_consumers, const hsa_agent_t* consumers, hsa_signal_t* signal) { - return HsaApiTable->hsa_signal_create_fn(initial_value, num_consumers, + return coreApiTable->hsa_signal_create_fn(initial_value, num_consumers, consumers, signal); } hsa_status_t HSA_API hsa_signal_destroy(hsa_signal_t 
signal) { - return HsaApiTable->hsa_signal_destroy_fn(signal); + return coreApiTable->hsa_signal_destroy_fn(signal); } hsa_signal_value_t HSA_API hsa_signal_load_relaxed(hsa_signal_t signal) { - return HsaApiTable->hsa_signal_load_relaxed_fn(signal); + return coreApiTable->hsa_signal_load_relaxed_fn(signal); } hsa_signal_value_t HSA_API hsa_signal_load_acquire(hsa_signal_t signal) { - return HsaApiTable->hsa_signal_load_acquire_fn(signal); + return coreApiTable->hsa_signal_load_acquire_fn(signal); } void HSA_API hsa_signal_store_relaxed(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_store_relaxed_fn(signal, value); + return coreApiTable->hsa_signal_store_relaxed_fn(signal, value); } void HSA_API hsa_signal_store_release(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_store_release_fn(signal, value); + return coreApiTable->hsa_signal_store_release_fn(signal, value); } hsa_signal_value_t HSA_API @@ -282,7 +291,7 @@ hsa_signal_value_t HSA_API hsa_signal_value_t compare_value, uint64_t timeout_hint, hsa_wait_state_t wait_expectancy_hint) { - return HsaApiTable->hsa_signal_wait_relaxed_fn( + return coreApiTable->hsa_signal_wait_relaxed_fn( signal, condition, compare_value, timeout_hint, wait_expectancy_hint); } @@ -292,166 +301,166 @@ hsa_signal_value_t HSA_API hsa_signal_value_t compare_value, uint64_t timeout_hint, hsa_wait_state_t wait_expectancy_hint) { - return HsaApiTable->hsa_signal_wait_acquire_fn( + return coreApiTable->hsa_signal_wait_acquire_fn( signal, condition, compare_value, timeout_hint, wait_expectancy_hint); } void HSA_API hsa_signal_and_relaxed(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_and_relaxed_fn(signal, value); + return coreApiTable->hsa_signal_and_relaxed_fn(signal, value); } void HSA_API hsa_signal_and_acquire(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_and_acquire_fn(signal, value); + return coreApiTable->hsa_signal_and_acquire_fn(signal, value); } void HSA_API hsa_signal_and_release(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_and_release_fn(signal, value); + return coreApiTable->hsa_signal_and_release_fn(signal, value); } void HSA_API hsa_signal_and_acq_rel(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_and_acq_rel_fn(signal, value); + return coreApiTable->hsa_signal_and_acq_rel_fn(signal, value); } void HSA_API hsa_signal_or_relaxed(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_or_relaxed_fn(signal, value); + return coreApiTable->hsa_signal_or_relaxed_fn(signal, value); } void HSA_API hsa_signal_or_acquire(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_or_acquire_fn(signal, value); + return coreApiTable->hsa_signal_or_acquire_fn(signal, value); } void HSA_API hsa_signal_or_release(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_or_release_fn(signal, value); + return coreApiTable->hsa_signal_or_release_fn(signal, value); } void HSA_API hsa_signal_or_acq_rel(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_or_acq_rel_fn(signal, value); + return coreApiTable->hsa_signal_or_acq_rel_fn(signal, value); } void HSA_API hsa_signal_xor_relaxed(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_xor_relaxed_fn(signal, value); + return coreApiTable->hsa_signal_xor_relaxed_fn(signal, value); } void 
HSA_API hsa_signal_xor_acquire(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_xor_acquire_fn(signal, value); + return coreApiTable->hsa_signal_xor_acquire_fn(signal, value); } void HSA_API hsa_signal_xor_release(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_xor_release_fn(signal, value); + return coreApiTable->hsa_signal_xor_release_fn(signal, value); } void HSA_API hsa_signal_xor_acq_rel(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_xor_acq_rel_fn(signal, value); + return coreApiTable->hsa_signal_xor_acq_rel_fn(signal, value); } void HSA_API hsa_signal_add_relaxed(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_add_relaxed_fn(signal, value); + return coreApiTable->hsa_signal_add_relaxed_fn(signal, value); } void HSA_API hsa_signal_add_acquire(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_add_acquire_fn(signal, value); + return coreApiTable->hsa_signal_add_acquire_fn(signal, value); } void HSA_API hsa_signal_add_release(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_add_release_fn(signal, value); + return coreApiTable->hsa_signal_add_release_fn(signal, value); } void HSA_API hsa_signal_add_acq_rel(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_add_acq_rel_fn(signal, value); + return coreApiTable->hsa_signal_add_acq_rel_fn(signal, value); } void HSA_API hsa_signal_subtract_relaxed(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_subtract_relaxed_fn(signal, value); + return coreApiTable->hsa_signal_subtract_relaxed_fn(signal, value); } void HSA_API hsa_signal_subtract_acquire(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_subtract_acquire_fn(signal, value); + return coreApiTable->hsa_signal_subtract_acquire_fn(signal, value); } void HSA_API hsa_signal_subtract_release(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_subtract_release_fn(signal, value); + return coreApiTable->hsa_signal_subtract_release_fn(signal, value); } void HSA_API hsa_signal_subtract_acq_rel(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_subtract_acq_rel_fn(signal, value); + return coreApiTable->hsa_signal_subtract_acq_rel_fn(signal, value); } hsa_signal_value_t HSA_API hsa_signal_exchange_relaxed(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_exchange_relaxed_fn(signal, value); + return coreApiTable->hsa_signal_exchange_relaxed_fn(signal, value); } hsa_signal_value_t HSA_API hsa_signal_exchange_acquire(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_exchange_acquire_fn(signal, value); + return coreApiTable->hsa_signal_exchange_acquire_fn(signal, value); } hsa_signal_value_t HSA_API hsa_signal_exchange_release(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_exchange_release_fn(signal, value); + return coreApiTable->hsa_signal_exchange_release_fn(signal, value); } hsa_signal_value_t HSA_API hsa_signal_exchange_acq_rel(hsa_signal_t signal, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_exchange_acq_rel_fn(signal, value); + return coreApiTable->hsa_signal_exchange_acq_rel_fn(signal, value); } hsa_signal_value_t HSA_API hsa_signal_cas_relaxed(hsa_signal_t signal, hsa_signal_value_t expected, hsa_signal_value_t value) { - return 
HsaApiTable->hsa_signal_cas_relaxed_fn(signal, expected, value); + return coreApiTable->hsa_signal_cas_relaxed_fn(signal, expected, value); } hsa_signal_value_t HSA_API hsa_signal_cas_acquire(hsa_signal_t signal, hsa_signal_value_t expected, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_cas_acquire_fn(signal, expected, value); + return coreApiTable->hsa_signal_cas_acquire_fn(signal, expected, value); } hsa_signal_value_t HSA_API hsa_signal_cas_release(hsa_signal_t signal, hsa_signal_value_t expected, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_cas_release_fn(signal, expected, value); + return coreApiTable->hsa_signal_cas_release_fn(signal, expected, value); } hsa_signal_value_t HSA_API hsa_signal_cas_acq_rel(hsa_signal_t signal, hsa_signal_value_t expected, hsa_signal_value_t value) { - return HsaApiTable->hsa_signal_cas_acq_rel_fn(signal, expected, value); + return coreApiTable->hsa_signal_cas_acq_rel_fn(signal, expected, value); } hsa_status_t hsa_isa_from_name(const char* name, hsa_isa_t* isa) { - return HsaApiTable->hsa_isa_from_name_fn(name, isa); + return coreApiTable->hsa_isa_from_name_fn(name, isa); } hsa_status_t HSA_API hsa_isa_get_info(hsa_isa_t isa, hsa_isa_info_t attribute, uint32_t index, void* value) { - return HsaApiTable->hsa_isa_get_info_fn(isa, attribute, index, value); + return coreApiTable->hsa_isa_get_info_fn(isa, attribute, index, value); } hsa_status_t hsa_isa_compatible(hsa_isa_t code_object_isa, hsa_isa_t agent_isa, bool* result) { - return HsaApiTable->hsa_isa_compatible_fn(code_object_isa, agent_isa, result); + return coreApiTable->hsa_isa_compatible_fn(code_object_isa, agent_isa, result); } hsa_status_t HSA_API hsa_code_object_serialize( @@ -460,7 +469,7 @@ hsa_status_t HSA_API hsa_code_object_serialize( void** address), hsa_callback_data_t callback_data, const char* options, void** serialized_code_object, size_t* serialized_code_object_size) { - return HsaApiTable->hsa_code_object_serialize_fn( + return coreApiTable->hsa_code_object_serialize_fn( code_object, alloc_callback, callback_data, options, serialized_code_object, serialized_code_object_size); } @@ -470,33 +479,33 @@ hsa_status_t HSA_API size_t serialized_code_object_size, const char* options, hsa_code_object_t* code_object) { - return HsaApiTable->hsa_code_object_deserialize_fn( + return coreApiTable->hsa_code_object_deserialize_fn( serialized_code_object, serialized_code_object_size, options, code_object); } hsa_status_t HSA_API hsa_code_object_destroy(hsa_code_object_t code_object) { - return HsaApiTable->hsa_code_object_destroy_fn(code_object); + return coreApiTable->hsa_code_object_destroy_fn(code_object); } hsa_status_t HSA_API hsa_code_object_get_info(hsa_code_object_t code_object, hsa_code_object_info_t attribute, void* value) { - return HsaApiTable->hsa_code_object_get_info_fn(code_object, attribute, + return coreApiTable->hsa_code_object_get_info_fn(code_object, attribute, value); } hsa_status_t HSA_API hsa_code_object_get_symbol(hsa_code_object_t code_object, const char* symbol_name, hsa_code_symbol_t* symbol) { - return HsaApiTable->hsa_code_object_get_symbol_fn(code_object, symbol_name, + return coreApiTable->hsa_code_object_get_symbol_fn(code_object, symbol_name, symbol); } hsa_status_t HSA_API hsa_code_symbol_get_info(hsa_code_symbol_t code_symbol, hsa_code_symbol_info_t attribute, void* value) { - return HsaApiTable->hsa_code_symbol_get_info_fn(code_symbol, attribute, + return coreApiTable->hsa_code_symbol_get_info_fn(code_symbol, attribute, value); } @@ 
-505,7 +514,7 @@ hsa_status_t HSA_API hsa_code_object_iterate_symbols( hsa_status_t (*callback)(hsa_code_object_t code_object, hsa_code_symbol_t symbol, void* data), void* data) { - return HsaApiTable->hsa_code_object_iterate_symbols_fn(code_object, callback, + return coreApiTable->hsa_code_object_iterate_symbols_fn(code_object, callback, data); } @@ -513,12 +522,12 @@ hsa_status_t HSA_API hsa_executable_create(hsa_profile_t profile, hsa_executable_state_t executable_state, const char* options, hsa_executable_t* executable) { - return HsaApiTable->hsa_executable_create_fn(profile, executable_state, + return coreApiTable->hsa_executable_create_fn(profile, executable_state, options, executable); } hsa_status_t HSA_API hsa_executable_destroy(hsa_executable_t executable) { - return HsaApiTable->hsa_executable_destroy_fn(executable); + return coreApiTable->hsa_executable_destroy_fn(executable); } hsa_status_t HSA_API @@ -526,26 +535,26 @@ hsa_status_t HSA_API hsa_agent_t agent, hsa_code_object_t code_object, const char* options) { - return HsaApiTable->hsa_executable_load_code_object_fn(executable, agent, + return coreApiTable->hsa_executable_load_code_object_fn(executable, agent, code_object, options); } hsa_status_t HSA_API hsa_executable_freeze(hsa_executable_t executable, const char* options) { - return HsaApiTable->hsa_executable_freeze_fn(executable, options); + return coreApiTable->hsa_executable_freeze_fn(executable, options); } hsa_status_t HSA_API hsa_executable_get_info(hsa_executable_t executable, hsa_executable_info_t attribute, void* value) { - return HsaApiTable->hsa_executable_get_info_fn(executable, attribute, value); + return coreApiTable->hsa_executable_get_info_fn(executable, attribute, value); } hsa_status_t HSA_API hsa_executable_global_variable_define(hsa_executable_t executable, const char* variable_name, void* address) { - return HsaApiTable->hsa_executable_global_variable_define_fn( + return coreApiTable->hsa_executable_global_variable_define_fn( executable, variable_name, address); } @@ -554,7 +563,7 @@ hsa_status_t HSA_API hsa_agent_t agent, const char* variable_name, void* address) { - return HsaApiTable->hsa_executable_agent_global_variable_define_fn( + return coreApiTable->hsa_executable_agent_global_variable_define_fn( executable, agent, variable_name, address); } @@ -563,13 +572,13 @@ hsa_status_t HSA_API hsa_agent_t agent, const char* variable_name, void* address) { - return HsaApiTable->hsa_executable_readonly_variable_define_fn( + return coreApiTable->hsa_executable_readonly_variable_define_fn( executable, agent, variable_name, address); } hsa_status_t HSA_API hsa_executable_validate(hsa_executable_t executable, uint32_t* result) { - return HsaApiTable->hsa_executable_validate_fn(executable, result); + return coreApiTable->hsa_executable_validate_fn(executable, result); } hsa_status_t HSA_API @@ -577,7 +586,7 @@ hsa_status_t HSA_API const char* module_name, const char* symbol_name, hsa_agent_t agent, int32_t call_convention, hsa_executable_symbol_t* symbol) { - return HsaApiTable->hsa_executable_get_symbol_fn( + return coreApiTable->hsa_executable_get_symbol_fn( executable, module_name, symbol_name, agent, call_convention, symbol); } @@ -585,7 +594,7 @@ hsa_status_t HSA_API hsa_executable_symbol_get_info(hsa_executable_symbol_t executable_symbol, hsa_executable_symbol_info_t attribute, void* value) { - return HsaApiTable->hsa_executable_symbol_get_info_fn(executable_symbol, + return coreApiTable->hsa_executable_symbol_get_info_fn(executable_symbol, attribute, 
value); } @@ -594,11 +603,227 @@ hsa_status_t HSA_API hsa_executable_iterate_symbols( hsa_status_t (*callback)(hsa_executable_t executable, hsa_executable_symbol_t symbol, void* data), void* data) { - return HsaApiTable->hsa_executable_iterate_symbols_fn(executable, callback, + return coreApiTable->hsa_executable_iterate_symbols_fn(executable, callback, data); } hsa_status_t HSA_API hsa_status_string(hsa_status_t status, const char** status_string) { - return HsaApiTable->hsa_status_string_fn(status, status_string); + return coreApiTable->hsa_status_string_fn(status, status_string); +} + +/* + * Following set of functions are bundled as AMD Extension Apis + */ + +// Pass through stub functions +hsa_status_t HSA_API hsa_amd_coherency_get_type(hsa_agent_t agent, + hsa_amd_coherency_type_t* type) { + return amdExtTable->hsa_amd_coherency_get_type_fn(agent, type); } + +// Pass through stub functions +hsa_status_t HSA_API hsa_amd_coherency_set_type(hsa_agent_t agent, + hsa_amd_coherency_type_t type) { + return amdExtTable->hsa_amd_coherency_set_type_fn(agent, type); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable) { + return amdExtTable->hsa_amd_profiling_set_profiler_enabled_fn( + queue, enable); +} + +hsa_status_t HSA_API + hsa_amd_profiling_async_copy_enable(bool enable) { + return amdExtTable->hsa_amd_profiling_async_copy_enable_fn(enable); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time( + hsa_agent_t agent, hsa_signal_t signal, + hsa_amd_profiling_dispatch_time_t* time) { + return amdExtTable->hsa_amd_profiling_get_dispatch_time_fn( + agent, signal, time); +} + +hsa_status_t HSA_API + hsa_amd_profiling_get_async_copy_time( + hsa_signal_t hsa_signal, hsa_amd_profiling_async_copy_time_t* time) { + return amdExtTable->hsa_amd_profiling_get_async_copy_time_fn(hsa_signal, time); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_profiling_convert_tick_to_system_domain(hsa_agent_t agent, + uint64_t agent_tick, + uint64_t* system_tick) { + return amdExtTable->hsa_amd_profiling_convert_tick_to_system_domain_fn( + agent, agent_tick, system_tick); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_signal_async_handler(hsa_signal_t signal, + hsa_signal_condition_t cond, + hsa_signal_value_t value, + hsa_amd_signal_handler handler, void* arg) { + return amdExtTable->hsa_amd_signal_async_handler_fn( + signal, cond, value, handler, arg); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_async_function(void (*callback)(void* arg), void* arg) { + return amdExtTable->hsa_amd_async_function_fn(callback, arg); +} + +// Mirrors Amd Extension Apis +uint32_t HSA_API + hsa_amd_signal_wait_any(uint32_t signal_count, hsa_signal_t* signals, + hsa_signal_condition_t* conds, + hsa_signal_value_t* values, uint64_t timeout_hint, + hsa_wait_state_t wait_hint, + hsa_signal_value_t* satisfying_value) { + return amdExtTable->hsa_amd_signal_wait_any_fn( + signal_count, signals, + conds, values, timeout_hint, + wait_hint, satisfying_value); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, + uint32_t num_cu_mask_count, + const uint32_t* cu_mask) { + return amdExtTable->hsa_amd_queue_cu_set_mask_fn( + queue, num_cu_mask_count, cu_mask); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool, + hsa_amd_memory_pool_info_t 
attribute, + void* value) { + return amdExtTable->hsa_amd_memory_pool_get_info_fn( + memory_pool, attribute, value); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void* data), + void* data) { + return amdExtTable->hsa_amd_agent_iterate_memory_pools_fn( + agent, callback, data); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, size_t size, + uint32_t flags, void** ptr) { + return amdExtTable->hsa_amd_memory_pool_allocate_fn( + memory_pool, size, flags, ptr); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr) { + return amdExtTable->hsa_amd_memory_pool_free_fn(ptr); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent, const void* src, + hsa_agent_t src_agent, size_t size, + uint32_t num_dep_signals, + const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal) { + return amdExtTable->hsa_amd_memory_async_copy_fn( + dst, dst_agent, src, src_agent, size, + num_dep_signals, dep_signals, completion_signal); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info( + hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool, + hsa_amd_agent_memory_pool_info_t attribute, void* value) { + return amdExtTable->hsa_amd_agent_memory_pool_get_info_fn( + agent, memory_pool, attribute, value); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_agents_allow_access(uint32_t num_agents, const hsa_agent_t* agents, + const uint32_t* flags, const void* ptr) { + return amdExtTable->hsa_amd_agents_allow_access_fn( + num_agents, agents, flags, ptr); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_memory_pool_can_migrate(hsa_amd_memory_pool_t src_memory_pool, + hsa_amd_memory_pool_t dst_memory_pool, + bool* result) { + return amdExtTable->hsa_amd_memory_pool_can_migrate_fn( + src_memory_pool, dst_memory_pool, result); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_memory_migrate(const void* ptr, + hsa_amd_memory_pool_t memory_pool, + uint32_t flags) { + return amdExtTable->hsa_amd_memory_migrate_fn( + ptr, memory_pool, flags); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size, + hsa_agent_t* agents, int num_agent, + void** agent_ptr) { + return amdExtTable->hsa_amd_memory_lock_fn( + host_ptr, size, agents, num_agent, agent_ptr); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr) { + return amdExtTable->hsa_amd_memory_unlock_fn(host_ptr); + +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_memory_fill(void* ptr, uint32_t value, size_t count) { + return amdExtTable->hsa_amd_memory_fill_fn(ptr, value, count); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_interop_map_buffer(uint32_t num_agents, + hsa_agent_t* agents, + int interop_handle, + uint32_t flags, + size_t* size, + void** ptr, + size_t* metadata_size, + const void** metadata) { + return amdExtTable->hsa_amd_interop_map_buffer_fn( + num_agents, agents, interop_handle, + flags, size, ptr, metadata_size, metadata); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_interop_unmap_buffer(void* ptr) { + return amdExtTable->hsa_amd_interop_unmap_buffer_fn(ptr); +} + +// Use the function pointer from local instance 
Image Extension +hsa_status_t HSA_API hsa_amd_image_create( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const hsa_amd_image_descriptor_t *image_layout, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_t *image) { + return amdExtTable->hsa_amd_image_create_fn(agent, image_descriptor, + image_layout, image_data, access_permission, image); +} + diff --git a/src/core/common/shared.h b/src/core/common/shared.h index 76720bd79..fdf89b625 100644 --- a/src/core/common/shared.h +++ b/src/core/common/shared.h @@ -82,6 +82,8 @@ class Shared : public BaseShared { assert(shared_object_ != NULL && "Failed on allocating shared_object_"); + memset(shared_object_, 0, sizeof(T)); + if (shared_object_ != NULL) new (shared_object_) T; } diff --git a/src/core/hsacore.so.def b/src/core/hsacore.so.def index 4e04a8e2a..130dd0875 100644 --- a/src/core/hsacore.so.def +++ b/src/core/hsacore.so.def @@ -107,6 +107,8 @@ global: hsa_amd_coherency_set_type; hsa_amd_profiling_set_profiler_enabled; hsa_amd_profiling_get_dispatch_time; + hsa_amd_profiling_async_copy_enable; + hsa_amd_profiling_get_async_copy_time; hsa_amd_profiling_convert_tick_to_system_domain; hsa_amd_signal_wait_any; hsa_amd_signal_async_handler; diff --git a/src/core/inc/agent.h b/src/core/inc/agent.h index abd7acf84..41867eb3c 100644 --- a/src/core/inc/agent.h +++ b/src/core/inc/agent.h @@ -107,7 +107,9 @@ class Agent : public Checked<0xF6BC25EB17E6F917> { // // @param [in] type CPU or GPU or other. explicit Agent(uint32_t node_id, DeviceType type) - : node_id_(node_id), device_type_(uint32_t(type)) { + : node_id_(node_id), + device_type_(uint32_t(type)), + profiling_enabled_(false) { public_handle_ = Convert(this); } @@ -115,7 +117,7 @@ class Agent : public Checked<0xF6BC25EB17E6F917> { // // @param [in] type CPU or GPU or other. explicit Agent(uint32_t node_id, uint32_t type) - : node_id_(node_id), device_type_(type) { + : node_id_(node_id), device_type_(type), profiling_enabled_(false) { public_handle_ = Convert(this); } @@ -240,6 +242,19 @@ class Agent : public Checked<0xF6BC25EB17E6F917> { // @brief Returns node id associated with this agent. __forceinline uint32_t node_id() const { return node_id_; } + // @brief Getter for profiling_enabled_. + __forceinline bool profiling_enabled() const { return profiling_enabled_; } + + // @brief Setter for profiling_enabled_. + virtual hsa_status_t profiling_enabled(bool enable) { + const hsa_status_t stat = EnableDmaProfiling(enable); + if (HSA_STATUS_SUCCESS == stat) { + profiling_enabled_ = enable; + } + + return stat; + } + protected: // Intention here is to have a polymorphic update procedure for public_handle_ // which is callable on any Agent* but only from some class dervied from @@ -254,6 +269,17 @@ class Agent : public Checked<0xF6BC25EB17E6F917> { public_handle_ = handle; } + // @brief Enable profiling of the asynchronous DMA copy. The timestamp + // of each copy request will be stored in the completion signal structure. + // + // @param enable True to enable profiling. False to disable profiling. + // + // @retval HSA_STATUS_SUCCESS The profiling is enabled and the + // timing of subsequent async copy will be measured. 
+ virtual hsa_status_t EnableDmaProfiling(bool enable) { + return HSA_STATUS_SUCCESS; + } + hsa_agent_t public_handle_; private: @@ -262,6 +288,8 @@ class Agent : public Checked<0xF6BC25EB17E6F917> { const uint32_t device_type_; + bool profiling_enabled_; + // Forbid copying and moving of this object DISALLOW_COPY_AND_ASSIGN(Agent); }; diff --git a/src/core/inc/amd_aql_queue.h b/src/core/inc/amd_aql_queue.h index 25cb252f8..80c462977 100644 --- a/src/core/inc/amd_aql_queue.h +++ b/src/core/inc/amd_aql_queue.h @@ -358,6 +358,10 @@ class AqlQueue : public core::Queue, public core::Signal { static bool DynamicScratchHandler(hsa_signal_value_t error_code, void* arg); + /// @brief Define the Scratch Buffer Descriptor and related parameters + /// that enable kernel access scratch memory + void InitScratchSRD(); + // AQL packet ring buffer void* ring_buf_; @@ -380,8 +384,6 @@ class AqlQueue : public core::Queue, public core::Signal { // Handle of agent, which queue is attached to GpuAgent* agent_; - hsa_profile_t agent_profile_; - uint32_t queue_full_workaround_; // Handle of scratch memory descriptor diff --git a/src/core/inc/amd_blit_kernel.h b/src/core/inc/amd_blit_kernel.h index a7b0a58f8..f6f7b27e2 100644 --- a/src/core/inc/amd_blit_kernel.h +++ b/src/core/inc/amd_blit_kernel.h @@ -43,6 +43,7 @@ #ifndef HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_H_ #define HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_H_ +#include <map> #include <stdint.h> #include "core/inc/blit.h" @@ -66,8 +67,10 @@ class BlitKernel : public core::Blit { /// /// @note: The call will block until all AQL packets have been executed. /// + /// @param agent Agent passed to Initialize. + /// /// @return hsa_status_t - virtual hsa_status_t Destroy() override; + virtual hsa_status_t Destroy(const core::Agent& agent) override; /// @brief Submit an AQL packet to perform vector copy. The call is blocking /// until the command execution is finished. @@ -102,19 +105,40 @@ class BlitKernel : public core::Blit { virtual hsa_status_t SubmitLinearFillCommand(void* ptr, uint32_t value, size_t count) override; + virtual hsa_status_t EnableProfiling(bool enable) override; + private: union KernelArgs { - struct __ALIGNED__(16) KernelCopyArgs { - const void* src; - void* dst; - uint64_t size; - uint32_t use_vector; - } copy; - - struct __ALIGNED__(16) KernelFillArgs { - void* ptr; - uint64_t num; - uint32_t value; + struct __ALIGNED__(16) { + uint64_t phase1_src_start; + uint64_t phase1_dst_start; + uint64_t phase2_src_start; + uint64_t phase2_dst_start; + uint64_t phase3_src_start; + uint64_t phase3_dst_start; + uint64_t phase4_src_start; + uint64_t phase4_dst_start; + uint64_t phase4_src_end; + uint64_t phase4_dst_end; + uint32_t num_workitems; + } copy_aligned; + + struct __ALIGNED__(16) { + uint64_t phase1_src_start; + uint64_t phase1_dst_start; + uint64_t phase2_src_start; + uint64_t phase2_dst_start; + uint64_t phase2_src_end; + uint64_t phase2_dst_end; + uint32_t num_workitems; + } copy_misaligned; + + struct __ALIGNED__(16) { + uint64_t phase1_dst_start; + uint64_t phase2_dst_start; + uint64_t phase2_dst_end; + uint32_t fill_value; + uint32_t num_workitems; } fill; }; @@ -136,14 +160,19 @@ class BlitKernel : public core::Blit { KernelArgs* ObtainAsyncKernelCopyArg(); - /// Handles to the vector copy kernel. - uint64_t copy_code_handle_; + /// AQL code object and size for each kernel. + enum class KernelType { + CopyAligned, + CopyMisaligned, + Fill, + }; - /// Handles to the vector copy aligned kernel. 
- uint64_t copy_aligned_code_handle_; + struct KernelCode { + void* code_buf_; + size_t code_buf_size_; + }; - /// Handles to the fill memory kernel. - uint64_t fill_code_handle_; + std::map<KernelType, KernelCode> kernels_; /// AQL queue for submitting the vector copy kernel. hsa_queue_t* queue_; @@ -163,12 +192,8 @@ class BlitKernel : public core::Blit { /// Lock to synchronize access to kernarg_ and completion_signal_ std::mutex lock_; - /// Pointer to memory containing the ISA and argument buffer. - void* code_arg_buffer_; - - static const size_t kMaxCopyCount; - static const size_t kMaxFillCount; - static const uint32_t kGroupSize; + /// Number of CUs on the underlying agent. + int num_cus_; }; } // namespace amd diff --git a/src/core/inc/amd_blit_kernel_kv.h b/src/core/inc/amd_blit_kernel_kv.h deleted file mode 100644 index a8e235ea7..000000000 --- a/src/core/inc/amd_blit_kernel_kv.h +++ /dev/null @@ -1,479 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// -// The University of Illinois/NCSA -// Open Source License (NCSA) -// -// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// -// Developed by: -// -// AMD Research and AMD HSA Software Development -// -// Advanced Micro Devices, Inc. -// -// www.amd.com -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to -// deal with the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimers. -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimers in -// the documentation and/or other materials provided with the distribution. -// - Neither the names of Advanced Micro Devices, Inc, -// nor the names of its contributors may be used to endorse or promote -// products derived from this Software without specific prior written -// permission. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -// DEALINGS WITH THE SOFTWARE. -// -//////////////////////////////////////////////////////////////////////////////// - -#ifndef HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_KV_H_ -#define HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_KV_H_ - -#include <stddef.h> - -#define HSA_VECTOR_COPY_KV_AKC_SIZE 368 -#define HSA_VECTOR_COPY_KV_AKC_OFFSET 256 - -/*****HSAIL code of the ISA in ::kVectorCopyRawKv. 
-module &m:1:0:$full:$large:$default; - -prog kernel &__vector_copy_kernel( - kernarg_u64 %src, - kernarg_u64 %dst, - kernarg_u64 %size) -{ - @__vector_copy_kernel_entry: - // BB#0: // %entry - workitemabsid_u32 $s0, 0; - cvt_u64_u32 $d0, $s0; - ld_kernarg_align(8)_width(all)_u64 $d1, [%size]; - cmp_ge_b1_u64 $c0, $d0, $d1; - cbr_b1 $c0, @BB0_2; - // BB#1: // %if.end - ld_kernarg_align(8)_width(all)_u64 $d1, [%src]; - ld_kernarg_align(8)_width(all)_u64 $d2, [%dst]; - add_u64 $d2, $d2, $d0; - add_u64 $d0, $d1, $d0; - ld_global_u8 $s0, [$d0]; - st_global_u8 $s0, [$d2]; - - @BB0_2: - // %return - ret; -}; -*/ - -static char kVectorCopyRawKv[] = { - 127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, - 0, -104, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3, - 0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 112, 1, 0, 0, 0, 0, 0, 0, - 112, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 65, 0, -116, 0, -112, 0, 0, 0, - 11, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 11, 0, 5, 0, 5, 0, 0, 0, 9, 0, 0, - 0, 0, 0, 0, 0, 3, 0, 0, 6, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 1, 5, 0, -64, 127, 0, -116, -65, - 0, -1, -128, -109, 0, 0, 16, 0, 0, 8, 0, -109, 0, - 0, 0, 74, 4, 7, 64, -64, -128, 2, 2, 126, 127, 0, - -116, -65, 0, 0, -56, 125, 106, 36, -128, -66, 15, 0, -120, - -65, 0, 7, -126, -64, 127, 0, -116, -65, 4, 0, 2, 74, - 5, 2, 4, 126, 2, 106, 80, -46, 2, 1, -87, 1, 0, - 0, 32, -36, 1, 0, 0, 1, 6, 0, 6, 74, 7, 2, - 4, 126, 4, 106, 80, -46, 2, 1, -87, 1, 112, 0, -116, - -65, 0, 0, 96, -36, 3, 1, 0, 0, 0, 0, -127, -65, - 3, 0, 0, 0, 8, 0, 0, 0, 1, 0, 0, 0, 65, - 77, 68, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, - 0, 0, 12, 0, 0, 0, 2, 0, 0, 0, 65, 77, 68, - 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, - 3, 0, 0, 0, 28, 0, 0, 0, 3, 0, 0, 0, 65, - 77, 68, 0, 4, 0, 7, 0, 7, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 65, 77, 68, 0, 65, 77, 68, - 71, 80, 85, 0, 0, 3, 0, 0, 0, 40, 0, 0, 0, - 4, 0, 0, 0, 65, 77, 68, 0, 26, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 65, 77, 68, 32, 72, 83, - 65, 32, 82, 117, 110, 116, 105, 109, 101, 32, 70, 105, 110, - 97, 108, 105, 122, 101, 114, 0, 0, 0, 38, 95, 95, 118, - 101, 99, 116, 111, 114, 95, 99, 111, 112, 121, 95, 107, 101, - 114, 110, 101, 108, 0, 95, 95, 104, 115, 97, 95, 115, 101, - 99, 116, 105, 111, 110, 46, 104, 115, 97, 116, 101, 120, 116, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 26, 0, 1, 0, - 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 22, 0, 0, 0, 3, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 46, 104, 115, 97, 116, 101, 120, 116, 0, 46, 110, - 111, 116, 101, 0, 46, 115, 116, 114, 116, 97, 98, 0, 46, - 115, 121, 109, 116, 97, 98, 0, 46, 115, 104, 115, 116, 114, - 116, 97, 98, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 7, 0, -64, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 112, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 7, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 112, 2, 0, 0, 0, 0, 0, - 0, -104, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 3, 0, - 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 8, 3, 0, 0, 0, 0, 0, 0, - 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 2, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 56, 3, 0, 0, 0, 0, 0, 0, 48, - 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, - 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, - 0, 0, 0, 0, 0, 32, 0, 0, 0, 3, 0, 0, 0, - 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 104, 3, 0, 0, 0, 0, 0, 0, 42, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, -}; -extern char* const kVectorCopyKvObject = &kVectorCopyRawKv[0]; -extern size_t const kVectorCopyKvObjectSize = sizeof(kVectorCopyRawKv); - -#define HSA_VECTOR_COPY_ALIGNED_KV_AKC_SIZE 436 -#define HSA_VECTOR_COPY_ALIGNED_KV_AKC_OFFSET 256 - -/*****HSAIL code of the ISA in ::kVectorCopyAlignedRawKv. 
-module &m:1:0:$full:$large:$default; -extension "amd:gcn"; - -prog kernel &__copy_buffer_aligned_kernel( - kernarg_u64 %src, - kernarg_u64 %dst, - kernarg_u64 %size, - kernarg_u32 %use_vector) -{ - @__copy_buffer_aligned_kernel_entry: - // BB#0: // %entry - workitemabsid_u32 $s0, 0; - cvt_u64_u32 $d0, $s0; - ld_kernarg_align(8)_width(all)_u64 $d1, [%size]; - cmp_ge_b1_u64 $c0, $d0, $d1; - cbr_b1 $c0, @LBB0_4; - // BB#1: // %if.end - ld_kernarg_align(8)_width(all)_u64 $d2, [%dst]; - ld_kernarg_align(8)_width(all)_u64 $d1, [%src]; - ld_kernarg_align(4)_width(all)_u32 $s0, [%use_vector]; - cmp_ne_b1_s32 $c0, $s0, 1; - cbr_b1 $c0, @LBB0_3; - // BB#2: // %if.then2 - shl_u64 $d0, $d0, 4; - add_u64 $d2, $d2, $d0; - add_u64 $d0, $d1, $d0; - ld_v4_global_align(16)_const_u32 ($s0, $s1, $s2, $s3), [$d0]; - st_v4_global_align(16)_u32 ($s0, $s1, $s2, $s3), [$d2]; - br @LBB0_4; - - @LBB0_3: - // %if.else - shl_u64 $d0, $d0, 2; - add_u64 $d2, $d2, $d0; - add_u64 $d0, $d1, $d0; - ld_global_align(4)_const_u32 $s0, [$d0]; - st_global_align(4)_u32 $s0, [$d2]; - - @LBB0_4: - // %if.end6 - ret; -}; -*/ - -static char kVectorCopyAlignedRawKv[] = { - 127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, - 0, -8, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3, - 0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, -76, 1, 0, 0, 0, 0, 0, 0, - -76, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 65, 0, -84, 0, -112, 0, 0, 0, - 11, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 11, 0, 7, 0, 7, 0, 0, 0, 9, 0, 0, - 0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 1, 5, 0, -64, 127, 0, -116, -65, - 0, -1, -128, -109, 0, 0, 16, 0, 0, 8, 0, -109, 0, - 0, 0, 74, 4, 7, 64, -64, -128, 2, 2, 126, 127, 0, - -116, -65, 0, 0, -56, 125, 106, 36, -128, -66, 32, 0, -120, - -65, 6, 7, 1, -64, 0, 7, -126, -64, 127, 0, -116, -65, - 2, -127, 0, -65, 14, 0, -124, -65, 0, 0, -62, -46, 0, - 9, 1, 0, 4, 0, 4, 74, 5, 2, 6, 126, 3, 3, - 6, 80, 0, 0, 56, -36, 2, 0, 0, 2, 6, 0, 0, - 74, 7, 2, 12, 126, 6, 3, 2, 80, 112, 0, -116, -65, - 0, 0, 120, -36, 0, 2, 0, 0, 13, 0, -126, -65, 0, - 0, -62, -46, 0, 5, 1, 0, 4, 0, 4, 74, 5, 2, - 6, 126, 3, 3, 6, 80, 0, 0, 48, -36, 2, 0, 0, - 2, 6, 0, 0, 74, 7, 2, 6, 126, 3, 3, 2, 80, - 112, 0, -116, -65, 0, 0, 112, -36, 0, 2, 0, 0, 0, - 0, -127, -65, 0, 0, 0, 0, 4, 0, 0, 0, 8, 0, - 0, 0, 1, 0, 0, 0, 65, 
77, 68, 0, 1, 0, 0, - 0, 0, 0, 0, 0, 4, 0, 0, 0, 12, 0, 0, 0, - 2, 0, 0, 0, 65, 77, 68, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 1, 1, 1, 0, 4, 0, 0, 0, 25, 0, - 0, 0, 5, 0, 0, 0, 65, 77, 68, 0, 22, 0, 45, - 104, 115, 97, 95, 99, 97, 108, 108, 95, 99, 111, 110, 118, - 101, 110, 116, 105, 111, 110, 61, 0, 0, 0, 0, 0, 4, - 0, 0, 0, 30, 0, 0, 0, 3, 0, 0, 0, 65, 77, - 68, 0, 4, 0, 7, 0, 7, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 65, 77, 68, 0, 65, 77, 68, 71, - 80, 85, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 8, - 0, 0, 0, 4, 0, 0, 0, 65, 77, 68, 0, -32, 101, - -118, -12, -1, 127, 0, 0, 38, 95, 95, 99, 111, 112, 121, - 95, 98, 117, 102, 102, 101, 114, 95, 97, 108, 105, 103, 110, - 101, 100, 95, 107, 101, 114, 110, 101, 108, 0, 95, 95, 104, - 115, 97, 95, 115, 101, 99, 116, 105, 111, 110, 46, 104, 115, - 97, 116, 101, 120, 116, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 26, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -76, 1, 0, 0, 0, 0, 0, 0, 30, 0, 0, 0, 3, - 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 46, 104, 115, 97, 116, 101, - 120, 116, 0, 46, 110, 111, 116, 101, 0, 46, 115, 116, 114, - 116, 97, 98, 0, 46, 115, 121, 109, 116, 97, 98, 0, 46, - 115, 104, 115, 116, 114, 116, 97, 98, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 0, 0, 0, 1, 0, 0, 0, 7, 0, -64, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, -76, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 10, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -72, 2, - 0, 0, 0, 0, 0, 0, -88, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, - 0, 0, 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 96, 3, 0, - 0, 0, 0, 0, 0, 52, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, - 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, -104, 3, 0, 0, - 0, 0, 0, 0, 48, 0, 0, 0, 0, 0, 0, 0, 3, - 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, - 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, - 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, -56, 3, 0, 0, 0, - 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; -extern char* const kVectorCopyAlignedKvObject = &kVectorCopyAlignedRawKv[0]; -extern size_t const kVectorCopyAlignedKvObjectSize = - sizeof(kVectorCopyAlignedRawKv); - -#define HSA_FILL_MEMORY_KV_AKC_SIZE 352 -#define HSA_FILL_MEMORY_KV_AKC_OFFSET 256 - -/*****HSAIL code of the ISA in ::kFillMemoryRawKv. 
-module &m:1:0:$full:$large:$default; -extension "amd:gcn"; - -prog kernel &__fill_memory_kernel( -kernarg_u64 %ptr, -kernarg_u64 %num, -kernarg_u32 %value) -{ -@__fill_memory_kernel_entry: -// BB#0: // %entry -workitemabsid_u32 $s0, 0; -cvt_u64_u32 $d0, $s0; -ld_kernarg_align(8)_width(all)_u64 $d1, [%num]; -cmp_ge_b1_u64 $c0, $d0, $d1; -cbr_b1 $c0, @LBB0_2; -// BB#1: // %if.end -ld_kernarg_align(8)_width(all)_u64 $d1, [%ptr]; -ld_kernarg_align(4)_width(all)_u32 $s0, [%value]; -shl_u64 $d0, $d0, 2; -add_u64 $d0, $d1, $d0; -st_global_align(4)_u32 $s0, [$d0]; - -@LBB0_2: -// %return -ret; -}; -*/ - -static char kFillMemoryRawKv[] = { - 127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, -104, 3, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 0, 56, 0, - 1, 0, 64, 0, 6, 0, 5, 0, 3, 0, 0, 96, 6, 0, - 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 96, 1, - 0, 0, 0, 0, 0, 0, 96, 1, 0, 0, 0, 0, 0, 0, - 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 0, -84, 0, - -112, 0, 0, 0, 11, 0, 10, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 11, 0, 3, 0, 3, 0, 0, 0, 9, 0, - 0, 0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 0, -64, 127, 0, - -116, -65, 0, -1, -128, -109, 0, 0, 16, 0, 0, 8, 0, -109, - 0, 0, 0, 74, 2, 7, 64, -64, -128, 2, 2, 126, 127, 0, - -116, -65, 0, 0, -56, 125, 106, 36, -128, -66, 11, 0, -120, -65, - 0, 7, 65, -64, 4, 7, 2, -64, 0, 0, -62, -46, 0, 5, - 1, 0, 127, 0, -116, -65, 2, 0, 0, 74, 3, 2, 4, 126, - 2, 3, 2, 80, 4, 2, 4, 126, 0, 0, 112, -36, 0, 2, - 0, 0, 0, 0, -127, -65, 4, 0, 0, 0, 8, 0, 0, 0, - 1, 0, 0, 0, 65, 77, 68, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 4, 0, 0, 0, 12, 0, 0, 0, 2, 0, 0, 0, - 65, 77, 68, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, - 1, 0, 4, 0, 0, 0, 25, 0, 0, 0, 5, 0, 0, 0, - 65, 77, 68, 0, 22, 0, 45, 104, 115, 97, 95, 99, 97, 108, - 108, 95, 99, 111, 110, 118, 101, 110, 116, 105, 111, 110, 61, 0, - 0, 0, 0, 0, 4, 0, 0, 0, 30, 0, 0, 0, 3, 0, - 0, 0, 65, 77, 68, 0, 4, 0, 7, 0, 7, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 65, 77, 68, 0, 65, 77, - 68, 71, 80, 85, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, - 8, 0, 0, 0, 4, 0, 0, 0, 65, 77, 68, 0, 48, 123, - 44, -103, -4, 127, 0, 0, 38, 95, 95, 102, 105, 108, 108, 95, - 109, 101, 109, 111, 114, 121, 95, 107, 101, 114, 110, 101, 108, 0, - 95, 95, 104, 115, 97, 95, 115, 101, 99, 116, 105, 111, 110, 46, - 104, 115, 97, 116, 101, 120, 116, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 26, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 96, 1, 0, 0, 
0, 0, 0, 0, 22, 0, 0, 0, 3, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 46, 104, 115, 97, 116, 101, 120, 116, 0, - 46, 110, 111, 116, 101, 0, 46, 115, 116, 114, 116, 97, 98, 0, - 46, 115, 121, 109, 116, 97, 98, 0, 46, 115, 104, 115, 116, 114, - 116, 97, 98, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 7, 0, - -64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 0, 0, 0, 0, 0, 0, 96, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, - 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 96, 2, 0, 0, 0, 0, - 0, 0, -88, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 3, 0, 0, 0, - 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 8, 3, 0, 0, 0, 0, 0, 0, 44, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 24, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56, 3, 0, 0, - 0, 0, 0, 0, 48, 0, 0, 0, 0, 0, 0, 0, 3, 0, - 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, - 24, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 3, 0, - 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 104, 3, 0, 0, 0, 0, 0, 0, 42, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, -}; - -extern char* const kFillMemoryKvObject = &kFillMemoryRawKv[0]; -extern size_t const kFillMemoryKvObjectSize = sizeof(kFillMemoryRawKv); -#endif // header guard \ No newline at end of file diff --git a/src/core/inc/amd_blit_kernel_vi.h b/src/core/inc/amd_blit_kernel_vi.h deleted file mode 100644 index 13969370b..000000000 --- a/src/core/inc/amd_blit_kernel_vi.h +++ /dev/null @@ -1,490 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// -// The University of Illinois/NCSA -// Open Source License (NCSA) -// -// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// -// Developed by: -// -// AMD Research and AMD HSA Software Development -// -// Advanced Micro Devices, Inc. -// -// www.amd.com -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to -// deal with the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimers. -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimers in -// the documentation and/or other materials provided with the distribution. -// - Neither the names of Advanced Micro Devices, Inc, -// nor the names of its contributors may be used to endorse or promote -// products derived from this Software without specific prior written -// permission. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -// DEALINGS WITH THE SOFTWARE. -// -//////////////////////////////////////////////////////////////////////////////// - -#ifndef HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_VI_H_ -#define HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_VI_H_ - -#include <stddef.h> - -#define HSA_VECTOR_COPY_VI_AKC_SIZE 380 -#define HSA_VECTOR_COPY_VI_AKC_OFFSET 256 - -/*****HSAIL code of the ISA in ::kVectorCopyRawVi. -module &m:1:0:$full:$large:$default; - -prog kernel &__vector_copy_kernel( - kernarg_u64 %src, - kernarg_u64 %dst, - kernarg_u64 %size) -{ - @__vector_copy_kernel_entry: - // BB#0: // %entry - workitemabsid_u32 $s0, 0; - cvt_u64_u32 $d0, $s0; - ld_kernarg_align(8)_width(all)_u64 $d1, [%size]; - cmp_ge_b1_u64 $c0, $d0, $d1; - cbr_b1 $c0, @BB0_2; - // BB#1: // %if.end - ld_kernarg_align(8)_width(all)_u64 $d1, [%src]; - ld_kernarg_align(8)_width(all)_u64 $d2, [%dst]; - add_u64 $d2, $d2, $d0; - add_u64 $d0, $d1, $d0; - ld_global_u8 $s0, [$d0]; - st_global_u8 $s0, [$d2]; - - @BB0_2: - // %return - ret; -}; -*/ - -static char kVectorCopyRawVi[] = { - 127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, - 0, -72, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3, - 0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 124, 1, 0, 0, 0, 0, 0, 0, - 124, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, -63, 2, -84, 0, -112, 0, 0, 0, - 11, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 96, 0, 5, 0, 5, 0, 0, 0, 9, 0, 0, - 0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 2, 0, 2, -64, 4, 0, 0, 0, - 127, 0, -116, -65, 0, -1, -128, -110, 0, 0, 16, 0, 0, - 8, 0, -110, 0, 0, 0, 50, 3, 0, 6, -64, 16, 0, - 0, 0, -128, 2, 2, 126, 127, 0, -116, -65, 0, 0, -40, - 125, 106, 32, -128, -66, 16, 0, -120, -65, 3, 1, 10, -64, - 0, 0, 0, 0, 127, 0, -116, -65, 4, 0, 2, 50, 5, - 2, 4, 126, 2, 106, 28, -47, 2, 1, -87, 1, 0, 0, - 64, -36, 1, 0, 0, 1, 6, 0, 6, 50, 7, 2, 4, - 126, 4, 106, 28, -47, 2, 1, -87, 1, 112, 0, -116, -65, - 0, 0, 96, -36, 3, 1, 0, 0, 0, 0, -127, -65, 0, - 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0, 1, 0, - 0, 0, 65, 77, 68, 0, 1, 0, 0, 0, 0, 0, 0, - 0, 4, 0, 0, 0, 12, 0, 0, 0, 2, 0, 0, 
0, - 65, 77, 68, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, - 1, 1, 0, 4, 0, 0, 0, 25, 0, 0, 0, 5, 0, - 0, 0, 65, 77, 68, 0, 22, 0, 45, 104, 115, 97, 95, - 99, 97, 108, 108, 95, 99, 111, 110, 118, 101, 110, 116, 105, - 111, 110, 61, 0, 0, 0, 0, 0, 4, 0, 0, 0, 30, - 0, 0, 0, 3, 0, 0, 0, 65, 77, 68, 0, 4, 0, - 7, 0, 8, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, - 0, 65, 77, 68, 0, 65, 77, 68, 71, 80, 85, 0, 0, - 0, 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0, 4, - 0, 0, 0, 65, 77, 68, 0, 32, 103, -72, 81, -3, 127, - 0, 0, 38, 95, 95, 118, 101, 99, 116, 111, 114, 95, 99, - 111, 112, 121, 95, 107, 101, 114, 110, 101, 108, 0, 95, 95, - 104, 115, 97, 95, 115, 101, 99, 116, 105, 111, 110, 46, 104, - 115, 97, 116, 101, 120, 116, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 26, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 124, 1, 0, 0, 0, 0, 0, 0, 22, 0, 0, 0, - 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 46, 104, 115, 97, 116, - 101, 120, 116, 0, 46, 110, 111, 116, 101, 0, 46, 115, 116, - 114, 116, 97, 98, 0, 46, 115, 121, 109, 116, 97, 98, 0, - 46, 115, 104, 115, 116, 114, 116, 97, 98, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 7, 0, -64, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 0, 0, 0, 0, 0, 0, 124, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 10, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -128, - 2, 0, 0, 0, 0, 0, 0, -88, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 16, 0, 0, 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40, 3, - 0, 0, 0, 0, 0, 0, 44, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, - 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88, 3, 0, - 0, 0, 0, 0, 0, 48, 0, 0, 0, 0, 0, 0, 0, - 3, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, - 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 32, 0, - 0, 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, -120, 3, 0, 0, - 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; -extern char* const kVectorCopyViObject = &kVectorCopyRawVi[0]; -extern size_t const kVectorCopyViObjectSize = sizeof(kVectorCopyRawVi); - -#define HSA_VECTOR_COPY_ALIGNED_VI_AKC_SIZE 452 -#define HSA_VECTOR_COPY_ALIGNED_VI_AKC_OFFSET 256 - -/*****HSAIL code of the ISA in ::kVectorCopyAlignedRawVi. 
-module &m:1:0:$full:$large:$default; -extension "amd:gcn"; - -prog kernel &__copy_buffer_aligned_kernel( - kernarg_u64 %src, - kernarg_u64 %dst, - kernarg_u64 %size, - kernarg_u32 %use_vector) -{ - @__copy_buffer_aligned_kernel_entry: - // BB#0: // %entry - workitemabsid_u32 $s0, 0; - cvt_u64_u32 $d0, $s0; - ld_kernarg_align(8)_width(all)_u64 $d1, [%size]; - cmp_ge_b1_u64 $c0, $d0, $d1; - cbr_b1 $c0, @LBB0_4; - // BB#1: // %if.end - ld_kernarg_align(8)_width(all)_u64 $d2, [%dst]; - ld_kernarg_align(8)_width(all)_u64 $d1, [%src]; - ld_kernarg_align(4)_width(all)_u32 $s0, [%use_vector]; - cmp_ne_b1_s32 $c0, $s0, 1; - cbr_b1 $c0, @LBB0_3; - // BB#2: // %if.then2 - shl_u64 $d0, $d0, 4; - add_u64 $d2, $d2, $d0; - add_u64 $d0, $d1, $d0; - ld_v4_global_align(16)_const_u32 ($s0, $s1, $s2, $s3), [$d0]; - st_v4_global_align(16)_u32 ($s0, $s1, $s2, $s3), [$d2]; - br @LBB0_4; - - @LBB0_3: - // %if.else - shl_u64 $d0, $d0, 2; - add_u64 $d2, $d2, $d0; - add_u64 $d0, $d1, $d0; - ld_global_align(4)_const_u32 $s0, [$d0]; - st_global_align(4)_u32 $s0, [$d2]; - - @LBB0_4: - // %if.end6 - ret; -}; -*/ - -static char kVectorCopyAlignedRawVi[] = { - 127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, - 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3, - 0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, -60, 1, 0, 0, 0, 0, 0, 0, - -60, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 65, 0, -84, 0, -112, 0, 0, 0, - 11, 0, 74, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 16, 0, 8, 0, 8, 0, 0, 0, 12, 0, 0, - 0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 2, 0, 2, -64, 4, 0, 0, 0, - 127, 0, -116, -65, 0, -1, -128, -110, 0, 0, 16, 0, 0, - 8, 0, -110, 0, 0, 0, 50, 3, 0, 6, -64, 16, 0, - 0, 0, -128, 2, 2, 126, 127, 0, -116, -65, 0, 0, -40, - 125, 106, 32, -128, -66, 34, 0, -120, -65, -125, 0, 2, -64, - 24, 0, 0, 0, 3, 2, 10, -64, 0, 0, 0, 0, 127, - 0, -116, -65, 2, -127, 0, -65, 14, 0, -124, -65, 0, 0, - -113, -46, -124, 0, 2, 0, 8, 0, 4, 50, 9, 2, 6, - 126, 3, 3, 6, 56, 0, 0, 92, -36, 2, 0, 0, 4, - 10, 0, 0, 50, 11, 2, 4, 126, 2, 3, 2, 56, 112, - 0, -116, -65, 0, 0, 124, -36, 0, 4, 0, 0, 13, 0, - -126, -65, 0, 0, -113, -46, -126, 0, 2, 0, 8, 0, 4, - 50, 9, 2, 6, 126, 3, 3, 6, 56, 0, 0, 80, -36, - 2, 0, 0, 4, 10, 0, 0, 50, 11, 2, 4, 126, 2, - 3, 2, 56, 112, 0, -116, -65, 0, 0, 112, -36, 0, 4, - 0, 0, 0, 0, 
-127, -65, 0, 0, 0, 0, 4, 0, 0, - 0, 8, 0, 0, 0, 1, 0, 0, 0, 65, 77, 68, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 12, - 0, 0, 0, 2, 0, 0, 0, 65, 77, 68, 0, 1, 0, - 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 4, 0, 0, - 0, 25, 0, 0, 0, 5, 0, 0, 0, 65, 77, 68, 0, - 22, 0, 45, 104, 115, 97, 95, 99, 97, 108, 108, 95, 99, - 111, 110, 118, 101, 110, 116, 105, 111, 110, 61, 0, 0, 0, - 0, 0, 4, 0, 0, 0, 30, 0, 0, 0, 3, 0, 0, - 0, 65, 77, 68, 0, 4, 0, 7, 0, 8, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 65, 77, 68, 0, 65, - 77, 68, 71, 80, 85, 0, 0, 0, 0, 0, 0, 4, 0, - 0, 0, 8, 0, 0, 0, 4, 0, 0, 0, 65, 77, 68, - 0, 96, 62, -27, 85, -1, 127, 0, 0, 38, 95, 95, 99, - 111, 112, 121, 95, 98, 117, 102, 102, 101, 114, 95, 97, 108, - 105, 103, 110, 101, 100, 95, 107, 101, 114, 110, 101, 108, 0, - 95, 95, 104, 115, 97, 95, 115, 101, 99, 116, 105, 111, 110, - 46, 104, 115, 97, 116, 101, 120, 116, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 26, 0, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, -60, 1, 0, 0, 0, 0, 0, 0, 30, 0, - 0, 0, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 46, 104, 115, - 97, 116, 101, 120, 116, 0, 46, 110, 111, 116, 101, 0, 46, - 115, 116, 114, 116, 97, 98, 0, 46, 115, 121, 109, 116, 97, - 98, 0, 46, 115, 104, 115, 116, 114, 116, 97, 98, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 7, - 0, -64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, -60, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 10, 0, 0, 0, 7, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, -56, 2, 0, 0, 0, 0, 0, 0, -88, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 16, 0, 0, 0, 3, 0, 0, 0, 32, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 112, 3, 0, 0, 0, 0, 0, 0, 52, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 24, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -88, - 3, 0, 0, 0, 0, 0, 0, 48, 0, 0, 0, 0, 0, - 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, - 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, - 32, 0, 0, 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -40, 3, - 0, 0, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; -extern char* const kVectorCopyAlignedViObject = &kVectorCopyAlignedRawVi[0]; -extern size_t const kVectorCopyAlignedViObjectSize = - sizeof(kVectorCopyAlignedRawVi); - -#define HSA_FILL_MEMORY_VI_AKC_SIZE 368 -#define HSA_FILL_MEMORY_VI_AKC_OFFSET 256 - -/*****HSAIL code of the ISA in ::kFillMemoryRawVi. 
-module &m:1:0:$full:$large:$default; -extension "amd:gcn"; - -prog kernel &__fill_memory_kernel( - kernarg_u64 %ptr, - kernarg_u64 %num, - kernarg_u32 %value) -{ - @__fill_memory_kernel_entry: - // BB#0: // %entry - workitemabsid_u32 $s0, 0; - cvt_u64_u32 $d0, $s0; - ld_kernarg_align(8)_width(all)_u64 $d1, [%num]; - cmp_ge_b1_u64 $c0, $d0, $d1; - cbr_b1 $c0, @LBB0_2; - // BB#1: // %if.end - ld_kernarg_align(8)_width(all)_u64 $d1, [%ptr]; - ld_kernarg_align(4)_width(all)_u32 $s0, [%value]; - shl_u64 $d0, $d0, 2; - add_u64 $d0, $d1, $d0; - st_global_align(4)_u32 $s0, [$d0]; - - @LBB0_2: - // %return - ret; -}; -*/ - -static char kFillMemoryRawVi[] = { - 127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, - 0, -88, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3, - 0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 112, 1, 0, 0, 0, 0, 0, 0, - 112, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 64, 0, -84, 0, -112, 0, 0, 0, - 11, 0, 74, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 13, 0, 3, 0, 3, 0, 0, 0, 9, 0, 0, - 0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 2, 0, 2, -64, 4, 0, 0, 0, - 127, 0, -116, -65, 0, -1, -128, -110, 0, 0, 16, 0, 0, - 8, 0, -110, 0, 0, 0, 50, 3, 0, 6, -64, 8, 0, - 0, 0, -128, 2, 2, 126, 127, 0, -116, -65, 0, 0, -40, - 125, 106, 32, -128, -66, 13, 0, -120, -65, -125, 0, 6, -64, - 0, 0, 0, 0, 3, 1, 2, -64, 16, 0, 0, 0, 0, - 0, -113, -46, -126, 0, 2, 0, 127, 0, -116, -65, 2, 0, - 0, 50, 3, 2, 4, 126, 2, 3, 2, 56, 4, 2, 4, - 126, 0, 0, 112, -36, 0, 2, 0, 0, 0, 0, -127, -65, - 4, 0, 0, 0, 8, 0, 0, 0, 1, 0, 0, 0, 65, - 77, 68, 0, 1, 0, 0, 0, 0, 0, 0, 0, 4, 0, - 0, 0, 12, 0, 0, 0, 2, 0, 0, 0, 65, 77, 68, - 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, - 4, 0, 0, 0, 25, 0, 0, 0, 5, 0, 0, 0, 65, - 77, 68, 0, 22, 0, 45, 104, 115, 97, 95, 99, 97, 108, - 108, 95, 99, 111, 110, 118, 101, 110, 116, 105, 111, 110, 61, - 0, 0, 0, 0, 0, 4, 0, 0, 0, 30, 0, 0, 0, - 3, 0, 0, 0, 65, 77, 68, 0, 4, 0, 7, 0, 8, - 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 65, 77, - 68, 0, 65, 77, 68, 71, 80, 85, 0, 0, 0, 0, 0, - 0, 4, 0, 0, 0, 8, 0, 0, 0, 4, 0, 0, 0, - 65, 77, 68, 0, 16, -20, 88, 97, -4, 127, 0, 0, 38, - 95, 95, 102, 105, 108, 108, 95, 109, 101, 109, 111, 114, 121, - 95, 107, 101, 114, 110, 101, 108, 0, 95, 95, 104, 115, 97, - 95, 115, 101, 99, 116, 105, 111, 110, 46, 104, 115, 97, 116, - 101, 120, 
116, 0, 0, 0, 0, 0, 0, 0, 0, 0, 26, - 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 112, 1, - 0, 0, 0, 0, 0, 0, 22, 0, 0, 0, 3, 0, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 46, 104, 115, 97, 116, 101, 120, 116, - 0, 46, 110, 111, 116, 101, 0, 46, 115, 116, 114, 116, 97, - 98, 0, 46, 115, 121, 109, 116, 97, 98, 0, 46, 115, 104, - 115, 116, 114, 116, 97, 98, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, - 0, 0, 0, 1, 0, 0, 0, 7, 0, -64, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 0, 112, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, - 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 112, 2, 0, 0, - 0, 0, 0, 0, -88, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, - 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 24, 3, 0, 0, 0, - 0, 0, 0, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, - 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 72, 3, 0, 0, 0, 0, - 0, 0, 48, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, - 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, - 24, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 3, - 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 120, 3, 0, 0, 0, 0, 0, - 0, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, -}; - -extern char* const kFillMemoryViObject = &kFillMemoryRawVi[0]; -extern size_t const kFillMemoryViObjectSize = sizeof(kFillMemoryRawVi); -#endif // header guard \ No newline at end of file diff --git a/src/core/inc/amd_blit_sdma.h b/src/core/inc/amd_blit_sdma.h index 35f683bc3..6212c3dcc 100644 --- a/src/core/inc/amd_blit_sdma.h +++ b/src/core/inc/amd_blit_sdma.h @@ -43,10 +43,12 @@ #ifndef HSA_RUNTIME_CORE_INC_AMD_BLIT_SDMA_H_ #define HSA_RUNTIME_CORE_INC_AMD_BLIT_SDMA_H_ +#include <mutex> #include <stdint.h> #include "hsakmt.h" +#include "core/inc/amd_gpu_agent.h" #include "core/inc/blit.h" #include "core/inc/runtime.h" #include "core/inc/signal.h" @@ -73,8 +75,10 @@ class BlitSdma : public core::Blit { /// /// @note: The call will block until all packets have executed. /// + /// @param agent Agent passed to Initialize. + /// /// @return hsa_status_t - virtual hsa_status_t Destroy() override; + virtual hsa_status_t Destroy(const core::Agent& agent) override; /// @brief Submit a linear copy command to the queue buffer. /// @@ -107,6 +111,12 @@ class BlitSdma : public core::Blit { virtual hsa_status_t SubmitLinearFillCommand(void* ptr, uint32_t value, size_t count) override; + virtual hsa_status_t EnableProfiling(bool enable) override; + + static const size_t kQueueSize; + + static const size_t kCopyPacketSize; + protected: /// @brief Acquires the address into queue buffer where a new command /// packet of specified size could be written. The address that is @@ -159,6 +169,13 @@ class BlitSdma : public core::Blit { void BuildAtomicDecrementCommand(char* cmd_addr, void* addr); + void BuildGetGlobalTimestampCommand(char* cmd_addr, void* write_address); + + void BuildTrapCommand(char* cmd_addr); + + // Agent object owning the SDMA engine. + GpuAgent* agent_; + /// Indicates size of Queue buffer in bytes. 
uint32_t queue_size_; @@ -199,6 +216,10 @@ class BlitSdma : public core::Blit { uint32_t atomic_command_size_; + uint32_t timestamp_command_size_; + + uint32_t trap_command_size_; + // Max copy size of a single linear copy command packet. size_t max_single_linear_copy_size_; @@ -210,6 +231,9 @@ class BlitSdma : public core::Blit { /// Max total fill count supported by the queue. size_t max_total_fill_size_; + + /// True if platform atomic is supported. + bool platform_atomic_support_; }; } // namespace amd diff --git a/src/core/inc/amd_elf_image.hpp b/src/core/inc/amd_elf_image.hpp index 8bc811e17..763c5c831 100644 --- a/src/core/inc/amd_elf_image.hpp +++ b/src/core/inc/amd_elf_image.hpp @@ -103,6 +103,7 @@ namespace amd { virtual uint64_t imageSize() const = 0; virtual uint64_t vaddr() const = 0; virtual uint64_t flags() const = 0; + virtual uint64_t offset() const = 0; virtual const char* data() const = 0; virtual uint16_t getSegmentIndex() = 0; virtual bool updateAddSection(Section *section) = 0; diff --git a/src/core/inc/amd_gpu_agent.h b/src/core/inc/amd_gpu_agent.h index abd854679..7b3246465 100644 --- a/src/core/inc/amd_gpu_agent.h +++ b/src/core/inc/amd_gpu_agent.h @@ -57,6 +57,8 @@ #include "core/util/locks.h" namespace amd { +class MemoryRegion; + // @brief Contains scratch memory information. struct ScratchInfo { void* queue_base; @@ -72,6 +74,16 @@ class GpuAgentInt : public core::Agent { GpuAgentInt(uint32_t node_id) : core::Agent(node_id, core::Agent::DeviceType::kAmdGpuDevice) {} + // @brief Initialize DMA queue. + // + // @retval HSA_STATUS_SUCCESS DMA queue initialization is successful. + virtual void InitDma() = 0; + + // @brief Initialize blit kernel object based on AQL queue. + // + // @retval HSA_STATUS_SUCCESS blit kernel object initialization is successful. + virtual hsa_status_t InitBlitKernel() = 0; + // @brief Invoke the user provided callback for each region accessible by // this agent. // @@ -108,6 +120,16 @@ class GpuAgentInt : public core::Agent { virtual void TranslateTime(core::Signal* signal, hsa_amd_profiling_dispatch_time_t& time) = 0; + // @brief Translate the async copy start and end timestamp from agent + // domain to host domain. + // + // @param [in] signal Pointer to signal that provides the async copy timing. + // @param [out] time Structure to be populated with the host domain value. + virtual void TranslateTime(core::Signal* signal, + hsa_amd_profiling_async_copy_time_t& time) { + return TranslateTime(signal, (hsa_amd_profiling_dispatch_time_t&)time); + } + // @brief Translate timestamp agent domain to host domain. // // @param [out] time Timestamp in agent domain. @@ -158,27 +180,32 @@ class GpuAgent : public GpuAgentInt { // @brief GPU agent destructor. ~GpuAgent(); - // @brief Initialize DMA queue. - // - // @retval HSA_STATUS_SUCCESS DMA queue initialization is successful. - hsa_status_t InitDma(); + // @brief Override from core::Agent. + void InitDma() override; + + // @brief Override from core::Agent. + hsa_status_t InitBlitKernel() override; uint16_t GetMicrocodeVersion() const; - // @brief Assembles SP3 shader source into executable code. + // @brief Assembles SP3 shader source into ISA or AQL code object. // // @param [in] src_sp3 SP3 shader source text representation. // @param [in] func_name Name of the SP3 function to assemble. - // @param [out] code_buf Executable code buffer. - // @param [out] code_buf_size Size of executable code buffer in bytes. + // @param [in] assemble_target ISA or AQL assembly target. 
+ // @param [out] code_buf Code object buffer. + // @param [out] code_buf_size Size of code object buffer in bytes. + enum class AssembleTarget { ISA, AQL }; + void AssembleShader(const char* src_sp3, const char* func_name, - void*& code_buf, size_t& code_buf_size); + AssembleTarget assemble_target, void*& code_buf, + size_t& code_buf_size) const; - // @brief Frees executable code created by AssembleShader. + // @brief Frees code object created by AssembleShader. // - // @param [in] code_buf Executable code buffer. - // @param [in] code_buf_size Size of executable code buffer in bytes. - void ReleaseShader(void* code_buf, size_t code_buf_size); + // @param [in] code_buf Code object buffer. + // @param [in] code_buf_size Size of code object buffer in bytes. + void ReleaseShader(void* code_buf, size_t code_buf_size) const; // @brief Override from core::Agent. hsa_status_t VisitRegion(bool include_peer, @@ -203,6 +230,9 @@ class GpuAgent : public GpuAgentInt { // @brief Override from core::Agent. hsa_status_t DmaFill(void* ptr, uint32_t value, size_t count) override; + // @brief Get the next available end timestamp object. + uint64_t* ObtainEndTsObject(); + // @brief Override from core::Agent. hsa_status_t GetInfo(hsa_agent_info_t attribute, void* value) const override; @@ -308,6 +338,9 @@ class GpuAgent : public GpuAgentInt { // @brief Binds the second-level trap handler to this node. void BindTrapHandler(); + // @brief Override from core::Agent. + hsa_status_t EnableDmaProfiling(bool enable) override; + // @brief Node properties. const HsaNodeProperties properties_; @@ -329,10 +362,13 @@ class GpuAgent : public GpuAgentInt { // @brief Blit object to handle memory copy from system to device memory. core::Blit* blit_h2d_; - // @brief Blit object to handle memory copy from device to system, device to - // device, and memory fill. + // @brief Blit object to handle memory copy from device to system memory. core::Blit* blit_d2h_; + // @brief Blit object to handle memory copy from device to device memory, and + // memory fill. + core::Blit* blit_d2d_; + // @brief Mutex to protect the update to coherency type. KernelMutex coherency_lock_; @@ -342,6 +378,9 @@ class GpuAgent : public GpuAgentInt { // @brief Mutex to protect access to ::t1_. KernelMutex t1_lock_; + // @brief Mutex to protect access to blit objects. + KernelMutex blit_lock_; + // @brief GPU tick on initialization. HsaClockCounters t0_; @@ -353,6 +392,8 @@ class GpuAgent : public GpuAgentInt { // @brief Array of regions owned by this agent. std::vector<const core::MemoryRegion*> regions_; + MemoryRegion* local_region_; + core::Isa* isa_; // @brief HSA profile. @@ -381,12 +422,29 @@ class GpuAgent : public GpuAgentInt { // @brief Query the driver to get the cache properties. void InitCacheList(); + // @brief Initialize memory pool for end timestamp object. + // @retval True if the memory pool for end timestamp object is initialized. + bool InitEndTsPool(); + // @brief Alternative aperture base address. Only on KV. uintptr_t ape1_base_; // @brief Alternative aperture size. Only on KV. size_t ape1_size_; + // @brief True if blit objects are initialized. + std::atomic<bool> blit_initialized_; + + // Each end ts is 32 bytes. + static const size_t kTsSize = 32; + + // Number of element in the pool. 
+ uint32_t end_ts_pool_size_; + + std::atomic<uint32_t> end_ts_pool_counter_; + + std::atomic<uint64_t*> end_ts_base_addr_; + DISALLOW_COPY_AND_ASSIGN(GpuAgent); }; diff --git a/src/core/inc/amd_gpu_shaders.h b/src/core/inc/amd_gpu_shaders.h new file mode 100644 index 000000000..2aa074981 --- /dev/null +++ b/src/core/inc/amd_gpu_shaders.h @@ -0,0 +1,169 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_INC_AMD_GPU_SHADERS_H_ +#define HSA_RUNTIME_CORE_INC_AMD_GPU_SHADERS_H_ + +namespace amd { + +static const unsigned int kCodeCopyAligned7[] = { + 0xC0820100, 0xC0840104, 0xC0860108, 0xC088010C, 0xC08A0110, 0xC00C0114, + 0xBF8C007F, 0x8F028602, 0x4A000002, 0x7E060205, 0xD24A6A02, 0x00000900, + 0xD2506A03, 0x01A90103, 0x7E0A0207, 0xD24A6A04, 0x00000D00, 0xD2506A05, + 0x01A90105, 0xD1C2006A, 0x00001102, 0xBF86000F, 0x87FE6A7E, 0xDC200000, + 0x01000002, 0xBF8C0F70, 0xD24A6A02, 0x00003102, 0xD2506A03, 0x01A90103, + 0xDC600000, 0x00000104, 0xD24A6A04, 0x00003104, 0xD2506A05, 0x01A90105, + 0xBF82FFEE, 0xBEFE04C1, 0x8F198418, 0x34020084, 0x7E060209, 0xD24A6A02, + 0x00001101, 0xD2506A03, 0x01A90103, 0x7E0A020B, 0xD24A6A04, 0x00001501, + 0xD2506A05, 0x01A90105, 0xD1C2006A, 0x00001902, 0xBF86000E, 0xDC380000, + 0x08000002, 0xD24A6A02, 0x00003302, 0xD2506A03, 0x01A90103, 0xBF8C0F70, + 0xDC780000, 0x00000804, 0xD24A6A04, 0x00003304, 0xD2506A05, 0x01A90105, + 0xBF82FFEF, 0x8F198218, 0x34020082, 0x7E06020D, 0xD24A6A02, 0x00001901, + 0xD2506A03, 0x01A90103, 0x7E0A020F, 0xD24A6A04, 0x00001D01, 0xD2506A05, + 0x01A90105, 0xD1C2006A, 0x00002102, 0xBF86000F, 0x87FE6A7E, 0xDC300000, + 0x01000002, 0xD24A6A02, 0x00003302, 0xD2506A03, 0x01A90103, 0xBF8C0F70, + 0xDC700000, 0x00000104, 0xD24A6A04, 0x00003304, 0xD2506A05, 0x01A90105, + 0xBF82FFEE, 0xBEFE04C1, 0x7E060211, 0xD24A6A02, 0x00002100, 0xD2506A03, + 0x01A90103, 0x7E0A0213, 0xD24A6A04, 0x00002500, 0xD2506A05, 0x01A90105, + 0xD1C2006A, 0x00002902, 0xBF860006, 0x87FE6A7E, 0xDC200000, 0x01000002, + 0xBF8C0F70, 0xDC600000, 0x00000104, 0xBF810000, +}; + +static const unsigned int kCodeCopyMisaligned7[] = { + 0xC0820100, 0xC0840104, 0xC0860108, 0xC008010C, 0xBF8C007F, 0x8F028602, + 0x4A000002, 0x7E060205, 0xD24A6A02, 0x00000900, 0xD2506A03, 0x01A90103, + 0x7E0A0207, 0xD24A6A04, 0x00000D00, 0xD2506A05, 0x01A90105, 0xD1C2006A, + 0x00001102, 0xBF860032, 0xDC200000, 0x06000002, 0xD24A6A02, 0x00002102, + 0xD2506A03, 0x01A90103, 0xDC200000, 0x07000002, 0xD24A6A02, 0x00002102, + 0xD2506A03, 0x01A90103, 0xDC200000, 0x08000002, 0xD24A6A02, 0x00002102, + 0xD2506A03, 0x01A90103, 0xDC200000, 0x09000002, 0xD24A6A02, 0x00002102, + 0xD2506A03, 0x01A90103, 0xBF8C0F70, 0xDC600000, 0x00000604, 0xD24A6A04, + 0x00002104, 0xD2506A05, 0x01A90105, 0xDC600000, 0x00000704, 0xD24A6A04, + 0x00002104, 0xD2506A05, 0x01A90105, 0xDC600000, 0x00000804, 0xD24A6A04, + 0x00002104, 0xD2506A05, 0x01A90105, 0xDC600000, 0x00000904, 0xD24A6A04, + 0x00002104, 0xD2506A05, 0x01A90105, 0xBF82FFCB, 0x7E060209, 0xD24A6A02, + 0x00001100, 0xD2506A03, 0x01A90103, 0x7E0A020B, 0xD24A6A04, 0x00001500, + 0xD2506A05, 0x01A90105, 0xD1C2006A, 0x00001902, 0xBF86000F, 0x87FE6A7E, + 0xDC200000, 0x01000002, 0xD24A6A02, 0x00002102, 0xD2506A03, 0x01A90103, + 0xBF8C0F70, 0xDC600000, 0x00000104, 0xD24A6A04, 0x00002104, 0xD2506A05, + 0x01A90105, 0xBF82FFEE, 0xBF810000, +}; + +static const unsigned int kCodeFill7[] = { + 0xC0820100, 0xC0840104, 0xBF8C007F, 0x8F028602, 0x4A000002, 0x7E08020A, + 0x7E0A020A, 0x7E0C020A, 0x7E0E020A, 0x8F0C840B, 0x34020084, 0x7E060205, + 0xD24A6A02, 0x00000901, 0xD2506A03, 0x01A90103, 0xD1C2006A, 0x00000D02, + 0xBF860007, 0xDC780000, 0x00000402, 0xD24A6A02, 0x00001902, 0xD2506A03, + 0x01A90103, 0xBF82FFF6, 0x8F0C820B, 0x34020082, 0x7E060207, 0xD24A6A02, + 0x00000D01, 0xD2506A03, 0x01A90103, 0xD1C2006A, 0x00001102, 0xBF860008, + 0x87FE6A7E, 0xDC700000, 0x00000402, 0xD24A6A02, 0x00001902, 
0xD2506A03, + 0x01A90103, 0xBF82FFF5, 0xBF810000, +}; + +static const unsigned int kCodeTrapHandler8[] = { + 0xC0061C80, 0x000000C0, 0xBF8C007F, 0xBEFE0181, 0x80728872, 0x82738073, + 0x7E000272, 0x7E020273, 0x7E0402FF, 0x80000000, 0x7E060280, 0xDD800000, + 0x00000200, 0xBF8C0F70, 0x7DD40500, 0xBF870011, 0xC0061D39, 0x00000008, + 0xBF8C007F, 0x86F47474, 0xBF84000C, 0x80729072, 0x82738073, 0xC0021CB9, + 0x00000000, 0xBF8C007F, 0x7E000274, 0x7E020275, 0x7E040272, 0xDC700000, + 0x00000200, 0xBF8C0F70, 0xBF900001, 0xBF8D0001, 0xBE801F70, +}; + +static const unsigned int kCodeCopyAligned8[] = { + 0xC00A0100, 0x00000000, 0xC00A0200, 0x00000010, 0xC00A0300, 0x00000020, + 0xC00A0400, 0x00000030, 0xC00A0500, 0x00000040, 0xC0020600, 0x00000050, + 0xBF8C007F, 0x8E028602, 0x32000002, 0x7E060205, 0xD1196A02, 0x00000900, + 0xD11C6A03, 0x01A90103, 0x7E0A0207, 0xD1196A04, 0x00000D00, 0xD11C6A05, + 0x01A90105, 0xD0E9006A, 0x00001102, 0xBF86000F, 0x86FE6A7E, 0xDC400000, + 0x01000002, 0xBF8C0F70, 0xD1196A02, 0x00003102, 0xD11C6A03, 0x01A90103, + 0xDC600000, 0x00000104, 0xD1196A04, 0x00003104, 0xD11C6A05, 0x01A90105, + 0xBF82FFEE, 0xBEFE01C1, 0x8E198418, 0x24020084, 0x7E060209, 0xD1196A02, + 0x00001101, 0xD11C6A03, 0x01A90103, 0x7E0A020B, 0xD1196A04, 0x00001501, + 0xD11C6A05, 0x01A90105, 0xD0E9006A, 0x00001902, 0xBF86000E, 0xDC5C0000, + 0x08000002, 0xD1196A02, 0x00003302, 0xD11C6A03, 0x01A90103, 0xBF8C0F70, + 0xDC7C0000, 0x00000804, 0xD1196A04, 0x00003304, 0xD11C6A05, 0x01A90105, + 0xBF82FFEF, 0x8E198218, 0x24020082, 0x7E06020D, 0xD1196A02, 0x00001901, + 0xD11C6A03, 0x01A90103, 0x7E0A020F, 0xD1196A04, 0x00001D01, 0xD11C6A05, + 0x01A90105, 0xD0E9006A, 0x00002102, 0xBF86000F, 0x86FE6A7E, 0xDC500000, + 0x01000002, 0xD1196A02, 0x00003302, 0xD11C6A03, 0x01A90103, 0xBF8C0F70, + 0xDC700000, 0x00000104, 0xD1196A04, 0x00003304, 0xD11C6A05, 0x01A90105, + 0xBF82FFEE, 0xBEFE01C1, 0x7E060211, 0xD1196A02, 0x00002100, 0xD11C6A03, + 0x01A90103, 0x7E0A0213, 0xD1196A04, 0x00002500, 0xD11C6A05, 0x01A90105, + 0xD0E9006A, 0x00002902, 0xBF860006, 0x86FE6A7E, 0xDC400000, 0x01000002, + 0xBF8C0F70, 0xDC600000, 0x00000104, 0xBF810000, +}; + +static const unsigned int kCodeCopyMisaligned8[] = { + 0xC00A0100, 0x00000000, 0xC00A0200, 0x00000010, 0xC00A0300, 0x00000020, + 0xC0020400, 0x00000030, 0xBF8C007F, 0x8E028602, 0x32000002, 0x7E060205, + 0xD1196A02, 0x00000900, 0xD11C6A03, 0x01A90103, 0x7E0A0207, 0xD1196A04, + 0x00000D00, 0xD11C6A05, 0x01A90105, 0xD0E9006A, 0x00001102, 0xBF860032, + 0xDC400000, 0x06000002, 0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103, + 0xDC400000, 0x07000002, 0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103, + 0xDC400000, 0x08000002, 0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103, + 0xDC400000, 0x09000002, 0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103, + 0xBF8C0F70, 0xDC600000, 0x00000604, 0xD1196A04, 0x00002104, 0xD11C6A05, + 0x01A90105, 0xDC600000, 0x00000704, 0xD1196A04, 0x00002104, 0xD11C6A05, + 0x01A90105, 0xDC600000, 0x00000804, 0xD1196A04, 0x00002104, 0xD11C6A05, + 0x01A90105, 0xDC600000, 0x00000904, 0xD1196A04, 0x00002104, 0xD11C6A05, + 0x01A90105, 0xBF82FFCB, 0x7E060209, 0xD1196A02, 0x00001100, 0xD11C6A03, + 0x01A90103, 0x7E0A020B, 0xD1196A04, 0x00001500, 0xD11C6A05, 0x01A90105, + 0xD0E9006A, 0x00001902, 0xBF86000F, 0x86FE6A7E, 0xDC400000, 0x01000002, + 0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103, 0xBF8C0F70, 0xDC600000, + 0x00000104, 0xD1196A04, 0x00002104, 0xD11C6A05, 0x01A90105, 0xBF82FFEE, + 0xBF810000, +}; + +static const unsigned int kCodeFill8[] = { + 0xC00A0100, 0x00000000, 0xC00A0200, 
0x00000010, 0xBF8C007F, 0x8E028602, + 0x32000002, 0x7E08020A, 0x7E0A020A, 0x7E0C020A, 0x7E0E020A, 0x8E0C840B, + 0x24020084, 0x7E060205, 0xD1196A02, 0x00000901, 0xD11C6A03, 0x01A90103, + 0xD0E9006A, 0x00000D02, 0xBF860007, 0xDC7C0000, 0x00000402, 0xD1196A02, + 0x00001902, 0xD11C6A03, 0x01A90103, 0xBF82FFF6, 0x8E0C820B, 0x24020082, + 0x7E060207, 0xD1196A02, 0x00000D01, 0xD11C6A03, 0x01A90103, 0xD0E9006A, + 0x00001102, 0xBF860008, 0x86FE6A7E, 0xDC700000, 0x00000402, 0xD1196A02, + 0x00001902, 0xD11C6A03, 0x01A90103, 0xBF82FFF5, 0xBF810000, +}; + +} // namespace amd + +#endif // header guard diff --git a/src/core/inc/amd_hsa_loader.hpp b/src/core/inc/amd_hsa_loader.hpp index 5b9cd4d92..251df841a 100644 --- a/src/core/inc/amd_hsa_loader.hpp +++ b/src/core/inc/amd_hsa_loader.hpp @@ -47,6 +47,7 @@ #include <cstdint> #include "hsa.h" #include "hsa_ext_image.h" +#include "hsa_ven_amd_loader.h" #include "amd_hsa_elf.h" #include <string> #include <mutex> @@ -317,6 +318,13 @@ class Executable { void *data), void *data) = 0; + virtual size_t GetNumSegmentDescriptors() = 0; + + virtual size_t QuerySegmentDescriptors( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t total_num_segment_descriptors, + size_t first_empty_segment_descriptor) = 0; + virtual uint64_t FindHostAddress(uint64_t device_address) = 0; virtual void Print(std::ostream& out) = 0; @@ -368,6 +376,11 @@ class Loader { void *data), void *data) = 0; + /// @brief same as hsa_ven_amd_loader_query_segment_descriptors. + virtual hsa_status_t QuerySegmentDescriptors( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors) = 0; + /// @brief Returns host address given @p device_address. If @p device_address /// is already host address, returns null pointer. If @p device_address is /// invalid address, returns null pointer. diff --git a/src/core/inc/blit.h b/src/core/inc/blit.h index f44a6bab1..48aebaa64 100644 --- a/src/core/inc/blit.h +++ b/src/core/inc/blit.h @@ -66,8 +66,10 @@ class Blit { /// /// @note: The call will block until all commands have executed. /// + /// @param agent Agent passed to Initialize. + /// /// @return hsa_status_t - virtual hsa_status_t Destroy() = 0; + virtual hsa_status_t Destroy(const core::Agent& agent) = 0; /// @brief Submit a linear copy command to the the underlying compute device's /// control block. The call is blocking until the command execution is @@ -102,6 +104,15 @@ class Blit { /// @param num Number of uint32_t element to be set to the value. virtual hsa_status_t SubmitLinearFillCommand(void* ptr, uint32_t value, size_t num) = 0; + + /// @brief Enable profiling of the asynchronous copy command. The timestamp + /// of each copy request will be stored in the completion signal structure. + /// + /// @param enable True to enable profiling. False to disable profiling. + /// + /// @return HSA_STATUS_SUCCESS if the request to enable/disable profiling is + /// successful. 
+ virtual hsa_status_t EnableProfiling(bool enable) = 0; }; } // namespace core diff --git a/src/core/inc/hsa_api_trace_int.h b/src/core/inc/hsa_api_trace_int.h index cc9a638a9..769dbed2e 100644 --- a/src/core/inc/hsa_api_trace_int.h +++ b/src/core/inc/hsa_api_trace_int.h @@ -47,17 +47,28 @@ #include "core/inc/hsa_internal.h" namespace core { -struct ApiTable { - ::ApiTable table; - ExtTable extension_backup; + struct HsaApiTable { - ApiTable(); - void Reset(); - void LinkExts(ExtTable* ptr); -}; + static const uint32_t HSA_EXT_FINALIZER_API_TABLE_ID = 0; + static const uint32_t HSA_EXT_IMAGE_API_TABLE_ID = 1; -extern ApiTable hsa_api_table_; -extern ApiTable hsa_internal_api_table_; + ::HsaApiTable hsa_api; + ::CoreApiTable core_api; + ::AmdExtTable amd_ext_api; + ::FinalizerExtTable finalizer_api; + ::ImageExtTable image_api; + + HsaApiTable(); + void Init(); + void UpdateCore(); + void UpdateAmdExts(); + void CloneExts(void* ptr, uint32_t table_id); + void LinkExts(void* ptr, uint32_t table_id); + void Reset(); + }; + + extern HsaApiTable hsa_api_table_; + extern HsaApiTable hsa_internal_api_table_; } #endif diff --git a/src/core/inc/hsa_ext_amd_impl.h b/src/core/inc/hsa_ext_amd_impl.h new file mode 100755 index 000000000..54f8e3458 --- /dev/null +++ b/src/core/inc/hsa_ext_amd_impl.h @@ -0,0 +1,186 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// HSA AMD extension. 
+ +#ifndef HSA_RUNTIME_CORE_INC_EXT_AMD_H_ +#define HSA_RUNTIME_CORE_INC_EXT_AMD_H_ + +#include "hsa.h" +#include "hsa_ext_image.h" +#include "hsa_ext_amd.h" + +// Wrap internal implementation inside AMD namespace +namespace AMD { + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_coherency_get_type(hsa_agent_t agent, + hsa_amd_coherency_type_t* type); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_coherency_set_type(hsa_agent_t agent, + hsa_amd_coherency_type_t type); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_profiling_async_copy_enable(bool enable); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time( + hsa_agent_t agent, hsa_signal_t signal, + hsa_amd_profiling_dispatch_time_t* time); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_profiling_get_async_copy_time( + hsa_signal_t signal, hsa_amd_profiling_async_copy_time_t* time); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_profiling_convert_tick_to_system_domain(hsa_agent_t agent, + uint64_t agent_tick, + uint64_t* system_tick); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_signal_async_handler(hsa_signal_t signal, + hsa_signal_condition_t cond, + hsa_signal_value_t value, + hsa_amd_signal_handler handler, void* arg); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_async_function(void (*callback)(void* arg), void* arg); + +// Mirrors Amd Extension Apis +uint32_t HSA_API + hsa_amd_signal_wait_any(uint32_t signal_count, hsa_signal_t* signals, + hsa_signal_condition_t* conds, + hsa_signal_value_t* values, uint64_t timeout_hint, + hsa_wait_state_t wait_hint, + hsa_signal_value_t* satisfying_value); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, + uint32_t num_cu_mask_count, + const uint32_t* cu_mask); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool, + hsa_amd_memory_pool_info_t attribute, + void* value); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void* data), + void* data); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, size_t size, + uint32_t flags, void** ptr); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent, const void* src, + hsa_agent_t src_agent, size_t size, + uint32_t num_dep_signals, + const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info( + hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool, + hsa_amd_agent_memory_pool_info_t attribute, void* value); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_agents_allow_access(uint32_t num_agents, const hsa_agent_t* agents, + const uint32_t* flags, const void* ptr); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_memory_pool_can_migrate(hsa_amd_memory_pool_t src_memory_pool, + hsa_amd_memory_pool_t dst_memory_pool, + bool* result); + +// Mirrors Amd Extension Apis +hsa_status_t 
HSA_API hsa_amd_memory_migrate(const void* ptr, + hsa_amd_memory_pool_t memory_pool, + uint32_t flags); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size, + hsa_agent_t* agents, int num_agent, + void** agent_ptr); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API + hsa_amd_memory_fill(void* ptr, uint32_t value, size_t count); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_interop_map_buffer(uint32_t num_agents, + hsa_agent_t* agents, + int interop_handle, + uint32_t flags, + size_t* size, + void** ptr, + size_t* metadata_size, + const void** metadata); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_interop_unmap_buffer(void* ptr); + +} // end of AMD namespace + +#endif // header guard diff --git a/src/core/inc/hsa_ext_interface.h b/src/core/inc/hsa_ext_interface.h index 3645c23fb..236a165c7 100644 --- a/src/core/inc/hsa_ext_interface.h +++ b/src/core/inc/hsa_ext_interface.h @@ -52,27 +52,43 @@ #include "core/util/utils.h" namespace core { -struct ExtTableInternal : public ExtTable { +struct ImageExtTableInternal : public ImageExtTable { decltype(::hsa_amd_image_get_info_max_dim)* hsa_amd_image_get_info_max_dim_fn; - decltype(::hsa_amd_image_create)* hsa_amd_image_create_fn; }; class ExtensionEntryPoints { public: - ExtTableInternal table; + + // Table of function pointers for Hsa Extension Image + ImageExtTableInternal image_api; + + // Table of function pointers for Hsa Extension Finalizer + FinalizerExtTable finalizer_api; ExtensionEntryPoints(); - bool Load(std::string library_name); + bool LoadFinalizer(std::string library_name); + bool LoadImage(std::string library_name); void Unload(); private: - typedef void (*Load_t)(const ::ApiTable* table); + typedef void (*Load_t)(const ::HsaApiTable* table); typedef void (*Unload_t)(); std::vector<os::LibHandle> libs_; - void InitTable(); + // Initialize table for HSA Finalizer Extension Api's + void InitFinalizerExtTable(); + + // Initialize table for HSA Image Extension Api's + void InitImageExtTable(); + + // Initialize Amd Ext table for Api related to Images + void InitAmdExtTable(); + + // Update Amd Ext table for Api related to Images + void UpdateAmdExtTable(void *func_ptr); + DISALLOW_COPY_AND_ASSIGN(ExtensionEntryPoints); }; } diff --git a/src/core/inc/hsa_table_interface.h b/src/core/inc/hsa_table_interface.h index 236ef41c7..99a1280d8 100644 --- a/src/core/inc/hsa_table_interface.h +++ b/src/core/inc/hsa_table_interface.h @@ -42,6 +42,6 @@ #include "hsa_api_trace.h" -void hsa_table_interface_init(const ApiTable* table); +void hsa_table_interface_init(const HsaApiTable* apiTable); -const ApiTable* hsa_table_interface_get_table(); +const HsaApiTable* hsa_table_interface_get_table(); diff --git a/src/core/inc/interrupt_signal.h b/src/core/inc/interrupt_signal.h index bef9564be..adbbb5070 100644 --- a/src/core/inc/interrupt_signal.h +++ b/src/core/inc/interrupt_signal.h @@ -165,10 +165,6 @@ class InterruptSignal : public Signal { /// @brief See base class Signal. __forceinline HsaEvent* EopEvent() { return event_; } - // TODO: work around for SDMA async copy. Bypass waiting on EOP - // event because SDMA copy does not handle interrupt yet. 
- __forceinline void DisableWaitEvent() { wait_on_event_ = false; } - /// @brief prevent throwing exceptions void* operator new(size_t size) { return malloc(size); } @@ -186,10 +182,6 @@ class InterruptSignal : public Signal { /// closes or not. bool free_event_; - // TODO: work around for SDMA async copy. Bypass waiting on EOP - // event because SDMA copy does not handle interrupt yet. - bool wait_on_event_; - /// Used to obtain a globally unique value (address) for rtti. static int rtti_id_; diff --git a/src/core/inc/runtime.h b/src/core/inc/runtime.h index d3c6f8b9f..6d4554215 100644 --- a/src/core/inc/runtime.h +++ b/src/core/inc/runtime.h @@ -280,8 +280,6 @@ class Runtime { Agent* blit_agent() { return blit_agent_; } - Agent* host_agent() { return host_agent_; } - const std::vector<const MemoryRegion*>& system_regions_fine() const { return system_regions_fine_; } @@ -455,9 +453,6 @@ class Runtime { // Deallocator using ::system_region_ std::function<void(void*)> system_deallocator_; - // Pointer to a host/cpu agent object. - Agent* host_agent_; - // Pointer to DMA agent. Agent* blit_agent_; diff --git a/src/core/inc/signal.h b/src/core/inc/signal.h index e6509421c..478034951 100644 --- a/src/core/inc/signal.h +++ b/src/core/inc/signal.h @@ -57,6 +57,7 @@ #include "inc/amd_hsa_signal.h" namespace core { +class Agent; class Signal; /// @brief Helper structure to simplify conversion of amd_signal_t and @@ -75,7 +76,9 @@ class Signal : public Checked<0x71FCCA6A3D5D5276>, public: /// @brief Constructor initializes the signal with initial value. explicit Signal(hsa_signal_value_t initial_value) - : Shared(), signal_(shared_object()->amd_signal) { + : Shared(), + signal_(shared_object()->amd_signal), + async_copy_agent_(NULL) { if (!Shared::IsSharedObjectAllocationValid()) { invalid_ = true; return; @@ -225,6 +228,12 @@ class Signal : public Checked<0x71FCCA6A3D5D5276>, /// @brief Checks if signal is currently in use by a wait API. bool InWaiting() const { return waiting_ != 0; } + __forceinline void async_copy_agent(core::Agent* agent) { + async_copy_agent_ = agent; + } + + __forceinline core::Agent* async_copy_agent() { return async_copy_agent_; } + /// @brief Structure which defines key signal elements like type and value. /// Address of this struct is used as a value for the opaque handle of type /// hsa_signal_t provided to the public API. @@ -246,6 +255,9 @@ class Signal : public Checked<0x71FCCA6A3D5D5276>, volatile uint32_t retained_; + /// @variable Pointer to agent used to perform an async copy. + core::Agent* async_copy_agent_; + private: DISALLOW_COPY_AND_ASSIGN(Signal); }; diff --git a/src/core/runtime/amd_aql_queue.cpp b/src/core/runtime/amd_aql_queue.cpp index 9bfa78045..3999fd556 100644 --- a/src/core/runtime/amd_aql_queue.cpp +++ b/src/core/runtime/amd_aql_queue.cpp @@ -64,6 +64,7 @@ #include "core/util/utils.h" #include "core/inc/registers.h" #include "core/inc/interrupt_signal.h" +#include "core/inc/hsa_ext_amd_impl.h" namespace amd { // Queue::amd_queue_ is cache-aligned for performance. @@ -99,17 +100,13 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, return; } - hsa_status_t stat = agent_->GetInfo(HSA_AGENT_INFO_PROFILE, &agent_profile_); - assert(stat == HSA_STATUS_SUCCESS); - - const core::Isa* isa = agent_->isa(); - // When queue_full_workaround_ is set to 1, the ring buffer is internally // doubled in size. Virtual addresses in the upper half of the ring allocation // are mapped to the same set of pages backing the lower half. 
// Values written to the HW doorbell are modulo the doubled size. // This allows the HW to accept (doorbell == last_doorbell + queue_size). // This workaround is required for GFXIP 7 and GFXIP 8 ASICs. + const core::Isa* isa = agent_->isa(); queue_full_workaround_ = (isa->GetMajorVersion() == 7 || isa->GetMajorVersion() == 8) ? 1 @@ -177,7 +174,7 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, const auto& props = agent->properties(); amd_queue_.max_cu_id = (props.NumFComputeCores / props.NumSIMDPerCU) - 1; - amd_queue_.max_wave_id = props.MaxWavesPerSIMD - 1; + amd_queue_.max_wave_id = (props.MaxWavesPerSIMD * props.NumSIMDPerCU) - 1; #ifdef HSA_LARGE_MODEL AMD_HSA_BITS_SET(amd_queue_.queue_properties, AMD_QUEUE_PROPERTIES_IS_PTR64, @@ -187,62 +184,8 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, 0); #endif - // Populate scratch resource descriptor in amd_queue_. - SQ_BUF_RSRC_WORD0 srd0; - SQ_BUF_RSRC_WORD1 srd1; - SQ_BUF_RSRC_WORD2 srd2; - SQ_BUF_RSRC_WORD3 srd3; - uintptr_t scratch_base = uintptr_t(queue_scratch_.queue_base); - uint32_t scratch_base_hi = 0; - -#ifdef HSA_LARGE_MODEL - scratch_base_hi = uint32_t(scratch_base >> 32); -#endif - - srd0.bits.BASE_ADDRESS = uint32_t(scratch_base); - srd1.bits.BASE_ADDRESS_HI = scratch_base_hi; - srd1.bits.STRIDE = 0; - srd1.bits.CACHE_SWIZZLE = 0; - srd1.bits.SWIZZLE_ENABLE = 1; - srd2.bits.NUM_RECORDS = uint32_t(queue_scratch_.size); - srd3.bits.DST_SEL_X = SQ_SEL_X; - srd3.bits.DST_SEL_Y = SQ_SEL_Y; - srd3.bits.DST_SEL_Z = SQ_SEL_Z; - srd3.bits.DST_SEL_W = SQ_SEL_W; - srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT; - srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32; - srd3.bits.ELEMENT_SIZE = 1; // 4 - srd3.bits.INDEX_STRIDE = 3; // 64 - srd3.bits.ADD_TID_ENABLE = 1; - srd3.bits.ATC__CI__VI = (agent_profile_ == HSA_PROFILE_FULL) ? 1 : 0; - srd3.bits.HASH_ENABLE = 0; - srd3.bits.HEAP = 0; - srd3.bits.MTYPE__CI__VI = 0; - srd3.bits.TYPE = SQ_RSRC_BUF; - - amd_queue_.scratch_resource_descriptor[0] = srd0.u32All; - amd_queue_.scratch_resource_descriptor[1] = srd1.u32All; - amd_queue_.scratch_resource_descriptor[2] = srd2.u32All; - amd_queue_.scratch_resource_descriptor[3] = srd3.u32All; - - // Populate flat scratch parameters in amd_queue_. - amd_queue_.scratch_backing_memory_location = - queue_scratch_.queue_process_offset; - amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size; - amd_queue_.scratch_workitem_byte_size = - uint32_t(queue_scratch_.size_per_thread); - - // Set concurrent wavefront limits when scratch is being used. - COMPUTE_TMPRING_SIZE tmpring_size = {0}; - - if (queue_scratch_.size != 0) { - tmpring_size.bits.WAVES = - (queue_scratch_.size / queue_scratch_.size_per_thread / 64); - tmpring_size.bits.WAVESIZE = - (((64 * queue_scratch_.size_per_thread) + 1023) / 1024); - } - - amd_queue_.compute_tmpring_size = tmpring_size.u32All; + // Initialize scratch memory related entities + InitScratchSRD(); // Set group and private memory apertures in amd_queue_. 
auto& regions = agent->regions(); @@ -307,7 +250,7 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, } auto signal = new core::InterruptSignal(0, queue_event_); amd_queue_.queue_inactive_signal = core::InterruptSignal::Convert(signal); - if (hsa_amd_signal_async_handler( + if (AMD::hsa_amd_signal_async_handler( amd_queue_.queue_inactive_signal, HSA_SIGNAL_CONDITION_NE, 0, DynamicScratchHandler, this) != HSA_STATUS_SUCCESS) return; @@ -518,7 +461,7 @@ uint32_t AqlQueue::ComputeRingBufferMaxPkts() { } void AqlQueue::AllocRegisteredRingBuffer(uint32_t queue_size_pkts) { - if (agent_profile_ == HSA_PROFILE_FULL) { + if (agent_->profile() == HSA_PROFILE_FULL) { // Compute the physical and virtual size of the queue. uint32_t ring_buf_phys_size_bytes = uint32_t(queue_size_pkts * sizeof(core::AqlPacket)); @@ -696,7 +639,7 @@ void AqlQueue::AllocRegisteredRingBuffer(uint32_t queue_size_pkts) { } void AqlQueue::FreeRegisteredRingBuffer() { - if (agent_profile_ == HSA_PROFILE_FULL) { + if (agent_->profile() == HSA_PROFILE_FULL) { #ifdef __linux__ munmap(ring_buf_, ring_buf_alloc_bytes_); #endif @@ -755,37 +698,8 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) { return false; } - SQ_BUF_RSRC_WORD0 srd0; - SQ_BUF_RSRC_WORD2 srd2; - uintptr_t base = (uintptr_t)scratch.queue_base; - - srd0.u32All = queue->amd_queue_.scratch_resource_descriptor[0]; - srd2.u32All = queue->amd_queue_.scratch_resource_descriptor[2]; - - srd0.bits.BASE_ADDRESS = uint32_t(base); - srd2.bits.NUM_RECORDS = uint32_t(scratch.size); - - queue->amd_queue_.scratch_resource_descriptor[0] = srd0.u32All; - queue->amd_queue_.scratch_resource_descriptor[2] = srd2.u32All; - -#ifdef HSA_LARGE_MODEL - SQ_BUF_RSRC_WORD1 srd1; - srd1.u32All = queue->amd_queue_.scratch_resource_descriptor[1]; - srd1.bits.BASE_ADDRESS_HI = uint32_t(base >> 32); - queue->amd_queue_.scratch_resource_descriptor[1] = srd1.u32All; -#endif - - queue->amd_queue_.scratch_backing_memory_location = - scratch.queue_process_offset; - queue->amd_queue_.scratch_backing_memory_byte_size = scratch.size; - queue->amd_queue_.scratch_workitem_byte_size = - uint32_t(scratch.size_per_thread); - - COMPUTE_TMPRING_SIZE tmpring_size = {0}; - tmpring_size.bits.WAVES = (scratch.size / scratch.size_per_thread / 64); - tmpring_size.bits.WAVESIZE = - (((64 * scratch.size_per_thread) + 1023) / 1024); - queue->amd_queue_.compute_tmpring_size = tmpring_size.u32All; + // Reset scratch memory related entities for the queue + queue->InitScratchSRD(); } else if ((error_code & 2) == 2) { // Invalid dim queue->Inactivate(); @@ -853,4 +767,80 @@ hsa_status_t AqlQueue::SetCUMasking(const uint32_t num_cu_mask_count, reinterpret_cast<HSAuint32*>(const_cast<uint32_t*>(cu_mask))); return (HSAKMT_STATUS_SUCCESS == ret) ? 
HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; } + +// @brief Define the Scratch Buffer Descriptor and related parameters +// that enable kernel access scratch memory +void AqlQueue::InitScratchSRD() { + + // Populate scratch resource descriptor + SQ_BUF_RSRC_WORD0 srd0; + SQ_BUF_RSRC_WORD1 srd1; + SQ_BUF_RSRC_WORD2 srd2; + SQ_BUF_RSRC_WORD3 srd3; + + uint32_t scratch_base_hi = 0; + uintptr_t scratch_base = uintptr_t(queue_scratch_.queue_base); + #ifdef HSA_LARGE_MODEL + scratch_base_hi = uint32_t(scratch_base >> 32); + #endif + srd0.bits.BASE_ADDRESS = uint32_t(scratch_base); + + srd1.bits.BASE_ADDRESS_HI = scratch_base_hi; + srd1.bits.STRIDE = 0; + srd1.bits.CACHE_SWIZZLE = 0; + srd1.bits.SWIZZLE_ENABLE = 1; + + srd2.bits.NUM_RECORDS = uint32_t(queue_scratch_.size); + + srd3.bits.DST_SEL_X = SQ_SEL_X; + srd3.bits.DST_SEL_Y = SQ_SEL_Y; + srd3.bits.DST_SEL_Z = SQ_SEL_Z; + srd3.bits.DST_SEL_W = SQ_SEL_W; + srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT; + srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32; + srd3.bits.ELEMENT_SIZE = 1; // 4 + srd3.bits.INDEX_STRIDE = 3; // 64 + srd3.bits.ADD_TID_ENABLE = 1; + srd3.bits.ATC__CI__VI = (agent_->profile() == HSA_PROFILE_FULL); + srd3.bits.HASH_ENABLE = 0; + srd3.bits.HEAP = 0; + srd3.bits.MTYPE__CI__VI = 0; + srd3.bits.TYPE = SQ_RSRC_BUF; + + // Update Queue's Scratch descriptor's property + amd_queue_.scratch_resource_descriptor[0] = srd0.u32All; + amd_queue_.scratch_resource_descriptor[1] = srd1.u32All; + amd_queue_.scratch_resource_descriptor[2] = srd2.u32All; + amd_queue_.scratch_resource_descriptor[3] = srd3.u32All; + + // Populate flat scratch parameters in amd_queue_. + amd_queue_.scratch_backing_memory_location = + queue_scratch_.queue_process_offset; + amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size; + amd_queue_.scratch_workitem_byte_size = + uint32_t(queue_scratch_.size_per_thread); + + // Set concurrent wavefront limits only when scratch is being used. 
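The WAVES/WAVESIZE programming that follows packs two quantities into COMPUTE_TMPRING_SIZE: the per-wave scratch allocation in 1 KiB granules and the number of waves the backing allocation can hold, capped by the device's scratch slots. The same arithmetic, restated as a standalone helper with invented names:

    #include <algorithm>
    #include <cstdint>

    uint32_t ScratchWaveLimit(uint64_t scratch_bytes, uint64_t bytes_per_thread,
                              uint32_t wave_front_size, uint32_t num_cus,
                              uint32_t max_slots_per_cu) {
      if (scratch_bytes == 0 || bytes_per_thread == 0) return 0;
      // Per-wave scratch, rounded up to the 1 KiB granule the register expects.
      uint64_t wave_size_kb =
          ((wave_front_size * bytes_per_thread) + 1023) / 1024;
      // Waves the scratch backing can hold, capped by slots across all CUs.
      uint64_t num_waves = scratch_bytes / (wave_size_kb * 1024);
      return uint32_t(
          std::min<uint64_t>(num_waves, uint64_t(num_cus) * max_slots_per_cu));
    }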
+ COMPUTE_TMPRING_SIZE tmpring_size = {0}; + if (queue_scratch_.size == 0) { + amd_queue_.compute_tmpring_size = tmpring_size.u32All; + return; + } + + // Determine the maximum number of waves device can support + const auto& agent_props = agent_->properties(); + uint32_t num_cus = agent_props.NumFComputeCores / agent_props.NumSIMDPerCU; + uint32_t max_scratch_waves = num_cus * agent_props.MaxSlotsScratchCU; + + // Scratch is allocated program COMPUTE_TMPRING_SIZE register + // Scratch Size per Wave is specified in terms of kilobytes + uint32_t wave_size = agent_props.WaveFrontSize; + tmpring_size.bits.WAVESIZE = + (((wave_size * queue_scratch_.size_per_thread) + 1023) / 1024); + uint32_t num_waves = (queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024)); + tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves); + amd_queue_.compute_tmpring_size = tmpring_size.u32All; + + return; +} } // namespace amd diff --git a/src/core/runtime/amd_blit_kernel.cpp b/src/core/runtime/amd_blit_kernel.cpp index a05aef536..aff60f08b 100644 --- a/src/core/runtime/amd_blit_kernel.cpp +++ b/src/core/runtime/amd_blit_kernel.cpp @@ -43,40 +43,486 @@ #include "core/inc/amd_blit_kernel.h" #include <algorithm> -#include <climits> -#include <cmath> -#include <cstring> - -#if defined(_WIN32) || defined(_WIN64) -#define NOMINMAX -#include <windows.h> -#else -#include <sys/mman.h> -#endif - -#include "core/inc/amd_blit_kernel_kv.h" -#include "core/inc/amd_blit_kernel_vi.h" +#include <sstream> +#include <string> + #include "core/inc/amd_gpu_agent.h" #include "core/inc/hsa_internal.h" #include "core/util/utils.h" namespace amd { -const uint32_t BlitKernel::kGroupSize = 256; -const size_t BlitKernel::kMaxCopyCount = AlignDown(UINT32_MAX, kGroupSize); -const size_t BlitKernel::kMaxFillCount = AlignDown(UINT32_MAX, kGroupSize); - static const uint16_t kInvalidPacketHeader = HSA_PACKET_TYPE_INVALID; +static std::string kBlitKernelSource(R"( + // Compatibility function for GFXIP 7. + + function s_load_dword_offset(byte_offset) + if kGFXIPVersion == 7 + return byte_offset / 4 + else + return byte_offset + end + end + + // Memory copy for all cases except: + // (src_addr & 0x3) != (dst_addr & 0x3) + // + // Kernel argument buffer: + // [DW 0, 1] Phase 1 src start address + // [DW 2, 3] Phase 1 dst start address + // [DW 4, 5] Phase 2 src start address + // [DW 6, 7] Phase 2 dst start address + // [DW 8, 9] Phase 3 src start address + // [DW 10,11] Phase 3 dst start address + // [DW 12,13] Phase 4 src start address + // [DW 14,15] Phase 4 dst start address + // [DW 16,17] Phase 4 src end address + // [DW 18,19] Phase 4 dst end address + // [DW 20 ] Total number of workitems + + var kCopyAlignedVecWidth = 4 + var kCopyAlignedUnroll = 1 + + shader CopyAligned + type(CS) + user_sgpr_count(2) + sgpr_count(32) + vgpr_count(8 + (kCopyAlignedUnroll * kCopyAlignedVecWidth)) + + // Retrieve kernel arguments. + s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0) + s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10) + s_load_dwordx4 s[12:15], s[0:1], s_load_dword_offset(0x20) + s_load_dwordx4 s[16:19], s[0:1], s_load_dword_offset(0x30) + s_load_dwordx4 s[20:23], s[0:1], s_load_dword_offset(0x40) + s_load_dword s24, s[0:1], s_load_dword_offset(0x50) + s_waitcnt lgkmcnt(0) + + // Compute workitem id. + s_lshl_b32 s2, s2, 0x6 + v_add_u32 v0, vcc, s2, v0 + + // ===================================================== + // Phase 1: Byte copy up to 0x100 destination alignment. 
+ // ===================================================== + + // Compute phase source address. + v_mov_b32 v3, s5 + v_add_u32 v2, vcc, v0, s4 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s7 + v_add_u32 v4, vcc, v0, s6 + v_addc_u32 v5, vcc, v5, 0x0, vcc + + L_COPY_ALIGNED_PHASE_1_LOOP: + // Mask off lanes (or branch out) after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[8:9] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_1_DONE + s_and_b64 exec, exec, vcc + + // Load from/advance the source address. + flat_load_ubyte v1, v[2:3] + s_waitcnt vmcnt(0) + v_add_u32 v2, vcc, v2, s24 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + // Write to/advance the destination address. + flat_store_byte v[4:5], v1 + v_add_u32 v4, vcc, v4, s24 + v_addc_u32 v5, vcc, v5, 0x0, vcc + + // Repeat until branched out. + s_branch L_COPY_ALIGNED_PHASE_1_LOOP + + L_COPY_ALIGNED_PHASE_1_DONE: + // Restore EXEC mask for all lanes. + s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF + + // ======================================================== + // Phase 2: Unrolled dword[x4] copy up to last whole block. + // ======================================================== + + // Compute unrolled dword[x4] stride across all threads. + if kCopyAlignedVecWidth == 4 + s_lshl_b32 s25, s24, 0x4 + else + s_lshl_b32 s25, s24, 0x2 + end + + // Compute phase source address. + if kCopyAlignedVecWidth == 4 + v_lshlrev_b32 v1, 0x4, v0 + else + v_lshlrev_b32 v1, 0x2, v0 + end + + v_mov_b32 v3, s9 + v_add_u32 v2, vcc, v1, s8 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s11 + v_add_u32 v4, vcc, v1, s10 + v_addc_u32 v5, vcc, v5, 0x0, vcc + + L_COPY_ALIGNED_PHASE_2_LOOP: + // Branch out after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[12:13] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_2_DONE + + // Load from/advance the source address. + for var i = 0; i < kCopyAlignedUnroll; i ++ + if kCopyAlignedVecWidth == 4 + flat_load_dwordx4 v[8 + (i * 4)], v[2:3] + else + flat_load_dword v[8 + i], v[2:3] + end + + v_add_u32 v2, vcc, v2, s25 + v_addc_u32 v3, vcc, v3, 0x0, vcc + end + + // Write to/advance the destination address. + s_waitcnt vmcnt(0) + + for var i = 0; i < kCopyAlignedUnroll; i ++ + if kCopyAlignedVecWidth == 4 + flat_store_dwordx4 v[4:5], v[8 + (i * 4)] + else + flat_store_dword v[4:5], v[8 + i] + end + + v_add_u32 v4, vcc, v4, s25 + v_addc_u32 v5, vcc, v5, 0x0, vcc + end + + // Repeat until branched out. + s_branch L_COPY_ALIGNED_PHASE_2_LOOP + + L_COPY_ALIGNED_PHASE_2_DONE: + + // =========================================== + // Phase 3: Dword copy up to last whole dword. + // =========================================== + + // Compute dword stride across all threads. + s_lshl_b32 s25, s24, 0x2 + + // Compute phase source address. + v_lshlrev_b32 v1, 0x2, v0 + v_mov_b32 v3, s13 + v_add_u32 v2, vcc, v1, s12 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s15 + v_add_u32 v4, vcc, v1, s14 + v_addc_u32 v5, vcc, v5, 0x0, vcc + + L_COPY_ALIGNED_PHASE_3_LOOP: + // Mask off lanes (or branch out) after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[16:17] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_3_DONE + s_and_b64 exec, exec, vcc + + // Load from/advance the source address. + flat_load_dword v1, v[2:3] + v_add_u32 v2, vcc, v2, s25 + v_addc_u32 v3, vcc, v3, 0x0, vcc + s_waitcnt vmcnt(0) + + // Write to/advance the destination address. 
+ flat_store_dword v[4:5], v1 + v_add_u32 v4, vcc, v4, s25 + v_addc_u32 v5, vcc, v5, 0x0, vcc + + // Repeat until branched out. + s_branch L_COPY_ALIGNED_PHASE_3_LOOP + + L_COPY_ALIGNED_PHASE_3_DONE: + // Restore EXEC mask for all lanes. + s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF + + // ============================= + // Phase 4: Byte copy up to end. + // ============================= + + // Compute phase source address. + v_mov_b32 v3, s17 + v_add_u32 v2, vcc, v0, s16 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s19 + v_add_u32 v4, vcc, v0, s18 + v_addc_u32 v5, vcc, v5, 0x0, vcc + + // Mask off lanes (or branch out) after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[20:21] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_4_DONE + s_and_b64 exec, exec, vcc + + // Load from the source address. + flat_load_ubyte v1, v[2:3] + s_waitcnt vmcnt(0) + + // Write to the destination address. + flat_store_byte v[4:5], v1 + + L_COPY_ALIGNED_PHASE_4_DONE: + s_endpgm + end + + // Memory copy for this case: + // (src_addr & 0x3) != (dst_addr & 0x3) + // + // Kernel argument buffer: + // [DW 0, 1] Phase 1 src start address + // [DW 2, 3] Phase 1 dst start address + // [DW 4, 5] Phase 2 src start address + // [DW 6, 7] Phase 2 dst start address + // [DW 8, 9] Phase 2 src end address + // [DW 10,11] Phase 2 dst end address + // [DW 12 ] Total number of workitems + + var kCopyMisalignedUnroll = 4 + + shader CopyMisaligned + type(CS) + user_sgpr_count(2) + sgpr_count(23) + vgpr_count(6 + kCopyMisalignedUnroll) + + // Retrieve kernel arguments. + s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0) + s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10) + s_load_dwordx4 s[12:15], s[0:1], s_load_dword_offset(0x20) + s_load_dword s16, s[0:1], s_load_dword_offset(0x30) + s_waitcnt lgkmcnt(0) + + // Compute workitem id. + s_lshl_b32 s2, s2, 0x6 + v_add_u32 v0, vcc, s2, v0 + + // =================================================== + // Phase 1: Unrolled byte copy up to last whole block. + // =================================================== + + // Compute phase source address. + v_mov_b32 v3, s5 + v_add_u32 v2, vcc, v0, s4 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s7 + v_add_u32 v4, vcc, v0, s6 + v_addc_u32 v5, vcc, v5, 0x0, vcc + + L_COPY_MISALIGNED_PHASE_1_LOOP: + // Branch out after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[8:9] + s_cbranch_vccz L_COPY_MISALIGNED_PHASE_1_DONE + + // Load from/advance the source address. + for var i = 0; i < kCopyMisalignedUnroll; i ++ + flat_load_ubyte v[6 + i], v[2:3] + v_add_u32 v2, vcc, v2, s16 + v_addc_u32 v3, vcc, v3, 0x0, vcc + end + + // Write to/advance the destination address. + s_waitcnt vmcnt(0) + + for var i = 0; i < kCopyMisalignedUnroll; i ++ + flat_store_byte v[4:5], v[6 + i] + v_add_u32 v4, vcc, v4, s16 + v_addc_u32 v5, vcc, v5, 0x0, vcc + end + + // Repeat until branched out. + s_branch L_COPY_MISALIGNED_PHASE_1_LOOP + + L_COPY_MISALIGNED_PHASE_1_DONE: + + // ============================= + // Phase 2: Byte copy up to end. + // ============================= + + // Compute phase source address. + v_mov_b32 v3, s9 + v_add_u32 v2, vcc, v0, s8 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s11 + v_add_u32 v4, vcc, v0, s10 + v_addc_u32 v5, vcc, v5, 0x0, vcc + + L_COPY_MISALIGNED_PHASE_2_LOOP: + // Mask off lanes (or branch out) after phase end. 
+ v_cmp_lt_u64 vcc, v[2:3], s[12:13] + s_cbranch_vccz L_COPY_MISALIGNED_PHASE_2_DONE + s_and_b64 exec, exec, vcc + + // Load from/advance the source address. + flat_load_ubyte v1, v[2:3] + v_add_u32 v2, vcc, v2, s16 + v_addc_u32 v3, vcc, v3, 0x0, vcc + s_waitcnt vmcnt(0) + + // Write to/advance the destination address. + flat_store_byte v[4:5], v1 + v_add_u32 v4, vcc, v4, s16 + v_addc_u32 v5, vcc, v5, 0x0, vcc + + // Repeat until branched out. + s_branch L_COPY_MISALIGNED_PHASE_2_LOOP + + L_COPY_MISALIGNED_PHASE_2_DONE: + s_endpgm + end + + // Memory fill for dword-aligned region. + // + // Kernel argument buffer: + // [DW 0, 1] Phase 1 dst start address + // [DW 2, 3] Phase 2 dst start address + // [DW 4, 5] Phase 2 dst end address + // [DW 6 ] Value to fill memory with + // [DW 7 ] Total number of workitems + + var kFillVecWidth = 4 + var kFillUnroll = 1 + + shader Fill + type(CS) + user_sgpr_count(2) + sgpr_count(19) + vgpr_count(8) + + // Retrieve kernel arguments. + s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0) + s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10) + s_waitcnt lgkmcnt(0) + + // Compute workitem id. + s_lshl_b32 s2, s2, 0x6 + v_add_u32 v0, vcc, s2, v0 + + // Copy fill pattern into VGPRs. + for var i = 0; i < kFillVecWidth; i ++ + v_mov_b32 v[4 + i], s10 + end + + // ======================================================== + // Phase 1: Unrolled dword[x4] fill up to last whole block. + // ======================================================== + + // Compute unrolled dword[x4] stride across all threads. + if kFillVecWidth == 4 + s_lshl_b32 s12, s11, 0x4 + else + s_lshl_b32 s12, s11, 0x2 + end + + // Compute phase destination address. + if kFillVecWidth == 4 + v_lshlrev_b32 v1, 0x4, v0 + else + v_lshlrev_b32 v1, 0x2, v0 + end + + v_mov_b32 v3, s5 + v_add_u32 v2, vcc, v1, s4 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + L_FILL_PHASE_1_LOOP: + // Branch out after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[6:7] + s_cbranch_vccz L_FILL_PHASE_1_DONE + + // Write to/advance the destination address. + for var i = 0; i < kFillUnroll; i ++ + if kFillVecWidth == 4 + flat_store_dwordx4 v[2:3], v[4:7] + else + flat_store_dword v[2:3], v4 + end + + v_add_u32 v2, vcc, v2, s12 + v_addc_u32 v3, vcc, v3, 0x0, vcc + end + + // Repeat until branched out. + s_branch L_FILL_PHASE_1_LOOP + + L_FILL_PHASE_1_DONE: + + // ============================== + // Phase 2: Dword fill up to end. + // ============================== + + // Compute dword stride across all threads. + s_lshl_b32 s12, s11, 0x2 + + // Compute phase destination address. + v_lshlrev_b32 v1, 0x2, v0 + v_mov_b32 v3, s7 + v_add_u32 v2, vcc, v1, s6 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + L_FILL_PHASE_2_LOOP: + // Mask off lanes (or branch out) after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[8:9] + s_cbranch_vccz L_FILL_PHASE_2_DONE + s_and_b64 exec, exec, vcc + + // Write to/advance the destination address. + flat_store_dword v[2:3], v4 + v_add_u32 v2, vcc, v2, s12 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + // Repeat until branched out. + s_branch L_FILL_PHASE_2_LOOP + + L_FILL_PHASE_2_DONE: + s_endpgm + end +)"); + +// Search kernel source for variable definition and return value. 
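The Fill shader's argument layout documented above maps onto a packed host-side struct; the field names below mirror the ones SubmitLinearFillCommand uses later in this patch, but the struct itself is only an illustration (the runtime packs these through its own KernelArgs union):

    #include <cstdint>

    struct FillArgs {
      uint64_t phase1_dst_start;   // DW 0,1
      uint64_t phase2_dst_start;   // DW 2,3
      uint64_t phase2_dst_end;     // DW 4,5
      uint32_t fill_value;         // DW 6
      uint32_t num_workitems;      // DW 7
    };
    static_assert(sizeof(FillArgs) == 8 * sizeof(uint32_t),
                  "kernarg buffer is eight dwords");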
+int GetKernelSourceParam(const char* paramName) { + std::stringstream paramDef; + paramDef << "var " << paramName << " = "; + + std::string::size_type paramDefLoc = kBlitKernelSource.find(paramDef.str()); + assert(paramDefLoc != std::string::npos); + std::string::size_type paramValLoc = paramDefLoc + paramDef.str().size(); + std::string::size_type paramEndLoc = + kBlitKernelSource.find('\n', paramDefLoc); + assert(paramDefLoc != std::string::npos); + + std::string paramVal(&kBlitKernelSource[paramValLoc], + &kBlitKernelSource[paramEndLoc]); + return std::stoi(paramVal); +} + +static int kCopyAlignedVecWidth = GetKernelSourceParam("kCopyAlignedVecWidth"); +static int kCopyAlignedUnroll = GetKernelSourceParam("kCopyAlignedUnroll"); +static int kCopyMisalignedUnroll = GetKernelSourceParam("kCopyMisalignedUnroll"); +static int kFillVecWidth = GetKernelSourceParam("kFillVecWidth"); +static int kFillUnroll = GetKernelSourceParam("kFillUnroll"); + BlitKernel::BlitKernel() : core::Blit(), - copy_code_handle_(0), - fill_code_handle_(0), queue_(NULL), cached_index_(0), kernarg_async_(NULL), kernarg_async_mask_(0), kernarg_async_counter_(0), - code_arg_buffer_(NULL) { + num_cus_(0) { completion_signal_.handle = 0; } @@ -96,150 +542,62 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) { return HSA_STATUS_ERROR; } - // Need queue buffer that can cover the max size of local memory. - const uint64_t kGpuVmVaSize = 1ULL << 40; - const uint32_t kRequiredQueueSize = NextPow2(static_cast<uint32_t>( - std::ceil(static_cast<double>(kGpuVmVaSize) / kMaxCopyCount))); - - uint32_t max_queue_size = 0; - status = HSA::hsa_agent_get_info(agent_handle, HSA_AGENT_INFO_QUEUE_MAX_SIZE, - &max_queue_size); + status = HSA::hsa_queue_create(agent_handle, 1024, HSA_QUEUE_TYPE_MULTI, NULL, + NULL, 0, 0, &queue_); if (HSA_STATUS_SUCCESS != status) { return status; } - if (max_queue_size < kRequiredQueueSize) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } + queue_bitmask_ = queue_->size - 1; - status = - HSA::hsa_queue_create(agent_handle, kRequiredQueueSize, - HSA_QUEUE_TYPE_MULTI, NULL, NULL, 0, 0, &queue_); + cached_index_ = 0; + status = HSA::hsa_signal_create(1, 0, NULL, &completion_signal_); if (HSA_STATUS_SUCCESS != status) { return status; } - queue_bitmask_ = queue_->size - 1; - - cached_index_ = 0; - - void* copy_raw_obj_mem = NULL; - size_t copy_akc_size = 0; - size_t copy_akc_offset = 0; - - void* copy_aligned_raw_obj_mem = NULL; - size_t copy_aligned_akc_size = 0; - size_t copy_aligned_akc_offset = 0; - - void* fill_raw_obj_mem = NULL; - size_t fill_akc_size = 0; - size_t fill_akc_offset = 0; - - switch (agent.isa()->GetMajorVersion()) { - case 7: - copy_raw_obj_mem = kVectorCopyKvObject; - copy_akc_size = HSA_VECTOR_COPY_KV_AKC_SIZE; - copy_akc_offset = HSA_VECTOR_COPY_KV_AKC_OFFSET; + kernarg_async_ = reinterpret_cast<KernelArgs*>( + core::Runtime::runtime_singleton_->system_allocator()( + queue_->size * AlignUp(sizeof(KernelArgs), 16), 16)); - copy_aligned_raw_obj_mem = kVectorCopyAlignedKvObject; - copy_aligned_akc_size = HSA_VECTOR_COPY_ALIGNED_KV_AKC_SIZE; - copy_aligned_akc_offset = HSA_VECTOR_COPY_ALIGNED_KV_AKC_OFFSET; + kernarg_async_mask_ = queue_->size - 1; - fill_raw_obj_mem = kFillMemoryKvObject; - fill_akc_size = HSA_FILL_MEMORY_KV_AKC_SIZE; - fill_akc_offset = HSA_FILL_MEMORY_KV_AKC_OFFSET; - break; - case 8: - copy_raw_obj_mem = kVectorCopyViObject; - copy_akc_size = HSA_VECTOR_COPY_VI_AKC_SIZE; - copy_akc_offset = HSA_VECTOR_COPY_VI_AKC_OFFSET; - - copy_aligned_raw_obj_mem = 
kVectorCopyAlignedViObject; - copy_aligned_akc_size = HSA_VECTOR_COPY_ALIGNED_VI_AKC_SIZE; - copy_aligned_akc_offset = HSA_VECTOR_COPY_ALIGNED_VI_AKC_OFFSET; - - fill_raw_obj_mem = kFillMemoryViObject; - fill_akc_size = HSA_FILL_MEMORY_VI_AKC_SIZE; - fill_akc_offset = HSA_FILL_MEMORY_VI_AKC_OFFSET; - break; - default: - assert(false && "Only gfx7 and gfx8 are supported"); - break; - } + // Obtain the number of compute units in the underlying agent. + const GpuAgent& gpuAgent = static_cast<const GpuAgent&>(agent); + num_cus_ = gpuAgent.properties().NumFComputeCores / 4; - const size_t total_alloc_size = AlignUp( - AlignUp(copy_akc_size, 256) + AlignUp(copy_aligned_akc_size, 256) + - AlignUp(fill_akc_size, 256), - 4096); - - amd_kernel_code_t *code_ptr = nullptr; - code_arg_buffer_ = core::Runtime::runtime_singleton_->system_allocator()( - total_alloc_size, 4096); - - char* akc_arg = reinterpret_cast<char*>(code_arg_buffer_); - memcpy(akc_arg, - reinterpret_cast<const char*>(copy_raw_obj_mem) + copy_akc_offset, - copy_akc_size); - copy_code_handle_ = reinterpret_cast<uint64_t>(akc_arg); - code_ptr = (amd_kernel_code_t*)(copy_code_handle_); - code_ptr->runtime_loader_kernel_symbol = 0; - akc_arg += copy_akc_size; - - akc_arg = AlignUp(akc_arg, 256); - memcpy(akc_arg, reinterpret_cast<const char*>(copy_aligned_raw_obj_mem) + - copy_aligned_akc_offset, - copy_aligned_akc_size); - copy_aligned_code_handle_ = reinterpret_cast<uint64_t>(akc_arg); - code_ptr = (amd_kernel_code_t*)(copy_aligned_code_handle_); - code_ptr->runtime_loader_kernel_symbol = 0; - akc_arg += copy_aligned_akc_size; - - akc_arg = AlignUp(akc_arg, 256); - memcpy(akc_arg, - reinterpret_cast<const char*>(fill_raw_obj_mem) + fill_akc_offset, - fill_akc_size); - fill_code_handle_ = reinterpret_cast<uint64_t>(akc_arg); - code_ptr = (amd_kernel_code_t*)(fill_code_handle_); - code_ptr->runtime_loader_kernel_symbol = 0; - akc_arg += fill_akc_size; + // Assemble shaders to AQL code objects. + std::map<KernelType, const char*> kernel_names = { + {KernelType::CopyAligned, "CopyAligned"}, + {KernelType::CopyMisaligned, "CopyMisaligned"}, + {KernelType::Fill, "Fill"}}; - status = HSA::hsa_signal_create(1, 0, NULL, &completion_signal_); - if (HSA_STATUS_SUCCESS != status) { - return status; + for (auto kernel_name : kernel_names) { + KernelCode& kernel = kernels_[kernel_name.first]; + gpuAgent.AssembleShader(kBlitKernelSource.c_str(), kernel_name.second, + GpuAgent::AssembleTarget::AQL, kernel.code_buf_, + kernel.code_buf_size_); } - kernarg_async_ = reinterpret_cast<KernelArgs*>( - core::Runtime::runtime_singleton_->system_allocator()( - kRequiredQueueSize * AlignUp(sizeof(KernelArgs), 16), 16)); - - kernarg_async_mask_ = kRequiredQueueSize - 1; - - // TODO: remove this code when execute permission level is not mandatory. 
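Both queue_bitmask_ and kernarg_async_mask_ above rely on the queue size being a power of two, so masking with (size - 1) turns a monotonically increasing write index into a slot without a modulo. A one-line restatement of that identity:

    #include <cstdint>

    inline uint64_t QueueSlot(uint64_t write_index, uint64_t queue_size) {
      return write_index & (queue_size - 1);   // == write_index % queue_size
    }                                          // when queue_size is 2^n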
- if (((amd::GpuAgent&)agent).profile() == HSA_PROFILE_FULL) { -#if defined(_WIN32) || defined(_WIN64) -#define NOMINMAX - DWORD old_protect = 0; - const DWORD new_protect = PAGE_EXECUTE_READWRITE; - if (!VirtualProtect(code_arg_buffer_, total_alloc_size, new_protect, - &old_protect)) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } -#else - if (0 != mprotect(code_arg_buffer_, total_alloc_size, - PROT_READ | PROT_WRITE | PROT_EXEC)) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } -#endif + if (agent.profiling_enabled()) { + return EnableProfiling(true); } return HSA_STATUS_SUCCESS; } -hsa_status_t BlitKernel::Destroy(void) { +hsa_status_t BlitKernel::Destroy(const core::Agent& agent) { std::lock_guard<std::mutex> guard(lock_); + const GpuAgent& gpuAgent = static_cast<const GpuAgent&>(agent); + + for (auto kernel_pair : kernels_) { + gpuAgent.ReleaseShader(kernel_pair.second.code_buf_, + kernel_pair.second.code_buf_size_); + } + if (queue_ != NULL) { HSA::hsa_queue_destroy(queue_); } @@ -248,10 +606,6 @@ hsa_status_t BlitKernel::Destroy(void) { core::Runtime::runtime_singleton_->system_deallocator()(kernarg_async_); } - if (code_arg_buffer_ != NULL) { - core::Runtime::runtime_singleton_->system_deallocator()(code_arg_buffer_); - } - if (completion_signal_.handle != 0) { HSA::hsa_signal_destroy(completion_signal_); } @@ -259,11 +613,6 @@ hsa_status_t BlitKernel::Destroy(void) { return HSA_STATUS_SUCCESS; } -static bool IsSystemMemory(void* address) { - static const uint64_t kLimitSystem = 1ULL << 48; - return (reinterpret_cast<uint64_t>(address) < kLimitSystem); -} - hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src, size_t size) { // Protect completion_signal_. @@ -294,35 +643,14 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src, hsa_status_t BlitKernel::SubmitLinearCopyCommand( void* dst, const void* src, size_t size, std::vector<core::Signal*>& dep_signals, core::Signal& out_signal) { - assert(copy_code_handle_ != 0); - - const size_t kAlignmentChar = 1; - const size_t kAlignmentUin32 = 4; - const size_t kAlignmentVec4 = 16; - const size_t copy_granule = - (IsMultipleOf(dst, kAlignmentVec4) && IsMultipleOf(src, kAlignmentVec4) && - IsMultipleOf(size, kAlignmentVec4)) - ? kAlignmentVec4 - : (IsMultipleOf(dst, kAlignmentUin32) && - IsMultipleOf(src, kAlignmentUin32) && - IsMultipleOf(size, kAlignmentUin32)) - ? kAlignmentUin32 - : kAlignmentChar; - - size = size / copy_granule; - - const uint32_t num_copy_packet = static_cast<uint32_t>( - std::ceil(static_cast<double>(size) / kMaxCopyCount)); - - const uint32_t num_barrier_packet = - static_cast<uint32_t>(std::ceil(dep_signals.size() / 5.0f)); - - // Reserve write index for copy + fence packet. - const uint32_t total_num_packet = num_barrier_packet + num_copy_packet; + // Reserve write index for barrier(s) + dispatch packet. + const uint32_t num_barrier_packet = uint32_t((dep_signals.size() + 4) / 5); + const uint32_t total_num_packet = num_barrier_packet + 1; uint64_t write_index = AcquireWriteIndex(total_num_packet); uint64_t write_index_temp = write_index; + // Insert barrier packets to handle dependent signals. const uint16_t kBarrierPacketHeader = (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | @@ -352,99 +680,116 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand( } } - const uint32_t last_copy_index = num_copy_packet - 1; - size_t total_copy_count = 0; - for (uint32_t i = 0; i < num_copy_packet; ++i) { - // Setup arguments. 
- const uint32_t copy_count = static_cast<uint32_t>( - std::min((size - total_copy_count), kMaxCopyCount)); - - void* cur_dst = static_cast<char*>(dst) + (total_copy_count * copy_granule); - const void* cur_src = - static_cast<const char*>(src) + (total_copy_count * copy_granule); - - KernelArgs* args = ObtainAsyncKernelCopyArg(); - assert(args != NULL); - assert(IsMultipleOf(&args->copy, 16)); - - args->copy.src = cur_src; - args->copy.dst = cur_dst; - args->copy.size = copy_count; - args->copy.use_vector = (copy_granule == kAlignmentVec4) ? 1 : 0; - - const uint32_t grid_size_x = - AlignUp(static_cast<uint32_t>(copy_count), kGroupSize); - - // This assert to make sure kMaxCopySize is not changed to a number that - // could cause overflow to packet.grid_size_x. - assert(grid_size_x >= copy_count); - - hsa_signal_t signal = {(i == last_copy_index) - ? (core::Signal::Convert(&out_signal)).handle - : 0}; - PopulateQueue(write_index, ((copy_granule == kAlignmentChar) - ? copy_code_handle_ - : copy_aligned_code_handle_), - args, grid_size_x, signal); - - ++write_index; - - total_copy_count += copy_count; + // Insert dispatch packet for copy kernel. + KernelArgs* args = ObtainAsyncKernelCopyArg(); + KernelCode* kernel_code = nullptr; + int num_workitems = 0; + + bool aligned = ((uintptr_t(src) & 0x3) == (uintptr_t(dst) & 0x3)); + + if (aligned) { + // Use dword-based aligned kernel. + kernel_code = &kernels_[KernelType::CopyAligned]; + + // Compute the size of each copy phase. + num_workitems = 64 * 4 * num_cus_; + + // Phase 1 (byte copy) ends when destination is 0x100-aligned. + uintptr_t src_start = uintptr_t(src); + uintptr_t dst_start = uintptr_t(dst); + uint64_t phase1_size = + std::min(size, uint64_t(0x100 - (dst_start & 0xFF)) & 0xFF); + + // Phase 2 (unrolled dwordx4 copy) ends when last whole block fits. + uint64_t phase2_block = num_workitems * sizeof(uint32_t) * + kCopyAlignedUnroll * kCopyAlignedVecWidth; + uint64_t phase2_size = ((size - phase1_size) / phase2_block) * phase2_block; + + // Phase 3 (dword copy) ends when last whole dword fits. + uint64_t phase3_size = + ((size - phase1_size - phase2_size) / sizeof(uint32_t)) * + sizeof(uint32_t); + + args->copy_aligned.phase1_src_start = src_start; + args->copy_aligned.phase1_dst_start = dst_start; + args->copy_aligned.phase2_src_start = src_start + phase1_size; + args->copy_aligned.phase2_dst_start = dst_start + phase1_size; + args->copy_aligned.phase3_src_start = src_start + phase1_size + phase2_size; + args->copy_aligned.phase3_dst_start = dst_start + phase1_size + phase2_size; + args->copy_aligned.phase4_src_start = + src_start + phase1_size + phase2_size + phase3_size; + args->copy_aligned.phase4_dst_start = + dst_start + phase1_size + phase2_size + phase3_size; + args->copy_aligned.phase4_src_end = src_start + size; + args->copy_aligned.phase4_dst_end = dst_start + size; + args->copy_aligned.num_workitems = num_workitems; + } else { + // Use byte-based misaligned kernel. + kernel_code = &kernels_[KernelType::CopyMisaligned]; + + // Compute the size of each copy phase. + num_workitems = 64 * 4 * num_cus_; + + // Phase 1 (unrolled byte copy) ends when last whole block fits. 
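The aligned path above slices the copy into four phases keyed off destination alignment; the boundary arithmetic can be restated as a small standalone helper (invented names, assuming num_workitems, unroll and vec_width are nonzero). The misaligned path that follows applies the same whole-block idea to bytes instead of dwords:

    #include <algorithm>
    #include <cstdint>

    struct AlignedCopyPhases {
      uint64_t phase1;  // byte copy until dst reaches 0x100 alignment
      uint64_t phase2;  // whole unrolled dwordx4 blocks
      uint64_t phase3;  // whole remaining dwords
      uint64_t phase4;  // trailing bytes
    };

    AlignedCopyPhases SplitAlignedCopy(uintptr_t dst, uint64_t size,
                                       uint64_t num_workitems, uint64_t unroll,
                                       uint64_t vec_width) {
      AlignedCopyPhases p;
      p.phase1 = std::min<uint64_t>(size, (0x100 - (uint64_t(dst) & 0xFF)) & 0xFF);
      const uint64_t block = num_workitems * sizeof(uint32_t) * unroll * vec_width;
      p.phase2 = ((size - p.phase1) / block) * block;
      p.phase3 =
          ((size - p.phase1 - p.phase2) / sizeof(uint32_t)) * sizeof(uint32_t);
      p.phase4 = size - p.phase1 - p.phase2 - p.phase3;
      return p;
    }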
+ uintptr_t src_start = uintptr_t(src); + uintptr_t dst_start = uintptr_t(dst); + uint64_t phase1_block = + num_workitems * sizeof(uint8_t) * kCopyMisalignedUnroll; + uint64_t phase1_size = (size / phase1_block) * phase1_block; + + args->copy_misaligned.phase1_src_start = src_start; + args->copy_misaligned.phase1_dst_start = dst_start; + args->copy_misaligned.phase2_src_start = src_start + phase1_size; + args->copy_misaligned.phase2_dst_start = dst_start + phase1_size; + args->copy_misaligned.phase2_src_end = src_start + size; + args->copy_misaligned.phase2_dst_end = dst_start + size; + args->copy_misaligned.num_workitems = num_workitems; } - // Launch copy packet. + hsa_signal_t signal = {(core::Signal::Convert(&out_signal)).handle}; + PopulateQueue(write_index, uintptr_t(kernel_code->code_buf_), args, + num_workitems, signal); + + // Submit barrier(s) and dispatch packets. ReleaseWriteIndex(write_index_temp, total_num_packet); return HSA_STATUS_SUCCESS; } hsa_status_t BlitKernel::SubmitLinearFillCommand(void* ptr, uint32_t value, - size_t num) { - assert(fill_code_handle_ != 0); - + size_t count) { std::lock_guard<std::mutex> guard(lock_); - HSA::hsa_signal_store_relaxed(completion_signal_, 1); - - const uint32_t num_fill_packet = static_cast<uint32_t>( - std::ceil(static_cast<double>(num) / kMaxFillCount)); - - // Reserve write index for copy + fence packet. - uint64_t write_index = AcquireWriteIndex(num_fill_packet); - - const uint32_t last_fill_index = num_fill_packet - 1; - size_t total_fill_count = 0; - for (uint32_t i = 0; i < num_fill_packet; ++i) { - // Setup arguments. - const uint32_t fill_count = static_cast<uint32_t>( - std::min((num - total_fill_count), kMaxFillCount)); - void* cur_ptr = static_cast<char*>(ptr) + total_fill_count; - - KernelArgs* args = ObtainAsyncKernelCopyArg(); - assert(args != NULL); - assert(IsMultipleOf(&args->fill, 16)); + // Reject misaligned base address. + if ((uintptr_t(ptr) & 0x3) != 0) { + return HSA_STATUS_ERROR; + } - args->fill.ptr = cur_ptr; - args->fill.num = fill_count; - args->fill.value = value; + // Compute the size of each fill phase. + int num_workitems = 64 * num_cus_; - const uint32_t grid_size_x = - AlignUp(static_cast<uint32_t>(fill_count), kGroupSize); + // Phase 1 (unrolled dwordx4 copy) ends when last whole block fits. + uintptr_t dst_start = uintptr_t(ptr); + uint64_t fill_size = count * sizeof(uint32_t); - // This assert to make sure kMaxFillCount is not changed to a number that - // could cause overflow to packet.grid_size_x. - assert(grid_size_x >= fill_count); + uint64_t phase1_block = + num_workitems * sizeof(uint32_t) * kFillUnroll * kFillVecWidth; + uint64_t phase1_size = (fill_size / phase1_block) * phase1_block; - hsa_signal_t signal = {(i == last_fill_index) ? completion_signal_.handle - : 0}; - PopulateQueue(write_index + i, fill_code_handle_, &args[i], grid_size_x, - signal); + KernelArgs* args = ObtainAsyncKernelCopyArg(); + args->fill.phase1_dst_start = dst_start; + args->fill.phase2_dst_start = dst_start + phase1_size; + args->fill.phase2_dst_end = dst_start + fill_size; + args->fill.fill_value = value; + args->fill.num_workitems = num_workitems; - total_fill_count += fill_count; - } + // Submit dispatch packet. + HSA::hsa_signal_store_relaxed(completion_signal_, 1); - // Launch fill packet. 
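PopulateQueue, used above to submit the fill dispatch, builds a one-dimensional dispatch with a fixed 64-lane workgroup and a grid rounded up to a multiple of 64 (the change appears further below). Stripped of the queue bookkeeping, the packet body looks roughly like this; the header word is deliberately left out because it must be stored last with release semantics:

    #include <cstring>
    #include "hsa.h"   // public HSA API header (inc/hsa.h in this tree)

    void FillBlitDispatch(hsa_kernel_dispatch_packet_t* pkt, uint64_t code_handle,
                          void* kernarg, uint32_t grid_size_x,
                          hsa_signal_t completion_signal) {
      memset(pkt, 0, sizeof(*pkt));
      pkt->setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
      pkt->workgroup_size_x = 64;
      pkt->workgroup_size_y = pkt->workgroup_size_z = 1;
      pkt->grid_size_x = (grid_size_x + 63) & ~uint32_t(63);
      pkt->grid_size_y = pkt->grid_size_z = 1;
      pkt->kernel_object = code_handle;
      pkt->kernarg_address = kernarg;
      pkt->completion_signal = completion_signal;
    }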
- ReleaseWriteIndex(write_index, num_fill_packet); + uint64_t write_index = AcquireWriteIndex(1); + PopulateQueue(write_index, uintptr_t(kernels_[KernelType::Fill].code_buf_), + args, num_workitems, completion_signal_); + ReleaseWriteIndex(write_index, 1); // Wait for the packet to finish. if (HSA::hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_LT, @@ -457,6 +802,17 @@ hsa_status_t BlitKernel::SubmitLinearFillCommand(void* ptr, uint32_t value, return HSA_STATUS_SUCCESS; } +hsa_status_t BlitKernel::EnableProfiling(bool enable) { + core::Queue* cmd_queue = core::Queue::Convert(queue_); + if (cmd_queue != NULL) { + AMD_HSA_BITS_SET(cmd_queue->amd_queue_.queue_properties, + AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, enable); + return HSA_STATUS_SUCCESS; + } + + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; +} + uint64_t BlitKernel::AcquireWriteIndex(uint32_t num_packet) { assert(queue_->size >= num_packet); @@ -556,9 +912,9 @@ void BlitKernel::PopulateQueue(uint64_t index, uint64_t code_handle, void* args, // Setup working size. const int kNumDimension = 1; packet.setup = kNumDimension << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - packet.grid_size_x = AlignUp(static_cast<uint32_t>(grid_size_x), kGroupSize); + packet.grid_size_x = AlignUp(static_cast<uint32_t>(grid_size_x), 64); packet.grid_size_y = packet.grid_size_z = 1; - packet.workgroup_size_x = kGroupSize; + packet.workgroup_size_x = 64; packet.workgroup_size_y = packet.workgroup_size_z = 1; packet.completion_signal = completion_signal; diff --git a/src/core/runtime/amd_blit_sdma.cpp b/src/core/runtime/amd_blit_sdma.cpp index b89ba7627..b0b28b2c7 100644 --- a/src/core/runtime/amd_blit_sdma.cpp +++ b/src/core/runtime/amd_blit_sdma.cpp @@ -48,19 +48,25 @@ #include <cstring> #include "core/inc/amd_gpu_agent.h" +#include "core/inc/amd_memory_region.h" #include "core/inc/runtime.h" #include "core/inc/signal.h" +#define SDMA_QUEUE_SIZE 1024 * 1024 + namespace amd { // SDMA packet for VI device. 
// Reference: http://people.freedesktop.org/~agd5f/dma_packets.txt const unsigned int SDMA_OP_COPY = 1; const unsigned int SDMA_OP_FENCE = 5; +const unsigned int SDMA_OP_TRAP = 6; const unsigned int SDMA_OP_POLL_REGMEM = 8; const unsigned int SDMA_OP_ATOMIC = 10; const unsigned int SDMA_OP_CONST_FILL = 11; +const unsigned int SDMA_OP_TIMESTAMP = 13; const unsigned int SDMA_SUBOP_COPY_LINEAR = 0; +const unsigned int SDMA_SUBOP_TIMESTAMP_GET_GLOBAL = 2; const unsigned int SDMA_ATOMIC_ADD64 = 47; typedef struct SDMA_PKT_COPY_LINEAR_TAG { @@ -310,6 +316,51 @@ typedef struct SDMA_PKT_ATOMIC_TAG { } LOOP_UNION; } SDMA_PKT_ATOMIC; +typedef struct SDMA_PKT_TIMESTAMP_TAG { + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int reserved_0 : 16; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int addr_31_0 : 32; + }; + unsigned int DW_1_DATA; + } ADDR_LO_UNION; + + union { + struct { + unsigned int addr_63_32 : 32; + }; + unsigned int DW_2_DATA; + } ADDR_HI_UNION; + +} SDMA_PKT_TIMESTAMP; + +typedef struct SDMA_PKT_TRAP_TAG { + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int reserved_0 : 16; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int int_ctx : 28; + unsigned int reserved_1 : 4; + }; + unsigned int DW_1_DATA; + } INT_CONTEXT_UNION; +} SDMA_PKT_TRAP; + inline uint32_t ptrlow32(const void* p) { return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p)); } @@ -322,21 +373,28 @@ inline uint32_t ptrhigh32(const void* p) { #endif } +const size_t BlitSdma::kQueueSize = SDMA_QUEUE_SIZE; +const size_t BlitSdma::kCopyPacketSize = sizeof(SDMA_PKT_COPY_LINEAR); + BlitSdma::BlitSdma() : core::Blit(), + agent_(NULL), queue_size_(0), queue_start_addr_(NULL), fence_base_addr_(NULL), fence_pool_size_(0), fence_pool_counter_(0), cached_reserve_offset_(0), - cached_commit_offset_(0) { + cached_commit_offset_(0), + platform_atomic_support_(true) { std::memset(&queue_resource_, 0, sizeof(queue_resource_)); } BlitSdma::~BlitSdma() {} hsa_status_t BlitSdma::Initialize(const core::Agent& agent) { + agent_ = reinterpret_cast<amd::GpuAgent*>(&const_cast<core::Agent&>(agent)); + if (queue_start_addr_ != NULL && queue_size_ != 0) { // Already initialized. return HSA_STATUS_SUCCESS; @@ -351,6 +409,8 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) { fence_command_size_ = sizeof(SDMA_PKT_FENCE); poll_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM); atomic_command_size_ = sizeof(SDMA_PKT_ATOMIC); + timestamp_command_size_ = sizeof(SDMA_PKT_TIMESTAMP); + trap_command_size_ = sizeof(SDMA_PKT_TRAP); const uint32_t sync_command_size = fence_command_size_; const uint32_t max_num_copy_command = @@ -372,18 +432,20 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) { static_cast<uint64_t>(max_num_fill_command) * static_cast<uint64_t>(max_single_fill_size_))); - const amd::GpuAgent& amd_gpu_agent = static_cast<const amd::GpuAgent&>(agent); + const amd::GpuAgentInt& amd_gpu_agent = + static_cast<const amd::GpuAgentInt&>(agent); - if (amd_gpu_agent.isa()->version() != core::Isa::Version(8, 0, 3)) { - assert(false && "Only for Fiji currently"); + if (HSA_PROFILE_FULL == amd_gpu_agent.profile()) { + assert(false && "Only support SDMA for dgpu currently"); return HSA_STATUS_ERROR; } - // Allocate queue buffer. 
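Several hunks in this patch swap std::ceil on doubles for the integer form (n + d - 1) / d; the two agree for the transfer sizes involved and the integer form avoids the float round trip. Restated with a compile-time check:

    #include <cstdint>

    constexpr uint64_t CeilDiv(uint64_t n, uint64_t d) { return (n + d - 1) / d; }
    static_assert(CeilDiv(10, 4) == 3 && CeilDiv(8, 4) == 2,
                  "integer ceiling division");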
- const size_t kPageSize = 4096; - const size_t kSdmaQueueSize = 1024 * 1024; + if (amd_gpu_agent.isa()->version() == core::Isa::Version(7, 0, 1)) { + platform_atomic_support_ = false; + } - queue_size_ = kSdmaQueueSize; + // Allocate queue buffer. + queue_size_ = kQueueSize; HsaMemFlags flags; flags.Value = 0; @@ -404,7 +466,7 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) { if (err != HSAKMT_STATUS_SUCCESS) { assert(false && "AQL queue memory map failure."); - Destroy(); + Destroy(agent); return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } @@ -413,21 +475,20 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) { // Access kernel driver to initialize the queue control block // This call binds user mode queue object to underlying compute // device. - const GpuAgent& gpu_agent = reinterpret_cast<const GpuAgent&>(agent); const HSA_QUEUE_TYPE kQueueType_ = HSA_QUEUE_SDMA; if (HSAKMT_STATUS_SUCCESS != - hsaKmtCreateQueue(gpu_agent.node_id(), kQueueType_, 100, + hsaKmtCreateQueue(amd_gpu_agent.node_id(), kQueueType_, 100, HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_, queue_size_, NULL, &queue_resource_)) { - Destroy(); + Destroy(agent); return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } cached_reserve_offset_ = *(queue_resource_.Queue_write_ptr); cached_commit_offset_ = cached_reserve_offset_; - fence_pool_size_ = - static_cast<uint32_t>(std::ceil(kSdmaQueueSize / fence_command_size_)); + fence_pool_size_ = static_cast<uint32_t>( + (kQueueSize + fence_command_size_ - 1) / fence_command_size_); fence_pool_mask_ = fence_pool_size_ - 1; @@ -436,14 +497,14 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) { fence_pool_size_ * sizeof(uint32_t), 256)); if (fence_base_addr_ == NULL) { - Destroy(); + Destroy(agent); return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } return HSA_STATUS_SUCCESS; } -hsa_status_t BlitSdma::Destroy(void) { +hsa_status_t BlitSdma::Destroy(const core::Agent& agent) { // Release all allocated resources and reset them to zero. if (queue_resource_.QueueId != 0) { @@ -479,8 +540,8 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(void* dst, const void* src, // Break the copy into multiple copy operation incase the copy size exceeds // the SDMA linear copy limit. - const uint32_t num_copy_command = static_cast<uint32_t>( - std::ceil(static_cast<double>(size) / max_single_linear_copy_size_)); + const uint32_t num_copy_command = + (size + max_single_linear_copy_size_ - 1) / max_single_linear_copy_size_; const uint32_t total_copy_command_size = num_copy_command * linear_copy_command_size_; @@ -528,18 +589,55 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand( // Break the copy into multiple copy operation incase the copy size exceeds // the SDMA linear copy limit. - const uint32_t num_copy_command = static_cast<uint32_t>( - std::ceil(static_cast<double>(size) / max_single_linear_copy_size_)); + const uint32_t num_copy_command = + (size + max_single_linear_copy_size_ - 1) / max_single_linear_copy_size_; const uint32_t total_copy_command_size = num_copy_command * linear_copy_command_size_; - const uint32_t total_command_size = - total_poll_command_size + total_copy_command_size + atomic_command_size_ + - fence_command_size_; + // Load the profiling state early in case the user disable or enable the + // profiling in the middle of the call. 
+ const bool profiling_enabled = agent_->profiling_enabled(); - const uint32_t kFenceValue = 2015; - uint32_t* fence_addr = ObtainFenceObject(); - *fence_addr = 0; + uint64_t* end_ts_addr = NULL; + uint32_t total_timestamp_command_size = 0; + + if (profiling_enabled) { + // SDMA timestamp packet requires 32 byte of aligned memory, but + // amd_signal_t::end_ts is not 32 byte aligned. So an extra copy packet to + // read from a 32 byte aligned bounce buffer is required to avoid changing + // the amd_signal_t ABI. + + end_ts_addr = agent_->ObtainEndTsObject(); + if (end_ts_addr == NULL) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + total_timestamp_command_size = + (2 * timestamp_command_size_) + linear_copy_command_size_; + } + + // On agent that does not support platform atomic, we replace it with + // one or two fence packet(s) to update the signal value. The reason fence + // is used and not write packet is because the SDMA engine may overlap a + // serial copy/write packets. + const uint64_t completion_signal_value = + static_cast<uint64_t>(out_signal.LoadRelaxed() - 1); + const size_t sync_command_size = (platform_atomic_support_) + ? atomic_command_size_ + : (completion_signal_value > UINT32_MAX) + ? 2 * fence_command_size_ + : fence_command_size_; + + // If the signal is an interrupt signal, we also need to make SDMA engine to + // send interrupt packet to IH. + const size_t interrupt_command_size = + (out_signal.signal_.event_mailbox_ptr != 0) + ? (fence_command_size_ + trap_command_size_) + : 0; + + const uint32_t total_command_size = + total_poll_command_size + total_copy_command_size + sync_command_size + + total_timestamp_command_size + interrupt_command_size; char* command_addr = AcquireWriteAddress(total_command_size); char* const command_addr_temp = command_addr; @@ -559,17 +657,58 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand( command_addr += poll_command_size_; } + if (profiling_enabled) { + BuildGetGlobalTimestampCommand( + command_addr, reinterpret_cast<void*>(&out_signal.signal_.start_ts)); + command_addr += timestamp_command_size_; + } + // Do the transfer after all polls are satisfied. BuildCopyCommand(command_addr, num_copy_command, dst, src, size); command_addr += total_copy_command_size; - // After transfer is completed, decrement the signal. - BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation()); + if (profiling_enabled) { + assert(IsMultipleOf(end_ts_addr, 32)); + BuildGetGlobalTimestampCommand(command_addr, + reinterpret_cast<void*>(end_ts_addr)); + command_addr += timestamp_command_size_; - command_addr += atomic_command_size_; + BuildCopyCommand(command_addr, 1, + reinterpret_cast<void*>(&out_signal.signal_.end_ts), + reinterpret_cast<void*>(end_ts_addr), sizeof(uint64_t)); + command_addr += linear_copy_command_size_; + } - BuildFenceCommand(command_addr, fence_addr, kFenceValue); + // After transfer is completed, decrement the signal value. 
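When the engine cannot perform the 64-bit atomic decrement, the completion value computed above is written with one or two 32-bit fence packets, as the branch below shows. In host terms the split is just this (assuming the little-endian layout the fence writes rely on; a hypothetical helper, not runtime code):

    #include <cstdint>

    // value_lo points at the 64-bit signal value; the high dword lives at
    // value_lo[1] and is only written when the final value needs it.
    void StoreSignalValueAs32BitWrites(volatile uint32_t* value_lo,
                                       uint64_t final_value) {
      if (final_value > UINT32_MAX)
        value_lo[1] = uint32_t(final_value >> 32);
      value_lo[0] = uint32_t(final_value);
    }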
+ if (platform_atomic_support_) { + BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation()); + command_addr += atomic_command_size_; + + } else { + uint32_t* signal_value_location = + reinterpret_cast<uint32_t*>(out_signal.ValueLocation()); + if (completion_signal_value > UINT32_MAX) { + BuildFenceCommand(command_addr, signal_value_location + 1, + static_cast<uint32_t>(completion_signal_value >> 32)); + command_addr += fence_command_size_; + } + + BuildFenceCommand(command_addr, signal_value_location, + static_cast<uint32_t>(completion_signal_value)); + + command_addr += fence_command_size_; + } + + // Update mailbox event and send interrupt to IH. + if (out_signal.signal_.event_mailbox_ptr != 0) { + BuildFenceCommand(command_addr, reinterpret_cast<uint32_t*>( + out_signal.signal_.event_mailbox_ptr), + static_cast<uint32_t>(out_signal.signal_.event_id)); + command_addr += fence_command_size_; + + BuildTrapCommand(command_addr); + } ReleaseWriteAddress(command_addr_temp, total_command_size); @@ -586,8 +725,8 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value, // Break the copy into multiple copy operation incase the copy size exceeds // the SDMA linear copy limit. - const uint32_t num_fill_command = static_cast<uint32_t>( - std::ceil(static_cast<double>(size) / max_single_fill_size_)); + const uint32_t num_fill_command = + (size + max_single_fill_size_ - 1) / max_single_fill_size_; const uint32_t total_fill_command_size = num_fill_command * fill_command_size_; @@ -644,6 +783,10 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value, return HSA_STATUS_SUCCESS; } +hsa_status_t BlitSdma::EnableProfiling(bool enable) { + return HSA_STATUS_SUCCESS; +} + char* BlitSdma::AcquireWriteAddress(uint32_t cmd_size) { if (cmd_size > queue_size_) { return NULL; @@ -867,4 +1010,27 @@ void BlitSdma::BuildAtomicDecrementCommand(char* cmd_addr, void* addr) { packet_addr->SRC_DATA_LO_UNION.src_data_31_0 = 0xffffffff; packet_addr->SRC_DATA_HI_UNION.src_data_63_32 = 0xffffffff; } + +void BlitSdma::BuildGetGlobalTimestampCommand(char* cmd_addr, + void* write_address) { + SDMA_PKT_TIMESTAMP* packet_addr = + reinterpret_cast<SDMA_PKT_TIMESTAMP*>(cmd_addr); + + memset(packet_addr, 0, sizeof(SDMA_PKT_TIMESTAMP)); + + packet_addr->HEADER_UNION.op = SDMA_OP_TIMESTAMP; + packet_addr->HEADER_UNION.sub_op = SDMA_SUBOP_TIMESTAMP_GET_GLOBAL; + + packet_addr->ADDR_LO_UNION.addr_31_0 = ptrlow32(write_address); + packet_addr->ADDR_HI_UNION.addr_63_32 = ptrhigh32(write_address); +} + +void BlitSdma::BuildTrapCommand(char* cmd_addr) { + SDMA_PKT_TRAP* packet_addr = + reinterpret_cast<SDMA_PKT_TRAP*>(cmd_addr); + + memset(packet_addr, 0, sizeof(SDMA_PKT_TRAP)); + + packet_addr->HEADER_UNION.op = SDMA_OP_TRAP; +} } // namespace amd diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp index 244f7eaf5..d56f7fb51 100644 --- a/src/core/runtime/amd_gpu_agent.cpp +++ b/src/core/runtime/amd_gpu_agent.cpp @@ -46,23 +46,26 @@ #include <atomic> #include <cstring> #include <climits> +#include <map> +#include <string> #include <vector> #include "core/inc/amd_aql_queue.h" #include "core/inc/amd_blit_kernel.h" #include "core/inc/amd_blit_sdma.h" +#include "core/inc/amd_gpu_shaders.h" #include "core/inc/amd_memory_region.h" #include "core/inc/interrupt_signal.h" #include "core/inc/isa.h" #include "core/inc/runtime.h" -#include "utils/sp3/sp3.h" - #include "hsa_ext_image.h" // Size of scratch (private) segment pre-allocated per thread, in bytes. 
#define DEFAULT_SCRATCH_BYTES_PER_THREAD 2048 +extern core::HsaApiTable hsa_internal_api_table_; + namespace amd { GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) : GpuAgentInt(node), @@ -70,13 +73,19 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT), blit_h2d_(NULL), blit_d2h_(NULL), + blit_d2d_(NULL), + local_region_(NULL), is_kv_device_(false), trap_code_buf_(NULL), trap_code_buf_size_(0), memory_bus_width_(0), memory_max_frequency_(0), ape1_base_(0), - ape1_size_(0) { + ape1_size_(0), + blit_initialized_(false), + end_ts_pool_size_(0), + end_ts_pool_counter_(0), + end_ts_base_addr_(NULL) { const bool is_apu_node = (properties_.NumCPUCores > 0); profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE; @@ -88,6 +97,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) isa_ = (core::Isa*)core::IsaRegistry::GetIsa(core::Isa::Version( node_props.EngineId.ui32.Major, node_props.EngineId.ui32.Minor, node_props.EngineId.ui32.Stepping)); + // Check if the device is Kaveri, only on GPU device. if (isa_->GetMajorVersion() == 7 && isa_->GetMinorVersion() == 0 && isa_->GetStepping() == 0) { @@ -126,21 +136,33 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) GpuAgent::~GpuAgent() { if (blit_h2d_ != NULL) { - hsa_status_t status = blit_h2d_->Destroy(); + hsa_status_t status = blit_h2d_->Destroy(*this); assert(status == HSA_STATUS_SUCCESS); delete blit_h2d_; blit_h2d_ = NULL; } - if (blit_d2h_ != NULL) { - hsa_status_t status = blit_d2h_->Destroy(); + if (blit_d2h_ != NULL && blit_d2h_ != blit_d2d_) { + hsa_status_t status = blit_d2h_->Destroy(*this); assert(status == HSA_STATUS_SUCCESS); delete blit_d2h_; blit_d2h_ = NULL; } + if (blit_d2d_ != NULL) { + hsa_status_t status = blit_d2d_->Destroy(*this); + assert(status == HSA_STATUS_SUCCESS); + + delete blit_d2d_; + blit_d2d_ = NULL; + } + + if (end_ts_base_addr_ != NULL) { + core::Runtime::runtime_singleton_->FreeMemory(end_ts_base_addr_); + } + if (ape1_base_ != 0) { _aligned_free(reinterpret_cast<void*>(ape1_base_)); } @@ -158,33 +180,60 @@ GpuAgent::~GpuAgent() { } void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name, - void*& code_buf, size_t& code_buf_size) { -#ifdef __linux__ // No VS builds of libsp3 available right now - // Assemble source string with libsp3. - sp3_context* sp3 = sp3_new(); + AssembleTarget assemble_target, void*& code_buf, + size_t& code_buf_size) const { + // Select precompiled shader implementation from name/target. 
+ struct ASICShader { + const void* code; + size_t size; + int num_sgprs; + int num_vgprs; + }; + + struct CompiledShader { + ASICShader compute_7; + ASICShader compute_8; + }; + + std::map<std::string, CompiledShader> compiled_shaders = { + {"TrapHandler", + {{NULL, 0, 0, 0}, {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}}}, + {"CopyAligned", + {{kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12}, + {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}}}, + {"CopyMisaligned", + {{kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10}, + {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}}}, + {"Fill", + {{kCodeFill7, sizeof(kCodeFill7), 19, 8}, + {kCodeFill8, sizeof(kCodeFill8), 19, 8}}}}; + + auto compiled_shader_it = compiled_shaders.find(func_name); + assert(compiled_shader_it != compiled_shaders.end() && + "Precompiled shader unavailable"); + + ASICShader* asic_shader = NULL; switch (isa_->GetMajorVersion()) { case 7: - sp3_setasic(sp3, "CI"); + asic_shader = &compiled_shader_it->second.compute_7; break; case 8: - sp3_setasic(sp3, "VI"); + asic_shader = &compiled_shader_it->second.compute_8; break; default: - assert(false && "SP3 assembly not supported on this agent"); + assert(false && "Precompiled shader unavailable for target"); } - sp3_parse_string(sp3, src_sp3); - sp3_shader* code_sp3_meta = sp3_compile(sp3, func_name); - - // Allocate a GPU-visible buffer for the trap shader. + // Allocate a GPU-visible buffer for the shader. HsaMemFlags code_buf_flags = {0}; code_buf_flags.ui32.HostAccess = 1; code_buf_flags.ui32.ExecuteAccess = 1; code_buf_flags.ui32.NoSubstitute = 1; - size_t code_size = code_sp3_meta->size * sizeof(uint32_t); - code_buf_size = AlignUp(code_size, 0x1000); + size_t header_size = + (assemble_target == AssembleTarget::AQL ? sizeof(amd_kernel_code_t) : 0); + code_buf_size = AlignUp(header_size + asic_shader->size, 0x1000); HSAKMT_STATUS err = hsaKmtAllocMemory(node_id(), code_buf_size, code_buf_flags, &code_buf); @@ -193,17 +242,41 @@ void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name, err = hsaKmtMapMemoryToGPU(code_buf, code_buf_size, NULL); assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtMapMemoryToGPU(Trap) failed"); - // Copy trap handler code into the GPU-visible buffer. memset(code_buf, 0, code_buf_size); - memcpy(code_buf, code_sp3_meta->data, code_size); - // Release SP3 resources. - sp3_free_shader(code_sp3_meta); - sp3_close(sp3); -#endif + // Populate optional code object header. 
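The compute_pgm_rsrc1 programming that follows encodes register usage in hardware granules: SGPRs are allocated in blocks of 8, VGPRs in blocks of 4, and each field stores the block count minus one. In isolation:

    #include <algorithm>

    int GranulatedSgprCount(int num_sgprs) { return std::max(0, (num_sgprs - 1) / 8); }
    int GranulatedVgprCount(int num_vgprs) { return std::max(0, (num_vgprs - 1) / 4); }
    // e.g. the CopyAligned entry above (32 SGPRs, 12 VGPRs) encodes 3 and 2.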
+ if (assemble_target == AssembleTarget::AQL) { + amd_kernel_code_t* header = reinterpret_cast<amd_kernel_code_t*>(code_buf); + + int gran_sgprs = std::max(0, (int(asic_shader->num_sgprs) - 1) / 8); + int gran_vgprs = std::max(0, (int(asic_shader->num_vgprs) - 1) / 4); + + header->kernel_code_entry_byte_offset = sizeof(amd_kernel_code_t); + AMD_HSA_BITS_SET(header->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR, + 1); + AMD_HSA_BITS_SET(header->compute_pgm_rsrc1, + AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT, + gran_sgprs); + AMD_HSA_BITS_SET(header->compute_pgm_rsrc1, + AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT, + gran_vgprs); + AMD_HSA_BITS_SET(header->compute_pgm_rsrc1, + AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64, 3); + AMD_HSA_BITS_SET(header->compute_pgm_rsrc1, + AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE, 1); + AMD_HSA_BITS_SET(header->compute_pgm_rsrc2, + AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 2); + AMD_HSA_BITS_SET(header->compute_pgm_rsrc2, + AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 1); + } + + // Copy shader code into the GPU-visible buffer. + memcpy((void*)(uintptr_t(code_buf) + header_size), asic_shader->code, + asic_shader->size); } -void GpuAgent::ReleaseShader(void* code_buf, size_t code_buf_size) { +void GpuAgent::ReleaseShader(void* code_buf, size_t code_buf_size) const { hsaKmtUnmapMemoryToGPU(code_buf); hsaKmtFreeMemory(code_buf, code_buf_size); } @@ -238,6 +311,10 @@ void GpuAgent::InitRegionList() { new MemoryRegion(false, false, this, mem_props[mem_idx]); regions_.push_back(region); + + if (region->IsLocalMemory()) { + local_region_ = region; + } break; } case HSA_HEAPTYPE_SYSTEM: @@ -314,6 +391,57 @@ void GpuAgent::InitCacheList() { } } +bool GpuAgent::InitEndTsPool() { + if (HSA_PROFILE_FULL == profile_) { + return true; + } + + if (end_ts_base_addr_.load(std::memory_order_acquire) != NULL) { + return true; + } + + ScopedAcquire<KernelMutex> lock(&blit_lock_); + + if (end_ts_base_addr_.load(std::memory_order_relaxed) != NULL) { + return true; + } + + end_ts_pool_size_ = static_cast<uint32_t>( + (BlitSdma::kQueueSize + BlitSdma::kCopyPacketSize - 1) / + (BlitSdma::kCopyPacketSize)); + + // Allocate end timestamp object for both h2d and d2h DMA. 
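  // Pool sizing: one timestamp slot for every copy packet that can be resident
  // in an SDMA queue (kQueueSize / kCopyPacketSize, rounded up), doubled so the
  // h2d and d2h engines each get their own set. ObtainEndTsObject() below then
  // hands the slots out round-robin through an atomic counter.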
+ const size_t alloc_size = 2 * end_ts_pool_size_ * kTsSize; + + core::Runtime* runtime = core::Runtime::runtime_singleton_; + + uint64_t* buff = NULL; + if (HSA_STATUS_SUCCESS != + runtime->AllocateMemory(true, local_region_, alloc_size, + reinterpret_cast<void**>(&buff))) { + return false; + } + + end_ts_base_addr_.store(buff, std::memory_order_release); + + return true; +} + +uint64_t* GpuAgent::ObtainEndTsObject() { + if (end_ts_base_addr_ == NULL) { + return NULL; + } + + const uint32_t end_ts_index = + end_ts_pool_counter_.fetch_add(1U, std::memory_order_acq_rel) % + end_ts_pool_size_; + const static size_t kNumU64 = kTsSize / sizeof(uint64_t); + uint64_t* end_ts_addr = &end_ts_base_addr_[end_ts_index * kNumU64]; + assert(IsMultipleOf(end_ts_addr, kTsSize)); + + return end_ts_addr; +} + hsa_status_t GpuAgent::IterateRegion( hsa_status_t (*callback)(hsa_region_t region, void* data), void* data) const { @@ -377,7 +505,7 @@ core::Blit* GpuAgent::CreateBlitSdma() { BlitSdma* sdma = new BlitSdma(); if (sdma->Initialize(*this) != HSA_STATUS_SUCCESS) { - sdma->Destroy(); + sdma->Destroy(*this); delete sdma; sdma = NULL; } @@ -389,7 +517,7 @@ core::Blit* GpuAgent::CreateBlitKernel() { BlitKernel* kernl = new BlitKernel(); if (kernl->Initialize(*this) != HSA_STATUS_SUCCESS) { - kernl->Destroy(); + kernl->Destroy(*this); delete kernl; kernl = NULL; } @@ -397,41 +525,60 @@ core::Blit* GpuAgent::CreateBlitKernel() { return kernl; } -hsa_status_t GpuAgent::InitDma() { - // Try create SDMA blit first. - if (core::Runtime::runtime_singleton_->flag().enable_sdma() && - isa_->GetMajorVersion() == 8 && isa_->GetMinorVersion() == 0 && - isa_->GetStepping() == 3) { - blit_h2d_ = CreateBlitSdma(); - blit_d2h_ = CreateBlitSdma(); +void GpuAgent::InitDma() { + // This provides the ability to lazy init the blit objects on places that + // could give indication of DMA usage in the future. E.g.: + // 1. Call to allow access API. + // 2. Call to memory lock API. + if (!blit_initialized_.load(std::memory_order_acquire)) { + ScopedAcquire<KernelMutex> lock(&blit_lock_); + if (!blit_initialized_.load(std::memory_order_relaxed)) { + // Try create SDMA blit first. + if (core::Runtime::runtime_singleton_->flag().enable_sdma() && + (HSA_PROFILE_BASE == profile_)) { + blit_h2d_ = CreateBlitSdma(); + blit_d2h_ = CreateBlitSdma(); + + if (blit_h2d_ != NULL && blit_d2h_ != NULL) { + blit_initialized_.store(true, std::memory_order_release); + return; + } + } - if (blit_h2d_ != NULL && blit_d2h_ != NULL) { - return HSA_STATUS_SUCCESS; - } - } + // Fall back to blit kernel if SDMA is unavailable. + assert(blit_h2d_ == NULL || blit_d2h_ == NULL); + + if (blit_h2d_ == NULL) { + blit_h2d_ = CreateBlitKernel(); + } - // Fall back to blit kernel if SDMA is unavailable. - assert(blit_h2d_ == NULL || blit_d2h_ == NULL); + if (blit_d2h_ == NULL) { + // Share device-to-host queue with device-to-device. + blit_d2h_ = blit_d2d_; + } - if (blit_h2d_ == NULL) { - blit_h2d_ = CreateBlitKernel(); + blit_initialized_.store(true, std::memory_order_release); + } } +} - if (blit_d2h_ == NULL) { - blit_d2h_ = CreateBlitKernel(); +hsa_status_t GpuAgent::InitBlitKernel() { + // Unlike InitDma, this function is not designed for lazy initialization. + // So checking the state without double checked locking is fine. + if (blit_d2d_ == NULL) { + blit_d2d_ = CreateBlitKernel(); } - return (blit_h2d_ != NULL && blit_d2h_ != NULL) - ? HSA_STATUS_SUCCESS - : HSA_STATUS_ERROR_OUT_OF_RESOURCES; + return (blit_d2d_ != NULL) ? 
HSA_STATUS_SUCCESS + : HSA_STATUS_ERROR_OUT_OF_RESOURCES; } hsa_status_t GpuAgent::DmaCopy(void* dst, const void* src, size_t size) { - if (blit_d2h_ == NULL) { + if (blit_d2d_ == NULL) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } - return blit_d2h_->SubmitLinearCopyCommand(dst, src, size); + return blit_d2d_->SubmitLinearCopyCommand(dst, src, size); } hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, @@ -439,31 +586,55 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, size_t size, std::vector<core::Signal*>& dep_signals, core::Signal& out_signal) { - core::Blit* blit = (src_agent.device_type() == core::Agent::kAmdCpuDevice && - dst_agent.device_type() == core::Agent::kAmdGpuDevice) - ? blit_h2d_ - : blit_d2h_; + core::Blit* blit = + (src_agent.device_type() == core::Agent::kAmdCpuDevice && + dst_agent.device_type() == core::Agent::kAmdGpuDevice) + ? blit_h2d_ + : (src_agent.device_type() == core::Agent::kAmdGpuDevice && + dst_agent.device_type() == core::Agent::kAmdCpuDevice) + ? blit_d2h_ + : blit_d2d_; if (blit == NULL) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } - // TODO: temporarily disable wait on thunk event if the out_signal - // is an interrupt signal object. Remove this when SDMA handle interrupt - // packet properly. - if (out_signal.EopEvent() != NULL) { - static_cast<core::InterruptSignal&>(out_signal).DisableWaitEvent(); + hsa_status_t stat = + blit->SubmitLinearCopyCommand(dst, src, size, dep_signals, out_signal); + + if (profiling_enabled() && HSA_STATUS_SUCCESS == stat) { + // Track the agent so we could translate the resulting timestamp to system + // domain correctly. + out_signal.async_copy_agent(this); } - return blit->SubmitLinearCopyCommand(dst, src, size, dep_signals, out_signal); + return stat; } hsa_status_t GpuAgent::DmaFill(void* ptr, uint32_t value, size_t count) { - if (blit_d2h_ == NULL) { + if (blit_d2d_ == NULL) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + return blit_d2d_->SubmitLinearFillCommand(ptr, value, count); +} + +hsa_status_t GpuAgent::EnableDmaProfiling(bool enable) { + if (enable && !InitEndTsPool()) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } - return blit_d2h_->SubmitLinearFillCommand(ptr, value, count); + core::Blit* blit[3] = {blit_h2d_, blit_d2h_, blit_d2d_}; + for (int i = 0; i < 3; ++i) { + if (blit[i] != NULL) { + const hsa_status_t stat = blit[i]->EnableProfiling(enable); + if (stat != HSA_STATUS_SUCCESS) { + return stat; + } + } + } + + return HSA_STATUS_SUCCESS; } hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { @@ -472,27 +643,19 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { const core::ExtensionEntryPoints& extensions = core::Runtime::runtime_singleton_->extensions_; - hsa_agent_t agent = core::Agent::Convert(this); - const size_t attribute_u = static_cast<size_t>(attribute); switch (attribute_u) { case HSA_AGENT_INFO_NAME: - // TODO: hardcode for now. + { + // This code assumes that UTF-16 HsaNodeProperties.MarketingName is + // actually encoded in 7-bit ASCII, and the runtime output is 7-bit ASCII + // in bytes. 
std::memset(value, 0, kNameSize); - if (isa_->GetMajorVersion() == 7) { - std::memcpy(value, "Kaveri", sizeof("Kaveri")); - } else if (isa_->GetMajorVersion() == 8) { - if (isa_->GetMinorVersion() == 0 && isa_->GetStepping() == 2) { - std::memcpy(value, "Tonga", sizeof("Tonga")); - } else if (isa_->GetMinorVersion() == 0 && isa_->GetStepping() == 3) { - std::memcpy(value, "Fiji", sizeof("Fiji")); - } else { - std::memcpy(value, "Carrizo", sizeof("Carrizo")); - } - } else { - std::memcpy(value, "Unknown", sizeof("Unknown")); - } + char* temp = reinterpret_cast<char*>(value); + for (uint32_t i = 0; properties_.MarketingName[i] != 0 && i < kNameSize - 1; i++) + temp[i] = properties_.MarketingName[i]; break; + } case HSA_AGENT_INFO_VENDOR_NAME: std::memset(value, 0, kNameSize); std::memcpy(value, "AMD", sizeof("AMD")); @@ -572,13 +735,11 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { case HSA_AGENT_INFO_EXTENSIONS: memset(value, 0, sizeof(uint8_t) * 128); - if (extensions.table.hsa_ext_program_finalize_fn != NULL) { + if (core::hsa_internal_api_table_.finalizer_api.hsa_ext_program_finalize_fn != NULL) { *((uint8_t*)value) = 1 << HSA_EXTENSION_FINALIZER; } - if (profile_ == HSA_PROFILE_FULL && - extensions.table.hsa_ext_image_create_fn != NULL) { - // TODO: only APU supports images currently. + if (core::hsa_internal_api_table_.image_api.hsa_ext_image_create_fn != NULL) { *((uint8_t*)value) |= 1 << HSA_EXTENSION_IMAGES; } @@ -831,7 +992,6 @@ void GpuAgent::SyncClocks() { } void GpuAgent::BindTrapHandler() { -#ifdef __linux__ // No raw string literal support in VS builds right now const char* src_sp3 = R"( var s_trap_info_lo = ttmp0 var s_trap_info_hi = ttmp1 @@ -904,13 +1064,13 @@ void GpuAgent::BindTrapHandler() { } // Assemble the trap handler source code. - AssembleShader(src_sp3, "TrapHandler", trap_code_buf_, trap_code_buf_size_); + AssembleShader(src_sp3, "TrapHandler", AssembleTarget::ISA, trap_code_buf_, + trap_code_buf_size_); // Bind the trap handler to this node. HSAKMT_STATUS err = hsaKmtSetTrapHandler(node_id(), trap_code_buf_, trap_code_buf_size_, NULL, 0); assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtSetTrapHandler() failed"); -#endif } } // namespace diff --git a/src/core/runtime/amd_memory_region.cpp b/src/core/runtime/amd_memory_region.cpp index bf37110ae..62c9bfa16 100644 --- a/src/core/runtime/amd_memory_region.cpp +++ b/src/core/runtime/amd_memory_region.cpp @@ -43,6 +43,7 @@ #include "core/inc/amd_memory_region.h" #include <algorithm> +#include <set> #include "core/inc/runtime.h" #include "core/inc/amd_cpu_agent.h" @@ -374,35 +375,41 @@ hsa_status_t MemoryRegion::GetAgentPoolInfo( const core::Runtime::LinkInfo link_info = core::Runtime::runtime_singleton_->GetLinkInfo(node_id_from, node_id_to); + /** + * --------------------------------------------------- + * | |CPU |GPU (owner)|GPU (peer) | + * --------------------------------------------------- + * |system memory |allowed |disallowed |disallowed | + * --------------------------------------------------- + * |fb private |never |allowed |never | + * --------------------------------------------------- + * |fb public |disallowed |allowed |disallowed | + * --------------------------------------------------- + * |others |never |allowed |never | + * --------------------------------------------------- + */ + const hsa_amd_memory_pool_access_t access_type = + ((IsSystem() && (agent.device_type() == core::Agent::kAmdCpuDevice)) || + (agent.node_id() == owner()->node_id())) + ? 
HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT + : (IsSystem() || (IsPublic() && link_info.num_hop > 0)) + ? HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT + : HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; + switch (attribute) { case HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS: - /** - * --------------------------------------------------- - * | |CPU |GPU (owner)|GPU (peer) | - * --------------------------------------------------- - * |system memory |allowed |disallowed |disallowed | - * --------------------------------------------------- - * |fb private |never |allowed |never | - * --------------------------------------------------- - * |fb public |disallowed |allowed |disallowed | - * --------------------------------------------------- - * |others |never |allowed |never | - * --------------------------------------------------- - */ - *((hsa_amd_memory_pool_access_t*)value) = - (((IsSystem()) && - (agent.device_type() == core::Agent::kAmdCpuDevice)) || - (agent.node_id() == owner()->node_id())) - ? HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT - : (IsSystem() || (IsPublic() && link_info.num_hop > 0)) - ? HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT - : HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; + *((hsa_amd_memory_pool_access_t*)value) = access_type; break; case HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS: - *((uint32_t*)value) = link_info.num_hop; + *((uint32_t*)value) = + (access_type != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) + ? link_info.num_hop + : 0; + break; case HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO: memset(value, 0, sizeof(hsa_amd_memory_pool_link_info_t)); - if (link_info.num_hop > 0) { + if ((access_type != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) && + (link_info.num_hop > 0)) { memcpy(value, &link_info.info, sizeof(hsa_amd_memory_pool_link_info_t)); } break; @@ -425,15 +432,17 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents, bool cpu_in_list = false; + std::set<GpuAgentInt*> whitelist_gpus; std::vector<uint32_t> whitelist_nodes; for (uint32_t i = 0; i < num_agents; ++i) { - const core::Agent* agent = core::Agent::Convert(agents[i]); + core::Agent* agent = core::Agent::Convert(agents[i]); if (agent == NULL || !agent->IsValid()) { return HSA_STATUS_ERROR_INVALID_AGENT; } if (agent->device_type() == core::Agent::kAmdGpuDevice) { whitelist_nodes.push_back(agent->node_id()); + whitelist_gpus.insert(reinterpret_cast<GpuAgentInt*>(agent)); } else { cpu_in_list = true; } @@ -452,17 +461,24 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents, std::find(whitelist_nodes.begin(), whitelist_nodes.end(), owner()->node_id()) == whitelist_nodes.end()) { whitelist_nodes.push_back(owner()->node_id()); + whitelist_gpus.insert(reinterpret_cast<GpuAgentInt*>(owner())); } HsaMemMapFlags map_flag = map_flag_; map_flag.ui32.HostAccess |= (cpu_in_list) ? 1 : 0; uint64_t alternate_va = 0; - return (amd::MemoryRegion::MakeKfdMemoryResident( - whitelist_nodes.size(), &whitelist_nodes[0], - const_cast<void*>(ptr), size, &alternate_va, map_flag)) - ? 
HSA_STATUS_SUCCESS - : HSA_STATUS_ERROR_OUT_OF_RESOURCES; + if (!amd::MemoryRegion::MakeKfdMemoryResident( + whitelist_nodes.size(), &whitelist_nodes[0], const_cast<void*>(ptr), + size, &alternate_va, map_flag)) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + for (GpuAgentInt* gpu : whitelist_gpus) { + gpu->InitDma(); + } + + return HSA_STATUS_SUCCESS; } hsa_status_t MemoryRegion::CanMigrate(const MemoryRegion& dst, @@ -490,10 +506,15 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents, return HSA_STATUS_SUCCESS; } + std::set<core::Agent*> whitelist_gpus; std::vector<HSAuint32> whitelist_nodes; if (num_agents == 0 || agents == NULL) { // Map to all GPU agents. whitelist_nodes = core::Runtime::runtime_singleton_->gpu_ids(); + + whitelist_gpus.insert( + core::Runtime::runtime_singleton_->gpu_agents().begin(), + core::Runtime::runtime_singleton_->gpu_agents().end()); } else { for (int i = 0; i < num_agents; ++i) { core::Agent* agent = core::Agent::Convert(agents[i]); @@ -503,6 +524,7 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents, if (agent->device_type() == core::Agent::kAmdGpuDevice) { whitelist_nodes.push_back(agent->node_id()); + whitelist_gpus.insert(reinterpret_cast<GpuAgentInt*>(agent)); } } } @@ -520,8 +542,15 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents, uint64_t alternate_va = 0; if (MakeKfdMemoryResident(whitelist_nodes.size(), &whitelist_nodes[0], host_ptr, size, &alternate_va, map_flag_)) { - assert(alternate_va != 0); - *agent_ptr = reinterpret_cast<void*>(alternate_va); + if (alternate_va != 0) { + *agent_ptr = reinterpret_cast<void*>(alternate_va); + } else { + *agent_ptr = host_ptr; + } + for (core::Agent* gpu : whitelist_gpus) { + reinterpret_cast<GpuAgentInt*>(gpu)->InitDma(); + } + return HSA_STATUS_SUCCESS; } amd::MemoryRegion::DeregisterMemory(host_ptr); diff --git a/src/core/runtime/amd_topology.cpp b/src/core/runtime/amd_topology.cpp index e6a348330..b54292554 100644 --- a/src/core/runtime/amd_topology.cpp +++ b/src/core/runtime/amd_topology.cpp @@ -78,12 +78,6 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) { GpuAgent* gpu = new GpuAgent(node_id, node_prop); core::Runtime::runtime_singleton_->RegisterAgent(gpu); - if (HSA_STATUS_SUCCESS != gpu->InitDma()) { - assert(false && "Fail init blit"); - delete gpu; - gpu = NULL; - } - return gpu; } diff --git a/src/core/runtime/hsa.cpp b/src/core/runtime/hsa.cpp index 8e8f0eceb..fdda3cf61 100644 --- a/src/core/runtime/hsa.cpp +++ b/src/core/runtime/hsa.cpp @@ -55,7 +55,7 @@ #include "core/inc/default_signal.h" #include "core/inc/interrupt_signal.h" #include "core/inc/amd_loader_context.hpp" -#include "inc/hsa_ven_amd_loaded_code_object.h" +#include "inc/hsa_ven_amd_loader.h" using namespace amd::hsa::code; @@ -168,9 +168,7 @@ hsa_status_t uint16_t version_minor, bool* result) { IS_OPEN(); - if ((extension > HSA_EXTENSION_AMD_PROFILER && - extension != HSA_EXTENSION_AMD_LOADED_CODE_OBJECT) || - (result == NULL)) { + if (extension >= HSA_EXTENSION_COUNT || result == NULL) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } @@ -208,57 +206,57 @@ hsa_status_t hsa_status_t status = hsa_system_extension_supported( extension, version_major, version_minor, &supported); - if (HSA_STATUS_SUCCESS != status) { + if ((HSA_STATUS_SUCCESS != status) || + (supported == false)) { return status; } - if (supported) { - ExtTable& runtime_ext_table = - core::Runtime::runtime_singleton_->extensions_.table; - - if 
(extension == HSA_EXTENSION_IMAGES) { - // Currently there is only version 1.00. - hsa_ext_images_1_00_pfn_t* ext_table = - reinterpret_cast<hsa_ext_images_1_00_pfn_t*>(table); - ext_table->hsa_ext_image_clear = hsa_ext_image_clear; - ext_table->hsa_ext_image_copy = hsa_ext_image_copy; - ext_table->hsa_ext_image_create = hsa_ext_image_create; - ext_table->hsa_ext_image_data_get_info = hsa_ext_image_data_get_info; - ext_table->hsa_ext_image_destroy = hsa_ext_image_destroy; - ext_table->hsa_ext_image_export = hsa_ext_image_export; - ext_table->hsa_ext_image_get_capability = hsa_ext_image_get_capability; - ext_table->hsa_ext_image_import = hsa_ext_image_import; - ext_table->hsa_ext_sampler_create = hsa_ext_sampler_create; - ext_table->hsa_ext_sampler_destroy = hsa_ext_sampler_destroy; - - return HSA_STATUS_SUCCESS; - } else if (extension == HSA_EXTENSION_FINALIZER) { - // Currently there is only version 1.00. - hsa_ext_finalizer_1_00_pfn_s* ext_table = - reinterpret_cast<hsa_ext_finalizer_1_00_pfn_s*>(table); - ext_table->hsa_ext_program_add_module = hsa_ext_program_add_module; - ext_table->hsa_ext_program_create = hsa_ext_program_create; - ext_table->hsa_ext_program_destroy = hsa_ext_program_destroy; - ext_table->hsa_ext_program_finalize = hsa_ext_program_finalize; - ext_table->hsa_ext_program_get_info = hsa_ext_program_get_info; - ext_table->hsa_ext_program_iterate_modules = - hsa_ext_program_iterate_modules; - - return HSA_STATUS_SUCCESS; - } else if (extension == HSA_EXTENSION_AMD_LOADED_CODE_OBJECT) { - // Currently there is only version 1.00. - hsa_ven_amd_loaded_code_object_1_00_pfn_t* ext_table = - reinterpret_cast<hsa_ven_amd_loaded_code_object_1_00_pfn_t*>(table); - ext_table->hsa_ven_amd_loaded_code_object_query_host_address = - hsa_ven_amd_loaded_code_object_query_host_address; - - return HSA_STATUS_SUCCESS; - } else { - return HSA_STATUS_ERROR; - } + if (extension == HSA_EXTENSION_IMAGES) { + // Currently there is only version 1.00. + hsa_ext_images_1_00_pfn_t* ext_table = + reinterpret_cast<hsa_ext_images_1_00_pfn_t*>(table); + ext_table->hsa_ext_image_clear = hsa_ext_image_clear; + ext_table->hsa_ext_image_copy = hsa_ext_image_copy; + ext_table->hsa_ext_image_create = hsa_ext_image_create; + ext_table->hsa_ext_image_data_get_info = hsa_ext_image_data_get_info; + ext_table->hsa_ext_image_destroy = hsa_ext_image_destroy; + ext_table->hsa_ext_image_export = hsa_ext_image_export; + ext_table->hsa_ext_image_get_capability = hsa_ext_image_get_capability; + ext_table->hsa_ext_image_import = hsa_ext_image_import; + ext_table->hsa_ext_sampler_create = hsa_ext_sampler_create; + ext_table->hsa_ext_sampler_destroy = hsa_ext_sampler_destroy; + + return HSA_STATUS_SUCCESS; } - return HSA_STATUS_SUCCESS; + if (extension == HSA_EXTENSION_FINALIZER) { + // Currently there is only version 1.00. + hsa_ext_finalizer_1_00_pfn_s* ext_table = + reinterpret_cast<hsa_ext_finalizer_1_00_pfn_s*>(table); + ext_table->hsa_ext_program_add_module = hsa_ext_program_add_module; + ext_table->hsa_ext_program_create = hsa_ext_program_create; + ext_table->hsa_ext_program_destroy = hsa_ext_program_destroy; + ext_table->hsa_ext_program_finalize = hsa_ext_program_finalize; + ext_table->hsa_ext_program_get_info = hsa_ext_program_get_info; + ext_table->hsa_ext_program_iterate_modules = + hsa_ext_program_iterate_modules; + + return HSA_STATUS_SUCCESS; + } + + if (extension == HSA_EXTENSION_AMD_LOADER) { + // Currently there is only version 1.00. 
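  // Clients reach this table by checking hsa_system_extension_supported for
  // HSA_EXTENSION_AMD_LOADER and then calling hsa_system_get_extension_table
  // with a hsa_ven_amd_loader_1_00_pfn_t as the destination; the two entry
  // points filled in below map loaded device addresses back to host-accessible
  // memory and enumerate the loaded segments.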
+ hsa_ven_amd_loader_1_00_pfn_t* ext_table = + reinterpret_cast<hsa_ven_amd_loader_1_00_pfn_t*>(table); + ext_table->hsa_ven_amd_loader_query_host_address = + hsa_ven_amd_loader_query_host_address; + ext_table->hsa_ven_amd_loader_query_segment_descriptors = + hsa_ven_amd_loader_query_segment_descriptors; + + return HSA_STATUS_SUCCESS; + } + + return HSA_STATUS_ERROR; } //---------------------------------------------------------------------------// @@ -785,7 +783,7 @@ hsa_status_t core::Signal* ret; - bool useshost = true; + bool uses_host = false; if (num_consumers > 0) { IS_BAD_PTR(consumers); @@ -798,13 +796,16 @@ hsa_status_t return HSA_STATUS_ERROR_INVALID_ARGUMENT; } - useshost = - (consumer_set.find( - core::Runtime::runtime_singleton_->host_agent()->public_handle()) != - consumer_set.end()); + for (const core::Agent* cpu_agent : + core::Runtime::runtime_singleton_->cpu_agents()) { + uses_host |= + (consumer_set.find(cpu_agent->public_handle()) != consumer_set.end()); + } + } else { + uses_host = true; } - if (core::g_use_interrupt_wait && useshost) { + if (core::g_use_interrupt_wait && uses_host) { ret = new core::InterruptSignal(initial_value); } else { ret = new core::DefaultSignal(initial_value); diff --git a/src/core/runtime/hsa_api_trace.cpp b/src/core/runtime/hsa_api_trace.cpp index ca0b40192..9fe3823a5 100644 --- a/src/core/runtime/hsa_api_trace.cpp +++ b/src/core/runtime/hsa_api_trace.cpp @@ -42,150 +42,258 @@ #include "core/inc/hsa_api_trace_int.h" #include "core/inc/runtime.h" +#include "core/inc/hsa_ext_amd_impl.h" #include "core/inc/hsa_table_interface.h" +#include <iostream> + namespace core { -ApiTable hsa_api_table_; -ApiTable hsa_internal_api_table_; +HsaApiTable hsa_api_table_; +HsaApiTable hsa_internal_api_table_; + +HsaApiTable::HsaApiTable() { + Init(); +} + +// Initialize member fields for Hsa Core and Amd Extension Api's +// Member fields for Finalizer and Image extensions will be +// updated as part of Hsa Runtime initialization. 
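// A note on the version triples stamped on each table in this file: major_id
// and step_id come from the HSA_*_API_TABLE_MAJOR/STEP_VERSION constants,
// while minor_id is stored as the byte size of the table, presumably so a
// reader can tell which trailing entries are present once newer runtimes
// append functions.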
+void HsaApiTable::Init() { + + // Initialize Version of Api Table + hsa_api.version.major_id = HSA_API_TABLE_MAJOR_VERSION; + hsa_api.version.minor_id = sizeof(::HsaApiTable); + hsa_api.version.step_id = HSA_API_TABLE_STEP_VERSION; + + // Update Api table for Core and its major id + UpdateCore(); + hsa_api.core_ = &core_api; + + // Update Api table for Amd Extensions and its major id + UpdateAmdExts(); + hsa_api.amd_ext_ = &amd_ext_api; + + // Initialize Api tables for Finalizer and Image to NULL + // Tables for Finalizer and Images are initialized as part + // of Hsa Runtime initialization, including their major ids + hsa_api.finalizer_ext_ = NULL; + hsa_api.image_ext_ = NULL; +} + +void HsaApiTable::Reset() { + Init(); +} + +void HsaApiTable::CloneExts(void* ext_table, uint32_t table_id) { + + assert(ext_table != NULL && "Invalid extension table linked."); + + // Update HSA Extension Finalizer Api table + if (table_id == HSA_EXT_FINALIZER_API_TABLE_ID) { + finalizer_api = (*(FinalizerExtTable *)ext_table); + hsa_api.finalizer_ext_ = &finalizer_api; + return; + } -ApiTable::ApiTable() { - table.std_exts_ = NULL; - Reset(); + // Update HSA Extension Image Api table + if (table_id == HSA_EXT_IMAGE_API_TABLE_ID) { + image_api = (*(ImageExtTable *)ext_table); + hsa_api.image_ext_ = &image_api; + return; + } } -void ApiTable::LinkExts(ExtTable* ptr) { - assert(ptr != NULL && "Invalid extension table linked."); - extension_backup = *ptr; - table.std_exts_ = ptr; +void HsaApiTable::LinkExts(void* ext_table, uint32_t table_id) { + + assert(ext_table != NULL && "Invalid extension table linked."); + + // Update HSA Extension Finalizer Api table + if (table_id == HSA_EXT_FINALIZER_API_TABLE_ID) { + finalizer_api = (*(FinalizerExtTable *)ext_table); + hsa_api.finalizer_ext_ = (FinalizerExtTable *)ext_table; + return; + } + + // Update HSA Extension Image Api table + if (table_id == HSA_EXT_IMAGE_API_TABLE_ID) { + image_api = (*(ImageExtTable *)ext_table); + hsa_api.image_ext_ = (ImageExtTable *)ext_table; + return; + } } -void ApiTable::Reset() { - table.hsa_init_fn = HSA::hsa_init; - table.hsa_shut_down_fn = HSA::hsa_shut_down; - table.hsa_system_get_info_fn = HSA::hsa_system_get_info; - table.hsa_system_extension_supported_fn = HSA::hsa_system_extension_supported; - table.hsa_system_get_extension_table_fn = HSA::hsa_system_get_extension_table; - table.hsa_iterate_agents_fn = HSA::hsa_iterate_agents; - table.hsa_agent_get_info_fn = HSA::hsa_agent_get_info; - table.hsa_agent_get_exception_policies_fn = +// Update Api table for Hsa Core Runtime +void HsaApiTable::UpdateCore() { + + // Initialize Version of Api Table + core_api.version.major_id = HSA_CORE_API_TABLE_MAJOR_VERSION; + core_api.version.minor_id = sizeof(::CoreApiTable); + core_api.version.step_id = HSA_CORE_API_TABLE_STEP_VERSION; + + // Initialize function pointers for Hsa Core Runtime Api's + core_api.hsa_init_fn = HSA::hsa_init; + core_api.hsa_shut_down_fn = HSA::hsa_shut_down; + core_api.hsa_system_get_info_fn = HSA::hsa_system_get_info; + core_api.hsa_system_extension_supported_fn = HSA::hsa_system_extension_supported; + core_api.hsa_system_get_extension_table_fn = HSA::hsa_system_get_extension_table; + core_api.hsa_iterate_agents_fn = HSA::hsa_iterate_agents; + core_api.hsa_agent_get_info_fn = HSA::hsa_agent_get_info; + core_api.hsa_agent_get_exception_policies_fn = HSA::hsa_agent_get_exception_policies; - table.hsa_agent_extension_supported_fn = HSA::hsa_agent_extension_supported; - table.hsa_queue_create_fn = 
HSA::hsa_queue_create; - table.hsa_soft_queue_create_fn = HSA::hsa_soft_queue_create; - table.hsa_queue_destroy_fn = HSA::hsa_queue_destroy; - table.hsa_queue_inactivate_fn = HSA::hsa_queue_inactivate; - table.hsa_queue_load_read_index_acquire_fn = + core_api.hsa_agent_extension_supported_fn = HSA::hsa_agent_extension_supported; + core_api.hsa_queue_create_fn = HSA::hsa_queue_create; + core_api.hsa_soft_queue_create_fn = HSA::hsa_soft_queue_create; + core_api.hsa_queue_destroy_fn = HSA::hsa_queue_destroy; + core_api.hsa_queue_inactivate_fn = HSA::hsa_queue_inactivate; + core_api.hsa_queue_load_read_index_acquire_fn = HSA::hsa_queue_load_read_index_acquire; - table.hsa_queue_load_read_index_relaxed_fn = + core_api.hsa_queue_load_read_index_relaxed_fn = HSA::hsa_queue_load_read_index_relaxed; - table.hsa_queue_load_write_index_acquire_fn = + core_api.hsa_queue_load_write_index_acquire_fn = HSA::hsa_queue_load_write_index_acquire; - table.hsa_queue_load_write_index_relaxed_fn = + core_api.hsa_queue_load_write_index_relaxed_fn = HSA::hsa_queue_load_write_index_relaxed; - table.hsa_queue_store_write_index_relaxed_fn = + core_api.hsa_queue_store_write_index_relaxed_fn = HSA::hsa_queue_store_write_index_relaxed; - table.hsa_queue_store_write_index_release_fn = + core_api.hsa_queue_store_write_index_release_fn = HSA::hsa_queue_store_write_index_release; - table.hsa_queue_cas_write_index_acq_rel_fn = + core_api.hsa_queue_cas_write_index_acq_rel_fn = HSA::hsa_queue_cas_write_index_acq_rel; - table.hsa_queue_cas_write_index_acquire_fn = + core_api.hsa_queue_cas_write_index_acquire_fn = HSA::hsa_queue_cas_write_index_acquire; - table.hsa_queue_cas_write_index_relaxed_fn = + core_api.hsa_queue_cas_write_index_relaxed_fn = HSA::hsa_queue_cas_write_index_relaxed; - table.hsa_queue_cas_write_index_release_fn = + core_api.hsa_queue_cas_write_index_release_fn = HSA::hsa_queue_cas_write_index_release; - table.hsa_queue_add_write_index_acq_rel_fn = + core_api.hsa_queue_add_write_index_acq_rel_fn = HSA::hsa_queue_add_write_index_acq_rel; - table.hsa_queue_add_write_index_acquire_fn = + core_api.hsa_queue_add_write_index_acquire_fn = HSA::hsa_queue_add_write_index_acquire; - table.hsa_queue_add_write_index_relaxed_fn = + core_api.hsa_queue_add_write_index_relaxed_fn = HSA::hsa_queue_add_write_index_relaxed; - table.hsa_queue_add_write_index_release_fn = + core_api.hsa_queue_add_write_index_release_fn = HSA::hsa_queue_add_write_index_release; - table.hsa_queue_store_read_index_relaxed_fn = + core_api.hsa_queue_store_read_index_relaxed_fn = HSA::hsa_queue_store_read_index_relaxed; - table.hsa_queue_store_read_index_release_fn = + core_api.hsa_queue_store_read_index_release_fn = HSA::hsa_queue_store_read_index_release; - table.hsa_agent_iterate_regions_fn = HSA::hsa_agent_iterate_regions; - table.hsa_region_get_info_fn = HSA::hsa_region_get_info; - table.hsa_memory_register_fn = HSA::hsa_memory_register; - table.hsa_memory_deregister_fn = HSA::hsa_memory_deregister; - table.hsa_memory_allocate_fn = HSA::hsa_memory_allocate; - table.hsa_memory_free_fn = HSA::hsa_memory_free; - table.hsa_memory_copy_fn = HSA::hsa_memory_copy; - table.hsa_memory_assign_agent_fn = HSA::hsa_memory_assign_agent; - table.hsa_signal_create_fn = HSA::hsa_signal_create; - table.hsa_signal_destroy_fn = HSA::hsa_signal_destroy; - table.hsa_signal_load_relaxed_fn = HSA::hsa_signal_load_relaxed; - table.hsa_signal_load_acquire_fn = HSA::hsa_signal_load_acquire; - table.hsa_signal_store_relaxed_fn = HSA::hsa_signal_store_relaxed; - 
table.hsa_signal_store_release_fn = HSA::hsa_signal_store_release; - table.hsa_signal_wait_relaxed_fn = HSA::hsa_signal_wait_relaxed; - table.hsa_signal_wait_acquire_fn = HSA::hsa_signal_wait_acquire; - table.hsa_signal_and_relaxed_fn = HSA::hsa_signal_and_relaxed; - table.hsa_signal_and_acquire_fn = HSA::hsa_signal_and_acquire; - table.hsa_signal_and_release_fn = HSA::hsa_signal_and_release; - table.hsa_signal_and_acq_rel_fn = HSA::hsa_signal_and_acq_rel; - table.hsa_signal_or_relaxed_fn = HSA::hsa_signal_or_relaxed; - table.hsa_signal_or_acquire_fn = HSA::hsa_signal_or_acquire; - table.hsa_signal_or_release_fn = HSA::hsa_signal_or_release; - table.hsa_signal_or_acq_rel_fn = HSA::hsa_signal_or_acq_rel; - table.hsa_signal_xor_relaxed_fn = HSA::hsa_signal_xor_relaxed; - table.hsa_signal_xor_acquire_fn = HSA::hsa_signal_xor_acquire; - table.hsa_signal_xor_release_fn = HSA::hsa_signal_xor_release; - table.hsa_signal_xor_acq_rel_fn = HSA::hsa_signal_xor_acq_rel; - table.hsa_signal_exchange_relaxed_fn = HSA::hsa_signal_exchange_relaxed; - table.hsa_signal_exchange_acquire_fn = HSA::hsa_signal_exchange_acquire; - table.hsa_signal_exchange_release_fn = HSA::hsa_signal_exchange_release; - table.hsa_signal_exchange_acq_rel_fn = HSA::hsa_signal_exchange_acq_rel; - table.hsa_signal_add_relaxed_fn = HSA::hsa_signal_add_relaxed; - table.hsa_signal_add_acquire_fn = HSA::hsa_signal_add_acquire; - table.hsa_signal_add_release_fn = HSA::hsa_signal_add_release; - table.hsa_signal_add_acq_rel_fn = HSA::hsa_signal_add_acq_rel; - table.hsa_signal_subtract_relaxed_fn = HSA::hsa_signal_subtract_relaxed; - table.hsa_signal_subtract_acquire_fn = HSA::hsa_signal_subtract_acquire; - table.hsa_signal_subtract_release_fn = HSA::hsa_signal_subtract_release; - table.hsa_signal_subtract_acq_rel_fn = HSA::hsa_signal_subtract_acq_rel; - table.hsa_signal_cas_relaxed_fn = HSA::hsa_signal_cas_relaxed; - table.hsa_signal_cas_acquire_fn = HSA::hsa_signal_cas_acquire; - table.hsa_signal_cas_release_fn = HSA::hsa_signal_cas_release; - table.hsa_signal_cas_acq_rel_fn = HSA::hsa_signal_cas_acq_rel; - table.hsa_isa_from_name_fn = HSA::hsa_isa_from_name; - table.hsa_isa_get_info_fn = HSA::hsa_isa_get_info; - table.hsa_isa_compatible_fn = HSA::hsa_isa_compatible; - table.hsa_code_object_serialize_fn = HSA::hsa_code_object_serialize; - table.hsa_code_object_deserialize_fn = HSA::hsa_code_object_deserialize; - table.hsa_code_object_destroy_fn = HSA::hsa_code_object_destroy; - table.hsa_code_object_get_info_fn = HSA::hsa_code_object_get_info; - table.hsa_code_object_get_symbol_fn = HSA::hsa_code_object_get_symbol; - table.hsa_code_symbol_get_info_fn = HSA::hsa_code_symbol_get_info; - table.hsa_code_object_iterate_symbols_fn = + core_api.hsa_agent_iterate_regions_fn = HSA::hsa_agent_iterate_regions; + core_api.hsa_region_get_info_fn = HSA::hsa_region_get_info; + core_api.hsa_memory_register_fn = HSA::hsa_memory_register; + core_api.hsa_memory_deregister_fn = HSA::hsa_memory_deregister; + core_api.hsa_memory_allocate_fn = HSA::hsa_memory_allocate; + core_api.hsa_memory_free_fn = HSA::hsa_memory_free; + core_api.hsa_memory_copy_fn = HSA::hsa_memory_copy; + core_api.hsa_memory_assign_agent_fn = HSA::hsa_memory_assign_agent; + core_api.hsa_signal_create_fn = HSA::hsa_signal_create; + core_api.hsa_signal_destroy_fn = HSA::hsa_signal_destroy; + core_api.hsa_signal_load_relaxed_fn = HSA::hsa_signal_load_relaxed; + core_api.hsa_signal_load_acquire_fn = HSA::hsa_signal_load_acquire; + core_api.hsa_signal_store_relaxed_fn = 
HSA::hsa_signal_store_relaxed; + core_api.hsa_signal_store_release_fn = HSA::hsa_signal_store_release; + core_api.hsa_signal_wait_relaxed_fn = HSA::hsa_signal_wait_relaxed; + core_api.hsa_signal_wait_acquire_fn = HSA::hsa_signal_wait_acquire; + core_api.hsa_signal_and_relaxed_fn = HSA::hsa_signal_and_relaxed; + core_api.hsa_signal_and_acquire_fn = HSA::hsa_signal_and_acquire; + core_api.hsa_signal_and_release_fn = HSA::hsa_signal_and_release; + core_api.hsa_signal_and_acq_rel_fn = HSA::hsa_signal_and_acq_rel; + core_api.hsa_signal_or_relaxed_fn = HSA::hsa_signal_or_relaxed; + core_api.hsa_signal_or_acquire_fn = HSA::hsa_signal_or_acquire; + core_api.hsa_signal_or_release_fn = HSA::hsa_signal_or_release; + core_api.hsa_signal_or_acq_rel_fn = HSA::hsa_signal_or_acq_rel; + core_api.hsa_signal_xor_relaxed_fn = HSA::hsa_signal_xor_relaxed; + core_api.hsa_signal_xor_acquire_fn = HSA::hsa_signal_xor_acquire; + core_api.hsa_signal_xor_release_fn = HSA::hsa_signal_xor_release; + core_api.hsa_signal_xor_acq_rel_fn = HSA::hsa_signal_xor_acq_rel; + core_api.hsa_signal_exchange_relaxed_fn = HSA::hsa_signal_exchange_relaxed; + core_api.hsa_signal_exchange_acquire_fn = HSA::hsa_signal_exchange_acquire; + core_api.hsa_signal_exchange_release_fn = HSA::hsa_signal_exchange_release; + core_api.hsa_signal_exchange_acq_rel_fn = HSA::hsa_signal_exchange_acq_rel; + core_api.hsa_signal_add_relaxed_fn = HSA::hsa_signal_add_relaxed; + core_api.hsa_signal_add_acquire_fn = HSA::hsa_signal_add_acquire; + core_api.hsa_signal_add_release_fn = HSA::hsa_signal_add_release; + core_api.hsa_signal_add_acq_rel_fn = HSA::hsa_signal_add_acq_rel; + core_api.hsa_signal_subtract_relaxed_fn = HSA::hsa_signal_subtract_relaxed; + core_api.hsa_signal_subtract_acquire_fn = HSA::hsa_signal_subtract_acquire; + core_api.hsa_signal_subtract_release_fn = HSA::hsa_signal_subtract_release; + core_api.hsa_signal_subtract_acq_rel_fn = HSA::hsa_signal_subtract_acq_rel; + core_api.hsa_signal_cas_relaxed_fn = HSA::hsa_signal_cas_relaxed; + core_api.hsa_signal_cas_acquire_fn = HSA::hsa_signal_cas_acquire; + core_api.hsa_signal_cas_release_fn = HSA::hsa_signal_cas_release; + core_api.hsa_signal_cas_acq_rel_fn = HSA::hsa_signal_cas_acq_rel; + core_api.hsa_isa_from_name_fn = HSA::hsa_isa_from_name; + core_api.hsa_isa_get_info_fn = HSA::hsa_isa_get_info; + core_api.hsa_isa_compatible_fn = HSA::hsa_isa_compatible; + core_api.hsa_code_object_serialize_fn = HSA::hsa_code_object_serialize; + core_api.hsa_code_object_deserialize_fn = HSA::hsa_code_object_deserialize; + core_api.hsa_code_object_destroy_fn = HSA::hsa_code_object_destroy; + core_api.hsa_code_object_get_info_fn = HSA::hsa_code_object_get_info; + core_api.hsa_code_object_get_symbol_fn = HSA::hsa_code_object_get_symbol; + core_api.hsa_code_symbol_get_info_fn = HSA::hsa_code_symbol_get_info; + core_api.hsa_code_object_iterate_symbols_fn = HSA::hsa_code_object_iterate_symbols; - table.hsa_executable_create_fn = HSA::hsa_executable_create; - table.hsa_executable_destroy_fn = HSA::hsa_executable_destroy; - table.hsa_executable_load_code_object_fn = + core_api.hsa_executable_create_fn = HSA::hsa_executable_create; + core_api.hsa_executable_destroy_fn = HSA::hsa_executable_destroy; + core_api.hsa_executable_load_code_object_fn = HSA::hsa_executable_load_code_object; - table.hsa_executable_freeze_fn = HSA::hsa_executable_freeze; - table.hsa_executable_get_info_fn = HSA::hsa_executable_get_info; - table.hsa_executable_global_variable_define_fn = + core_api.hsa_executable_freeze_fn = 
HSA::hsa_executable_freeze; + core_api.hsa_executable_get_info_fn = HSA::hsa_executable_get_info; + core_api.hsa_executable_global_variable_define_fn = HSA::hsa_executable_global_variable_define; - table.hsa_executable_agent_global_variable_define_fn = + core_api.hsa_executable_agent_global_variable_define_fn = HSA::hsa_executable_agent_global_variable_define; - table.hsa_executable_readonly_variable_define_fn = + core_api.hsa_executable_readonly_variable_define_fn = HSA::hsa_executable_readonly_variable_define; - table.hsa_executable_validate_fn = HSA::hsa_executable_validate; - table.hsa_executable_get_symbol_fn = HSA::hsa_executable_get_symbol; - table.hsa_executable_symbol_get_info_fn = HSA::hsa_executable_symbol_get_info; - table.hsa_executable_iterate_symbols_fn = HSA::hsa_executable_iterate_symbols; - table.hsa_status_string_fn = HSA::hsa_status_string; + core_api.hsa_executable_validate_fn = HSA::hsa_executable_validate; + core_api.hsa_executable_get_symbol_fn = HSA::hsa_executable_get_symbol; + core_api.hsa_executable_symbol_get_info_fn = HSA::hsa_executable_symbol_get_info; + core_api.hsa_executable_iterate_symbols_fn = HSA::hsa_executable_iterate_symbols; + core_api.hsa_status_string_fn = HSA::hsa_status_string; +} - if (table.std_exts_ != NULL) *table.std_exts_ = extension_backup; +// Update Api table for Amd Extensions. +// @note: Current implementation will initialize the +// member variable hsa_amd_image_create_fn while loading +// Image extension library +void HsaApiTable::UpdateAmdExts() { + + // Initialize Version of Api Table + amd_ext_api.version.major_id = HSA_AMD_EXT_API_TABLE_MAJOR_VERSION; + amd_ext_api.version.minor_id = sizeof(::AmdExtTable); + amd_ext_api.version.step_id = HSA_AMD_EXT_API_TABLE_STEP_VERSION; + + // Initialize function pointers for Amd Extension Api's + amd_ext_api.hsa_amd_coherency_get_type_fn = AMD::hsa_amd_coherency_get_type; + amd_ext_api.hsa_amd_coherency_set_type_fn = AMD::hsa_amd_coherency_set_type; + amd_ext_api.hsa_amd_profiling_set_profiler_enabled_fn = AMD::hsa_amd_profiling_set_profiler_enabled; + amd_ext_api.hsa_amd_profiling_async_copy_enable_fn = AMD::hsa_amd_profiling_async_copy_enable; + amd_ext_api.hsa_amd_profiling_get_dispatch_time_fn = AMD::hsa_amd_profiling_get_dispatch_time; + amd_ext_api.hsa_amd_profiling_get_async_copy_time_fn = AMD::hsa_amd_profiling_get_async_copy_time; + amd_ext_api.hsa_amd_profiling_convert_tick_to_system_domain_fn = AMD::hsa_amd_profiling_convert_tick_to_system_domain; + amd_ext_api.hsa_amd_signal_async_handler_fn = AMD::hsa_amd_signal_async_handler; + amd_ext_api.hsa_amd_async_function_fn = AMD::hsa_amd_async_function; + amd_ext_api.hsa_amd_signal_wait_any_fn = AMD::hsa_amd_signal_wait_any; + amd_ext_api.hsa_amd_queue_cu_set_mask_fn = AMD::hsa_amd_queue_cu_set_mask; + amd_ext_api.hsa_amd_memory_pool_get_info_fn = AMD::hsa_amd_memory_pool_get_info; + amd_ext_api.hsa_amd_agent_iterate_memory_pools_fn = AMD::hsa_amd_agent_iterate_memory_pools; + amd_ext_api.hsa_amd_memory_pool_allocate_fn = AMD::hsa_amd_memory_pool_allocate; + amd_ext_api.hsa_amd_memory_pool_free_fn = AMD::hsa_amd_memory_pool_free; + amd_ext_api.hsa_amd_memory_async_copy_fn = AMD::hsa_amd_memory_async_copy; + amd_ext_api.hsa_amd_agent_memory_pool_get_info_fn = AMD::hsa_amd_agent_memory_pool_get_info; + amd_ext_api.hsa_amd_agents_allow_access_fn = AMD::hsa_amd_agents_allow_access; + amd_ext_api.hsa_amd_memory_pool_can_migrate_fn = AMD::hsa_amd_memory_pool_can_migrate; + amd_ext_api.hsa_amd_memory_migrate_fn = AMD::hsa_amd_memory_migrate; 
+ amd_ext_api.hsa_amd_memory_lock_fn = AMD::hsa_amd_memory_lock; + amd_ext_api.hsa_amd_memory_unlock_fn = AMD::hsa_amd_memory_unlock; + amd_ext_api.hsa_amd_memory_fill_fn = AMD::hsa_amd_memory_fill; + amd_ext_api.hsa_amd_interop_map_buffer_fn = AMD::hsa_amd_interop_map_buffer; + amd_ext_api.hsa_amd_interop_unmap_buffer_fn = AMD::hsa_amd_interop_unmap_buffer; } class Init { public: - Init() { hsa_table_interface_init(&hsa_api_table_.table); } + Init() { hsa_table_interface_init(&hsa_api_table_.hsa_api); } }; static Init LinkAtLoad; } diff --git a/src/core/runtime/hsa_ext_amd.cpp b/src/core/runtime/hsa_ext_amd.cpp index 9394c3006..a31b5a6c3 100644 --- a/src/core/runtime/hsa_ext_amd.cpp +++ b/src/core/runtime/hsa_ext_amd.cpp @@ -109,7 +109,9 @@ static __forceinline bool IsValid(T* ptr) { return (ptr == NULL) ? NULL : ptr->IsValid(); } -hsa_status_t HSA_API +namespace AMD { + +hsa_status_t hsa_amd_coherency_get_type(hsa_agent_t agent_handle, hsa_amd_coherency_type_t* type) { IS_OPEN(); @@ -132,7 +134,7 @@ hsa_status_t HSA_API return HSA_STATUS_SUCCESS; } -hsa_status_t HSA_API hsa_amd_coherency_set_type(hsa_agent_t agent_handle, +hsa_status_t hsa_amd_coherency_set_type(hsa_agent_t agent_handle, hsa_amd_coherency_type_t type) { IS_OPEN(); @@ -158,7 +160,7 @@ hsa_status_t HSA_API hsa_amd_coherency_set_type(hsa_agent_t agent_handle, return HSA_STATUS_SUCCESS; } -hsa_status_t HSA_API +hsa_status_t hsa_amd_memory_fill(void* ptr, uint32_t value, size_t count) { IS_OPEN(); @@ -173,7 +175,7 @@ hsa_status_t HSA_API return core::Runtime::runtime_singleton_->FillMemory(ptr, value, count); } -hsa_status_t HSA_API +hsa_status_t hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent_handle, const void* src, hsa_agent_t src_agent_handle, size_t size, uint32_t num_dep_signals, @@ -215,7 +217,7 @@ hsa_status_t HSA_API return HSA_STATUS_SUCCESS; } -hsa_status_t HSA_API +hsa_status_t hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable) { IS_OPEN(); @@ -229,7 +231,18 @@ hsa_status_t HSA_API return HSA_STATUS_SUCCESS; } -hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time( +hsa_status_t hsa_amd_profiling_async_copy_enable(bool enable) { + IS_OPEN(); + + return core::Runtime::runtime_singleton_->IterateAgent( + [](hsa_agent_t agent_handle, void* data) -> hsa_status_t { + const bool enable = *(reinterpret_cast<bool*>(data)); + return core::Agent::Convert(agent_handle)->profiling_enabled(enable); + }, + reinterpret_cast<void*>(&enable)); +} + +hsa_status_t hsa_amd_profiling_get_dispatch_time( hsa_agent_t agent_handle, hsa_signal_t hsa_signal, hsa_amd_profiling_dispatch_time_t* time) { IS_OPEN(); @@ -250,12 +263,41 @@ hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time( amd::GpuAgentInt* gpu_agent = static_cast<amd::GpuAgentInt*>(agent); + // Translate timestamp from GPU to system domain. gpu_agent->TranslateTime(signal, *time); return HSA_STATUS_SUCCESS; } -hsa_status_t HSA_API +hsa_status_t hsa_amd_profiling_get_async_copy_time( + hsa_signal_t hsa_signal, hsa_amd_profiling_async_copy_time_t* time) { + IS_OPEN(); + + IS_BAD_PTR(time); + + core::Signal* signal = core::Signal::Convert(hsa_signal); + + IS_VALID(signal); + + core::Agent* agent = signal->async_copy_agent(); + + if (agent == NULL) { + return HSA_STATUS_ERROR; + } + + if (agent->device_type() == core::Agent::DeviceType::kAmdGpuDevice) { + // Translate timestamp from GPU to system domain. 
+ static_cast<amd::GpuAgentInt*>(agent)->TranslateTime(signal, *time); + return HSA_STATUS_SUCCESS; + } + + // The timestamp is already in system domain. + time->start = signal->signal_.start_ts; + time->end = signal->signal_.end_ts; + return HSA_STATUS_SUCCESS; +} + +hsa_status_t hsa_amd_profiling_convert_tick_to_system_domain(hsa_agent_t agent_handle, uint64_t agent_tick, uint64_t* system_tick) { @@ -278,7 +320,7 @@ hsa_status_t HSA_API return HSA_STATUS_SUCCESS; } -uint32_t HSA_API +uint32_t hsa_amd_signal_wait_any(uint32_t signal_count, hsa_signal_t* hsa_signals, hsa_signal_condition_t* conds, hsa_signal_value_t* values, uint64_t timeout_hint, @@ -296,7 +338,7 @@ uint32_t HSA_API timeout_hint, wait_hint, satisfying_value); } -hsa_status_t HSA_API +hsa_status_t hsa_amd_signal_async_handler(hsa_signal_t hsa_signal, hsa_signal_condition_t cond, hsa_signal_value_t value, @@ -312,7 +354,7 @@ hsa_status_t HSA_API hsa_signal, cond, value, handler, arg); } -hsa_status_t HSA_API +hsa_status_t hsa_amd_async_function(void (*callback)(void* arg), void* arg) { IS_OPEN(); @@ -323,7 +365,7 @@ hsa_status_t HSA_API arg); } -hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, +hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, const uint32_t* cu_mask) { IS_OPEN(); @@ -334,7 +376,7 @@ hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, return cmd_queue->SetCUMasking(num_cu_mask_count, cu_mask); } -hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size, +hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size, hsa_agent_t* agents, int num_agent, void** agent_ptr) { *agent_ptr = NULL; @@ -357,7 +399,7 @@ hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size, return system_region->Lock(num_agent, agents, host_ptr, size, agent_ptr); } -hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr) { +hsa_status_t hsa_amd_memory_unlock(void* host_ptr) { IS_OPEN(); const amd::MemoryRegion* system_region = @@ -367,7 +409,7 @@ hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr) { return system_region->Unlock(host_ptr); } -hsa_status_t HSA_API +hsa_status_t hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool, hsa_amd_memory_pool_info_t attribute, void* value) { @@ -383,7 +425,7 @@ hsa_status_t HSA_API return mem_region->GetPoolInfo(attribute, value); } -hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools( +hsa_status_t hsa_amd_agent_iterate_memory_pools( hsa_agent_t agent_handle, hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void* data), void* data) { @@ -406,7 +448,7 @@ hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools( data); } -hsa_status_t HSA_API +hsa_status_t hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, size_t size, uint32_t flags, void** ptr) { IS_OPEN(); @@ -426,11 +468,11 @@ hsa_status_t HSA_API size, ptr); } -hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr) { +hsa_status_t hsa_amd_memory_pool_free(void* ptr) { return HSA::hsa_memory_free(ptr); } -hsa_status_t HSA_API +hsa_status_t hsa_amd_agents_allow_access(uint32_t num_agents, const hsa_agent_t* agents, const uint32_t* flags, const void* ptr) { IS_OPEN(); @@ -443,7 +485,7 @@ hsa_status_t HSA_API ptr); } -hsa_status_t HSA_API +hsa_status_t hsa_amd_memory_pool_can_migrate(hsa_amd_memory_pool_t src_memory_pool, hsa_amd_memory_pool_t dst_memory_pool, bool* result) { @@ -472,7 +514,7 @@ hsa_status_t HSA_API return src_mem_region->CanMigrate(*dst_mem_region, *result); } 
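// Illustrative sketch only (not part of this change): how an application is
// expected to drive the async-copy profiling entry points added above. Agent
// handles and buffers are assumed to come from the usual discovery and
// memory-pool allocation calls, with the host buffer accessible to the copy
// engine.
static hsa_status_t ExampleTimedH2DCopy(hsa_agent_t cpu_agent,
                                         hsa_agent_t gpu_agent, void* dev_buf,
                                         const void* host_buf, size_t size) {
  // Ask all agents to start timestamping their copy engines.
  hsa_status_t err = hsa_amd_profiling_async_copy_enable(true);
  if (err != HSA_STATUS_SUCCESS) return err;

  // Completion signal; the runtime decrements it to 0 and attaches timestamps.
  hsa_signal_t done;
  err = hsa_signal_create(1, 0, NULL, &done);
  if (err != HSA_STATUS_SUCCESS) return err;

  err = hsa_amd_memory_async_copy(dev_buf, gpu_agent, host_buf, cpu_agent,
                                  size, 0, NULL, done);
  if (err == HSA_STATUS_SUCCESS) {
    hsa_signal_wait_acquire(done, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                            HSA_WAIT_STATE_BLOCKED);

    // Timestamps are returned already translated to the system domain.
    hsa_amd_profiling_async_copy_time_t t = {};
    err = hsa_amd_profiling_get_async_copy_time(done, &t);
    // On success, t.end - t.start is the copy duration in system ticks.
  }

  hsa_signal_destroy(done);
  return err;
}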
-hsa_status_t HSA_API hsa_amd_memory_migrate(const void* ptr, +hsa_status_t hsa_amd_memory_migrate(const void* ptr, hsa_amd_memory_pool_t memory_pool, uint32_t flags) { IS_OPEN(); @@ -492,7 +534,7 @@ hsa_status_t HSA_API hsa_amd_memory_migrate(const void* ptr, return dst_mem_region->Migrate(flags, ptr); } -hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info( +hsa_status_t hsa_amd_agent_memory_pool_get_info( hsa_agent_t agent_handle, hsa_amd_memory_pool_t memory_pool, hsa_amd_agent_memory_pool_info_t attribute, void* value) { IS_OPEN(); @@ -553,3 +595,5 @@ hsa_status_t hsa_amd_interop_unmap_buffer(void* ptr) { if (ptr != NULL) core::Runtime::runtime_singleton_->InteropUnmap(ptr); return HSA_STATUS_SUCCESS; } + +} // end of AMD namespace diff --git a/src/core/runtime/hsa_ext_interface.cpp b/src/core/runtime/hsa_ext_interface.cpp index 3aa9f5c04..c8d8bf541 100644 --- a/src/core/runtime/hsa_ext_interface.cpp +++ b/src/core/runtime/hsa_ext_interface.cpp @@ -161,28 +161,75 @@ static T0 hsa_ext_null(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20) { return HSA_STATUS_ERROR_NOT_INITIALIZED; } +template <class T0, class T1, class T2, class T3, class T4, class T5, class T6> +static T0 hsa_amd_null(T1, T2, T3, T4, T5, T6) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} -ExtensionEntryPoints::ExtensionEntryPoints() { InitTable(); } - -void ExtensionEntryPoints::InitTable() { - table.hsa_ext_program_create_fn = hsa_ext_null; - table.hsa_ext_program_destroy_fn = hsa_ext_null; - table.hsa_ext_program_add_module_fn = hsa_ext_null; - table.hsa_ext_program_iterate_modules_fn = hsa_ext_null; - table.hsa_ext_program_get_info_fn = hsa_ext_null; - table.hsa_ext_program_finalize_fn = hsa_ext_null; - table.hsa_ext_image_get_capability_fn = hsa_ext_null; - table.hsa_ext_image_data_get_info_fn = hsa_ext_null; - table.hsa_ext_image_create_fn = hsa_ext_null; - table.hsa_ext_image_import_fn = hsa_ext_null; - table.hsa_ext_image_export_fn = hsa_ext_null; - table.hsa_ext_image_copy_fn = hsa_ext_null; - table.hsa_ext_image_clear_fn = hsa_ext_null; - table.hsa_ext_image_destroy_fn = hsa_ext_null; - table.hsa_ext_sampler_create_fn = hsa_ext_null; - table.hsa_ext_sampler_destroy_fn = hsa_ext_null; - table.hsa_amd_image_get_info_max_dim_fn = hsa_ext_null; - table.hsa_amd_image_create_fn = hsa_ext_null; +ExtensionEntryPoints::ExtensionEntryPoints() { + InitFinalizerExtTable(); + InitImageExtTable(); + InitAmdExtTable(); +} + +// Initialize Finalizer function table to be NULLs +void ExtensionEntryPoints::InitFinalizerExtTable() { + + // Initialize Version of Api Table + finalizer_api.version.major_id = 0x00; + finalizer_api.version.minor_id = 0x00; + finalizer_api.version.step_id = 0x00; + + finalizer_api.hsa_ext_program_create_fn = hsa_ext_null; + finalizer_api.hsa_ext_program_destroy_fn = hsa_ext_null; + finalizer_api.hsa_ext_program_add_module_fn = hsa_ext_null; + finalizer_api.hsa_ext_program_iterate_modules_fn = hsa_ext_null; + finalizer_api.hsa_ext_program_get_info_fn = hsa_ext_null; + finalizer_api.hsa_ext_program_finalize_fn = hsa_ext_null; +} + +// Initialize Image function table to be NULLs +void ExtensionEntryPoints::InitImageExtTable() { + + // Initialize Version of Api Table + image_api.version.major_id = 0x00; + image_api.version.minor_id = 0x00; + image_api.version.step_id = 0x00; + + image_api.hsa_ext_image_get_capability_fn = hsa_ext_null; + image_api.hsa_ext_image_data_get_info_fn = hsa_ext_null; + image_api.hsa_ext_image_create_fn = hsa_ext_null; + 
image_api.hsa_ext_image_import_fn = hsa_ext_null; + image_api.hsa_ext_image_export_fn = hsa_ext_null; + image_api.hsa_ext_image_copy_fn = hsa_ext_null; + image_api.hsa_ext_image_clear_fn = hsa_ext_null; + image_api.hsa_ext_image_destroy_fn = hsa_ext_null; + image_api.hsa_ext_sampler_create_fn = hsa_ext_null; + image_api.hsa_ext_sampler_destroy_fn = hsa_ext_null; + image_api.hsa_amd_image_get_info_max_dim_fn = hsa_ext_null; +} + +// Initialize Amd Ext table for Api related to Images +void ExtensionEntryPoints::InitAmdExtTable() { + hsa_api_table_.amd_ext_api.hsa_amd_image_create_fn = hsa_ext_null; + hsa_internal_api_table_.amd_ext_api.hsa_amd_image_create_fn = hsa_ext_null; +} + +// Update Amd Ext table for Api related to Images. +// @note: Interface should be updated when Amd Ext table +// begins hosting Api's from other extension libraries +void ExtensionEntryPoints::UpdateAmdExtTable(void *func_ptr) { + + assert(hsa_api_table_.amd_ext_api.hsa_amd_image_create_fn == + (decltype(::hsa_amd_image_create)*)hsa_ext_null && + "Duplicate load of extension import."); + assert(hsa_internal_api_table_.amd_ext_api.hsa_amd_image_create_fn == + (decltype(::hsa_amd_image_create)*)hsa_ext_null && + "Duplicate load of extension import."); + hsa_api_table_.amd_ext_api.hsa_amd_image_create_fn = + (decltype(::hsa_amd_image_create)*)func_ptr; + hsa_internal_api_table_.amd_ext_api.hsa_amd_image_create_fn = + (decltype(::hsa_amd_image_create)*)func_ptr; } void ExtensionEntryPoints::Unload() { @@ -200,182 +247,217 @@ void ExtensionEntryPoints::Unload() { } } libs_.clear(); - InitTable(); + + InitFinalizerExtTable(); + InitImageExtTable(); + InitAmdExtTable(); + core::hsa_internal_api_table_.Reset(); } -bool ExtensionEntryPoints::Load(std::string library_name) { +bool ExtensionEntryPoints::LoadImage(std::string library_name) { os::LibHandle lib = os::LoadLib(library_name); if (lib == NULL) { return false; } libs_.push_back(lib); - + void* ptr; - ptr = os::GetExportAddress(lib, "hsa_ext_program_create_impl"); - if (ptr != NULL) { - assert(table.hsa_ext_program_create_fn == - (decltype(::hsa_ext_program_create)*)hsa_ext_null && - "Duplicate load of extension import."); - table.hsa_ext_program_create_fn = (decltype(::hsa_ext_program_create)*)ptr; - } - - ptr = os::GetExportAddress(lib, "hsa_ext_program_destroy_impl"); - if (ptr != NULL) { - assert(table.hsa_ext_program_destroy_fn == - (decltype(::hsa_ext_program_destroy)*)hsa_ext_null && - "Duplicate load of extension import."); - table.hsa_ext_program_destroy_fn = - (decltype(::hsa_ext_program_destroy)*)ptr; - } - - ptr = os::GetExportAddress(lib, "hsa_ext_program_add_module_impl"); - if (ptr != NULL) { - assert(table.hsa_ext_program_add_module_fn == - (decltype(::hsa_ext_program_add_module)*)hsa_ext_null && - "Duplicate load of extension import."); - table.hsa_ext_program_add_module_fn = - (decltype(::hsa_ext_program_add_module)*)ptr; - } - - ptr = os::GetExportAddress(lib, "hsa_ext_program_iterate_modules_impl"); - if (ptr != NULL) { - assert(table.hsa_ext_program_iterate_modules_fn == - (decltype(::hsa_ext_program_iterate_modules)*)hsa_ext_null && - "Duplicate load of extension import."); - table.hsa_ext_program_iterate_modules_fn = - (decltype(::hsa_ext_program_iterate_modules)*)ptr; - } - - ptr = os::GetExportAddress(lib, "hsa_ext_program_get_info_impl"); - if (ptr != NULL) { - assert(table.hsa_ext_program_get_info_fn == - (decltype(::hsa_ext_program_get_info)*)hsa_ext_null && - "Duplicate load of extension import."); - table.hsa_ext_program_get_info_fn = - 
(decltype(::hsa_ext_program_get_info)*)ptr; - } - - ptr = os::GetExportAddress(lib, "hsa_ext_program_finalize_impl"); - if (ptr != NULL) { - assert(table.hsa_ext_program_finalize_fn == - (decltype(::hsa_ext_program_finalize)*)hsa_ext_null && - "Duplicate load of extension import."); - table.hsa_ext_program_finalize_fn = - (decltype(::hsa_ext_program_finalize)*)ptr; - } - ptr = os::GetExportAddress(lib, "hsa_ext_image_get_capability_impl"); + bool libIsImage = (ptr != NULL); if (ptr != NULL) { - assert(table.hsa_ext_image_get_capability_fn == + assert(image_api.hsa_ext_image_get_capability_fn == (decltype(::hsa_ext_image_get_capability)*)hsa_ext_null && "Duplicate load of extension import."); - table.hsa_ext_image_get_capability_fn = + image_api.hsa_ext_image_get_capability_fn = (decltype(::hsa_ext_image_get_capability)*)ptr; } ptr = os::GetExportAddress(lib, "hsa_ext_image_data_get_info_impl"); if (ptr != NULL) { - assert(table.hsa_ext_image_data_get_info_fn == + assert(image_api.hsa_ext_image_data_get_info_fn == (decltype(::hsa_ext_image_data_get_info)*)hsa_ext_null && "Duplicate load of extension import."); - table.hsa_ext_image_data_get_info_fn = + image_api.hsa_ext_image_data_get_info_fn = (decltype(::hsa_ext_image_data_get_info)*)ptr; } ptr = os::GetExportAddress(lib, "hsa_ext_image_create_impl"); if (ptr != NULL) { - assert(table.hsa_ext_image_create_fn == + assert(image_api.hsa_ext_image_create_fn == (decltype(::hsa_ext_image_create)*)hsa_ext_null && "Duplicate load of extension import."); - table.hsa_ext_image_create_fn = (decltype(::hsa_ext_image_create)*)ptr; + image_api.hsa_ext_image_create_fn = (decltype(::hsa_ext_image_create)*)ptr; } ptr = os::GetExportAddress(lib, "hsa_ext_image_import_impl"); if (ptr != NULL) { - assert(table.hsa_ext_image_import_fn == + assert(image_api.hsa_ext_image_import_fn == (decltype(::hsa_ext_image_import)*)hsa_ext_null && "Duplicate load of extension import."); - table.hsa_ext_image_import_fn = (decltype(::hsa_ext_image_import)*)ptr; + image_api.hsa_ext_image_import_fn = (decltype(::hsa_ext_image_import)*)ptr; } ptr = os::GetExportAddress(lib, "hsa_ext_image_export_impl"); if (ptr != NULL) { - assert(table.hsa_ext_image_export_fn == + assert(image_api.hsa_ext_image_export_fn == (decltype(::hsa_ext_image_export)*)hsa_ext_null && "Duplicate load of extension import."); - table.hsa_ext_image_export_fn = (decltype(::hsa_ext_image_export)*)ptr; + image_api.hsa_ext_image_export_fn = (decltype(::hsa_ext_image_export)*)ptr; } ptr = os::GetExportAddress(lib, "hsa_ext_image_copy_impl"); if (ptr != NULL) { - assert(table.hsa_ext_image_copy_fn == + assert(image_api.hsa_ext_image_copy_fn == (decltype(::hsa_ext_image_copy)*)hsa_ext_null && "Duplicate load of extension import."); - table.hsa_ext_image_copy_fn = (decltype(::hsa_ext_image_copy)*)ptr; + image_api.hsa_ext_image_copy_fn = (decltype(::hsa_ext_image_copy)*)ptr; } ptr = os::GetExportAddress(lib, "hsa_ext_image_clear_impl"); if (ptr != NULL) { - assert(table.hsa_ext_image_clear_fn == + assert(image_api.hsa_ext_image_clear_fn == (decltype(::hsa_ext_image_clear)*)hsa_ext_null && "Duplicate load of extension import."); - table.hsa_ext_image_clear_fn = (decltype(::hsa_ext_image_clear)*)ptr; + image_api.hsa_ext_image_clear_fn = (decltype(::hsa_ext_image_clear)*)ptr; } ptr = os::GetExportAddress(lib, "hsa_ext_image_destroy_impl"); if (ptr != NULL) { - assert(table.hsa_ext_image_destroy_fn == + assert(image_api.hsa_ext_image_destroy_fn == (decltype(::hsa_ext_image_destroy)*)hsa_ext_null && "Duplicate load of 
extension import."); - table.hsa_ext_image_destroy_fn = (decltype(::hsa_ext_image_destroy)*)ptr; + image_api.hsa_ext_image_destroy_fn = (decltype(::hsa_ext_image_destroy)*)ptr; } ptr = os::GetExportAddress(lib, "hsa_ext_sampler_create_impl"); if (ptr != NULL) { - assert(table.hsa_ext_sampler_create_fn == + assert(image_api.hsa_ext_sampler_create_fn == (decltype(::hsa_ext_sampler_create)*)hsa_ext_null && "Duplicate load of extension import."); - table.hsa_ext_sampler_create_fn = (decltype(::hsa_ext_sampler_create)*)ptr; + image_api.hsa_ext_sampler_create_fn = (decltype(::hsa_ext_sampler_create)*)ptr; } ptr = os::GetExportAddress(lib, "hsa_ext_sampler_destroy_impl"); if (ptr != NULL) { - assert(table.hsa_ext_sampler_destroy_fn == + assert(image_api.hsa_ext_sampler_destroy_fn == (decltype(::hsa_ext_sampler_destroy)*)hsa_ext_null && "Duplicate load of extension import."); - table.hsa_ext_sampler_destroy_fn = + image_api.hsa_ext_sampler_destroy_fn = (decltype(::hsa_ext_sampler_destroy)*)ptr; } ptr = os::GetExportAddress(lib, "hsa_amd_image_get_info_max_dim_impl"); if (ptr != NULL) { - assert(table.hsa_amd_image_get_info_max_dim_fn == + assert(image_api.hsa_amd_image_get_info_max_dim_fn == (decltype(::hsa_amd_image_get_info_max_dim)*)hsa_ext_null && "Duplicate load of extension import."); - table.hsa_amd_image_get_info_max_dim_fn = + image_api.hsa_amd_image_get_info_max_dim_fn = (decltype(::hsa_amd_image_get_info_max_dim)*)ptr; } ptr = os::GetExportAddress(lib, "hsa_amd_image_create_impl"); if (ptr != NULL) { - assert(table.hsa_amd_image_create_fn == - (decltype(::hsa_amd_image_create)*)hsa_ext_null && + UpdateAmdExtTable(ptr); + } + + // Initialize Version of Api Table + image_api.version.major_id = HSA_IMAGE_API_TABLE_MAJOR_VERSION; + image_api.version.minor_id = sizeof(ImageExtTable); + image_api.version.step_id = HSA_IMAGE_API_TABLE_STEP_VERSION; + + // Update private copy of Api table with handle for Image extensions + hsa_internal_api_table_.CloneExts(&image_api, + core::HsaApiTable::HSA_EXT_IMAGE_API_TABLE_ID); + + ptr = os::GetExportAddress(lib, "Load"); + if (ptr != NULL) { + ((Load_t)ptr)(&core::hsa_internal_api_table_.hsa_api); + } + + return true; +} + +bool ExtensionEntryPoints::LoadFinalizer(std::string library_name) { + os::LibHandle lib = os::LoadLib(library_name); + if (lib == NULL) { + return false; + } + libs_.push_back(lib); + + void* ptr; + + ptr = os::GetExportAddress(lib, "hsa_ext_program_create_impl"); + if (ptr != NULL) { + assert(finalizer_api.hsa_ext_program_create_fn == + (decltype(::hsa_ext_program_create)*)hsa_ext_null && + "Duplicate load of extension import."); + finalizer_api.hsa_ext_program_create_fn = (decltype(::hsa_ext_program_create)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_ext_program_destroy_impl"); + if (ptr != NULL) { + assert(finalizer_api.hsa_ext_program_destroy_fn == + (decltype(::hsa_ext_program_destroy)*)hsa_ext_null && "Duplicate load of extension import."); - table.hsa_amd_image_create_fn = - (decltype(::hsa_amd_image_create)*)ptr; + finalizer_api.hsa_ext_program_destroy_fn = + (decltype(::hsa_ext_program_destroy)*)ptr; } - core::hsa_internal_api_table_.extension_backup=table; - core::hsa_internal_api_table_.table.std_exts_=&core::hsa_internal_api_table_.extension_backup; + ptr = os::GetExportAddress(lib, "hsa_ext_program_add_module_impl"); + if (ptr != NULL) { + assert(finalizer_api.hsa_ext_program_add_module_fn == + (decltype(::hsa_ext_program_add_module)*)hsa_ext_null && + "Duplicate load of extension import."); + 
finalizer_api.hsa_ext_program_add_module_fn = + (decltype(::hsa_ext_program_add_module)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_ext_program_iterate_modules_impl"); + if (ptr != NULL) { + assert(finalizer_api.hsa_ext_program_iterate_modules_fn == + (decltype(::hsa_ext_program_iterate_modules)*)hsa_ext_null && + "Duplicate load of extension import."); + finalizer_api.hsa_ext_program_iterate_modules_fn = + (decltype(::hsa_ext_program_iterate_modules)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_ext_program_get_info_impl"); + if (ptr != NULL) { + assert(finalizer_api.hsa_ext_program_get_info_fn == + (decltype(::hsa_ext_program_get_info)*)hsa_ext_null && + "Duplicate load of extension import."); + finalizer_api.hsa_ext_program_get_info_fn = + (decltype(::hsa_ext_program_get_info)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_ext_program_finalize_impl"); + if (ptr != NULL) { + assert(finalizer_api.hsa_ext_program_finalize_fn == + (decltype(::hsa_ext_program_finalize)*)hsa_ext_null && + "Duplicate load of extension import."); + finalizer_api.hsa_ext_program_finalize_fn = + (decltype(::hsa_ext_program_finalize)*)ptr; + } + // Initialize Version of Api Table + finalizer_api.version.major_id = HSA_FINALIZER_API_TABLE_MAJOR_VERSION; + finalizer_api.version.minor_id = sizeof(::FinalizerExtTable); + finalizer_api.version.step_id = HSA_FINALIZER_API_TABLE_STEP_VERSION; + + // Update handle of table of HSA extensions + hsa_internal_api_table_.CloneExts(&finalizer_api, + core::HsaApiTable::HSA_EXT_FINALIZER_API_TABLE_ID); + ptr = os::GetExportAddress(lib, "Load"); if (ptr != NULL) { - ((Load_t)ptr)(&core::hsa_internal_api_table_.table); + ((Load_t)ptr)(&core::hsa_internal_api_table_.hsa_api); } return true; } + + } // namespace core //---------------------------------------------------------------------------// @@ -386,19 +468,19 @@ hsa_status_t hsa_ext_program_create( hsa_machine_model_t machine_model, hsa_profile_t profile, hsa_default_float_rounding_mode_t default_float_rounding_mode, const char* options, hsa_ext_program_t* program) { - return core::Runtime::runtime_singleton_->extensions_.table + return core::Runtime::runtime_singleton_->extensions_.finalizer_api .hsa_ext_program_create_fn(machine_model, profile, default_float_rounding_mode, options, program); } hsa_status_t hsa_ext_program_destroy(hsa_ext_program_t program) { - return core::Runtime::runtime_singleton_->extensions_.table + return core::Runtime::runtime_singleton_->extensions_.finalizer_api .hsa_ext_program_destroy_fn(program); } hsa_status_t hsa_ext_program_add_module(hsa_ext_program_t program, hsa_ext_module_t module) { - return core::Runtime::runtime_singleton_->extensions_.table + return core::Runtime::runtime_singleton_->extensions_.finalizer_api .hsa_ext_program_add_module_fn(program, module); } @@ -407,14 +489,14 @@ hsa_status_t hsa_ext_program_iterate_modules( hsa_status_t (*callback)(hsa_ext_program_t program, hsa_ext_module_t module, void* data), void* data) { - return core::Runtime::runtime_singleton_->extensions_.table + return core::Runtime::runtime_singleton_->extensions_.finalizer_api .hsa_ext_program_iterate_modules_fn(program, callback, data); } hsa_status_t hsa_ext_program_get_info(hsa_ext_program_t program, hsa_ext_program_info_t attribute, void* value) { - return core::Runtime::runtime_singleton_->extensions_.table + return core::Runtime::runtime_singleton_->extensions_.finalizer_api .hsa_ext_program_get_info_fn(program, attribute, value); } @@ -422,7 +504,7 @@ hsa_status_t 
hsa_ext_program_finalize( hsa_ext_program_t program, hsa_isa_t isa, int32_t call_convention, hsa_ext_control_directives_t control_directives, const char* options, hsa_code_object_type_t code_object_type, hsa_code_object_t* code_object) { - return core::Runtime::runtime_singleton_->extensions_.table + return core::Runtime::runtime_singleton_->extensions_.finalizer_api .hsa_ext_program_finalize_fn(program, isa, call_convention, control_directives, options, code_object_type, code_object); @@ -431,7 +513,7 @@ hsa_status_t hsa_ext_program_finalize( hsa_status_t hsa_ext_image_get_capability( hsa_agent_t agent, hsa_ext_image_geometry_t geometry, const hsa_ext_image_format_t* image_format, uint32_t* capability_mask) { - return core::Runtime::runtime_singleton_->extensions_.table + return core::Runtime::runtime_singleton_->extensions_.image_api .hsa_ext_image_get_capability_fn(agent, geometry, image_format, capability_mask); } @@ -440,7 +522,7 @@ hsa_status_t hsa_ext_image_data_get_info( hsa_agent_t agent, const hsa_ext_image_descriptor_t* image_descriptor, hsa_access_permission_t access_permission, hsa_ext_image_data_info_t* image_data_info) { - return core::Runtime::runtime_singleton_->extensions_.table + return core::Runtime::runtime_singleton_->extensions_.image_api .hsa_ext_image_data_get_info_fn(agent, image_descriptor, access_permission, image_data_info); } @@ -449,7 +531,7 @@ hsa_status_t hsa_ext_image_create( hsa_agent_t agent, const hsa_ext_image_descriptor_t* image_descriptor, const void* image_data, hsa_access_permission_t access_permission, hsa_ext_image_t* image) { - return core::Runtime::runtime_singleton_->extensions_.table + return core::Runtime::runtime_singleton_->extensions_.image_api .hsa_ext_image_create_fn(agent, image_descriptor, image_data, access_permission, image); } @@ -458,7 +540,7 @@ hsa_status_t hsa_ext_image_import(hsa_agent_t agent, const void* src_memory, size_t src_row_pitch, size_t src_slice_pitch, hsa_ext_image_t dst_image, const hsa_ext_image_region_t* image_region) { - return core::Runtime::runtime_singleton_->extensions_.table + return core::Runtime::runtime_singleton_->extensions_.image_api .hsa_ext_image_import_fn(agent, src_memory, src_row_pitch, src_slice_pitch, dst_image, image_region); } @@ -467,7 +549,7 @@ hsa_status_t hsa_ext_image_export(hsa_agent_t agent, hsa_ext_image_t src_image, void* dst_memory, size_t dst_row_pitch, size_t dst_slice_pitch, const hsa_ext_image_region_t* image_region) { - return core::Runtime::runtime_singleton_->extensions_.table + return core::Runtime::runtime_singleton_->extensions_.image_api .hsa_ext_image_export_fn(agent, src_image, dst_memory, dst_row_pitch, dst_slice_pitch, image_region); } @@ -477,7 +559,7 @@ hsa_status_t hsa_ext_image_copy(hsa_agent_t agent, hsa_ext_image_t src_image, hsa_ext_image_t dst_image, const hsa_dim3_t* dst_offset, const hsa_dim3_t* range) { - return core::Runtime::runtime_singleton_->extensions_.table + return core::Runtime::runtime_singleton_->extensions_.image_api .hsa_ext_image_copy_fn(agent, src_image, src_offset, dst_image, dst_offset, range); } @@ -485,25 +567,25 @@ hsa_status_t hsa_ext_image_copy(hsa_agent_t agent, hsa_ext_image_t src_image, hsa_status_t hsa_ext_image_clear(hsa_agent_t agent, hsa_ext_image_t image, const void* data, const hsa_ext_image_region_t* image_region) { - return core::Runtime::runtime_singleton_->extensions_.table + return core::Runtime::runtime_singleton_->extensions_.image_api .hsa_ext_image_clear_fn(agent, image, data, image_region); } hsa_status_t 
hsa_ext_image_destroy(hsa_agent_t agent, hsa_ext_image_t image) { - return core::Runtime::runtime_singleton_->extensions_.table + return core::Runtime::runtime_singleton_->extensions_.image_api .hsa_ext_image_destroy_fn(agent, image); } hsa_status_t hsa_ext_sampler_create( hsa_agent_t agent, const hsa_ext_sampler_descriptor_t* sampler_descriptor, hsa_ext_sampler_t* sampler) { - return core::Runtime::runtime_singleton_->extensions_.table + return core::Runtime::runtime_singleton_->extensions_.image_api .hsa_ext_sampler_create_fn(agent, sampler_descriptor, sampler); } hsa_status_t hsa_ext_sampler_destroy(hsa_agent_t agent, hsa_ext_sampler_t sampler) { - return core::Runtime::runtime_singleton_->extensions_.table + return core::Runtime::runtime_singleton_->extensions_.image_api .hsa_ext_sampler_destroy_fn(agent, sampler); } @@ -511,20 +593,12 @@ hsa_status_t hsa_ext_sampler_destroy(hsa_agent_t agent, // Stubs for internal extension functions //---------------------------------------------------------------------------// +// Use the function pointer from local instance Image Extension hsa_status_t hsa_amd_image_get_info_max_dim(hsa_agent_t component, hsa_agent_info_t attribute, void* value) { - return core::Runtime::runtime_singleton_->extensions_.table + return core::Runtime::runtime_singleton_->extensions_.image_api .hsa_amd_image_get_info_max_dim_fn(component, attribute, value); } -hsa_status_t hsa_amd_image_create( - hsa_agent_t agent, - const hsa_ext_image_descriptor_t *image_descriptor, - const hsa_amd_image_descriptor_t *image_layout, - const void *image_data, - hsa_access_permission_t access_permission, - hsa_ext_image_t *image) { - return core::Runtime::runtime_singleton_->extensions_.table - .hsa_amd_image_create_fn(agent, image_descriptor, image_layout, image_data, access_permission, image); -} + diff --git a/src/core/runtime/hsa_ven_amd_loaded_code_object.cpp b/src/core/runtime/hsa_ven_amd_loader.cpp similarity index 83% rename from src/core/runtime/hsa_ven_amd_loaded_code_object.cpp rename to src/core/runtime/hsa_ven_amd_loader.cpp index 958e3051e..ba951053e 100644 --- a/src/core/runtime/hsa_ven_amd_loaded_code_object.cpp +++ b/src/core/runtime/hsa_ven_amd_loader.cpp @@ -40,14 +40,14 @@ // //////////////////////////////////////////////////////////////////////////////// -#include "hsa_ven_amd_loaded_code_object.h" +#include "hsa_ven_amd_loader.h" #include "core/inc/amd_hsa_loader.hpp" #include "core/inc/runtime.h" using namespace core; -hsa_status_t hsa_ven_amd_loaded_code_object_query_host_address( +hsa_status_t HSA_API hsa_ven_amd_loader_query_host_address( const void *device_address, const void **host_address) { if (false == core::Runtime::runtime_singleton_->IsOpen()) { @@ -69,3 +69,14 @@ hsa_status_t hsa_ven_amd_loaded_code_object_query_host_address( *host_address = reinterpret_cast<void*>(uhaddr); return HSA_STATUS_SUCCESS; } + +hsa_status_t HSA_API hsa_ven_amd_loader_query_segment_descriptors( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors) { + if (false == core::Runtime::runtime_singleton_->IsOpen()) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; + } + + // Arguments are checked by the loader. 
+ return Runtime::runtime_singleton_->loader()->QuerySegmentDescriptors(segment_descriptors, num_segment_descriptors); +} diff --git a/src/core/runtime/interrupt_signal.cpp b/src/core/runtime/interrupt_signal.cpp index 67c95867d..eb07bcc53 100644 --- a/src/core/runtime/interrupt_signal.cpp +++ b/src/core/runtime/interrupt_signal.cpp @@ -87,8 +87,6 @@ InterruptSignal::InterruptSignal(hsa_signal_value_t initial_value, signal_.event_mailbox_ptr = 0; } signal_.kind = AMD_SIGNAL_KIND_USER; - - wait_on_event_ = true; } InterruptSignal::~InterruptSignal() { @@ -110,13 +108,11 @@ hsa_signal_value_t InterruptSignal::LoadAcquire() { } void InterruptSignal::StoreRelaxed(hsa_signal_value_t value) { - wait_on_event_ = true; atomic::Store(&signal_.value, int64_t(value), std::memory_order_relaxed); SetEvent(); } void InterruptSignal::StoreRelease(hsa_signal_value_t value) { - wait_on_event_ = true; atomic::Store(&signal_.value, int64_t(value), std::memory_order_release); SetEvent(); } @@ -181,7 +177,7 @@ hsa_signal_value_t InterruptSignal::WaitRelaxed( value = atomic::Load(&signal_.value, std::memory_order_relaxed); return hsa_signal_value_t(value); } - if (wait_on_event_ && wait_hint != HSA_WAIT_STATE_ACTIVE) { + if (wait_hint != HSA_WAIT_STATE_ACTIVE) { uint32_t wait_ms; auto time_remaining = fast_timeout - (time - start_time); if ((timeout == -1) || diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp index 8449b8e7f..a93d75c99 100644 --- a/src/core/runtime/runtime.cpp +++ b/src/core/runtime/runtime.cpp @@ -58,6 +58,7 @@ #include "core/inc/amd_topology.h" #include "core/inc/signal.h" #include "core/inc/interrupt_signal.h" +#include "core/inc/hsa_ext_amd_impl.h" #include "core/inc/hsa_api_trace_int.h" @@ -187,7 +188,6 @@ void Runtime::RegisterAgent(Agent* agent) { HsaClockCounters clocks; hsaKmtGetClockCounters(0, &clocks); sys_clock_freq_ = clocks.SystemClockFrequencyHz; - host_agent_ = agent; } } else if (agent->device_type() == Agent::DeviceType::kAmdGpuDevice) { gpu_agents_.push_back(agent); @@ -261,6 +261,10 @@ void Runtime::RegisterLinkInfo(uint32_t node_id_from, uint32_t node_id_to, const uint32_t idx = GetIndexLinkInfo(node_id_from, node_id_to); link_matrix_[idx].num_hop = num_hop; link_matrix_[idx].info = link_info; + + // Limit the number of hop to 1 since the runtime does not have enough + // information to share to the user about each hop. + link_matrix_[idx].num_hop = std::min(link_matrix_[idx].num_hop , 1U); } const Runtime::LinkInfo Runtime::GetLinkInfo(uint32_t node_id_from, @@ -420,19 +424,38 @@ hsa_status_t Runtime::CopyMemory(void* dst, core::Agent& dst_agent, } // For cpu to cpu, fire and forget a copy thread. 
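
With the clamp added to Runtime::RegisterLinkInfo above, applications will now observe at most one hop for any reachable pool. A caller-side sketch of the corresponding query follows (illustrative only; the gpu_agent and pool handles are assumed to come from the usual hsa_iterate_agents / hsa_amd_agent_iterate_memory_pools enumeration):

    // Sketch: query the link-hop count the runtime reports after this change.
    uint32_t hops = 0;
    hsa_status_t err = hsa_amd_agent_memory_pool_get_info(
        gpu_agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS, &hops);
    if (err == HSA_STATUS_SUCCESS && hops == 0) {
      // Pool is local to the agent, or access is never allowed (see the
      // HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS note later in this patch).
    }
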
- std::thread([](void* dst, const void* src, size_t size, - std::vector<core::Signal*> dep_signals, - core::Signal* completion_signal) { - for (core::Signal* dep : dep_signals) { - dep->WaitRelaxed(HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, - HSA_WAIT_STATE_BLOCKED); - } + const bool profiling_enabled = + (dst_agent.profiling_enabled() || src_agent.profiling_enabled()); + std::thread( + [](void* dst, const void* src, size_t size, + std::vector<core::Signal*> dep_signals, + core::Signal* completion_signal, bool profiling_enabled) { + + for (core::Signal* dep : dep_signals) { + dep->WaitRelaxed(HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, + HSA_WAIT_STATE_BLOCKED); + } + + if (profiling_enabled) { + HsaClockCounters clocks = {0}; + core::Runtime::runtime_singleton_->GetSystemInfo( + HSA_SYSTEM_INFO_TIMESTAMP, reinterpret_cast<void*>(&clocks)); + completion_signal->signal_.start_ts = clocks.SystemClockCounter; + } + + memcpy(dst, src, size); - memcpy(dst, src, size); + if (profiling_enabled) { + HsaClockCounters clocks = {0}; + core::Runtime::runtime_singleton_->GetSystemInfo( + HSA_SYSTEM_INFO_TIMESTAMP, reinterpret_cast<void*>(&clocks)); + completion_signal->signal_.end_ts = clocks.SystemClockCounter; + } - completion_signal->SubRelease(1); - }, - dst, src, size, dep_signals, &completion_signal).detach(); + completion_signal->SubRelease(1); + }, + dst, src, size, dep_signals, &completion_signal, + profiling_enabled).detach(); return HSA_STATUS_SUCCESS; } @@ -505,11 +528,11 @@ hsa_status_t Runtime::GetSystemInfo(hsa_system_info_t attribute, void* value) { case HSA_SYSTEM_INFO_EXTENSIONS: memset(value, 0, sizeof(uint8_t) * 128); - if (extensions_.table.hsa_ext_program_finalize_fn != NULL) { + if (hsa_internal_api_table_.finalizer_api.hsa_ext_program_finalize_fn != NULL) { *((uint8_t*)value) = 1 << HSA_EXTENSION_FINALIZER; } - if (extensions_.table.hsa_ext_image_create_fn != NULL) { + if (hsa_internal_api_table_.image_api.hsa_ext_image_create_fn != NULL) { *((uint8_t*)value) |= 1 << HSA_EXTENSION_IMAGES; } @@ -629,7 +652,7 @@ void Runtime::AsyncEventsLoop(void*) { while (!async_events_control_.exit) { // Wait for a signal hsa_signal_value_t value; - uint32_t index = hsa_amd_signal_wait_any( + uint32_t index = AMD::hsa_amd_signal_wait_any( uint32_t(async_events_.Size()), &async_events_.signal_[0], &async_events_.cond_[0], &async_events_.value_[0], uint64_t(-1), HSA_WAIT_STATE_BLOCKED, &value); @@ -767,8 +790,7 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { } Runtime::Runtime() - : host_agent_(NULL), - blit_agent_(NULL), + : blit_agent_(NULL), queue_count_(0), sys_clock_freq_(0), vm_fault_event_(NULL), @@ -798,6 +820,14 @@ void Runtime::Load() { // Load tools libraries LoadTools(); + + // Initialize blit kernel object after tools is initialized to allow tools + // to overload blit kernel. 
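
The start_ts/end_ts values recorded on the completion signal in the copy path above are what the new hsa_amd_profiling_get_async_copy_time entry point (declared later in this patch in hsa_ext_amd.h) reports back to the caller. A usage sketch, assuming dst, src, size and the two agents are already set up, with error handling omitted:

    // Sketch: profile one asynchronous copy using the APIs added by this patch.
    hsa_signal_t done;
    hsa_signal_create(1, 0, NULL, &done);
    hsa_amd_profiling_async_copy_enable(true);
    hsa_amd_memory_async_copy(dst, dst_agent, src, src_agent, size,
                              0, NULL, done);
    hsa_signal_wait_acquire(done, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                            HSA_WAIT_STATE_BLOCKED);
    hsa_amd_profiling_async_copy_time_t t;
    hsa_amd_profiling_get_async_copy_time(done, &t);
    // t.start and t.end are ticks in the HSA system clock domain; convert with
    // hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, ...).
    hsa_signal_destroy(done);
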
+ for (core::Agent* agent : gpu_agents_) { + const hsa_status_t stat = + reinterpret_cast<amd::GpuAgentInt*>(agent)->InitBlitKernel(); + assert(HSA_STATUS_SUCCESS == stat); + } } void Runtime::Unload() { @@ -832,8 +862,12 @@ void Runtime::LoadExtensions() { static const std::string kImageLib[] = {"hsa-ext-image.dll", "libhsa-ext-image.so.1"}; #endif - extensions_.Load(kFinalizerLib[os_index(os::current_os)]); - extensions_.Load(kImageLib[os_index(os::current_os)]); + + // Update Hsa Api Table with handle of Image extension Apis + extensions_.LoadFinalizer(kFinalizerLib[os_index(os::current_os)]); + + // Update Hsa Api Table with handle of Finalizer extension Apis + extensions_.LoadImage(kImageLib[os_index(os::current_os)]); } void Runtime::UnloadExtensions() { extensions_.Unload(); } @@ -889,13 +923,16 @@ static std::vector<std::string> parse_tool_names(std::string tool_names) { } void Runtime::LoadTools() { - typedef bool (*tool_init_t)(::ApiTable*, uint64_t, uint64_t, + typedef bool (*tool_init_t)(::HsaApiTable*, uint64_t, uint64_t, const char* const*); typedef Agent* (*tool_wrap_t)(Agent*); typedef void (*tool_add_t)(Runtime*); - // Link extensions to API interception - hsa_api_table_.LinkExts(&extensions_.table); + // Link HSA Extensions for Finalizer and Images for Api interception + hsa_api_table_.LinkExts(&extensions_.finalizer_api, + core::HsaApiTable::HSA_EXT_FINALIZER_API_TABLE_ID); + hsa_api_table_.LinkExts(&extensions_.image_api, + core::HsaApiTable::HSA_EXT_IMAGE_API_TABLE_ID); // Load tool libs std::string tool_names = flag_.tools_lib_names(); @@ -911,7 +948,9 @@ void Runtime::LoadTools() { tool_init_t ld; ld = (tool_init_t)os::GetExportAddress(tool, "OnLoad"); if (ld) { - if (!ld(&hsa_api_table_.table, 0, failed.size(), &failed[0])) { + if (!ld(&hsa_api_table_.hsa_api, + hsa_api_table_.hsa_api.version.major_id, + failed.size(), &failed[0])) { failed.push_back(names[i].c_str()); os::CloseLib(tool); continue; diff --git a/src/core/util/win/os_win.cpp b/src/core/util/win/os_win.cpp new file mode 100644 index 000000000..d97bff0ce --- /dev/null +++ b/src/core/util/win/os_win.cpp @@ -0,0 +1,227 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// Copyright 2014 ADVANCED MICRO DEVICES, INC. +// +// AMD is granting you permission to use this software and documentation(if any) +// (collectively, the "Materials") pursuant to the terms and conditions of the +// Software License Agreement included with the Materials.If you do not have a +// copy of the Software License Agreement, contact your AMD representative for a +// copy. +// +// WARRANTY DISCLAIMER : THE SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND.AMD DISCLAIMS ALL WARRANTIES, EXPRESS, IMPLIED, OR STATUTORY, +// INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE, TITLE, NON - INFRINGEMENT, THAT THE +// SOFTWARE WILL RUN UNINTERRUPTED OR ERROR - FREE OR WARRANTIES ARISING FROM +// CUSTOM OF TRADE OR COURSE OF USAGE.THE ENTIRE RISK ASSOCIATED WITH THE USE OF +// THE SOFTWARE IS ASSUMED BY YOU.Some jurisdictions do not allow the exclusion +// of implied warranties, so the above exclusion may not apply to You. 
+// +// LIMITATION OF LIABILITY AND INDEMNIFICATION : AMD AND ITS LICENSORS WILL NOT, +// UNDER ANY CIRCUMSTANCES BE LIABLE TO YOU FOR ANY PUNITIVE, DIRECT, +// INCIDENTAL, INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM USE OF +// THE SOFTWARE OR THIS AGREEMENT EVEN IF AMD AND ITS LICENSORS HAVE BEEN +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.In no event shall AMD's total +// liability to You for all damages, losses, and causes of action (whether in +// contract, tort (including negligence) or otherwise) exceed the amount of $100 +// USD. You agree to defend, indemnify and hold harmless AMD and its licensors, +// and any of their directors, officers, employees, affiliates or agents from +// and against any and all loss, damage, liability and other expenses (including +// reasonable attorneys' fees), resulting from Your use of the Software or +// violation of the terms and conditions of this Agreement. +// +// U.S.GOVERNMENT RESTRICTED RIGHTS : The Materials are provided with +// "RESTRICTED RIGHTS." Use, duplication, or disclosure by the Government is +// subject to the restrictions as set forth in FAR 52.227 - 14 and DFAR252.227 - +// 7013, et seq., or its successor.Use of the Materials by the Government +// constitutes acknowledgement of AMD's proprietary rights in them. +// +// EXPORT RESTRICTIONS: The Materials may be subject to export restrictions as +// stated in the Software License Agreement. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifdef _WIN32 // Are we compiling for windows? +#define NOMINMAX + +#include "core/util/os.h" + +#include <algorithm> +#include <process.h> +#include <string> +#include <windows.h> + +#include <emmintrin.h> +#include <pmmintrin.h> +#include <xmmintrin.h> + +#undef Yield +#undef CreateMutex + +namespace os { + +static_assert(sizeof(LibHandle) == sizeof(HMODULE), + "OS abstraction size mismatch"); +static_assert(sizeof(LibHandle) == sizeof(::HANDLE), + "OS abstraction size mismatch"); +static_assert(sizeof(Mutex) == sizeof(::HANDLE), + "OS abstraction size mismatch"); +static_assert(sizeof(Thread) == sizeof(::HANDLE), + "OS abstraction size mismatch"); +static_assert(sizeof(EventHandle) == sizeof(::HANDLE), + "OS abstraction size mismatch"); + +LibHandle LoadLib(std::string filename) { + HMODULE ret = LoadLibrary(filename.c_str()); + return *(LibHandle*)&ret; +} + +void* GetExportAddress(LibHandle lib, std::string export_name) { + return GetProcAddress(*(HMODULE*)&lib, export_name.c_str()); +} + +void CloseLib(LibHandle lib) { FreeLibrary(*(::HMODULE*)&lib); } + +Mutex CreateMutex() { return CreateEvent(NULL, false, true, NULL); } + +bool TryAcquireMutex(Mutex lock) { + return WaitForSingleObject(*(::HANDLE*)&lock, 0) == WAIT_OBJECT_0; +} + +bool AcquireMutex(Mutex lock) { + return WaitForSingleObject(*(::HANDLE*)&lock, INFINITE) == WAIT_OBJECT_0; +} + +void ReleaseMutex(Mutex lock) { SetEvent(*(::HANDLE*)&lock); } + +void DestroyMutex(Mutex lock) { CloseHandle(*(::HANDLE*)&lock); } + +void Sleep(int delay_in_millisecond) { ::Sleep(delay_in_millisecond); } + +void YieldThread() { ::Sleep(0); } + +struct ThreadArgs { + void* entry_args; + ThreadEntry entry_function; +}; + +unsigned __stdcall ThreadTrampoline(void* arg) { + ThreadArgs* thread_args = (ThreadArgs*)arg; + ThreadEntry entry = thread_args->entry_function; + void* data = thread_args->entry_args; + delete thread_args; + entry(data); + _endthreadex(0); + return 0; +} + +Thread CreateThread(ThreadEntry entry_function, void* entry_argument, 
+ uint stack_size) { + ThreadArgs* thread_args = new ThreadArgs(); + thread_args->entry_args = entry_argument; + thread_args->entry_function = entry_function; + uintptr_t ret = + _beginthreadex(NULL, stack_size, ThreadTrampoline, thread_args, 0, NULL); + return *(Thread*)&ret; +} + +void CloseThread(Thread thread) { CloseHandle(*(::HANDLE*)&thread); } + +bool WaitForThread(Thread thread) { + return WaitForSingleObject(*(::HANDLE*)&thread, INFINITE) == WAIT_OBJECT_0; +} + +bool WaitForAllThreads(Thread* threads, uint thread_count) { + return WaitForMultipleObjects(thread_count, threads, TRUE, INFINITE) == + WAIT_OBJECT_0; +} + +void SetEnvVar(std::string env_var_name, std::string env_var_value) { + SetEnvironmentVariable(env_var_name.c_str(), env_var_value.c_str()); +} + +std::string GetEnvVar(std::string env_var_name) { + char* buff; + DWORD char_count = GetEnvironmentVariable(env_var_name.c_str(), NULL, 0); + if (char_count == 0) return ""; + buff = (char*)alloca(sizeof(char) * char_count); + GetEnvironmentVariable(env_var_name.c_str(), buff, char_count); + buff[char_count - 1] = '\0'; + std::string ret = buff; + return ret; +} + +size_t GetUserModeVirtualMemorySize() { + SYSTEM_INFO system_info = {0}; + GetSystemInfo(&system_info); + return ((size_t)system_info.lpMaximumApplicationAddress + 1); +} + +size_t GetUsablePhysicalHostMemorySize() { + MEMORYSTATUSEX memory_status = {0}; + memory_status.dwLength = sizeof(memory_status); + if (GlobalMemoryStatusEx(&memory_status) == 0) { + return 0; + } + + const size_t physical_size = static_cast<size_t>(memory_status.ullTotalPhys); + return std::min(GetUserModeVirtualMemorySize(), physical_size); +} + +uintptr_t GetUserModeVirtualMemoryBase() { return (uintptr_t)0; } + +// Os event wrappers +EventHandle CreateOsEvent(bool auto_reset, bool init_state) { + EventHandle evt = reinterpret_cast<EventHandle>( + CreateEvent(NULL, (BOOL)(!auto_reset), (BOOL)init_state, NULL)); + return evt; +} + +int DestroyOsEvent(EventHandle event) { + if (event == NULL) { + return -1; + } + return CloseHandle(reinterpret_cast<::HANDLE>(event)); +} + +int WaitForOsEvent(EventHandle event, unsigned int milli_seconds) { + if (event == NULL) { + return -1; + } + + int ret_code = + WaitForSingleObject(reinterpret_cast<::HANDLE>(event), milli_seconds); + if (ret_code == WAIT_TIMEOUT) { + ret_code = 0x14003; // 0x14003 indicates timeout + } + return ret_code; +} + +int SetOsEvent(EventHandle event) { + if (event == NULL) { + return -1; + } + return SetEvent(reinterpret_cast<::HANDLE>(event)); +} + +int ResetOsEvent(EventHandle event) { + if (event == NULL) { + return -1; + } + return ResetEvent(reinterpret_cast<::HANDLE>(event)); +} + +uint64_t ReadAccurateClock() { + uint64_t ret; + QueryPerformanceCounter((LARGE_INTEGER*)&ret); + return ret; +} + +uint64_t AccurateClockFrequency() { + uint64_t ret; + QueryPerformanceFrequency((LARGE_INTEGER*)&ret); + return ret; +} +} + +#endif diff --git a/src/inc/hsa.h b/src/inc/hsa.h index f80768dbf..6ab97c394 100644 --- a/src/inc/hsa.h +++ b/src/inc/hsa.h @@ -462,9 +462,13 @@ typedef enum { */ HSA_EXTENSION_AMD_PROFILER = 2, /** - * Loaded code object extension. + * Loader extension. */ - HSA_EXTENSION_AMD_LOADED_CODE_OBJECT = 3 + HSA_EXTENSION_AMD_LOADER = 3, + /** + * Extension count. 
+ */ + HSA_EXTENSION_COUNT } hsa_extension_t; /** diff --git a/src/inc/hsa_api_trace.h b/src/inc/hsa_api_trace.h index ee7e63b9e..40d443de1 100644 --- a/src/inc/hsa_api_trace.h +++ b/src/inc/hsa_api_trace.h @@ -54,13 +54,51 @@ #include "inc/hsa_ext_finalize.h" #endif -struct ExtTable { +#include <string.h> +#include <assert.h> +#include <stddef.h> + +// Major Ids of the Api tables exported by Hsa Core Runtime +#define HSA_API_TABLE_MAJOR_VERSION 0x01 +#define HSA_CORE_API_TABLE_MAJOR_VERSION 0x01 +#define HSA_AMD_EXT_API_TABLE_MAJOR_VERSION 0x01 +#define HSA_FINALIZER_API_TABLE_MAJOR_VERSION 0x01 +#define HSA_IMAGE_API_TABLE_MAJOR_VERSION 0x01 + +// Step Ids of the Api tables exported by Hsa Core Runtime +#define HSA_API_TABLE_STEP_VERSION 0x00 +#define HSA_CORE_API_TABLE_STEP_VERSION 0x00 +#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x00 +#define HSA_FINALIZER_API_TABLE_STEP_VERSION 0x00 +#define HSA_IMAGE_API_TABLE_STEP_VERSION 0x00 + +// Min function used to copy Api Tables +static inline uint32_t Min(const uint32_t a, const uint32_t b) { + return (a > b) ? b : a; +} + +// Structure of Version used to identify an instance of Api table +struct ApiTableVersion { + uint32_t major_id; + uint32_t minor_id; + uint32_t step_id; + uint32_t reserved; +}; + +// Table to export HSA Finalizer Extension Apis +struct FinalizerExtTable { + ApiTableVersion version; decltype(hsa_ext_program_create)* hsa_ext_program_create_fn; decltype(hsa_ext_program_destroy)* hsa_ext_program_destroy_fn; decltype(hsa_ext_program_add_module)* hsa_ext_program_add_module_fn; decltype(hsa_ext_program_iterate_modules)* hsa_ext_program_iterate_modules_fn; decltype(hsa_ext_program_get_info)* hsa_ext_program_get_info_fn; decltype(hsa_ext_program_finalize)* hsa_ext_program_finalize_fn; +}; + +// Table to export HSA Image Extension Apis +struct ImageExtTable { + ApiTableVersion version; decltype(hsa_ext_image_get_capability)* hsa_ext_image_get_capability_fn; decltype(hsa_ext_image_data_get_info)* hsa_ext_image_data_get_info_fn; decltype(hsa_ext_image_create)* hsa_ext_image_create_fn; @@ -73,7 +111,40 @@ struct ExtTable { decltype(hsa_ext_sampler_destroy)* hsa_ext_sampler_destroy_fn; }; -struct ApiTable { +// Table to export AMD Extension Apis +struct AmdExtTable { + ApiTableVersion version; + decltype(hsa_amd_coherency_get_type)* hsa_amd_coherency_get_type_fn; + decltype(hsa_amd_coherency_set_type)* hsa_amd_coherency_set_type_fn; + decltype(hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled_fn; + decltype(hsa_amd_profiling_async_copy_enable) *hsa_amd_profiling_async_copy_enable_fn; + decltype(hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time_fn; + decltype(hsa_amd_profiling_get_async_copy_time) *hsa_amd_profiling_get_async_copy_time_fn; + decltype(hsa_amd_profiling_convert_tick_to_system_domain)* hsa_amd_profiling_convert_tick_to_system_domain_fn; + decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler_fn; + decltype(hsa_amd_async_function)* hsa_amd_async_function_fn; + decltype(hsa_amd_signal_wait_any)* hsa_amd_signal_wait_any_fn; + decltype(hsa_amd_queue_cu_set_mask)* hsa_amd_queue_cu_set_mask_fn; + decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info_fn; + decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools_fn; + decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate_fn; + decltype(hsa_amd_memory_pool_free)* hsa_amd_memory_pool_free_fn; + decltype(hsa_amd_memory_async_copy)* 
hsa_amd_memory_async_copy_fn; + decltype(hsa_amd_agent_memory_pool_get_info)* hsa_amd_agent_memory_pool_get_info_fn; + decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access_fn; + decltype(hsa_amd_memory_pool_can_migrate)* hsa_amd_memory_pool_can_migrate_fn; + decltype(hsa_amd_memory_migrate)* hsa_amd_memory_migrate_fn; + decltype(hsa_amd_memory_lock)* hsa_amd_memory_lock_fn; + decltype(hsa_amd_memory_unlock)* hsa_amd_memory_unlock_fn; + decltype(hsa_amd_memory_fill)* hsa_amd_memory_fill_fn; + decltype(hsa_amd_interop_map_buffer)* hsa_amd_interop_map_buffer_fn; + decltype(hsa_amd_interop_unmap_buffer)* hsa_amd_interop_unmap_buffer_fn; + decltype(::hsa_amd_image_create)* hsa_amd_image_create_fn; +}; + +// Table to export HSA Core Runtime Apis +struct CoreApiTable { + ApiTableVersion version; decltype(hsa_init)* hsa_init_fn; decltype(hsa_shut_down)* hsa_shut_down_fn; decltype(hsa_system_get_info)* hsa_system_get_info_fn; @@ -170,8 +241,126 @@ struct ApiTable { decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info_fn; decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols_fn; decltype(hsa_status_string)* hsa_status_string_fn; +}; + +// Table to export HSA Apis from Core Runtime, Amd Extensions +// Finalizer and Images +struct HsaApiTable { + + // Version of Hsa Api Table + ApiTableVersion version; + + // Table of function pointers to HSA Core Runtime + CoreApiTable* core_; + + // Table of function pointers to AMD extensions + AmdExtTable* amd_ext_; + + // Table of function pointers to HSA Finalizer Extension + FinalizerExtTable* finalizer_ext_; + + // Table of function pointers to HSA Image Extension + ImageExtTable* image_ext_; - ExtTable* std_exts_; }; +// Structure containing instances of different api tables +struct HsaApiTableContainer { + HsaApiTable root; + CoreApiTable core; + AmdExtTable amd_ext; + FinalizerExtTable finalizer_ext; + ImageExtTable image_ext; + + // Default initialization of a container instance + HsaApiTableContainer() { + root.version.major_id = HSA_API_TABLE_MAJOR_VERSION; + root.version.minor_id = sizeof(HsaApiTable); + root.version.step_id = HSA_API_TABLE_STEP_VERSION; + + core.version.major_id = HSA_CORE_API_TABLE_MAJOR_VERSION; + core.version.minor_id = sizeof(CoreApiTable); + core.version.step_id = HSA_CORE_API_TABLE_STEP_VERSION; + root.core_ = &core; + + amd_ext.version.major_id = HSA_AMD_EXT_API_TABLE_MAJOR_VERSION; + amd_ext.version.minor_id = sizeof(AmdExtTable); + amd_ext.version.step_id = HSA_AMD_EXT_API_TABLE_STEP_VERSION; + root.amd_ext_ = &amd_ext; + + finalizer_ext.version.major_id = HSA_FINALIZER_API_TABLE_MAJOR_VERSION; + finalizer_ext.version.minor_id = sizeof(FinalizerExtTable); + finalizer_ext.version.step_id = HSA_FINALIZER_API_TABLE_STEP_VERSION; + root.finalizer_ext_ = & finalizer_ext; + + image_ext.version.major_id = HSA_IMAGE_API_TABLE_MAJOR_VERSION; + image_ext.version.minor_id = sizeof(ImageExtTable); + image_ext.version.step_id = HSA_IMAGE_API_TABLE_STEP_VERSION; + root.image_ext_ = &image_ext; + } +}; + +// Api to copy function pointers of a table +static +void inline copyApi(void* src, void* dest, size_t size) { + memcpy((char*)src + sizeof(ApiTableVersion), + (char*)dest + sizeof(ApiTableVersion), + (size - sizeof(ApiTableVersion))); +} + +// Copy constructor for all Api tables. The function assumes the +// user has initialized an instance of tables container correctly +// for the Major, Minor and Stepping Ids of Root and Child Api tables. 
+// The function will overwrite the value of Minor Id by taking the +// minimum of source and destination parameters. It will also overwrite +// the stepping Id with value from source parameter. +static const +void inline copyTables(const HsaApiTable* src, HsaApiTableContainer* dest) { + + // Verify Major Id of source and destination tables are valid + assert(dest->root.version.major_id == src->version.major_id); + assert(dest->core.version.major_id == src->core_->version.major_id); + assert(dest->amd_ext.version.major_id == src->amd_ext_->version.major_id); + assert(dest->finalizer_ext.version.major_id == src->finalizer_ext_->version.major_id); + assert(dest->image_ext.version.major_id == src->image_ext_->version.major_id); + + // Initialize the stepping id and minor id of root table. For the + // minor id which encodes struct size, take the minimum of source + // and destination parameters + dest->root.version.step_id = src->version.step_id; + dest->root.version.minor_id = Min(dest->root.version.minor_id, src->version.minor_id); + + // Copy the Core Api table + size_t size = dest->root.version.minor_id; + if (size > offsetof(HsaApiTable, core_)) { + dest->core.version.step_id = src->core_->version.step_id; + dest->core.version.minor_id = Min(dest->core.version.minor_id, + src->core_->version.minor_id); + copyApi(&dest->core, src->core_, dest->core.version.minor_id); + } + + // Copy the Amd Ext Api table + if (size > offsetof(HsaApiTable, amd_ext_)) { + dest->amd_ext.version.step_id = src->amd_ext_->version.step_id; + dest->amd_ext.version.minor_id = Min(dest->core.version.minor_id, + src->amd_ext_->version.minor_id); + copyApi(&dest->amd_ext, src->amd_ext_, dest->amd_ext.version.minor_id); + } + + // Copy the Finalizer Ext Api table + if (size > offsetof(HsaApiTable, finalizer_ext_)) { + dest->finalizer_ext.version.step_id = src->finalizer_ext_->version.step_id; + dest->finalizer_ext.version.minor_id = Min(dest->core.version.minor_id, + src->finalizer_ext_->version.minor_id); + copyApi(&dest->finalizer_ext, src->finalizer_ext_, dest->finalizer_ext.version.minor_id); + } + + // Copy the Image Ext Api table + if (size > offsetof(HsaApiTable, image_ext_)) { + dest->image_ext.version.step_id = src->image_ext_->version.step_id; + dest->image_ext.version.minor_id = Min(dest->core.version.minor_id, + src->image_ext_->version.minor_id); + copyApi(&dest->image_ext, src->image_ext_, dest->image_ext.version.minor_id); + } +} #endif diff --git a/src/inc/hsa_ext_amd.h b/src/inc/hsa_ext_amd.h index 7a4ed5727..4cbc82e7e 100644 --- a/src/inc/hsa_ext_amd.h +++ b/src/inc/hsa_ext_amd.h @@ -211,6 +211,23 @@ typedef struct hsa_amd_profiling_dispatch_time_s { uint64_t end; } hsa_amd_profiling_dispatch_time_t; +/** + * @brief Structure containing profiling async copy time information. + * + * Times are reported as ticks in the domain of the HSA system clock. + * The HSA system clock tick and frequency is obtained via hsa_system_get_info. + */ +typedef struct hsa_amd_profiling_async_copy_time_s { + /** + * Async copy processing start time. + */ + uint64_t start; + /** + * Async copy completion time. + */ + uint64_t end; +} hsa_amd_profiling_async_copy_time_t; + /** * @brief Enable or disable profiling capability of a queue. * @@ -230,10 +247,34 @@ typedef struct hsa_amd_profiling_dispatch_time_s { hsa_status_t HSA_API hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable); +/** + * @brief Enable or disable asynchronous memory copy profiling. 
+ * + * @details The runtime will provide the copy processing start timestamp and + * completion timestamp of each call to hsa_amd_memory_async_copy if the + * async copy profiling is enabled prior to the call to + * hsa_amd_memory_async_copy. The completion signal object is used to + * hold the last async copy start and end timestamp. The client can retrieve + * these timestamps via call to hsa_amd_profiling_get_async_copy_time. + * + * @param[in] enable True to enable profiling. False to disable profiling. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed on allocating resources + * needed to profile the asynchronous copy. + */ +hsa_status_t HSA_API + hsa_amd_profiling_async_copy_enable(bool enable); + /** * @brief Retrieve packet processing time stamps. * - * @param[in] agent The agent with which the signal was last used. For instance, + * @param[in] agent The agent with which the signal was last used. For + *instance, * if the profiled dispatch packet is dispatched on to queue Q, which was * created on agent A, then this parameter must be A. * @@ -261,9 +302,33 @@ hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time( hsa_agent_t agent, hsa_signal_t signal, hsa_amd_profiling_dispatch_time_t* time); +/** + * @brief Retrieve asynchronous copy timestamps. + * + * @details Async copy profiling is enabled via call to + * hsa_amd_profiling_async_copy_enable. + * + * @param[in] signal A signal used as the completion signal of the call to + * hsa_amd_memory_async_copy. + * + * @param[out] time Async copy processing timestamps in the HSA system clock + * domain. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL The signal is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p time is NULL. + */ +hsa_status_t HSA_API hsa_amd_profiling_get_async_copy_time( + hsa_signal_t signal, hsa_amd_profiling_async_copy_time_t* time); + /** * @brief Computes the frequency ratio and offset between the agent clock and - * HSA system clock and converts the agent’s tick to HSA system domain tick. + * HSA system clock and converts the agent's tick to HSA system domain tick. * * @param[in] agent The agent used to retrieve the agent_tick. It is user's * responsibility to make sure the tick number is from this agent, otherwise, @@ -392,7 +457,7 @@ hsa_status_t HSA_API * * @details Allows waiting for any of several signal and conditions pairs to be * satisfied. The function returns the index into the list of signals of the - * first satisfying signal-condition pair. The value of the satisfying signal’s + * first satisfying signal-condition pair. The value of the satisfying signal's * value is returned in satisfying_value unless satisfying_value is NULL. This * function provides only relaxed memory semantics. */ @@ -857,7 +922,10 @@ typedef enum { /** * Number of links to hop when accessing the memory pool from the specified - * agent. The type of this attribute is uint32_t. + * agent. The value of this attribute is zero if the memory pool is associated + * with the agent, or if the access type is + * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. The type of this attribute is + * uint32_t. 
*/ HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS = 1, diff --git a/src/inc/hsa_ven_amd_loaded_code_object.h b/src/inc/hsa_ven_amd_loaded_code_object.h deleted file mode 100644 index fe56e3813..000000000 --- a/src/inc/hsa_ven_amd_loaded_code_object.h +++ /dev/null @@ -1,95 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// -// The University of Illinois/NCSA -// Open Source License (NCSA) -// -// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// -// Developed by: -// -// AMD Research and AMD HSA Software Development -// -// Advanced Micro Devices, Inc. -// -// www.amd.com -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to -// deal with the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimers. -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimers in -// the documentation and/or other materials provided with the distribution. -// - Neither the names of Advanced Micro Devices, Inc, -// nor the names of its contributors may be used to endorse or promote -// products derived from this Software without specific prior written -// permission. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -// DEALINGS WITH THE SOFTWARE. -// -//////////////////////////////////////////////////////////////////////////////// - -// HSA AMD extension for loaded code objects. - -#ifndef HSA_VEN_AMD_LOADED_CODE_OBJECT_H -#define HSA_VEN_AMD_LOADED_CODE_OBJECT_H - -#include "hsa.h" - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -/** - * @brief Records loaded code object's host address in @p host_address given - * loaded code object's device address. Recorded host address points to host - * accessible memory, which is identical to memory pointed to by device address. - * If device address already points to host accessible memory, then device - * address is recorded in @p host_address. - * - * @param[in] device_address Device address. - * - * @param[out] host_address Pointer to application-allocated buffer, where to - * record host address. - * - * @retval HSA_STATUS_SUCCESS Function has been executed successfully. - * - * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime has not been initialized. - * - * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p device address is invalid/null, - * or @p host address is null. - */ -hsa_status_t HSA_API hsa_ven_amd_loaded_code_object_query_host_address( - const void *device_address, - const void **host_address); - -/** - * @brief Extension's version. 
- */ -#define hsa_ven_amd_loaded_code_object 001000 - -/** - * @brief Extension's function table. - */ -typedef struct hsa_ven_amd_loaded_code_object_1_00_pfn_s { - hsa_status_t (*hsa_ven_amd_loaded_code_object_query_host_address)( - const void *device_address, - const void **host_address); -} hsa_ven_amd_loaded_code_object_1_00_pfn_t; - -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus - -#endif // HSA_VEN_AMD_LOADED_CODE_OBJECT_H diff --git a/src/inc/hsa_ven_amd_loader.h b/src/inc/hsa_ven_amd_loader.h new file mode 100644 index 000000000..804a360a2 --- /dev/null +++ b/src/inc/hsa_ven_amd_loader.h @@ -0,0 +1,249 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// HSA AMD extension for additional loader functionality. + +#ifndef HSA_VEN_AMD_LOADER_H +#define HSA_VEN_AMD_LOADER_H + +#include "hsa.h" + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** + * @brief Queries equivalent host address for given @p device_address, and + * records it in @p host_address. + * + * + * @details Contents of memory pointed to by @p host_address would be identical + * to contents of memory pointed to by @p device_address. Only difference + * between the two is host accessibility: @p host_address is always accessible + * from host, @p device_address might not be accessible from host. + * + * If @p device_address already points to host accessible memory, then the value + * of @p device_address is simply copied into @p host_address. 
+ * + * The lifetime of @p host_address is the same as the lifetime of @p + * device_address, and both lifetimes are limited by the lifetime of the + * executable that is managing these addresses. + * + * + * @param[in] device_address Device address to query equivalent host address + * for. + * + * @param[out] host_address Pointer to application-allocated buffer to record + * queried equivalent host address in. + * + * + * @retval HSA_STATUS_SUCCESS Function is executed successfully. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p device_address is invalid or + * null, or @p host_address is null. + */ +hsa_status_t HSA_API hsa_ven_amd_loader_query_host_address( + const void *device_address, + const void **host_address); + +/** + * @brief The storage type of the code object that is backing loaded memory + * segment. + */ +typedef enum { + /** + * Loaded memory segment is not backed by any code object (anonymous), as the + * case would be with BSS (uninitialized data). + */ + HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE = 0, + /** + * Loaded memory segment is backed by the code object that is stored in the + * file. + */ + HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE = 1, + /** + * Loaded memory segment is backed by the code object that is stored in the + * memory. + */ + HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY = 2 +} hsa_ven_amd_loader_code_object_storage_type_t; + +/** + * @brief Loaded memory segment descriptor. + * + * + * @details Loaded memory segment descriptor describes underlying loaded memory + * segment. Loaded memory segment is created/allocated by the executable during + * the loading of the code object that is backing underlying memory segment. + * + * The lifetime of underlying memory segment is limited by the lifetime of the + * executable that is managing underlying memory segment. + */ +typedef struct hsa_ven_amd_loader_segment_descriptor_s { + /** + * Agent underlying memory segment is allocated on. If the code object that is + * backing underlying memory segment is program code object, then 0. + */ + hsa_agent_t agent; + /** + * Executable that is managing this underlying memory segment. + */ + hsa_executable_t executable; + /** + * Storage type of the code object that is backing underlying memory segment. + */ + hsa_ven_amd_loader_code_object_storage_type_t code_object_storage_type; + /** + * If the storage type of the code object that is backing underlying memory + * segment is: + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then null; + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, then null-terminated + * filepath to the code object; + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY, then host + * accessible pointer to the first byte of the code object. + */ + const void *code_object_storage_base; + /** + * If the storage type of the code object that is backing underlying memory + * segment is: + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then 0; + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, then the length of + * the filepath to the code object (including null-terminating character); + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY, then the size, in + * bytes, of the memory occupied by the code object. 
+ */ + size_t code_object_storage_size; + /** + * If the storage type of the code object that is backing underlying memory + * segment is: + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then 0; + * - other, then offset, in bytes, from the beginning of the code object to + * the first byte in the code object data is copied from. + */ + size_t code_object_storage_offset; + /** + * Starting address of the underlying memory segment. + */ + const void *segment_base; + /** + * Size, in bytes, of the underlying memory segment. + */ + size_t segment_size; +} hsa_ven_amd_loader_segment_descriptor_t; + +/** + * @brief Either queries loaded memory segment descriptors, or total number of + * loaded memory segment descriptors. + * + * + * @details If @p segment_descriptors is not null and @p num_segment_descriptors + * points to number that exactly matches total number of loaded memory segment + * descriptors, then queries loaded memory segment descriptors, and records them + * in @p segment_descriptors. If @p segment_descriptors is null and @p + * num_segment_descriptors points to zero, then queries total number of loaded + * memory segment descriptors, and records it in @p num_segment_descriptors. In + * all other cases returns appropriate error code (see below). + * + * The caller of this function is responsible for the allocation/deallocation + * and the lifetime of @p segment_descriptors and @p num_segment_descriptors. + * + * The lifetime of loaded memory segments that are described by queried loaded + * memory segment descriptors is limited by the lifetime of the executable that + * is managing loaded memory segments. + * + * Queried loaded memory segment descriptors are always self-consistent: they + * describe a complete set of loaded memory segments that are being backed by + * fully loaded code objects that are present at the time (i.e. this function + * is blocked until all executable manipulations are fully complete). + * + * + * @param[out] segment_descriptors Pointer to application-allocated buffer to + * record queried loaded memory segment descriptors in. Can be null if @p + * num_segment_descriptors points to zero. + * + * @param[in,out] num_segment_descriptors Pointer to application-allocated + * buffer that contains either total number of loaded memory segment descriptors + * or zero. + * + * + * @retval HSA_STATUS_SUCCESS Function is executed successfully. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p segment_descriptors is null + * while @p num_segment_descriptors points to non-zero number, @p + * segment_descriptors is not null while @p num_segment_descriptors points to + * zero, or @p num_segment_descriptors is null. + * + * @retval HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS @p num_segment_descriptors + * does not point to number that exactly matches total number of loaded memory + * segment descriptors. + */ +hsa_status_t HSA_API hsa_ven_amd_loader_query_segment_descriptors( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + +/** + * @brief Extension version. + */ +#define hsa_ven_amd_loader 001000 + +/** + * @brief Extension function table. 
+ */ +typedef struct hsa_ven_amd_loader_1_00_pfn_s { + hsa_status_t (*hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); + + hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); +} hsa_ven_amd_loader_1_00_pfn_t; + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* HSA_VEN_AMD_LOADER_H */ diff --git a/src/libamdhsacode/amd_elf_image.cpp b/src/libamdhsacode/amd_elf_image.cpp index df8748052..fb36d6234 100644 --- a/src/libamdhsacode/amd_elf_image.cpp +++ b/src/libamdhsacode/amd_elf_image.cpp @@ -456,6 +456,7 @@ namespace amd { uint64_t imageSize() const override { return phdr.p_filesz; } uint64_t vaddr() const override { return phdr.p_vaddr; } uint64_t flags() const override { return phdr.p_flags; } + uint64_t offset() const override { return phdr.p_offset; } const char* data() const override; uint16_t getSegmentIndex() override; bool updateAddSection(Section *section) override; @@ -1368,7 +1369,7 @@ namespace amd { section = new GElfRelocationSection(this); } else if (shdr.sh_type == SHT_STRTAB) { section = new GElfStringTable(this); - } else if (shdr.sh_type == SHT_SYMTAB) { + } else if (shdr.sh_type == SHT_SYMTAB || shdr.sh_type == SHT_DYNSYM) { section = new GElfSymbolTable(this); } else if (shdr.sh_type == SHT_NULL) { section = 0; @@ -1391,14 +1392,14 @@ namespace amd { for (size_t n = 1; n < sections.size(); ++n) { GElfSection* section = sections[n].get(); - if (section->type() == SHT_SYMTAB) { + if (section->type() == SHT_SYMTAB || section->type() == SHT_DYNSYM) { if (!section->pullData()) { return false; } } } for (size_t n = 1; n < sections.size(); ++n) { GElfSection* section = sections[n].get(); - if (section->type() != SHT_STRTAB && section->type() != SHT_SYMTAB) { + if (section->type() != SHT_STRTAB && section->type() != SHT_SYMTAB && section->type() != SHT_DYNSYM) { if (!section->pullData()) { return false; } } } diff --git a/src/libamdhsacode/amd_hsa_code.cpp b/src/libamdhsacode/amd_hsa_code.cpp index ed8753359..464ca7d1d 100644 --- a/src/libamdhsacode/amd_hsa_code.cpp +++ b/src/libamdhsacode/amd_hsa_code.cpp @@ -1181,7 +1181,13 @@ namespace code { void AmdHsaCode::PrintRelocationData(std::ostream& out, RelocationSection* section) { - out << " Relocation Entries for " << section->targetSection()->Name() << " Section (total " << section->relocationCount() << "):" << std::endl; + if (section->targetSection()) { + out << " Relocation Entries for " << section->targetSection()->Name() << " Section (total " << section->relocationCount() << "):" << std::endl; + } else { + // Dynamic relocations do not have a target section, they work with + // virtual addresses. 
+ out << " Dynamic Relocation Entries (total " << section->relocationCount() << "):" << std::endl; + } for (size_t i = 0; i < section->relocationCount(); ++i) { out << " Relocation (Index " << i << "):" << std::endl; out << " Type: " << section->relocation(i)->type() << std::endl; diff --git a/src/loader/executable.cpp b/src/loader/executable.cpp index 45c6abd61..17eb7c1ca 100644 --- a/src/loader/executable.cpp +++ b/src/loader/executable.cpp @@ -137,7 +137,7 @@ void Loader::Destroy(Loader *loader) Executable* AmdHsaCodeLoader::CreateExecutable( hsa_profile_t profile, const char *options) { - std::lock_guard<std::mutex> lock(executables_mutex); + WriterLockGuard<ReaderWriterLock> writer_lock(rw_lock_); executables.push_back(new ExecutableImpl(profile, context, executables.size())); return executables.back(); @@ -145,7 +145,8 @@ Executable* AmdHsaCodeLoader::CreateExecutable( void AmdHsaCodeLoader::DestroyExecutable(Executable *executable) { - std::lock_guard<std::mutex> lock(executables_mutex); + WriterLockGuard<ReaderWriterLock> writer_lock(rw_lock_); + executables[((ExecutableImpl*)executable)->id()] = nullptr; delete executable; } @@ -156,7 +157,7 @@ hsa_status_t AmdHsaCodeLoader::IterateExecutables( void *data), void *data) { - std::lock_guard<std::mutex> lock(executables_mutex); + WriterLockGuard<ReaderWriterLock> writer_lock(rw_lock_); assert(callback); for (auto &exec : executables) { @@ -169,12 +170,57 @@ hsa_status_t AmdHsaCodeLoader::IterateExecutables( return HSA_STATUS_SUCCESS; } +hsa_status_t AmdHsaCodeLoader::QuerySegmentDescriptors( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors) +{ + if (!num_segment_descriptors) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + if (*num_segment_descriptors == 0 && segment_descriptors) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + if (*num_segment_descriptors != 0 && !segment_descriptors) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + this->EnableReadOnlyMode(); + + size_t actual_num_segment_descriptors = 0; + for (auto &executable : executables) { + if (executable) { + actual_num_segment_descriptors += executable->GetNumSegmentDescriptors(); + } + } + + if (*num_segment_descriptors == 0) { + *num_segment_descriptors = actual_num_segment_descriptors; + this->DisableReadOnlyMode(); + return HSA_STATUS_SUCCESS; + } + if (*num_segment_descriptors != actual_num_segment_descriptors) { + this->DisableReadOnlyMode(); + return HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS; + } + + size_t i = 0; + for (auto &executable : executables) { + if (executable) { + i += executable->QuerySegmentDescriptors(segment_descriptors, actual_num_segment_descriptors, i); + } + } + + this->DisableReadOnlyMode(); + return HSA_STATUS_SUCCESS; +} + uint64_t AmdHsaCodeLoader::FindHostAddress(uint64_t device_address) { + ReaderLockGuard<ReaderWriterLock> reader_lock(rw_lock_); if (device_address == 0) { return 0; } - std::lock_guard<std::mutex> lock(executables_mutex); + for (auto &exec : executables) { if (exec != nullptr) { uint64_t host_address = exec->FindHostAddress(device_address); @@ -186,6 +232,26 @@ uint64_t AmdHsaCodeLoader::FindHostAddress(uint64_t device_address) return 0; } +void AmdHsaCodeLoader::EnableReadOnlyMode() +{ + rw_lock_.ReaderLock(); + for (auto &executable : executables) { + if (executable) { + ((ExecutableImpl*)executable)->EnableReadOnlyMode(); + } + } +} + +void AmdHsaCodeLoader::DisableReadOnlyMode() +{ + rw_lock_.ReaderUnlock(); + for (auto &executable : executables) { + if 
(executable) { + ((ExecutableImpl*)executable)->DisableReadOnlyMode(); + } + } +} + //===----------------------------------------------------------------------===// // SymbolImpl. // //===----------------------------------------------------------------------===// @@ -754,6 +820,44 @@ hsa_status_t ExecutableImpl::IterateLoadedCodeObjects( return HSA_STATUS_SUCCESS; } +size_t ExecutableImpl::GetNumSegmentDescriptors() +{ + // assuming we are in readonly mode. + size_t actual_num_segment_descriptors = 0; + for (auto &obj : loaded_code_objects) { + actual_num_segment_descriptors += obj->LoadedSegments().size(); + } + return actual_num_segment_descriptors; +} + +size_t ExecutableImpl::QuerySegmentDescriptors( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t total_num_segment_descriptors, + size_t first_empty_segment_descriptor) +{ + // assuming we are in readonly mode. + assert(segment_descriptors); + assert(first_empty_segment_descriptor < total_num_segment_descriptors); + + size_t i = first_empty_segment_descriptor; + for (auto &obj : loaded_code_objects) { + assert(i < total_num_segment_descriptors); + for (auto &seg : obj->LoadedSegments()) { + segment_descriptors[i].agent = seg->Agent(); + segment_descriptors[i].executable = Executable::Handle(seg->Owner()); + segment_descriptors[i].code_object_storage_type = HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY; + segment_descriptors[i].code_object_storage_base = obj->ElfData(); + segment_descriptors[i].code_object_storage_size = obj->ElfSize(); + segment_descriptors[i].code_object_storage_offset = seg->StorageOffset(); + segment_descriptors[i].segment_base = seg->Address(seg->VAddr()); + segment_descriptors[i].segment_size = seg->Size(); + ++i; + } + } + + return i - first_empty_segment_descriptor; +} + uint64_t ExecutableImpl::FindHostAddress(uint64_t device_address) { for (auto &obj : loaded_code_objects) { @@ -771,6 +875,16 @@ uint64_t ExecutableImpl::FindHostAddress(uint64_t device_address) return 0; } +void ExecutableImpl::EnableReadOnlyMode() +{ + rw_lock_.ReaderLock(); +} + +void ExecutableImpl::DisableReadOnlyMode() +{ + rw_lock_.ReaderUnlock(); +} + #define HSAERRCHECK(hsc) \ if (hsc != HSA_STATUS_SUCCESS) { \ assert(false); \ @@ -854,7 +968,7 @@ hsa_status_t ExecutableImpl::LoadCodeObject( } if (loaderOptions.DumpAll()->is_set() || loaderOptions.DumpCode()->is_set()) { - if (!code->SaveToFile(amd::hsa::DumpFileName(loaderOptions.DumpDir()->value(), LOADER_DUMP_PREFIX, "co", dumpNum))) { + if (!code->SaveToFile(amd::hsa::DumpFileName(loaderOptions.DumpDir()->value(), LOADER_DUMP_PREFIX, "hsaco", dumpNum))) { // Ignore error. 
} } @@ -946,7 +1060,7 @@ hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent, code::Segment* s) if (need_alloc) { void* ptr = context_->SegmentAlloc(segment, agent, s->memSize(), s->align(), true); if (!ptr) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } - new_seg = new Segment(this, agent, segment, ptr, s->memSize(), s->vaddr()); + new_seg = new Segment(this, agent, segment, ptr, s->memSize(), s->vaddr(), s->offset()); new_seg->Copy(s->vaddr(), s->data(), s->imageSize()); objects.push_back(new_seg); @@ -1422,7 +1536,7 @@ hsa_status_t ExecutableImpl::LoadSegmentV2(hsa_agent_t agent, code::Segment* s, void* ptr = context_->SegmentAlloc(segment, agent, s->memSize(), s->align(), true); if (!ptr) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } - Segment *new_seg = new Segment(this, agent, segment, ptr, s->memSize(), s->vaddr()); + Segment *new_seg = new Segment(this, agent, segment, ptr, s->memSize(), s->vaddr(), s->offset()); new_seg->Copy(s->vaddr(), s->data(), s->imageSize()); objects.push_back(new_seg); assert(new_seg); diff --git a/src/loader/executable.hpp b/src/loader/executable.hpp index 6801be698..478a03ffd 100644 --- a/src/loader/executable.hpp +++ b/src/loader/executable.hpp @@ -273,16 +273,18 @@ class Segment : public LoadedSegment, public ExecutableObject { size_t size; uint64_t vaddr; bool frozen; + size_t storage_offset; public: - Segment(ExecutableImpl *owner_, hsa_agent_t agent_, amdgpu_hsa_elf_segment_t segment_, void* ptr_, size_t size_, uint64_t vaddr_) + Segment(ExecutableImpl *owner_, hsa_agent_t agent_, amdgpu_hsa_elf_segment_t segment_, void* ptr_, size_t size_, uint64_t vaddr_, size_t storage_offset_) : ExecutableObject(owner_, agent_), segment(segment_), - ptr(ptr_), size(size_), vaddr(vaddr_), frozen(false) { } + ptr(ptr_), size(size_), vaddr(vaddr_), frozen(false), storage_offset(storage_offset_) { } amdgpu_hsa_elf_segment_t ElfSegment() const { return segment; } void* Ptr() const { return ptr; } size_t Size() const { return size; } uint64_t VAddr() const { return vaddr; } + size_t StorageOffset() const { return storage_offset; } bool GetInfo(amd_loaded_segment_info_t attribute, void *value) override; @@ -399,8 +401,18 @@ class ExecutableImpl final: public Executable { void *data), void *data); + size_t GetNumSegmentDescriptors() override; + + size_t QuerySegmentDescriptors( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t total_num_segment_descriptors, + size_t first_empty_segment_descriptor) override; + uint64_t FindHostAddress(uint64_t device_address) override; + void EnableReadOnlyMode(); + void DisableReadOnlyMode(); + void Print(std::ostream& out) override; bool PrintToFile(const std::string& filename) override; @@ -455,7 +467,7 @@ class AmdHsaCodeLoader : public Loader { private: Context* context; std::vector<Executable*> executables; - std::mutex executables_mutex; + amd::hsa::common::ReaderWriterLock rw_lock_; public: AmdHsaCodeLoader(Context* context_) @@ -473,7 +485,14 @@ class AmdHsaCodeLoader : public Loader { void *data), void *data) override; + hsa_status_t QuerySegmentDescriptors( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors) override; + uint64_t FindHostAddress(uint64_t device_address) override; + + void EnableReadOnlyMode(); + void DisableReadOnlyMode(); }; } // namespace loader diff --git a/src/loader/loaders.cpp b/src/loader/loaders.cpp index 7cb81f111..f01d48d27 100644 --- a/src/loader/loaders.cpp +++ b/src/loader/loaders.cpp @@ -87,6 +87,10 @@ namespace loader { 
    gfx803.handle = 803;
    gfx804.handle = 804;
    gfx810.handle = 810;
+#if defined(GFX9_BUILD)
+    gfx900.handle = 900;
+    gfx901.handle = 901;
+#endif // GFX9_BUILD
   }
 
   hsa_isa_t OfflineLoaderContext::IsaFromName(const char *name)
@@ -108,6 +112,12 @@ namespace loader {
       return gfx804;
     } else if (sname == "AMD:AMDGPU:8:1:0") {
       return gfx810;
+#if defined(GFX9_BUILD)
+    } else if (sname == "AMD:AMDGPU:9:0:0") {
+      return gfx900;
+    } else if (sname == "AMD:AMDGPU:9:0:1") {
+      return gfx901;
+#endif // GFX9_BUILD
     } else {
       assert(0);
       return invalid;
diff --git a/src/loader/loaders.hpp b/src/loader/loaders.hpp
index b0a6aa0ec..85a9ed2ec 100644
--- a/src/loader/loaders.hpp
+++ b/src/loader/loaders.hpp
@@ -55,7 +55,11 @@ namespace loader {
   private:
     hsa_isa_t invalid;
     hsa_isa_t gfx700, gfx701, gfx800, gfx801, gfx802, gfx803, gfx804, gfx810;
+#if defined(GFX9_BUILD)
+    hsa_isa_t gfx900, gfx901;
+#else
     hsa_isa_t reserved;
+#endif // GFX9_BUILD
     std::ostream& out;
     typedef std::set<void*> PointerSet;
     PointerSet pointers;
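
Example usage of the new loader extension (a sketch only: it assumes the
extension function table has been obtained as shown in the
hsa_ven_amd_loader_1_00_pfn_t documentation above, and omits error handling
and the <stdlib.h> include for brevity):

    /* First call: *num_segment_descriptors == 0 and a null buffer make the
       runtime report the total number of loaded memory segment descriptors. */
    size_t count = 0;
    loader_pfn.hsa_ven_amd_loader_query_segment_descriptors(NULL, &count);

    /* Second call: the buffer must hold exactly 'count' descriptors. */
    hsa_ven_amd_loader_segment_descriptor_t *descriptors =
        (hsa_ven_amd_loader_segment_descriptor_t *)malloc(
            count * sizeof(*descriptors));
    loader_pfn.hsa_ven_amd_loader_query_segment_descriptors(descriptors, &count);

    for (size_t i = 0; i < count; ++i) {
      /* descriptors[i].segment_base and .segment_size describe one loaded
         memory segment; .executable identifies the managing executable. */
    }
    free(descriptors);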