From a6047746e4e7a8cd939c1f6cbfc7e865c9ac1a95 Mon Sep 17 00:00:00 2001
From: James Edwards <>
Date: Sun, 14 Aug 2016 10:03:45 -0500
Subject: [PATCH] ROCR 1.2 updates

 src/README.md                                 |  12 -
 src/core/CMakeLists.txt                       |  17 +-
 src/core/common/hsa_table_interface.cpp       | 423 +++++++--
 src/core/common/shared.h                      |   2 +
 src/core/                       |   2 +
 src/core/inc/agent.h                          |  32 +-
 src/core/inc/amd_aql_queue.h                  |   6 +-
 src/core/inc/amd_blit_kernel.h                |  73 +-
 src/core/inc/amd_blit_kernel_kv.h             | 479 ----------
 src/core/inc/amd_blit_kernel_vi.h             | 490 ----------
 src/core/inc/amd_blit_sdma.h                  |  26 +-
 src/core/inc/amd_elf_image.hpp                |   1 +
 src/core/inc/amd_gpu_agent.h                  |  86 +-
 src/core/inc/amd_gpu_shaders.h                | 169 ++++
 src/core/inc/amd_hsa_loader.hpp               |  13 +
 src/core/inc/blit.h                           |  13 +-
 src/core/inc/hsa_api_trace_int.h              |  29 +-
 src/core/inc/hsa_ext_amd_impl.h               | 186 ++++
 src/core/inc/hsa_ext_interface.h              |  28 +-
 src/core/inc/hsa_table_interface.h            |   4 +-
 src/core/inc/interrupt_signal.h               |   8 -
 src/core/inc/runtime.h                        |   5 -
 src/core/inc/signal.h                         |  14 +-
 src/core/runtime/amd_aql_queue.cpp            | 182 ++--
 src/core/runtime/amd_blit_kernel.cpp          | 862 +++++++++++++-----
 src/core/runtime/amd_blit_sdma.cpp            | 230 ++++-
 src/core/runtime/amd_gpu_agent.cpp            | 330 +++++--
 src/core/runtime/amd_memory_region.cpp        |  91 +-
 src/core/runtime/amd_topology.cpp             |   6 -
 src/core/runtime/hsa.cpp                      | 113 +--
 src/core/runtime/hsa_api_trace.cpp            | 324 ++++---
 src/core/runtime/hsa_ext_amd.cpp              |  86 +-
 src/core/runtime/hsa_ext_interface.cpp        | 340 ++++---
 ...code_object.cpp => hsa_ven_amd_loader.cpp} |  15 +-
 src/core/runtime/interrupt_signal.cpp         |   6 +-
 src/core/runtime/runtime.cpp                  |  85 +-
 src/core/util/win/os_win.cpp                  | 227 +++++
 src/inc/hsa.h                                 |   8 +-
 src/inc/hsa_api_trace.h                       | 195 +++-
 src/inc/hsa_ext_amd.h                         |  76 +-
 src/inc/hsa_ven_amd_loaded_code_object.h      |  95 --
 src/inc/hsa_ven_amd_loader.h                  | 249 +++++
 src/libamdhsacode/amd_elf_image.cpp           |   7 +-
 src/libamdhsacode/amd_hsa_code.cpp            |   8 +-
 src/loader/executable.cpp                     | 128 ++-
 src/loader/executable.hpp                     |  25 +-
 src/loader/loaders.cpp                        |  10 +
 src/loader/loaders.hpp                        |   4 +
 48 files changed, 3680 insertions(+), 2140 deletions(-)
 delete mode 100644 src/core/inc/amd_blit_kernel_kv.h
 delete mode 100644 src/core/inc/amd_blit_kernel_vi.h
 create mode 100644 src/core/inc/amd_gpu_shaders.h
 create mode 100755 src/core/inc/hsa_ext_amd_impl.h
 rename src/core/runtime/{hsa_ven_amd_loaded_code_object.cpp => hsa_ven_amd_loader.cpp} (83%)
 create mode 100644 src/core/util/win/os_win.cpp
 delete mode 100644 src/inc/hsa_ven_amd_loaded_code_object.h
 create mode 100644 src/inc/hsa_ven_amd_loader.h

diff --git a/src/README.md b/src/README.md
index 2b6bea468..1ee7e8425 100644
--- a/src/README.md
+++ b/src/README.md
@@ -60,18 +60,6 @@ For example, from the top level ROCR repository execute:
 The name of the core hsa runtime is
-#### External requirements
-The core runtime requires the sp3.a library to be able to compiler
-on x86_64 architechtures. The binaries for the sp3.a librariy can
-be found on the amd-codexl-analyzer GitHub repository:
-The x86_64 library and associated header files have been added to
-this code base for convenience, but are still subject to the 
-AMD copyright license.
 #### Specs
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index ec0816ca1..0eb9af686 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -66,18 +66,6 @@ if ( NOT EXISTS ${HSATHK_BUILD_LIB_PATH}/ )
     MESSAGE ( FATAL_ERROR "Environment variable HSATHK_BUILD_LIB_PATH is not set to point to the location where KFD Thunk library could be found." )
 endif ()
-else ()
-endif ()
-if ( EXISTS ${LIBSP3_BUILD_LIB_PATH}/libsp3.a )
-else ()
-endif ()
 MESSAGE ( ------IS64BIT: ${IS64BIT} )
 MESSAGE ( ------Compiler: ${CMAKE_CXX_COMPILER} )
@@ -132,7 +120,7 @@ set ( CORE_SRCS ${CORE_SRCS} runtime/amd_cpu_agent.cpp )
 set ( CORE_SRCS ${CORE_SRCS} runtime/amd_gpu_agent.cpp )
 set ( CORE_SRCS ${CORE_SRCS} runtime/amd_aql_queue.cpp )
 set ( CORE_SRCS ${CORE_SRCS} runtime/amd_loader_context.cpp )
-set ( CORE_SRCS ${CORE_SRCS} runtime/hsa_ven_amd_loaded_code_object.cpp )
+set ( CORE_SRCS ${CORE_SRCS} runtime/hsa_ven_amd_loader.cpp )
 set ( CORE_SRCS ${CORE_SRCS} runtime/amd_memory_region.cpp )
 set ( CORE_SRCS ${CORE_SRCS} runtime/amd_topology.cpp )
 set ( CORE_SRCS ${CORE_SRCS} runtime/default_signal.cpp )
@@ -153,11 +141,9 @@ include_directories ( ${CMAKE_CURRENT_SOURCE_DIR}/.. )
 include_directories ( ${CMAKE_CURRENT_SOURCE_DIR}/../inc )
 include_directories ( ${CMAKE_CURRENT_SOURCE_DIR}/inc )
 include_directories ( ${HSATHK_BUILD_INC_PATH} )
-include_directories ( ${LIBSP3_BUILD_INC_PATH} )
 ## Library path(s).
 link_directories ( ${HSATHK_BUILD_LIB_PATH} )
-link_directories ( ${LIBSP3_BUILD_LIB_PATH} )
@@ -172,7 +158,6 @@ target_link_libraries ( ${CORE_RUNTIME_TARGET}
   PRIVATE amdhsaloader
   PRIVATE amdhsacode
   PRIVATE hsakmt
-  PRIVATE sp3
   dl pthread rt
diff --git a/src/core/common/hsa_table_interface.cpp b/src/core/common/hsa_table_interface.cpp
index ffbe749a9..13154820f 100644
--- a/src/core/common/hsa_table_interface.cpp
+++ b/src/core/common/hsa_table_interface.cpp
@@ -41,60 +41,69 @@
 #include "hsa_api_trace.h"
+#include "core/inc/hsa_api_trace_int.h"
-static const ApiTable* HsaApiTable;
+static const HsaApiTable* hsaApiTable;
+static const CoreApiTable* coreApiTable;
+static const AmdExtTable* amdExtTable;
-void hsa_table_interface_init(const ApiTable* Table) { HsaApiTable = Table; }
+void hsa_table_interface_init(const HsaApiTable* apiTable) {
+    hsaApiTable = apiTable;
+    coreApiTable = apiTable->core_;
+    amdExtTable = apiTable->amd_ext_;
-const ApiTable* hsa_table_interface_get_table() { return HsaApiTable; }
+const HsaApiTable* hsa_table_interface_get_table() {
+  return hsaApiTable;
 // Pass through stub functions
-hsa_status_t HSA_API hsa_init() { return HsaApiTable->hsa_init_fn(); }
+hsa_status_t HSA_API hsa_init() { return coreApiTable->hsa_init_fn(); }
-hsa_status_t HSA_API hsa_shut_down() { return HsaApiTable->hsa_shut_down_fn(); }
+hsa_status_t HSA_API hsa_shut_down() { return coreApiTable->hsa_shut_down_fn(); }
 hsa_status_t HSA_API
     hsa_system_get_info(hsa_system_info_t attribute, void* value) {
-  return HsaApiTable->hsa_system_get_info_fn(attribute, value);
+  return coreApiTable->hsa_system_get_info_fn(attribute, value);
 hsa_status_t HSA_API
     hsa_system_extension_supported(uint16_t extension, uint16_t version_major,
                                    uint16_t version_minor, bool* result) {
-  return HsaApiTable->hsa_system_extension_supported_fn(
+  return coreApiTable->hsa_system_extension_supported_fn(
       extension, version_major, version_minor, result);
 hsa_status_t HSA_API
     hsa_system_get_extension_table(uint16_t extension, uint16_t version_major,
                                    uint16_t version_minor, void* table) {
-  return HsaApiTable->hsa_system_get_extension_table_fn(
+  return coreApiTable->hsa_system_get_extension_table_fn(
       extension, version_major, version_minor, table);
 hsa_status_t HSA_API
     hsa_iterate_agents(hsa_status_t (*callback)(hsa_agent_t agent, void* data),
                        void* data) {
-  return HsaApiTable->hsa_iterate_agents_fn(callback, data);
+  return coreApiTable->hsa_iterate_agents_fn(callback, data);
 hsa_status_t HSA_API hsa_agent_get_info(hsa_agent_t agent,
                                         hsa_agent_info_t attribute,
                                         void* value) {
-  return HsaApiTable->hsa_agent_get_info_fn(agent, attribute, value);
+  return coreApiTable->hsa_agent_get_info_fn(agent, attribute, value);
 hsa_status_t HSA_API hsa_agent_get_exception_policies(hsa_agent_t agent,
                                                       hsa_profile_t profile,
                                                       uint16_t* mask) {
-  return HsaApiTable->hsa_agent_get_exception_policies_fn(agent, profile, mask);
+  return coreApiTable->hsa_agent_get_exception_policies_fn(agent, profile, mask);
 hsa_status_t HSA_API
     hsa_agent_extension_supported(uint16_t extension, hsa_agent_t agent,
                                   uint16_t version_major,
                                   uint16_t version_minor, bool* result) {
-  return HsaApiTable->hsa_agent_extension_supported_fn(
+  return coreApiTable->hsa_agent_extension_supported_fn(
       extension, agent, version_major, version_minor, result);
@@ -104,7 +113,7 @@ hsa_status_t HSA_API
                                       void* data),
                      void* data, uint32_t private_segment_size,
                      uint32_t group_segment_size, hsa_queue_t** queue) {
-  return HsaApiTable->hsa_queue_create_fn(agent, size, type, callback, data,
+  return coreApiTable->hsa_queue_create_fn(agent, size, type, callback, data,
                                           group_segment_size, queue);
@@ -113,167 +122,167 @@ hsa_status_t HSA_API
     hsa_soft_queue_create(hsa_region_t region, uint32_t size,
                           hsa_queue_type_t type, uint32_t features,
                           hsa_signal_t completion_signal, hsa_queue_t** queue) {
-  return HsaApiTable->hsa_soft_queue_create_fn(region, size, type, features,
+  return coreApiTable->hsa_soft_queue_create_fn(region, size, type, features,
                                                completion_signal, queue);
 hsa_status_t HSA_API hsa_queue_destroy(hsa_queue_t* queue) {
-  return HsaApiTable->hsa_queue_destroy_fn(queue);
+  return coreApiTable->hsa_queue_destroy_fn(queue);
 hsa_status_t HSA_API hsa_queue_inactivate(hsa_queue_t* queue) {
-  return HsaApiTable->hsa_queue_inactivate_fn(queue);
+  return coreApiTable->hsa_queue_inactivate_fn(queue);
 uint64_t HSA_API hsa_queue_load_read_index_acquire(const hsa_queue_t* queue) {
-  return HsaApiTable->hsa_queue_load_read_index_acquire_fn(queue);
+  return coreApiTable->hsa_queue_load_read_index_acquire_fn(queue);
 uint64_t HSA_API hsa_queue_load_read_index_relaxed(const hsa_queue_t* queue) {
-  return HsaApiTable->hsa_queue_load_read_index_relaxed_fn(queue);
+  return coreApiTable->hsa_queue_load_read_index_relaxed_fn(queue);
 uint64_t HSA_API hsa_queue_load_write_index_acquire(const hsa_queue_t* queue) {
-  return HsaApiTable->hsa_queue_load_write_index_acquire_fn(queue);
+  return coreApiTable->hsa_queue_load_write_index_acquire_fn(queue);
 uint64_t HSA_API hsa_queue_load_write_index_relaxed(const hsa_queue_t* queue) {
-  return HsaApiTable->hsa_queue_load_write_index_relaxed_fn(queue);
+  return coreApiTable->hsa_queue_load_write_index_relaxed_fn(queue);
 void HSA_API hsa_queue_store_write_index_relaxed(const hsa_queue_t* queue,
                                                  uint64_t value) {
-  return HsaApiTable->hsa_queue_store_write_index_relaxed_fn(queue, value);
+  return coreApiTable->hsa_queue_store_write_index_relaxed_fn(queue, value);
 void HSA_API hsa_queue_store_write_index_release(const hsa_queue_t* queue,
                                                  uint64_t value) {
-  return HsaApiTable->hsa_queue_store_write_index_release_fn(queue, value);
+  return coreApiTable->hsa_queue_store_write_index_release_fn(queue, value);
 uint64_t HSA_API hsa_queue_cas_write_index_acq_rel(const hsa_queue_t* queue,
                                                    uint64_t expected,
                                                    uint64_t value) {
-  return HsaApiTable->hsa_queue_cas_write_index_acq_rel_fn(queue, expected,
+  return coreApiTable->hsa_queue_cas_write_index_acq_rel_fn(queue, expected,
 uint64_t HSA_API hsa_queue_cas_write_index_acquire(const hsa_queue_t* queue,
                                                    uint64_t expected,
                                                    uint64_t value) {
-  return HsaApiTable->hsa_queue_cas_write_index_acquire_fn(queue, expected,
+  return coreApiTable->hsa_queue_cas_write_index_acquire_fn(queue, expected,
 uint64_t HSA_API hsa_queue_cas_write_index_relaxed(const hsa_queue_t* queue,
                                                    uint64_t expected,
                                                    uint64_t value) {
-  return HsaApiTable->hsa_queue_cas_write_index_relaxed_fn(queue, expected,
+  return coreApiTable->hsa_queue_cas_write_index_relaxed_fn(queue, expected,
 uint64_t HSA_API hsa_queue_cas_write_index_release(const hsa_queue_t* queue,
                                                    uint64_t expected,
                                                    uint64_t value) {
-  return HsaApiTable->hsa_queue_cas_write_index_release_fn(queue, expected,
+  return coreApiTable->hsa_queue_cas_write_index_release_fn(queue, expected,
 uint64_t HSA_API hsa_queue_add_write_index_acq_rel(const hsa_queue_t* queue,
                                                    uint64_t value) {
-  return HsaApiTable->hsa_queue_add_write_index_acq_rel_fn(queue, value);
+  return coreApiTable->hsa_queue_add_write_index_acq_rel_fn(queue, value);
 uint64_t HSA_API hsa_queue_add_write_index_acquire(const hsa_queue_t* queue,
                                                    uint64_t value) {
-  return HsaApiTable->hsa_queue_add_write_index_acquire_fn(queue, value);
+  return coreApiTable->hsa_queue_add_write_index_acquire_fn(queue, value);
 uint64_t HSA_API hsa_queue_add_write_index_relaxed(const hsa_queue_t* queue,
                                                    uint64_t value) {
-  return HsaApiTable->hsa_queue_add_write_index_relaxed_fn(queue, value);
+  return coreApiTable->hsa_queue_add_write_index_relaxed_fn(queue, value);
 uint64_t HSA_API hsa_queue_add_write_index_release(const hsa_queue_t* queue,
                                                    uint64_t value) {
-  return HsaApiTable->hsa_queue_add_write_index_release_fn(queue, value);
+  return coreApiTable->hsa_queue_add_write_index_release_fn(queue, value);
 void HSA_API hsa_queue_store_read_index_relaxed(const hsa_queue_t* queue,
                                                 uint64_t value) {
-  return HsaApiTable->hsa_queue_store_read_index_relaxed_fn(queue, value);
+  return coreApiTable->hsa_queue_store_read_index_relaxed_fn(queue, value);
 void HSA_API hsa_queue_store_read_index_release(const hsa_queue_t* queue,
                                                 uint64_t value) {
-  return HsaApiTable->hsa_queue_store_read_index_release_fn(queue, value);
+  return coreApiTable->hsa_queue_store_read_index_release_fn(queue, value);
 hsa_status_t HSA_API hsa_agent_iterate_regions(
     hsa_agent_t agent,
     hsa_status_t (*callback)(hsa_region_t region, void* data), void* data) {
-  return HsaApiTable->hsa_agent_iterate_regions_fn(agent, callback, data);
+  return coreApiTable->hsa_agent_iterate_regions_fn(agent, callback, data);
 hsa_status_t HSA_API hsa_region_get_info(hsa_region_t region,
                                          hsa_region_info_t attribute,
                                          void* value) {
-  return HsaApiTable->hsa_region_get_info_fn(region, attribute, value);
+  return coreApiTable->hsa_region_get_info_fn(region, attribute, value);
 hsa_status_t HSA_API hsa_memory_register(void* address, size_t size) {
-  return HsaApiTable->hsa_memory_register_fn(address, size);
+  return coreApiTable->hsa_memory_register_fn(address, size);
 hsa_status_t HSA_API hsa_memory_deregister(void* address, size_t size) {
-  return HsaApiTable->hsa_memory_deregister_fn(address, size);
+  return coreApiTable->hsa_memory_deregister_fn(address, size);
 hsa_status_t HSA_API
     hsa_memory_allocate(hsa_region_t region, size_t size, void** ptr) {
-  return HsaApiTable->hsa_memory_allocate_fn(region, size, ptr);
+  return coreApiTable->hsa_memory_allocate_fn(region, size, ptr);
 hsa_status_t HSA_API hsa_memory_free(void* ptr) {
-  return HsaApiTable->hsa_memory_free_fn(ptr);
+  return coreApiTable->hsa_memory_free_fn(ptr);
 hsa_status_t HSA_API hsa_memory_copy(void* dst, const void* src, size_t size) {
-  return HsaApiTable->hsa_memory_copy_fn(dst, src, size);
+  return coreApiTable->hsa_memory_copy_fn(dst, src, size);
 hsa_status_t HSA_API hsa_memory_assign_agent(void* ptr, hsa_agent_t agent,
                                              hsa_access_permission_t access) {
-  return HsaApiTable->hsa_memory_assign_agent_fn(ptr, agent, access);
+  return coreApiTable->hsa_memory_assign_agent_fn(ptr, agent, access);
 hsa_status_t HSA_API
     hsa_signal_create(hsa_signal_value_t initial_value, uint32_t num_consumers,
                       const hsa_agent_t* consumers, hsa_signal_t* signal) {
-  return HsaApiTable->hsa_signal_create_fn(initial_value, num_consumers,
+  return coreApiTable->hsa_signal_create_fn(initial_value, num_consumers,
                                            consumers, signal);
 hsa_status_t HSA_API hsa_signal_destroy(hsa_signal_t signal) {
-  return HsaApiTable->hsa_signal_destroy_fn(signal);
+  return coreApiTable->hsa_signal_destroy_fn(signal);
 hsa_signal_value_t HSA_API hsa_signal_load_relaxed(hsa_signal_t signal) {
-  return HsaApiTable->hsa_signal_load_relaxed_fn(signal);
+  return coreApiTable->hsa_signal_load_relaxed_fn(signal);
 hsa_signal_value_t HSA_API hsa_signal_load_acquire(hsa_signal_t signal) {
-  return HsaApiTable->hsa_signal_load_acquire_fn(signal);
+  return coreApiTable->hsa_signal_load_acquire_fn(signal);
 void HSA_API
     hsa_signal_store_relaxed(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_store_relaxed_fn(signal, value);
+  return coreApiTable->hsa_signal_store_relaxed_fn(signal, value);
 void HSA_API
     hsa_signal_store_release(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_store_release_fn(signal, value);
+  return coreApiTable->hsa_signal_store_release_fn(signal, value);
 hsa_signal_value_t HSA_API
@@ -282,7 +291,7 @@ hsa_signal_value_t HSA_API
                             hsa_signal_value_t compare_value,
                             uint64_t timeout_hint,
                             hsa_wait_state_t wait_expectancy_hint) {
-  return HsaApiTable->hsa_signal_wait_relaxed_fn(
+  return coreApiTable->hsa_signal_wait_relaxed_fn(
       signal, condition, compare_value, timeout_hint, wait_expectancy_hint);
@@ -292,166 +301,166 @@ hsa_signal_value_t HSA_API
                             hsa_signal_value_t compare_value,
                             uint64_t timeout_hint,
                             hsa_wait_state_t wait_expectancy_hint) {
-  return HsaApiTable->hsa_signal_wait_acquire_fn(
+  return coreApiTable->hsa_signal_wait_acquire_fn(
       signal, condition, compare_value, timeout_hint, wait_expectancy_hint);
 void HSA_API
     hsa_signal_and_relaxed(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_and_relaxed_fn(signal, value);
+  return coreApiTable->hsa_signal_and_relaxed_fn(signal, value);
 void HSA_API
     hsa_signal_and_acquire(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_and_acquire_fn(signal, value);
+  return coreApiTable->hsa_signal_and_acquire_fn(signal, value);
 void HSA_API
     hsa_signal_and_release(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_and_release_fn(signal, value);
+  return coreApiTable->hsa_signal_and_release_fn(signal, value);
 void HSA_API
     hsa_signal_and_acq_rel(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_and_acq_rel_fn(signal, value);
+  return coreApiTable->hsa_signal_and_acq_rel_fn(signal, value);
 void HSA_API
     hsa_signal_or_relaxed(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_or_relaxed_fn(signal, value);
+  return coreApiTable->hsa_signal_or_relaxed_fn(signal, value);
 void HSA_API
     hsa_signal_or_acquire(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_or_acquire_fn(signal, value);
+  return coreApiTable->hsa_signal_or_acquire_fn(signal, value);
 void HSA_API
     hsa_signal_or_release(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_or_release_fn(signal, value);
+  return coreApiTable->hsa_signal_or_release_fn(signal, value);
 void HSA_API
     hsa_signal_or_acq_rel(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_or_acq_rel_fn(signal, value);
+  return coreApiTable->hsa_signal_or_acq_rel_fn(signal, value);
 void HSA_API
     hsa_signal_xor_relaxed(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_xor_relaxed_fn(signal, value);
+  return coreApiTable->hsa_signal_xor_relaxed_fn(signal, value);
 void HSA_API
     hsa_signal_xor_acquire(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_xor_acquire_fn(signal, value);
+  return coreApiTable->hsa_signal_xor_acquire_fn(signal, value);
 void HSA_API
     hsa_signal_xor_release(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_xor_release_fn(signal, value);
+  return coreApiTable->hsa_signal_xor_release_fn(signal, value);
 void HSA_API
     hsa_signal_xor_acq_rel(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_xor_acq_rel_fn(signal, value);
+  return coreApiTable->hsa_signal_xor_acq_rel_fn(signal, value);
 void HSA_API
     hsa_signal_add_relaxed(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_add_relaxed_fn(signal, value);
+  return coreApiTable->hsa_signal_add_relaxed_fn(signal, value);
 void HSA_API
     hsa_signal_add_acquire(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_add_acquire_fn(signal, value);
+  return coreApiTable->hsa_signal_add_acquire_fn(signal, value);
 void HSA_API
     hsa_signal_add_release(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_add_release_fn(signal, value);
+  return coreApiTable->hsa_signal_add_release_fn(signal, value);
 void HSA_API
     hsa_signal_add_acq_rel(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_add_acq_rel_fn(signal, value);
+  return coreApiTable->hsa_signal_add_acq_rel_fn(signal, value);
 void HSA_API
     hsa_signal_subtract_relaxed(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_subtract_relaxed_fn(signal, value);
+  return coreApiTable->hsa_signal_subtract_relaxed_fn(signal, value);
 void HSA_API
     hsa_signal_subtract_acquire(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_subtract_acquire_fn(signal, value);
+  return coreApiTable->hsa_signal_subtract_acquire_fn(signal, value);
 void HSA_API
     hsa_signal_subtract_release(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_subtract_release_fn(signal, value);
+  return coreApiTable->hsa_signal_subtract_release_fn(signal, value);
 void HSA_API
     hsa_signal_subtract_acq_rel(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_subtract_acq_rel_fn(signal, value);
+  return coreApiTable->hsa_signal_subtract_acq_rel_fn(signal, value);
 hsa_signal_value_t HSA_API
     hsa_signal_exchange_relaxed(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_exchange_relaxed_fn(signal, value);
+  return coreApiTable->hsa_signal_exchange_relaxed_fn(signal, value);
 hsa_signal_value_t HSA_API
     hsa_signal_exchange_acquire(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_exchange_acquire_fn(signal, value);
+  return coreApiTable->hsa_signal_exchange_acquire_fn(signal, value);
 hsa_signal_value_t HSA_API
     hsa_signal_exchange_release(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_exchange_release_fn(signal, value);
+  return coreApiTable->hsa_signal_exchange_release_fn(signal, value);
 hsa_signal_value_t HSA_API
     hsa_signal_exchange_acq_rel(hsa_signal_t signal, hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_exchange_acq_rel_fn(signal, value);
+  return coreApiTable->hsa_signal_exchange_acq_rel_fn(signal, value);
 hsa_signal_value_t HSA_API hsa_signal_cas_relaxed(hsa_signal_t signal,
                                                   hsa_signal_value_t expected,
                                                   hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_cas_relaxed_fn(signal, expected, value);
+  return coreApiTable->hsa_signal_cas_relaxed_fn(signal, expected, value);
 hsa_signal_value_t HSA_API hsa_signal_cas_acquire(hsa_signal_t signal,
                                                   hsa_signal_value_t expected,
                                                   hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_cas_acquire_fn(signal, expected, value);
+  return coreApiTable->hsa_signal_cas_acquire_fn(signal, expected, value);
 hsa_signal_value_t HSA_API hsa_signal_cas_release(hsa_signal_t signal,
                                                   hsa_signal_value_t expected,
                                                   hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_cas_release_fn(signal, expected, value);
+  return coreApiTable->hsa_signal_cas_release_fn(signal, expected, value);
 hsa_signal_value_t HSA_API hsa_signal_cas_acq_rel(hsa_signal_t signal,
                                                   hsa_signal_value_t expected,
                                                   hsa_signal_value_t value) {
-  return HsaApiTable->hsa_signal_cas_acq_rel_fn(signal, expected, value);
+  return coreApiTable->hsa_signal_cas_acq_rel_fn(signal, expected, value);
 hsa_status_t hsa_isa_from_name(const char* name, hsa_isa_t* isa) {
-  return HsaApiTable->hsa_isa_from_name_fn(name, isa);
+  return coreApiTable->hsa_isa_from_name_fn(name, isa);
 hsa_status_t HSA_API hsa_isa_get_info(hsa_isa_t isa, hsa_isa_info_t attribute,
                                       uint32_t index, void* value) {
-  return HsaApiTable->hsa_isa_get_info_fn(isa, attribute, index, value);
+  return coreApiTable->hsa_isa_get_info_fn(isa, attribute, index, value);
 hsa_status_t hsa_isa_compatible(hsa_isa_t code_object_isa, hsa_isa_t agent_isa,
                                 bool* result) {
-  return HsaApiTable->hsa_isa_compatible_fn(code_object_isa, agent_isa, result);
+  return coreApiTable->hsa_isa_compatible_fn(code_object_isa, agent_isa, result);
 hsa_status_t HSA_API hsa_code_object_serialize(
@@ -460,7 +469,7 @@ hsa_status_t HSA_API hsa_code_object_serialize(
                                    void** address),
     hsa_callback_data_t callback_data, const char* options,
     void** serialized_code_object, size_t* serialized_code_object_size) {
-  return HsaApiTable->hsa_code_object_serialize_fn(
+  return coreApiTable->hsa_code_object_serialize_fn(
       code_object, alloc_callback, callback_data, options,
       serialized_code_object, serialized_code_object_size);
@@ -470,33 +479,33 @@ hsa_status_t HSA_API
                                 size_t serialized_code_object_size,
                                 const char* options,
                                 hsa_code_object_t* code_object) {
-  return HsaApiTable->hsa_code_object_deserialize_fn(
+  return coreApiTable->hsa_code_object_deserialize_fn(
       serialized_code_object, serialized_code_object_size, options,
 hsa_status_t HSA_API hsa_code_object_destroy(hsa_code_object_t code_object) {
-  return HsaApiTable->hsa_code_object_destroy_fn(code_object);
+  return coreApiTable->hsa_code_object_destroy_fn(code_object);
 hsa_status_t HSA_API hsa_code_object_get_info(hsa_code_object_t code_object,
                                               hsa_code_object_info_t attribute,
                                               void* value) {
-  return HsaApiTable->hsa_code_object_get_info_fn(code_object, attribute,
+  return coreApiTable->hsa_code_object_get_info_fn(code_object, attribute,
 hsa_status_t HSA_API hsa_code_object_get_symbol(hsa_code_object_t code_object,
                                                 const char* symbol_name,
                                                 hsa_code_symbol_t* symbol) {
-  return HsaApiTable->hsa_code_object_get_symbol_fn(code_object, symbol_name,
+  return coreApiTable->hsa_code_object_get_symbol_fn(code_object, symbol_name,
 hsa_status_t HSA_API hsa_code_symbol_get_info(hsa_code_symbol_t code_symbol,
                                               hsa_code_symbol_info_t attribute,
                                               void* value) {
-  return HsaApiTable->hsa_code_symbol_get_info_fn(code_symbol, attribute,
+  return coreApiTable->hsa_code_symbol_get_info_fn(code_symbol, attribute,
@@ -505,7 +514,7 @@ hsa_status_t HSA_API hsa_code_object_iterate_symbols(
     hsa_status_t (*callback)(hsa_code_object_t code_object,
                              hsa_code_symbol_t symbol, void* data),
     void* data) {
-  return HsaApiTable->hsa_code_object_iterate_symbols_fn(code_object, callback,
+  return coreApiTable->hsa_code_object_iterate_symbols_fn(code_object, callback,
@@ -513,12 +522,12 @@ hsa_status_t HSA_API
     hsa_executable_create(hsa_profile_t profile,
                           hsa_executable_state_t executable_state,
                           const char* options, hsa_executable_t* executable) {
-  return HsaApiTable->hsa_executable_create_fn(profile, executable_state,
+  return coreApiTable->hsa_executable_create_fn(profile, executable_state,
                                                options, executable);
 hsa_status_t HSA_API hsa_executable_destroy(hsa_executable_t executable) {
-  return HsaApiTable->hsa_executable_destroy_fn(executable);
+  return coreApiTable->hsa_executable_destroy_fn(executable);
 hsa_status_t HSA_API
@@ -526,26 +535,26 @@ hsa_status_t HSA_API
                                     hsa_agent_t agent,
                                     hsa_code_object_t code_object,
                                     const char* options) {
-  return HsaApiTable->hsa_executable_load_code_object_fn(executable, agent,
+  return coreApiTable->hsa_executable_load_code_object_fn(executable, agent,
                                                          code_object, options);
 hsa_status_t HSA_API
     hsa_executable_freeze(hsa_executable_t executable, const char* options) {
-  return HsaApiTable->hsa_executable_freeze_fn(executable, options);
+  return coreApiTable->hsa_executable_freeze_fn(executable, options);
 hsa_status_t HSA_API hsa_executable_get_info(hsa_executable_t executable,
                                              hsa_executable_info_t attribute,
                                              void* value) {
-  return HsaApiTable->hsa_executable_get_info_fn(executable, attribute, value);
+  return coreApiTable->hsa_executable_get_info_fn(executable, attribute, value);
 hsa_status_t HSA_API
     hsa_executable_global_variable_define(hsa_executable_t executable,
                                           const char* variable_name,
                                           void* address) {
-  return HsaApiTable->hsa_executable_global_variable_define_fn(
+  return coreApiTable->hsa_executable_global_variable_define_fn(
       executable, variable_name, address);
@@ -554,7 +563,7 @@ hsa_status_t HSA_API
                                                 hsa_agent_t agent,
                                                 const char* variable_name,
                                                 void* address) {
-  return HsaApiTable->hsa_executable_agent_global_variable_define_fn(
+  return coreApiTable->hsa_executable_agent_global_variable_define_fn(
       executable, agent, variable_name, address);
@@ -563,13 +572,13 @@ hsa_status_t HSA_API
                                             hsa_agent_t agent,
                                             const char* variable_name,
                                             void* address) {
-  return HsaApiTable->hsa_executable_readonly_variable_define_fn(
+  return coreApiTable->hsa_executable_readonly_variable_define_fn(
       executable, agent, variable_name, address);
 hsa_status_t HSA_API
     hsa_executable_validate(hsa_executable_t executable, uint32_t* result) {
-  return HsaApiTable->hsa_executable_validate_fn(executable, result);
+  return coreApiTable->hsa_executable_validate_fn(executable, result);
 hsa_status_t HSA_API
@@ -577,7 +586,7 @@ hsa_status_t HSA_API
                               const char* module_name, const char* symbol_name,
                               hsa_agent_t agent, int32_t call_convention,
                               hsa_executable_symbol_t* symbol) {
-  return HsaApiTable->hsa_executable_get_symbol_fn(
+  return coreApiTable->hsa_executable_get_symbol_fn(
       executable, module_name, symbol_name, agent, call_convention, symbol);
@@ -585,7 +594,7 @@ hsa_status_t HSA_API
     hsa_executable_symbol_get_info(hsa_executable_symbol_t executable_symbol,
                                    hsa_executable_symbol_info_t attribute,
                                    void* value) {
-  return HsaApiTable->hsa_executable_symbol_get_info_fn(executable_symbol,
+  return coreApiTable->hsa_executable_symbol_get_info_fn(executable_symbol,
                                                         attribute, value);
@@ -594,11 +603,227 @@ hsa_status_t HSA_API hsa_executable_iterate_symbols(
     hsa_status_t (*callback)(hsa_executable_t executable,
                              hsa_executable_symbol_t symbol, void* data),
     void* data) {
-  return HsaApiTable->hsa_executable_iterate_symbols_fn(executable, callback,
+  return coreApiTable->hsa_executable_iterate_symbols_fn(executable, callback,
 hsa_status_t HSA_API
     hsa_status_string(hsa_status_t status, const char** status_string) {
-  return HsaApiTable->hsa_status_string_fn(status, status_string);
+  return coreApiTable->hsa_status_string_fn(status, status_string);
+/*
+ * The following functions are bundled as AMD extension APIs.
+ */
+// Pass through stub functions
+hsa_status_t HSA_API hsa_amd_coherency_get_type(hsa_agent_t agent,
+                                                hsa_amd_coherency_type_t* type) {
+  return amdExtTable->hsa_amd_coherency_get_type_fn(agent, type);
+// Pass through stub functions
+hsa_status_t HSA_API hsa_amd_coherency_set_type(hsa_agent_t agent,
+                                                hsa_amd_coherency_type_t type) {
+  return amdExtTable->hsa_amd_coherency_set_type_fn(agent, type);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable) {
+  return amdExtTable->hsa_amd_profiling_set_profiler_enabled_fn(
+                                     queue, enable);
+hsa_status_t HSA_API
+  hsa_amd_profiling_async_copy_enable(bool enable) {
+    return amdExtTable->hsa_amd_profiling_async_copy_enable_fn(enable);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time(
+    hsa_agent_t agent, hsa_signal_t signal,
+    hsa_amd_profiling_dispatch_time_t* time) {
+  return amdExtTable->hsa_amd_profiling_get_dispatch_time_fn(
+                                     agent, signal, time); 
+hsa_status_t HSA_API
+  hsa_amd_profiling_get_async_copy_time(
+    hsa_signal_t hsa_signal, hsa_amd_profiling_async_copy_time_t* time) {
+      return amdExtTable->hsa_amd_profiling_get_async_copy_time_fn(hsa_signal, time);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_profiling_convert_tick_to_system_domain(hsa_agent_t agent,
+                                                    uint64_t agent_tick,
+                                                    uint64_t* system_tick) {
+  return amdExtTable->hsa_amd_profiling_convert_tick_to_system_domain_fn(
+                                     agent, agent_tick, system_tick);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_signal_async_handler(hsa_signal_t signal,
+                                 hsa_signal_condition_t cond,
+                                 hsa_signal_value_t value,
+                                 hsa_amd_signal_handler handler, void* arg) {
+  return amdExtTable->hsa_amd_signal_async_handler_fn(
+                                     signal, cond, value, handler, arg);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_async_function(void (*callback)(void* arg), void* arg) {
+  return amdExtTable->hsa_amd_async_function_fn(callback, arg);
+// Mirrors Amd Extension Apis
+uint32_t HSA_API
+    hsa_amd_signal_wait_any(uint32_t signal_count, hsa_signal_t* signals,
+                            hsa_signal_condition_t* conds,
+                            hsa_signal_value_t* values, uint64_t timeout_hint,
+                            hsa_wait_state_t wait_hint,
+                            hsa_signal_value_t* satisfying_value) {
+  return amdExtTable->hsa_amd_signal_wait_any_fn(
+                                     signal_count, signals,
+                                     conds, values, timeout_hint,
+                                     wait_hint, satisfying_value);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
+                                               uint32_t num_cu_mask_count,
+                                               const uint32_t* cu_mask) {
+  return amdExtTable->hsa_amd_queue_cu_set_mask_fn(
+                                     queue, num_cu_mask_count, cu_mask);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool,
+                                 hsa_amd_memory_pool_info_t attribute,
+                                 void* value) {
+  return amdExtTable->hsa_amd_memory_pool_get_info_fn(
+                                     memory_pool, attribute, value);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools(
+    hsa_agent_t agent,
+    hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void* data),
+    void* data) {
+  return amdExtTable->hsa_amd_agent_iterate_memory_pools_fn(
+                                     agent, callback, data);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, size_t size,
+                                 uint32_t flags, void** ptr) {
+  return amdExtTable->hsa_amd_memory_pool_allocate_fn(
+                                     memory_pool, size, flags, ptr);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr) {
+  return amdExtTable->hsa_amd_memory_pool_free_fn(ptr);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent, const void* src,
+                              hsa_agent_t src_agent, size_t size,
+                              uint32_t num_dep_signals,
+                              const hsa_signal_t* dep_signals,
+                              hsa_signal_t completion_signal) {
+  return amdExtTable->hsa_amd_memory_async_copy_fn(
+                                     dst, dst_agent, src, src_agent, size,
+                                     num_dep_signals, dep_signals, completion_signal);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info(
+    hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
+    hsa_amd_agent_memory_pool_info_t attribute, void* value) {
+  return amdExtTable->hsa_amd_agent_memory_pool_get_info_fn(
+                                     agent, memory_pool, attribute, value);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_agents_allow_access(uint32_t num_agents, const hsa_agent_t* agents,
+                                const uint32_t* flags, const void* ptr) {
+  return amdExtTable->hsa_amd_agents_allow_access_fn(
+                                     num_agents, agents, flags, ptr);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_memory_pool_can_migrate(hsa_amd_memory_pool_t src_memory_pool,
+                                    hsa_amd_memory_pool_t dst_memory_pool,
+                                    bool* result) {
+  return amdExtTable->hsa_amd_memory_pool_can_migrate_fn(
+                                     src_memory_pool, dst_memory_pool, result);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_memory_migrate(const void* ptr,
+                                            hsa_amd_memory_pool_t memory_pool,
+                                            uint32_t flags) {
+  return amdExtTable->hsa_amd_memory_migrate_fn(
+                                     ptr, memory_pool, flags);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size,
+                                         hsa_agent_t* agents, int num_agent,
+                                         void** agent_ptr) {
+  return amdExtTable->hsa_amd_memory_lock_fn(
+                                     host_ptr, size, agents, num_agent, agent_ptr);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr) {
+  return amdExtTable->hsa_amd_memory_unlock_fn(host_ptr);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_memory_fill(void* ptr, uint32_t value, size_t count) {
+  return amdExtTable->hsa_amd_memory_fill_fn(ptr, value, count);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_interop_map_buffer(uint32_t num_agents,   
+                                        hsa_agent_t* agents,       
+                                        int interop_handle,    
+                                        uint32_t flags,        
+                                        size_t* size,          
+                                        void** ptr,            
+                                        size_t* metadata_size, 
+                                        const void** metadata) {
+  return amdExtTable->hsa_amd_interop_map_buffer_fn(
+                                     num_agents, agents, interop_handle,    
+                                     flags, size, ptr, metadata_size, metadata);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_interop_unmap_buffer(void* ptr) {
+  return amdExtTable->hsa_amd_interop_unmap_buffer_fn(ptr);
+// Use the function pointer from local instance Image Extension
+hsa_status_t HSA_API hsa_amd_image_create(
+  hsa_agent_t agent,
+  const hsa_ext_image_descriptor_t *image_descriptor,
+  const hsa_amd_image_descriptor_t *image_layout,
+  const void *image_data,
+  hsa_access_permission_t access_permission,
+  hsa_ext_image_t *image) {
+  return amdExtTable->hsa_amd_image_create_fn(agent, image_descriptor,
+                          image_layout, image_data, access_permission, image);
diff --git a/src/core/common/shared.h b/src/core/common/shared.h
index 76720bd79..fdf89b625 100644
--- a/src/core/common/shared.h
+++ b/src/core/common/shared.h
@@ -82,6 +82,8 @@ class Shared : public BaseShared {
     assert(shared_object_ != NULL && "Failed on allocating shared_object_");
+    // Zero the raw storage before placement-new, but only when allocation
+    // succeeded: the assert above is compiled out under NDEBUG, so a NULL
+    // pointer can reach this line.
+    if (shared_object_ != NULL) memset(shared_object_, 0, sizeof(T));
     if (shared_object_ != NULL) new (shared_object_) T;
diff --git a/src/core/ b/src/core/
index 4e04a8e2a..130dd0875 100644
--- a/src/core/
+++ b/src/core/
@@ -107,6 +107,8 @@ global:
+	hsa_amd_profiling_async_copy_enable;
+	hsa_amd_profiling_get_async_copy_time;
diff --git a/src/core/inc/agent.h b/src/core/inc/agent.h
index abd7acf84..41867eb3c 100644
--- a/src/core/inc/agent.h
+++ b/src/core/inc/agent.h
@@ -107,7 +107,9 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
   // @param [in] type CPU or GPU or other.
   explicit Agent(uint32_t node_id, DeviceType type)
-      : node_id_(node_id), device_type_(uint32_t(type)) {
+      : node_id_(node_id),
+        device_type_(uint32_t(type)),
+        profiling_enabled_(false) {
     public_handle_ = Convert(this);
@@ -115,7 +117,7 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
   // @param [in] type CPU or GPU or other.
   explicit Agent(uint32_t node_id, uint32_t type)
-      : node_id_(node_id), device_type_(type) {
+      : node_id_(node_id), device_type_(type), profiling_enabled_(false) {
     public_handle_ = Convert(this);
@@ -240,6 +242,19 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
   // @brief Returns node id associated with this agent.
   __forceinline uint32_t node_id() const { return node_id_; }
+  // @brief Getter for profiling_enabled_.
+  __forceinline bool profiling_enabled() const { return profiling_enabled_; }
+  // @brief Setter for profiling_enabled_.
+  //
+  // Forwards the request to EnableDmaProfiling() and records the new state
+  // only if that call reports HSA_STATUS_SUCCESS.
+  //
+  // @param [in] enable True to enable profiling, false to disable it.
+  //
+  // @retval The status returned by EnableDmaProfiling(). On any status other
+  // than HSA_STATUS_SUCCESS, profiling_enabled_ keeps its previous value.
+  virtual hsa_status_t profiling_enabled(bool enable) {
+    const hsa_status_t stat = EnableDmaProfiling(enable);
+    if (HSA_STATUS_SUCCESS == stat) {
+      profiling_enabled_ = enable;
+    }
+    return stat;
+  }
   // Intention here is to have a polymorphic update procedure for public_handle_
   // which is callable on any Agent* but only from some class dervied from
@@ -254,6 +269,17 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
     public_handle_ = handle;
+  // @brief Enable profiling of the asynchronous DMA copy. The timestamp
+  // of each copy request will be stored in the completion signal structure.
+  //
+  // The base implementation performs no work and reports success; it is
+  // virtual so that derived agents can supply their own behavior.
+  //
+  // @param enable True to enable profiling. False to disable profiling.
+  //
+  // @retval HSA_STATUS_SUCCESS The profiling is enabled and the
+  // timing of subsequent async copy will be measured.
+  virtual hsa_status_t EnableDmaProfiling(bool enable) {
+    // Without an explicit return, control would fall off the end of a
+    // non-void function, which is undefined behavior.
+    return HSA_STATUS_SUCCESS;
+  }
   hsa_agent_t public_handle_;
@@ -262,6 +288,8 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
   const uint32_t device_type_;
+  bool profiling_enabled_;
   // Forbid copying and moving of this object
diff --git a/src/core/inc/amd_aql_queue.h b/src/core/inc/amd_aql_queue.h
index 25cb252f8..80c462977 100644
--- a/src/core/inc/amd_aql_queue.h
+++ b/src/core/inc/amd_aql_queue.h
@@ -358,6 +358,10 @@ class AqlQueue : public core::Queue, public core::Signal {
   static bool DynamicScratchHandler(hsa_signal_value_t error_code, void* arg);
+  /// @brief Define the Scratch Buffer Descriptor and related parameters
+  /// that enable kernels to access scratch memory.
+  void InitScratchSRD();
   // AQL packet ring buffer
   void* ring_buf_;
@@ -380,8 +384,6 @@ class AqlQueue : public core::Queue, public core::Signal {
   // Handle of agent, which queue is attached to
   GpuAgent* agent_;
-  hsa_profile_t agent_profile_;
   uint32_t queue_full_workaround_;
   // Handle of scratch memory descriptor
diff --git a/src/core/inc/amd_blit_kernel.h b/src/core/inc/amd_blit_kernel.h
index a7b0a58f8..f6f7b27e2 100644
--- a/src/core/inc/amd_blit_kernel.h
+++ b/src/core/inc/amd_blit_kernel.h
@@ -43,6 +43,7 @@
+#include <map>
 #include <stdint.h>
 #include "core/inc/blit.h"
@@ -66,8 +67,10 @@ class BlitKernel : public core::Blit {
   /// @note: The call will block until all AQL packets have been executed.
+  /// @param agent Agent passed to Initialize.
+  ///
   /// @return hsa_status_t
-  virtual hsa_status_t Destroy() override;
+  virtual hsa_status_t Destroy(const core::Agent& agent) override;
   /// @brief Submit an AQL packet to perform vector copy. The call is blocking
   /// until the command execution is finished.
@@ -102,19 +105,40 @@ class BlitKernel : public core::Blit {
   virtual hsa_status_t SubmitLinearFillCommand(void* ptr, uint32_t value,
                                                size_t count) override;
+  virtual hsa_status_t EnableProfiling(bool enable) override;
   union KernelArgs {
-    struct __ALIGNED__(16) KernelCopyArgs {
-      const void* src;
-      void* dst;
-      uint64_t size;
-      uint32_t use_vector;
-    } copy;
-    struct __ALIGNED__(16) KernelFillArgs {
-      void* ptr;
-      uint64_t num;
-      uint32_t value;
+    struct __ALIGNED__(16) {
+      uint64_t phase1_src_start;
+      uint64_t phase1_dst_start;
+      uint64_t phase2_src_start;
+      uint64_t phase2_dst_start;
+      uint64_t phase3_src_start;
+      uint64_t phase3_dst_start;
+      uint64_t phase4_src_start;
+      uint64_t phase4_dst_start;
+      uint64_t phase4_src_end;
+      uint64_t phase4_dst_end;
+      uint32_t num_workitems;
+    } copy_aligned;
+    struct __ALIGNED__(16) {
+      uint64_t phase1_src_start;
+      uint64_t phase1_dst_start;
+      uint64_t phase2_src_start;
+      uint64_t phase2_dst_start;
+      uint64_t phase2_src_end;
+      uint64_t phase2_dst_end;
+      uint32_t num_workitems;
+    } copy_misaligned;
+    struct __ALIGNED__(16) {
+      uint64_t phase1_dst_start;
+      uint64_t phase2_dst_start;
+      uint64_t phase2_dst_end;
+      uint32_t fill_value;
+      uint32_t num_workitems;
     } fill;
@@ -136,14 +160,19 @@ class BlitKernel : public core::Blit {
   KernelArgs* ObtainAsyncKernelCopyArg();
-  /// Handles to the vector copy kernel.
-  uint64_t copy_code_handle_;
+  /// AQL code object and size for each kernel.
+  enum class KernelType {
+    CopyAligned,
+    CopyMisaligned,
+    Fill,
+  };
-  /// Handles to the vector copy aligned kernel.
-  uint64_t copy_aligned_code_handle_;
+  /// Code object and size for a single kernel; kernels_ maps each KernelType
+  /// to one of these.
+  struct KernelCode {
+    /// Buffer holding the kernel's code object.
+    /// NOTE(review): allocation and ownership are not visible here - confirm
+    /// against the implementation file.
+    void* code_buf_;
+    /// Size of code_buf_ (presumably in bytes - TODO confirm).
+    size_t code_buf_size_;
+  };
-  /// Handles to the fill memory kernel.
-  uint64_t fill_code_handle_;
+  std::map<KernelType, KernelCode> kernels_;
   /// AQL queue for submitting the vector copy kernel.
   hsa_queue_t* queue_;
@@ -163,12 +192,8 @@ class BlitKernel : public core::Blit {
   /// Lock to synchronize access to kernarg_ and completion_signal_
   std::mutex lock_;
-  /// Pointer to memory containing the ISA and argument buffer.
-  void* code_arg_buffer_;
-  static const size_t kMaxCopyCount;
-  static const size_t kMaxFillCount;
-  static const uint32_t kGroupSize;
+  /// Number of CUs on the underlying agent.
+  int num_cus_;
 }  // namespace amd
diff --git a/src/core/inc/amd_blit_kernel_kv.h b/src/core/inc/amd_blit_kernel_kv.h
deleted file mode 100644
index a8e235ea7..000000000
--- a/src/core/inc/amd_blit_kernel_kv.h
+++ /dev/null
@@ -1,479 +0,0 @@
-// The University of Illinois/NCSA
-// Open Source License (NCSA)
-// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
-// Developed by:
-//                 AMD Research and AMD HSA Software Development
-//                 Advanced Micro Devices, Inc.
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to
-// deal with the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-//  - Redistributions of source code must retain the above copyright notice,
-//    this list of conditions and the following disclaimers.
-//  - Redistributions in binary form must reproduce the above copyright
-//    notice, this list of conditions and the following disclaimers in
-//    the documentation and/or other materials provided with the distribution.
-//  - Neither the names of Advanced Micro Devices, Inc,
-//    nor the names of its contributors may be used to endorse or promote
-//    products derived from this Software without specific prior written
-//    permission.
-#include <stddef.h>
-/*****HSAIL code of the ISA in ::kVectorCopyRawKv.
-module &m:1:0:$full:$large:$default;
-prog kernel &__vector_copy_kernel(
-  kernarg_u64 %src,
-  kernarg_u64 %dst,
-  kernarg_u64 %size)
-  @__vector_copy_kernel_entry:
-  // BB#0:                                // %entry
-  workitemabsid_u32	$s0, 0;
-  cvt_u64_u32	$d0, $s0;
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%size];
-  cmp_ge_b1_u64	$c0, $d0, $d1;
-  cbr_b1	$c0, @BB0_2;
-  // BB#1:                                // %if.end
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%src];
-  ld_kernarg_align(8)_width(all)_u64	$d2, [%dst];
-  add_u64	$d2, $d2, $d0;
-  add_u64	$d0, $d1, $d0;
-  ld_global_u8	$s0, [$d0];
-  st_global_u8	$s0, [$d2];
-  @BB0_2:
-  // %return
-  ret;
-static char kVectorCopyRawKv[] = {
-    127,  69,   76,   70,   2,   1,   1,   64,   0,    0,    0,   0,    0,
-    0,    0,    0,    1,    0,   -32, 0,   1,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   64,  0,    0,    0,    0,   0,    0,
-    0,    -104, 3,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    64,   0,    56,   0,    1,   0,   64,  0,    6,    0,    5,   0,    3,
-    0,    0,    96,   6,    0,   0,   0,   0,    1,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   112, 1,   0,    0,    0,    0,   0,    0,
-    112,  1,    0,    0,    0,   0,   0,   0,    0,    1,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    1,    0,   0,    0,
-    0,    0,    0,    0,    1,   0,   0,   0,    0,    0,    0,   0,    0,
-    1,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   65,  0,   -116, 0,    -112, 0,   0,    0,
-    11,   0,    10,   0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    24,   0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    11,   0,    5,   0,   5,   0,    0,    0,    9,   0,    0,
-    0,    0,    0,    0,    0,   3,   0,   0,    6,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   1,   5,   0,    -64,  127,  0,   -116, -65,
-    0,    -1,   -128, -109, 0,   0,   16,  0,    0,    8,    0,   -109, 0,
-    0,    0,    74,   4,    7,   64,  -64, -128, 2,    2,    126, 127,  0,
-    -116, -65,  0,    0,    -56, 125, 106, 36,   -128, -66,  15,  0,    -120,
-    -65,  0,    7,    -126, -64, 127, 0,   -116, -65,  4,    0,   2,    74,
-    5,    2,    4,    126,  2,   106, 80,  -46,  2,    1,    -87, 1,    0,
-    0,    32,   -36,  1,    0,   0,   1,   6,    0,    6,    74,  7,    2,
-    4,    126,  4,    106,  80,  -46, 2,   1,    -87,  1,    112, 0,    -116,
-    -65,  0,    0,    96,   -36, 3,   1,   0,    0,    0,    0,   -127, -65,
-    3,    0,    0,    0,    8,   0,   0,   0,    1,    0,    0,   0,    65,
-    77,   68,   0,    1,    0,   0,   0,   0,    0,    0,    0,   3,    0,
-    0,    0,    12,   0,    0,   0,   2,   0,    0,    0,    65,  77,   68,
-    0,    1,    0,    0,    0,   0,   0,   0,    0,    1,    1,   1,    0,
-    3,    0,    0,    0,    28,  0,   0,   0,    3,    0,    0,   0,    65,
-    77,   68,   0,    4,    0,   7,   0,   7,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   65,  77,   68,   0,    65,  77,   68,
-    71,   80,   85,   0,    0,   3,   0,   0,    0,    40,   0,   0,    0,
-    4,    0,    0,    0,    65,  77,  68,  0,    26,   0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   65,   77,   68,   32,  72,   83,
-    65,   32,   82,   117,  110, 116, 105, 109,  101,  32,   70,  105,  110,
-    97,   108,  105,  122,  101, 114, 0,   0,    0,    38,   95,  95,   118,
-    101,  99,   116,  111,  114, 95,  99,  111,  112,  121,  95,  107,  101,
-    114,  110,  101,  108,  0,   95,  95,  104,  115,  97,   95,  115,  101,
-    99,   116,  105,  111,  110, 46,  104, 115,  97,   116,  101, 120,  116,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    26,   0,   1,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    22,   0,   0,   0,   3,    0,    1,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    46,   104,  115, 97,  116, 101,  120,  116,  0,   46,   110,
-    111,  116,  101,  0,    46,  115, 116, 114,  116,  97,   98,  0,    46,
-    115,  121,  109,  116,  97,  98,  0,   46,   115,  104,  115, 116,  114,
-    116,  97,   98,   0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    1,    0,   0,    0,
-    1,    0,    0,    0,    7,   0,   -64, 0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    1,    0,    0,   0,    0,
-    0,    0,    112,  1,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   1,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    10,   0,    0,   0,    7,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   112, 2,    0,    0,    0,   0,    0,
-    0,    -104, 0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    4,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   16,   0,    0,    0,   3,    0,
-    0,    0,    32,   0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   8,   3,   0,    0,    0,    0,   0,    0,
-    44,   0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    1,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   24,  0,    0,    0,    2,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    56,  3,   0,   0,    0,    0,    0,   0,    48,
-    0,    0,    0,    0,    0,   0,   0,   3,    0,    0,    0,   0,    0,
-    0,    0,    8,    0,    0,   0,   0,   0,    0,    0,    24,  0,    0,
-    0,    0,    0,    0,    0,   32,  0,   0,    0,    3,    0,   0,    0,
-    32,   0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    104,  3,   0,   0,   0,    0,    0,    0,   42,   0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    1,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,
-extern char* const kVectorCopyKvObject = &kVectorCopyRawKv[0];
-extern size_t const kVectorCopyKvObjectSize = sizeof(kVectorCopyRawKv);
-/*****HSAIL code of the ISA in ::kVectorCopyAlignedRawKv.
-module &m:1:0:$full:$large:$default;
-extension "amd:gcn";
-prog kernel &__copy_buffer_aligned_kernel(
-  kernarg_u64 %src,
-  kernarg_u64 %dst,
-  kernarg_u64 %size,
-  kernarg_u32 %use_vector)
-  @__copy_buffer_aligned_kernel_entry:
-  // BB#0:                                // %entry
-  workitemabsid_u32	$s0, 0;
-  cvt_u64_u32	$d0, $s0;
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%size];
-  cmp_ge_b1_u64	$c0, $d0, $d1;
-  cbr_b1	$c0, @LBB0_4;
-  // BB#1:                                // %if.end
-  ld_kernarg_align(8)_width(all)_u64	$d2, [%dst];
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%src];
-  ld_kernarg_align(4)_width(all)_u32	$s0, [%use_vector];
-  cmp_ne_b1_s32	$c0, $s0, 1;
-  cbr_b1	$c0, @LBB0_3;
-  // BB#2:                                // %if.then2
-  shl_u64	$d0, $d0, 4;
-  add_u64	$d2, $d2, $d0;
-  add_u64	$d0, $d1, $d0;
-  ld_v4_global_align(16)_const_u32	($s0, $s1, $s2, $s3), [$d0];
-  st_v4_global_align(16)_u32	($s0, $s1, $s2, $s3), [$d2];
-  br	@LBB0_4;
-  @LBB0_3:
-  // %if.else
-  shl_u64	$d0, $d0, 2;
-  add_u64	$d2, $d2, $d0;
-  add_u64	$d0, $d1, $d0;
-  ld_global_align(4)_const_u32	$s0, [$d0];
-  st_global_align(4)_u32	$s0, [$d2];
-  @LBB0_4:
-  // %if.end6
-  ret;
-static char kVectorCopyAlignedRawKv[] = {
-    127,  69,   76,   70,   2,   1,   1,    64,   0,    0,    0,    0,    0,
-    0,    0,    0,    1,    0,   -32, 0,    1,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   64,   0,    0,    0,    0,    0,    0,
-    0,    -8,   3,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    64,   0,    56,   0,    1,   0,   64,   0,    6,    0,    5,    0,    3,
-    0,    0,    96,   6,    0,   0,   0,    0,    1,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   -76, 1,    0,    0,    0,    0,    0,    0,
-    -76,  1,    0,    0,    0,   0,   0,    0,    0,    1,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    1,    0,    0,    0,
-    1,    0,    0,    0,    1,   0,   0,    0,    0,    0,    0,    0,    0,
-    1,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   65,  0,    -84,  0,    -112, 0,    0,    0,
-    11,   0,    10,   0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    32,   0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    11,   0,    7,   0,   7,    0,    0,    0,    9,    0,    0,
-    0,    0,    0,    0,    0,   4,   4,    4,    6,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   1,   5,    0,    -64,  127,  0,    -116, -65,
-    0,    -1,   -128, -109, 0,   0,   16,   0,    0,    8,    0,    -109, 0,
-    0,    0,    74,   4,    7,   64,  -64,  -128, 2,    2,    126,  127,  0,
-    -116, -65,  0,    0,    -56, 125, 106,  36,   -128, -66,  32,   0,    -120,
-    -65,  6,    7,    1,    -64, 0,   7,    -126, -64,  127,  0,    -116, -65,
-    2,    -127, 0,    -65,  14,  0,   -124, -65,  0,    0,    -62,  -46,  0,
-    9,    1,    0,    4,    0,   4,   74,   5,    2,    6,    126,  3,    3,
-    6,    80,   0,    0,    56,  -36, 2,    0,    0,    2,    6,    0,    0,
-    74,   7,    2,    12,   126, 6,   3,    2,    80,   112,  0,    -116, -65,
-    0,    0,    120,  -36,  0,   2,   0,    0,    13,   0,    -126, -65,  0,
-    0,    -62,  -46,  0,    5,   1,   0,    4,    0,    4,    74,   5,    2,
-    6,    126,  3,    3,    6,   80,  0,    0,    48,   -36,  2,    0,    0,
-    2,    6,    0,    0,    74,  7,   2,    6,    126,  3,    3,    2,    80,
-    112,  0,    -116, -65,  0,   0,   112,  -36,  0,    2,    0,    0,    0,
-    0,    -127, -65,  0,    0,   0,   0,    4,    0,    0,    0,    8,    0,
-    0,    0,    1,    0,    0,   0,   65,   77,   68,   0,    1,    0,    0,
-    0,    0,    0,    0,    0,   4,   0,    0,    0,    12,   0,    0,    0,
-    2,    0,    0,    0,    65,  77,  68,   0,    1,    0,    0,    0,    0,
-    0,    0,    0,    1,    1,   1,   0,    4,    0,    0,    0,    25,   0,
-    0,    0,    5,    0,    0,   0,   65,   77,   68,   0,    22,   0,    45,
-    104,  115,  97,   95,   99,  97,  108,  108,  95,   99,   111,  110,  118,
-    101,  110,  116,  105,  111, 110, 61,   0,    0,    0,    0,    0,    4,
-    0,    0,    0,    30,   0,   0,   0,    3,    0,    0,    0,    65,   77,
-    68,   0,    4,    0,    7,   0,   7,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   65,  77,   68,   0,    65,   77,   68,   71,
-    80,   85,   0,    0,    0,   0,   0,    0,    4,    0,    0,    0,    8,
-    0,    0,    0,    4,    0,   0,   0,    65,   77,   68,   0,    -32,  101,
-    -118, -12,  -1,   127,  0,   0,   38,   95,   95,   99,   111,  112,  121,
-    95,   98,   117,  102,  102, 101, 114,  95,   97,   108,  105,  103,  110,
-    101,  100,  95,   107,  101, 114, 110,  101,  108,  0,    95,   95,   104,
-    115,  97,   95,   115,  101, 99,  116,  105,  111,  110,  46,   104,  115,
-    97,   116,  101,  120,  116, 0,   0,    0,    0,    0,    0,    0,    0,
-    0,    26,   0,    1,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    -76,  1,    0,    0,    0,   0,   0,    0,    30,   0,    0,    0,    3,
-    0,    1,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    46,   104,  115,  97,   116,  101,
-    120,  116,  0,    46,   110, 111, 116,  101,  0,    46,   115,  116,  114,
-    116,  97,   98,   0,    46,  115, 121,  109,  116,  97,   98,   0,    46,
-    115,  104,  115,  116,  114, 116, 97,   98,   0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    1,    0,    0,    0,   1,   0,    0,    0,    7,    0,    -64,  0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    1,    0,    0,    0,    0,   0,   0,    -76,  1,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    1,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    10,   0,    0,    0,    7,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    -72,  2,
-    0,    0,    0,    0,    0,   0,   -88,  0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    8,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    16,
-    0,    0,    0,    3,    0,   0,   0,    32,   0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    96,   3,    0,
-    0,    0,    0,    0,    0,   52,  0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    1,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    24,   0,
-    0,    0,    2,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    -104, 3,    0,    0,
-    0,    0,    0,    0,    48,  0,   0,    0,    0,    0,    0,    0,    3,
-    0,    0,    0,    0,    0,   0,   0,    8,    0,    0,    0,    0,    0,
-    0,    0,    24,   0,    0,   0,   0,    0,    0,    0,    32,   0,    0,
-    0,    3,    0,    0,    0,   32,  0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    -56,  3,    0,    0,    0,
-    0,    0,    0,    42,   0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   1,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,
-extern char* const kVectorCopyAlignedKvObject = &kVectorCopyAlignedRawKv[0];
-extern size_t const kVectorCopyAlignedKvObjectSize =
-    sizeof(kVectorCopyAlignedRawKv);
-/*****HSAIL code of the ISA in ::kFillMemoryRawKv.
-module &m:1:0:$full:$large:$default;
-extension "amd:gcn";
-prog kernel &__fill_memory_kernel(
-kernarg_u64 %ptr,
-kernarg_u64 %num,
-kernarg_u32 %value)
-// BB#0:                                // %entry
-workitemabsid_u32	$s0, 0;
-cvt_u64_u32	$d0, $s0;
-ld_kernarg_align(8)_width(all)_u64	$d1, [%num];
-cmp_ge_b1_u64	$c0, $d0, $d1;
-cbr_b1	$c0, @LBB0_2;
-// BB#1:                                // %if.end
-ld_kernarg_align(8)_width(all)_u64	$d1, [%ptr];
-ld_kernarg_align(4)_width(all)_u32	$s0, [%value];
-shl_u64	$d0, $d0, 2;
-add_u64	$d0, $d1, $d0;
-st_global_align(4)_u32	$s0, [$d0];
-// %return
-static char kFillMemoryRawKv[] = {
-    127,  69,   76,  70,  2,    1,    1,   64,  0,    0,   0,   0,   0,    0,
-    0,    0,    1,   0,   -32,  0,    1,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   64,   0,    0,   0,   0,    0,   0,   0,   -104, 3,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   64,  0,   56,   0,
-    1,    0,    64,  0,   6,    0,    5,   0,   3,    0,   0,   96,  6,    0,
-    0,    0,    0,   1,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   96,   1,
-    0,    0,    0,   0,   0,    0,    96,  1,   0,    0,   0,   0,   0,    0,
-    0,    1,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   1,    0,    0,   0,   1,    0,   0,   0,   1,    0,
-    0,    0,    0,   0,   0,    0,    0,   1,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   64,  0,   -84,  0,
-    -112, 0,    0,   0,   11,   0,    10,  0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    32,  0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   11,   0,    3,   0,   3,    0,   0,   0,   9,    0,
-    0,    0,    0,   0,   0,    0,    4,   4,   4,    6,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   1,    5,   0,   -64, 127,  0,
-    -116, -65,  0,   -1,  -128, -109, 0,   0,   16,   0,   0,   8,   0,    -109,
-    0,    0,    0,   74,  2,    7,    64,  -64, -128, 2,   2,   126, 127,  0,
-    -116, -65,  0,   0,   -56,  125,  106, 36,  -128, -66, 11,  0,   -120, -65,
-    0,    7,    65,  -64, 4,    7,    2,   -64, 0,    0,   -62, -46, 0,    5,
-    1,    0,    127, 0,   -116, -65,  2,   0,   0,    74,  3,   2,   4,    126,
-    2,    3,    2,   80,  4,    2,    4,   126, 0,    0,   112, -36, 0,    2,
-    0,    0,    0,   0,   -127, -65,  4,   0,   0,    0,   8,   0,   0,    0,
-    1,    0,    0,   0,   65,   77,   68,  0,   1,    0,   0,   0,   0,    0,
-    0,    0,    4,   0,   0,    0,    12,  0,   0,    0,   2,   0,   0,    0,
-    65,   77,   68,  0,   1,    0,    0,   0,   0,    0,   0,   0,   1,    1,
-    1,    0,    4,   0,   0,    0,    25,  0,   0,    0,   5,   0,   0,    0,
-    65,   77,   68,  0,   22,   0,    45,  104, 115,  97,  95,  99,  97,   108,
-    108,  95,   99,  111, 110,  118,  101, 110, 116,  105, 111, 110, 61,   0,
-    0,    0,    0,   0,   4,    0,    0,   0,   30,   0,   0,   0,   3,    0,
-    0,    0,    65,  77,  68,   0,    4,   0,   7,    0,   7,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   65,   77,  68,  0,   65,   77,
-    68,   71,   80,  85,  0,    0,    0,   0,   0,    0,   4,   0,   0,    0,
-    8,    0,    0,   0,   4,    0,    0,   0,   65,   77,  68,  0,   48,   123,
-    44,   -103, -4,  127, 0,    0,    38,  95,  95,   102, 105, 108, 108,  95,
-    109,  101,  109, 111, 114,  121,  95,  107, 101,  114, 110, 101, 108,  0,
-    95,   95,   104, 115, 97,   95,   115, 101, 99,   116, 105, 111, 110,  46,
-    104,  115,  97,  116, 101,  120,  116, 0,   0,    0,   0,   0,   0,    0,
-    0,    0,    26,  0,   1,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    96,   1,    0,   0,   0,    0,    0,   0,   22,   0,   0,   0,   3,    0,
-    1,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    46,   104, 115, 97,   116, 101, 120, 116,  0,
-    46,   110,  111, 116, 101,  0,    46,  115, 116,  114, 116, 97,  98,   0,
-    46,   115,  121, 109, 116,  97,   98,  0,   46,   115, 104, 115, 116,  114,
-    116,  97,   98,  0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   1,    0,    0,   0,   1,    0,   0,   0,   7,    0,
-    -64,  0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    1,    0,   0,   0,    0,    0,   0,   96,   1,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   1,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   10,   0,
-    0,    0,    7,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   96,   2,   0,   0,   0,    0,
-    0,    0,    -88, 0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   8,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    16,  0,   0,    0,   3,   0,   0,    0,
-    32,   0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    8,   3,   0,    0,    0,   0,   0,    0,   44,  0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   1,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    24,   0,    0,   0,   2,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   56,  3,   0,    0,
-    0,    0,    0,   0,   48,   0,    0,   0,   0,    0,   0,   0,   3,    0,
-    0,    0,    0,   0,   0,    0,    8,   0,   0,    0,   0,   0,   0,    0,
-    24,   0,    0,   0,   0,    0,    0,   0,   32,   0,   0,   0,   3,    0,
-    0,    0,    32,  0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   104,  3,    0,   0,   0,    0,   0,   0,   42,   0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    1,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,
-extern char* const kFillMemoryKvObject = &kFillMemoryRawKv[0];
-extern size_t const kFillMemoryKvObjectSize = sizeof(kFillMemoryRawKv);
-#endif  // header guard
\ No newline at end of file
diff --git a/src/core/inc/amd_blit_kernel_vi.h b/src/core/inc/amd_blit_kernel_vi.h
deleted file mode 100644
index 13969370b..000000000
--- a/src/core/inc/amd_blit_kernel_vi.h
+++ /dev/null
@@ -1,490 +0,0 @@
-// The University of Illinois/NCSA
-// Open Source License (NCSA)
-// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
-// Developed by:
-//                 AMD Research and AMD HSA Software Development
-//                 Advanced Micro Devices, Inc.
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to
-// deal with the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-//  - Redistributions of source code must retain the above copyright notice,
-//    this list of conditions and the following disclaimers.
-//  - Redistributions in binary form must reproduce the above copyright
-//    notice, this list of conditions and the following disclaimers in
-//    the documentation and/or other materials provided with the distribution.
-//  - Neither the names of Advanced Micro Devices, Inc,
-//    nor the names of its contributors may be used to endorse or promote
-//    products derived from this Software without specific prior written
-//    permission.
-#include <stddef.h>
-/*****HSAIL code of the ISA in ::kVectorCopyRawVi.
-module &m:1:0:$full:$large:$default;
-prog kernel &__vector_copy_kernel(
-  kernarg_u64 %src,
-  kernarg_u64 %dst,
-  kernarg_u64 %size)
-  @__vector_copy_kernel_entry:
-  // BB#0:                                // %entry
-  workitemabsid_u32	$s0, 0;
-  cvt_u64_u32	$d0, $s0;
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%size];
-  cmp_ge_b1_u64	$c0, $d0, $d1;
-  cbr_b1	$c0, @BB0_2;
-  // BB#1:                                // %if.end
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%src];
-  ld_kernarg_align(8)_width(all)_u64	$d2, [%dst];
-  add_u64	$d2, $d2, $d0;
-  add_u64	$d0, $d1, $d0;
-  ld_global_u8	$s0, [$d0];
-  st_global_u8	$s0, [$d2];
-  @BB0_2:
-  // %return
-  ret;
-static char kVectorCopyRawVi[] = {
-    127, 69,  76,   70,   2,   1,   1,    64,   0,    0,    0,    0,    0,
-    0,   0,   0,    1,    0,   -32, 0,    1,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   64,   0,    0,    0,    0,    0,    0,
-    0,   -72, 3,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    64,  0,   56,   0,    1,   0,   64,   0,    6,    0,    5,    0,    3,
-    0,   0,   96,   6,    0,   0,   0,    0,    1,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   124, 1,    0,    0,    0,    0,    0,    0,
-    124, 1,   0,    0,    0,   0,   0,    0,    0,    1,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    1,    0,    0,    0,
-    1,   0,   0,    0,    1,   0,   0,    0,    0,    0,    0,    0,    0,
-    1,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   -63, 2,    -84,  0,    -112, 0,    0,    0,
-    11,  0,   10,   0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    32,   0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   96,   0,    5,   0,   5,    0,    0,    0,    9,    0,    0,
-    0,   0,   0,    0,    0,   4,   4,    4,    6,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   2,   0,    2,    -64,  4,    0,    0,    0,
-    127, 0,   -116, -65,  0,   -1,  -128, -110, 0,    0,    16,   0,    0,
-    8,   0,   -110, 0,    0,   0,   50,   3,    0,    6,    -64,  16,   0,
-    0,   0,   -128, 2,    2,   126, 127,  0,    -116, -65,  0,    0,    -40,
-    125, 106, 32,   -128, -66, 16,  0,    -120, -65,  3,    1,    10,   -64,
-    0,   0,   0,    0,    127, 0,   -116, -65,  4,    0,    2,    50,   5,
-    2,   4,   126,  2,    106, 28,  -47,  2,    1,    -87,  1,    0,    0,
-    64,  -36, 1,    0,    0,   1,   6,    0,    6,    50,   7,    2,    4,
-    126, 4,   106,  28,   -47, 2,   1,    -87,  1,    112,  0,    -116, -65,
-    0,   0,   96,   -36,  3,   1,   0,    0,    0,    0,    -127, -65,  0,
-    0,   0,   0,    4,    0,   0,   0,    8,    0,    0,    0,    1,    0,
-    0,   0,   65,   77,   68,  0,   1,    0,    0,    0,    0,    0,    0,
-    0,   4,   0,    0,    0,   12,  0,    0,    0,    2,    0,    0,    0,
-    65,  77,  68,   0,    1,   0,   0,    0,    0,    0,    0,    0,    1,
-    1,   1,   0,    4,    0,   0,   0,    25,   0,    0,    0,    5,    0,
-    0,   0,   65,   77,   68,  0,   22,   0,    45,   104,  115,  97,   95,
-    99,  97,  108,  108,  95,  99,  111,  110,  118,  101,  110,  116,  105,
-    111, 110, 61,   0,    0,   0,   0,    0,    4,    0,    0,    0,    30,
-    0,   0,   0,    3,    0,   0,   0,    65,   77,   68,   0,    4,    0,
-    7,   0,   8,    0,    0,   0,   0,    0,    0,    0,    2,    0,    0,
-    0,   65,  77,   68,   0,   65,  77,   68,   71,   80,   85,   0,    0,
-    0,   0,   0,    0,    4,   0,   0,    0,    8,    0,    0,    0,    4,
-    0,   0,   0,    65,   77,  68,  0,    32,   103,  -72,  81,   -3,   127,
-    0,   0,   38,   95,   95,  118, 101,  99,   116,  111,  114,  95,   99,
-    111, 112, 121,  95,   107, 101, 114,  110,  101,  108,  0,    95,   95,
-    104, 115, 97,   95,   115, 101, 99,   116,  105,  111,  110,  46,   104,
-    115, 97,  116,  101,  120, 116, 0,    0,    0,    0,    0,    0,    0,
-    0,   0,   26,   0,    1,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   124, 1,    0,    0,   0,   0,    0,    0,    22,   0,    0,    0,
-    3,   0,   1,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    46,   104,  115,  97,   116,
-    101, 120, 116,  0,    46,  110, 111,  116,  101,  0,    46,   115,  116,
-    114, 116, 97,   98,   0,   46,  115,  121,  109,  116,  97,   98,   0,
-    46,  115, 104,  115,  116, 114, 116,  97,   98,   0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   1,    0,    0,   0,   1,    0,    0,    0,    7,    0,    -64,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   1,   0,    0,    0,   0,   0,    0,    124,  1,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    1,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   10,  0,    0,    0,   7,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    -128,
-    2,   0,   0,    0,    0,   0,   0,    -88,  0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    8,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    16,  0,   0,    0,    3,   0,   0,    0,    32,   0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    40,   3,
-    0,   0,   0,    0,    0,   0,   44,   0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    1,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    24,
-    0,   0,   0,    2,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    88,   3,    0,
-    0,   0,   0,    0,    0,   48,  0,    0,    0,    0,    0,    0,    0,
-    3,   0,   0,    0,    0,   0,   0,    0,    8,    0,    0,    0,    0,
-    0,   0,   0,    24,   0,   0,   0,    0,    0,    0,    0,    32,   0,
-    0,   0,   3,    0,    0,   0,   32,   0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    -120, 3,    0,    0,
-    0,   0,   0,    0,    42,  0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    1,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,
-extern char* const kVectorCopyViObject = &kVectorCopyRawVi[0];
-extern size_t const kVectorCopyViObjectSize = sizeof(kVectorCopyRawVi);
-/*****HSAIL code of the ISA in ::kVectorCopyAlignedRawVi.
-module &m:1:0:$full:$large:$default;
-extension "amd:gcn";
-prog kernel &__copy_buffer_aligned_kernel(
-  kernarg_u64 %src,
-  kernarg_u64 %dst,
-  kernarg_u64 %size,
-  kernarg_u32 %use_vector)
-  @__copy_buffer_aligned_kernel_entry:
-  // BB#0:                                // %entry
-  workitemabsid_u32	$s0, 0;
-  cvt_u64_u32	$d0, $s0;
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%size];
-  cmp_ge_b1_u64	$c0, $d0, $d1;
-  cbr_b1	$c0, @LBB0_4;
-  // BB#1:                                // %if.end
-  ld_kernarg_align(8)_width(all)_u64	$d2, [%dst];
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%src];
-  ld_kernarg_align(4)_width(all)_u32	$s0, [%use_vector];
-  cmp_ne_b1_s32	$c0, $s0, 1;
-  cbr_b1	$c0, @LBB0_3;
-  // BB#2:                                // %if.then2
-  shl_u64	$d0, $d0, 4;
-  add_u64	$d2, $d2, $d0;
-  add_u64	$d0, $d1, $d0;
-  ld_v4_global_align(16)_const_u32	($s0, $s1, $s2, $s3), [$d0];
-  st_v4_global_align(16)_u32	($s0, $s1, $s2, $s3), [$d2];
-  br	@LBB0_4;
-  @LBB0_3:
-  // %if.else
-  shl_u64	$d0, $d0, 2;
-  add_u64	$d2, $d2, $d0;
-  add_u64	$d0, $d1, $d0;
-  ld_global_align(4)_const_u32	$s0, [$d0];
-  st_global_align(4)_u32	$s0, [$d2];
-  @LBB0_4:
-  // %if.end6
-  ret;
-static char kVectorCopyAlignedRawVi[] = {
-    127,  69,   76,   70,   2,    1,    1,    64,   0,    0,    0,   0,   0,
-    0,    0,    0,    1,    0,    -32,  0,    1,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    64,   0,    0,    0,    0,   0,   0,
-    0,    8,    4,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    64,   0,    56,   0,    1,    0,    64,   0,    6,    0,    5,   0,   3,
-    0,    0,    96,   6,    0,    0,    0,    0,    1,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    -60,  1,    0,    0,    0,    0,   0,   0,
-    -60,  1,    0,    0,    0,    0,    0,    0,    0,    1,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    1,    0,   0,   0,
-    1,    0,    0,    0,    1,    0,    0,    0,    0,    0,    0,   0,   0,
-    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    65,   0,    -84,  0,    -112, 0,   0,   0,
-    11,   0,    74,   0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    32,   0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    16,   0,    8,    0,    8,    0,    0,    0,    12,  0,   0,
-    0,    0,    0,    0,    0,    4,    4,    4,    6,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    2,    0,    2,    -64,  4,    0,   0,   0,
-    127,  0,    -116, -65,  0,    -1,   -128, -110, 0,    0,    16,  0,   0,
-    8,    0,    -110, 0,    0,    0,    50,   3,    0,    6,    -64, 16,  0,
-    0,    0,    -128, 2,    2,    126,  127,  0,    -116, -65,  0,   0,   -40,
-    125,  106,  32,   -128, -66,  34,   0,    -120, -65,  -125, 0,   2,   -64,
-    24,   0,    0,    0,    3,    2,    10,   -64,  0,    0,    0,   0,   127,
-    0,    -116, -65,  2,    -127, 0,    -65,  14,   0,    -124, -65, 0,   0,
-    -113, -46,  -124, 0,    2,    0,    8,    0,    4,    50,   9,   2,   6,
-    126,  3,    3,    6,    56,   0,    0,    92,   -36,  2,    0,   0,   4,
-    10,   0,    0,    50,   11,   2,    4,    126,  2,    3,    2,   56,  112,
-    0,    -116, -65,  0,    0,    124,  -36,  0,    4,    0,    0,   13,  0,
-    -126, -65,  0,    0,    -113, -46,  -126, 0,    2,    0,    8,   0,   4,
-    50,   9,    2,    6,    126,  3,    3,    6,    56,   0,    0,   80,  -36,
-    2,    0,    0,    4,    10,   0,    0,    50,   11,   2,    4,   126, 2,
-    3,    2,    56,   112,  0,    -116, -65,  0,    0,    112,  -36, 0,   4,
-    0,    0,    0,    0,    -127, -65,  0,    0,    0,    0,    4,   0,   0,
-    0,    8,    0,    0,    0,    1,    0,    0,    0,    65,   77,  68,  0,
-    1,    0,    0,    0,    0,    0,    0,    0,    4,    0,    0,   0,   12,
-    0,    0,    0,    2,    0,    0,    0,    65,   77,   68,   0,   1,   0,
-    0,    0,    0,    0,    0,    0,    1,    1,    1,    0,    4,   0,   0,
-    0,    25,   0,    0,    0,    5,    0,    0,    0,    65,   77,  68,  0,
-    22,   0,    45,   104,  115,  97,   95,   99,   97,   108,  108, 95,  99,
-    111,  110,  118,  101,  110,  116,  105,  111,  110,  61,   0,   0,   0,
-    0,    0,    4,    0,    0,    0,    30,   0,    0,    0,    3,   0,   0,
-    0,    65,   77,   68,   0,    4,    0,    7,    0,    8,    0,   0,   0,
-    0,    0,    0,    0,    1,    0,    0,    0,    65,   77,   68,  0,   65,
-    77,   68,   71,   80,   85,   0,    0,    0,    0,    0,    0,   4,   0,
-    0,    0,    8,    0,    0,    0,    4,    0,    0,    0,    65,  77,  68,
-    0,    96,   62,   -27,  85,   -1,   127,  0,    0,    38,   95,  95,  99,
-    111,  112,  121,  95,   98,   117,  102,  102,  101,  114,  95,  97,  108,
-    105,  103,  110,  101,  100,  95,   107,  101,  114,  110,  101, 108, 0,
-    95,   95,   104,  115,  97,   95,   115,  101,  99,   116,  105, 111, 110,
-    46,   104,  115,  97,   116,  101,  120,  116,  0,    0,    0,   0,   0,
-    0,    0,    0,    0,    26,   0,    1,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    -60,  1,    0,    0,    0,    0,    0,    0,   30,  0,
-    0,    0,    3,    0,    1,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    46,  104, 115,
-    97,   116,  101,  120,  116,  0,    46,   110,  111,  116,  101, 0,   46,
-    115,  116,  114,  116,  97,   98,   0,    46,   115,  121,  109, 116, 97,
-    98,   0,    46,   115,  104,  115,  116,  114,  116,  97,   98,  0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    1,    0,    0,    0,    1,    0,    0,   0,   7,
-    0,    -64,  0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    1,    0,    0,    0,    0,    0,    0,    -60, 1,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    10,   0,    0,    0,    7,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    -56,  2,    0,    0,    0,    0,    0,    0,    -88,  0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   8,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    16,   0,    0,    0,    3,    0,    0,    0,    32,  0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    112,  3,    0,    0,    0,    0,    0,    0,    52,   0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   1,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    24,   0,    0,    0,    2,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   -88,
-    3,    0,    0,    0,    0,    0,    0,    48,   0,    0,    0,   0,   0,
-    0,    0,    3,    0,    0,    0,    0,    0,    0,    0,    8,   0,   0,
-    0,    0,    0,    0,    0,    24,   0,    0,    0,    0,    0,   0,   0,
-    32,   0,    0,    0,    3,    0,    0,    0,    32,   0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   -40, 3,
-    0,    0,    0,    0,    0,    0,    42,   0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    1,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,
-extern char* const kVectorCopyAlignedViObject = &kVectorCopyAlignedRawVi[0];
-extern size_t const kVectorCopyAlignedViObjectSize =
-    sizeof(kVectorCopyAlignedRawVi);
-/*****HSAIL code of the ISA in ::kFillMemoryRawVi.
-module &m:1:0:$full:$large:$default;
-extension "amd:gcn";
-prog kernel &__fill_memory_kernel(
-  kernarg_u64 %ptr,
-  kernarg_u64 %num,
-  kernarg_u32 %value)
-  @__fill_memory_kernel_entry:
-  // BB#0:                                // %entry
-  workitemabsid_u32	$s0, 0;
-  cvt_u64_u32	$d0, $s0;
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%num];
-  cmp_ge_b1_u64	$c0, $d0, $d1;
-  cbr_b1	$c0, @LBB0_2;
-  // BB#1:                                // %if.end
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%ptr];
-  ld_kernarg_align(4)_width(all)_u32	$s0, [%value];
-  shl_u64	$d0, $d0, 2;
-  add_u64	$d0, $d1, $d0;
-  st_global_align(4)_u32	$s0, [$d0];
-  @LBB0_2:
-  // %return
-  ret;
-static char kFillMemoryRawVi[] = {
-    127, 69,   76,   70,   2,   1,   1,    64,   0,    0,    0,   0,    0,
-    0,   0,    0,    1,    0,   -32, 0,    1,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   64,   0,    0,    0,    0,   0,    0,
-    0,   -88,  3,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    64,  0,    56,   0,    1,   0,   64,   0,    6,    0,    5,   0,    3,
-    0,   0,    96,   6,    0,   0,   0,    0,    1,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   112, 1,    0,    0,    0,    0,   0,    0,
-    112, 1,    0,    0,    0,   0,   0,    0,    0,    1,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    1,    0,   0,    0,
-    1,   0,    0,    0,    1,   0,   0,    0,    0,    0,    0,   0,    0,
-    1,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   64,  0,    -84,  0,    -112, 0,   0,    0,
-    11,  0,    74,   0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    32,   0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    13,   0,    3,   0,   3,    0,    0,    0,    9,   0,    0,
-    0,   0,    0,    0,    0,   4,   4,    4,    6,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   2,   0,    2,    -64,  4,    0,   0,    0,
-    127, 0,    -116, -65,  0,   -1,  -128, -110, 0,    0,    16,  0,    0,
-    8,   0,    -110, 0,    0,   0,   50,   3,    0,    6,    -64, 8,    0,
-    0,   0,    -128, 2,    2,   126, 127,  0,    -116, -65,  0,   0,    -40,
-    125, 106,  32,   -128, -66, 13,  0,    -120, -65,  -125, 0,   6,    -64,
-    0,   0,    0,    0,    3,   1,   2,    -64,  16,   0,    0,   0,    0,
-    0,   -113, -46,  -126, 0,   2,   0,    127,  0,    -116, -65, 2,    0,
-    0,   50,   3,    2,    4,   126, 2,    3,    2,    56,   4,   2,    4,
-    126, 0,    0,    112,  -36, 0,   2,    0,    0,    0,    0,   -127, -65,
-    4,   0,    0,    0,    8,   0,   0,    0,    1,    0,    0,   0,    65,
-    77,  68,   0,    1,    0,   0,   0,    0,    0,    0,    0,   4,    0,
-    0,   0,    12,   0,    0,   0,   2,    0,    0,    0,    65,  77,   68,
-    0,   1,    0,    0,    0,   0,   0,    0,    0,    1,    1,   1,    0,
-    4,   0,    0,    0,    25,  0,   0,    0,    5,    0,    0,   0,    65,
-    77,  68,   0,    22,   0,   45,  104,  115,  97,   95,   99,  97,   108,
-    108, 95,   99,   111,  110, 118, 101,  110,  116,  105,  111, 110,  61,
-    0,   0,    0,    0,    0,   4,   0,    0,    0,    30,   0,   0,    0,
-    3,   0,    0,    0,    65,  77,  68,   0,    4,    0,    7,   0,    8,
-    0,   0,    0,    0,    0,   0,   0,    1,    0,    0,    0,   65,   77,
-    68,  0,    65,   77,   68,  71,  80,   85,   0,    0,    0,   0,    0,
-    0,   4,    0,    0,    0,   8,   0,    0,    0,    4,    0,   0,    0,
-    65,  77,   68,   0,    16,  -20, 88,   97,   -4,   127,  0,   0,    38,
-    95,  95,   102,  105,  108, 108, 95,   109,  101,  109,  111, 114,  121,
-    95,  107,  101,  114,  110, 101, 108,  0,    95,   95,   104, 115,  97,
-    95,  115,  101,  99,   116, 105, 111,  110,  46,   104,  115, 97,   116,
-    101, 120,  116,  0,    0,   0,   0,    0,    0,    0,    0,   0,    26,
-    0,   1,    0,    0,    0,   0,   0,    0,    0,    0,    0,   112,  1,
-    0,   0,    0,    0,    0,   0,   22,   0,    0,    0,    3,   0,    1,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   46,  104,  115,  97,   116,  101, 120,  116,
-    0,   46,   110,  111,  116, 101, 0,    46,   115,  116,  114, 116,  97,
-    98,  0,    46,   115,  121, 109, 116,  97,   98,   0,    46,  115,  104,
-    115, 116,  114,  116,  97,  98,  0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    1,
-    0,   0,    0,    1,    0,   0,   0,    7,    0,    -64,  0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   1,    0,
-    0,   0,    0,    0,    0,   112, 1,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    1,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   10,   0,
-    0,   0,    7,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    112,  2,   0,    0,
-    0,   0,    0,    0,    -88, 0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    8,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    16,  0,    0,
-    0,   3,    0,    0,    0,   32,  0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    24,   3,    0,   0,    0,
-    0,   0,    0,    44,   0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   1,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    24,   0,   0,    0,
-    2,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    72,   3,    0,    0,   0,    0,
-    0,   0,    48,   0,    0,   0,   0,    0,    0,    0,    3,   0,    0,
-    0,   0,    0,    0,    0,   8,   0,    0,    0,    0,    0,   0,    0,
-    24,  0,    0,    0,    0,   0,   0,    0,    32,   0,    0,   0,    3,
-    0,   0,    0,    32,   0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   120,  3,    0,    0,    0,   0,    0,
-    0,   42,   0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    1,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,
-extern char* const kFillMemoryViObject = &kFillMemoryRawVi[0];
-extern size_t const kFillMemoryViObjectSize = sizeof(kFillMemoryRawVi);
-#endif  // header guard
\ No newline at end of file
diff --git a/src/core/inc/amd_blit_sdma.h b/src/core/inc/amd_blit_sdma.h
index 35f683bc3..6212c3dcc 100644
--- a/src/core/inc/amd_blit_sdma.h
+++ b/src/core/inc/amd_blit_sdma.h
@@ -43,10 +43,12 @@
+#include <mutex>
 #include <stdint.h>
 #include "hsakmt.h"
+#include "core/inc/amd_gpu_agent.h"
 #include "core/inc/blit.h"
 #include "core/inc/runtime.h"
 #include "core/inc/signal.h"
@@ -73,8 +75,10 @@ class BlitSdma : public core::Blit {
   /// @note: The call will block until all packets have executed.
+  /// @param agent Agent passed to Initialize.
+  ///
   /// @return hsa_status_t
-  virtual hsa_status_t Destroy() override;
+  virtual hsa_status_t Destroy(const core::Agent& agent) override;
   /// @brief Submit a linear copy command to the queue buffer.
@@ -107,6 +111,12 @@ class BlitSdma : public core::Blit {
   virtual hsa_status_t SubmitLinearFillCommand(void* ptr, uint32_t value,
                                                size_t count) override;
+  virtual hsa_status_t EnableProfiling(bool enable) override;
+  static const size_t kQueueSize;
+  static const size_t kCopyPacketSize;
   /// @brief Acquires the address into queue buffer where a new command
   /// packet of specified size could be written. The address that is
@@ -159,6 +169,13 @@ class BlitSdma : public core::Blit {
   void BuildAtomicDecrementCommand(char* cmd_addr, void* addr);
+  void BuildGetGlobalTimestampCommand(char* cmd_addr, void* write_address);
+  void BuildTrapCommand(char* cmd_addr);
+  // Agent object owning the SDMA engine.
+  GpuAgent* agent_;
   /// Indicates size of Queue buffer in bytes.
   uint32_t queue_size_;
@@ -199,6 +216,10 @@ class BlitSdma : public core::Blit {
   uint32_t atomic_command_size_;
+  uint32_t timestamp_command_size_;
+  uint32_t trap_command_size_;
   // Max copy size of a single linear copy command packet.
   size_t max_single_linear_copy_size_;
@@ -210,6 +231,9 @@ class BlitSdma : public core::Blit {
   /// Max total fill count supported by the queue.
   size_t max_total_fill_size_;
+  /// True if platform atomic is supported.
+  bool platform_atomic_support_;
 }  // namespace amd
diff --git a/src/core/inc/amd_elf_image.hpp b/src/core/inc/amd_elf_image.hpp
index 8bc811e17..763c5c831 100644
--- a/src/core/inc/amd_elf_image.hpp
+++ b/src/core/inc/amd_elf_image.hpp
@@ -103,6 +103,7 @@ namespace amd {
       virtual uint64_t imageSize() const = 0;
       virtual uint64_t vaddr() const = 0;
       virtual uint64_t flags() const = 0;
+      virtual uint64_t offset() const = 0;
       virtual const char* data() const = 0;
       virtual uint16_t getSegmentIndex() = 0;
       virtual bool updateAddSection(Section *section) = 0;
diff --git a/src/core/inc/amd_gpu_agent.h b/src/core/inc/amd_gpu_agent.h
index abd854679..7b3246465 100644
--- a/src/core/inc/amd_gpu_agent.h
+++ b/src/core/inc/amd_gpu_agent.h
@@ -57,6 +57,8 @@
 #include "core/util/locks.h"
 namespace amd {
+class MemoryRegion;
 // @brief Contains scratch memory information.
 struct ScratchInfo {
   void* queue_base;
@@ -72,6 +74,16 @@ class GpuAgentInt : public core::Agent {
   GpuAgentInt(uint32_t node_id)
       : core::Agent(node_id, core::Agent::DeviceType::kAmdGpuDevice) {}
+  // @brief Initialize DMA queue.
+  //
+  // @note Declared void, so no status is reported back to the caller.
+  virtual void InitDma() = 0;
+  // @brief Initialize blit kernel object based on AQL queue.
+  //
+  // @retval HSA_STATUS_SUCCESS blit kernel object initialization is successful.
+  virtual hsa_status_t InitBlitKernel() = 0;
   // @brief Invoke the user provided callback for each region accessible by
   // this agent.
@@ -108,6 +120,16 @@ class GpuAgentInt : public core::Agent {
   virtual void TranslateTime(core::Signal* signal,
                              hsa_amd_profiling_dispatch_time_t& time) = 0;
+  // @brief Translate the async copy start and end timestamp from agent
+  // domain to host domain by forwarding to the dispatch-time overload.
+  // NOTE(review): the cast assumes both time structs share a layout; confirm.
+  // @param [in] signal Pointer to signal that provides the async copy timing.
+  // @param [out] time Structure to be populated with the host domain value.
+  virtual void TranslateTime(core::Signal* signal,
+                             hsa_amd_profiling_async_copy_time_t& time) {
+    return TranslateTime(signal, (hsa_amd_profiling_dispatch_time_t&)time);
+  }
   // @brief Translate timestamp agent domain to host domain.
   // @param [out] time Timestamp in agent domain.
@@ -158,27 +180,32 @@ class GpuAgent : public GpuAgentInt {
   // @brief GPU agent destructor.
-  // @brief Initialize DMA queue.
-  //
-  // @retval HSA_STATUS_SUCCESS DMA queue initialization is successful.
-  hsa_status_t InitDma();
+  // @brief Override from GpuAgentInt.
+  void InitDma() override;
+  // @brief Override from GpuAgentInt.
+  hsa_status_t InitBlitKernel() override;
   uint16_t GetMicrocodeVersion() const;
-  // @brief Assembles SP3 shader source into executable code.
+  // @brief Assembles SP3 shader source into ISA or AQL code object.
   // @param [in] src_sp3 SP3 shader source text representation.
   // @param [in] func_name Name of the SP3 function to assemble.
-  // @param [out] code_buf Executable code buffer.
-  // @param [out] code_buf_size Size of executable code buffer in bytes.
+  // @param [in] assemble_target ISA or AQL assembly target.
+  // @param [out] code_buf Code object buffer.
+  // @param [out] code_buf_size Size of code object buffer in bytes.
+  enum class AssembleTarget { ISA, AQL };
   void AssembleShader(const char* src_sp3, const char* func_name,
-                      void*& code_buf, size_t& code_buf_size);
+                      AssembleTarget assemble_target, void*& code_buf,
+                      size_t& code_buf_size) const;
-  // @brief Frees executable code created by AssembleShader.
+  // @brief Frees code object created by AssembleShader.
-  // @param [in] code_buf Executable code buffer.
-  // @param [in] code_buf_size Size of executable code buffer in bytes.
-  void ReleaseShader(void* code_buf, size_t code_buf_size);
+  // @param [in] code_buf Code object buffer.
+  // @param [in] code_buf_size Size of code object buffer in bytes.
+  void ReleaseShader(void* code_buf, size_t code_buf_size) const;
   // @brief Override from core::Agent.
   hsa_status_t VisitRegion(bool include_peer,
@@ -203,6 +230,9 @@ class GpuAgent : public GpuAgentInt {
   // @brief Override from core::Agent.
   hsa_status_t DmaFill(void* ptr, uint32_t value, size_t count) override;
+  // @brief Get the next available end timestamp object.
+  uint64_t* ObtainEndTsObject();
   // @brief Override from core::Agent.
   hsa_status_t GetInfo(hsa_agent_info_t attribute, void* value) const override;
@@ -308,6 +338,9 @@ class GpuAgent : public GpuAgentInt {
   // @brief Binds the second-level trap handler to this node.
   void BindTrapHandler();
+  // @brief Override from core::Agent.
+  hsa_status_t EnableDmaProfiling(bool enable) override;
   // @brief Node properties.
   const HsaNodeProperties properties_;
@@ -329,10 +362,13 @@ class GpuAgent : public GpuAgentInt {
   // @brief Blit object to handle memory copy from system to device memory.
   core::Blit* blit_h2d_;
-  // @brief Blit object to handle memory copy from device to system, device to
-  // device, and memory fill.
+  // @brief Blit object to handle memory copy from device to system memory.
   core::Blit* blit_d2h_;
+  // @brief Blit object to handle memory copy from device to device memory, and
+  // memory fill.
+  core::Blit* blit_d2d_;
   // @brief Mutex to protect the update to coherency type.
   KernelMutex coherency_lock_;
@@ -342,6 +378,9 @@ class GpuAgent : public GpuAgentInt {
   // @brief Mutex to protect access to ::t1_.
   KernelMutex t1_lock_;
+  // @brief Mutex to protect access to blit objects.
+  KernelMutex blit_lock_;
   // @brief GPU tick on initialization.
   HsaClockCounters t0_;
@@ -353,6 +392,8 @@ class GpuAgent : public GpuAgentInt {
   // @brief Array of regions owned by this agent.
   std::vector<const core::MemoryRegion*> regions_;
+  MemoryRegion* local_region_;
   core::Isa* isa_;
   // @brief HSA profile.
@@ -381,12 +422,29 @@ class GpuAgent : public GpuAgentInt {
   // @brief Query the driver to get the cache properties.
   void InitCacheList();
+  // @brief Initialize memory pool for end timestamp object.
+  // @retval True if the memory pool for end timestamp object is initialized.
+  bool InitEndTsPool();
   // @brief Alternative aperture base address. Only on KV.
   uintptr_t ape1_base_;
   // @brief Alternative aperture size. Only on KV.
   size_t ape1_size_;
+  // @brief True if blit objects are initialized.
+  std::atomic<bool> blit_initialized_;
+  // Each end ts is 32 bytes.
+  static const size_t kTsSize = 32;
+  // Number of elements in the pool.
+  uint32_t end_ts_pool_size_;
+  std::atomic<uint32_t> end_ts_pool_counter_;
+  std::atomic<uint64_t*> end_ts_base_addr_;
diff --git a/src/core/inc/amd_gpu_shaders.h b/src/core/inc/amd_gpu_shaders.h
new file mode 100644
index 000000000..2aa074981
--- /dev/null
+++ b/src/core/inc/amd_gpu_shaders.h
@@ -0,0 +1,169 @@
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
+// Developed by:
+//                 AMD Research and AMD HSA Software Development
+//                 Advanced Micro Devices, Inc.
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+namespace amd {
+static const unsigned int kCodeCopyAligned7[] = {
+    0xC0820100, 0xC0840104, 0xC0860108, 0xC088010C, 0xC08A0110, 0xC00C0114,
+    0xBF8C007F, 0x8F028602, 0x4A000002, 0x7E060205, 0xD24A6A02, 0x00000900,
+    0xD2506A03, 0x01A90103, 0x7E0A0207, 0xD24A6A04, 0x00000D00, 0xD2506A05,
+    0x01A90105, 0xD1C2006A, 0x00001102, 0xBF86000F, 0x87FE6A7E, 0xDC200000,
+    0x01000002, 0xBF8C0F70, 0xD24A6A02, 0x00003102, 0xD2506A03, 0x01A90103,
+    0xDC600000, 0x00000104, 0xD24A6A04, 0x00003104, 0xD2506A05, 0x01A90105,
+    0xBF82FFEE, 0xBEFE04C1, 0x8F198418, 0x34020084, 0x7E060209, 0xD24A6A02,
+    0x00001101, 0xD2506A03, 0x01A90103, 0x7E0A020B, 0xD24A6A04, 0x00001501,
+    0xD2506A05, 0x01A90105, 0xD1C2006A, 0x00001902, 0xBF86000E, 0xDC380000,
+    0x08000002, 0xD24A6A02, 0x00003302, 0xD2506A03, 0x01A90103, 0xBF8C0F70,
+    0xDC780000, 0x00000804, 0xD24A6A04, 0x00003304, 0xD2506A05, 0x01A90105,
+    0xBF82FFEF, 0x8F198218, 0x34020082, 0x7E06020D, 0xD24A6A02, 0x00001901,
+    0xD2506A03, 0x01A90103, 0x7E0A020F, 0xD24A6A04, 0x00001D01, 0xD2506A05,
+    0x01A90105, 0xD1C2006A, 0x00002102, 0xBF86000F, 0x87FE6A7E, 0xDC300000,
+    0x01000002, 0xD24A6A02, 0x00003302, 0xD2506A03, 0x01A90103, 0xBF8C0F70,
+    0xDC700000, 0x00000104, 0xD24A6A04, 0x00003304, 0xD2506A05, 0x01A90105,
+    0xBF82FFEE, 0xBEFE04C1, 0x7E060211, 0xD24A6A02, 0x00002100, 0xD2506A03,
+    0x01A90103, 0x7E0A0213, 0xD24A6A04, 0x00002500, 0xD2506A05, 0x01A90105,
+    0xD1C2006A, 0x00002902, 0xBF860006, 0x87FE6A7E, 0xDC200000, 0x01000002,
+    0xBF8C0F70, 0xDC600000, 0x00000104, 0xBF810000,
+static const unsigned int kCodeCopyMisaligned7[] = {
+    0xC0820100, 0xC0840104, 0xC0860108, 0xC008010C, 0xBF8C007F, 0x8F028602,
+    0x4A000002, 0x7E060205, 0xD24A6A02, 0x00000900, 0xD2506A03, 0x01A90103,
+    0x7E0A0207, 0xD24A6A04, 0x00000D00, 0xD2506A05, 0x01A90105, 0xD1C2006A,
+    0x00001102, 0xBF860032, 0xDC200000, 0x06000002, 0xD24A6A02, 0x00002102,
+    0xD2506A03, 0x01A90103, 0xDC200000, 0x07000002, 0xD24A6A02, 0x00002102,
+    0xD2506A03, 0x01A90103, 0xDC200000, 0x08000002, 0xD24A6A02, 0x00002102,
+    0xD2506A03, 0x01A90103, 0xDC200000, 0x09000002, 0xD24A6A02, 0x00002102,
+    0xD2506A03, 0x01A90103, 0xBF8C0F70, 0xDC600000, 0x00000604, 0xD24A6A04,
+    0x00002104, 0xD2506A05, 0x01A90105, 0xDC600000, 0x00000704, 0xD24A6A04,
+    0x00002104, 0xD2506A05, 0x01A90105, 0xDC600000, 0x00000804, 0xD24A6A04,
+    0x00002104, 0xD2506A05, 0x01A90105, 0xDC600000, 0x00000904, 0xD24A6A04,
+    0x00002104, 0xD2506A05, 0x01A90105, 0xBF82FFCB, 0x7E060209, 0xD24A6A02,
+    0x00001100, 0xD2506A03, 0x01A90103, 0x7E0A020B, 0xD24A6A04, 0x00001500,
+    0xD2506A05, 0x01A90105, 0xD1C2006A, 0x00001902, 0xBF86000F, 0x87FE6A7E,
+    0xDC200000, 0x01000002, 0xD24A6A02, 0x00002102, 0xD2506A03, 0x01A90103,
+    0xBF8C0F70, 0xDC600000, 0x00000104, 0xD24A6A04, 0x00002104, 0xD2506A05,
+    0x01A90105, 0xBF82FFEE, 0xBF810000,
+static const unsigned int kCodeFill7[] = {
+    0xC0820100, 0xC0840104, 0xBF8C007F, 0x8F028602, 0x4A000002, 0x7E08020A,
+    0x7E0A020A, 0x7E0C020A, 0x7E0E020A, 0x8F0C840B, 0x34020084, 0x7E060205,
+    0xD24A6A02, 0x00000901, 0xD2506A03, 0x01A90103, 0xD1C2006A, 0x00000D02,
+    0xBF860007, 0xDC780000, 0x00000402, 0xD24A6A02, 0x00001902, 0xD2506A03,
+    0x01A90103, 0xBF82FFF6, 0x8F0C820B, 0x34020082, 0x7E060207, 0xD24A6A02,
+    0x00000D01, 0xD2506A03, 0x01A90103, 0xD1C2006A, 0x00001102, 0xBF860008,
+    0x87FE6A7E, 0xDC700000, 0x00000402, 0xD24A6A02, 0x00001902, 0xD2506A03,
+    0x01A90103, 0xBF82FFF5, 0xBF810000,
+static const unsigned int kCodeTrapHandler8[] = {
+    0xC0061C80, 0x000000C0, 0xBF8C007F, 0xBEFE0181, 0x80728872, 0x82738073,
+    0x7E000272, 0x7E020273, 0x7E0402FF, 0x80000000, 0x7E060280, 0xDD800000,
+    0x00000200, 0xBF8C0F70, 0x7DD40500, 0xBF870011, 0xC0061D39, 0x00000008,
+    0xBF8C007F, 0x86F47474, 0xBF84000C, 0x80729072, 0x82738073, 0xC0021CB9,
+    0x00000000, 0xBF8C007F, 0x7E000274, 0x7E020275, 0x7E040272, 0xDC700000,
+    0x00000200, 0xBF8C0F70, 0xBF900001, 0xBF8D0001, 0xBE801F70,
+static const unsigned int kCodeCopyAligned8[] = {
+    0xC00A0100, 0x00000000, 0xC00A0200, 0x00000010, 0xC00A0300, 0x00000020,
+    0xC00A0400, 0x00000030, 0xC00A0500, 0x00000040, 0xC0020600, 0x00000050,
+    0xBF8C007F, 0x8E028602, 0x32000002, 0x7E060205, 0xD1196A02, 0x00000900,
+    0xD11C6A03, 0x01A90103, 0x7E0A0207, 0xD1196A04, 0x00000D00, 0xD11C6A05,
+    0x01A90105, 0xD0E9006A, 0x00001102, 0xBF86000F, 0x86FE6A7E, 0xDC400000,
+    0x01000002, 0xBF8C0F70, 0xD1196A02, 0x00003102, 0xD11C6A03, 0x01A90103,
+    0xDC600000, 0x00000104, 0xD1196A04, 0x00003104, 0xD11C6A05, 0x01A90105,
+    0xBF82FFEE, 0xBEFE01C1, 0x8E198418, 0x24020084, 0x7E060209, 0xD1196A02,
+    0x00001101, 0xD11C6A03, 0x01A90103, 0x7E0A020B, 0xD1196A04, 0x00001501,
+    0xD11C6A05, 0x01A90105, 0xD0E9006A, 0x00001902, 0xBF86000E, 0xDC5C0000,
+    0x08000002, 0xD1196A02, 0x00003302, 0xD11C6A03, 0x01A90103, 0xBF8C0F70,
+    0xDC7C0000, 0x00000804, 0xD1196A04, 0x00003304, 0xD11C6A05, 0x01A90105,
+    0xBF82FFEF, 0x8E198218, 0x24020082, 0x7E06020D, 0xD1196A02, 0x00001901,
+    0xD11C6A03, 0x01A90103, 0x7E0A020F, 0xD1196A04, 0x00001D01, 0xD11C6A05,
+    0x01A90105, 0xD0E9006A, 0x00002102, 0xBF86000F, 0x86FE6A7E, 0xDC500000,
+    0x01000002, 0xD1196A02, 0x00003302, 0xD11C6A03, 0x01A90103, 0xBF8C0F70,
+    0xDC700000, 0x00000104, 0xD1196A04, 0x00003304, 0xD11C6A05, 0x01A90105,
+    0xBF82FFEE, 0xBEFE01C1, 0x7E060211, 0xD1196A02, 0x00002100, 0xD11C6A03,
+    0x01A90103, 0x7E0A0213, 0xD1196A04, 0x00002500, 0xD11C6A05, 0x01A90105,
+    0xD0E9006A, 0x00002902, 0xBF860006, 0x86FE6A7E, 0xDC400000, 0x01000002,
+    0xBF8C0F70, 0xDC600000, 0x00000104, 0xBF810000,
+static const unsigned int kCodeCopyMisaligned8[] = {
+    0xC00A0100, 0x00000000, 0xC00A0200, 0x00000010, 0xC00A0300, 0x00000020,
+    0xC0020400, 0x00000030, 0xBF8C007F, 0x8E028602, 0x32000002, 0x7E060205,
+    0xD1196A02, 0x00000900, 0xD11C6A03, 0x01A90103, 0x7E0A0207, 0xD1196A04,
+    0x00000D00, 0xD11C6A05, 0x01A90105, 0xD0E9006A, 0x00001102, 0xBF860032,
+    0xDC400000, 0x06000002, 0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103,
+    0xDC400000, 0x07000002, 0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103,
+    0xDC400000, 0x08000002, 0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103,
+    0xDC400000, 0x09000002, 0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103,
+    0xBF8C0F70, 0xDC600000, 0x00000604, 0xD1196A04, 0x00002104, 0xD11C6A05,
+    0x01A90105, 0xDC600000, 0x00000704, 0xD1196A04, 0x00002104, 0xD11C6A05,
+    0x01A90105, 0xDC600000, 0x00000804, 0xD1196A04, 0x00002104, 0xD11C6A05,
+    0x01A90105, 0xDC600000, 0x00000904, 0xD1196A04, 0x00002104, 0xD11C6A05,
+    0x01A90105, 0xBF82FFCB, 0x7E060209, 0xD1196A02, 0x00001100, 0xD11C6A03,
+    0x01A90103, 0x7E0A020B, 0xD1196A04, 0x00001500, 0xD11C6A05, 0x01A90105,
+    0xD0E9006A, 0x00001902, 0xBF86000F, 0x86FE6A7E, 0xDC400000, 0x01000002,
+    0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103, 0xBF8C0F70, 0xDC600000,
+    0x00000104, 0xD1196A04, 0x00002104, 0xD11C6A05, 0x01A90105, 0xBF82FFEE,
+    0xBF810000,
+static const unsigned int kCodeFill8[] = {
+    0xC00A0100, 0x00000000, 0xC00A0200, 0x00000010, 0xBF8C007F, 0x8E028602,
+    0x32000002, 0x7E08020A, 0x7E0A020A, 0x7E0C020A, 0x7E0E020A, 0x8E0C840B,
+    0x24020084, 0x7E060205, 0xD1196A02, 0x00000901, 0xD11C6A03, 0x01A90103,
+    0xD0E9006A, 0x00000D02, 0xBF860007, 0xDC7C0000, 0x00000402, 0xD1196A02,
+    0x00001902, 0xD11C6A03, 0x01A90103, 0xBF82FFF6, 0x8E0C820B, 0x24020082,
+    0x7E060207, 0xD1196A02, 0x00000D01, 0xD11C6A03, 0x01A90103, 0xD0E9006A,
+    0x00001102, 0xBF860008, 0x86FE6A7E, 0xDC700000, 0x00000402, 0xD1196A02,
+    0x00001902, 0xD11C6A03, 0x01A90103, 0xBF82FFF5, 0xBF810000,
+}  // namespace amd
+#endif  // header guard
diff --git a/src/core/inc/amd_hsa_loader.hpp b/src/core/inc/amd_hsa_loader.hpp
index 5b9cd4d92..251df841a 100644
--- a/src/core/inc/amd_hsa_loader.hpp
+++ b/src/core/inc/amd_hsa_loader.hpp
@@ -47,6 +47,7 @@
 #include <cstdint>
 #include "hsa.h"
 #include "hsa_ext_image.h"
+#include "hsa_ven_amd_loader.h"
 #include "amd_hsa_elf.h"
 #include <string>
 #include <mutex>
@@ -317,6 +318,13 @@ class Executable {
       void *data),
     void *data) = 0;
+  virtual size_t GetNumSegmentDescriptors() = 0;
+  virtual size_t QuerySegmentDescriptors(
+    hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+    size_t total_num_segment_descriptors,
+    size_t first_empty_segment_descriptor) = 0;
   virtual uint64_t FindHostAddress(uint64_t device_address) = 0;
   virtual void Print(std::ostream& out) = 0;
@@ -368,6 +376,11 @@ class Loader {
       void *data),
     void *data) = 0;
+  /// @brief Same as hsa_ven_amd_loader_query_segment_descriptors.
+  virtual hsa_status_t QuerySegmentDescriptors(
+    hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+    size_t *num_segment_descriptors) = 0;
   /// @brief Returns host address given @p device_address. If @p device_address
   /// is already host address, returns null pointer. If @p device_address is
   /// invalid address, returns null pointer.
diff --git a/src/core/inc/blit.h b/src/core/inc/blit.h
index f44a6bab1..48aebaa64 100644
--- a/src/core/inc/blit.h
+++ b/src/core/inc/blit.h
@@ -66,8 +66,10 @@ class Blit {
   /// @note: The call will block until all commands have executed.
+  /// @param agent Agent passed to Initialize.
+  ///
   /// @return hsa_status_t
-  virtual hsa_status_t Destroy() = 0;
+  virtual hsa_status_t Destroy(const core::Agent& agent) = 0;
   /// @brief Submit a linear copy command to the the underlying compute device's
   /// control block. The call is blocking until the command execution is
@@ -102,6 +104,15 @@ class Blit {
   /// @param num Number of uint32_t element to be set to the value.
   virtual hsa_status_t SubmitLinearFillCommand(void* ptr, uint32_t value,
                                                size_t num) = 0;
+  /// @brief Enable profiling of the asynchronous copy command. The timestamp
+  /// of each copy request will be stored in the completion signal structure.
+  ///
+  /// @param enable True to enable profiling. False to disable profiling.
+  ///
+  /// @return HSA_STATUS_SUCCESS if the request to enable/disable profiling is
+  /// successful.
+  virtual hsa_status_t EnableProfiling(bool enable) = 0;
 }  // namespace core
diff --git a/src/core/inc/hsa_api_trace_int.h b/src/core/inc/hsa_api_trace_int.h
index cc9a638a9..769dbed2e 100644
--- a/src/core/inc/hsa_api_trace_int.h
+++ b/src/core/inc/hsa_api_trace_int.h
@@ -47,17 +47,28 @@
 #include "core/inc/hsa_internal.h"
 namespace core {
-struct ApiTable {
-  ::ApiTable table;
-  ExtTable extension_backup;
+  struct HsaApiTable {
-  ApiTable();
-  void Reset();
-  void LinkExts(ExtTable* ptr);
+    static const uint32_t HSA_EXT_FINALIZER_API_TABLE_ID = 0;
+    static const uint32_t HSA_EXT_IMAGE_API_TABLE_ID = 1;
-extern ApiTable hsa_api_table_;
-extern ApiTable hsa_internal_api_table_;
+    ::HsaApiTable hsa_api;
+    ::CoreApiTable core_api;
+    ::AmdExtTable amd_ext_api;
+    ::FinalizerExtTable finalizer_api;
+    ::ImageExtTable image_api;
+    HsaApiTable();
+    void Init();
+    void UpdateCore();
+    void UpdateAmdExts();
+    void CloneExts(void* ptr, uint32_t table_id);
+    void LinkExts(void* ptr, uint32_t table_id);
+    void Reset();
+  };
+  extern HsaApiTable hsa_api_table_;
+  extern HsaApiTable hsa_internal_api_table_;
diff --git a/src/core/inc/hsa_ext_amd_impl.h b/src/core/inc/hsa_ext_amd_impl.h
new file mode 100755
index 000000000..54f8e3458
--- /dev/null
+++ b/src/core/inc/hsa_ext_amd_impl.h
@@ -0,0 +1,186 @@
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
+// Developed by:
+//                 AMD Research and AMD HSA Software Development
+//                 Advanced Micro Devices, Inc.
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// HSA AMD extension.
+#include "hsa.h"
+#include "hsa_ext_image.h"
+#include "hsa_ext_amd.h"
+// Wrap internal implementation inside AMD namespace
+namespace AMD {
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_coherency_get_type(hsa_agent_t agent,
+                                                hsa_amd_coherency_type_t* type);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_coherency_set_type(hsa_agent_t agent,
+                                                hsa_amd_coherency_type_t type);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_profiling_async_copy_enable(bool enable);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time(
+    hsa_agent_t agent, hsa_signal_t signal,
+    hsa_amd_profiling_dispatch_time_t* time);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_profiling_get_async_copy_time(
+    hsa_signal_t signal, hsa_amd_profiling_async_copy_time_t* time);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_profiling_convert_tick_to_system_domain(hsa_agent_t agent,
+                                                    uint64_t agent_tick,
+                                                    uint64_t* system_tick);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_signal_async_handler(hsa_signal_t signal,
+                                 hsa_signal_condition_t cond,
+                                 hsa_signal_value_t value,
+                                 hsa_amd_signal_handler handler, void* arg);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_async_function(void (*callback)(void* arg), void* arg);
+// Mirrors Amd Extension Apis
+uint32_t HSA_API
+    hsa_amd_signal_wait_any(uint32_t signal_count, hsa_signal_t* signals,
+                            hsa_signal_condition_t* conds,
+                            hsa_signal_value_t* values, uint64_t timeout_hint,
+                            hsa_wait_state_t wait_hint,
+                            hsa_signal_value_t* satisfying_value);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
+                                               uint32_t num_cu_mask_count,
+                                               const uint32_t* cu_mask);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool,
+                                 hsa_amd_memory_pool_info_t attribute,
+                                 void* value);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools(
+    hsa_agent_t agent,
+    hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void* data),
+    void* data);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, size_t size,
+                                 uint32_t flags, void** ptr);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent, const void* src,
+                              hsa_agent_t src_agent, size_t size,
+                              uint32_t num_dep_signals,
+                              const hsa_signal_t* dep_signals,
+                              hsa_signal_t completion_signal);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info(
+    hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
+    hsa_amd_agent_memory_pool_info_t attribute, void* value);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_agents_allow_access(uint32_t num_agents, const hsa_agent_t* agents,
+                                const uint32_t* flags, const void* ptr);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_memory_pool_can_migrate(hsa_amd_memory_pool_t src_memory_pool,
+                                    hsa_amd_memory_pool_t dst_memory_pool,
+                                    bool* result);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_memory_migrate(const void* ptr,
+                                            hsa_amd_memory_pool_t memory_pool,
+                                            uint32_t flags);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size,
+                                         hsa_agent_t* agents, int num_agent,
+                                         void** agent_ptr);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API
+    hsa_amd_memory_fill(void* ptr, uint32_t value, size_t count);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_interop_map_buffer(uint32_t num_agents,   
+                                        hsa_agent_t* agents,       
+                                        int interop_handle,    
+                                        uint32_t flags,        
+                                        size_t* size,          
+                                        void** ptr,            
+                                        size_t* metadata_size, 
+                                        const void** metadata);
+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_interop_unmap_buffer(void* ptr);
+}  // end of AMD namespace
+#endif  // header guard
diff --git a/src/core/inc/hsa_ext_interface.h b/src/core/inc/hsa_ext_interface.h
index 3645c23fb..236a165c7 100644
--- a/src/core/inc/hsa_ext_interface.h
+++ b/src/core/inc/hsa_ext_interface.h
@@ -52,27 +52,43 @@
 #include "core/util/utils.h"
 namespace core {
-struct ExtTableInternal : public ExtTable {
+struct ImageExtTableInternal : public ImageExtTable {
   decltype(::hsa_amd_image_get_info_max_dim)* hsa_amd_image_get_info_max_dim_fn;
-  decltype(::hsa_amd_image_create)* hsa_amd_image_create_fn;
 class ExtensionEntryPoints {
-  ExtTableInternal table;
+  // Table of function pointers for Hsa Extension Image
+  ImageExtTableInternal image_api;
+  // Table of function pointers for Hsa Extension Finalizer
+  FinalizerExtTable finalizer_api;
-  bool Load(std::string library_name);
+  bool LoadFinalizer(std::string library_name);
+  bool LoadImage(std::string library_name);
   void Unload();
-  typedef void (*Load_t)(const ::ApiTable* table);
+  typedef void (*Load_t)(const ::HsaApiTable* table);
   typedef void (*Unload_t)();
   std::vector<os::LibHandle> libs_;
-  void InitTable();
+  // Initialize table for HSA Finalizer Extension APIs
+  void InitFinalizerExtTable();
+  // Initialize table for HSA Image Extension APIs
+  void InitImageExtTable();
+  // Initialize the AMD Ext table entries for APIs related to images
+  void InitAmdExtTable();
+  // Update the AMD Ext table entries for APIs related to images
+  void UpdateAmdExtTable(void *func_ptr);
diff --git a/src/core/inc/hsa_table_interface.h b/src/core/inc/hsa_table_interface.h
index 236ef41c7..99a1280d8 100644
--- a/src/core/inc/hsa_table_interface.h
+++ b/src/core/inc/hsa_table_interface.h
@@ -42,6 +42,6 @@
 #include "hsa_api_trace.h"
-void hsa_table_interface_init(const ApiTable* table);
+void hsa_table_interface_init(const HsaApiTable* apiTable);
-const ApiTable* hsa_table_interface_get_table();
+const HsaApiTable* hsa_table_interface_get_table();
diff --git a/src/core/inc/interrupt_signal.h b/src/core/inc/interrupt_signal.h
index bef9564be..adbbb5070 100644
--- a/src/core/inc/interrupt_signal.h
+++ b/src/core/inc/interrupt_signal.h
@@ -165,10 +165,6 @@ class InterruptSignal : public Signal {
   /// @brief See base class Signal.
   __forceinline HsaEvent* EopEvent() { return event_; }
-  // TODO: work around for SDMA async copy. Bypass waiting on EOP
-  // event because SDMA copy does not handle interrupt yet.
-  __forceinline void DisableWaitEvent() { wait_on_event_ = false; }
   /// @brief prevent throwing exceptions
   void* operator new(size_t size) { return malloc(size); }
@@ -186,10 +182,6 @@ class InterruptSignal : public Signal {
   /// closes or not.
   bool free_event_;
-  // TODO: work around for SDMA async copy. Bypass waiting on EOP
-  // event because SDMA copy does not handle interrupt yet.
-  bool wait_on_event_;
   /// Used to obtain a globally unique value (address) for rtti.
   static int rtti_id_;
diff --git a/src/core/inc/runtime.h b/src/core/inc/runtime.h
index d3c6f8b9f..6d4554215 100644
--- a/src/core/inc/runtime.h
+++ b/src/core/inc/runtime.h
@@ -280,8 +280,6 @@ class Runtime {
   Agent* blit_agent() { return blit_agent_; }
-  Agent* host_agent() { return host_agent_; }
   const std::vector<const MemoryRegion*>& system_regions_fine() const {
     return system_regions_fine_;
@@ -455,9 +453,6 @@ class Runtime {
   // Deallocator using ::system_region_
   std::function<void(void*)> system_deallocator_;
-  // Pointer to a host/cpu agent object.
-  Agent* host_agent_;
   // Pointer to DMA agent.
   Agent* blit_agent_;
diff --git a/src/core/inc/signal.h b/src/core/inc/signal.h
index e6509421c..478034951 100644
--- a/src/core/inc/signal.h
+++ b/src/core/inc/signal.h
@@ -57,6 +57,7 @@
 #include "inc/amd_hsa_signal.h"
 namespace core {
+class Agent;
 class Signal;
 /// @brief Helper structure to simplify conversion of amd_signal_t and
@@ -75,7 +76,9 @@ class Signal : public Checked<0x71FCCA6A3D5D5276>,
   /// @brief Constructor initializes the signal with initial value.
   explicit Signal(hsa_signal_value_t initial_value)
-      : Shared(), signal_(shared_object()->amd_signal) {
+      : Shared(),
+        signal_(shared_object()->amd_signal),
+        async_copy_agent_(NULL) {
     if (!Shared::IsSharedObjectAllocationValid()) {
       invalid_ = true;
@@ -225,6 +228,12 @@ class Signal : public Checked<0x71FCCA6A3D5D5276>,
   /// @brief Checks if signal is currently in use by a wait API.
   bool InWaiting() const { return waiting_ != 0; }
+  __forceinline void async_copy_agent(core::Agent* agent) {
+    async_copy_agent_ = agent;
+  }
+  __forceinline core::Agent* async_copy_agent() { return async_copy_agent_; }
   /// @brief Structure which defines key signal elements like type and value.
   /// Address of this struct is used as a value for the opaque handle of type
   /// hsa_signal_t provided to the public API.
@@ -246,6 +255,9 @@ class Signal : public Checked<0x71FCCA6A3D5D5276>,
   volatile uint32_t retained_;
+  /// @brief Pointer to the agent used to perform an async copy.
+  core::Agent* async_copy_agent_;
diff --git a/src/core/runtime/amd_aql_queue.cpp b/src/core/runtime/amd_aql_queue.cpp
index 9bfa78045..3999fd556 100644
--- a/src/core/runtime/amd_aql_queue.cpp
+++ b/src/core/runtime/amd_aql_queue.cpp
@@ -64,6 +64,7 @@
 #include "core/util/utils.h"
 #include "core/inc/registers.h"
 #include "core/inc/interrupt_signal.h"
+#include "core/inc/hsa_ext_amd_impl.h"
 namespace amd {
 // Queue::amd_queue_ is cache-aligned for performance.
@@ -99,17 +100,13 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id,
-  hsa_status_t stat = agent_->GetInfo(HSA_AGENT_INFO_PROFILE, &agent_profile_);
-  assert(stat == HSA_STATUS_SUCCESS);
-  const core::Isa* isa = agent_->isa();
   // When queue_full_workaround_ is set to 1, the ring buffer is internally
   // doubled in size. Virtual addresses in the upper half of the ring allocation
   // are mapped to the same set of pages backing the lower half.
   // Values written to the HW doorbell are modulo the doubled size.
   // This allows the HW to accept (doorbell == last_doorbell + queue_size).
   // This workaround is required for GFXIP 7 and GFXIP 8 ASICs.
+  const core::Isa* isa = agent_->isa();
   queue_full_workaround_ =
       (isa->GetMajorVersion() == 7 || isa->GetMajorVersion() == 8)
           ? 1
@@ -177,7 +174,7 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id,
   const auto& props = agent->properties();
   amd_queue_.max_cu_id = (props.NumFComputeCores / props.NumSIMDPerCU) - 1;
-  amd_queue_.max_wave_id = props.MaxWavesPerSIMD - 1;
+  amd_queue_.max_wave_id = (props.MaxWavesPerSIMD * props.NumSIMDPerCU) - 1;
   AMD_HSA_BITS_SET(amd_queue_.queue_properties, AMD_QUEUE_PROPERTIES_IS_PTR64,
@@ -187,62 +184,8 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id,
-  // Populate scratch resource descriptor in amd_queue_.
-  SQ_BUF_RSRC_WORD0 srd0;
-  SQ_BUF_RSRC_WORD1 srd1;
-  SQ_BUF_RSRC_WORD2 srd2;
-  SQ_BUF_RSRC_WORD3 srd3;
-  uintptr_t scratch_base = uintptr_t(queue_scratch_.queue_base);
-  uint32_t scratch_base_hi = 0;
-  scratch_base_hi = uint32_t(scratch_base >> 32);
-  srd0.bits.BASE_ADDRESS = uint32_t(scratch_base);
-  srd1.bits.BASE_ADDRESS_HI = scratch_base_hi;
-  srd1.bits.STRIDE = 0;
-  srd1.bits.CACHE_SWIZZLE = 0;
-  srd1.bits.SWIZZLE_ENABLE = 1;
-  srd2.bits.NUM_RECORDS = uint32_t(queue_scratch_.size);
-  srd3.bits.DST_SEL_X = SQ_SEL_X;
-  srd3.bits.DST_SEL_Y = SQ_SEL_Y;
-  srd3.bits.DST_SEL_Z = SQ_SEL_Z;
-  srd3.bits.DST_SEL_W = SQ_SEL_W;
-  srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32;
-  srd3.bits.ELEMENT_SIZE = 1;  // 4
-  srd3.bits.INDEX_STRIDE = 3;  // 64
-  srd3.bits.ADD_TID_ENABLE = 1;
-  srd3.bits.ATC__CI__VI = (agent_profile_ == HSA_PROFILE_FULL) ? 1 : 0;
-  srd3.bits.HASH_ENABLE = 0;
-  srd3.bits.HEAP = 0;
-  srd3.bits.MTYPE__CI__VI = 0;
-  srd3.bits.TYPE = SQ_RSRC_BUF;
-  amd_queue_.scratch_resource_descriptor[0] = srd0.u32All;
-  amd_queue_.scratch_resource_descriptor[1] = srd1.u32All;
-  amd_queue_.scratch_resource_descriptor[2] = srd2.u32All;
-  amd_queue_.scratch_resource_descriptor[3] = srd3.u32All;
-  // Populate flat scratch parameters in amd_queue_.
-  amd_queue_.scratch_backing_memory_location =
-      queue_scratch_.queue_process_offset;
-  amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size;
-  amd_queue_.scratch_workitem_byte_size =
-      uint32_t(queue_scratch_.size_per_thread);
-  // Set concurrent wavefront limits when scratch is being used.
-  COMPUTE_TMPRING_SIZE tmpring_size = {0};
-  if (queue_scratch_.size != 0) {
-    tmpring_size.bits.WAVES =
-        (queue_scratch_.size / queue_scratch_.size_per_thread / 64);
-    tmpring_size.bits.WAVESIZE =
-        (((64 * queue_scratch_.size_per_thread) + 1023) / 1024);
-  }
-  amd_queue_.compute_tmpring_size = tmpring_size.u32All;
+  // Initialize scratch memory related entities
+  InitScratchSRD();
   // Set group and private memory apertures in amd_queue_.
   auto& regions = agent->regions();
@@ -307,7 +250,7 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id,
     auto signal = new core::InterruptSignal(0, queue_event_);
     amd_queue_.queue_inactive_signal = core::InterruptSignal::Convert(signal);
-    if (hsa_amd_signal_async_handler(
+    if (AMD::hsa_amd_signal_async_handler(
             amd_queue_.queue_inactive_signal, HSA_SIGNAL_CONDITION_NE, 0,
             DynamicScratchHandler, this) != HSA_STATUS_SUCCESS)
@@ -518,7 +461,7 @@ uint32_t AqlQueue::ComputeRingBufferMaxPkts() {
 void AqlQueue::AllocRegisteredRingBuffer(uint32_t queue_size_pkts) {
-  if (agent_profile_ == HSA_PROFILE_FULL) {
+  if (agent_->profile() == HSA_PROFILE_FULL) {
     // Compute the physical and virtual size of the queue.
     uint32_t ring_buf_phys_size_bytes =
         uint32_t(queue_size_pkts * sizeof(core::AqlPacket));
@@ -696,7 +639,7 @@ void AqlQueue::AllocRegisteredRingBuffer(uint32_t queue_size_pkts) {
 void AqlQueue::FreeRegisteredRingBuffer() {
-  if (agent_profile_ == HSA_PROFILE_FULL) {
+  if (agent_->profile() == HSA_PROFILE_FULL) {
 #ifdef __linux__
     munmap(ring_buf_, ring_buf_alloc_bytes_);
@@ -755,37 +698,8 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
       return false;
-    SQ_BUF_RSRC_WORD0 srd0;
-    SQ_BUF_RSRC_WORD2 srd2;
-    uintptr_t base = (uintptr_t)scratch.queue_base;
-    srd0.u32All = queue->amd_queue_.scratch_resource_descriptor[0];
-    srd2.u32All = queue->amd_queue_.scratch_resource_descriptor[2];
-    srd0.bits.BASE_ADDRESS = uint32_t(base);
-    srd2.bits.NUM_RECORDS = uint32_t(scratch.size);
-    queue->amd_queue_.scratch_resource_descriptor[0] = srd0.u32All;
-    queue->amd_queue_.scratch_resource_descriptor[2] = srd2.u32All;
-    SQ_BUF_RSRC_WORD1 srd1;
-    srd1.u32All = queue->amd_queue_.scratch_resource_descriptor[1];
-    srd1.bits.BASE_ADDRESS_HI = uint32_t(base >> 32);
-    queue->amd_queue_.scratch_resource_descriptor[1] = srd1.u32All;
-    queue->amd_queue_.scratch_backing_memory_location =
-        scratch.queue_process_offset;
-    queue->amd_queue_.scratch_backing_memory_byte_size = scratch.size;
-    queue->amd_queue_.scratch_workitem_byte_size =
-        uint32_t(scratch.size_per_thread);
-    COMPUTE_TMPRING_SIZE tmpring_size = {0};
-    tmpring_size.bits.WAVES = (scratch.size / scratch.size_per_thread / 64);
-    tmpring_size.bits.WAVESIZE =
-        (((64 * scratch.size_per_thread) + 1023) / 1024);
-    queue->amd_queue_.compute_tmpring_size = tmpring_size.u32All;
+    // Reset scratch memory related entities for the queue
+    queue->InitScratchSRD();
   } else if ((error_code & 2) == 2) {  // Invalid dim
@@ -853,4 +767,80 @@ hsa_status_t AqlQueue::SetCUMasking(const uint32_t num_cu_mask_count,
+// @brief Build the scratch buffer resource descriptor (SRD) and the related
+// amd_queue_ scratch fields from queue_scratch_ for kernel scratch access.
+void AqlQueue::InitScratchSRD() {
+  // Populate the four scratch resource descriptor words.
+  SQ_BUF_RSRC_WORD0 srd0;
+  SQ_BUF_RSRC_WORD1 srd1;
+  SQ_BUF_RSRC_WORD2 srd2;
+  SQ_BUF_RSRC_WORD3 srd3;
+  uint32_t scratch_base_hi = 0;
+  uintptr_t scratch_base = uintptr_t(queue_scratch_.queue_base);
+  scratch_base_hi = uint32_t(scratch_base >> 32);
+  #endif
+  srd0.bits.BASE_ADDRESS = uint32_t(scratch_base);
+  srd1.bits.BASE_ADDRESS_HI = scratch_base_hi;
+  srd1.bits.STRIDE = 0;
+  srd1.bits.CACHE_SWIZZLE = 0;
+  srd1.bits.SWIZZLE_ENABLE = 1;
+  srd2.bits.NUM_RECORDS = uint32_t(queue_scratch_.size);
+  srd3.bits.DST_SEL_X = SQ_SEL_X;
+  srd3.bits.DST_SEL_Y = SQ_SEL_Y;
+  srd3.bits.DST_SEL_Z = SQ_SEL_Z;
+  srd3.bits.DST_SEL_W = SQ_SEL_W;
+  srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32;
+  srd3.bits.ELEMENT_SIZE = 1;  // 4
+  srd3.bits.INDEX_STRIDE = 3;  // 64
+  srd3.bits.ADD_TID_ENABLE = 1;
+  srd3.bits.ATC__CI__VI = (agent_->profile() == HSA_PROFILE_FULL);
+  srd3.bits.HASH_ENABLE = 0;
+  srd3.bits.HEAP = 0;
+  srd3.bits.MTYPE__CI__VI = 0;
+  srd3.bits.TYPE = SQ_RSRC_BUF;
+  // Publish the descriptor words in amd_queue_.
+  amd_queue_.scratch_resource_descriptor[0] = srd0.u32All;
+  amd_queue_.scratch_resource_descriptor[1] = srd1.u32All;
+  amd_queue_.scratch_resource_descriptor[2] = srd2.u32All;
+  amd_queue_.scratch_resource_descriptor[3] = srd3.u32All;
+  // Populate flat scratch parameters in amd_queue_.
+  amd_queue_.scratch_backing_memory_location =
+      queue_scratch_.queue_process_offset;
+  amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size;
+  amd_queue_.scratch_workitem_byte_size =
+      uint32_t(queue_scratch_.size_per_thread);
+  // Set concurrent wavefront limits only when scratch is being used.
+  COMPUTE_TMPRING_SIZE tmpring_size = {0};
+  if (queue_scratch_.size == 0) {
+    amd_queue_.compute_tmpring_size = tmpring_size.u32All;
+    return;
+  }
+  // Upper bound on scratch waves: CU count times scratch slots per CU.
+  const auto& agent_props = agent_->properties();
+  uint32_t num_cus = agent_props.NumFComputeCores / agent_props.NumSIMDPerCU;
+  uint32_t max_scratch_waves = num_cus * agent_props.MaxSlotsScratchCU;
+  // COMPUTE_TMPRING_SIZE.WAVESIZE: scratch per wave in KB units, rounded up.
+  // NOTE(review): num_waves divides by WAVESIZE; assumes it is non-zero here.
+  uint32_t wave_size = agent_props.WaveFrontSize;
+  tmpring_size.bits.WAVESIZE =
+        (((wave_size * queue_scratch_.size_per_thread) + 1023) / 1024);
+  uint32_t num_waves = (queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024));
+  tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves);
+  amd_queue_.compute_tmpring_size = tmpring_size.u32All;
+  return;
 }  // namespace amd
diff --git a/src/core/runtime/amd_blit_kernel.cpp b/src/core/runtime/amd_blit_kernel.cpp
index a05aef536..aff60f08b 100644
--- a/src/core/runtime/amd_blit_kernel.cpp
+++ b/src/core/runtime/amd_blit_kernel.cpp
@@ -43,40 +43,486 @@
 #include "core/inc/amd_blit_kernel.h"
 #include <algorithm>
-#include <climits>
-#include <cmath>
-#include <cstring>
-#if defined(_WIN32) || defined(_WIN64)
-#define NOMINMAX
-#include <windows.h>
-#include <sys/mman.h>
-#include "core/inc/amd_blit_kernel_kv.h"
-#include "core/inc/amd_blit_kernel_vi.h"
+#include <sstream>
+#include <string>
 #include "core/inc/amd_gpu_agent.h"
 #include "core/inc/hsa_internal.h"
 #include "core/util/utils.h"
 namespace amd {
-const uint32_t BlitKernel::kGroupSize = 256;
-const size_t BlitKernel::kMaxCopyCount = AlignDown(UINT32_MAX, kGroupSize);
-const size_t BlitKernel::kMaxFillCount = AlignDown(UINT32_MAX, kGroupSize);
 static const uint16_t kInvalidPacketHeader = HSA_PACKET_TYPE_INVALID;
+static std::string kBlitKernelSource(R"(
+  // Compatibility function for GFXIP 7.
+  function s_load_dword_offset(byte_offset)
+    if kGFXIPVersion == 7
+      return byte_offset / 4
+    else
+      return byte_offset
+    end
+  end
+  // Memory copy for all cases except:
+  //  (src_addr & 0x3) != (dst_addr & 0x3)
+  //
+  // Kernel argument buffer:
+  //   [DW  0, 1]  Phase 1 src start address
+  //   [DW  2, 3]  Phase 1 dst start address
+  //   [DW  4, 5]  Phase 2 src start address
+  //   [DW  6, 7]  Phase 2 dst start address
+  //   [DW  8, 9]  Phase 3 src start address
+  //   [DW 10,11]  Phase 3 dst start address
+  //   [DW 12,13]  Phase 4 src start address
+  //   [DW 14,15]  Phase 4 dst start address
+  //   [DW 16,17]  Phase 4 src end address
+  //   [DW 18,19]  Phase 4 dst end address
+  //   [DW 20   ]  Total number of workitems
+  var kCopyAlignedVecWidth = 4
+  var kCopyAlignedUnroll = 1
+  shader CopyAligned
+    type(CS)
+    user_sgpr_count(2)
+    sgpr_count(32)
+    vgpr_count(8 + (kCopyAlignedUnroll * kCopyAlignedVecWidth))
+    // Retrieve kernel arguments.
+    s_load_dwordx4          s[4:7], s[0:1], s_load_dword_offset(0x0)
+    s_load_dwordx4          s[8:11], s[0:1], s_load_dword_offset(0x10)
+    s_load_dwordx4          s[12:15], s[0:1], s_load_dword_offset(0x20)
+    s_load_dwordx4          s[16:19], s[0:1], s_load_dword_offset(0x30)
+    s_load_dwordx4          s[20:23], s[0:1], s_load_dword_offset(0x40)
+    s_load_dword            s24, s[0:1], s_load_dword_offset(0x50)
+    s_waitcnt               lgkmcnt(0)
+    // Compute workitem id.
+    s_lshl_b32              s2, s2, 0x6
+    v_add_u32               v0, vcc, s2, v0
+    // =====================================================
+    // Phase 1: Byte copy up to 0x100 destination alignment.
+    // =====================================================
+    // Compute phase source address.
+    v_mov_b32               v3, s5
+    v_add_u32               v2, vcc, v0, s4
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+    // Compute phase destination address.
+    v_mov_b32               v5, s7
+    v_add_u32               v4, vcc, v0, s6
+    v_addc_u32              v5, vcc, v5, 0x0, vcc
+    // Mask off lanes (or branch out) after phase end.
+    v_cmp_lt_u64            vcc, v[2:3], s[8:9]
+    s_cbranch_vccz          L_COPY_ALIGNED_PHASE_1_DONE
+    s_and_b64               exec, exec, vcc
+    // Load from/advance the source address.
+    flat_load_ubyte         v1, v[2:3]
+    s_waitcnt               vmcnt(0)
+    v_add_u32               v2, vcc, v2, s24
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+    // Write to/advance the destination address.
+    flat_store_byte         v[4:5], v1
+    v_add_u32               v4, vcc, v4, s24
+    v_addc_u32              v5, vcc, v5, 0x0, vcc
+    // Repeat until branched out.
+    s_branch                L_COPY_ALIGNED_PHASE_1_LOOP
+    // Restore EXEC mask for all lanes.
+    s_mov_b64               exec, 0xFFFFFFFFFFFFFFFF
+    // ========================================================
+    // Phase 2: Unrolled dword[x4] copy up to last whole block.
+    // ========================================================
+    // Compute unrolled dword[x4] stride across all threads.
+    if kCopyAlignedVecWidth == 4
+      s_lshl_b32            s25, s24, 0x4
+    else
+      s_lshl_b32            s25, s24, 0x2
+    end
+    // Compute phase source address.
+    if kCopyAlignedVecWidth == 4
+      v_lshlrev_b32         v1, 0x4, v0
+    else
+      v_lshlrev_b32         v1, 0x2, v0
+    end
+    v_mov_b32               v3, s9
+    v_add_u32               v2, vcc, v1, s8
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+    // Compute phase destination address.
+    v_mov_b32               v5, s11
+    v_add_u32               v4, vcc, v1, s10
+    v_addc_u32              v5, vcc, v5, 0x0, vcc
+    // Branch out after phase end.
+    v_cmp_lt_u64            vcc, v[2:3], s[12:13]
+    s_cbranch_vccz          L_COPY_ALIGNED_PHASE_2_DONE
+    // Load from/advance the source address.
+    for var i = 0; i < kCopyAlignedUnroll; i ++
+      if kCopyAlignedVecWidth == 4
+        flat_load_dwordx4   v[8 + (i * 4)], v[2:3]
+      else
+        flat_load_dword     v[8 + i], v[2:3]
+      end
+      v_add_u32             v2, vcc, v2, s25
+      v_addc_u32            v3, vcc, v3, 0x0, vcc
+    end
+    // Write to/advance the destination address.
+    s_waitcnt               vmcnt(0)
+    for var i = 0; i < kCopyAlignedUnroll; i ++
+      if kCopyAlignedVecWidth == 4
+        flat_store_dwordx4  v[4:5], v[8 + (i * 4)]
+      else
+        flat_store_dword    v[4:5], v[8 + i]
+      end
+      v_add_u32             v4, vcc, v4, s25
+      v_addc_u32            v5, vcc, v5, 0x0, vcc
+    end
+    // Repeat until branched out.
+    s_branch                L_COPY_ALIGNED_PHASE_2_LOOP
+    // ===========================================
+    // Phase 3: Dword copy up to last whole dword.
+    // ===========================================
+    // Compute dword stride across all threads.
+    s_lshl_b32              s25, s24, 0x2
+    // Compute phase source address.
+    v_lshlrev_b32           v1, 0x2, v0
+    v_mov_b32               v3, s13
+    v_add_u32               v2, vcc, v1, s12
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+    // Compute phase destination address.
+    v_mov_b32               v5, s15
+    v_add_u32               v4, vcc, v1, s14
+    v_addc_u32              v5, vcc, v5, 0x0, vcc
+    // Mask off lanes (or branch out) after phase end.
+    v_cmp_lt_u64            vcc, v[2:3], s[16:17]
+    s_cbranch_vccz          L_COPY_ALIGNED_PHASE_3_DONE
+    s_and_b64               exec, exec, vcc
+    // Load from/advance the source address.
+    flat_load_dword         v1, v[2:3]
+    v_add_u32               v2, vcc, v2, s25
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+    s_waitcnt               vmcnt(0)
+    // Write to/advance the destination address.
+    flat_store_dword        v[4:5], v1
+    v_add_u32               v4, vcc, v4, s25
+    v_addc_u32              v5, vcc, v5, 0x0, vcc
+    // Repeat until branched out.
+    s_branch                L_COPY_ALIGNED_PHASE_3_LOOP
+    // Restore EXEC mask for all lanes.
+    s_mov_b64               exec, 0xFFFFFFFFFFFFFFFF
+    // =============================
+    // Phase 4: Byte copy up to end.
+    // =============================
+    // Compute phase source address.
+    v_mov_b32               v3, s17
+    v_add_u32               v2, vcc, v0, s16
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+    // Compute phase destination address.
+    v_mov_b32               v5, s19
+    v_add_u32               v4, vcc, v0, s18
+    v_addc_u32              v5, vcc, v5, 0x0, vcc
+    // Mask off lanes (or branch out) after phase end.
+    v_cmp_lt_u64            vcc, v[2:3], s[20:21]
+    s_cbranch_vccz          L_COPY_ALIGNED_PHASE_4_DONE
+    s_and_b64               exec, exec, vcc
+    // Load from the source address.
+    flat_load_ubyte         v1, v[2:3]
+    s_waitcnt               vmcnt(0)
+    // Write to the destination address.
+    flat_store_byte         v[4:5], v1
+    s_endpgm
+  end
+  // Memory copy for this case:
+  //  (src_addr & 0x3) != (dst_addr & 0x3)
+  //
+  // Kernel argument buffer:
+  //   [DW  0, 1]  Phase 1 src start address
+  //   [DW  2, 3]  Phase 1 dst start address
+  //   [DW  4, 5]  Phase 2 src start address
+  //   [DW  6, 7]  Phase 2 dst start address
+  //   [DW  8, 9]  Phase 2 src end address
+  //   [DW 10,11]  Phase 2 dst end address
+  //   [DW 12   ]  Total number of workitems
+  var kCopyMisalignedUnroll = 4
+  shader CopyMisaligned
+    type(CS)
+    user_sgpr_count(2)
+    sgpr_count(23)
+    vgpr_count(6 + kCopyMisalignedUnroll)
+    // Retrieve kernel arguments.
+    s_load_dwordx4          s[4:7], s[0:1], s_load_dword_offset(0x0)
+    s_load_dwordx4          s[8:11], s[0:1], s_load_dword_offset(0x10)
+    s_load_dwordx4          s[12:15], s[0:1], s_load_dword_offset(0x20)
+    s_load_dword            s16, s[0:1], s_load_dword_offset(0x30)
+    s_waitcnt               lgkmcnt(0)
+    // Compute workitem id.
+    s_lshl_b32              s2, s2, 0x6
+    v_add_u32               v0, vcc, s2, v0
+    // ===================================================
+    // Phase 1: Unrolled byte copy up to last whole block.
+    // ===================================================
+    // Compute phase source address.
+    v_mov_b32               v3, s5
+    v_add_u32               v2, vcc, v0, s4
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+    // Compute phase destination address.
+    v_mov_b32               v5, s7
+    v_add_u32               v4, vcc, v0, s6
+    v_addc_u32              v5, vcc, v5, 0x0, vcc
+    // Branch out after phase end.
+    v_cmp_lt_u64            vcc, v[2:3], s[8:9]
+    s_cbranch_vccz          L_COPY_MISALIGNED_PHASE_1_DONE
+    // Load from/advance the source address.
+    for var i = 0; i < kCopyMisalignedUnroll; i ++
+      flat_load_ubyte       v[6 + i], v[2:3]
+      v_add_u32             v2, vcc, v2, s16
+      v_addc_u32            v3, vcc, v3, 0x0, vcc
+    end
+    // Write to/advance the destination address.
+    s_waitcnt               vmcnt(0)
+    for var i = 0; i < kCopyMisalignedUnroll; i ++
+      flat_store_byte       v[4:5], v[6 + i]
+      v_add_u32             v4, vcc, v4, s16
+      v_addc_u32            v5, vcc, v5, 0x0, vcc
+    end
+    // Repeat until branched out.
+    s_branch                L_COPY_MISALIGNED_PHASE_1_LOOP
+    // =============================
+    // Phase 2: Byte copy up to end.
+    // =============================
+    // Compute phase source address.
+    v_mov_b32               v3, s9
+    v_add_u32               v2, vcc, v0, s8
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+    // Compute phase destination address.
+    v_mov_b32               v5, s11
+    v_add_u32               v4, vcc, v0, s10
+    v_addc_u32              v5, vcc, v5, 0x0, vcc
+    // Mask off lanes (or branch out) after phase end.
+    v_cmp_lt_u64            vcc, v[2:3], s[12:13]
+    s_cbranch_vccz          L_COPY_MISALIGNED_PHASE_2_DONE
+    s_and_b64               exec, exec, vcc
+    // Load from/advance the source address.
+    flat_load_ubyte         v1, v[2:3]
+    v_add_u32               v2, vcc, v2, s16
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+    s_waitcnt               vmcnt(0)
+    // Write to/advance the destination address.
+    flat_store_byte         v[4:5], v1
+    v_add_u32               v4, vcc, v4, s16
+    v_addc_u32              v5, vcc, v5, 0x0, vcc
+    // Repeat until branched out.
+    s_branch                L_COPY_MISALIGNED_PHASE_2_LOOP
+    s_endpgm
+  end
+  // Memory fill for dword-aligned region.
+  //
+  // Kernel argument buffer:
+  //   [DW  0, 1]  Phase 1 dst start address
+  //   [DW  2, 3]  Phase 2 dst start address
+  //   [DW  4, 5]  Phase 2 dst end address
+  //   [DW  6   ]  Value to fill memory with
+  //   [DW  7   ]  Total number of workitems
+  var kFillVecWidth = 4
+  var kFillUnroll = 1
+  shader Fill
+    type(CS)
+    user_sgpr_count(2)
+    sgpr_count(19)
+    vgpr_count(8)
+    // Retrieve kernel arguments.
+    s_load_dwordx4          s[4:7], s[0:1], s_load_dword_offset(0x0)
+    s_load_dwordx4          s[8:11], s[0:1], s_load_dword_offset(0x10)
+    s_waitcnt               lgkmcnt(0)
+    // Compute workitem id.
+    s_lshl_b32              s2, s2, 0x6
+    v_add_u32               v0, vcc, s2, v0
+    // Copy fill pattern into VGPRs.
+    for var i = 0; i < kFillVecWidth; i ++
+      v_mov_b32           v[4 + i], s10
+    end
+    // ========================================================
+    // Phase 1: Unrolled dword[x4] fill up to last whole block.
+    // ========================================================
+    // Compute unrolled dword[x4] stride across all threads.
+    if kFillVecWidth == 4
+      s_lshl_b32            s12, s11, 0x4
+    else
+      s_lshl_b32            s12, s11, 0x2
+    end
+    // Compute phase destination address.
+    if kFillVecWidth == 4
+      v_lshlrev_b32         v1, 0x4, v0
+    else
+      v_lshlrev_b32         v1, 0x2, v0
+    end
+    v_mov_b32               v3, s5
+    v_add_u32               v2, vcc, v1, s4
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+    // Branch out after phase end.
+    v_cmp_lt_u64            vcc, v[2:3], s[6:7]
+    s_cbranch_vccz          L_FILL_PHASE_1_DONE
+    // Write to/advance the destination address.
+    for var i = 0; i < kFillUnroll; i ++
+      if kFillVecWidth == 4
+        flat_store_dwordx4  v[2:3], v[4:7]
+      else
+        flat_store_dword    v[2:3], v4
+      end
+      v_add_u32             v2, vcc, v2, s12
+      v_addc_u32            v3, vcc, v3, 0x0, vcc
+    end
+    // Repeat until branched out.
+    s_branch                L_FILL_PHASE_1_LOOP
+    // ==============================
+    // Phase 2: Dword fill up to end.
+    // ==============================
+    // Compute dword stride across all threads.
+    s_lshl_b32              s12, s11, 0x2
+    // Compute phase destination address.
+    v_lshlrev_b32           v1, 0x2, v0
+    v_mov_b32               v3, s7
+    v_add_u32               v2, vcc, v1, s6
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+    // Mask off lanes (or branch out) after phase end.
+    v_cmp_lt_u64            vcc, v[2:3], s[8:9]
+    s_cbranch_vccz          L_FILL_PHASE_2_DONE
+    s_and_b64               exec, exec, vcc
+    // Write to/advance the destination address.
+    flat_store_dword        v[2:3], v4
+    v_add_u32               v2, vcc, v2, s12
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+    // Repeat until branched out.
+    s_branch                L_FILL_PHASE_2_LOOP
+    s_endpgm
+  end
+// Looks up a `var <paramName> = <value>` definition inside kBlitKernelSource
+// and returns <value> parsed as an integer via std::stoi.
+//
+// paramName: name of the shader-source variable, e.g. "kFillUnroll".
+// Returns:   the integer that follows "var <paramName> = " on that line.
+//
+// Both the definition and the newline terminating it must be present; this is
+// enforced with assert() only, so builds with NDEBUG do not check it.
+// std::stoi throws if the text after " = " does not start with a number.
+int GetKernelSourceParam(const char* paramName) {
+  std::stringstream paramDef;
+  paramDef << "var " << paramName << " = ";
+  const std::string paramDefStr = paramDef.str();
+
+  std::string::size_type paramDefLoc = kBlitKernelSource.find(paramDefStr);
+  assert(paramDefLoc != std::string::npos);
+
+  // The value runs from just past " = " up to the end of that line.
+  std::string::size_type paramValLoc = paramDefLoc + paramDefStr.size();
+  std::string::size_type paramEndLoc =
+      kBlitKernelSource.find('\n', paramDefLoc);
+  // Validate the newline search itself (not paramDefLoc a second time): an
+  // npos result would otherwise be used as an index when slicing the value.
+  assert(paramEndLoc != std::string::npos);
+
+  std::string paramVal(&kBlitKernelSource[paramValLoc],
+                       &kBlitKernelSource[paramEndLoc]);
+  return std::stoi(paramVal);
+// Host-side copies of the shader tuning variables. Each value is obtained by
+// calling GetKernelSourceParam() during static initialization, i.e. it is
+// parsed out of the matching "var <name> = N" line of kBlitKernelSource
+// rather than being hard-coded a second time here.
+// NOTE(review): this depends on kBlitKernelSource being initialized before
+// these statics run — confirm it is defined earlier in this translation unit.
+static int kCopyAlignedVecWidth = GetKernelSourceParam("kCopyAlignedVecWidth");
+static int kCopyAlignedUnroll = GetKernelSourceParam("kCopyAlignedUnroll");
+static int kCopyMisalignedUnroll = GetKernelSourceParam("kCopyMisalignedUnroll");
+static int kFillVecWidth = GetKernelSourceParam("kFillVecWidth");
+static int kFillUnroll = GetKernelSourceParam("kFillUnroll");
     : core::Blit(),
-      copy_code_handle_(0),
-      fill_code_handle_(0),
-      code_arg_buffer_(NULL) {
+      num_cus_(0) {
   completion_signal_.handle = 0;
@@ -96,150 +542,62 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) {
     return HSA_STATUS_ERROR;
-  // Need queue buffer that can cover the max size of local memory.
-  const uint64_t kGpuVmVaSize = 1ULL << 40;
-  const uint32_t kRequiredQueueSize = NextPow2(static_cast<uint32_t>(
-      std::ceil(static_cast<double>(kGpuVmVaSize) / kMaxCopyCount)));
-  uint32_t max_queue_size = 0;
-  status = HSA::hsa_agent_get_info(agent_handle, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
-                                   &max_queue_size);
+  status = HSA::hsa_queue_create(agent_handle, 1024, HSA_QUEUE_TYPE_MULTI, NULL,
+                                 NULL, 0, 0, &queue_);
   if (HSA_STATUS_SUCCESS != status) {
     return status;
-  if (max_queue_size < kRequiredQueueSize) {
-  }
+  queue_bitmask_ = queue_->size - 1;
-  status =
-      HSA::hsa_queue_create(agent_handle, kRequiredQueueSize,
-                            HSA_QUEUE_TYPE_MULTI, NULL, NULL, 0, 0, &queue_);
+  cached_index_ = 0;
+  status = HSA::hsa_signal_create(1, 0, NULL, &completion_signal_);
   if (HSA_STATUS_SUCCESS != status) {
     return status;
-  queue_bitmask_ = queue_->size - 1;
-  cached_index_ = 0;
-  void* copy_raw_obj_mem = NULL;
-  size_t copy_akc_size = 0;
-  size_t copy_akc_offset = 0;
-  void* copy_aligned_raw_obj_mem = NULL;
-  size_t copy_aligned_akc_size = 0;
-  size_t copy_aligned_akc_offset = 0;
-  void* fill_raw_obj_mem = NULL;
-  size_t fill_akc_size = 0;
-  size_t fill_akc_offset = 0;
-  switch (agent.isa()->GetMajorVersion()) {
-    case 7:
-      copy_raw_obj_mem = kVectorCopyKvObject;
-      copy_akc_size = HSA_VECTOR_COPY_KV_AKC_SIZE;
-      copy_akc_offset = HSA_VECTOR_COPY_KV_AKC_OFFSET;
+  kernarg_async_ = reinterpret_cast<KernelArgs*>(
+      core::Runtime::runtime_singleton_->system_allocator()(
+          queue_->size * AlignUp(sizeof(KernelArgs), 16), 16));
-      copy_aligned_raw_obj_mem = kVectorCopyAlignedKvObject;
-      copy_aligned_akc_size = HSA_VECTOR_COPY_ALIGNED_KV_AKC_SIZE;
-      copy_aligned_akc_offset = HSA_VECTOR_COPY_ALIGNED_KV_AKC_OFFSET;
+  kernarg_async_mask_ = queue_->size - 1;
-      fill_raw_obj_mem = kFillMemoryKvObject;
-      fill_akc_size = HSA_FILL_MEMORY_KV_AKC_SIZE;
-      fill_akc_offset = HSA_FILL_MEMORY_KV_AKC_OFFSET;
-      break;
-    case 8:
-      copy_raw_obj_mem = kVectorCopyViObject;
-      copy_akc_size = HSA_VECTOR_COPY_VI_AKC_SIZE;
-      copy_akc_offset = HSA_VECTOR_COPY_VI_AKC_OFFSET;
-      copy_aligned_raw_obj_mem = kVectorCopyAlignedViObject;
-      copy_aligned_akc_size = HSA_VECTOR_COPY_ALIGNED_VI_AKC_SIZE;
-      copy_aligned_akc_offset = HSA_VECTOR_COPY_ALIGNED_VI_AKC_OFFSET;
-      fill_raw_obj_mem = kFillMemoryViObject;
-      fill_akc_size = HSA_FILL_MEMORY_VI_AKC_SIZE;
-      fill_akc_offset = HSA_FILL_MEMORY_VI_AKC_OFFSET;
-      break;
-    default:
-      assert(false && "Only gfx7 and gfx8 are supported");
-      break;
-  }
+  // Obtain the number of compute units in the underlying agent.
+  const GpuAgent& gpuAgent = static_cast<const GpuAgent&>(agent);
+  num_cus_ = / 4;
-  const size_t total_alloc_size = AlignUp(
-      AlignUp(copy_akc_size, 256) + AlignUp(copy_aligned_akc_size, 256) +
-          AlignUp(fill_akc_size, 256),
-      4096);
-  amd_kernel_code_t *code_ptr = nullptr;
-  code_arg_buffer_ = core::Runtime::runtime_singleton_->system_allocator()(
-      total_alloc_size, 4096);
-  char* akc_arg = reinterpret_cast<char*>(code_arg_buffer_);
-  memcpy(akc_arg,
-         reinterpret_cast<const char*>(copy_raw_obj_mem) + copy_akc_offset,
-         copy_akc_size);
-  copy_code_handle_ = reinterpret_cast<uint64_t>(akc_arg);
-  code_ptr = (amd_kernel_code_t*)(copy_code_handle_);
-  code_ptr->runtime_loader_kernel_symbol = 0;
-  akc_arg += copy_akc_size;
-  akc_arg = AlignUp(akc_arg, 256);
-  memcpy(akc_arg, reinterpret_cast<const char*>(copy_aligned_raw_obj_mem) +
-                      copy_aligned_akc_offset,
-         copy_aligned_akc_size);
-  copy_aligned_code_handle_ = reinterpret_cast<uint64_t>(akc_arg);
-  code_ptr = (amd_kernel_code_t*)(copy_aligned_code_handle_);
-  code_ptr->runtime_loader_kernel_symbol = 0;
-  akc_arg += copy_aligned_akc_size;
-  akc_arg = AlignUp(akc_arg, 256);
-  memcpy(akc_arg,
-         reinterpret_cast<const char*>(fill_raw_obj_mem) + fill_akc_offset,
-         fill_akc_size);
-  fill_code_handle_ = reinterpret_cast<uint64_t>(akc_arg);
-  code_ptr = (amd_kernel_code_t*)(fill_code_handle_);
-  code_ptr->runtime_loader_kernel_symbol = 0;
-  akc_arg += fill_akc_size;
+  // Assemble shaders to AQL code objects.
+  std::map<KernelType, const char*> kernel_names = {
+      {KernelType::CopyAligned, "CopyAligned"},
+      {KernelType::CopyMisaligned, "CopyMisaligned"},
+      {KernelType::Fill, "Fill"}};
-  status = HSA::hsa_signal_create(1, 0, NULL, &completion_signal_);
-  if (HSA_STATUS_SUCCESS != status) {
-    return status;
+  for (auto kernel_name : kernel_names) {
+    KernelCode& kernel = kernels_[kernel_name.first];
+    gpuAgent.AssembleShader(kBlitKernelSource.c_str(), kernel_name.second,
+                            GpuAgent::AssembleTarget::AQL, kernel.code_buf_,
+                            kernel.code_buf_size_);
-  kernarg_async_ = reinterpret_cast<KernelArgs*>(
-      core::Runtime::runtime_singleton_->system_allocator()(
-          kRequiredQueueSize * AlignUp(sizeof(KernelArgs), 16), 16));
-  kernarg_async_mask_ = kRequiredQueueSize - 1;
-  // TODO: remove this code when execute permission level is not mandatory.
-  if (((amd::GpuAgent&)agent).profile() == HSA_PROFILE_FULL) {
-#if defined(_WIN32) || defined(_WIN64)
-#define NOMINMAX
-    DWORD old_protect = 0;
-    const DWORD new_protect = PAGE_EXECUTE_READWRITE;
-    if (!VirtualProtect(code_arg_buffer_, total_alloc_size, new_protect,
-                        &old_protect)) {
-    }
-    if (0 != mprotect(code_arg_buffer_, total_alloc_size,
-                      PROT_READ | PROT_WRITE | PROT_EXEC)) {
-    }
+  if (agent.profiling_enabled()) {
+    return EnableProfiling(true);
-hsa_status_t BlitKernel::Destroy(void) {
+hsa_status_t BlitKernel::Destroy(const core::Agent& agent) {
   std::lock_guard<std::mutex> guard(lock_);
+  const GpuAgent& gpuAgent = static_cast<const GpuAgent&>(agent);
+  for (auto kernel_pair : kernels_) {
+    gpuAgent.ReleaseShader(kernel_pair.second.code_buf_,
+                           kernel_pair.second.code_buf_size_);
+  }
   if (queue_ != NULL) {
@@ -248,10 +606,6 @@ hsa_status_t BlitKernel::Destroy(void) {
-  if (code_arg_buffer_ != NULL) {
-    core::Runtime::runtime_singleton_->system_deallocator()(code_arg_buffer_);
-  }
   if (completion_signal_.handle != 0) {
@@ -259,11 +613,6 @@ hsa_status_t BlitKernel::Destroy(void) {
-static bool IsSystemMemory(void* address) {
-  static const uint64_t kLimitSystem = 1ULL << 48;
-  return (reinterpret_cast<uint64_t>(address) < kLimitSystem);
 hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src,
                                                  size_t size) {
   // Protect completion_signal_.
@@ -294,35 +643,14 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src,
 hsa_status_t BlitKernel::SubmitLinearCopyCommand(
     void* dst, const void* src, size_t size,
     std::vector<core::Signal*>& dep_signals, core::Signal& out_signal) {
-  assert(copy_code_handle_ != 0);
-  const size_t kAlignmentChar = 1;
-  const size_t kAlignmentUin32 = 4;
-  const size_t kAlignmentVec4 = 16;
-  const size_t copy_granule =
-      (IsMultipleOf(dst, kAlignmentVec4) && IsMultipleOf(src, kAlignmentVec4) &&
-       IsMultipleOf(size, kAlignmentVec4))
-          ? kAlignmentVec4
-          : (IsMultipleOf(dst, kAlignmentUin32) &&
-             IsMultipleOf(src, kAlignmentUin32) &&
-             IsMultipleOf(size, kAlignmentUin32))
-                ? kAlignmentUin32
-                : kAlignmentChar;
-  size = size / copy_granule;
-  const uint32_t num_copy_packet = static_cast<uint32_t>(
-      std::ceil(static_cast<double>(size) / kMaxCopyCount));
-  const uint32_t num_barrier_packet =
-      static_cast<uint32_t>(std::ceil(dep_signals.size() / 5.0f));
-  // Reserve write index for copy + fence packet.
-  const uint32_t total_num_packet = num_barrier_packet + num_copy_packet;
+  // Reserve write index for barrier(s) + dispatch packet.
+  const uint32_t num_barrier_packet = uint32_t((dep_signals.size() + 4) / 5);
+  const uint32_t total_num_packet = num_barrier_packet + 1;
   uint64_t write_index = AcquireWriteIndex(total_num_packet);
   uint64_t write_index_temp = write_index;
+  // Insert barrier packets to handle dependent signals.
   const uint16_t kBarrierPacketHeader =
@@ -352,99 +680,116 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(
-  const uint32_t last_copy_index = num_copy_packet - 1;
-  size_t total_copy_count = 0;
-  for (uint32_t i = 0; i < num_copy_packet; ++i) {
-    // Setup arguments.
-    const uint32_t copy_count = static_cast<uint32_t>(
-        std::min((size - total_copy_count), kMaxCopyCount));
-    void* cur_dst = static_cast<char*>(dst) + (total_copy_count * copy_granule);
-    const void* cur_src =
-        static_cast<const char*>(src) + (total_copy_count * copy_granule);
-    KernelArgs* args = ObtainAsyncKernelCopyArg();
-    assert(args != NULL);
-    assert(IsMultipleOf(&args->copy, 16));
-    args->copy.src = cur_src;
-    args->copy.dst = cur_dst;
-    args->copy.size = copy_count;
-    args->copy.use_vector = (copy_granule == kAlignmentVec4) ? 1 : 0;
-    const uint32_t grid_size_x =
-        AlignUp(static_cast<uint32_t>(copy_count), kGroupSize);
-    // This assert to make sure kMaxCopySize is not changed to a number that
-    // could cause overflow to packet.grid_size_x.
-    assert(grid_size_x >= copy_count);
-    hsa_signal_t signal = {(i == last_copy_index)
-                               ? (core::Signal::Convert(&out_signal)).handle
-                               : 0};
-    PopulateQueue(write_index, ((copy_granule == kAlignmentChar)
-                                    ? copy_code_handle_
-                                    : copy_aligned_code_handle_),
-                  args, grid_size_x, signal);
-    ++write_index;
-    total_copy_count += copy_count;
+  // Insert dispatch packet for copy kernel.
+  KernelArgs* args = ObtainAsyncKernelCopyArg();
+  KernelCode* kernel_code = nullptr;
+  int num_workitems = 0;
+  bool aligned = ((uintptr_t(src) & 0x3) == (uintptr_t(dst) & 0x3));
+  if (aligned) {
+    // Use dword-based aligned kernel.
+    kernel_code = &kernels_[KernelType::CopyAligned];
+    // Compute the size of each copy phase.
+    num_workitems = 64 * 4 * num_cus_;
+    // Phase 1 (byte copy) ends when destination is 0x100-aligned.
+    uintptr_t src_start = uintptr_t(src);
+    uintptr_t dst_start = uintptr_t(dst);
+    uint64_t phase1_size =
+        std::min(size, uint64_t(0x100 - (dst_start & 0xFF)) & 0xFF);
+    // Phase 2 (unrolled dwordx4 copy) ends when last whole block fits.
+    uint64_t phase2_block = num_workitems * sizeof(uint32_t) *
+                            kCopyAlignedUnroll * kCopyAlignedVecWidth;
+    uint64_t phase2_size = ((size - phase1_size) / phase2_block) * phase2_block;
+    // Phase 3 (dword copy) ends when last whole dword fits.
+    uint64_t phase3_size =
+        ((size - phase1_size - phase2_size) / sizeof(uint32_t)) *
+        sizeof(uint32_t);
+    args->copy_aligned.phase1_src_start = src_start;
+    args->copy_aligned.phase1_dst_start = dst_start;
+    args->copy_aligned.phase2_src_start = src_start + phase1_size;
+    args->copy_aligned.phase2_dst_start = dst_start + phase1_size;
+    args->copy_aligned.phase3_src_start = src_start + phase1_size + phase2_size;
+    args->copy_aligned.phase3_dst_start = dst_start + phase1_size + phase2_size;
+    args->copy_aligned.phase4_src_start =
+        src_start + phase1_size + phase2_size + phase3_size;
+    args->copy_aligned.phase4_dst_start =
+        dst_start + phase1_size + phase2_size + phase3_size;
+    args->copy_aligned.phase4_src_end = src_start + size;
+    args->copy_aligned.phase4_dst_end = dst_start + size;
+    args->copy_aligned.num_workitems = num_workitems;
+  } else {
+    // Use byte-based misaligned kernel.
+    kernel_code = &kernels_[KernelType::CopyMisaligned];
+    // Compute the size of each copy phase.
+    num_workitems = 64 * 4 * num_cus_;
+    // Phase 1 (unrolled byte copy) ends when last whole block fits.
+    uintptr_t src_start = uintptr_t(src);
+    uintptr_t dst_start = uintptr_t(dst);
+    uint64_t phase1_block =
+        num_workitems * sizeof(uint8_t) * kCopyMisalignedUnroll;
+    uint64_t phase1_size = (size / phase1_block) * phase1_block;
+    args->copy_misaligned.phase1_src_start = src_start;
+    args->copy_misaligned.phase1_dst_start = dst_start;
+    args->copy_misaligned.phase2_src_start = src_start + phase1_size;
+    args->copy_misaligned.phase2_dst_start = dst_start + phase1_size;
+    args->copy_misaligned.phase2_src_end = src_start + size;
+    args->copy_misaligned.phase2_dst_end = dst_start + size;
+    args->copy_misaligned.num_workitems = num_workitems;
-  // Launch copy packet.
+  hsa_signal_t signal = {(core::Signal::Convert(&out_signal)).handle};
+  PopulateQueue(write_index, uintptr_t(kernel_code->code_buf_), args,
+                num_workitems, signal);
+  // Submit barrier(s) and dispatch packets.
   ReleaseWriteIndex(write_index_temp, total_num_packet);
 hsa_status_t BlitKernel::SubmitLinearFillCommand(void* ptr, uint32_t value,
-                                                 size_t num) {
-  assert(fill_code_handle_ != 0);
+                                                 size_t count) {
   std::lock_guard<std::mutex> guard(lock_);
-  HSA::hsa_signal_store_relaxed(completion_signal_, 1);
-  const uint32_t num_fill_packet = static_cast<uint32_t>(
-      std::ceil(static_cast<double>(num) / kMaxFillCount));
-  // Reserve write index for copy + fence packet.
-  uint64_t write_index = AcquireWriteIndex(num_fill_packet);
-  const uint32_t last_fill_index = num_fill_packet - 1;
-  size_t total_fill_count = 0;
-  for (uint32_t i = 0; i < num_fill_packet; ++i) {
-    // Setup arguments.
-    const uint32_t fill_count = static_cast<uint32_t>(
-        std::min((num - total_fill_count), kMaxFillCount));
-    void* cur_ptr = static_cast<char*>(ptr) + total_fill_count;
-    KernelArgs* args = ObtainAsyncKernelCopyArg();
-    assert(args != NULL);
-    assert(IsMultipleOf(&args->fill, 16));
+  // Reject misaligned base address.
+  if ((uintptr_t(ptr) & 0x3) != 0) {
+    return HSA_STATUS_ERROR;
+  }
-    args->fill.ptr = cur_ptr;
-    args->fill.num = fill_count;
-    args->fill.value = value;
+  // Compute the size of each fill phase.
+  int num_workitems = 64 * num_cus_;
-    const uint32_t grid_size_x =
-        AlignUp(static_cast<uint32_t>(fill_count), kGroupSize);
+  // Phase 1 (unrolled dwordx4 fill) ends when last whole block fits.
+  uintptr_t dst_start = uintptr_t(ptr);
+  uint64_t fill_size = count * sizeof(uint32_t);
-    // This assert to make sure kMaxFillCount is not changed to a number that
-    // could cause overflow to packet.grid_size_x.
-    assert(grid_size_x >= fill_count);
+  uint64_t phase1_block =
+      num_workitems * sizeof(uint32_t) * kFillUnroll * kFillVecWidth;
+  uint64_t phase1_size = (fill_size / phase1_block) * phase1_block;
-    hsa_signal_t signal = {(i == last_fill_index) ? completion_signal_.handle
-                                                  : 0};
-    PopulateQueue(write_index + i, fill_code_handle_, &args[i], grid_size_x,
-                  signal);
+  KernelArgs* args = ObtainAsyncKernelCopyArg();
+  args->fill.phase1_dst_start = dst_start;
+  args->fill.phase2_dst_start = dst_start + phase1_size;
+  args->fill.phase2_dst_end = dst_start + fill_size;
+  args->fill.fill_value = value;
+  args->fill.num_workitems = num_workitems;
-    total_fill_count += fill_count;
-  }
+  // Submit dispatch packet.
+  HSA::hsa_signal_store_relaxed(completion_signal_, 1);
-  // Launch fill packet.
-  ReleaseWriteIndex(write_index, num_fill_packet);
+  uint64_t write_index = AcquireWriteIndex(1);
+  PopulateQueue(write_index, uintptr_t(kernels_[KernelType::Fill].code_buf_),
+                args, num_workitems, completion_signal_);
+  ReleaseWriteIndex(write_index, 1);
   // Wait for the packet to finish.
   if (HSA::hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_LT,
@@ -457,6 +802,17 @@ hsa_status_t BlitKernel::SubmitLinearFillCommand(void* ptr, uint32_t value,
+hsa_status_t BlitKernel::EnableProfiling(bool enable) {
+  core::Queue* cmd_queue = core::Queue::Convert(queue_);
+  if (cmd_queue != NULL) {
+    AMD_HSA_BITS_SET(cmd_queue->amd_queue_.queue_properties,
+                     AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, enable);
+  }
 uint64_t BlitKernel::AcquireWriteIndex(uint32_t num_packet) {
   assert(queue_->size >= num_packet);
@@ -556,9 +912,9 @@ void BlitKernel::PopulateQueue(uint64_t index, uint64_t code_handle, void* args,
   // Setup working size.
   const int kNumDimension = 1;
-  packet.grid_size_x = AlignUp(static_cast<uint32_t>(grid_size_x), kGroupSize);
+  packet.grid_size_x = AlignUp(static_cast<uint32_t>(grid_size_x), 64);
   packet.grid_size_y = packet.grid_size_z = 1;
-  packet.workgroup_size_x = kGroupSize;
+  packet.workgroup_size_x = 64;
   packet.workgroup_size_y = packet.workgroup_size_z = 1;
   packet.completion_signal = completion_signal;
diff --git a/src/core/runtime/amd_blit_sdma.cpp b/src/core/runtime/amd_blit_sdma.cpp
index b89ba7627..b0b28b2c7 100644
--- a/src/core/runtime/amd_blit_sdma.cpp
+++ b/src/core/runtime/amd_blit_sdma.cpp
@@ -48,19 +48,25 @@
 #include <cstring>
 #include "core/inc/amd_gpu_agent.h"
+#include "core/inc/amd_memory_region.h"
 #include "core/inc/runtime.h"
 #include "core/inc/signal.h"
+// SDMA queue size (1024 * 1024); used to initialize BlitSdma::kQueueSize.
+// Parenthesized so the macro expands as a single value when it appears inside
+// a larger expression (e.g. as a divisor).
+#define SDMA_QUEUE_SIZE (1024 * 1024)
 namespace amd {
 // SDMA packet for VI device.
 // Reference:
 const unsigned int SDMA_OP_COPY = 1;
 const unsigned int SDMA_OP_FENCE = 5;
+const unsigned int SDMA_OP_TRAP = 6;
 const unsigned int SDMA_OP_POLL_REGMEM = 8;
 const unsigned int SDMA_OP_ATOMIC = 10;
 const unsigned int SDMA_OP_CONST_FILL = 11;
+const unsigned int SDMA_OP_TIMESTAMP = 13;
 const unsigned int SDMA_SUBOP_COPY_LINEAR = 0;
+const unsigned int SDMA_SUBOP_TIMESTAMP_GET_GLOBAL = 2;
 const unsigned int SDMA_ATOMIC_ADD64 = 47;
 typedef struct SDMA_PKT_COPY_LINEAR_TAG {
@@ -310,6 +316,51 @@ typedef struct SDMA_PKT_ATOMIC_TAG {
+typedef struct SDMA_PKT_TIMESTAMP_TAG {
+  union {
+    struct {
+      unsigned int op : 8;
+      unsigned int sub_op : 8;
+      unsigned int reserved_0 : 16;
+    };
+    unsigned int DW_0_DATA;
+  union {
+    struct {
+      unsigned int addr_31_0 : 32;
+    };
+    unsigned int DW_1_DATA;
+  union {
+    struct {
+      unsigned int addr_63_32 : 32;
+    };
+    unsigned int DW_2_DATA;
+typedef struct SDMA_PKT_TRAP_TAG {
+  union {
+    struct {
+      unsigned int op : 8;
+      unsigned int sub_op : 8;
+      unsigned int reserved_0 : 16;
+    };
+    unsigned int DW_0_DATA;
+  union {
+    struct {
+      unsigned int int_ctx : 28;
+      unsigned int reserved_1 : 4;
+    };
+    unsigned int DW_1_DATA;
 inline uint32_t ptrlow32(const void* p) {
   return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p));
@@ -322,21 +373,28 @@ inline uint32_t ptrhigh32(const void* p) {
+const size_t BlitSdma::kQueueSize = SDMA_QUEUE_SIZE;
+const size_t BlitSdma::kCopyPacketSize = sizeof(SDMA_PKT_COPY_LINEAR);
     : core::Blit(),
+      agent_(NULL),
-      cached_commit_offset_(0) {
+      cached_commit_offset_(0),
+      platform_atomic_support_(true) {
   std::memset(&queue_resource_, 0, sizeof(queue_resource_));
 BlitSdma::~BlitSdma() {}
 hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
+  agent_ = reinterpret_cast<amd::GpuAgent*>(&const_cast<core::Agent&>(agent));
   if (queue_start_addr_ != NULL && queue_size_ != 0) {
     // Already initialized.
@@ -351,6 +409,8 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
   fence_command_size_ = sizeof(SDMA_PKT_FENCE);
   poll_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM);
   atomic_command_size_ = sizeof(SDMA_PKT_ATOMIC);
+  timestamp_command_size_ = sizeof(SDMA_PKT_TIMESTAMP);
+  trap_command_size_ = sizeof(SDMA_PKT_TRAP);
   const uint32_t sync_command_size = fence_command_size_;
   const uint32_t max_num_copy_command =
@@ -372,18 +432,20 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
                static_cast<uint64_t>(max_num_fill_command) *
-  const amd::GpuAgent& amd_gpu_agent = static_cast<const amd::GpuAgent&>(agent);
+  const amd::GpuAgentInt& amd_gpu_agent =
+      static_cast<const amd::GpuAgentInt&>(agent);
-  if (amd_gpu_agent.isa()->version() != core::Isa::Version(8, 0, 3)) {
-    assert(false && "Only for Fiji currently");
+  if (HSA_PROFILE_FULL == amd_gpu_agent.profile()) {
+    assert(false && "Only support SDMA for dgpu currently");
     return HSA_STATUS_ERROR;
-  // Allocate queue buffer.
-  const size_t kPageSize = 4096;
-  const size_t kSdmaQueueSize = 1024 * 1024;
+  if (amd_gpu_agent.isa()->version() == core::Isa::Version(7, 0, 1)) {
+    platform_atomic_support_ = false;
+  }
-  queue_size_ = kSdmaQueueSize;
+  // Allocate queue buffer.
+  queue_size_ = kQueueSize;
   HsaMemFlags flags;
   flags.Value = 0;
@@ -404,7 +466,7 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
   if (err != HSAKMT_STATUS_SUCCESS) {
     assert(false && "AQL queue memory map failure.");
-    Destroy();
+    Destroy(agent);
@@ -413,21 +475,20 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
   // Access kernel driver to initialize the queue control block
   // This call binds user mode queue object to underlying compute
   // device.
-  const GpuAgent& gpu_agent = reinterpret_cast<const GpuAgent&>(agent);
   const HSA_QUEUE_TYPE kQueueType_ = HSA_QUEUE_SDMA;
-      hsaKmtCreateQueue(gpu_agent.node_id(), kQueueType_, 100,
+      hsaKmtCreateQueue(amd_gpu_agent.node_id(), kQueueType_, 100,
                         HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_,
                         queue_size_, NULL, &queue_resource_)) {
-    Destroy();
+    Destroy(agent);
   cached_reserve_offset_ = *(queue_resource_.Queue_write_ptr);
   cached_commit_offset_ = cached_reserve_offset_;
-  fence_pool_size_ =
-      static_cast<uint32_t>(std::ceil(kSdmaQueueSize / fence_command_size_));
+  fence_pool_size_ = static_cast<uint32_t>(
+      (kQueueSize + fence_command_size_ - 1) / fence_command_size_);
   fence_pool_mask_ = fence_pool_size_ - 1;
@@ -436,14 +497,14 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
           fence_pool_size_ * sizeof(uint32_t), 256));
   if (fence_base_addr_ == NULL) {
-    Destroy();
+    Destroy(agent);
-hsa_status_t BlitSdma::Destroy(void) {
+hsa_status_t BlitSdma::Destroy(const core::Agent& agent) {
   // Release all allocated resources and reset them to zero.
   if (queue_resource_.QueueId != 0) {
@@ -479,8 +540,8 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(void* dst, const void* src,
   // Break the copy into multiple copy operation incase the copy size exceeds
   // the SDMA linear copy limit.
-  const uint32_t num_copy_command = static_cast<uint32_t>(
-      std::ceil(static_cast<double>(size) / max_single_linear_copy_size_));
+  const uint32_t num_copy_command =
+      (size + max_single_linear_copy_size_ - 1) / max_single_linear_copy_size_;
   const uint32_t total_copy_command_size =
       num_copy_command * linear_copy_command_size_;
@@ -528,18 +589,55 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
   // Break the copy into multiple copy operation incase the copy size exceeds
   // the SDMA linear copy limit.
-  const uint32_t num_copy_command = static_cast<uint32_t>(
-      std::ceil(static_cast<double>(size) / max_single_linear_copy_size_));
+  const uint32_t num_copy_command =
+      (size + max_single_linear_copy_size_ - 1) / max_single_linear_copy_size_;
   const uint32_t total_copy_command_size =
       num_copy_command * linear_copy_command_size_;
-  const uint32_t total_command_size =
-      total_poll_command_size + total_copy_command_size + atomic_command_size_ +
-      fence_command_size_;
+  // Load the profiling state early in case the user disables or enables
+  // profiling in the middle of the call.
+  const bool profiling_enabled = agent_->profiling_enabled();
-  const uint32_t kFenceValue = 2015;
-  uint32_t* fence_addr = ObtainFenceObject();
-  *fence_addr = 0;
+  uint64_t* end_ts_addr = NULL;
+  uint32_t total_timestamp_command_size = 0;
+  if (profiling_enabled) {
+    // The SDMA timestamp packet requires 32-byte aligned memory, but
+    // amd_signal_t::end_ts is not 32-byte aligned. So an extra copy packet
+    // that reads from a 32-byte aligned bounce buffer is required to avoid
+    // changing the amd_signal_t ABI.
+    end_ts_addr = agent_->ObtainEndTsObject();
+    if (end_ts_addr == NULL) {
+    }
+    total_timestamp_command_size =
+        (2 * timestamp_command_size_) + linear_copy_command_size_;
+  }
+  // On an agent that does not support platform atomics, we replace the
+  // atomic packet with one or two fence packet(s) to update the signal value.
+  // A fence is used rather than a write packet because the SDMA engine may
+  // overlap serial copy/write packets.
+  const uint64_t completion_signal_value =
+      static_cast<uint64_t>(out_signal.LoadRelaxed() - 1);
+  const size_t sync_command_size = (platform_atomic_support_)
+                                       ? atomic_command_size_
+                                       : (completion_signal_value > UINT32_MAX)
+                                             ? 2 * fence_command_size_
+                                             : fence_command_size_;
+  // If the signal is an interrupt signal, we also need to make the SDMA
+  // engine send an interrupt packet to IH.
+  const size_t interrupt_command_size =
+      (out_signal.signal_.event_mailbox_ptr != 0)
+          ? (fence_command_size_ + trap_command_size_)
+          : 0;
+  const uint32_t total_command_size =
+      total_poll_command_size + total_copy_command_size + sync_command_size +
+      total_timestamp_command_size + interrupt_command_size;
   char* command_addr = AcquireWriteAddress(total_command_size);
   char* const command_addr_temp = command_addr;
@@ -559,17 +657,58 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
     command_addr += poll_command_size_;
+  if (profiling_enabled) {
+    BuildGetGlobalTimestampCommand(
+        command_addr, reinterpret_cast<void*>(&out_signal.signal_.start_ts));
+    command_addr += timestamp_command_size_;
+  }
   // Do the transfer after all polls are satisfied.
   BuildCopyCommand(command_addr, num_copy_command, dst, src, size);
   command_addr += total_copy_command_size;
-  // After transfer is completed, decrement the signal.
-  BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation());
+  if (profiling_enabled) {
+    assert(IsMultipleOf(end_ts_addr, 32));
+    BuildGetGlobalTimestampCommand(command_addr,
+                                   reinterpret_cast<void*>(end_ts_addr));
+    command_addr += timestamp_command_size_;
-  command_addr += atomic_command_size_;
+    BuildCopyCommand(command_addr, 1,
+                     reinterpret_cast<void*>(&out_signal.signal_.end_ts),
+                     reinterpret_cast<void*>(end_ts_addr), sizeof(uint64_t));
+    command_addr += linear_copy_command_size_;
+  }
-  BuildFenceCommand(command_addr, fence_addr, kFenceValue);
+  // After transfer is completed, decrement the signal value.
+  if (platform_atomic_support_) {
+    BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation());
+    command_addr += atomic_command_size_;
+  } else {
+    uint32_t* signal_value_location =
+        reinterpret_cast<uint32_t*>(out_signal.ValueLocation());
+    if (completion_signal_value > UINT32_MAX) {
+      BuildFenceCommand(command_addr, signal_value_location + 1,
+                        static_cast<uint32_t>(completion_signal_value >> 32));
+      command_addr += fence_command_size_;
+    }
+    BuildFenceCommand(command_addr, signal_value_location,
+                      static_cast<uint32_t>(completion_signal_value));
+    command_addr += fence_command_size_;
+  }
+  // Update mailbox event and send interrupt to IH.
+  if (out_signal.signal_.event_mailbox_ptr != 0) {
+    BuildFenceCommand(command_addr, reinterpret_cast<uint32_t*>(
+                                        out_signal.signal_.event_mailbox_ptr),
+                      static_cast<uint32_t>(out_signal.signal_.event_id));
+    command_addr += fence_command_size_;
+    BuildTrapCommand(command_addr);
+  }
   ReleaseWriteAddress(command_addr_temp, total_command_size);
@@ -586,8 +725,8 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value,
   // Break the copy into multiple copy operation incase the copy size exceeds
   // the SDMA linear copy limit.
-  const uint32_t num_fill_command = static_cast<uint32_t>(
-      std::ceil(static_cast<double>(size) / max_single_fill_size_));
+  const uint32_t num_fill_command =
+      (size + max_single_fill_size_ - 1) / max_single_fill_size_;
   const uint32_t total_fill_command_size =
       num_fill_command * fill_command_size_;
@@ -644,6 +783,10 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value,
+hsa_status_t BlitSdma::EnableProfiling(bool enable) {
 char* BlitSdma::AcquireWriteAddress(uint32_t cmd_size) {
   if (cmd_size > queue_size_) {
     return NULL;
@@ -867,4 +1010,27 @@ void BlitSdma::BuildAtomicDecrementCommand(char* cmd_addr, void* addr) {
   packet_addr->SRC_DATA_LO_UNION.src_data_31_0 = 0xffffffff;
   packet_addr->SRC_DATA_HI_UNION.src_data_63_32 = 0xffffffff;
+void BlitSdma::BuildGetGlobalTimestampCommand(char* cmd_addr,
+                                              void* write_address) {
+  SDMA_PKT_TIMESTAMP* packet_addr =
+      reinterpret_cast<SDMA_PKT_TIMESTAMP*>(cmd_addr);
+  memset(packet_addr, 0, sizeof(SDMA_PKT_TIMESTAMP));
+  packet_addr->HEADER_UNION.op = SDMA_OP_TIMESTAMP;
+  packet_addr->ADDR_LO_UNION.addr_31_0 = ptrlow32(write_address);
+  packet_addr->ADDR_HI_UNION.addr_63_32 = ptrhigh32(write_address);
+void BlitSdma::BuildTrapCommand(char* cmd_addr) {
+  SDMA_PKT_TRAP* packet_addr =
+      reinterpret_cast<SDMA_PKT_TRAP*>(cmd_addr);
+  memset(packet_addr, 0, sizeof(SDMA_PKT_TRAP));
+  packet_addr->HEADER_UNION.op = SDMA_OP_TRAP;
 }  // namespace amd
diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp
index 244f7eaf5..d56f7fb51 100644
--- a/src/core/runtime/amd_gpu_agent.cpp
+++ b/src/core/runtime/amd_gpu_agent.cpp
@@ -46,23 +46,26 @@
 #include <atomic>
 #include <cstring>
 #include <climits>
+#include <map>
+#include <string>
 #include <vector>
 #include "core/inc/amd_aql_queue.h"
 #include "core/inc/amd_blit_kernel.h"
 #include "core/inc/amd_blit_sdma.h"
+#include "core/inc/amd_gpu_shaders.h"
 #include "core/inc/amd_memory_region.h"
 #include "core/inc/interrupt_signal.h"
 #include "core/inc/isa.h"
 #include "core/inc/runtime.h"
-#include "utils/sp3/sp3.h"
 #include "hsa_ext_image.h"
 // Size of scratch (private) segment pre-allocated per thread, in bytes.
+extern core::HsaApiTable hsa_internal_api_table_;
 namespace amd {
 GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
     : GpuAgentInt(node),
@@ -70,13 +73,19 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
+      blit_d2d_(NULL),
+      local_region_(NULL),
-      ape1_size_(0) {
+      ape1_size_(0),
+      blit_initialized_(false),
+      end_ts_pool_size_(0),
+      end_ts_pool_counter_(0),
+      end_ts_base_addr_(NULL) {
   const bool is_apu_node = (properties_.NumCPUCores > 0);
   profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE;
@@ -88,6 +97,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
   isa_ = (core::Isa*)core::IsaRegistry::GetIsa(core::Isa::Version(
       node_props.EngineId.ui32.Major, node_props.EngineId.ui32.Minor,
   // Check if the device is Kaveri, only on GPU device.
   if (isa_->GetMajorVersion() == 7 && isa_->GetMinorVersion() == 0 &&
       isa_->GetStepping() == 0) {
@@ -126,21 +136,33 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
 GpuAgent::~GpuAgent() {
   if (blit_h2d_ != NULL) {
-    hsa_status_t status = blit_h2d_->Destroy();
+    hsa_status_t status = blit_h2d_->Destroy(*this);
     assert(status == HSA_STATUS_SUCCESS);
     delete blit_h2d_;
     blit_h2d_ = NULL;
-  if (blit_d2h_ != NULL) {
-    hsa_status_t status = blit_d2h_->Destroy();
+  if (blit_d2h_ != NULL && blit_d2h_ != blit_d2d_) {
+    hsa_status_t status = blit_d2h_->Destroy(*this);
     assert(status == HSA_STATUS_SUCCESS);
     delete blit_d2h_;
     blit_d2h_ = NULL;
+  if (blit_d2d_ != NULL) {
+    hsa_status_t status = blit_d2d_->Destroy(*this);
+    assert(status == HSA_STATUS_SUCCESS);
+    delete blit_d2d_;
+    blit_d2d_ = NULL;
+  }
+  if (end_ts_base_addr_ != NULL) {
+    core::Runtime::runtime_singleton_->FreeMemory(end_ts_base_addr_);
+  }
   if (ape1_base_ != 0) {
@@ -158,33 +180,60 @@ GpuAgent::~GpuAgent() {
 void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name,
-                              void*& code_buf, size_t& code_buf_size) {
-#ifdef __linux__  // No VS builds of libsp3 available right now
-  // Assemble source string with libsp3.
-  sp3_context* sp3 = sp3_new();
+                              AssembleTarget assemble_target, void*& code_buf,
+                              size_t& code_buf_size) const {
+  // Select precompiled shader implementation from name/target.
+  struct ASICShader {
+    const void* code;
+    size_t size;
+    int num_sgprs;
+    int num_vgprs;
+  };
+  struct CompiledShader {
+    ASICShader compute_7;
+    ASICShader compute_8;
+  };
+  std::map<std::string, CompiledShader> compiled_shaders = {
+      {"TrapHandler",
+       {{NULL, 0, 0, 0}, {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}}},
+      {"CopyAligned",
+       {{kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12},
+        {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}}},
+      {"CopyMisaligned",
+       {{kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10},
+        {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}}},
+      {"Fill",
+       {{kCodeFill7, sizeof(kCodeFill7), 19, 8},
+        {kCodeFill8, sizeof(kCodeFill8), 19, 8}}}};
+  auto compiled_shader_it = compiled_shaders.find(func_name);
+  assert(compiled_shader_it != compiled_shaders.end() &&
+         "Precompiled shader unavailable");
+  ASICShader* asic_shader = NULL;
   switch (isa_->GetMajorVersion()) {
     case 7:
-      sp3_setasic(sp3, "CI");
+      asic_shader = &compiled_shader_it->second.compute_7;
     case 8:
-      sp3_setasic(sp3, "VI");
+      asic_shader = &compiled_shader_it->second.compute_8;
-      assert(false && "SP3 assembly not supported on this agent");
+      assert(false && "Precompiled shader unavailable for target");
-  sp3_parse_string(sp3, src_sp3);
-  sp3_shader* code_sp3_meta = sp3_compile(sp3, func_name);
-  // Allocate a GPU-visible buffer for the trap shader.
+  // Allocate a GPU-visible buffer for the shader.
   HsaMemFlags code_buf_flags = {0};
   code_buf_flags.ui32.HostAccess = 1;
   code_buf_flags.ui32.ExecuteAccess = 1;
   code_buf_flags.ui32.NoSubstitute = 1;
-  size_t code_size = code_sp3_meta->size * sizeof(uint32_t);
-  code_buf_size = AlignUp(code_size, 0x1000);
+  size_t header_size =
+      (assemble_target == AssembleTarget::AQL ? sizeof(amd_kernel_code_t) : 0);
+  code_buf_size = AlignUp(header_size + asic_shader->size, 0x1000);
       hsaKmtAllocMemory(node_id(), code_buf_size, code_buf_flags, &code_buf);
@@ -193,17 +242,41 @@ void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name,
   err = hsaKmtMapMemoryToGPU(code_buf, code_buf_size, NULL);
   assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtMapMemoryToGPU(Trap) failed");
-  // Copy trap handler code into the GPU-visible buffer.
   memset(code_buf, 0, code_buf_size);
-  memcpy(code_buf, code_sp3_meta->data, code_size);
-  // Release SP3 resources.
-  sp3_free_shader(code_sp3_meta);
-  sp3_close(sp3);
+  // Populate optional code object header.
+  if (assemble_target == AssembleTarget::AQL) {
+    amd_kernel_code_t* header = reinterpret_cast<amd_kernel_code_t*>(code_buf);
+    int gran_sgprs = std::max(0, (int(asic_shader->num_sgprs) - 1) / 8);
+    int gran_vgprs = std::max(0, (int(asic_shader->num_vgprs) - 1) / 4);
+    header->kernel_code_entry_byte_offset = sizeof(amd_kernel_code_t);
+    AMD_HSA_BITS_SET(header->kernel_code_properties,
+                     1);
+    AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
+                     gran_sgprs);
+    AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
+                     gran_vgprs);
+    AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
+                     AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64, 3);
+    AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
+                     AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE, 1);
+    AMD_HSA_BITS_SET(header->compute_pgm_rsrc2,
+                     AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 2);
+    AMD_HSA_BITS_SET(header->compute_pgm_rsrc2,
+  }
+  // Copy shader code into the GPU-visible buffer.
+  memcpy((void*)(uintptr_t(code_buf) + header_size), asic_shader->code,
+         asic_shader->size);
-void GpuAgent::ReleaseShader(void* code_buf, size_t code_buf_size) {
+void GpuAgent::ReleaseShader(void* code_buf, size_t code_buf_size) const {
   hsaKmtFreeMemory(code_buf, code_buf_size);
@@ -238,6 +311,10 @@ void GpuAgent::InitRegionList() {
               new MemoryRegion(false, false, this, mem_props[mem_idx]);
+          if (region->IsLocalMemory()) {
+            local_region_ = region;
+          }
         case HSA_HEAPTYPE_SYSTEM:
@@ -314,6 +391,57 @@ void GpuAgent::InitCacheList() {
+bool GpuAgent::InitEndTsPool() {
+  if (HSA_PROFILE_FULL == profile_) {
+    return true;
+  }
+  if (end_ts_base_addr_.load(std::memory_order_acquire) != NULL) {
+    return true;
+  }
+  ScopedAcquire<KernelMutex> lock(&blit_lock_);
+  if (end_ts_base_addr_.load(std::memory_order_relaxed) != NULL) {
+    return true;
+  }
+  end_ts_pool_size_ = static_cast<uint32_t>(
+      (BlitSdma::kQueueSize + BlitSdma::kCopyPacketSize - 1) /
+      (BlitSdma::kCopyPacketSize));
+  // Allocate end timestamp object for both h2d and d2h DMA.
+  const size_t alloc_size = 2 * end_ts_pool_size_ * kTsSize;
+  core::Runtime* runtime = core::Runtime::runtime_singleton_;
+  uint64_t* buff = NULL;
+      runtime->AllocateMemory(true, local_region_, alloc_size,
+                              reinterpret_cast<void**>(&buff))) {
+    return false;
+  }
+, std::memory_order_release);
+  return true;
+uint64_t* GpuAgent::ObtainEndTsObject() {
+  if (end_ts_base_addr_ == NULL) {
+    return NULL;
+  }
+  const uint32_t end_ts_index =
+      end_ts_pool_counter_.fetch_add(1U, std::memory_order_acq_rel) %
+      end_ts_pool_size_;
+  const static size_t kNumU64 = kTsSize / sizeof(uint64_t);
+  uint64_t* end_ts_addr = &end_ts_base_addr_[end_ts_index * kNumU64];
+  assert(IsMultipleOf(end_ts_addr, kTsSize));
+  return end_ts_addr;
 hsa_status_t GpuAgent::IterateRegion(
     hsa_status_t (*callback)(hsa_region_t region, void* data),
     void* data) const {
@@ -377,7 +505,7 @@ core::Blit* GpuAgent::CreateBlitSdma() {
   BlitSdma* sdma = new BlitSdma();
   if (sdma->Initialize(*this) != HSA_STATUS_SUCCESS) {
-    sdma->Destroy();
+    sdma->Destroy(*this);
     delete sdma;
     sdma = NULL;
@@ -389,7 +517,7 @@ core::Blit* GpuAgent::CreateBlitKernel() {
   BlitKernel* kernl = new BlitKernel();
   if (kernl->Initialize(*this) != HSA_STATUS_SUCCESS) {
-    kernl->Destroy();
+    kernl->Destroy(*this);
     delete kernl;
     kernl = NULL;
@@ -397,41 +525,60 @@ core::Blit* GpuAgent::CreateBlitKernel() {
   return kernl;
-hsa_status_t GpuAgent::InitDma() {
-  // Try create SDMA blit first.
-  if (core::Runtime::runtime_singleton_->flag().enable_sdma() &&
-      isa_->GetMajorVersion() == 8 && isa_->GetMinorVersion() == 0 &&
-      isa_->GetStepping() == 3) {
-    blit_h2d_ = CreateBlitSdma();
-    blit_d2h_ = CreateBlitSdma();
+void GpuAgent::InitDma() {
+  // This provides the ability to lazily initialize the blit objects at
+  // places that could indicate future DMA usage. E.g.:
+  // 1. Call to allow access API.
+  // 2. Call to memory lock API.
+  if (!blit_initialized_.load(std::memory_order_acquire)) {
+    ScopedAcquire<KernelMutex> lock(&blit_lock_);
+    if (!blit_initialized_.load(std::memory_order_relaxed)) {
+      // Try create SDMA blit first.
+      if (core::Runtime::runtime_singleton_->flag().enable_sdma() &&
+          (HSA_PROFILE_BASE == profile_)) {
+        blit_h2d_ = CreateBlitSdma();
+        blit_d2h_ = CreateBlitSdma();
+        if (blit_h2d_ != NULL && blit_d2h_ != NULL) {
+, std::memory_order_release);
+          return;
+        }
+      }
-    if (blit_h2d_ != NULL && blit_d2h_ != NULL) {
-      return HSA_STATUS_SUCCESS;
-    }
-  }
+      // Fall back to blit kernel if SDMA is unavailable.
+      assert(blit_h2d_ == NULL || blit_d2h_ == NULL);
+      if (blit_h2d_ == NULL) {
+        blit_h2d_ = CreateBlitKernel();
+      }
-  // Fall back to blit kernel if SDMA is unavailable.
-  assert(blit_h2d_ == NULL || blit_d2h_ == NULL);
+      if (blit_d2h_ == NULL) {
+        // Share device-to-host queue with device-to-device.
+        blit_d2h_ = blit_d2d_;
+      }
-  if (blit_h2d_ == NULL) {
-    blit_h2d_ = CreateBlitKernel();
+, std::memory_order_release);
+    }
-  if (blit_d2h_ == NULL) {
-    blit_d2h_ = CreateBlitKernel();
+hsa_status_t GpuAgent::InitBlitKernel() {
+  // Unlike InitDma, this function is not designed for lazy initialization.
+  // So checking the state without double-checked locking is fine.
+  if (blit_d2d_ == NULL) {
+    blit_d2d_ = CreateBlitKernel();
-  return (blit_h2d_ != NULL && blit_d2h_ != NULL)
-             ? HSA_STATUS_SUCCESS
+  return (blit_d2d_ != NULL) ? HSA_STATUS_SUCCESS
+                             : HSA_STATUS_ERROR_OUT_OF_RESOURCES;
 hsa_status_t GpuAgent::DmaCopy(void* dst, const void* src, size_t size) {
-  if (blit_d2h_ == NULL) {
+  if (blit_d2d_ == NULL) {
-  return blit_d2h_->SubmitLinearCopyCommand(dst, src, size);
+  return blit_d2d_->SubmitLinearCopyCommand(dst, src, size);
 hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
@@ -439,31 +586,55 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
                                size_t size,
                                std::vector<core::Signal*>& dep_signals,
                                core::Signal& out_signal) {
-  core::Blit* blit = (src_agent.device_type() == core::Agent::kAmdCpuDevice &&
-                      dst_agent.device_type() == core::Agent::kAmdGpuDevice)
-                         ? blit_h2d_
-                         : blit_d2h_;
+  core::Blit* blit =
+      (src_agent.device_type() == core::Agent::kAmdCpuDevice &&
+       dst_agent.device_type() == core::Agent::kAmdGpuDevice)
+          ? blit_h2d_
+          : (src_agent.device_type() == core::Agent::kAmdGpuDevice &&
+             dst_agent.device_type() == core::Agent::kAmdCpuDevice)
+                ? blit_d2h_
+                : blit_d2d_;
   if (blit == NULL) {
-  // TODO: temporarily disable wait on thunk event if the out_signal
-  // is an interrupt signal object. Remove this when SDMA handle interrupt
-  // packet properly.
-  if (out_signal.EopEvent() != NULL) {
-    static_cast<core::InterruptSignal&>(out_signal).DisableWaitEvent();
+  hsa_status_t stat =
+      blit->SubmitLinearCopyCommand(dst, src, size, dep_signals, out_signal);
+  if (profiling_enabled() && HSA_STATUS_SUCCESS == stat) {
+    // Track the agent so we can translate the resulting timestamp to the
+    // system domain correctly.
+    out_signal.async_copy_agent(this);
-  return blit->SubmitLinearCopyCommand(dst, src, size, dep_signals, out_signal);
+  return stat;
 hsa_status_t GpuAgent::DmaFill(void* ptr, uint32_t value, size_t count) {
-  if (blit_d2h_ == NULL) {
+  if (blit_d2d_ == NULL) {
+  }
+  return blit_d2d_->SubmitLinearFillCommand(ptr, value, count);
+hsa_status_t GpuAgent::EnableDmaProfiling(bool enable) {
+  if (enable && !InitEndTsPool()) {
-  return blit_d2h_->SubmitLinearFillCommand(ptr, value, count);
+  core::Blit* blit[3] = {blit_h2d_, blit_d2h_, blit_d2d_};
+  for (int i = 0; i < 3; ++i) {
+    if (blit[i] != NULL) {
+      const hsa_status_t stat = blit[i]->EnableProfiling(enable);
+      if (stat != HSA_STATUS_SUCCESS) {
+        return stat;
+      }
+    }
+  }
 hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const {
@@ -472,27 +643,19 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const {
   const core::ExtensionEntryPoints& extensions =
-  hsa_agent_t agent = core::Agent::Convert(this);
   const size_t attribute_u = static_cast<size_t>(attribute);
   switch (attribute_u) {
-      // TODO: hardcode for now.
+    {
+      // This code assumes that UTF-16 HsaNodeProperties.MarketingName is
+      // actually encoded in 7-bit ASCII, and the runtime output is 7-bit ASCII
+      // in bytes.
       std::memset(value, 0, kNameSize);
-      if (isa_->GetMajorVersion() == 7) {
-        std::memcpy(value, "Kaveri", sizeof("Kaveri"));
-      } else if (isa_->GetMajorVersion() == 8) {
-        if (isa_->GetMinorVersion() == 0 && isa_->GetStepping() == 2) {
-          std::memcpy(value, "Tonga", sizeof("Tonga"));
-        } else if (isa_->GetMinorVersion() == 0 && isa_->GetStepping() == 3) {
-          std::memcpy(value, "Fiji", sizeof("Fiji"));
-        } else {
-          std::memcpy(value, "Carrizo", sizeof("Carrizo"));
-        }
-      } else {
-        std::memcpy(value, "Unknown", sizeof("Unknown"));
-      }
+      char* temp = reinterpret_cast<char*>(value);
+      for (uint32_t i = 0; properties_.MarketingName[i] != 0 && i < kNameSize - 1; i++)
+        temp[i] = properties_.MarketingName[i];
+    }
       std::memset(value, 0, kNameSize);
       std::memcpy(value, "AMD", sizeof("AMD"));
@@ -572,13 +735,11 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const {
       memset(value, 0, sizeof(uint8_t) * 128);
-      if (extensions.table.hsa_ext_program_finalize_fn != NULL) {
+      if (core::hsa_internal_api_table_.finalizer_api.hsa_ext_program_finalize_fn != NULL) {
         *((uint8_t*)value) = 1 << HSA_EXTENSION_FINALIZER;
-      if (profile_ == HSA_PROFILE_FULL &&
-          extensions.table.hsa_ext_image_create_fn != NULL) {
-        // TODO: only APU supports images currently.
+      if (core::hsa_internal_api_table_.image_api.hsa_ext_image_create_fn != NULL) {
         *((uint8_t*)value) |= 1 << HSA_EXTENSION_IMAGES;
@@ -831,7 +992,6 @@ void GpuAgent::SyncClocks() {
 void GpuAgent::BindTrapHandler() {
-#ifdef __linux__  // No raw string literal support in VS builds right now
   const char* src_sp3 = R"(
     var s_trap_info_lo = ttmp0
     var s_trap_info_hi = ttmp1
@@ -904,13 +1064,13 @@ void GpuAgent::BindTrapHandler() {
   // Assemble the trap handler source code.
-  AssembleShader(src_sp3, "TrapHandler", trap_code_buf_, trap_code_buf_size_);
+  AssembleShader(src_sp3, "TrapHandler", AssembleTarget::ISA, trap_code_buf_,
+                 trap_code_buf_size_);
   // Bind the trap handler to this node.
   HSAKMT_STATUS err = hsaKmtSetTrapHandler(node_id(), trap_code_buf_,
                                            trap_code_buf_size_, NULL, 0);
   assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtSetTrapHandler() failed");
 }  // namespace
diff --git a/src/core/runtime/amd_memory_region.cpp b/src/core/runtime/amd_memory_region.cpp
index bf37110ae..62c9bfa16 100644
--- a/src/core/runtime/amd_memory_region.cpp
+++ b/src/core/runtime/amd_memory_region.cpp
@@ -43,6 +43,7 @@
 #include "core/inc/amd_memory_region.h"
 #include <algorithm>
+#include <set>
 #include "core/inc/runtime.h"
 #include "core/inc/amd_cpu_agent.h"
@@ -374,35 +375,41 @@ hsa_status_t MemoryRegion::GetAgentPoolInfo(
   const core::Runtime::LinkInfo link_info =
       core::Runtime::runtime_singleton_->GetLinkInfo(node_id_from, node_id_to);
+  /**
+   *  ---------------------------------------------------
+   *  |              |CPU        |GPU (owner)|GPU (peer) |
+   *  ---------------------------------------------------
+   *  |system memory |allowed    |disallowed |disallowed |
+   *  ---------------------------------------------------
+   *  |fb private    |never      |allowed    |never      |
+   *  ---------------------------------------------------
+   *  |fb public     |disallowed |allowed    |disallowed |
+   *  ---------------------------------------------------
+   *  |others        |never      |allowed    |never      |
+   *  ---------------------------------------------------
+   */
+  const hsa_amd_memory_pool_access_t access_type =
+      ((IsSystem() && (agent.device_type() == core::Agent::kAmdCpuDevice)) ||
+       (agent.node_id() == owner()->node_id()))
+          : (IsSystem() || (IsPublic() && link_info.num_hop > 0))
   switch (attribute) {
-      /**
-      *  ---------------------------------------------------
-      *  |              |CPU        |GPU (owner)|GPU (peer) |
-      *  ---------------------------------------------------
-      *  |system memory |allowed    |disallowed |disallowed |
-      *  ---------------------------------------------------
-      *  |fb private    |never      |allowed    |never      |
-      *  ---------------------------------------------------
-      *  |fb public     |disallowed |allowed    |disallowed |
-      *  ---------------------------------------------------
-      *  |others        |never      |allowed    |never      |
-      *  ---------------------------------------------------
-      */
-      *((hsa_amd_memory_pool_access_t*)value) =
-          (((IsSystem()) &&
-            (agent.device_type() == core::Agent::kAmdCpuDevice)) ||
-           (agent.node_id() == owner()->node_id()))
-              : (IsSystem() || (IsPublic() && link_info.num_hop > 0))
+      *((hsa_amd_memory_pool_access_t*)value) = access_type;
-      *((uint32_t*)value) = link_info.num_hop;
+      *((uint32_t*)value) =
+          (access_type != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED)
+              ? link_info.num_hop
+              : 0;
+      break;
       memset(value, 0, sizeof(hsa_amd_memory_pool_link_info_t));
-      if (link_info.num_hop > 0) {
+      if ((access_type != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) &&
+          (link_info.num_hop > 0)) {
         memcpy(value, &, sizeof(hsa_amd_memory_pool_link_info_t));
@@ -425,15 +432,17 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
   bool cpu_in_list = false;
+  std::set<GpuAgentInt*> whitelist_gpus;
   std::vector<uint32_t> whitelist_nodes;
   for (uint32_t i = 0; i < num_agents; ++i) {
-    const core::Agent* agent = core::Agent::Convert(agents[i]);
+    core::Agent* agent = core::Agent::Convert(agents[i]);
     if (agent == NULL || !agent->IsValid()) {
     if (agent->device_type() == core::Agent::kAmdGpuDevice) {
+      whitelist_gpus.insert(reinterpret_cast<GpuAgentInt*>(agent));
     } else {
       cpu_in_list = true;
@@ -452,17 +461,24 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
       std::find(whitelist_nodes.begin(), whitelist_nodes.end(),
                 owner()->node_id()) == whitelist_nodes.end()) {
+    whitelist_gpus.insert(reinterpret_cast<GpuAgentInt*>(owner()));
   HsaMemMapFlags map_flag = map_flag_;
   map_flag.ui32.HostAccess |= (cpu_in_list) ? 1 : 0;
   uint64_t alternate_va = 0;
-  return (amd::MemoryRegion::MakeKfdMemoryResident(
-             whitelist_nodes.size(), &whitelist_nodes[0],
-             const_cast<void*>(ptr), size, &alternate_va, map_flag))
-             ? HSA_STATUS_SUCCESS
+  if (!amd::MemoryRegion::MakeKfdMemoryResident(
+          whitelist_nodes.size(), &whitelist_nodes[0], const_cast<void*>(ptr),
+          size, &alternate_va, map_flag)) {
+  }
+  for (GpuAgentInt* gpu : whitelist_gpus) {
+    gpu->InitDma();
+  }
 hsa_status_t MemoryRegion::CanMigrate(const MemoryRegion& dst,
@@ -490,10 +506,15 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents,
+  std::set<core::Agent*> whitelist_gpus;
   std::vector<HSAuint32> whitelist_nodes;
   if (num_agents == 0 || agents == NULL) {
     // Map to all GPU agents.
     whitelist_nodes = core::Runtime::runtime_singleton_->gpu_ids();
+    whitelist_gpus.insert(
+        core::Runtime::runtime_singleton_->gpu_agents().begin(),
+        core::Runtime::runtime_singleton_->gpu_agents().end());
   } else {
     for (int i = 0; i < num_agents; ++i) {
       core::Agent* agent = core::Agent::Convert(agents[i]);
@@ -503,6 +524,7 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents,
       if (agent->device_type() == core::Agent::kAmdGpuDevice) {
+        whitelist_gpus.insert(reinterpret_cast<GpuAgentInt*>(agent));
@@ -520,8 +542,15 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents,
     uint64_t alternate_va = 0;
     if (MakeKfdMemoryResident(whitelist_nodes.size(), &whitelist_nodes[0],
                               host_ptr, size, &alternate_va, map_flag_)) {
-      assert(alternate_va != 0);
-      *agent_ptr = reinterpret_cast<void*>(alternate_va);
+      if (alternate_va != 0) {
+        *agent_ptr = reinterpret_cast<void*>(alternate_va);
+      } else {
+        *agent_ptr = host_ptr;
+      }
+      for (core::Agent* gpu : whitelist_gpus) {
+        reinterpret_cast<GpuAgentInt*>(gpu)->InitDma();
+      }
       return HSA_STATUS_SUCCESS;
diff --git a/src/core/runtime/amd_topology.cpp b/src/core/runtime/amd_topology.cpp
index e6a348330..b54292554 100644
--- a/src/core/runtime/amd_topology.cpp
+++ b/src/core/runtime/amd_topology.cpp
@@ -78,12 +78,6 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
   GpuAgent* gpu = new GpuAgent(node_id, node_prop);
-  if (HSA_STATUS_SUCCESS != gpu->InitDma()) {
-    assert(false && "Fail init blit");
-    delete gpu;
-    gpu = NULL;
-  }
   return gpu;
diff --git a/src/core/runtime/hsa.cpp b/src/core/runtime/hsa.cpp
index 8e8f0eceb..fdda3cf61 100644
--- a/src/core/runtime/hsa.cpp
+++ b/src/core/runtime/hsa.cpp
@@ -55,7 +55,7 @@
 #include "core/inc/default_signal.h"
 #include "core/inc/interrupt_signal.h"
 #include "core/inc/amd_loader_context.hpp"
-#include "inc/hsa_ven_amd_loaded_code_object.h"
+#include "inc/hsa_ven_amd_loader.h"
 using namespace amd::hsa::code;
@@ -168,9 +168,7 @@ hsa_status_t
                                    uint16_t version_minor, bool* result) {
-  if ((extension > HSA_EXTENSION_AMD_PROFILER &&
-        extension != HSA_EXTENSION_AMD_LOADED_CODE_OBJECT) ||
-      (result == NULL)) {
+  if (extension >= HSA_EXTENSION_COUNT || result == NULL) {
@@ -208,57 +206,57 @@ hsa_status_t
   hsa_status_t status = hsa_system_extension_supported(
       extension, version_major, version_minor, &supported);
-  if (HSA_STATUS_SUCCESS != status) {
+  if ((HSA_STATUS_SUCCESS != status) ||
+      (supported == false)) {
     return status;
-  if (supported) {
-    ExtTable& runtime_ext_table =
-        core::Runtime::runtime_singleton_->extensions_.table;
-    if (extension == HSA_EXTENSION_IMAGES) {
-      // Currently there is only version 1.00.
-      hsa_ext_images_1_00_pfn_t* ext_table =
-          reinterpret_cast<hsa_ext_images_1_00_pfn_t*>(table);
-      ext_table->hsa_ext_image_clear = hsa_ext_image_clear;
-      ext_table->hsa_ext_image_copy = hsa_ext_image_copy;
-      ext_table->hsa_ext_image_create = hsa_ext_image_create;
-      ext_table->hsa_ext_image_data_get_info = hsa_ext_image_data_get_info;
-      ext_table->hsa_ext_image_destroy = hsa_ext_image_destroy;
-      ext_table->hsa_ext_image_export = hsa_ext_image_export;
-      ext_table->hsa_ext_image_get_capability = hsa_ext_image_get_capability;
-      ext_table->hsa_ext_image_import = hsa_ext_image_import;
-      ext_table->hsa_ext_sampler_create = hsa_ext_sampler_create;
-      ext_table->hsa_ext_sampler_destroy = hsa_ext_sampler_destroy;
-      return HSA_STATUS_SUCCESS;
-    } else if (extension == HSA_EXTENSION_FINALIZER) {
-      // Currently there is only version 1.00.
-      hsa_ext_finalizer_1_00_pfn_s* ext_table =
-          reinterpret_cast<hsa_ext_finalizer_1_00_pfn_s*>(table);
-      ext_table->hsa_ext_program_add_module = hsa_ext_program_add_module;
-      ext_table->hsa_ext_program_create = hsa_ext_program_create;
-      ext_table->hsa_ext_program_destroy = hsa_ext_program_destroy;
-      ext_table->hsa_ext_program_finalize = hsa_ext_program_finalize;
-      ext_table->hsa_ext_program_get_info = hsa_ext_program_get_info;
-      ext_table->hsa_ext_program_iterate_modules =
-          hsa_ext_program_iterate_modules;
-      return HSA_STATUS_SUCCESS;
-    } else if (extension == HSA_EXTENSION_AMD_LOADED_CODE_OBJECT) {
-      // Currently there is only version 1.00.
-      hsa_ven_amd_loaded_code_object_1_00_pfn_t* ext_table =
-        reinterpret_cast<hsa_ven_amd_loaded_code_object_1_00_pfn_t*>(table);
-      ext_table->hsa_ven_amd_loaded_code_object_query_host_address =
-        hsa_ven_amd_loaded_code_object_query_host_address;
-      return HSA_STATUS_SUCCESS;
-    } else {
-      return HSA_STATUS_ERROR;
-    }
+  if (extension == HSA_EXTENSION_IMAGES) {
+    // Currently there is only version 1.00.
+    hsa_ext_images_1_00_pfn_t* ext_table =
+        reinterpret_cast<hsa_ext_images_1_00_pfn_t*>(table);
+    ext_table->hsa_ext_image_clear = hsa_ext_image_clear;
+    ext_table->hsa_ext_image_copy = hsa_ext_image_copy;
+    ext_table->hsa_ext_image_create = hsa_ext_image_create;
+    ext_table->hsa_ext_image_data_get_info = hsa_ext_image_data_get_info;
+    ext_table->hsa_ext_image_destroy = hsa_ext_image_destroy;
+    ext_table->hsa_ext_image_export = hsa_ext_image_export;
+    ext_table->hsa_ext_image_get_capability = hsa_ext_image_get_capability;
+    ext_table->hsa_ext_image_import = hsa_ext_image_import;
+    ext_table->hsa_ext_sampler_create = hsa_ext_sampler_create;
+    ext_table->hsa_ext_sampler_destroy = hsa_ext_sampler_destroy;
+  if (extension == HSA_EXTENSION_FINALIZER) {
+    // Currently there is only version 1.00.
+    hsa_ext_finalizer_1_00_pfn_s* ext_table =
+        reinterpret_cast<hsa_ext_finalizer_1_00_pfn_s*>(table);
+    ext_table->hsa_ext_program_add_module = hsa_ext_program_add_module;
+    ext_table->hsa_ext_program_create = hsa_ext_program_create;
+    ext_table->hsa_ext_program_destroy = hsa_ext_program_destroy;
+    ext_table->hsa_ext_program_finalize = hsa_ext_program_finalize;
+    ext_table->hsa_ext_program_get_info = hsa_ext_program_get_info;
+    ext_table->hsa_ext_program_iterate_modules =
+        hsa_ext_program_iterate_modules;
+  }
+  if (extension == HSA_EXTENSION_AMD_LOADER) {
+    // Currently there is only version 1.00.
+    hsa_ven_amd_loader_1_00_pfn_t* ext_table =
+      reinterpret_cast<hsa_ven_amd_loader_1_00_pfn_t*>(table);
+    ext_table->hsa_ven_amd_loader_query_host_address =
+      hsa_ven_amd_loader_query_host_address;
+    ext_table->hsa_ven_amd_loader_query_segment_descriptors =
+      hsa_ven_amd_loader_query_segment_descriptors;
+  }
@@ -785,7 +783,7 @@ hsa_status_t
   core::Signal* ret;
-  bool useshost = true;
+  bool uses_host = false;
   if (num_consumers > 0) {
@@ -798,13 +796,16 @@ hsa_status_t
-    useshost =
-        (consumer_set.find(
-            core::Runtime::runtime_singleton_->host_agent()->public_handle()) !=
-        consumer_set.end());
+    for (const core::Agent* cpu_agent :
+         core::Runtime::runtime_singleton_->cpu_agents()) {
+      uses_host |=
+          (consumer_set.find(cpu_agent->public_handle()) != consumer_set.end());
+    }
+  } else {
+    uses_host = true;
-  if (core::g_use_interrupt_wait && useshost) {
+  if (core::g_use_interrupt_wait && uses_host) {
     ret = new core::InterruptSignal(initial_value);
   } else {
     ret = new core::DefaultSignal(initial_value);
diff --git a/src/core/runtime/hsa_api_trace.cpp b/src/core/runtime/hsa_api_trace.cpp
index ca0b40192..9fe3823a5 100644
--- a/src/core/runtime/hsa_api_trace.cpp
+++ b/src/core/runtime/hsa_api_trace.cpp
@@ -42,150 +42,258 @@
 #include "core/inc/hsa_api_trace_int.h"
 #include "core/inc/runtime.h"
+#include "core/inc/hsa_ext_amd_impl.h"
 #include "core/inc/hsa_table_interface.h"
+#include <iostream>
 namespace core {
-ApiTable hsa_api_table_;
-ApiTable hsa_internal_api_table_;
+HsaApiTable hsa_api_table_;
+HsaApiTable hsa_internal_api_table_;
+HsaApiTable::HsaApiTable() {
+  Init();
+// Initialize member fields for the Hsa Core and Amd Extension Api tables.
+// Member fields for Finalizer and Image extensions will be
+// updated as part of Hsa Runtime initialization.
+void HsaApiTable::Init() {
+  // Initialize Version of Api Table
+  hsa_api.version.major_id = HSA_API_TABLE_MAJOR_VERSION;
+  hsa_api.version.minor_id = sizeof(::HsaApiTable);
+  hsa_api.version.step_id = HSA_API_TABLE_STEP_VERSION;
+  // Update Api table for Core and its major id
+  UpdateCore();
+  hsa_api.core_ = &core_api;
+  // Update Api table for Amd Extensions and its major id
+  UpdateAmdExts();
+  hsa_api.amd_ext_ = &amd_ext_api;
+  // Initialize Api tables for Finalizer and Image to NULL
+  // Tables for Finalizer and Images are initialized as part
+  // of Hsa Runtime initialization, including their major ids
+  hsa_api.finalizer_ext_ = NULL;
+  hsa_api.image_ext_ = NULL;
+void HsaApiTable::Reset() {
+  Init();
+void HsaApiTable::CloneExts(void* ext_table, uint32_t table_id) {
+  assert(ext_table != NULL && "Invalid extension table linked.");
+  // Update HSA Extension Finalizer Api table
+  if (table_id == HSA_EXT_FINALIZER_API_TABLE_ID) {
+    finalizer_api = (*(FinalizerExtTable *)ext_table);
+    hsa_api.finalizer_ext_ = &finalizer_api;
+    return;
+  }
-ApiTable::ApiTable() {
-  table.std_exts_ = NULL;
-  Reset();
+  // Update HSA Extension Image Api table
+  if (table_id == HSA_EXT_IMAGE_API_TABLE_ID) {
+    image_api = (*(ImageExtTable *)ext_table);
+    hsa_api.image_ext_ = &image_api;
+    return;
+  }
-void ApiTable::LinkExts(ExtTable* ptr) {
-  assert(ptr != NULL && "Invalid extension table linked.");
-  extension_backup = *ptr;
-  table.std_exts_ = ptr;
+void HsaApiTable::LinkExts(void* ext_table, uint32_t table_id) {
+  assert(ext_table != NULL && "Invalid extension table linked.");
+  // Update HSA Extension Finalizer Api table
+  if (table_id == HSA_EXT_FINALIZER_API_TABLE_ID) {
+    finalizer_api = (*(FinalizerExtTable *)ext_table);
+    hsa_api.finalizer_ext_ = (FinalizerExtTable *)ext_table; 
+    return;
+  }
+  // Update HSA Extension Image Api table
+  if (table_id == HSA_EXT_IMAGE_API_TABLE_ID) {
+    image_api = (*(ImageExtTable *)ext_table);
+    hsa_api.image_ext_ = (ImageExtTable *)ext_table; 
+    return;
+  }
-void ApiTable::Reset() {
-  table.hsa_init_fn = HSA::hsa_init;
-  table.hsa_shut_down_fn = HSA::hsa_shut_down;
-  table.hsa_system_get_info_fn = HSA::hsa_system_get_info;
-  table.hsa_system_extension_supported_fn = HSA::hsa_system_extension_supported;
-  table.hsa_system_get_extension_table_fn = HSA::hsa_system_get_extension_table;
-  table.hsa_iterate_agents_fn = HSA::hsa_iterate_agents;
-  table.hsa_agent_get_info_fn = HSA::hsa_agent_get_info;
-  table.hsa_agent_get_exception_policies_fn =
+// Update Api table for Hsa Core Runtime
+void HsaApiTable::UpdateCore() {
+  // Initialize Version of Api Table
+  core_api.version.major_id = HSA_CORE_API_TABLE_MAJOR_VERSION;
+  core_api.version.minor_id = sizeof(::CoreApiTable);
+  core_api.version.step_id = HSA_CORE_API_TABLE_STEP_VERSION;
+  // Initialize function pointers for the Hsa Core Runtime Api
+  core_api.hsa_init_fn = HSA::hsa_init;
+  core_api.hsa_shut_down_fn = HSA::hsa_shut_down;
+  core_api.hsa_system_get_info_fn = HSA::hsa_system_get_info;
+  core_api.hsa_system_extension_supported_fn = HSA::hsa_system_extension_supported;
+  core_api.hsa_system_get_extension_table_fn = HSA::hsa_system_get_extension_table;
+  core_api.hsa_iterate_agents_fn = HSA::hsa_iterate_agents;
+  core_api.hsa_agent_get_info_fn = HSA::hsa_agent_get_info;
+  core_api.hsa_agent_get_exception_policies_fn =
-  table.hsa_agent_extension_supported_fn = HSA::hsa_agent_extension_supported;
-  table.hsa_queue_create_fn = HSA::hsa_queue_create;
-  table.hsa_soft_queue_create_fn = HSA::hsa_soft_queue_create;
-  table.hsa_queue_destroy_fn = HSA::hsa_queue_destroy;
-  table.hsa_queue_inactivate_fn = HSA::hsa_queue_inactivate;
-  table.hsa_queue_load_read_index_acquire_fn =
+  core_api.hsa_agent_extension_supported_fn = HSA::hsa_agent_extension_supported;
+  core_api.hsa_queue_create_fn = HSA::hsa_queue_create;
+  core_api.hsa_soft_queue_create_fn = HSA::hsa_soft_queue_create;
+  core_api.hsa_queue_destroy_fn = HSA::hsa_queue_destroy;
+  core_api.hsa_queue_inactivate_fn = HSA::hsa_queue_inactivate;
+  core_api.hsa_queue_load_read_index_acquire_fn =
-  table.hsa_queue_load_read_index_relaxed_fn =
+  core_api.hsa_queue_load_read_index_relaxed_fn =
-  table.hsa_queue_load_write_index_acquire_fn =
+  core_api.hsa_queue_load_write_index_acquire_fn =
-  table.hsa_queue_load_write_index_relaxed_fn =
+  core_api.hsa_queue_load_write_index_relaxed_fn =
-  table.hsa_queue_store_write_index_relaxed_fn =
+  core_api.hsa_queue_store_write_index_relaxed_fn =
-  table.hsa_queue_store_write_index_release_fn =
+  core_api.hsa_queue_store_write_index_release_fn =
-  table.hsa_queue_cas_write_index_acq_rel_fn =
+  core_api.hsa_queue_cas_write_index_acq_rel_fn =
-  table.hsa_queue_cas_write_index_acquire_fn =
+  core_api.hsa_queue_cas_write_index_acquire_fn =
-  table.hsa_queue_cas_write_index_relaxed_fn =
+  core_api.hsa_queue_cas_write_index_relaxed_fn =
-  table.hsa_queue_cas_write_index_release_fn =
+  core_api.hsa_queue_cas_write_index_release_fn =
-  table.hsa_queue_add_write_index_acq_rel_fn =
+  core_api.hsa_queue_add_write_index_acq_rel_fn =
-  table.hsa_queue_add_write_index_acquire_fn =
+  core_api.hsa_queue_add_write_index_acquire_fn =
-  table.hsa_queue_add_write_index_relaxed_fn =
+  core_api.hsa_queue_add_write_index_relaxed_fn =
-  table.hsa_queue_add_write_index_release_fn =
+  core_api.hsa_queue_add_write_index_release_fn =
-  table.hsa_queue_store_read_index_relaxed_fn =
+  core_api.hsa_queue_store_read_index_relaxed_fn =
-  table.hsa_queue_store_read_index_release_fn =
+  core_api.hsa_queue_store_read_index_release_fn =
-  table.hsa_agent_iterate_regions_fn = HSA::hsa_agent_iterate_regions;
-  table.hsa_region_get_info_fn = HSA::hsa_region_get_info;
-  table.hsa_memory_register_fn = HSA::hsa_memory_register;
-  table.hsa_memory_deregister_fn = HSA::hsa_memory_deregister;
-  table.hsa_memory_allocate_fn = HSA::hsa_memory_allocate;
-  table.hsa_memory_free_fn = HSA::hsa_memory_free;
-  table.hsa_memory_copy_fn = HSA::hsa_memory_copy;
-  table.hsa_memory_assign_agent_fn = HSA::hsa_memory_assign_agent;
-  table.hsa_signal_create_fn = HSA::hsa_signal_create;
-  table.hsa_signal_destroy_fn = HSA::hsa_signal_destroy;
-  table.hsa_signal_load_relaxed_fn = HSA::hsa_signal_load_relaxed;
-  table.hsa_signal_load_acquire_fn = HSA::hsa_signal_load_acquire;
-  table.hsa_signal_store_relaxed_fn = HSA::hsa_signal_store_relaxed;
-  table.hsa_signal_store_release_fn = HSA::hsa_signal_store_release;
-  table.hsa_signal_wait_relaxed_fn = HSA::hsa_signal_wait_relaxed;
-  table.hsa_signal_wait_acquire_fn = HSA::hsa_signal_wait_acquire;
-  table.hsa_signal_and_relaxed_fn = HSA::hsa_signal_and_relaxed;
-  table.hsa_signal_and_acquire_fn = HSA::hsa_signal_and_acquire;
-  table.hsa_signal_and_release_fn = HSA::hsa_signal_and_release;
-  table.hsa_signal_and_acq_rel_fn = HSA::hsa_signal_and_acq_rel;
-  table.hsa_signal_or_relaxed_fn = HSA::hsa_signal_or_relaxed;
-  table.hsa_signal_or_acquire_fn = HSA::hsa_signal_or_acquire;
-  table.hsa_signal_or_release_fn = HSA::hsa_signal_or_release;
-  table.hsa_signal_or_acq_rel_fn = HSA::hsa_signal_or_acq_rel;
-  table.hsa_signal_xor_relaxed_fn = HSA::hsa_signal_xor_relaxed;
-  table.hsa_signal_xor_acquire_fn = HSA::hsa_signal_xor_acquire;
-  table.hsa_signal_xor_release_fn = HSA::hsa_signal_xor_release;
-  table.hsa_signal_xor_acq_rel_fn = HSA::hsa_signal_xor_acq_rel;
-  table.hsa_signal_exchange_relaxed_fn = HSA::hsa_signal_exchange_relaxed;
-  table.hsa_signal_exchange_acquire_fn = HSA::hsa_signal_exchange_acquire;
-  table.hsa_signal_exchange_release_fn = HSA::hsa_signal_exchange_release;
-  table.hsa_signal_exchange_acq_rel_fn = HSA::hsa_signal_exchange_acq_rel;
-  table.hsa_signal_add_relaxed_fn = HSA::hsa_signal_add_relaxed;
-  table.hsa_signal_add_acquire_fn = HSA::hsa_signal_add_acquire;
-  table.hsa_signal_add_release_fn = HSA::hsa_signal_add_release;
-  table.hsa_signal_add_acq_rel_fn = HSA::hsa_signal_add_acq_rel;
-  table.hsa_signal_subtract_relaxed_fn = HSA::hsa_signal_subtract_relaxed;
-  table.hsa_signal_subtract_acquire_fn = HSA::hsa_signal_subtract_acquire;
-  table.hsa_signal_subtract_release_fn = HSA::hsa_signal_subtract_release;
-  table.hsa_signal_subtract_acq_rel_fn = HSA::hsa_signal_subtract_acq_rel;
-  table.hsa_signal_cas_relaxed_fn = HSA::hsa_signal_cas_relaxed;
-  table.hsa_signal_cas_acquire_fn = HSA::hsa_signal_cas_acquire;
-  table.hsa_signal_cas_release_fn = HSA::hsa_signal_cas_release;
-  table.hsa_signal_cas_acq_rel_fn = HSA::hsa_signal_cas_acq_rel;
-  table.hsa_isa_from_name_fn = HSA::hsa_isa_from_name;
-  table.hsa_isa_get_info_fn = HSA::hsa_isa_get_info;
-  table.hsa_isa_compatible_fn = HSA::hsa_isa_compatible;
-  table.hsa_code_object_serialize_fn = HSA::hsa_code_object_serialize;
-  table.hsa_code_object_deserialize_fn = HSA::hsa_code_object_deserialize;
-  table.hsa_code_object_destroy_fn = HSA::hsa_code_object_destroy;
-  table.hsa_code_object_get_info_fn = HSA::hsa_code_object_get_info;
-  table.hsa_code_object_get_symbol_fn = HSA::hsa_code_object_get_symbol;
-  table.hsa_code_symbol_get_info_fn = HSA::hsa_code_symbol_get_info;
-  table.hsa_code_object_iterate_symbols_fn =
+  core_api.hsa_agent_iterate_regions_fn = HSA::hsa_agent_iterate_regions;
+  core_api.hsa_region_get_info_fn = HSA::hsa_region_get_info;
+  core_api.hsa_memory_register_fn = HSA::hsa_memory_register;
+  core_api.hsa_memory_deregister_fn = HSA::hsa_memory_deregister;
+  core_api.hsa_memory_allocate_fn = HSA::hsa_memory_allocate;
+  core_api.hsa_memory_free_fn = HSA::hsa_memory_free;
+  core_api.hsa_memory_copy_fn = HSA::hsa_memory_copy;
+  core_api.hsa_memory_assign_agent_fn = HSA::hsa_memory_assign_agent;
+  core_api.hsa_signal_create_fn = HSA::hsa_signal_create;
+  core_api.hsa_signal_destroy_fn = HSA::hsa_signal_destroy;
+  core_api.hsa_signal_load_relaxed_fn = HSA::hsa_signal_load_relaxed;
+  core_api.hsa_signal_load_acquire_fn = HSA::hsa_signal_load_acquire;
+  core_api.hsa_signal_store_relaxed_fn = HSA::hsa_signal_store_relaxed;
+  core_api.hsa_signal_store_release_fn = HSA::hsa_signal_store_release;
+  core_api.hsa_signal_wait_relaxed_fn = HSA::hsa_signal_wait_relaxed;
+  core_api.hsa_signal_wait_acquire_fn = HSA::hsa_signal_wait_acquire;
+  core_api.hsa_signal_and_relaxed_fn = HSA::hsa_signal_and_relaxed;
+  core_api.hsa_signal_and_acquire_fn = HSA::hsa_signal_and_acquire;
+  core_api.hsa_signal_and_release_fn = HSA::hsa_signal_and_release;
+  core_api.hsa_signal_and_acq_rel_fn = HSA::hsa_signal_and_acq_rel;
+  core_api.hsa_signal_or_relaxed_fn = HSA::hsa_signal_or_relaxed;
+  core_api.hsa_signal_or_acquire_fn = HSA::hsa_signal_or_acquire;
+  core_api.hsa_signal_or_release_fn = HSA::hsa_signal_or_release;
+  core_api.hsa_signal_or_acq_rel_fn = HSA::hsa_signal_or_acq_rel;
+  core_api.hsa_signal_xor_relaxed_fn = HSA::hsa_signal_xor_relaxed;
+  core_api.hsa_signal_xor_acquire_fn = HSA::hsa_signal_xor_acquire;
+  core_api.hsa_signal_xor_release_fn = HSA::hsa_signal_xor_release;
+  core_api.hsa_signal_xor_acq_rel_fn = HSA::hsa_signal_xor_acq_rel;
+  core_api.hsa_signal_exchange_relaxed_fn = HSA::hsa_signal_exchange_relaxed;
+  core_api.hsa_signal_exchange_acquire_fn = HSA::hsa_signal_exchange_acquire;
+  core_api.hsa_signal_exchange_release_fn = HSA::hsa_signal_exchange_release;
+  core_api.hsa_signal_exchange_acq_rel_fn = HSA::hsa_signal_exchange_acq_rel;
+  core_api.hsa_signal_add_relaxed_fn = HSA::hsa_signal_add_relaxed;
+  core_api.hsa_signal_add_acquire_fn = HSA::hsa_signal_add_acquire;
+  core_api.hsa_signal_add_release_fn = HSA::hsa_signal_add_release;
+  core_api.hsa_signal_add_acq_rel_fn = HSA::hsa_signal_add_acq_rel;
+  core_api.hsa_signal_subtract_relaxed_fn = HSA::hsa_signal_subtract_relaxed;
+  core_api.hsa_signal_subtract_acquire_fn = HSA::hsa_signal_subtract_acquire;
+  core_api.hsa_signal_subtract_release_fn = HSA::hsa_signal_subtract_release;
+  core_api.hsa_signal_subtract_acq_rel_fn = HSA::hsa_signal_subtract_acq_rel;
+  core_api.hsa_signal_cas_relaxed_fn = HSA::hsa_signal_cas_relaxed;
+  core_api.hsa_signal_cas_acquire_fn = HSA::hsa_signal_cas_acquire;
+  core_api.hsa_signal_cas_release_fn = HSA::hsa_signal_cas_release;
+  core_api.hsa_signal_cas_acq_rel_fn = HSA::hsa_signal_cas_acq_rel;
+  core_api.hsa_isa_from_name_fn = HSA::hsa_isa_from_name;
+  core_api.hsa_isa_get_info_fn = HSA::hsa_isa_get_info;
+  core_api.hsa_isa_compatible_fn = HSA::hsa_isa_compatible;
+  core_api.hsa_code_object_serialize_fn = HSA::hsa_code_object_serialize;
+  core_api.hsa_code_object_deserialize_fn = HSA::hsa_code_object_deserialize;
+  core_api.hsa_code_object_destroy_fn = HSA::hsa_code_object_destroy;
+  core_api.hsa_code_object_get_info_fn = HSA::hsa_code_object_get_info;
+  core_api.hsa_code_object_get_symbol_fn = HSA::hsa_code_object_get_symbol;
+  core_api.hsa_code_symbol_get_info_fn = HSA::hsa_code_symbol_get_info;
+  core_api.hsa_code_object_iterate_symbols_fn =
-  table.hsa_executable_create_fn = HSA::hsa_executable_create;
-  table.hsa_executable_destroy_fn = HSA::hsa_executable_destroy;
-  table.hsa_executable_load_code_object_fn =
+  core_api.hsa_executable_create_fn = HSA::hsa_executable_create;
+  core_api.hsa_executable_destroy_fn = HSA::hsa_executable_destroy;
+  core_api.hsa_executable_load_code_object_fn =
-  table.hsa_executable_freeze_fn = HSA::hsa_executable_freeze;
-  table.hsa_executable_get_info_fn = HSA::hsa_executable_get_info;
-  table.hsa_executable_global_variable_define_fn =
+  core_api.hsa_executable_freeze_fn = HSA::hsa_executable_freeze;
+  core_api.hsa_executable_get_info_fn = HSA::hsa_executable_get_info;
+  core_api.hsa_executable_global_variable_define_fn =
-  table.hsa_executable_agent_global_variable_define_fn =
+  core_api.hsa_executable_agent_global_variable_define_fn =
-  table.hsa_executable_readonly_variable_define_fn =
+  core_api.hsa_executable_readonly_variable_define_fn =
-  table.hsa_executable_validate_fn = HSA::hsa_executable_validate;
-  table.hsa_executable_get_symbol_fn = HSA::hsa_executable_get_symbol;
-  table.hsa_executable_symbol_get_info_fn = HSA::hsa_executable_symbol_get_info;
-  table.hsa_executable_iterate_symbols_fn = HSA::hsa_executable_iterate_symbols;
-  table.hsa_status_string_fn = HSA::hsa_status_string;
+  core_api.hsa_executable_validate_fn = HSA::hsa_executable_validate;
+  core_api.hsa_executable_get_symbol_fn = HSA::hsa_executable_get_symbol;
+  core_api.hsa_executable_symbol_get_info_fn = HSA::hsa_executable_symbol_get_info;
+  core_api.hsa_executable_iterate_symbols_fn = HSA::hsa_executable_iterate_symbols;
+  core_api.hsa_status_string_fn = HSA::hsa_status_string;
-  if (table.std_exts_ != NULL) *table.std_exts_ = extension_backup;
+// Update Api table for Amd Extensions.
+// @note: Current implementation will initialize the
+// member variable hsa_amd_image_create_fn while loading
+// Image extension library
+void HsaApiTable::UpdateAmdExts() {
+  // Initialize Version of Api Table
+  amd_ext_api.version.major_id = HSA_AMD_EXT_API_TABLE_MAJOR_VERSION;
+  amd_ext_api.version.minor_id = sizeof(::AmdExtTable);
+  amd_ext_api.version.step_id = HSA_AMD_EXT_API_TABLE_STEP_VERSION;
+  // Initialize function pointers for the Amd Extension Api
+  amd_ext_api.hsa_amd_coherency_get_type_fn = AMD::hsa_amd_coherency_get_type;
+  amd_ext_api.hsa_amd_coherency_set_type_fn = AMD::hsa_amd_coherency_set_type;
+  amd_ext_api.hsa_amd_profiling_set_profiler_enabled_fn = AMD::hsa_amd_profiling_set_profiler_enabled;
+  amd_ext_api.hsa_amd_profiling_async_copy_enable_fn = AMD::hsa_amd_profiling_async_copy_enable;
+  amd_ext_api.hsa_amd_profiling_get_dispatch_time_fn = AMD::hsa_amd_profiling_get_dispatch_time;
+  amd_ext_api.hsa_amd_profiling_get_async_copy_time_fn = AMD::hsa_amd_profiling_get_async_copy_time;
+  amd_ext_api.hsa_amd_profiling_convert_tick_to_system_domain_fn = AMD::hsa_amd_profiling_convert_tick_to_system_domain;
+  amd_ext_api.hsa_amd_signal_async_handler_fn = AMD::hsa_amd_signal_async_handler;
+  amd_ext_api.hsa_amd_async_function_fn = AMD::hsa_amd_async_function;
+  amd_ext_api.hsa_amd_signal_wait_any_fn = AMD::hsa_amd_signal_wait_any;
+  amd_ext_api.hsa_amd_queue_cu_set_mask_fn = AMD::hsa_amd_queue_cu_set_mask;
+  amd_ext_api.hsa_amd_memory_pool_get_info_fn = AMD::hsa_amd_memory_pool_get_info;
+  amd_ext_api.hsa_amd_agent_iterate_memory_pools_fn = AMD::hsa_amd_agent_iterate_memory_pools;
+  amd_ext_api.hsa_amd_memory_pool_allocate_fn = AMD::hsa_amd_memory_pool_allocate;
+  amd_ext_api.hsa_amd_memory_pool_free_fn = AMD::hsa_amd_memory_pool_free;
+  amd_ext_api.hsa_amd_memory_async_copy_fn = AMD::hsa_amd_memory_async_copy;
+  amd_ext_api.hsa_amd_agent_memory_pool_get_info_fn = AMD::hsa_amd_agent_memory_pool_get_info;
+  amd_ext_api.hsa_amd_agents_allow_access_fn = AMD::hsa_amd_agents_allow_access;
+  amd_ext_api.hsa_amd_memory_pool_can_migrate_fn = AMD::hsa_amd_memory_pool_can_migrate;
+  amd_ext_api.hsa_amd_memory_migrate_fn = AMD::hsa_amd_memory_migrate;
+  amd_ext_api.hsa_amd_memory_lock_fn = AMD::hsa_amd_memory_lock;
+  amd_ext_api.hsa_amd_memory_unlock_fn = AMD::hsa_amd_memory_unlock;
+  amd_ext_api.hsa_amd_memory_fill_fn = AMD::hsa_amd_memory_fill;
+  amd_ext_api.hsa_amd_interop_map_buffer_fn = AMD::hsa_amd_interop_map_buffer;
+  amd_ext_api.hsa_amd_interop_unmap_buffer_fn = AMD::hsa_amd_interop_unmap_buffer;
 class Init {
-  Init() { hsa_table_interface_init(&hsa_api_table_.table); }
+  Init() { hsa_table_interface_init(&hsa_api_table_.hsa_api); }
 static Init LinkAtLoad;
diff --git a/src/core/runtime/hsa_ext_amd.cpp b/src/core/runtime/hsa_ext_amd.cpp
index 9394c3006..a31b5a6c3 100644
--- a/src/core/runtime/hsa_ext_amd.cpp
+++ b/src/core/runtime/hsa_ext_amd.cpp
@@ -109,7 +109,9 @@ static __forceinline bool IsValid(T* ptr) {
   return (ptr == NULL) ? NULL : ptr->IsValid();
-hsa_status_t HSA_API
+namespace AMD {
     hsa_amd_coherency_get_type(hsa_agent_t agent_handle,
                                hsa_amd_coherency_type_t* type) {
@@ -132,7 +134,7 @@ hsa_status_t HSA_API
-hsa_status_t HSA_API hsa_amd_coherency_set_type(hsa_agent_t agent_handle,
+hsa_status_t hsa_amd_coherency_set_type(hsa_agent_t agent_handle,
                                                 hsa_amd_coherency_type_t type) {
@@ -158,7 +160,7 @@ hsa_status_t HSA_API hsa_amd_coherency_set_type(hsa_agent_t agent_handle,
-hsa_status_t HSA_API
     hsa_amd_memory_fill(void* ptr, uint32_t value, size_t count) {
@@ -173,7 +175,7 @@ hsa_status_t HSA_API
   return core::Runtime::runtime_singleton_->FillMemory(ptr, value, count);
-hsa_status_t HSA_API
     hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent_handle,
                               const void* src, hsa_agent_t src_agent_handle,
                               size_t size, uint32_t num_dep_signals,
@@ -215,7 +217,7 @@ hsa_status_t HSA_API
-hsa_status_t HSA_API
     hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable) {
@@ -229,7 +231,18 @@ hsa_status_t HSA_API
-hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time(
+hsa_status_t hsa_amd_profiling_async_copy_enable(bool enable) {
+  IS_OPEN();
+  return core::Runtime::runtime_singleton_->IterateAgent(
+      [](hsa_agent_t agent_handle, void* data) -> hsa_status_t {
+        const bool enable = *(reinterpret_cast<bool*>(data));
+        return core::Agent::Convert(agent_handle)->profiling_enabled(enable);
+      },
+      reinterpret_cast<void*>(&enable));
+hsa_status_t hsa_amd_profiling_get_dispatch_time(
     hsa_agent_t agent_handle, hsa_signal_t hsa_signal,
     hsa_amd_profiling_dispatch_time_t* time) {
@@ -250,12 +263,41 @@ hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time(
   amd::GpuAgentInt* gpu_agent = static_cast<amd::GpuAgentInt*>(agent);
+  // Translate timestamp from GPU to system domain.
   gpu_agent->TranslateTime(signal, *time);
-hsa_status_t HSA_API
+hsa_status_t hsa_amd_profiling_get_async_copy_time(
+    hsa_signal_t hsa_signal, hsa_amd_profiling_async_copy_time_t* time) {
+  IS_OPEN();
+  IS_BAD_PTR(time);
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  IS_VALID(signal);
+  core::Agent* agent = signal->async_copy_agent();
+  if (agent == NULL) {
+    return HSA_STATUS_ERROR;
+  }
+  if (agent->device_type() == core::Agent::DeviceType::kAmdGpuDevice) {
+    // Translate timestamp from GPU to system domain.
+    static_cast<amd::GpuAgentInt*>(agent)->TranslateTime(signal, *time);
+  }
+  // The timestamp is already in system domain.
+  time->start = signal->signal_.start_ts;
+  time->end = signal->signal_.end_ts;
     hsa_amd_profiling_convert_tick_to_system_domain(hsa_agent_t agent_handle,
                                                     uint64_t agent_tick,
                                                     uint64_t* system_tick) {
@@ -278,7 +320,7 @@ hsa_status_t HSA_API
-uint32_t HSA_API
     hsa_amd_signal_wait_any(uint32_t signal_count, hsa_signal_t* hsa_signals,
                             hsa_signal_condition_t* conds,
                             hsa_signal_value_t* values, uint64_t timeout_hint,
@@ -296,7 +338,7 @@ uint32_t HSA_API
                                timeout_hint, wait_hint, satisfying_value);
-hsa_status_t HSA_API
     hsa_amd_signal_async_handler(hsa_signal_t hsa_signal,
                                  hsa_signal_condition_t cond,
                                  hsa_signal_value_t value,
@@ -312,7 +354,7 @@ hsa_status_t HSA_API
       hsa_signal, cond, value, handler, arg);
-hsa_status_t HSA_API
     hsa_amd_async_function(void (*callback)(void* arg), void* arg) {
@@ -323,7 +365,7 @@ hsa_status_t HSA_API
-hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
+hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
                                                uint32_t num_cu_mask_count,
                                                const uint32_t* cu_mask) {
@@ -334,7 +376,7 @@ hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
   return cmd_queue->SetCUMasking(num_cu_mask_count, cu_mask);
-hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size,
+hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size,
                                          hsa_agent_t* agents, int num_agent,
                                          void** agent_ptr) {
   *agent_ptr = NULL;
@@ -357,7 +399,7 @@ hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size,
   return system_region->Lock(num_agent, agents, host_ptr, size, agent_ptr);
-hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr) {
+hsa_status_t hsa_amd_memory_unlock(void* host_ptr) {
   const amd::MemoryRegion* system_region =
@@ -367,7 +409,7 @@ hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr) {
   return system_region->Unlock(host_ptr);
-hsa_status_t HSA_API
     hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool,
                                  hsa_amd_memory_pool_info_t attribute,
                                  void* value) {
@@ -383,7 +425,7 @@ hsa_status_t HSA_API
   return mem_region->GetPoolInfo(attribute, value);
-hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools(
+hsa_status_t hsa_amd_agent_iterate_memory_pools(
     hsa_agent_t agent_handle,
     hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void* data),
     void* data) {
@@ -406,7 +448,7 @@ hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools(
-hsa_status_t HSA_API
     hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, size_t size,
                                  uint32_t flags, void** ptr) {
@@ -426,11 +468,11 @@ hsa_status_t HSA_API
                                                            size, ptr);
-hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr) {
+hsa_status_t hsa_amd_memory_pool_free(void* ptr) {
   return HSA::hsa_memory_free(ptr);
-hsa_status_t HSA_API
     hsa_amd_agents_allow_access(uint32_t num_agents, const hsa_agent_t* agents,
                                 const uint32_t* flags, const void* ptr) {
@@ -443,7 +485,7 @@ hsa_status_t HSA_API
-hsa_status_t HSA_API
     hsa_amd_memory_pool_can_migrate(hsa_amd_memory_pool_t src_memory_pool,
                                     hsa_amd_memory_pool_t dst_memory_pool,
                                     bool* result) {
@@ -472,7 +514,7 @@ hsa_status_t HSA_API
   return src_mem_region->CanMigrate(*dst_mem_region, *result);
-hsa_status_t HSA_API hsa_amd_memory_migrate(const void* ptr,
+hsa_status_t hsa_amd_memory_migrate(const void* ptr,
                                             hsa_amd_memory_pool_t memory_pool,
                                             uint32_t flags) {
@@ -492,7 +534,7 @@ hsa_status_t HSA_API hsa_amd_memory_migrate(const void* ptr,
   return dst_mem_region->Migrate(flags, ptr);
-hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info(
+hsa_status_t hsa_amd_agent_memory_pool_get_info(
     hsa_agent_t agent_handle, hsa_amd_memory_pool_t memory_pool,
     hsa_amd_agent_memory_pool_info_t attribute, void* value) {
@@ -553,3 +595,5 @@ hsa_status_t hsa_amd_interop_unmap_buffer(void* ptr) {
   if (ptr != NULL) core::Runtime::runtime_singleton_->InteropUnmap(ptr);
+} // end of AMD namespace
diff --git a/src/core/runtime/hsa_ext_interface.cpp b/src/core/runtime/hsa_ext_interface.cpp
index 3aa9f5c04..c8d8bf541 100644
--- a/src/core/runtime/hsa_ext_interface.cpp
+++ b/src/core/runtime/hsa_ext_interface.cpp
@@ -161,28 +161,75 @@ static T0 hsa_ext_null(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
                        T14, T15, T16, T17, T18, T19, T20) {
+template <class T0, class T1, class T2, class T3, class T4, class T5, class T6>
+static T0 hsa_amd_null(T1, T2, T3, T4, T5, T6) {
-ExtensionEntryPoints::ExtensionEntryPoints() { InitTable(); }
-void ExtensionEntryPoints::InitTable() {
-  table.hsa_ext_program_create_fn = hsa_ext_null;
-  table.hsa_ext_program_destroy_fn = hsa_ext_null;
-  table.hsa_ext_program_add_module_fn = hsa_ext_null;
-  table.hsa_ext_program_iterate_modules_fn = hsa_ext_null;
-  table.hsa_ext_program_get_info_fn = hsa_ext_null;
-  table.hsa_ext_program_finalize_fn = hsa_ext_null;
-  table.hsa_ext_image_get_capability_fn = hsa_ext_null;
-  table.hsa_ext_image_data_get_info_fn = hsa_ext_null;
-  table.hsa_ext_image_create_fn = hsa_ext_null;
-  table.hsa_ext_image_import_fn = hsa_ext_null;
-  table.hsa_ext_image_export_fn = hsa_ext_null;
-  table.hsa_ext_image_copy_fn = hsa_ext_null;
-  table.hsa_ext_image_clear_fn = hsa_ext_null;
-  table.hsa_ext_image_destroy_fn = hsa_ext_null;
-  table.hsa_ext_sampler_create_fn = hsa_ext_null;
-  table.hsa_ext_sampler_destroy_fn = hsa_ext_null;
-  table.hsa_amd_image_get_info_max_dim_fn = hsa_ext_null;
-  table.hsa_amd_image_create_fn = hsa_ext_null;
+ExtensionEntryPoints::ExtensionEntryPoints() {
+  InitFinalizerExtTable();
+  InitImageExtTable();
+  InitAmdExtTable();
+// Initialize Finalizer function table entries to the hsa_ext_null placeholder
+void ExtensionEntryPoints::InitFinalizerExtTable() {
+  // Initialize Version of Api Table
+  finalizer_api.version.major_id = 0x00;
+  finalizer_api.version.minor_id = 0x00;
+  finalizer_api.version.step_id = 0x00;
+  finalizer_api.hsa_ext_program_create_fn = hsa_ext_null;
+  finalizer_api.hsa_ext_program_destroy_fn = hsa_ext_null;
+  finalizer_api.hsa_ext_program_add_module_fn = hsa_ext_null;
+  finalizer_api.hsa_ext_program_iterate_modules_fn = hsa_ext_null;
+  finalizer_api.hsa_ext_program_get_info_fn = hsa_ext_null;
+  finalizer_api.hsa_ext_program_finalize_fn = hsa_ext_null;
+// Initialize Image function table entries to the hsa_ext_null placeholder
+void ExtensionEntryPoints::InitImageExtTable() {
+  // Initialize Version of Api Table
+  image_api.version.major_id = 0x00;
+  image_api.version.minor_id = 0x00;
+  image_api.version.step_id = 0x00;
+  image_api.hsa_ext_image_get_capability_fn = hsa_ext_null;
+  image_api.hsa_ext_image_data_get_info_fn = hsa_ext_null;
+  image_api.hsa_ext_image_create_fn = hsa_ext_null;
+  image_api.hsa_ext_image_import_fn = hsa_ext_null;
+  image_api.hsa_ext_image_export_fn = hsa_ext_null;
+  image_api.hsa_ext_image_copy_fn = hsa_ext_null;
+  image_api.hsa_ext_image_clear_fn = hsa_ext_null;
+  image_api.hsa_ext_image_destroy_fn = hsa_ext_null;
+  image_api.hsa_ext_sampler_create_fn = hsa_ext_null;
+  image_api.hsa_ext_sampler_destroy_fn = hsa_ext_null;
+  image_api.hsa_amd_image_get_info_max_dim_fn = hsa_ext_null;
+// Initialize Amd Ext table for Api related to Images
+void ExtensionEntryPoints::InitAmdExtTable() {
+  hsa_api_table_.amd_ext_api.hsa_amd_image_create_fn = hsa_ext_null;
+  hsa_internal_api_table_.amd_ext_api.hsa_amd_image_create_fn = hsa_ext_null;
+// Update Amd Ext table for Api related to Images.
+// @note: Interface should be updated when Amd Ext table
+// begins hosting Apis from other extension libraries
+void ExtensionEntryPoints::UpdateAmdExtTable(void *func_ptr) {
+  assert(hsa_api_table_.amd_ext_api.hsa_amd_image_create_fn ==
+             (decltype(::hsa_amd_image_create)*)hsa_ext_null && 
+             "Duplicate load of extension import.");
+  assert(hsa_internal_api_table_.amd_ext_api.hsa_amd_image_create_fn ==
+             (decltype(::hsa_amd_image_create)*)hsa_ext_null && 
+             "Duplicate load of extension import.");
+  hsa_api_table_.amd_ext_api.hsa_amd_image_create_fn = 
+             (decltype(::hsa_amd_image_create)*)func_ptr;
+  hsa_internal_api_table_.amd_ext_api.hsa_amd_image_create_fn = 
+             (decltype(::hsa_amd_image_create)*)func_ptr;
 void ExtensionEntryPoints::Unload() {
@@ -200,182 +247,217 @@ void ExtensionEntryPoints::Unload() {
-  InitTable();
+  InitFinalizerExtTable();
+  InitImageExtTable();
+  InitAmdExtTable();
+  core::hsa_internal_api_table_.Reset();
-bool ExtensionEntryPoints::Load(std::string library_name) {
+bool ExtensionEntryPoints::LoadImage(std::string library_name) {
   os::LibHandle lib = os::LoadLib(library_name);
   if (lib == NULL) {
     return false;
   void* ptr;
-  ptr = os::GetExportAddress(lib, "hsa_ext_program_create_impl");
-  if (ptr != NULL) {
-    assert(table.hsa_ext_program_create_fn ==
-               (decltype(::hsa_ext_program_create)*)hsa_ext_null &&
-           "Duplicate load of extension import.");
-    table.hsa_ext_program_create_fn = (decltype(::hsa_ext_program_create)*)ptr;
-  }
-  ptr = os::GetExportAddress(lib, "hsa_ext_program_destroy_impl");
-  if (ptr != NULL) {
-    assert(table.hsa_ext_program_destroy_fn ==
-               (decltype(::hsa_ext_program_destroy)*)hsa_ext_null &&
-           "Duplicate load of extension import.");
-    table.hsa_ext_program_destroy_fn =
-        (decltype(::hsa_ext_program_destroy)*)ptr;
-  }
-  ptr = os::GetExportAddress(lib, "hsa_ext_program_add_module_impl");
-  if (ptr != NULL) {
-    assert(table.hsa_ext_program_add_module_fn ==
-               (decltype(::hsa_ext_program_add_module)*)hsa_ext_null &&
-           "Duplicate load of extension import.");
-    table.hsa_ext_program_add_module_fn =
-        (decltype(::hsa_ext_program_add_module)*)ptr;
-  }
-  ptr = os::GetExportAddress(lib, "hsa_ext_program_iterate_modules_impl");
-  if (ptr != NULL) {
-    assert(table.hsa_ext_program_iterate_modules_fn ==
-               (decltype(::hsa_ext_program_iterate_modules)*)hsa_ext_null &&
-           "Duplicate load of extension import.");
-    table.hsa_ext_program_iterate_modules_fn =
-        (decltype(::hsa_ext_program_iterate_modules)*)ptr;
-  }
-  ptr = os::GetExportAddress(lib, "hsa_ext_program_get_info_impl");
-  if (ptr != NULL) {
-    assert(table.hsa_ext_program_get_info_fn ==
-               (decltype(::hsa_ext_program_get_info)*)hsa_ext_null &&
-           "Duplicate load of extension import.");
-    table.hsa_ext_program_get_info_fn =
-        (decltype(::hsa_ext_program_get_info)*)ptr;
-  }
-  ptr = os::GetExportAddress(lib, "hsa_ext_program_finalize_impl");
-  if (ptr != NULL) {
-    assert(table.hsa_ext_program_finalize_fn ==
-               (decltype(::hsa_ext_program_finalize)*)hsa_ext_null &&
-           "Duplicate load of extension import.");
-    table.hsa_ext_program_finalize_fn =
-        (decltype(::hsa_ext_program_finalize)*)ptr;
-  }
   ptr = os::GetExportAddress(lib, "hsa_ext_image_get_capability_impl");
+  bool libIsImage = (ptr != NULL);
   if (ptr != NULL) {
-    assert(table.hsa_ext_image_get_capability_fn ==
+    assert(image_api.hsa_ext_image_get_capability_fn ==
                (decltype(::hsa_ext_image_get_capability)*)hsa_ext_null &&
            "Duplicate load of extension import.");
-    table.hsa_ext_image_get_capability_fn =
+    image_api.hsa_ext_image_get_capability_fn =
   ptr = os::GetExportAddress(lib, "hsa_ext_image_data_get_info_impl");
   if (ptr != NULL) {
-    assert(table.hsa_ext_image_data_get_info_fn ==
+    assert(image_api.hsa_ext_image_data_get_info_fn ==
                (decltype(::hsa_ext_image_data_get_info)*)hsa_ext_null &&
            "Duplicate load of extension import.");
-    table.hsa_ext_image_data_get_info_fn =
+    image_api.hsa_ext_image_data_get_info_fn =
   ptr = os::GetExportAddress(lib, "hsa_ext_image_create_impl");
   if (ptr != NULL) {
-    assert(table.hsa_ext_image_create_fn ==
+    assert(image_api.hsa_ext_image_create_fn ==
                (decltype(::hsa_ext_image_create)*)hsa_ext_null &&
            "Duplicate load of extension import.");
-    table.hsa_ext_image_create_fn = (decltype(::hsa_ext_image_create)*)ptr;
+    image_api.hsa_ext_image_create_fn = (decltype(::hsa_ext_image_create)*)ptr;
   ptr = os::GetExportAddress(lib, "hsa_ext_image_import_impl");
   if (ptr != NULL) {
-    assert(table.hsa_ext_image_import_fn ==
+    assert(image_api.hsa_ext_image_import_fn ==
                (decltype(::hsa_ext_image_import)*)hsa_ext_null &&
            "Duplicate load of extension import.");
-    table.hsa_ext_image_import_fn = (decltype(::hsa_ext_image_import)*)ptr;
+    image_api.hsa_ext_image_import_fn = (decltype(::hsa_ext_image_import)*)ptr;
   ptr = os::GetExportAddress(lib, "hsa_ext_image_export_impl");
   if (ptr != NULL) {
-    assert(table.hsa_ext_image_export_fn ==
+    assert(image_api.hsa_ext_image_export_fn ==
                (decltype(::hsa_ext_image_export)*)hsa_ext_null &&
            "Duplicate load of extension import.");
-    table.hsa_ext_image_export_fn = (decltype(::hsa_ext_image_export)*)ptr;
+    image_api.hsa_ext_image_export_fn = (decltype(::hsa_ext_image_export)*)ptr;
   ptr = os::GetExportAddress(lib, "hsa_ext_image_copy_impl");
   if (ptr != NULL) {
-    assert(table.hsa_ext_image_copy_fn ==
+    assert(image_api.hsa_ext_image_copy_fn ==
                (decltype(::hsa_ext_image_copy)*)hsa_ext_null &&
            "Duplicate load of extension import.");
-    table.hsa_ext_image_copy_fn = (decltype(::hsa_ext_image_copy)*)ptr;
+    image_api.hsa_ext_image_copy_fn = (decltype(::hsa_ext_image_copy)*)ptr;
   ptr = os::GetExportAddress(lib, "hsa_ext_image_clear_impl");
   if (ptr != NULL) {
-    assert(table.hsa_ext_image_clear_fn ==
+    assert(image_api.hsa_ext_image_clear_fn ==
                (decltype(::hsa_ext_image_clear)*)hsa_ext_null &&
            "Duplicate load of extension import.");
-    table.hsa_ext_image_clear_fn = (decltype(::hsa_ext_image_clear)*)ptr;
+    image_api.hsa_ext_image_clear_fn = (decltype(::hsa_ext_image_clear)*)ptr;
   ptr = os::GetExportAddress(lib, "hsa_ext_image_destroy_impl");
   if (ptr != NULL) {
-    assert(table.hsa_ext_image_destroy_fn ==
+    assert(image_api.hsa_ext_image_destroy_fn ==
                (decltype(::hsa_ext_image_destroy)*)hsa_ext_null &&
            "Duplicate load of extension import.");
-    table.hsa_ext_image_destroy_fn = (decltype(::hsa_ext_image_destroy)*)ptr;
+    image_api.hsa_ext_image_destroy_fn = (decltype(::hsa_ext_image_destroy)*)ptr;
   ptr = os::GetExportAddress(lib, "hsa_ext_sampler_create_impl");
   if (ptr != NULL) {
-    assert(table.hsa_ext_sampler_create_fn ==
+    assert(image_api.hsa_ext_sampler_create_fn ==
                (decltype(::hsa_ext_sampler_create)*)hsa_ext_null &&
            "Duplicate load of extension import.");
-    table.hsa_ext_sampler_create_fn = (decltype(::hsa_ext_sampler_create)*)ptr;
+    image_api.hsa_ext_sampler_create_fn = (decltype(::hsa_ext_sampler_create)*)ptr;
   ptr = os::GetExportAddress(lib, "hsa_ext_sampler_destroy_impl");
   if (ptr != NULL) {
-    assert(table.hsa_ext_sampler_destroy_fn ==
+    assert(image_api.hsa_ext_sampler_destroy_fn ==
                (decltype(::hsa_ext_sampler_destroy)*)hsa_ext_null &&
            "Duplicate load of extension import.");
-    table.hsa_ext_sampler_destroy_fn =
+    image_api.hsa_ext_sampler_destroy_fn =
   ptr = os::GetExportAddress(lib, "hsa_amd_image_get_info_max_dim_impl");
   if (ptr != NULL) {
-    assert(table.hsa_amd_image_get_info_max_dim_fn ==
+    assert(image_api.hsa_amd_image_get_info_max_dim_fn ==
                (decltype(::hsa_amd_image_get_info_max_dim)*)hsa_ext_null &&
            "Duplicate load of extension import.");
-    table.hsa_amd_image_get_info_max_dim_fn =
+    image_api.hsa_amd_image_get_info_max_dim_fn =
   ptr = os::GetExportAddress(lib, "hsa_amd_image_create_impl");
   if (ptr != NULL) {
-    assert(table.hsa_amd_image_create_fn ==
-               (decltype(::hsa_amd_image_create)*)hsa_ext_null &&
+    UpdateAmdExtTable(ptr);
+  }
+  // Initialize Version of Api Table
+  image_api.version.major_id = HSA_IMAGE_API_TABLE_MAJOR_VERSION;
+  image_api.version.minor_id = sizeof(ImageExtTable);
+  image_api.version.step_id = HSA_IMAGE_API_TABLE_STEP_VERSION;
+  // Update private copy of Api table with handle for Image extensions
+  hsa_internal_api_table_.CloneExts(&image_api,
+                                    core::HsaApiTable::HSA_EXT_IMAGE_API_TABLE_ID);
+  ptr = os::GetExportAddress(lib, "Load");
+  if (ptr != NULL) {
+    ((Load_t)ptr)(&core::hsa_internal_api_table_.hsa_api);
+  }
+  return true;
+bool ExtensionEntryPoints::LoadFinalizer(std::string library_name) {
+  os::LibHandle lib = os::LoadLib(library_name);
+  if (lib == NULL) {
+    return false;
+  }
+  libs_.push_back(lib);
+  void* ptr;
+  ptr = os::GetExportAddress(lib, "hsa_ext_program_create_impl");
+  if (ptr != NULL) {
+    assert(finalizer_api.hsa_ext_program_create_fn ==
+               (decltype(::hsa_ext_program_create)*)hsa_ext_null &&
+           "Duplicate load of extension import.");
+    finalizer_api.hsa_ext_program_create_fn = (decltype(::hsa_ext_program_create)*)ptr;
+  }
+  ptr = os::GetExportAddress(lib, "hsa_ext_program_destroy_impl");
+  if (ptr != NULL) {
+    assert(finalizer_api.hsa_ext_program_destroy_fn ==
+               (decltype(::hsa_ext_program_destroy)*)hsa_ext_null &&
            "Duplicate load of extension import.");
-    table.hsa_amd_image_create_fn =
-        (decltype(::hsa_amd_image_create)*)ptr;
+    finalizer_api.hsa_ext_program_destroy_fn =
+        (decltype(::hsa_ext_program_destroy)*)ptr;
-  core::hsa_internal_api_table_.extension_backup=table;
-  core::hsa_internal_api_table_.table.std_exts_=&core::hsa_internal_api_table_.extension_backup;
+  ptr = os::GetExportAddress(lib, "hsa_ext_program_add_module_impl");
+  if (ptr != NULL) {
+    assert(finalizer_api.hsa_ext_program_add_module_fn ==
+               (decltype(::hsa_ext_program_add_module)*)hsa_ext_null &&
+           "Duplicate load of extension import.");
+    finalizer_api.hsa_ext_program_add_module_fn =
+        (decltype(::hsa_ext_program_add_module)*)ptr;
+  }
+  ptr = os::GetExportAddress(lib, "hsa_ext_program_iterate_modules_impl");
+  if (ptr != NULL) {
+    assert(finalizer_api.hsa_ext_program_iterate_modules_fn ==
+               (decltype(::hsa_ext_program_iterate_modules)*)hsa_ext_null &&
+           "Duplicate load of extension import.");
+    finalizer_api.hsa_ext_program_iterate_modules_fn =
+        (decltype(::hsa_ext_program_iterate_modules)*)ptr;
+  }
+  ptr = os::GetExportAddress(lib, "hsa_ext_program_get_info_impl");
+  if (ptr != NULL) {
+    assert(finalizer_api.hsa_ext_program_get_info_fn ==
+               (decltype(::hsa_ext_program_get_info)*)hsa_ext_null &&
+           "Duplicate load of extension import.");
+    finalizer_api.hsa_ext_program_get_info_fn =
+        (decltype(::hsa_ext_program_get_info)*)ptr;
+  }
+  ptr = os::GetExportAddress(lib, "hsa_ext_program_finalize_impl");
+  if (ptr != NULL) {
+    assert(finalizer_api.hsa_ext_program_finalize_fn ==
+               (decltype(::hsa_ext_program_finalize)*)hsa_ext_null &&
+           "Duplicate load of extension import.");
+    finalizer_api.hsa_ext_program_finalize_fn =
+        (decltype(::hsa_ext_program_finalize)*)ptr;
+  }
+  // Initialize Version of Api Table
+  finalizer_api.version.major_id = HSA_FINALIZER_API_TABLE_MAJOR_VERSION;
+  finalizer_api.version.minor_id = sizeof(::FinalizerExtTable);
+  finalizer_api.version.step_id = HSA_FINALIZER_API_TABLE_STEP_VERSION;
+  // Update handle of table of HSA extensions
+  hsa_internal_api_table_.CloneExts(&finalizer_api,
+                                    core::HsaApiTable::HSA_EXT_FINALIZER_API_TABLE_ID);
   ptr = os::GetExportAddress(lib, "Load");
   if (ptr != NULL) {
-    ((Load_t)ptr)(&core::hsa_internal_api_table_.table);
+    ((Load_t)ptr)(&core::hsa_internal_api_table_.hsa_api);
   return true;
 }  // namespace core
@@ -386,19 +468,19 @@ hsa_status_t hsa_ext_program_create(
     hsa_machine_model_t machine_model, hsa_profile_t profile,
     hsa_default_float_rounding_mode_t default_float_rounding_mode,
     const char* options, hsa_ext_program_t* program) {
-  return core::Runtime::runtime_singleton_->extensions_.table
+  return core::Runtime::runtime_singleton_->extensions_.finalizer_api
       .hsa_ext_program_create_fn(machine_model, profile,
                                  default_float_rounding_mode, options, program);
 hsa_status_t hsa_ext_program_destroy(hsa_ext_program_t program) {
-  return core::Runtime::runtime_singleton_->extensions_.table
+  return core::Runtime::runtime_singleton_->extensions_.finalizer_api
 hsa_status_t hsa_ext_program_add_module(hsa_ext_program_t program,
                                         hsa_ext_module_t module) {
-  return core::Runtime::runtime_singleton_->extensions_.table
+  return core::Runtime::runtime_singleton_->extensions_.finalizer_api
       .hsa_ext_program_add_module_fn(program, module);
@@ -407,14 +489,14 @@ hsa_status_t hsa_ext_program_iterate_modules(
     hsa_status_t (*callback)(hsa_ext_program_t program, hsa_ext_module_t module,
                              void* data),
     void* data) {
-  return core::Runtime::runtime_singleton_->extensions_.table
+  return core::Runtime::runtime_singleton_->extensions_.finalizer_api
       .hsa_ext_program_iterate_modules_fn(program, callback, data);
 hsa_status_t hsa_ext_program_get_info(hsa_ext_program_t program,
                                       hsa_ext_program_info_t attribute,
                                       void* value) {
-  return core::Runtime::runtime_singleton_->extensions_.table
+  return core::Runtime::runtime_singleton_->extensions_.finalizer_api
       .hsa_ext_program_get_info_fn(program, attribute, value);
@@ -422,7 +504,7 @@ hsa_status_t hsa_ext_program_finalize(
     hsa_ext_program_t program, hsa_isa_t isa, int32_t call_convention,
     hsa_ext_control_directives_t control_directives, const char* options,
     hsa_code_object_type_t code_object_type, hsa_code_object_t* code_object) {
-  return core::Runtime::runtime_singleton_->extensions_.table
+  return core::Runtime::runtime_singleton_->extensions_.finalizer_api
       .hsa_ext_program_finalize_fn(program, isa, call_convention,
                                    control_directives, options,
                                    code_object_type, code_object);
@@ -431,7 +513,7 @@ hsa_status_t hsa_ext_program_finalize(
 hsa_status_t hsa_ext_image_get_capability(
     hsa_agent_t agent, hsa_ext_image_geometry_t geometry,
     const hsa_ext_image_format_t* image_format, uint32_t* capability_mask) {
-  return core::Runtime::runtime_singleton_->extensions_.table
+  return core::Runtime::runtime_singleton_->extensions_.image_api
       .hsa_ext_image_get_capability_fn(agent, geometry, image_format,
@@ -440,7 +522,7 @@ hsa_status_t hsa_ext_image_data_get_info(
     hsa_agent_t agent, const hsa_ext_image_descriptor_t* image_descriptor,
     hsa_access_permission_t access_permission,
     hsa_ext_image_data_info_t* image_data_info) {
-  return core::Runtime::runtime_singleton_->extensions_.table
+  return core::Runtime::runtime_singleton_->extensions_.image_api
       .hsa_ext_image_data_get_info_fn(agent, image_descriptor,
                                       access_permission, image_data_info);
@@ -449,7 +531,7 @@ hsa_status_t hsa_ext_image_create(
     hsa_agent_t agent, const hsa_ext_image_descriptor_t* image_descriptor,
     const void* image_data, hsa_access_permission_t access_permission,
     hsa_ext_image_t* image) {
-  return core::Runtime::runtime_singleton_->extensions_.table
+  return core::Runtime::runtime_singleton_->extensions_.image_api
       .hsa_ext_image_create_fn(agent, image_descriptor, image_data,
                                access_permission, image);
@@ -458,7 +540,7 @@ hsa_status_t hsa_ext_image_import(hsa_agent_t agent, const void* src_memory,
                                   size_t src_row_pitch, size_t src_slice_pitch,
                                   hsa_ext_image_t dst_image,
                                   const hsa_ext_image_region_t* image_region) {
-  return core::Runtime::runtime_singleton_->extensions_.table
+  return core::Runtime::runtime_singleton_->extensions_.image_api
       .hsa_ext_image_import_fn(agent, src_memory, src_row_pitch,
                                src_slice_pitch, dst_image, image_region);
@@ -467,7 +549,7 @@ hsa_status_t hsa_ext_image_export(hsa_agent_t agent, hsa_ext_image_t src_image,
                                   void* dst_memory, size_t dst_row_pitch,
                                   size_t dst_slice_pitch,
                                   const hsa_ext_image_region_t* image_region) {
-  return core::Runtime::runtime_singleton_->extensions_.table
+  return core::Runtime::runtime_singleton_->extensions_.image_api
       .hsa_ext_image_export_fn(agent, src_image, dst_memory, dst_row_pitch,
                                dst_slice_pitch, image_region);
@@ -477,7 +559,7 @@ hsa_status_t hsa_ext_image_copy(hsa_agent_t agent, hsa_ext_image_t src_image,
                                 hsa_ext_image_t dst_image,
                                 const hsa_dim3_t* dst_offset,
                                 const hsa_dim3_t* range) {
-  return core::Runtime::runtime_singleton_->extensions_.table
+  return core::Runtime::runtime_singleton_->extensions_.image_api
       .hsa_ext_image_copy_fn(agent, src_image, src_offset, dst_image,
                              dst_offset, range);
@@ -485,25 +567,25 @@ hsa_status_t hsa_ext_image_copy(hsa_agent_t agent, hsa_ext_image_t src_image,
 hsa_status_t hsa_ext_image_clear(hsa_agent_t agent, hsa_ext_image_t image,
                                  const void* data,
                                  const hsa_ext_image_region_t* image_region) {
-  return core::Runtime::runtime_singleton_->extensions_.table
+  return core::Runtime::runtime_singleton_->extensions_.image_api
       .hsa_ext_image_clear_fn(agent, image, data, image_region);
 hsa_status_t hsa_ext_image_destroy(hsa_agent_t agent, hsa_ext_image_t image) {
-  return core::Runtime::runtime_singleton_->extensions_.table
+  return core::Runtime::runtime_singleton_->extensions_.image_api
       .hsa_ext_image_destroy_fn(agent, image);
 hsa_status_t hsa_ext_sampler_create(
     hsa_agent_t agent, const hsa_ext_sampler_descriptor_t* sampler_descriptor,
     hsa_ext_sampler_t* sampler) {
-  return core::Runtime::runtime_singleton_->extensions_.table
+  return core::Runtime::runtime_singleton_->extensions_.image_api
       .hsa_ext_sampler_create_fn(agent, sampler_descriptor, sampler);
 hsa_status_t hsa_ext_sampler_destroy(hsa_agent_t agent,
                                      hsa_ext_sampler_t sampler) {
-  return core::Runtime::runtime_singleton_->extensions_.table
+  return core::Runtime::runtime_singleton_->extensions_.image_api
       .hsa_ext_sampler_destroy_fn(agent, sampler);
@@ -511,20 +593,12 @@ hsa_status_t hsa_ext_sampler_destroy(hsa_agent_t agent,
 //  Stubs for internal extension functions
+// Use the function pointer from local instance Image Extension
 hsa_status_t hsa_amd_image_get_info_max_dim(hsa_agent_t component,
                                             hsa_agent_info_t attribute,
                                             void* value) {
-  return core::Runtime::runtime_singleton_->extensions_.table
+  return core::Runtime::runtime_singleton_->extensions_.image_api
       .hsa_amd_image_get_info_max_dim_fn(component, attribute, value);
-hsa_status_t hsa_amd_image_create(
-  hsa_agent_t agent,
-  const hsa_ext_image_descriptor_t *image_descriptor,
-  const hsa_amd_image_descriptor_t *image_layout,
-  const void *image_data,
-  hsa_access_permission_t access_permission,
-  hsa_ext_image_t *image) {
-    return core::Runtime::runtime_singleton_->extensions_.table
-      .hsa_amd_image_create_fn(agent, image_descriptor, image_layout, image_data, access_permission, image);
diff --git a/src/core/runtime/hsa_ven_amd_loaded_code_object.cpp b/src/core/runtime/hsa_ven_amd_loader.cpp
similarity index 83%
rename from src/core/runtime/hsa_ven_amd_loaded_code_object.cpp
rename to src/core/runtime/hsa_ven_amd_loader.cpp
index 958e3051e..ba951053e 100644
--- a/src/core/runtime/hsa_ven_amd_loaded_code_object.cpp
+++ b/src/core/runtime/hsa_ven_amd_loader.cpp
@@ -40,14 +40,14 @@
-#include "hsa_ven_amd_loaded_code_object.h"
+#include "hsa_ven_amd_loader.h"
 #include "core/inc/amd_hsa_loader.hpp"
 #include "core/inc/runtime.h"
 using namespace core;
-hsa_status_t hsa_ven_amd_loaded_code_object_query_host_address(
+hsa_status_t HSA_API hsa_ven_amd_loader_query_host_address(
   const void *device_address,
   const void **host_address) {
   if (false == core::Runtime::runtime_singleton_->IsOpen()) {
@@ -69,3 +69,14 @@ hsa_status_t hsa_ven_amd_loaded_code_object_query_host_address(
   *host_address = reinterpret_cast<void*>(uhaddr);
+hsa_status_t HSA_API hsa_ven_amd_loader_query_segment_descriptors(
+  hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+  size_t *num_segment_descriptors) {
+  if (false == core::Runtime::runtime_singleton_->IsOpen()) {
+  }
+  // Arguments are checked by the loader.
+  return Runtime::runtime_singleton_->loader()->QuerySegmentDescriptors(segment_descriptors, num_segment_descriptors);
diff --git a/src/core/runtime/interrupt_signal.cpp b/src/core/runtime/interrupt_signal.cpp
index 67c95867d..eb07bcc53 100644
--- a/src/core/runtime/interrupt_signal.cpp
+++ b/src/core/runtime/interrupt_signal.cpp
@@ -87,8 +87,6 @@ InterruptSignal::InterruptSignal(hsa_signal_value_t initial_value,
     signal_.event_mailbox_ptr = 0;
   signal_.kind = AMD_SIGNAL_KIND_USER;
-  wait_on_event_ = true;
 InterruptSignal::~InterruptSignal() {
@@ -110,13 +108,11 @@ hsa_signal_value_t InterruptSignal::LoadAcquire() {
 void InterruptSignal::StoreRelaxed(hsa_signal_value_t value) {
-  wait_on_event_ = true;
   atomic::Store(&signal_.value, int64_t(value), std::memory_order_relaxed);
 void InterruptSignal::StoreRelease(hsa_signal_value_t value) {
-  wait_on_event_ = true;
   atomic::Store(&signal_.value, int64_t(value), std::memory_order_release);
@@ -181,7 +177,7 @@ hsa_signal_value_t InterruptSignal::WaitRelaxed(
         value = atomic::Load(&signal_.value, std::memory_order_relaxed);
         return hsa_signal_value_t(value);
-      if (wait_on_event_ && wait_hint != HSA_WAIT_STATE_ACTIVE) {
+      if (wait_hint != HSA_WAIT_STATE_ACTIVE) {
         uint32_t wait_ms;
         auto time_remaining = fast_timeout - (time - start_time);
         if ((timeout == -1) ||
diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp
index 8449b8e7f..a93d75c99 100644
--- a/src/core/runtime/runtime.cpp
+++ b/src/core/runtime/runtime.cpp
@@ -58,6 +58,7 @@
 #include "core/inc/amd_topology.h"
 #include "core/inc/signal.h"
 #include "core/inc/interrupt_signal.h"
+#include "core/inc/hsa_ext_amd_impl.h"
 #include "core/inc/hsa_api_trace_int.h"
@@ -187,7 +188,6 @@ void Runtime::RegisterAgent(Agent* agent) {
       HsaClockCounters clocks;
       hsaKmtGetClockCounters(0, &clocks);
       sys_clock_freq_ = clocks.SystemClockFrequencyHz;
-      host_agent_ = agent;
   } else if (agent->device_type() == Agent::DeviceType::kAmdGpuDevice) {
@@ -261,6 +261,10 @@ void Runtime::RegisterLinkInfo(uint32_t node_id_from, uint32_t node_id_to,
   const uint32_t idx = GetIndexLinkInfo(node_id_from, node_id_to);
   link_matrix_[idx].num_hop = num_hop;
   link_matrix_[idx].info = link_info;
+  // Limit the number of hops to 1 since the runtime does not have enough
+  // information to share with the user about each hop.
+  link_matrix_[idx].num_hop = std::min(link_matrix_[idx].num_hop , 1U);
 const Runtime::LinkInfo Runtime::GetLinkInfo(uint32_t node_id_from,
@@ -420,19 +424,38 @@ hsa_status_t Runtime::CopyMemory(void* dst, core::Agent& dst_agent,
   // For cpu to cpu, fire and forget a copy thread.
-  std::thread([](void* dst, const void* src, size_t size,
-                 std::vector<core::Signal*> dep_signals,
-                 core::Signal* completion_signal) {
-                for (core::Signal* dep : dep_signals) {
-                  dep->WaitRelaxed(HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
-                                   HSA_WAIT_STATE_BLOCKED);
-                }
+  const bool profiling_enabled =
+      (dst_agent.profiling_enabled() || src_agent.profiling_enabled());
+  std::thread(
+      [](void* dst, const void* src, size_t size,
+         std::vector<core::Signal*> dep_signals,
+         core::Signal* completion_signal, bool profiling_enabled) {
+        for (core::Signal* dep : dep_signals) {
+          dep->WaitRelaxed(HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
+                           HSA_WAIT_STATE_BLOCKED);
+        }
+        if (profiling_enabled) {
+          HsaClockCounters clocks = {0};
+          core::Runtime::runtime_singleton_->GetSystemInfo(
+              HSA_SYSTEM_INFO_TIMESTAMP, reinterpret_cast<void*>(&clocks));
+          completion_signal->signal_.start_ts = clocks.SystemClockCounter;
+        }
+        memcpy(dst, src, size);
-                memcpy(dst, src, size);
+        if (profiling_enabled) {
+          HsaClockCounters clocks = {0};
+          core::Runtime::runtime_singleton_->GetSystemInfo(
+              HSA_SYSTEM_INFO_TIMESTAMP, reinterpret_cast<void*>(&clocks));
+          completion_signal->signal_.end_ts = clocks.SystemClockCounter;
+        }
-                completion_signal->SubRelease(1);
-              },
-              dst, src, size, dep_signals, &completion_signal).detach();
+        completion_signal->SubRelease(1);
+      },
+      dst, src, size, dep_signals, &completion_signal,
+      profiling_enabled).detach();
@@ -505,11 +528,11 @@ hsa_status_t Runtime::GetSystemInfo(hsa_system_info_t attribute, void* value) {
       memset(value, 0, sizeof(uint8_t) * 128);
-      if (extensions_.table.hsa_ext_program_finalize_fn != NULL) {
+      if (hsa_internal_api_table_.finalizer_api.hsa_ext_program_finalize_fn != NULL) {
         *((uint8_t*)value) = 1 << HSA_EXTENSION_FINALIZER;
-      if (extensions_.table.hsa_ext_image_create_fn != NULL) {
+      if (hsa_internal_api_table_.image_api.hsa_ext_image_create_fn != NULL) {
         *((uint8_t*)value) |= 1 << HSA_EXTENSION_IMAGES;
@@ -629,7 +652,7 @@ void Runtime::AsyncEventsLoop(void*) {
   while (!async_events_control_.exit) {
     // Wait for a signal
     hsa_signal_value_t value;
-    uint32_t index = hsa_amd_signal_wait_any(
+    uint32_t index = AMD::hsa_amd_signal_wait_any(
         uint32_t(async_events_.Size()), &async_events_.signal_[0],
         &async_events_.cond_[0], &async_events_.value_[0], uint64_t(-1),
         HSA_WAIT_STATE_BLOCKED, &value);
@@ -767,8 +790,7 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) {
-    : host_agent_(NULL),
-      blit_agent_(NULL),
+    : blit_agent_(NULL),
@@ -798,6 +820,14 @@ void Runtime::Load() {
   // Load tools libraries
+  // Initialize the blit kernel objects after the tools are initialized so that
+  // tools can overload the blit kernel.
+  for (core::Agent* agent : gpu_agents_) {
+    const hsa_status_t stat =
+        reinterpret_cast<amd::GpuAgentInt*>(agent)->InitBlitKernel();
+    assert(HSA_STATUS_SUCCESS == stat);
+  }
 void Runtime::Unload() {
@@ -832,8 +862,12 @@ void Runtime::LoadExtensions() {
   static const std::string kImageLib[] = {"hsa-ext-image.dll",
-  extensions_.Load(kFinalizerLib[os_index(os::current_os)]);
-  extensions_.Load(kImageLib[os_index(os::current_os)]);
+  // Update Hsa Api Table with handle of Finalizer extension Apis
+  extensions_.LoadFinalizer(kFinalizerLib[os_index(os::current_os)]);
+  // Update Hsa Api Table with handle of Image extension Apis
+  extensions_.LoadImage(kImageLib[os_index(os::current_os)]);
 void Runtime::UnloadExtensions() { extensions_.Unload(); }
@@ -889,13 +923,16 @@ static std::vector<std::string> parse_tool_names(std::string tool_names) {
 void Runtime::LoadTools() {
-  typedef bool (*tool_init_t)(::ApiTable*, uint64_t, uint64_t,
+  typedef bool (*tool_init_t)(::HsaApiTable*, uint64_t, uint64_t,
                               const char* const*);
   typedef Agent* (*tool_wrap_t)(Agent*);
   typedef void (*tool_add_t)(Runtime*);
-  // Link extensions to API interception
-  hsa_api_table_.LinkExts(&extensions_.table);
+  // Link HSA Extensions for Finalizer and Images for Api interception
+  hsa_api_table_.LinkExts(&extensions_.finalizer_api,
+                          core::HsaApiTable::HSA_EXT_FINALIZER_API_TABLE_ID);
+  hsa_api_table_.LinkExts(&extensions_.image_api,
+                          core::HsaApiTable::HSA_EXT_IMAGE_API_TABLE_ID);
   // Load tool libs
   std::string tool_names = flag_.tools_lib_names();
@@ -911,7 +948,9 @@ void Runtime::LoadTools() {
         tool_init_t ld;
         ld = (tool_init_t)os::GetExportAddress(tool, "OnLoad");
         if (ld) {
-          if (!ld(&hsa_api_table_.table, 0, failed.size(), &failed[0])) {
+          if (!ld(&hsa_api_table_.hsa_api,
+                  hsa_api_table_.hsa_api.version.major_id,
+                  failed.size(), &failed[0])) {
diff --git a/src/core/util/win/os_win.cpp b/src/core/util/win/os_win.cpp
new file mode 100644
index 000000000..d97bff0ce
--- /dev/null
+++ b/src/core/util/win/os_win.cpp
@@ -0,0 +1,227 @@
+// AMD is granting you permission to use this software and documentation(if any)
+// (collectively, the "Materials") pursuant to the terms and conditions of the
+// Software License Agreement included with the Materials.If you do not have a
+// copy of the Software License Agreement, contact your AMD representative for a
+// copy.
+// THE SOFTWARE IS ASSUMED BY YOU.Some jurisdictions do not allow the exclusion
+// of implied warranties, so the above exclusion may not apply to You.
+// liability to You for all damages, losses, and causes of action (whether in
+// contract, tort (including negligence) or otherwise) exceed the amount of $100
+// USD.  You agree to defend, indemnify and hold harmless AMD and its licensors,
+// and any of their directors, officers, employees, affiliates or agents from
+// and against any and all loss, damage, liability and other expenses (including
+// reasonable attorneys' fees), resulting from Your use of the Software or
+// violation of the terms and conditions of this Agreement.
+// U.S.GOVERNMENT RESTRICTED RIGHTS : The Materials are provided with
+// "RESTRICTED RIGHTS." Use, duplication, or disclosure by the Government is
+// subject to the restrictions as set forth in FAR 52.227 - 14 and DFAR252.227 -
+// 7013, et seq., or its successor.Use of the Materials by the Government
+// constitutes acknowledgement of AMD's proprietary rights in them.
+// EXPORT RESTRICTIONS: The Materials may be subject to export restrictions as
+//                      stated in the Software License Agreement.
+#ifdef _WIN32  // Are we compiling for windows?
+#define NOMINMAX
+#include "core/util/os.h"
+#include <algorithm>
+#include <process.h>
+#include <string>
+#include <windows.h>
+#include <emmintrin.h>
+#include <pmmintrin.h>
+#include <xmmintrin.h>
+#undef Yield
+#undef CreateMutex
+namespace os {
+static_assert(sizeof(LibHandle) == sizeof(HMODULE),
+              "OS abstraction size mismatch");
+static_assert(sizeof(LibHandle) == sizeof(::HANDLE),
+              "OS abstraction size mismatch");
+static_assert(sizeof(Mutex) == sizeof(::HANDLE),
+              "OS abstraction size mismatch");
+static_assert(sizeof(Thread) == sizeof(::HANDLE),
+              "OS abstraction size mismatch");
+static_assert(sizeof(EventHandle) == sizeof(::HANDLE),
+              "OS abstraction size mismatch");
+LibHandle LoadLib(std::string filename) {
+  HMODULE ret = LoadLibrary(filename.c_str());
+  return *(LibHandle*)&ret;
+void* GetExportAddress(LibHandle lib, std::string export_name) {
+  return GetProcAddress(*(HMODULE*)&lib, export_name.c_str());
+void CloseLib(LibHandle lib) { FreeLibrary(*(::HMODULE*)&lib); }
+Mutex CreateMutex() { return CreateEvent(NULL, false, true, NULL); }
+bool TryAcquireMutex(Mutex lock) {
+  return WaitForSingleObject(*(::HANDLE*)&lock, 0) == WAIT_OBJECT_0;
+bool AcquireMutex(Mutex lock) {
+  return WaitForSingleObject(*(::HANDLE*)&lock, INFINITE) == WAIT_OBJECT_0;
+void ReleaseMutex(Mutex lock) { SetEvent(*(::HANDLE*)&lock); }
+void DestroyMutex(Mutex lock) { CloseHandle(*(::HANDLE*)&lock); }
+void Sleep(int delay_in_millisecond) { ::Sleep(delay_in_millisecond); }
+void YieldThread() { ::Sleep(0); }
+struct ThreadArgs {
+  void* entry_args;
+  ThreadEntry entry_function;
+unsigned __stdcall ThreadTrampoline(void* arg) {
+  ThreadArgs* thread_args = (ThreadArgs*)arg;
+  ThreadEntry entry = thread_args->entry_function;
+  void* data = thread_args->entry_args;
+  delete thread_args;
+  entry(data);
+  _endthreadex(0);
+  return 0;
+// Starts a new OS thread that runs entry_function(entry_argument).
+//   entry_function - function executed by the new thread
+//   entry_argument - opaque pointer handed to entry_function
+//   stack_size     - stack size forwarded to _beginthreadex
+// Returns the value produced by _beginthreadex reinterpreted as a Thread;
+// this is zero when the thread could not be created.
+Thread CreateThread(ThreadEntry entry_function, void* entry_argument,
+                    uint stack_size) {
+  // The argument block is heap allocated because it must outlive this call;
+  // ThreadTrampoline takes ownership of it and deletes it.
+  ThreadArgs* thread_args = new ThreadArgs();
+  thread_args->entry_args = entry_argument;
+  thread_args->entry_function = entry_function;
+  uintptr_t ret =
+      _beginthreadex(NULL, stack_size, ThreadTrampoline, thread_args, 0, NULL);
+  // _beginthreadex returns 0 on failure. The trampoline never runs in that
+  // case, so the argument block has to be released here to avoid a leak.
+  if (ret == 0) {
+    delete thread_args;
+  }
+  return *(Thread*)&ret;
+void CloseThread(Thread thread) { CloseHandle(*(::HANDLE*)&thread); }
+bool WaitForThread(Thread thread) {
+  return WaitForSingleObject(*(::HANDLE*)&thread, INFINITE) == WAIT_OBJECT_0;
+bool WaitForAllThreads(Thread* threads, uint thread_count) {
+  return WaitForMultipleObjects(thread_count, threads, TRUE, INFINITE) ==
+         WAIT_OBJECT_0;
+void SetEnvVar(std::string env_var_name, std::string env_var_value) {
+  SetEnvironmentVariable(env_var_name.c_str(), env_var_value.c_str());
+std::string GetEnvVar(std::string env_var_name) {
+  char* buff;
+  DWORD char_count = GetEnvironmentVariable(env_var_name.c_str(), NULL, 0);
+  if (char_count == 0) return "";
+  buff = (char*)alloca(sizeof(char) * char_count);
+  GetEnvironmentVariable(env_var_name.c_str(), buff, char_count);
+  buff[char_count - 1] = '\0';
+  std::string ret = buff;
+  return ret;
+size_t GetUserModeVirtualMemorySize() {
+  SYSTEM_INFO system_info = {0};
+  GetSystemInfo(&system_info);
+  return ((size_t)system_info.lpMaximumApplicationAddress + 1);
+size_t GetUsablePhysicalHostMemorySize() {
+  MEMORYSTATUSEX memory_status = {0};
+  memory_status.dwLength = sizeof(memory_status);
+  if (GlobalMemoryStatusEx(&memory_status) == 0) {
+    return 0;
+  }
+  const size_t physical_size = static_cast<size_t>(memory_status.ullTotalPhys);
+  return std::min(GetUserModeVirtualMemorySize(), physical_size);
+uintptr_t GetUserModeVirtualMemoryBase() { return (uintptr_t)0; }
+// Os event wrappers
+EventHandle CreateOsEvent(bool auto_reset, bool init_state) {
+  EventHandle evt = reinterpret_cast<EventHandle>(
+      CreateEvent(NULL, (BOOL)(!auto_reset), (BOOL)init_state, NULL));
+  return evt;
+int DestroyOsEvent(EventHandle event) {
+  if (event == NULL) {
+    return -1;
+  }
+  return CloseHandle(reinterpret_cast<::HANDLE>(event));
+int WaitForOsEvent(EventHandle event, unsigned int milli_seconds) {
+  if (event == NULL) {
+    return -1;
+  }
+  int ret_code =
+      WaitForSingleObject(reinterpret_cast<::HANDLE>(event), milli_seconds);
+  if (ret_code == WAIT_TIMEOUT) {
+    ret_code = 0x14003;  // 0x14003 indicates timeout
+  }
+  return ret_code;
+int SetOsEvent(EventHandle event) {
+  if (event == NULL) {
+    return -1;
+  }
+  return SetEvent(reinterpret_cast<::HANDLE>(event));
+int ResetOsEvent(EventHandle event) {
+  if (event == NULL) {
+    return -1;
+  }
+  return ResetEvent(reinterpret_cast<::HANDLE>(event));
+uint64_t ReadAccurateClock() {
+  uint64_t ret;
+  QueryPerformanceCounter((LARGE_INTEGER*)&ret);
+  return ret;
+uint64_t AccurateClockFrequency() {
+  uint64_t ret;
+  QueryPerformanceFrequency((LARGE_INTEGER*)&ret);
+  return ret;
diff --git a/src/inc/hsa.h b/src/inc/hsa.h
index f80768dbf..6ab97c394 100644
--- a/src/inc/hsa.h
+++ b/src/inc/hsa.h
@@ -462,9 +462,13 @@ typedef enum {
-   * Loaded code object extension.
+   * Loader extension.
+  /**
+   * Extension count.
+   */
 } hsa_extension_t;
diff --git a/src/inc/hsa_api_trace.h b/src/inc/hsa_api_trace.h
index ee7e63b9e..40d443de1 100644
--- a/src/inc/hsa_api_trace.h
+++ b/src/inc/hsa_api_trace.h
@@ -54,13 +54,51 @@
 #include "inc/hsa_ext_finalize.h"
-struct ExtTable {
+#include <string.h>
+#include <assert.h>
+#include <stddef.h>
+// Major Ids of the Api tables exported by Hsa Core Runtime
+#define HSA_API_TABLE_MAJOR_VERSION               0x01
+#define HSA_CORE_API_TABLE_MAJOR_VERSION          0x01
+// Step Ids of the Api tables exported by Hsa Core Runtime
+#define HSA_API_TABLE_STEP_VERSION                0x00
+#define HSA_CORE_API_TABLE_STEP_VERSION           0x00
+#define HSA_IMAGE_API_TABLE_STEP_VERSION          0x00
+// Min function used to copy Api Tables
+static inline uint32_t Min(const uint32_t a, const uint32_t b) {
+  return (a > b) ? b : a;
+// Structure of Version used to identify an instance of Api table
+struct ApiTableVersion {
+  uint32_t major_id;
+  uint32_t minor_id;
+  uint32_t step_id;
+  uint32_t reserved;
+// Table to export HSA Finalizer Extension Apis 
+struct FinalizerExtTable {
+  ApiTableVersion version;
 	decltype(hsa_ext_program_create)* hsa_ext_program_create_fn;
 	decltype(hsa_ext_program_destroy)* hsa_ext_program_destroy_fn;
 	decltype(hsa_ext_program_add_module)* hsa_ext_program_add_module_fn;
 	decltype(hsa_ext_program_iterate_modules)* hsa_ext_program_iterate_modules_fn;
 	decltype(hsa_ext_program_get_info)* hsa_ext_program_get_info_fn;
 	decltype(hsa_ext_program_finalize)* hsa_ext_program_finalize_fn;
+// Table to export HSA Image Extension Apis
+struct ImageExtTable {
+  ApiTableVersion version;
 	decltype(hsa_ext_image_get_capability)* hsa_ext_image_get_capability_fn;
 	decltype(hsa_ext_image_data_get_info)* hsa_ext_image_data_get_info_fn;
 	decltype(hsa_ext_image_create)* hsa_ext_image_create_fn;
@@ -73,7 +111,40 @@ struct ExtTable {
 	decltype(hsa_ext_sampler_destroy)* hsa_ext_sampler_destroy_fn;
-struct ApiTable {
+// Table to export AMD Extension Apis
+struct AmdExtTable {
+  ApiTableVersion version;
+	decltype(hsa_amd_coherency_get_type)* hsa_amd_coherency_get_type_fn;
+	decltype(hsa_amd_coherency_set_type)* hsa_amd_coherency_set_type_fn;
+  decltype(hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled_fn;
+  decltype(hsa_amd_profiling_async_copy_enable) *hsa_amd_profiling_async_copy_enable_fn;
+  decltype(hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time_fn;
+  decltype(hsa_amd_profiling_get_async_copy_time) *hsa_amd_profiling_get_async_copy_time_fn;
+  decltype(hsa_amd_profiling_convert_tick_to_system_domain)* hsa_amd_profiling_convert_tick_to_system_domain_fn;
+  decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler_fn;
+  decltype(hsa_amd_async_function)* hsa_amd_async_function_fn;
+  decltype(hsa_amd_signal_wait_any)* hsa_amd_signal_wait_any_fn;
+  decltype(hsa_amd_queue_cu_set_mask)* hsa_amd_queue_cu_set_mask_fn;
+  decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info_fn;
+  decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools_fn;
+  decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate_fn;
+  decltype(hsa_amd_memory_pool_free)* hsa_amd_memory_pool_free_fn;
+  decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy_fn;
+  decltype(hsa_amd_agent_memory_pool_get_info)* hsa_amd_agent_memory_pool_get_info_fn;
+  decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access_fn;
+  decltype(hsa_amd_memory_pool_can_migrate)* hsa_amd_memory_pool_can_migrate_fn;
+  decltype(hsa_amd_memory_migrate)* hsa_amd_memory_migrate_fn;
+  decltype(hsa_amd_memory_lock)* hsa_amd_memory_lock_fn;
+  decltype(hsa_amd_memory_unlock)* hsa_amd_memory_unlock_fn;
+  decltype(hsa_amd_memory_fill)* hsa_amd_memory_fill_fn;
+  decltype(hsa_amd_interop_map_buffer)* hsa_amd_interop_map_buffer_fn;
+  decltype(hsa_amd_interop_unmap_buffer)* hsa_amd_interop_unmap_buffer_fn;
+  decltype(::hsa_amd_image_create)* hsa_amd_image_create_fn;
+// Table to export HSA Core Runtime Apis
+struct CoreApiTable {
+  ApiTableVersion version;
 	decltype(hsa_init)* hsa_init_fn;
 	decltype(hsa_shut_down)* hsa_shut_down_fn;
 	decltype(hsa_system_get_info)* hsa_system_get_info_fn;
@@ -170,8 +241,126 @@ struct ApiTable {
 	decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info_fn;
 	decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols_fn;
 	decltype(hsa_status_string)* hsa_status_string_fn;
+// Table to export HSA Apis from Core Runtime, Amd Extensions
+// Finalizer and Images
+struct HsaApiTable {
+  // Version of Hsa Api Table
+  ApiTableVersion version;
+  // Table of function pointers to HSA Core Runtime
+	CoreApiTable* core_;
+  // Table of function pointers to AMD extensions
+	AmdExtTable* amd_ext_;
+  // Table of function pointers to HSA Finalizer Extension
+	FinalizerExtTable* finalizer_ext_;
+  // Table of function pointers to HSA Image Extension
+	ImageExtTable* image_ext_;
-	ExtTable* std_exts_;
+// Structure containing instances of different api tables
+struct HsaApiTableContainer {
+  HsaApiTable root;
+	CoreApiTable core;
+	AmdExtTable amd_ext;
+	FinalizerExtTable finalizer_ext;
+	ImageExtTable image_ext;
+  // Default initialization of a container instance
+  HsaApiTableContainer() {
+    root.version.major_id = HSA_API_TABLE_MAJOR_VERSION;
+    root.version.minor_id = sizeof(HsaApiTable);
+    root.version.step_id = HSA_API_TABLE_STEP_VERSION;
+    core.version.major_id = HSA_CORE_API_TABLE_MAJOR_VERSION;
+    core.version.minor_id = sizeof(CoreApiTable);
+    core.version.step_id = HSA_CORE_API_TABLE_STEP_VERSION;
+    root.core_ = &core;
+    amd_ext.version.major_id = HSA_AMD_EXT_API_TABLE_MAJOR_VERSION;
+    amd_ext.version.minor_id = sizeof(AmdExtTable);
+    amd_ext.version.step_id = HSA_AMD_EXT_API_TABLE_STEP_VERSION;
+    root.amd_ext_ = &amd_ext;
+    finalizer_ext.version.major_id = HSA_FINALIZER_API_TABLE_MAJOR_VERSION;
+    finalizer_ext.version.minor_id = sizeof(FinalizerExtTable);
+    finalizer_ext.version.step_id = HSA_FINALIZER_API_TABLE_STEP_VERSION;
+    root.finalizer_ext_ = & finalizer_ext;
+    image_ext.version.major_id = HSA_IMAGE_API_TABLE_MAJOR_VERSION;
+    image_ext.version.minor_id = sizeof(ImageExtTable);
+    image_ext.version.step_id = HSA_IMAGE_API_TABLE_STEP_VERSION;
+    root.image_ext_ = &image_ext;
+  }
+// Copies the function pointers of one Api table into another. The
+// ApiTableVersion header at the start of the destination is left untouched.
+//   dest - table that receives the function pointers
+//   src  - table the function pointers are read from
+//   size - number of bytes of the table to copy, counted from the start of
+//          the table and therefore including the ApiTableVersion header
+void inline copyApi(void* dest, void* src, size_t size) {
+  // Nothing to copy when the size does not reach past the version header;
+  // this also keeps the unsigned subtraction below from wrapping around.
+  if (size <= sizeof(ApiTableVersion)) return;
+  memcpy((char*)dest + sizeof(ApiTableVersion),
+         (char*)src + sizeof(ApiTableVersion),
+         (size - sizeof(ApiTableVersion)));
+// Copies the Api tables referenced by src into the container dest. The
+// function assumes the user has initialized the container correctly for
+// the Major, Minor and Stepping Ids of the Root and Child Api tables.
+// The Major Ids of source and destination must match (asserted). For each
+// table the Minor Id, which encodes the struct size, is overwritten with
+// the minimum of the source and destination values so that no more bytes
+// are copied than either side provides. The Stepping Id is overwritten
+// with the value from the source.
+static const
+void inline copyTables(const HsaApiTable* src, HsaApiTableContainer* dest) {
+  // Verify Major Id of source and destination tables are valid
+  assert(dest->root.version.major_id == src->version.major_id);
+  assert(dest->core.version.major_id == src->core_->version.major_id);
+  assert(dest->amd_ext.version.major_id == src->amd_ext_->version.major_id);
+  assert(dest->finalizer_ext.version.major_id == src->finalizer_ext_->version.major_id);
+  assert(dest->image_ext.version.major_id == src->image_ext_->version.major_id);
+  // Initialize the stepping id and minor id of root table. For the
+  // minor id which encodes struct size, take the minimum of source
+  // and destination parameters
+  dest->root.version.step_id = src->version.step_id;
+  dest->root.version.minor_id = Min(dest->root.version.minor_id, src->version.minor_id);
+  // Copy the Core Api table. A child table is only copied when the
+  // common root size reaches past the offset of its pointer member.
+  size_t size = dest->root.version.minor_id;
+  if (size > offsetof(HsaApiTable, core_)) {
+    dest->core.version.step_id = src->core_->version.step_id;
+    dest->core.version.minor_id = Min(dest->core.version.minor_id,
+                                      src->core_->version.minor_id);
+    copyApi(&dest->core, src->core_, dest->core.version.minor_id);
+  }
+  // Copy the Amd Ext Api table. The size is clamped against the Amd Ext
+  // table of dest, not the Core table, so that a larger source table
+  // cannot be copied past the end of dest->amd_ext.
+  if (size > offsetof(HsaApiTable, amd_ext_)) {
+    dest->amd_ext.version.step_id = src->amd_ext_->version.step_id;
+    dest->amd_ext.version.minor_id = Min(dest->amd_ext.version.minor_id,
+                                         src->amd_ext_->version.minor_id);
+    copyApi(&dest->amd_ext, src->amd_ext_, dest->amd_ext.version.minor_id);
+  }
+  // Copy the Finalizer Ext Api table, clamped against its own size in dest
+  if (size > offsetof(HsaApiTable, finalizer_ext_)) {
+    dest->finalizer_ext.version.step_id = src->finalizer_ext_->version.step_id;
+    dest->finalizer_ext.version.minor_id = Min(dest->finalizer_ext.version.minor_id,
+                                               src->finalizer_ext_->version.minor_id);
+    copyApi(&dest->finalizer_ext, src->finalizer_ext_, dest->finalizer_ext.version.minor_id);
+  }
+  // Copy the Image Ext Api table, clamped against its own size in dest
+  if (size > offsetof(HsaApiTable, image_ext_)) {
+    dest->image_ext.version.step_id = src->image_ext_->version.step_id;
+    dest->image_ext.version.minor_id = Min(dest->image_ext.version.minor_id,
+                                           src->image_ext_->version.minor_id);
+    copyApi(&dest->image_ext, src->image_ext_, dest->image_ext.version.minor_id);
+  }
diff --git a/src/inc/hsa_ext_amd.h b/src/inc/hsa_ext_amd.h
index 7a4ed5727..4cbc82e7e 100644
--- a/src/inc/hsa_ext_amd.h
+++ b/src/inc/hsa_ext_amd.h
@@ -211,6 +211,23 @@ typedef struct hsa_amd_profiling_dispatch_time_s {
   uint64_t end;
 } hsa_amd_profiling_dispatch_time_t;
+ * @brief Structure containing profiling async copy time information.
+ *
+ * Times are reported as ticks in the domain of the HSA system clock.
+ * The HSA system clock tick and frequency is obtained via hsa_system_get_info.
+ */
+typedef struct hsa_amd_profiling_async_copy_time_s {
+  /**
+   * Async copy processing start time.
+   */
+  uint64_t start;
+  /**
+   * Async copy completion time.
+   */
+  uint64_t end;
+} hsa_amd_profiling_async_copy_time_t;
  * @brief Enable or disable profiling capability of a queue.
@@ -230,10 +247,34 @@ typedef struct hsa_amd_profiling_dispatch_time_s {
 hsa_status_t HSA_API
     hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable);
+ * @brief Enable or disable asynchronous memory copy profiling.
+ *
+ * @details The runtime will provide the copy processing start timestamp and
+ * completion timestamp of each call to hsa_amd_memory_async_copy if the
+ * async copy profiling is enabled prior to the call to
+ * hsa_amd_memory_async_copy. The completion signal object is used to
+ * hold the last async copy start and end timestamp. The client can retrieve
+ * these timestamps via call to hsa_amd_profiling_get_async_copy_time.
+ *
+ * @param[in] enable True to enable profiling. False to disable profiling.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed on allocating resources
+ * needed to profile the asynchronous copy.
+ */
+hsa_status_t HSA_API
+    hsa_amd_profiling_async_copy_enable(bool enable);
  * @brief Retrieve packet processing time stamps.
- * @param[in] agent The agent with which the signal was last used.  For instance,
+ * @param[in] agent The agent with which the signal was last used.  For
+ * instance,
  * if the profiled dispatch packet is dispatched on to queue Q, which was
  * created on agent A, then this parameter must be A.
@@ -261,9 +302,33 @@ hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time(
     hsa_agent_t agent, hsa_signal_t signal,
     hsa_amd_profiling_dispatch_time_t* time);
+ * @brief Retrieve asynchronous copy timestamps.
+ *
+ * @details Async copy profiling is enabled via call to
+ * hsa_amd_profiling_async_copy_enable.
+ *
+ * @param[in] signal A signal used as the completion signal of the call to
+ * hsa_amd_memory_async_copy.
+ *
+ * @param[out] time Async copy processing timestamps in the HSA system clock
+ * domain.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL The signal is invalid.
+ *
+ */
+hsa_status_t HSA_API hsa_amd_profiling_get_async_copy_time(
+    hsa_signal_t signal, hsa_amd_profiling_async_copy_time_t* time);
  * @brief Computes the frequency ratio and offset between the agent clock and
- * HSA system clock and converts the agent’s tick to HSA system domain tick.
+ * HSA system clock and converts the agent's tick to HSA system domain tick.
  * @param[in] agent The agent used to retrieve the agent_tick. It is user's
  * responsibility to make sure the tick number is from this agent, otherwise,
@@ -392,7 +457,7 @@ hsa_status_t HSA_API
  * @details Allows waiting for any of several signal and conditions pairs to be
  * satisfied. The function returns the index into the list of signals of the
- * first satisfying signal-condition pair. The value of the satisfying signal’s
+ * first satisfying signal-condition pair. The value of the satisfying signal's
  * value is returned in satisfying_value unless satisfying_value is NULL. This
  * function provides only relaxed memory semantics.
@@ -857,7 +922,10 @@ typedef enum {
   * Number of links to hop when accessing the memory pool from the specified
-  * agent. The type of this attribute is uint32_t.
+  * agent. The value of this attribute is zero if the memory pool is associated
+  * with the agent, or if the access type is
+  * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. The type of this attribute is
+  * uint32_t.
diff --git a/src/inc/hsa_ven_amd_loaded_code_object.h b/src/inc/hsa_ven_amd_loaded_code_object.h
deleted file mode 100644
index fe56e3813..000000000
--- a/src/inc/hsa_ven_amd_loaded_code_object.h
+++ /dev/null
@@ -1,95 +0,0 @@
-// The University of Illinois/NCSA
-// Open Source License (NCSA)
-// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
-// Developed by:
-//                 AMD Research and AMD HSA Software Development
-//                 Advanced Micro Devices, Inc.
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to
-// deal with the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-//  - Redistributions of source code must retain the above copyright notice,
-//    this list of conditions and the following disclaimers.
-//  - Redistributions in binary form must reproduce the above copyright
-//    notice, this list of conditions and the following disclaimers in
-//    the documentation and/or other materials provided with the distribution.
-//  - Neither the names of Advanced Micro Devices, Inc,
-//    nor the names of its contributors may be used to endorse or promote
-//    products derived from this Software without specific prior written
-//    permission.
-// HSA AMD extension for loaded code objects.
-#include "hsa.h"
-#ifdef __cplusplus
-extern "C" {
-#endif // __cplusplus
- * @brief Records loaded code object's host address in @p host_address given
- * loaded code object's device address. Recorded host address points to host
- * accessible memory, which is identical to memory pointed to by device address.
- * If device address already points to host accessible memory, then device
- * address is recorded in @p host_address.
- *
- * @param[in] device_address Device address.
- *
- * @param[out] host_address Pointer to application-allocated buffer, where to
- * record host address.
- *
- * @retval HSA_STATUS_SUCCESS Function has been executed successfully.
- *
- * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime has not been initialized.
- *
- * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p device address is invalid/null,
- * or @p host address is null.
- */
-hsa_status_t HSA_API hsa_ven_amd_loaded_code_object_query_host_address(
-  const void *device_address,
-  const void **host_address);
- * @brief Extension's version.
- */
-#define hsa_ven_amd_loaded_code_object 001000
- * @brief Extension's function table.
- */
-typedef struct hsa_ven_amd_loaded_code_object_1_00_pfn_s {
-  hsa_status_t (*hsa_ven_amd_loaded_code_object_query_host_address)(
-    const void *device_address,
-    const void **host_address);
-} hsa_ven_amd_loaded_code_object_1_00_pfn_t;
-#ifdef __cplusplus
-} // extern "C"
-#endif // __cplusplus
diff --git a/src/inc/hsa_ven_amd_loader.h b/src/inc/hsa_ven_amd_loader.h
new file mode 100644
index 000000000..804a360a2
--- /dev/null
+++ b/src/inc/hsa_ven_amd_loader.h
@@ -0,0 +1,249 @@
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
+// Developed by:
+//                 AMD Research and AMD HSA Software Development
+//                 Advanced Micro Devices, Inc.
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+// HSA AMD extension for additional loader functionality.
+#include "hsa.h"
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+ * @brief Queries equivalent host address for given @p device_address, and
+ * records it in @p host_address.
+ *
+ *
+ * @details Contents of memory pointed to by @p host_address would be identical
+ * to contents of memory pointed to by @p device_address. Only difference
+ * between the two is host accessibility: @p host_address is always accessible
+ * from host, @p device_address might not be accessible from host.
+ *
+ * If @p device_address already points to host accessible memory, then the value
+ * of @p device_address is simply copied into @p host_address.
+ *
+ * The lifetime of @p host_address is the same as the lifetime of @p
+ * device_address, and both lifetimes are limited by the lifetime of the
+ * executable that is managing these addresses.
+ *
+ *
+ * @param[in] device_address Device address to query equivalent host address
+ * for.
+ *
+ * @param[out] host_address Pointer to application-allocated buffer to record
+ * queried equivalent host address in.
+ *
+ *
+ * @retval HSA_STATUS_SUCCESS Function is executed successfully.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p device_address is invalid or
+ * null, or @p host_address is null.
+ */
+hsa_status_t HSA_API hsa_ven_amd_loader_query_host_address(
+  const void *device_address,
+  const void **host_address);
+ * @brief The storage type of the code object that is backing loaded memory
+ * segment.
+ */
+typedef enum {
+  /**
+   * Loaded memory segment is not backed by any code object (anonymous), as the
+   * case would be with BSS (uninitialized data).
+   */
+  /**
+   * Loaded memory segment is backed by the code object that is stored in the
+   * file.
+   */
+  /**
+   * Loaded memory segment is backed by the code object that is stored in the
+   * memory.
+   */
+} hsa_ven_amd_loader_code_object_storage_type_t;
+ * @brief Loaded memory segment descriptor.
+ *
+ *
+ * @details Loaded memory segment descriptor describes underlying loaded memory
+ * segment. Loaded memory segment is created/allocated by the executable during
+ * the loading of the code object that is backing underlying memory segment.
+ *
+ * The lifetime of underlying memory segment is limited by the lifetime of the
+ * executable that is managing underlying memory segment.
+ */
+typedef struct hsa_ven_amd_loader_segment_descriptor_s {
+  /**
+   * Agent underlying memory segment is allocated on. If the code object that is
+   * backing underlying memory segment is program code object, then 0.
+   */
+  hsa_agent_t agent;
+  /**
+   * Executable that is managing this underlying memory segment.
+   */
+  hsa_executable_t executable;
+  /**
+   * Storage type of the code object that is backing underlying memory segment.
+   */
+  hsa_ven_amd_loader_code_object_storage_type_t code_object_storage_type;
+  /**
+   * If the storage type of the code object that is backing underlying memory
+   * segment is:
+   *   - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, then null-terminated
+   *     filepath to the code object;
+   *     accessible pointer to the first byte of the code object.
+   */
+  const void *code_object_storage_base;
+  /**
+   * If the storage type of the code object that is backing underlying memory
+   * segment is:
+   *     the filepath to the code object (including null-terminating character);
+   *     bytes, of the memory occupied by the code object.
+   */
+  size_t code_object_storage_size;
+  /**
+   * If the storage type of the code object that is backing underlying memory
+   * segment is:
+   *   - other, then offset, in bytes, from the beginning of the code object to
+   *     the first byte in the code object data is copied from.
+   */
+  size_t code_object_storage_offset;
+  /**
+   * Starting address of the underlying memory segment.
+   */
+  const void *segment_base;
+  /**
+   * Size, in bytes, of the underlying memory segment.
+   */
+  size_t segment_size;
+} hsa_ven_amd_loader_segment_descriptor_t;
+ * @brief Either queries loaded memory segment descriptors, or total number of
+ * loaded memory segment descriptors.
+ *
+ *
+ * @details If @p segment_descriptors is not null and @p num_segment_descriptors
+ * points to number that exactly matches total number of loaded memory segment
+ * descriptors, then queries loaded memory segment descriptors, and records them
+ * in @p segment_descriptors. If @p segment_descriptors is null and @p
+ * num_segment_descriptors points to zero, then queries total number of loaded
+ * memory segment descriptors, and records it in @p num_segment_descriptors. In
+ * all other cases returns appropriate error code (see below).
+ *
+ * The caller of this function is responsible for the allocation/deallocation
+ * and the lifetime of @p segment_descriptors and @p num_segment_descriptors.
+ *
+ * The lifetime of loaded memory segments that are described by queried loaded
+ * memory segment descriptors is limited by the lifetime of the executable that
+ * is managing loaded memory segments.
+ *
+ * Queried loaded memory segment descriptors are always self-consistent: they
+ * describe a complete set of loaded memory segments that are being backed by
+ * fully loaded code objects that are present at the time (i.e. this function
+ * is blocked until all executable manipulations are fully complete).
+ *
+ *
+ * @param[out] segment_descriptors Pointer to application-allocated buffer to
+ * record queried loaded memory segment descriptors in. Can be null if @p
+ * num_segment_descriptors points to zero.
+ *
+ * @param[in,out] num_segment_descriptors Pointer to application-allocated
+ * buffer that contains either total number of loaded memory segment descriptors
+ * or zero.
+ *
+ *
+ * @retval HSA_STATUS_SUCCESS Function is executed successfully.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p segment_descriptors is null
+ * while @p num_segment_descriptors points to non-zero number, @p
+ * segment_descriptors is not null while @p num_segment_descriptors points to
+ * zero, or @p num_segment_descriptors is null.
+ *
+ * @retval HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS @p num_segment_descriptors
+ * does not point to number that exactly matches total number of loaded memory
+ * segment descriptors.
+ */
+hsa_status_t HSA_API hsa_ven_amd_loader_query_segment_descriptors(
+  hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+  size_t *num_segment_descriptors);
+ * @brief Extension version.
+ */
+#define hsa_ven_amd_loader 001000
+ * @brief Extension function table.
+ */
+typedef struct hsa_ven_amd_loader_1_00_pfn_s {
+  hsa_status_t (*hsa_ven_amd_loader_query_host_address)(
+    const void *device_address,
+    const void **host_address);
+  hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)(
+    hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+    size_t *num_segment_descriptors);
+} hsa_ven_amd_loader_1_00_pfn_t;
+#ifdef __cplusplus
+#endif /* __cplusplus */
+#endif /* HSA_VEN_AMD_LOADER_H */
diff --git a/src/libamdhsacode/amd_elf_image.cpp b/src/libamdhsacode/amd_elf_image.cpp
index df8748052..fb36d6234 100644
--- a/src/libamdhsacode/amd_elf_image.cpp
+++ b/src/libamdhsacode/amd_elf_image.cpp
@@ -456,6 +456,7 @@ namespace amd {
       uint64_t imageSize() const override { return phdr.p_filesz; }
       uint64_t vaddr() const override { return phdr.p_vaddr; }
       uint64_t flags() const override { return phdr.p_flags; }
+      uint64_t offset() const override { return phdr.p_offset; }
       const char* data() const override;
       uint16_t getSegmentIndex() override;
       bool updateAddSection(Section *section) override;
@@ -1368,7 +1369,7 @@ namespace amd {
           section = new GElfRelocationSection(this);
         } else if (shdr.sh_type == SHT_STRTAB) {
           section = new GElfStringTable(this);
-        } else if (shdr.sh_type == SHT_SYMTAB) {
+        } else if (shdr.sh_type == SHT_SYMTAB || shdr.sh_type == SHT_DYNSYM) {
           section = new GElfSymbolTable(this);
         } else if (shdr.sh_type == SHT_NULL) {
           section = 0;
@@ -1391,14 +1392,14 @@ namespace amd {
       for (size_t n = 1; n < sections.size(); ++n) {
         GElfSection* section = sections[n].get();
-        if (section->type() == SHT_SYMTAB) {
+        if (section->type() == SHT_SYMTAB || section->type() == SHT_DYNSYM) {
           if (!section->pullData()) { return false; }
       for (size_t n = 1; n < sections.size(); ++n) {
         GElfSection* section = sections[n].get();
-        if (section->type() != SHT_STRTAB && section->type() != SHT_SYMTAB) {
+        if (section->type() != SHT_STRTAB && section->type() != SHT_SYMTAB && section->type() != SHT_DYNSYM) {
           if (!section->pullData()) { return false; }
diff --git a/src/libamdhsacode/amd_hsa_code.cpp b/src/libamdhsacode/amd_hsa_code.cpp
index ed8753359..464ca7d1d 100644
--- a/src/libamdhsacode/amd_hsa_code.cpp
+++ b/src/libamdhsacode/amd_hsa_code.cpp
@@ -1181,7 +1181,13 @@ namespace code {
     void AmdHsaCode::PrintRelocationData(std::ostream& out, RelocationSection* section)
-      out << "    Relocation Entries for " << section->targetSection()->Name() << " Section (total " << section->relocationCount() << "):" << std::endl;
+      if (section->targetSection()) {
+        out << "    Relocation Entries for " << section->targetSection()->Name() << " Section (total " << section->relocationCount() << "):" << std::endl;
+      } else {
+        // Dynamic relocations do not have a target section, they work with
+        // virtual addresses.
+        out << "    Dynamic Relocation Entries (total " << section->relocationCount() << "):" << std::endl;
+      }
       for (size_t i = 0; i < section->relocationCount(); ++i) {
         out << "      Relocation (Index " << i << "):" << std::endl;
         out << "        Type: " << section->relocation(i)->type() << std::endl;
diff --git a/src/loader/executable.cpp b/src/loader/executable.cpp
index 45c6abd61..17eb7c1ca 100644
--- a/src/loader/executable.cpp
+++ b/src/loader/executable.cpp
@@ -137,7 +137,7 @@ void Loader::Destroy(Loader *loader)
 Executable* AmdHsaCodeLoader::CreateExecutable(
   hsa_profile_t profile, const char *options)
-  std::lock_guard<std::mutex> lock(executables_mutex);
+  WriterLockGuard<ReaderWriterLock> writer_lock(rw_lock_);
   executables.push_back(new ExecutableImpl(profile, context, executables.size()));
   return executables.back();
@@ -145,7 +145,8 @@ Executable* AmdHsaCodeLoader::CreateExecutable(
 void AmdHsaCodeLoader::DestroyExecutable(Executable *executable)
-  std::lock_guard<std::mutex> lock(executables_mutex);
+  WriterLockGuard<ReaderWriterLock> writer_lock(rw_lock_);
   executables[((ExecutableImpl*)executable)->id()] = nullptr;
   delete executable;
@@ -156,7 +157,7 @@ hsa_status_t AmdHsaCodeLoader::IterateExecutables(
     void *data),
   void *data)
-  std::lock_guard<std::mutex> lock(executables_mutex);
+  WriterLockGuard<ReaderWriterLock> writer_lock(rw_lock_);
   for (auto &exec : executables) {
@@ -169,12 +170,57 @@ hsa_status_t AmdHsaCodeLoader::IterateExecutables(
+hsa_status_t AmdHsaCodeLoader::QuerySegmentDescriptors(
+  hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+  size_t *num_segment_descriptors)
+  if (!num_segment_descriptors) {
+  }
+  if (*num_segment_descriptors == 0 && segment_descriptors) {
+  }
+  if (*num_segment_descriptors != 0 && !segment_descriptors) {
+  }
+  this->EnableReadOnlyMode();
+  size_t actual_num_segment_descriptors = 0;
+  for (auto &executable : executables) {
+    if (executable) {
+      actual_num_segment_descriptors += executable->GetNumSegmentDescriptors();
+    }
+  }
+  if (*num_segment_descriptors == 0) {
+    *num_segment_descriptors = actual_num_segment_descriptors;
+    this->DisableReadOnlyMode();
+  }
+  if (*num_segment_descriptors != actual_num_segment_descriptors) {
+    this->DisableReadOnlyMode();
+  }
+  size_t i = 0;
+  for (auto &executable : executables) {
+    if (executable) {
+      i += executable->QuerySegmentDescriptors(segment_descriptors, actual_num_segment_descriptors, i);
+    }
+  }
+  this->DisableReadOnlyMode();
 uint64_t AmdHsaCodeLoader::FindHostAddress(uint64_t device_address)
+  ReaderLockGuard<ReaderWriterLock> reader_lock(rw_lock_);
   if (device_address == 0) {
     return 0;
-  std::lock_guard<std::mutex> lock(executables_mutex);
   for (auto &exec : executables) {
     if (exec != nullptr) {
       uint64_t host_address = exec->FindHostAddress(device_address);
@@ -186,6 +232,26 @@ uint64_t AmdHsaCodeLoader::FindHostAddress(uint64_t device_address)
   return 0;
+void AmdHsaCodeLoader::EnableReadOnlyMode()
+  rw_lock_.ReaderLock();
+  for (auto &executable : executables) {
+    if (executable) {
+      ((ExecutableImpl*)executable)->EnableReadOnlyMode();
+    }
+  }
+// Leaves read-only mode. The per-executable reader locks are released first
+// and the loader-level reader lock last, so that the executables list stays
+// protected from writers for as long as it is being walked here.
+void AmdHsaCodeLoader::DisableReadOnlyMode()
+  for (auto &executable : executables) {
+    if (executable) {
+      ((ExecutableImpl*)executable)->DisableReadOnlyMode();
+    }
+  }
+  rw_lock_.ReaderUnlock();
 // SymbolImpl.                                                                    //
@@ -754,6 +820,44 @@ hsa_status_t ExecutableImpl::IterateLoadedCodeObjects(
+size_t ExecutableImpl::GetNumSegmentDescriptors()
+  // assuming we are in readonly mode.
+  size_t actual_num_segment_descriptors = 0;
+  for (auto &obj : loaded_code_objects) {
+    actual_num_segment_descriptors += obj->LoadedSegments().size();
+  }
+  return actual_num_segment_descriptors;
+// Writes one descriptor per loaded segment of every loaded code object of
+// this executable into segment_descriptors, starting at index
+// first_empty_segment_descriptor, and returns the number of descriptors
+// written. total_num_segment_descriptors is the capacity of the buffer.
+size_t ExecutableImpl::QuerySegmentDescriptors(
+  hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+  size_t total_num_segment_descriptors,
+  size_t first_empty_segment_descriptor)
+  // assuming we are in readonly mode.
+  assert(segment_descriptors);
+  // An executable with no loaded segments may be visited after the buffer is
+  // already full, so the start index is allowed to equal the capacity.
+  assert(first_empty_segment_descriptor <= total_num_segment_descriptors);
+  size_t i = first_empty_segment_descriptor;
+  for (auto &obj : loaded_code_objects) {
+    for (auto &seg : obj->LoadedSegments()) {
+      // Bounds are checked per write; a code object without segments writes
+      // nothing and must not trip the check.
+      assert(i < total_num_segment_descriptors);
+      segment_descriptors[i].agent = seg->Agent();
+      segment_descriptors[i].executable = Executable::Handle(seg->Owner());
+      segment_descriptors[i].code_object_storage_type = HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY;
+      segment_descriptors[i].code_object_storage_base = obj->ElfData();
+      segment_descriptors[i].code_object_storage_size = obj->ElfSize();
+      segment_descriptors[i].code_object_storage_offset = seg->StorageOffset();
+      segment_descriptors[i].segment_base = seg->Address(seg->VAddr());
+      segment_descriptors[i].segment_size = seg->Size();
+      ++i;
+    }
+  }
+  return i - first_empty_segment_descriptor;
 uint64_t ExecutableImpl::FindHostAddress(uint64_t device_address)
   for (auto &obj : loaded_code_objects) {
@@ -771,6 +875,16 @@ uint64_t ExecutableImpl::FindHostAddress(uint64_t device_address)
   return 0;
+void ExecutableImpl::EnableReadOnlyMode()
+  rw_lock_.ReaderLock();
+void ExecutableImpl::DisableReadOnlyMode()
+  rw_lock_.ReaderUnlock();
 #define HSAERRCHECK(hsc)                                                       \
   if (hsc != HSA_STATUS_SUCCESS) {                                             \
     assert(false);                                                             \
@@ -854,7 +968,7 @@ hsa_status_t ExecutableImpl::LoadCodeObject(
   if (loaderOptions.DumpAll()->is_set() || loaderOptions.DumpCode()->is_set()) {
-    if (!code->SaveToFile(amd::hsa::DumpFileName(loaderOptions.DumpDir()->value(), LOADER_DUMP_PREFIX, "co", dumpNum))) {
+    if (!code->SaveToFile(amd::hsa::DumpFileName(loaderOptions.DumpDir()->value(), LOADER_DUMP_PREFIX, "hsaco", dumpNum))) {
       // Ignore error.
@@ -946,7 +1060,7 @@ hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent, code::Segment* s)
   if (need_alloc) {
     void* ptr = context_->SegmentAlloc(segment, agent, s->memSize(), s->align(), true);
     if (!ptr) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; }
-    new_seg = new Segment(this, agent, segment, ptr, s->memSize(), s->vaddr());
+    new_seg = new Segment(this, agent, segment, ptr, s->memSize(), s->vaddr(), s->offset());
     new_seg->Copy(s->vaddr(), s->data(), s->imageSize());
@@ -1422,7 +1536,7 @@ hsa_status_t ExecutableImpl::LoadSegmentV2(hsa_agent_t agent, code::Segment* s,
   void* ptr = context_->SegmentAlloc(segment, agent, s->memSize(), s->align(), true);
   if (!ptr) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; }
-  Segment *new_seg = new Segment(this, agent, segment, ptr, s->memSize(), s->vaddr());
+  Segment *new_seg = new Segment(this, agent, segment, ptr, s->memSize(), s->vaddr(), s->offset());
   new_seg->Copy(s->vaddr(), s->data(), s->imageSize());
diff --git a/src/loader/executable.hpp b/src/loader/executable.hpp
index 6801be698..478a03ffd 100644
--- a/src/loader/executable.hpp
+++ b/src/loader/executable.hpp
@@ -273,16 +273,18 @@ class Segment : public LoadedSegment, public ExecutableObject {
   size_t size;
   uint64_t vaddr;
   bool frozen;
+  size_t storage_offset;
-  Segment(ExecutableImpl *owner_, hsa_agent_t agent_, amdgpu_hsa_elf_segment_t segment_, void* ptr_, size_t size_, uint64_t vaddr_)
+  Segment(ExecutableImpl *owner_, hsa_agent_t agent_, amdgpu_hsa_elf_segment_t segment_, void* ptr_, size_t size_, uint64_t vaddr_, size_t storage_offset_)
     : ExecutableObject(owner_, agent_), segment(segment_),
-      ptr(ptr_), size(size_), vaddr(vaddr_), frozen(false) { }
+      ptr(ptr_), size(size_), vaddr(vaddr_), frozen(false), storage_offset(storage_offset_) { }
   amdgpu_hsa_elf_segment_t ElfSegment() const { return segment; }
   void* Ptr() const { return ptr; }
   size_t Size() const { return size; }
   uint64_t VAddr() const { return vaddr; }
+  size_t StorageOffset() const { return storage_offset;  }
   bool GetInfo(amd_loaded_segment_info_t attribute, void *value) override;
@@ -399,8 +401,18 @@ class ExecutableImpl final: public Executable {
       void *data),
     void *data);
+  size_t GetNumSegmentDescriptors() override;
+  size_t QuerySegmentDescriptors(
+    hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+    size_t total_num_segment_descriptors,
+    size_t first_empty_segment_descriptor) override;
   uint64_t FindHostAddress(uint64_t device_address) override;
+  void EnableReadOnlyMode();
+  void DisableReadOnlyMode();
   void Print(std::ostream& out) override;
   bool PrintToFile(const std::string& filename) override;
@@ -455,7 +467,7 @@ class AmdHsaCodeLoader : public Loader {
   Context* context;
   std::vector<Executable*> executables;
-  std::mutex executables_mutex;
+  amd::hsa::common::ReaderWriterLock rw_lock_;
   AmdHsaCodeLoader(Context* context_)
@@ -473,7 +485,14 @@ class AmdHsaCodeLoader : public Loader {
       void *data),
     void *data) override;
+  hsa_status_t QuerySegmentDescriptors(
+    hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+    size_t *num_segment_descriptors) override;
   uint64_t FindHostAddress(uint64_t device_address) override;
+  void EnableReadOnlyMode();
+  void DisableReadOnlyMode();
 } // namespace loader
diff --git a/src/loader/loaders.cpp b/src/loader/loaders.cpp
index 7cb81f111..f01d48d27 100644
--- a/src/loader/loaders.cpp
+++ b/src/loader/loaders.cpp
@@ -87,6 +87,10 @@ namespace loader {
     gfx803.handle = 803;
     gfx804.handle = 804;
     gfx810.handle = 810;
+#if defined(GFX9_BUILD)
+    gfx900.handle = 900;
+    gfx901.handle = 901;
+#endif // GFX9_BUILD
   hsa_isa_t OfflineLoaderContext::IsaFromName(const char *name)
@@ -108,6 +112,12 @@ namespace loader {
       return gfx804;
     } else if (sname == "AMD:AMDGPU:8:1:0") {
       return gfx810;
+#if defined(GFX9_BUILD)
+    } else if (sname == "AMD:AMDGPU:9:0:0") {
+      return gfx900;
+    } else if (sname == "AMD:AMDGPU:9:0:1") {
+      return gfx901;
+#endif // GFX9_BUILD
     } else {
       return invalid;
diff --git a/src/loader/loaders.hpp b/src/loader/loaders.hpp
index b0a6aa0ec..85a9ed2ec 100644
--- a/src/loader/loaders.hpp
+++ b/src/loader/loaders.hpp
@@ -55,7 +55,11 @@ namespace loader {
     hsa_isa_t invalid;
     hsa_isa_t gfx700, gfx701, gfx800, gfx801, gfx802, gfx803, gfx804, gfx810;
+#if defined(GFX9_BUILD)
+    hsa_isa_t gfx900, gfx901;
     hsa_isa_t reserved;
+#endif // GFX9_BUILD
     std::ostream& out;
     typedef std::set<void*> PointerSet;
     PointerSet pointers;