From f23829c7d1d5a31e12b1b7f820143b8e03818895 Mon Sep 17 00:00:00 2001 From: asaigal Date: Mon, 30 Sep 2024 18:20:31 +0000 Subject: [PATCH] #0: Apply feedback on profiler changes: - Set PROFILER_DONE flag to avoid sending writes to DRAM profiler buffer, instead of calling profileScopeGuaranteed ctor and dtor as nops - Reduces overhead associated with checking launch message twice --- .../dispatch/test_pgm_dispatch.cpp | 17 +++------ tt_metal/hw/firmware/src/brisc.cc | 5 ++- tt_metal/hw/firmware/src/erisc.cc | 5 ++- tt_metal/hw/inc/dev_msgs.h | 1 + tt_metal/impl/dispatch/kernels/cq_common.hpp | 1 - .../impl/dispatch/kernels/cq_prefetch.cpp | 6 ++- tt_metal/tools/profiler/kernel_profiler.hpp | 38 +++++++++---------- 7 files changed, 35 insertions(+), 38 deletions(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp index 9181171ea398..5e41ba01a6a4 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp @@ -213,7 +213,7 @@ int main(int argc, char **argv) { bool pass = true; try { int device_id = 0; - tt_metal::Device *device = tt_metal::CreateDevice(device_id, 1, 0, 875521024); + tt_metal::Device *device = tt_metal::CreateDevice(device_id); CommandQueue& cq = device->command_queue(); @@ -234,23 +234,18 @@ int main(int argc, char **argv) { tt_metal::detail::SetLazyCommandQueueMode(true); } - // auto start = std::chrono::system_clock::now(); - uint32_t tid = BeginTraceCapture(device, 0); + auto start = std::chrono::system_clock::now(); for (int i = 0; i < iterations_g; i++) { EnqueueProgram(cq, program[0], false); for (int j = 0; j < nfast_kernels_g; j++) { EnqueueProgram(cq, program[1], false); } } - EndTraceCapture(device, 0, tid); - auto start = std::chrono::system_clock::now(); - ReplayTrace(device, 0, tid, true); + if (time_just_finish_g) { + start = std::chrono::system_clock::now(); + } + Finish(cq); auto end = std::chrono::system_clock::now(); - // if (time_just_finish_g) { - // start = std::chrono::system_clock::now(); - // } - // Finish(cq); - // auto end = std::chrono::system_clock::now(); log_info(LogTest, "Warmup iterations: {}", warmup_iterations_g); log_info(LogTest, "Iterations: {}", iterations_g); diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 105d107f9a81..ce3f0642eba9 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -394,8 +394,9 @@ int main() { { // Only include this iteration in the device profile if the launch message is valid. This is because all workers get a go signal regardless of whether // they're running a kernel or not. We don't want to profile "invalid" iterations. - DeviceConditionalZoneScopedMainN("BRISC-FW", mailboxes->launch[launch_msg_rd_ptr].kernel_config.enables); - DeviceConditionalZoneSetCounter(mailboxes->launch[launch_msg_rd_ptr].kernel_config.host_assigned_id, mailboxes->launch[launch_msg_rd_ptr].kernel_config.enables); + DeviceZoneScopedMainN("BRISC-FW"); + DeviceValidateProfiler(mailboxes->launch[launch_msg_rd_ptr].kernel_config.enables); + DeviceZoneSetCounter(mailboxes->launch[launch_msg_rd_ptr].kernel_config.host_assigned_id); // Copies from L1 to IRAM on chips where NCRISC has IRAM l1_to_ncrisc_iram_copy(mailboxes->launch[launch_msg_rd_ptr].kernel_config.ncrisc_kernel_size16, ncrisc_kernel_start_offset16); diff --git a/tt_metal/hw/firmware/src/erisc.cc b/tt_metal/hw/firmware/src/erisc.cc index b84ed4630b49..10492df945c0 100644 --- a/tt_metal/hw/firmware/src/erisc.cc +++ b/tt_metal/hw/firmware/src/erisc.cc @@ -72,8 +72,9 @@ void __attribute__((section("erisc_l1_code.1"), noinline)) Application(void) { if (mailboxes->go_message.signal == RUN_MSG_GO) { // Only include this iteration in the device profile if the launch message is valid. This is because all workers get a go signal regardless of whether // they're running a kernel or not. We don't want to profile "invalid" iterations. - DeviceConditionalZoneScopedMainN("ERISC-FW", mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.enables); - DeviceConditionalZoneSetCounter(mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.host_assigned_id, mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.enables); + DeviceZoneScopedMainN("ERISC-FW"); + DeviceValidateProfiler(mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.enables); + DeviceZoneSetCounter(mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.host_assigned_id); enum dispatch_core_processor_masks enables = (enum dispatch_core_processor_masks)mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.enables; if (enables & DISPATCH_CLASS_MASK_ETH_DM0) { firmware_config_init(mailboxes, ProgrammableCoreType::ACTIVE_ETH, DISPATCH_CLASS_ETH_DM0); diff --git a/tt_metal/hw/inc/dev_msgs.h b/tt_metal/hw/inc/dev_msgs.h index c3ef6f34f05e..28abb160c0d9 100644 --- a/tt_metal/hw/inc/dev_msgs.h +++ b/tt_metal/hw/inc/dev_msgs.h @@ -321,6 +321,7 @@ static constexpr uint32_t TENSIX_LAUNCH_CHECK = (MEM_MAILBOX_BASE + offsetof(mai static constexpr uint32_t TENSIX_PROFILER_CHECK = (MEM_MAILBOX_BASE + offsetof(mailboxes_t, profiler)) % TT_ARCH_MAX_NOC_WRITE_ALIGNMENT; static_assert( TENSIX_LAUNCH_CHECK == 0); static_assert( TENSIX_PROFILER_CHECK == 0); +static_assert( sizeof(launch_msg_t) % TT_ARCH_MAX_NOC_WRITE_ALIGNMENT == 0); #endif #endif diff --git a/tt_metal/impl/dispatch/kernels/cq_common.hpp b/tt_metal/impl/dispatch/kernels/cq_common.hpp index 53f91b16a44a..698a0afac0f7 100644 --- a/tt_metal/impl/dispatch/kernels/cq_common.hpp +++ b/tt_metal/impl/dispatch/kernels/cq_common.hpp @@ -274,7 +274,6 @@ void cb_wait_all_pages(uint32_t n) { } template -FORCE_INLINE void cb_acquire_pages(uint32_t n) { volatile tt_l1_ptr uint32_t* sem_addr = reinterpret_cast(get_semaphore(sem_id)); diff --git a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp index 740aa5bb716a..132611c2657a 100644 --- a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp @@ -311,6 +311,9 @@ void fetch_q_get_cmds(uint32_t& fence, uint32_t& cmd_ptr, uint32_t& pcie_read_pt uint32_t process_debug_cmd(uint32_t cmd_ptr) { volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)cmd_ptr; +#if 0 + // Out of code memory for prefetcher + DPRINT + // Hack this off for now, have to revisit soon uint32_t checksum = 0; uint32_t data_start = (uint32_t)cmd + sizeof(CQPrefetchCmd); uint32_t *data = (uint32_t *)data_start; @@ -332,7 +335,7 @@ uint32_t process_debug_cmd(uint32_t cmd_ptr) { WAYPOINT("!CHK"); ASSERT(0); } - +#endif return cmd->debug.stride; } @@ -933,6 +936,7 @@ void paged_read_into_cmddat_q(uint32_t read_ptr, PrefetchExecBufState& exec_buf_ // ie, reads the data from dram and relays it on // Separate implementation that fetches more data from exec buf when cmd has been split template +FORCE_INLINE static uint32_t process_exec_buf_relay_inline_cmd(uint32_t& cmd_ptr, uint32_t& local_downstream_data_ptr, PrefetchExecBufState& exec_buf_state) { diff --git a/tt_metal/tools/profiler/kernel_profiler.hpp b/tt_metal/tools/profiler/kernel_profiler.hpp index 51a7a3c2e89e..0b50594d0809 100644 --- a/tt_metal/tools/profiler/kernel_profiler.hpp +++ b/tt_metal/tools/profiler/kernel_profiler.hpp @@ -162,6 +162,10 @@ namespace kernel_profiler{ } } + inline __attribute__((always_inline)) void set_profiler_zone_valid(bool condition) { + profiler_control_buffer[PROFILER_DONE] = !condition; + } + inline __attribute__((always_inline)) void risc_finished_profiling() { for (int i = 0; i < SUM_COUNT; i ++) @@ -361,26 +365,20 @@ namespace kernel_profiler{ static_assert (start_index < CUSTOM_MARKERS); static_assert (end_index < CUSTOM_MARKERS); - bool condition = true; - inline __attribute__((always_inline)) profileScopeGuaranteed (bool condition) + inline __attribute__((always_inline)) profileScopeGuaranteed () { - this->condition = condition; - if (this->condition) { - if constexpr (index == 0) - { - init_profiler(); - } - mark_time_at_index_inlined(start_index, timer_id); + if constexpr (index == 0) + { + init_profiler(); } + mark_time_at_index_inlined(start_index, timer_id); } inline __attribute__((always_inline)) ~profileScopeGuaranteed () { - if (this->condition) { - mark_time_at_index_inlined(end_index, get_end_timer_id(timer_id)); - if constexpr (index == 0) - { - finish_profiler(); - } + mark_time_at_index_inlined(end_index, get_end_timer_id(timer_id)); + if constexpr (index == 0) + { + finish_profiler(); } } }; @@ -417,11 +415,11 @@ namespace kernel_profiler{ #endif -#define DeviceConditionalZoneScopedMainN( name, condition ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScopeGuaranteed zone = kernel_profiler::profileScopeGuaranteed(condition); +#define DeviceValidateProfiler( condition ) kernel_profiler::set_profiler_zone_valid(condition); -#define DeviceZoneScopedMainN( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScopeGuaranteed zone = kernel_profiler::profileScopeGuaranteed(true); +#define DeviceZoneScopedMainN( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScopeGuaranteed zone = kernel_profiler::profileScopeGuaranteed(); -#define DeviceZoneScopedMainChildN( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name));kernel_profiler::profileScopeGuaranteed zone = kernel_profiler::profileScopeGuaranteed(true); +#define DeviceZoneScopedMainChildN( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name));kernel_profiler::profileScopeGuaranteed zone = kernel_profiler::profileScopeGuaranteed(); #define DeviceZoneScopedSumN1( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScopeAccumulate zone = kernel_profiler::profileScopeAccumulate(); @@ -429,10 +427,9 @@ namespace kernel_profiler{ #define DeviceZoneSetCounter( counter ) kernel_profiler::set_host_counter(counter); -#define DeviceConditionalZoneSetCounter( counter, condition ) if (condition) { kernel_profiler::set_host_counter(counter); } #else -#define DeviceConditionalZoneScopedMainN( name, condition ) +#define DeviceValidateProfiler( condition ) #define DeviceZoneScopedMainN( name ) @@ -448,5 +445,4 @@ namespace kernel_profiler{ #define DeviceZoneSetCounter( counter ) -#define DeviceConditionalZoneSetCounter( counter, condition ) #endif