Skip to content

Commit

Permalink
#0: Apply feedback on profiler changes:
Browse files Browse the repository at this point in the history
  - Set PROFILER_DONE flag to avoid sending writes to DRAM profiler
    buffer, instead of calling profileScopeGuaranteed ctor and dtor
    as nops
  - Reduces overhead associated with checking launch message twice
  • Loading branch information
tt-asaigal committed Sep 30, 2024
1 parent f1c6a73 commit f23829c
Show file tree
Hide file tree
Showing 7 changed files with 35 additions and 38 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ int main(int argc, char **argv) {
bool pass = true;
try {
int device_id = 0;
tt_metal::Device *device = tt_metal::CreateDevice(device_id, 1, 0, 875521024);
tt_metal::Device *device = tt_metal::CreateDevice(device_id);

CommandQueue& cq = device->command_queue();

Expand All @@ -234,23 +234,18 @@ int main(int argc, char **argv) {
tt_metal::detail::SetLazyCommandQueueMode(true);
}

// auto start = std::chrono::system_clock::now();
uint32_t tid = BeginTraceCapture(device, 0);
auto start = std::chrono::system_clock::now();
for (int i = 0; i < iterations_g; i++) {
EnqueueProgram(cq, program[0], false);
for (int j = 0; j < nfast_kernels_g; j++) {
EnqueueProgram(cq, program[1], false);
}
}
EndTraceCapture(device, 0, tid);
auto start = std::chrono::system_clock::now();
ReplayTrace(device, 0, tid, true);
if (time_just_finish_g) {
start = std::chrono::system_clock::now();
}
Finish(cq);
auto end = std::chrono::system_clock::now();
// if (time_just_finish_g) {
// start = std::chrono::system_clock::now();
// }
// Finish(cq);
// auto end = std::chrono::system_clock::now();

log_info(LogTest, "Warmup iterations: {}", warmup_iterations_g);
log_info(LogTest, "Iterations: {}", iterations_g);
Expand Down
5 changes: 3 additions & 2 deletions tt_metal/hw/firmware/src/brisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -394,8 +394,9 @@ int main() {
{
// Only include this iteration in the device profile if the launch message is valid. This is because all workers get a go signal regardless of whether
// they're running a kernel or not. We don't want to profile "invalid" iterations.
DeviceConditionalZoneScopedMainN("BRISC-FW", mailboxes->launch[launch_msg_rd_ptr].kernel_config.enables);
DeviceConditionalZoneSetCounter(mailboxes->launch[launch_msg_rd_ptr].kernel_config.host_assigned_id, mailboxes->launch[launch_msg_rd_ptr].kernel_config.enables);
DeviceZoneScopedMainN("BRISC-FW");
DeviceValidateProfiler(mailboxes->launch[launch_msg_rd_ptr].kernel_config.enables);
DeviceZoneSetCounter(mailboxes->launch[launch_msg_rd_ptr].kernel_config.host_assigned_id);
// Copies from L1 to IRAM on chips where NCRISC has IRAM
l1_to_ncrisc_iram_copy(mailboxes->launch[launch_msg_rd_ptr].kernel_config.ncrisc_kernel_size16, ncrisc_kernel_start_offset16);

Expand Down
5 changes: 3 additions & 2 deletions tt_metal/hw/firmware/src/erisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,9 @@ void __attribute__((section("erisc_l1_code.1"), noinline)) Application(void) {
if (mailboxes->go_message.signal == RUN_MSG_GO) {
// Only include this iteration in the device profile if the launch message is valid. This is because all workers get a go signal regardless of whether
// they're running a kernel or not. We don't want to profile "invalid" iterations.
DeviceConditionalZoneScopedMainN("ERISC-FW", mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.enables);
DeviceConditionalZoneSetCounter(mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.host_assigned_id, mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.enables);
DeviceZoneScopedMainN("ERISC-FW");
DeviceValidateProfiler(mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.enables);
DeviceZoneSetCounter(mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.host_assigned_id);
enum dispatch_core_processor_masks enables = (enum dispatch_core_processor_masks)mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.enables;
if (enables & DISPATCH_CLASS_MASK_ETH_DM0) {
firmware_config_init(mailboxes, ProgrammableCoreType::ACTIVE_ETH, DISPATCH_CLASS_ETH_DM0);
Expand Down
1 change: 1 addition & 0 deletions tt_metal/hw/inc/dev_msgs.h
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,7 @@ static constexpr uint32_t TENSIX_LAUNCH_CHECK = (MEM_MAILBOX_BASE + offsetof(mai
static constexpr uint32_t TENSIX_PROFILER_CHECK = (MEM_MAILBOX_BASE + offsetof(mailboxes_t, profiler)) % TT_ARCH_MAX_NOC_WRITE_ALIGNMENT;
static_assert( TENSIX_LAUNCH_CHECK == 0);
static_assert( TENSIX_PROFILER_CHECK == 0);
static_assert( sizeof(launch_msg_t) % TT_ARCH_MAX_NOC_WRITE_ALIGNMENT == 0);
#endif
#endif

Expand Down
1 change: 0 additions & 1 deletion tt_metal/impl/dispatch/kernels/cq_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,6 @@ void cb_wait_all_pages(uint32_t n) {
}

template<uint32_t noc_xy, uint32_t sem_id>
FORCE_INLINE
void cb_acquire_pages(uint32_t n) {
volatile tt_l1_ptr uint32_t* sem_addr =
reinterpret_cast<volatile tt_l1_ptr uint32_t*>(get_semaphore<fd_core_type>(sem_id));
Expand Down
6 changes: 5 additions & 1 deletion tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,9 @@ void fetch_q_get_cmds(uint32_t& fence, uint32_t& cmd_ptr, uint32_t& pcie_read_pt
uint32_t process_debug_cmd(uint32_t cmd_ptr) {

volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)cmd_ptr;
#if 0
// Out of code memory for prefetcher + DPRINT
// Hack this off for now, have to revisit soon
uint32_t checksum = 0;
uint32_t data_start = (uint32_t)cmd + sizeof(CQPrefetchCmd);
uint32_t *data = (uint32_t *)data_start;
Expand All @@ -332,7 +335,7 @@ uint32_t process_debug_cmd(uint32_t cmd_ptr) {
WAYPOINT("!CHK");
ASSERT(0);
}

#endif
return cmd->debug.stride;
}

Expand Down Expand Up @@ -933,6 +936,7 @@ void paged_read_into_cmddat_q(uint32_t read_ptr, PrefetchExecBufState& exec_buf_
// ie, reads the data from dram and relays it on
// Separate implementation that fetches more data from exec buf when cmd has been split
template<bool send_to_dispatch_master>
FORCE_INLINE
static uint32_t process_exec_buf_relay_inline_cmd(uint32_t& cmd_ptr,
uint32_t& local_downstream_data_ptr,
PrefetchExecBufState& exec_buf_state) {
Expand Down
38 changes: 17 additions & 21 deletions tt_metal/tools/profiler/kernel_profiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,10 @@ namespace kernel_profiler{
}
}

inline __attribute__((always_inline)) void set_profiler_zone_valid(bool condition) {
profiler_control_buffer[PROFILER_DONE] = !condition;
}

inline __attribute__((always_inline)) void risc_finished_profiling()
{
for (int i = 0; i < SUM_COUNT; i ++)
Expand Down Expand Up @@ -361,26 +365,20 @@ namespace kernel_profiler{

static_assert (start_index < CUSTOM_MARKERS);
static_assert (end_index < CUSTOM_MARKERS);
bool condition = true;
inline __attribute__((always_inline)) profileScopeGuaranteed (bool condition)
inline __attribute__((always_inline)) profileScopeGuaranteed ()
{
this->condition = condition;
if (this->condition) {
if constexpr (index == 0)
{
init_profiler();
}
mark_time_at_index_inlined(start_index, timer_id);
if constexpr (index == 0)
{
init_profiler();
}
mark_time_at_index_inlined(start_index, timer_id);
}
inline __attribute__((always_inline)) ~profileScopeGuaranteed ()
{
if (this->condition) {
mark_time_at_index_inlined(end_index, get_end_timer_id(timer_id));
if constexpr (index == 0)
{
finish_profiler();
}
mark_time_at_index_inlined(end_index, get_end_timer_id(timer_id));
if constexpr (index == 0)
{
finish_profiler();
}
}
};
Expand Down Expand Up @@ -417,22 +415,21 @@ namespace kernel_profiler{

#endif

#define DeviceConditionalZoneScopedMainN( name, condition ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScopeGuaranteed<hash, 0> zone = kernel_profiler::profileScopeGuaranteed<hash, 0>(condition);
#define DeviceValidateProfiler( condition ) kernel_profiler::set_profiler_zone_valid(condition);

#define DeviceZoneScopedMainN( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScopeGuaranteed<hash, 0> zone = kernel_profiler::profileScopeGuaranteed<hash, 0>(true);
#define DeviceZoneScopedMainN( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScopeGuaranteed<hash, 0> zone = kernel_profiler::profileScopeGuaranteed<hash, 0>();

#define DeviceZoneScopedMainChildN( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name));kernel_profiler::profileScopeGuaranteed<hash, 1> zone = kernel_profiler::profileScopeGuaranteed<hash, 1>(true);
#define DeviceZoneScopedMainChildN( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name));kernel_profiler::profileScopeGuaranteed<hash, 1> zone = kernel_profiler::profileScopeGuaranteed<hash, 1>();

#define DeviceZoneScopedSumN1( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScopeAccumulate<hash, 0> zone = kernel_profiler::profileScopeAccumulate<hash, 0>();

#define DeviceZoneScopedSumN2( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScopeAccumulate<hash, 1> zone = kernel_profiler::profileScopeAccumulate<hash, 1>();

#define DeviceZoneSetCounter( counter ) kernel_profiler::set_host_counter(counter);

#define DeviceConditionalZoneSetCounter( counter, condition ) if (condition) { kernel_profiler::set_host_counter(counter); }
#else

#define DeviceConditionalZoneScopedMainN( name, condition )
#define DeviceValidateProfiler( condition )

#define DeviceZoneScopedMainN( name )

Expand All @@ -448,5 +445,4 @@ namespace kernel_profiler{

#define DeviceZoneSetCounter( counter )

#define DeviceConditionalZoneSetCounter( counter, condition )
#endif

0 comments on commit f23829c

Please sign in to comment.