Skip to content

Commit

Permalink
#0: Add validation test for dispatched remote circular buffer config …
Browse files Browse the repository at this point in the history
…to device
  • Loading branch information
tt-aho committed Jan 29, 2025
1 parent 3adec9c commit ed8472f
Show file tree
Hide file tree
Showing 9 changed files with 366 additions and 62 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ TEST_F(DispatchFixture, TensixProgramGlobalCircularBuffers) {
tt::DataFormat tile_format = tt::DataFormat::Float16_b;
auto all_cores = sender_cores.merge(receiver_cores).merge(dummy_receiver_cores);
auto device = devices_[0];
std::vector<std::pair<CoreCoord, CoreRangeSet>> sender_receiver_core_mapping = {{CoreCoord(0, 0), receiver_cores}};
std::vector<std::pair<CoreCoord, CoreRangeSet>> sender_receiver_core_mapping = {{sender_core, receiver_cores}};
auto global_cb = tt::tt_metal::v1::experimental::CreateGlobalCircularBuffer(
device, sender_receiver_core_mapping, 3200, tt::tt_metal::BufferType::L1);
std::vector<std::pair<CoreCoord, CoreRangeSet>> dummy_sender_receiver_core_mapping = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ set(UNIT_TESTS_DISPATCH_PROGRAM_SRC
${CMAKE_CURRENT_SOURCE_DIR}/test_dispatch_stress.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_dispatch.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_EnqueueProgram.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_global_circular_buffers.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_sub_device.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_program_reuse.cpp
PARENT_SCOPE
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <gtest/gtest.h>

#include <vector>

#include "dispatch_fixture.hpp"
#include <tt-metalium/core_coord.hpp>
#include <tt-metalium/tt_metal.hpp>
#include <tt-metalium/host_api.hpp>
#include <tt-metalium/global_circular_buffer_impl.hpp>
#include <tt-metalium/global_circular_buffer.hpp>
#include "tt_metal/include/tt_metal/program.hpp"

TEST_F(DispatchFixture, TensixProgramGlobalCircularBuffers) {
CoreCoord sender_core = CoreCoord(0, 0);
CoreRangeSet sender_cores = CoreRangeSet(CoreRange(sender_core));
CoreRangeSet receiver_cores(CoreRange({1, 1}, {2, 2}));
uint32_t global_cb_size = 3200;
uint32_t cb_page_size = 32;
tt::DataFormat tile_format = tt::DataFormat::Float16_b;
auto all_cores = sender_cores.merge(receiver_cores);
auto device = devices_[0];
std::vector<std::pair<CoreCoord, CoreRangeSet>> sender_receiver_core_mapping = {{sender_core, receiver_cores}};
auto global_cb = tt::tt_metal::v1::experimental::CreateGlobalCircularBuffer(
device, sender_receiver_core_mapping, 3200, tt::tt_metal::BufferType::L1);

tt::tt_metal::Program program = CreateProgram();
uint32_t remote_cb_index = 31;
uint32_t local_cb_index = 0;
tt::tt_metal::CircularBufferConfig global_cb_config = tt::tt_metal::CircularBufferConfig(cb_page_size);
global_cb_config.remote_index(remote_cb_index).set_page_size(cb_page_size).set_data_format(tile_format);
global_cb_config.index(local_cb_index).set_page_size(cb_page_size).set_data_format(tile_format);
auto remote_cb =
tt::tt_metal::v1::experimental::CreateCircularBuffer(program, all_cores, global_cb_config, global_cb);

std::vector<uint32_t> compile_args = {remote_cb_index};
tt::tt_metal::KernelHandle dm0_sender_kernel = tt::tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/misc/global_circular_buffer/validate_sender_config.cpp",
sender_cores,
tt::tt_metal::ReaderDataMovementConfig(compile_args));
tt::tt_metal::KernelHandle dm1_sender_kernel = tt::tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/misc/global_circular_buffer/validate_sender_config.cpp",
sender_cores,
tt::tt_metal::WriterDataMovementConfig(compile_args));
tt::tt_metal::KernelHandle compute_sender_kernel = tt::tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/misc/global_circular_buffer/validate_sender_config.cpp",
sender_cores,
tt::tt_metal::ComputeConfig{.compile_args = compile_args});
tt::tt_metal::KernelHandle dm0_receiver_kernel = tt::tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/misc/global_circular_buffer/validate_receiver_config.cpp",
receiver_cores,
tt::tt_metal::ReaderDataMovementConfig(compile_args));
tt::tt_metal::KernelHandle dm1_receiver_kernel = tt::tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/misc/global_circular_buffer/validate_receiver_config.cpp",
receiver_cores,
tt::tt_metal::WriterDataMovementConfig(compile_args));
tt::tt_metal::KernelHandle compute_receiver_kernel = tt::tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/misc/global_circular_buffer/validate_receiver_config.cpp",
receiver_cores,
tt::tt_metal::ComputeConfig{.compile_args = compile_args});

for (const auto& [sender_core, receiver_cores] : sender_receiver_core_mapping) {
auto sender_noc_coords = device->worker_core_from_logical_core(sender_core);
std::vector<CoreCoord> receiver_noc_coords;
for (const auto& receiver_core_range : receiver_cores.ranges()) {
const auto& receiver_cores_vec = corerange_to_cores(receiver_core_range);
for (const auto& receiver_core : receiver_cores_vec) {
receiver_noc_coords.push_back(device->worker_core_from_logical_core(receiver_core));
}
}
std::vector<uint32_t> sender_runtime_args(11 + receiver_noc_coords.size() * 2);
uint32_t sender_args_idx = 0;
sender_runtime_args[sender_args_idx++] = global_cb.config_address(); // config_addr
sender_runtime_args[sender_args_idx++] = 1; // is_sender
sender_runtime_args[sender_args_idx++] = receiver_noc_coords.size(); // num_receivers
sender_runtime_args[sender_args_idx++] = global_cb.buffer_address(); // fifo_start_addr
sender_runtime_args[sender_args_idx++] = global_cb.size(); // fifo_size
sender_runtime_args[sender_args_idx++] = global_cb.buffer_address(); // fifo_ptr
for (const auto& receiver_noc_coord : receiver_noc_coords) {
sender_runtime_args[sender_args_idx++] = receiver_noc_coord.x; // remote_noc_x
sender_runtime_args[sender_args_idx++] = receiver_noc_coord.y; // remote_noc_y
}
sender_runtime_args[sender_args_idx++] = 0; // aligned_pages_sent
sender_runtime_args[sender_args_idx++] = 0; // aligned_pages_acked
sender_runtime_args[sender_args_idx++] = global_cb.buffer_address(); // fifo_wr_ptr
sender_runtime_args[sender_args_idx++] =
global_cb.buffer_address() + global_cb.size(); // fifo_limit_page_aligned
sender_runtime_args[sender_args_idx++] = cb_page_size; // fifo_page_size

std::vector<uint32_t> receiver_runtime_args = {
global_cb.config_address(), // config_addr
0, // is_sender
global_cb.buffer_address(), // fifo_start_addr
global_cb.size(), // fifo_size
global_cb.buffer_address(), // fifo_ptr
sender_noc_coords.x, // sender_noc_x
sender_noc_coords.y, // sender_noc_y
0, // aligned_pages_sent
0, // aligned_pages_acked
global_cb.buffer_address(), // fifo_rd_ptr
global_cb.buffer_address() + global_cb.size(), // fifo_limit_page_aligned
cb_page_size, // fifo_page_size
};
tt::tt_metal::SetRuntimeArgs(program, dm0_sender_kernel, sender_cores, sender_runtime_args);
tt::tt_metal::SetRuntimeArgs(program, dm1_sender_kernel, sender_cores, sender_runtime_args);
tt::tt_metal::SetRuntimeArgs(program, compute_sender_kernel, sender_cores, sender_runtime_args);
tt::tt_metal::SetRuntimeArgs(program, dm0_receiver_kernel, receiver_cores, receiver_runtime_args);
tt::tt_metal::SetRuntimeArgs(program, dm1_receiver_kernel, receiver_cores, receiver_runtime_args);
tt::tt_metal::SetRuntimeArgs(program, compute_receiver_kernel, receiver_cores, receiver_runtime_args);
}
this->RunProgram(device, program);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <cstdint>
#include "debug/assert.h"

#if defined(COMPILE_FOR_TRISC)
#include "compute_kernel_api/common.h"

namespace NAMESPACE {
void MAIN {
#else
#include "dataflow_api.h"

void kernel_main() {
#endif
#if !defined(UCK_CHLKC_MATH)
constexpr uint32_t remote_cb_id = get_compile_time_arg_val(0);

auto& remote_receiver_cb_interface = get_remote_receiver_cb_interface(remote_cb_id);

uint32_t arg_idx = 0;
uint32_t config_idx = 0;
bool pass = true;
// config_addr
uint32_t config_addr = get_arg_val<uint32_t>(arg_idx++);
pass &= remote_receiver_cb_interface.config_ptr == config_addr;
ASSERT(pass);
volatile tt_l1_ptr uint32_t* config_ptr = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(config_addr);
// is_sender
bool is_sender = get_arg_val<uint32_t>(arg_idx++);
pass &= config_ptr[config_idx++] == is_sender;
ASSERT(pass);
// num_receivers
config_idx++; // Skip num_receivers
// fifo_start_addr
uint32_t fifo_start_addr = get_arg_val<uint32_t>(arg_idx++);
pass &= config_ptr[config_idx++] == fifo_start_addr;
ASSERT(pass);
pass &= remote_receiver_cb_interface.fifo_start_addr == fifo_start_addr;
ASSERT(pass);
// fifo_size
uint32_t fifo_size = get_arg_val<uint32_t>(arg_idx++);
pass &= config_ptr[config_idx++] == fifo_size;
ASSERT(pass);
// fifo_ptr
uint32_t fifo_ptr = get_arg_val<uint32_t>(arg_idx++);
pass &= config_ptr[config_idx++] == fifo_ptr;
ASSERT(pass);
// remote_noc_xy_addr
uint32_t remote_noc_xy_addr = config_ptr[config_idx++];
pass &= remote_receiver_cb_interface.sender_noc_x == get_arg_val<uint32_t>(arg_idx++);
ASSERT(pass);
pass &= remote_receiver_cb_interface.sender_noc_y == get_arg_val<uint32_t>(arg_idx++);
ASSERT(pass);

// aligned_pages_acked_addr
uint32_t aligned_pages_sent_addr = config_ptr[config_idx++];
pass &= remote_receiver_cb_interface.aligned_pages_acked_ptr == aligned_pages_sent_addr + L1_ALIGNMENT;
ASSERT(pass);
volatile tt_l1_ptr uint32_t* pages_sent_ptr =
reinterpret_cast<volatile tt_l1_ptr uint32_t*>(aligned_pages_sent_addr);
pass &= *pages_sent_ptr == get_arg_val<uint32_t>(arg_idx++);
ASSERT(pass);
volatile tt_l1_ptr uint32_t* pages_acked_ptr =
reinterpret_cast<volatile tt_l1_ptr uint32_t*>(remote_receiver_cb_interface.aligned_pages_acked_ptr);
pass &= *pages_sent_ptr == get_arg_val<uint32_t>(arg_idx++);
ASSERT(pass);
// fifo_rd_ptr
pass &= remote_receiver_cb_interface.fifo_rd_ptr == get_arg_val<uint32_t>(arg_idx++);
ASSERT(pass);
// fifo_limit_page_aligned
pass &= remote_receiver_cb_interface.fifo_limit_page_aligned == get_arg_val<uint32_t>(arg_idx++);
ASSERT(pass);
// fifo_page_size
pass &= remote_receiver_cb_interface.fifo_page_size == get_arg_val<uint32_t>(arg_idx++);
ASSERT(pass);

// Hang if watcher not enabled
while (!pass);
#endif
}
#if defined(COMPILE_FOR_TRISC)
} // namespace NAMESPACE
#endif
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <cstdint>
#include "debug/assert.h"
#include "debug/ring_buffer.h"

#if defined(COMPILE_FOR_TRISC)
#include "compute_kernel_api/common.h"

namespace NAMESPACE {
void MAIN {
#else
#include "dataflow_api.h"

void kernel_main() {
#endif
#if !defined(UCK_CHLKC_MATH)
constexpr uint32_t remote_cb_id = get_compile_time_arg_val(0);

auto& remote_sender_cb_interface = get_remote_sender_cb_interface(remote_cb_id);

uint32_t arg_idx = 0;
uint32_t config_idx = 0;
bool pass = true;
// config_addr
uint32_t config_addr = get_arg_val<uint32_t>(arg_idx++);
pass &= remote_sender_cb_interface.config_ptr == config_addr;
ASSERT(pass);
volatile tt_l1_ptr uint32_t* config_ptr = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(config_addr);
// is_sender
bool is_sender = get_arg_val<uint32_t>(arg_idx++);
pass &= config_ptr[config_idx++] == is_sender;
ASSERT(pass);
// num_receivers
uint32_t num_receivers = get_arg_val<uint32_t>(arg_idx++);
pass &= config_ptr[config_idx++] == num_receivers;
ASSERT(pass);
pass &= remote_sender_cb_interface.num_receivers == num_receivers;
ASSERT(pass);
// fifo_start_addr
uint32_t fifo_start_addr = get_arg_val<uint32_t>(arg_idx++);
pass &= config_ptr[config_idx++] == fifo_start_addr;
ASSERT(pass);
pass &= remote_sender_cb_interface.fifo_start_addr == fifo_start_addr;
ASSERT(pass);
// fifo_size
uint32_t fifo_size = get_arg_val<uint32_t>(arg_idx++);
pass &= config_ptr[config_idx++] == fifo_size;
ASSERT(pass);
// fifo_ptr
uint32_t fifo_ptr = get_arg_val<uint32_t>(arg_idx++);
pass &= config_ptr[config_idx++] == fifo_ptr;
ASSERT(pass);
// remote_noc_xy_addr
uint32_t remote_noc_xy_addr = config_ptr[config_idx++];
pass &= remote_sender_cb_interface.receiver_noc_xy_ptr == remote_noc_xy_addr;
ASSERT(pass);
volatile tt_l1_ptr uint32_t* remote_noc_xy_ptr = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(remote_noc_xy_addr);
for (uint32_t i = 0; i < num_receivers * 2; ++i) {
pass &= remote_noc_xy_ptr[i] == get_arg_val<uint32_t>(arg_idx++);
ASSERT(pass);
}
// aligned_pages_sent_addr
uint32_t aligned_pages_sent_addr = config_ptr[config_idx++];
pass &= remote_sender_cb_interface.aligned_pages_sent_ptr == aligned_pages_sent_addr;
ASSERT(pass);
volatile tt_l1_ptr uint32_t* pages_sent_ptr =
reinterpret_cast<volatile tt_l1_ptr uint32_t*>(remote_sender_cb_interface.aligned_pages_sent_ptr);
volatile tt_l1_ptr uint32_t* pages_acked_ptr = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(
remote_sender_cb_interface.aligned_pages_sent_ptr + L1_ALIGNMENT);
for (uint32_t i = 0; i < num_receivers; ++i) {
pass &= *pages_sent_ptr == get_arg_val<uint32_t>(arg_idx);
ASSERT(pass);
pass &= *pages_acked_ptr == get_arg_val<uint32_t>(arg_idx + 1);
ASSERT(pass);
pages_sent_ptr += 2 * L1_ALIGNMENT / sizeof(uint32_t);
pages_acked_ptr += 2 * L1_ALIGNMENT / sizeof(uint32_t);
}
arg_idx += 2;
// fifo_wr_ptr
pass &= remote_sender_cb_interface.fifo_wr_ptr == get_arg_val<uint32_t>(arg_idx++);
ASSERT(pass);
// fifo_limit_page_aligned
pass &= remote_sender_cb_interface.fifo_limit_page_aligned == get_arg_val<uint32_t>(arg_idx++);
ASSERT(pass);
// fifo_page_size
pass &= remote_sender_cb_interface.fifo_page_size == get_arg_val<uint32_t>(arg_idx++);
ASSERT(pass);

// Hang if watcher not enabled
while (!pass);
#endif
}
#if defined(COMPILE_FOR_TRISC)
} // namespace NAMESPACE
#endif
7 changes: 2 additions & 5 deletions tt_metal/hw/inc/dataflow_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,10 +133,7 @@ FORCE_INLINE uint32_t get_bank_offset(uint32_t bank_index) {
* | arg_idx | Unique Runtime argument index | uint32_t | 0 to 255 | True |
*/
// clang-format on
static FORCE_INLINE uint32_t get_arg_addr(int arg_idx) {
return (uint32_t)&rta_l1_base[arg_idx];
;
}
static FORCE_INLINE uint32_t get_arg_addr(int arg_idx) { return (uint32_t)&rta_l1_base[arg_idx]; }
// clang-format off
/**
* Returns the address in L1 for a given runtime argument index for common (all cores) runtime arguments set via
Expand Down Expand Up @@ -188,7 +185,7 @@ template <typename T>
FORCE_INLINE T get_common_arg_val(int arg_idx) {
// only 4B args are supported (eg int32, uint32)
static_assert("Error: only 4B args are supported" && sizeof(T) == 4);
return *((volatile tt_l1_ptr T*)(get_common_arg_addr(arg_idx)));
return *((tt_l1_ptr T*)(get_common_arg_addr(arg_idx)));
}

// clang-format off
Expand Down
Loading

0 comments on commit ed8472f

Please sign in to comment.