Skip to content

Commit

Permalink
#16339: seperate dispatch constants (#17340)
Browse files Browse the repository at this point in the history
### Ticket
#16339

### Problem description
`dispatch_constants` is actually a memory map.

### What's changed
- Rename `dispatch_constants` to `DispatchMemMap`
- Removed constants from `DispatchMemMap (ex dispatch_constants)`. They
are already in DispatchConstants and it doesn't need the singleton.
- DispatchMemMap address sizes like Dispatch Buffer Size, Cmddat size,
etc. are configured by DispatchSettings. Eventually DispatchSettings +
FDTopology will be loaded from a file.
  • Loading branch information
nhuang-tt authored Jan 31, 2025
1 parent 56be6a5 commit 9d69fb1
Show file tree
Hide file tree
Showing 36 changed files with 503 additions and 532 deletions.
4 changes: 2 additions & 2 deletions tests/tt_metal/tt_metal/device/test_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,9 +244,9 @@ TEST_F(DeviceFixture, TensixTestL1ToPCIeAt16BAlignedAddress) {

uint32_t base_l1_src_address =
device->allocator()->get_base_allocator_addr(HalMemType::L1) + hal.get_alignment(HalMemType::L1);
// This is a slow dispatch test dispatch core type is needed to query dispatch_constants
// This is a slow dispatch test dispatch core type is needed to query DispatchMemMap
uint32_t base_pcie_dst_address =
dispatch_constants::get(CoreType::WORKER).get_host_command_queue_addr(CommandQueueHostAddrType::UNRESERVED) +
DispatchMemMap::get(CoreType::WORKER).get_host_command_queue_addr(CommandQueueHostAddrType::UNRESERVED) +
hal.get_alignment(HalMemType::L1);

uint32_t size_bytes = 2048 * 128;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <tt-metalium/tt_metal.hpp>
#include <tt-metalium/host_api.hpp>
#include <tt-metalium/device.hpp>
#include <tt-metalium/dispatch_settings.hpp>

using std::vector;
using namespace tt::tt_metal;
Expand Down Expand Up @@ -194,7 +195,7 @@ void test_EnqueueWriteBuffer_and_EnqueueReadBuffer(IDevice* device, CommandQueue
uint32_t cq_size = device->sysmem_manager().get_cq_size();
CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id());
uint32_t cq_start =
dispatch_constants::get(dispatch_core_type).get_host_command_queue_addr(CommandQueueHostAddrType::UNRESERVED);
DispatchMemMap::get(dispatch_core_type).get_host_command_queue_addr(CommandQueueHostAddrType::UNRESERVED);

std::vector<uint32_t> cq_zeros((cq_size - cq_start) / sizeof(uint32_t), 0);

Expand Down Expand Up @@ -505,7 +506,7 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestPageLargerThanAndUnalignedToTran
for (IDevice* device : devices_) {
TestBufferConfig config = {
.num_pages = num_round_robins * (device->allocator()->get_num_banks(BufferType::DRAM)),
.page_size = dispatch_constants::TRANSFER_PAGE_SIZE + 32,
.page_size = DispatchSettings::TRANSFER_PAGE_SIZE + 32,
.buftype = BufferType::DRAM};
local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(device, device->command_queue(), config);
}
Expand All @@ -515,8 +516,7 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestPageLargerThanMaxPrefetchCommand
constexpr uint32_t num_round_robins = 1;
for (IDevice* device : devices_) {
CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id());
const uint32_t max_prefetch_command_size =
dispatch_constants::get(dispatch_core_type).max_prefetch_command_size();
const uint32_t max_prefetch_command_size = DispatchMemMap::get(dispatch_core_type).max_prefetch_command_size();
TestBufferConfig config = {
.num_pages = 1, .page_size = max_prefetch_command_size + 2048, .buftype = BufferType::DRAM};
local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(device, device->command_queue(), config);
Expand All @@ -527,8 +527,7 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestUnalignedPageLargerThanMaxPrefet
constexpr uint32_t num_round_robins = 1;
for (IDevice* device : devices_) {
CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id());
const uint32_t max_prefetch_command_size =
dispatch_constants::get(dispatch_core_type).max_prefetch_command_size();
const uint32_t max_prefetch_command_size = DispatchMemMap::get(dispatch_core_type).max_prefetch_command_size();
uint32_t unaligned_page_size = max_prefetch_command_size + 4;
TestBufferConfig config = {.num_pages = 1, .page_size = unaligned_page_size, .buftype = BufferType::DRAM};
local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(device, device->command_queue(), config);
Expand Down Expand Up @@ -569,8 +568,8 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestWrapHostHugepageOnEnqueueReadBuf
uint32_t page_size = 2048;
uint32_t command_issue_region_size = device->sysmem_manager().get_issue_queue_size(0);
CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id());
uint32_t cq_start = dispatch_constants::get(dispatch_core_type)
.get_host_command_queue_addr(CommandQueueHostAddrType::UNRESERVED);
uint32_t cq_start =
DispatchMemMap::get(dispatch_core_type).get_host_command_queue_addr(CommandQueueHostAddrType::UNRESERVED);

uint32_t max_command_size = command_issue_region_size - cq_start;
uint32_t buffer = 14240;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ TEST_F(MultiCommandQueueSingleDeviceEventFixture, TestEventsEnqueueWaitForEventC
for (uint cq_id = 0; cq_id < cqs.size(); cq_id++) {
for (size_t i = 0; i < num_cmds_per_cq * num_events_per_cq; i++) {
uint32_t host_addr =
completion_queue_base[cq_id] + i * dispatch_constants::TRANSFER_PAGE_SIZE + sizeof(CQDispatchCmd);
completion_queue_base[cq_id] + i * DispatchSettings::TRANSFER_PAGE_SIZE + sizeof(CQDispatchCmd);
tt::Cluster::instance().read_sysmem(&event, 4, host_addr, mmio_device_id, channel);
log_debug(
tt::LogTest,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ using std::vector;
using namespace tt::tt_metal;

constexpr uint32_t completion_queue_event_offset = sizeof(CQDispatchCmd);
constexpr uint32_t completion_queue_page_size = dispatch_constants::TRANSFER_PAGE_SIZE;
constexpr uint32_t completion_queue_page_size = DispatchSettings::TRANSFER_PAGE_SIZE;

enum class DataMovementMode : uint8_t { WRITE = 0, READ = 1 };

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,12 @@
#include <stdexcept>
#include "command_queue_fixture.hpp"
#include <tt-metalium/logger.hpp>
#include <tt-metalium/dispatch_constants.hpp>
#include "gtest/gtest.h"
#include <tt-metalium/hal.hpp>
#include <tt-metalium/dispatch_settings.hpp>
#include "umd/device/tt_core_coordinates.h"

using namespace tt::tt_metal::dispatch;
using namespace tt::tt_metal;

// Loop through test_func for WORKER, ETH X 1, 2 CQs
void ForEachCoreTypeXHWCQs(const std::function<void(const CoreType& core_type, const uint32_t num_hw_cqs)>& test_func) {
Expand Down Expand Up @@ -54,7 +53,7 @@ TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsEq) {
TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsSetPrefetchDBuffer) {
static constexpr uint32_t hw_cqs = 2;
static constexpr uint32_t expected_buffer_bytes = 0xcafe;
static constexpr uint32_t expected_page_count = expected_buffer_bytes / (1 << DispatchConstants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE);
static constexpr uint32_t expected_page_count = expected_buffer_bytes / (1 << DispatchSettings::PREFETCH_D_BUFFER_LOG_PAGE_SIZE);
auto settings = DispatchSettings::worker_defaults(tt::Cluster::instance(), hw_cqs);
settings.prefetch_d_buffer_size(expected_buffer_bytes);
EXPECT_EQ(settings.prefetch_d_buffer_size_, expected_buffer_bytes);
Expand All @@ -64,7 +63,7 @@ TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsSetPrefetchDBuffer) {
TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsSetPrefetchQBuffer) {
static constexpr uint32_t hw_cqs = 2;
static constexpr uint32_t expected_buffer_entries = 0x1000;
static constexpr uint32_t expected_buffer_bytes = expected_buffer_entries * sizeof(DispatchConstants::prefetch_q_entry_type);
static constexpr uint32_t expected_buffer_bytes = expected_buffer_entries * sizeof(DispatchSettings::prefetch_q_entry_type);
auto settings = DispatchSettings::worker_defaults(tt::Cluster::instance(), hw_cqs);
settings.prefetch_q_entries(expected_buffer_entries);
EXPECT_EQ(settings.prefetch_q_entries_, expected_buffer_entries);
Expand All @@ -74,7 +73,7 @@ TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsSetPrefetchQBuffer) {
TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsSetDispatchBuffer) {
static constexpr uint32_t hw_cqs = 2;
static constexpr uint32_t expected_buffer_bytes = 0x2000;
static constexpr uint32_t expected_page_count = expected_buffer_bytes / (1 << DispatchConstants::DISPATCH_BUFFER_LOG_PAGE_SIZE);
static constexpr uint32_t expected_page_count = expected_buffer_bytes / (1 << DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE);
auto settings = DispatchSettings::worker_defaults(tt::Cluster::instance(), hw_cqs);
settings.dispatch_size(expected_buffer_bytes);
EXPECT_EQ(settings.dispatch_size_, expected_buffer_bytes);
Expand All @@ -84,7 +83,7 @@ TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsSetDispatchBuffer) {
TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsSetDispatchSBuffer) {
static constexpr uint32_t hw_cqs = 2;
static constexpr uint32_t expected_buffer_bytes = 0x2000;
static constexpr uint32_t expected_page_count = expected_buffer_bytes / (1 << DispatchConstants::DISPATCH_S_BUFFER_LOG_PAGE_SIZE);
static constexpr uint32_t expected_page_count = expected_buffer_bytes / (1 << DispatchSettings::DISPATCH_S_BUFFER_LOG_PAGE_SIZE);
auto settings = DispatchSettings::worker_defaults(tt::Cluster::instance(), hw_cqs);
settings.dispatch_s_buffer_size(expected_buffer_bytes);
EXPECT_EQ(settings.dispatch_s_buffer_size_, expected_buffer_bytes);
Expand All @@ -94,7 +93,7 @@ TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsSetDispatchSBuffer) {
TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsSetTunnelerBuffer) {
static constexpr uint32_t hw_cqs = 2;
static constexpr uint32_t expected_buffer_bytes = 0x2000;
static constexpr uint32_t expected_page_count = expected_buffer_bytes / (1 << DispatchConstants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE);
static constexpr uint32_t expected_page_count = expected_buffer_bytes / (1 << DispatchSettings::PREFETCH_D_BUFFER_LOG_PAGE_SIZE);
auto settings = DispatchSettings::worker_defaults(tt::Cluster::instance(), hw_cqs);
settings.tunneling_buffer_size(expected_buffer_bytes);
EXPECT_EQ(settings.tunneling_buffer_size_, expected_buffer_bytes);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ int main(int argc, char** argv) {
uint32_t host_write_ptr = 0;

CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device_id);
uint32_t prefetch_q_base = dispatch_constants::get(dispatch_core_type)
uint32_t prefetch_q_base = DispatchMemMap::get(dispatch_core_type)
.get_device_command_queue_addr(CommandQueueDeviceAddrType::UNRESERVED);

uint32_t reg_addr = prefetch_q_base;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -396,8 +396,8 @@ int main(int argc, char** argv) {
vec.resize(page_size_g / sizeof(uint32_t));

CoreType core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id());
uint32_t dispatch_l1_unreserved_base = dispatch_constants::get(core_type).get_device_command_queue_addr(
CommandQueueDeviceAddrType::UNRESERVED);
uint32_t dispatch_l1_unreserved_base =
DispatchMemMap::get(core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::UNRESERVED);
for (int i = 0; i < warmup_iterations_g; i++) {
if (source_mem_g == 4) {
tt::Cluster::instance().read_core(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -434,8 +434,7 @@ int main(int argc, char** argv) {

// Want different buffers on each core, instead use big buffer and self-manage it
uint32_t dispatch_l1_unreserved_base =
dispatch_constants::get(CoreType::WORKER)
.get_device_command_queue_addr(CommandQueueDeviceAddrType::UNRESERVED);
DispatchMemMap::get(CoreType::WORKER).get_device_command_queue_addr(CommandQueueDeviceAddrType::UNRESERVED);
uint32_t l1_buf_base = tt::align(dispatch_l1_unreserved_base, dispatch_buffer_page_size_g);
TT_ASSERT((l1_buf_base & (dispatch_buffer_page_size_g - 1)) == 0);

Expand Down Expand Up @@ -524,13 +523,13 @@ int main(int argc, char** argv) {
const uint32_t prefetch_sync_sem = spoof_prefetch_core_sem_1_id;

const uint32_t host_completion_queue_wr_ptr =
dispatch_constants::get(CoreType::WORKER)
DispatchMemMap::get(CoreType::WORKER)
.get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR);
const uint32_t dev_completion_queue_wr_ptr =
dispatch_constants::get(CoreType::WORKER)
DispatchMemMap::get(CoreType::WORKER)
.get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR);
const uint32_t dev_completion_queue_rd_ptr =
dispatch_constants::get(CoreType::WORKER)
DispatchMemMap::get(CoreType::WORKER)
.get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD);

std::vector<uint32_t> dispatch_compile_args = {
Expand All @@ -556,8 +555,8 @@ int main(int argc, char** argv) {
0, // prefetch_downstream_buffer_pages
num_compute_cores, // max_write_packed_cores
0,
dispatch_constants::DISPATCH_MESSAGE_ENTRIES,
dispatch_constants::DISPATCH_GO_SIGNAL_NOC_DATA_ENTRIES,
DispatchSettings::DISPATCH_MESSAGE_ENTRIES,
DispatchSettings::DISPATCH_GO_SIGNAL_NOC_DATA_ENTRIES,
0,
0,
0,
Expand Down
Loading

0 comments on commit 9d69fb1

Please sign in to comment.