diff --git a/tt_metal/api/tt-metalium/program_impl.hpp b/tt_metal/api/tt-metalium/program_impl.hpp index 7f6af7e4942..86b43108d2c 100644 --- a/tt_metal/api/tt-metalium/program_impl.hpp +++ b/tt_metal/api/tt-metalium/program_impl.hpp @@ -65,8 +65,7 @@ namespace detail{ void ValidateCircularBufferRegion(const Program &program, const IDevice* device); KernelHandle AddKernel (Program &program, const std::shared_ptr& kernel, const HalProgrammableCoreType core_type); std::shared_ptr GetKernel(const Program &program, KernelHandle kernel_id); - std::shared_ptr GetCircularBuffer(const Program &program, CBHandle id); - void AddConfigBuffer(Program &program, const std::shared_ptr& config_buffer); + std::shared_ptr GetCircularBuffer(const Program& program, CBHandle id); class Internal_; } @@ -228,7 +227,6 @@ class Program { std::unordered_map &get_cached_program_command_sequences() noexcept; bool kernel_binary_always_stored_in_ringbuffer(); - friend void detail::AddConfigBuffer(Program &program, const std::shared_ptr& config_buffer); friend void program_dispatch::assemble_device_commands( ProgramCommandSequence& program_command_sequence, Program& program, IDevice* device, SubDeviceId sub_device_id); template friend void program_dispatch::finalize_program_offsets(T&, IDevice*); diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index d91b2835e69..55fd759a020 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -227,8 +227,6 @@ class Program_ { std::vector>> kernel_groups_; std::vector> core_to_kernel_group_index_table_; - std::vector> config_buffers_; - std::vector program_configs_; // Counts how much space is needed for each core + each launch buffer msg queue. std::vector program_config_sizes_; @@ -252,9 +250,6 @@ class Program_ { void add_semaphore(const CoreRangeSet & crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type); - friend void AddConfigBuffer(Program &program, const std::shared_ptr& config_buffer); - void add_config_buffer(const std::shared_ptr& config_buffer); - // Ensures that statically allocated circular buffers do not grow into L1 buffer space void validate_circular_buffer_region(const IDevice* device); @@ -297,10 +292,6 @@ void ValidateCircularBufferRegion(const Program &program, const IDevice* device) program.pimpl_->validate_circular_buffer_region(device); } -void AddConfigBuffer(Program &program, const std::shared_ptr& config_buffer) { - program.pimpl_->add_config_buffer(std::move(config_buffer)); -} - void EnablePersistentKernelCache() { enable_persistent_kernel_cache = true; } void DisablePersistentKernelCache() { enable_persistent_kernel_cache = false; } @@ -894,8 +885,6 @@ void Program::add_semaphore(const CoreRangeSet &crs, uint32_t semaphore_id, uint pimpl_->add_semaphore(crs, semaphore_id, init_value, core_type); } -void detail::Program_::add_config_buffer(const std::shared_ptr& config_buffer) { config_buffers_.emplace_back(config_buffer); } - std::vector> detail::Program_::logical_cores() const { std::vector> cores_in_program; std::vector> unique_cores; diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp index c5f92b8b4b4..0288f5e027b 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp @@ -439,6 +439,8 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( uint32_t out_subblock_h_ntiles = block_config.out_subblock_h_ntiles; uint32_t out_subblock_w_ntiles = block_config.out_subblock_w_ntiles; + auto conv_reader_indices_buffer = conv_reader_indices.value().device_buffer(); + // out_subblock_h_ntiles = 8; tt::DataFormat act_df = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); @@ -1239,7 +1241,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( CircularBufferConfig cb_for_reader_indices_config = CircularBufferConfig(out_block_h_datums * 2, {{cb_for_reader_indices, tt::DataFormat::Float16_b}}) .set_page_size(cb_for_reader_indices, out_block_h_datums * 2); - cb_for_reader_indices_config.set_globally_allocated_address(*conv_reader_indices.value().buffer()); + cb_for_reader_indices_config.set_globally_allocated_address(*conv_reader_indices_buffer); auto cb_for_reader_indices_id = tt_metal::CreateCircularBuffer(program, all_cores, cb_for_reader_indices_config); @@ -1714,6 +1716,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( auto mcast_sender_cores_vec = grid_to_cores(mcast_sender_cores.start_coord, mcast_sender_cores.end_coord, true); auto mcast_receiver_cores_vec = corerange_to_cores(mcast_receiver_cores, std::nullopt, true); + // Capture conv_reader_indices_buffer to cache this with the program auto override_runtime_arguments_callback = [reader_kernel_id = reader_id, mcast_sender_cores = mcast_sender_cores_vec, @@ -1725,7 +1728,8 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( total_active_num_cores = total_active_num_cores, num_cores_x = num_cores_x, num_cores_y = num_cores_y, - has_bias = has_bias]( + has_bias = has_bias, + conv_reader_indices_buffer = conv_reader_indices_buffer]( const void* operation, Program& program, const std::vector& input_tensors, @@ -1836,8 +1840,6 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_new( conv_reader_indices_tensor = ttnn::operations::sliding_window::move_config_tensor_to_device( conv_reader_indices_tensor, parallel_config, is_block_sharded, a.device()); - // add config tensor to program - tt::tt_metal::detail::AddConfigBuffer(program, conv_reader_indices_tensor.device_buffer()); if (parallel_config.shard_scheme == TensorMemoryLayout::WIDTH_SHARDED) { return multi_core_optimized_conv_width_sharded_v2_impl( program, diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp index ea2214ee506..fc73efc2b82 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp @@ -739,10 +739,12 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh act_block_num_tiles_split, act_tile_size); + auto conv_reader_indices_buffer = conv_reader_indices.value().device_buffer(); + CircularBufferConfig cb_for_reader_indices_config = CircularBufferConfig(out_block_h_datums * 2, {{cb_for_reader_indices, tt::DataFormat::Float16_b}}) .set_page_size(cb_for_reader_indices, out_block_h_datums * 2); - cb_for_reader_indices_config.set_globally_allocated_address(*conv_reader_indices.value().buffer()); + cb_for_reader_indices_config.set_globally_allocated_address(*conv_reader_indices_buffer); auto cb_for_reader_indices_id = tt_metal::CreateCircularBuffer(program, all_cores, cb_for_reader_indices_config); if (has_bias) { @@ -874,11 +876,13 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh (uint32_t)(core_index < output_num_cores)}); } - auto empty_callback = [](const void* operation, - Program& program, - const std::vector& input_tensors, - const std::vector>& optional_input_tensors, - const std::vector& output_tensors) {}; + // Capture conv_reader_indices_buffer to cache this with the program + auto empty_callback = [conv_reader_indices_buffer]( + const void* operation, + Program& program, + const std::vector& input_tensors, + const std::vector>& optional_input_tensors, + const std::vector& output_tensors) {}; return {.program = std::move(program), .override_runtime_arguments_callback = empty_callback}; } diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_op.cpp index cfdf0f2efa1..a2b3f024b97 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_op.cpp @@ -99,7 +99,8 @@ operation::ProgramWithCallbacks UntilizeWithHaloV2::create_program( remote_config, remote_read_, transpose_mcast_, - output_tensor)}; + output_tensor, + /*capture_buffers=*/false)}; } } // namespace ttnn::operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp index 476ffd05bba..9362fa7daee 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp @@ -30,7 +30,8 @@ operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2( const Tensor& remote_config, const bool remote_read, const bool transpose_mcast, - Tensor& output_tensor) { + Tensor& output_tensor, + const bool capture_buffers) { IDevice* device = input_tensor.device(); Buffer* src_buffer = input_tensor.buffer(); Buffer* dst_buffer = output_tensor.buffer(); @@ -143,7 +144,7 @@ operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2( TT_ASSERT(local_config.get_dtype() == DataType::UINT16); TT_ASSERT(remote_config.get_dtype() == DataType::UINT16); - Buffer* padding_config_buffer = padding_config.buffer(); + auto padding_config_buffer = padding_config.device_buffer(); const uint32_t num_cores = all_cores.num_cores(); auto padding_config_cb_config = CircularBufferConfig(padding_config_buffer->size() / num_cores, {{padding_config_cb_id, kernel_config_df}}) @@ -151,14 +152,14 @@ operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2( .set_globally_allocated_address(*padding_config_buffer); CBHandle padding_config_cb = CreateCircularBuffer(program, all_cores, padding_config_cb_config); - Buffer* local_config_buffer = local_config.buffer(); + auto local_config_buffer = local_config.device_buffer(); auto local_config_cb_config = CircularBufferConfig(local_config_buffer->size() / num_cores, {{local_config_cb_id, kernel_config_df}}) .set_page_size(local_config_cb_id, local_config_buffer->page_size()) .set_globally_allocated_address(*local_config_buffer); CBHandle local_config_cb = CreateCircularBuffer(program, all_cores, local_config_cb_config); - Buffer* remote_config_buffer = remote_config.buffer(); + auto remote_config_buffer = remote_config.device_buffer(); auto remote_config_cb_config = CircularBufferConfig(remote_config_buffer->size() / num_cores, {{remote_config_cb_id, kernel_config_df}}) .set_page_size(remote_config_cb_id, remote_config_buffer->page_size()) @@ -212,7 +213,20 @@ operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2( DataMovementConfig{ .processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = reader_ct_args}); - auto override_runtime_arguments_callback = [src_cb, out_cb, padding_config_cb, local_config_cb, remote_config_cb]( + if (!capture_buffers) { + padding_config_buffer = nullptr; + local_config_buffer = nullptr; + remote_config_buffer = nullptr; + } + // Capture padding_config_buffer, local_config_buffer, remote_config_buffer to cache this with the program + auto override_runtime_arguments_callback = [src_cb, + out_cb, + padding_config_cb, + local_config_cb, + remote_config_cb, + padding_config_buffer, + local_config_buffer, + remote_config_buffer]( const void* operation, Program& program, const std::vector& input_tensors, diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.hpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.hpp index dd606912a6a..5a2d2e6d76f 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.hpp @@ -19,6 +19,8 @@ tt::tt_metal::operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2( const Tensor& remote_config, const bool remote_read, const bool transpose_mcast, - Tensor& output_tensor); + Tensor& output_tensor, + const bool capture_buffers); // Used by halo op to cache internally created config buffers with the program + // Untilize with Halo V2 op takes them as inputs from the user, so doesn't capture } // namespace ttnn::operations::data_movement::detail diff --git a/ttnn/cpp/ttnn/operations/pool/generic/device/pool_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/pool/generic/device/pool_multi_core_program_factory.cpp index bad1f89094b..7ad87a5cbda 100644 --- a/ttnn/cpp/ttnn/operations/pool/generic/device/pool_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/pool/generic/device/pool_multi_core_program_factory.cpp @@ -40,7 +40,7 @@ Pool2D::MultiCore::cached_program_t pool2d_multi_core_sharded_with_halo_v2_impl_ // This should allocate a DRAM buffer on the device IDevice* device = input.device(); tt::tt_metal::Buffer* src_dram_buffer = input.buffer(); - tt::tt_metal::Buffer* reader_indices_buffer = reader_indices.buffer(); + auto reader_indices_buffer = reader_indices.device_buffer(); tt::tt_metal::Buffer* dst_dram_buffer = output.buffer(); const tt::tt_metal::LegacyShape input_shape = input.get_legacy_shape(); @@ -376,6 +376,7 @@ Pool2D::MultiCore::cached_program_t pool2d_multi_core_sharded_with_halo_v2_impl_ auto compute_kernel = CreateKernel(program, compute_kernel_fname, core_range, compute_config); + // Capture reader_indices_buffer to cache this with the program return { std::move(program), {.reader0_kernel = reader0_kernel, @@ -383,7 +384,8 @@ Pool2D::MultiCore::cached_program_t pool2d_multi_core_sharded_with_halo_v2_impl_ .raw_in_cb = raw_in_cb, .cb_out = cb_out, .ncores = ncores, - .ncores_w = ncores_w}}; + .ncores_w = ncores_w, + .reader_indices_buffer = reader_indices_buffer}}; } Pool2D::MultiCore::cached_program_t Pool2D::MultiCore::create( @@ -418,8 +420,6 @@ Pool2D::MultiCore::cached_program_t Pool2D::MultiCore::create( auto reader_indices_on_device = sliding_window::move_config_tensor_to_device(reader_indices, parallel_config, is_block_sharded, input.device()); - tt::tt_metal::detail::AddConfigBuffer(program, reader_indices_on_device.device_buffer()); - auto in_n = sliding_window_config.batch_size; auto in_h = sliding_window_config.input_hw.first; auto in_w = sliding_window_config.input_hw.second; diff --git a/ttnn/cpp/ttnn/operations/pool/generic/device/pool_op.hpp b/ttnn/cpp/ttnn/operations/pool/generic/device/pool_op.hpp index 7c83971a419..7077436c97c 100644 --- a/ttnn/cpp/ttnn/operations/pool/generic/device/pool_op.hpp +++ b/ttnn/cpp/ttnn/operations/pool/generic/device/pool_op.hpp @@ -46,6 +46,7 @@ struct Pool2D { CBHandle cb_out; uint32_t ncores; uint32_t ncores_w; + std::shared_ptr reader_indices_buffer; }; using cached_program_t = ttnn::device_operation::CachedProgram; diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_multicore.cpp b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_multicore.cpp index 83048f71d0a..dc61eca5bfc 100644 --- a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_multicore.cpp +++ b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_multicore.cpp @@ -242,10 +242,9 @@ operation::ProgramWithCallbacks upsample_multi_core( ShardSpec config_shard_spec(input.shard_spec().value().grid, shard_shape, config_tensor_shard_orientation); MemoryConfig memory_config{input.memory_config().memory_layout, BufferType::L1_SMALL, config_shard_spec}; auto config_tensor_device = config_tensor.to(device, memory_config); - tt::tt_metal::detail::AddConfigBuffer(program, config_tensor_device.device_buffer()); tt::DataFormat config_df = tt::DataFormat::RawUInt16; - Buffer* config_buffer = config_tensor_device.buffer(); + auto config_buffer = config_tensor_device.device_buffer(); auto config_buffer_page_size = config_buffer->page_size(); uint32_t config_cb_id = CBIndex::c_6; auto config_cb_config = CircularBufferConfig(config_buffer_page_size, {{config_cb_id, config_df}}) @@ -311,7 +310,8 @@ operation::ProgramWithCallbacks upsample_multi_core( TT_THROW("Unsupported memory layout"); } - auto override_runtime_args_callback = [writer_kernel, cb_src0, out_cb, config_cb]( + // Capture config_buffer to cache this with the program + auto override_runtime_args_callback = [writer_kernel, cb_src0, out_cb, config_cb, config_buffer]( const void* operation, Program& program, const std::vector& input_tensors, diff --git a/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.cpp b/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.cpp index fe81dd49573..ef7e86c2725 100644 --- a/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.cpp @@ -121,10 +121,6 @@ operation::ProgramWithCallbacks HaloDeviceOperation::create_program( Program program = CreateProgram(); - tt::tt_metal::detail::AddConfigBuffer(program, pad_config_device_tensor.device_buffer()); - tt::tt_metal::detail::AddConfigBuffer(program, local_config_device_tensor.device_buffer()); - tt::tt_metal::detail::AddConfigBuffer(program, remote_config_device_tensor.device_buffer()); - return {data_movement::detail::untilize_with_halo_multi_core_v2( program, input_tensor, @@ -136,7 +132,8 @@ operation::ProgramWithCallbacks HaloDeviceOperation::create_program( remote_config_device_tensor, remote_read_, transpose_mcast_, - output_tensor)}; + output_tensor, + /*capture_buffers=*/true)}; } Tensor halo_op(