[Feature] Support heterogeneous core configuration
YWHyuk committed Aug 26, 2024
1 parent b64c31f commit 608ddf1
Showing 27 changed files with 170 additions and 143 deletions.
56 changes: 31 additions & 25 deletions src/Common.cc
@@ -53,34 +53,40 @@ SimulationConfig initialize_config(json config) {

/* Core configs */
parsed_config.num_cores = get_config_value<uint32_t>(config, "num_cores");
std::string core_type = get_config_value<std::string>(config, "core_type");
if (core_type_map.contains(core_type)) {
parsed_config.core_type = core_type_map.at(core_type);
} else {
throw std::runtime_error(fmt::format("Not implemented core type {} ", core_type));
}
parsed_config.core_config = new struct CoreConfig[parsed_config.num_cores];
parsed_config.core_freq = get_config_value<uint32_t>(config, "core_freq");
parsed_config.core_width = get_config_value<uint32_t>(config, "core_width");
parsed_config.core_height = get_config_value<uint32_t>(config, "core_height");
parsed_config.core_print_interval = get_config_value<uint32_t>(config, "core_print_interval");

/* Vector configs */
parsed_config.vector_process_bit = get_config_value<uint32_t>(config, "vector_process_bit");
parsed_config.add_latency = get_config_value<uint32_t>(config, "add_latency");
parsed_config.mul_latency = get_config_value<uint32_t>(config, "mul_latency");
parsed_config.exp_latency = get_config_value<uint32_t>(config, "exp_latency");
parsed_config.gelu_latency = get_config_value<uint32_t>(config, "gelu_latency");
parsed_config.add_tree_latency = get_config_value<uint32_t>(config, "add_tree_latency");
parsed_config.scalar_sqrt_latency = get_config_value<uint32_t>(config, "scalar_sqrt_latency");
parsed_config.scalar_add_latency = get_config_value<uint32_t>(config, "scalar_add_latency");
parsed_config.scalar_mul_latency = get_config_value<uint32_t>(config, "scalar_mul_latency");
parsed_config.mac_latency = get_config_value<uint32_t>(config, "mac_latency");
parsed_config.div_latency = get_config_value<uint32_t>(config, "div_latency");

/* SRAM configs */
parsed_config.sram_width = get_config_value<uint32_t>(config, "sram_width");
parsed_config.spad_size = get_config_value<uint32_t>(config, "spad_size");
parsed_config.accum_spad_size = get_config_value<uint32_t>(config, "accum_spad_size");
for (int i=0; i<parsed_config.num_cores; i++) {
std::string core_id = "core_" + std::to_string(i);
auto core_config = config["core_config"][core_id];
std::string core_type = core_config["core_type"];
if (core_type_map.contains(core_type)) {
parsed_config.core_config[i].core_type = core_type_map.at(core_type);
} else {
throw std::runtime_error(fmt::format("Not implemented core type {} ", core_type));
}
parsed_config.core_config[i].core_width = core_config["core_width"];
parsed_config.core_config[i].core_height = core_config["core_height"];

/* Vector configs */
parsed_config.core_config[i].vector_process_bit = core_config["vector_process_bit"];
parsed_config.core_config[i].add_latency = core_config["add_latency"];
parsed_config.core_config[i].mul_latency = core_config["mul_latency"];
parsed_config.core_config[i].exp_latency = core_config["exp_latency"];
parsed_config.core_config[i].gelu_latency = core_config["gelu_latency"];
parsed_config.core_config[i].add_tree_latency = core_config["add_tree_latency"];
parsed_config.core_config[i].scalar_sqrt_latency = core_config["scalar_sqrt_latency"];
parsed_config.core_config[i].scalar_add_latency = core_config["scalar_add_latency"];
parsed_config.core_config[i].scalar_mul_latency = core_config["scalar_mul_latency"];
parsed_config.core_config[i].mac_latency = core_config["mac_latency"];
parsed_config.core_config[i].div_latency = core_config["div_latency"];

/* SRAM configs */
parsed_config.core_config[i].sram_width = core_config["sram_width"];
parsed_config.core_config[i].spad_size = core_config["spad_size"];
parsed_config.core_config[i].accum_spad_size = core_config["accum_spad_size"];
}

/* DRAM config */
std::string dram_type = get_config_value<std::string>(config, "dram_type");
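Note for reviewers: with this change, the flat core fields in the config JSON move under a per-core "core_config" object. Below is a minimal sketch of the layout the parsing loop above now expects — the key names mirror the code, while every value, including the "systolic_ws" type string, is an assumed placeholder rather than a shipped default:

    // Sketch only: key names mirror the parsing loop above; all values,
    // including the "systolic_ws" type string, are assumed placeholders.
    #include <nlohmann/json.hpp>
    #include <iostream>

    int main() {
      auto config = nlohmann::json::parse(R"({
        "num_cores": 1,
        "core_freq": 1000,
        "core_print_interval": 10000,
        "core_config": {
          "core_0": {
            "core_type": "systolic_ws",
            "core_width": 128, "core_height": 128,
            "vector_process_bit": 65536,
            "add_latency": 1, "mul_latency": 1, "exp_latency": 1,
            "gelu_latency": 1, "add_tree_latency": 1,
            "scalar_sqrt_latency": 1, "scalar_add_latency": 1,
            "scalar_mul_latency": 1, "mac_latency": 1, "div_latency": 1,
            "sram_width": 32, "spad_size": 64, "accum_spad_size": 16
          }
        }
      })");
      // Cores are looked up as "core_" + std::to_string(i), so the file
      // needs one block per core even when all cores are identical.
      std::cout << config["core_config"]["core_0"]["core_width"] << "\n";
    }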
14 changes: 7 additions & 7 deletions src/Core.cc
@@ -5,9 +5,9 @@
#include "helper/HelperFunctions.h"

std::unique_ptr<Core> Core::create(uint32_t id, SimulationConfig config) {
if (config.core_type == CoreType::SYSTOLIC_WS) {
if (config.core_config[id].core_type == CoreType::SYSTOLIC_WS) {
return std::make_unique<SystolicWS>(id, config);
} else if (config.core_type == CoreType::SYSTOLIC_OS) {
} else if (config.core_config[id].core_type == CoreType::SYSTOLIC_OS) {
return std::make_unique<SystolicOS>(id, config);
} else {
spdlog::error("[Configuration] Invalid core type...!");
@@ -23,8 +23,8 @@ Core::Core(uint32_t id, SimulationConfig config)
_stat_memory_idle_cycle(0),
_stat_vec_compute_cycle(0),
_stat_matmul_cycle(0),
_spad(Sram(config, _core_cycle, false)),
_acc_spad(Sram(config, _core_cycle, true)) {
_spad(Sram(config, _core_cycle, false, id)),
_acc_spad(Sram(config, _core_cycle, true, id)) {
_waiting_write_reqs = 0;
_running_layer = -1;
}
@@ -282,7 +282,7 @@ void Core::finish_compute_pipeline(){
inst->my_tile->inst_finished = true;
}
double compute_size = inst->tile_k * inst->tile_m * inst->tile_n
/ (_config.core_height * _config.core_width);
/ (_config.core_config[_id].core_height * _config.core_config[_id].core_width);
spdlog::trace("Compute size {} tile m {} tile k {} tile n {}", inst->compute_size, inst->tile_m, inst->tile_k, inst->tile_n);
spdlog::trace("Compute size {} , compute time {}", compute_size, inst->finish_cycle - inst->start_cycle);
_stat_matmul_cycle += compute_size;
@@ -401,7 +401,7 @@ void Core::handle_st_inst_queue() {
}

cycle_type Core::calculate_add_tree_iterations(uint32_t vector_size) {
uint32_t calculation_unit = _config.vector_process_bit >> 3;
uint32_t calculation_unit = _config.core_config[_id].vector_process_bit >> 3;
if (vector_size <= calculation_unit) {
return 1;
}
@@ -414,7 +414,7 @@ cycle_type Core::calculate_add_tree_iterations(uint32_t vector_size) {
}

cycle_type Core::calculate_vector_op_iterations(uint32_t vector_size) {
uint32_t calculation_unit = _config.vector_process_bit >> 3;
uint32_t calculation_unit = _config.core_config[_id].vector_process_bit >> 3;
uint32_t ret = vector_size / calculation_unit;
if (vector_size % calculation_unit != 0) {
ret++;
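Note for reviewers: both iteration helpers reduce to a ceiling division by the per-core vector width in bytes; the only behavioral change here is that the divisor now comes from core_config[_id]. A standalone sketch of the same arithmetic (the 256-bit and 512-bit widths are assumed examples, not shipped defaults):

    #include <cassert>
    #include <cstdint>

    // Same arithmetic as Core::calculate_vector_op_iterations() above:
    // one iteration per (vector_process_bit / 8) bytes, rounded up.
    static uint32_t vector_op_iterations(uint32_t vector_size,
                                         uint32_t vector_process_bit) {
      uint32_t calculation_unit = vector_process_bit >> 3;  // bits -> bytes
      uint32_t ret = vector_size / calculation_unit;
      if (vector_size % calculation_unit != 0) ret++;
      return ret;
    }

    int main() {
      assert(vector_op_iterations(100, 256) == 4);  // ceil(100 / 32)
      assert(vector_op_iterations(100, 512) == 2);  // ceil(100 / 64)
      // With heterogeneous cores, two cores running the same vector op
      // can now report different iteration counts.
    }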
17 changes: 11 additions & 6 deletions src/Mapping.cc
@@ -20,9 +20,7 @@ MappingTable::MappingTable () {}
MappingTable::MappingTable (SimulationConfig config) {
_mapping_table = _MappingTable();
_config = config;
_dim = _config.core_height;
_max_spad_rows = (_config.spad_size KB) / (_dim * _config.precision * 2);
_max_acc_rows = (_config.accum_spad_size KB) / (_dim * 4 * 2); // Accumulator is 4 Byte

}

MappingTable MappingTable::parse_mapping_file(
@@ -54,8 +52,11 @@ MappingTable MappingTable::parse_mapping_file(

void MappingTable::gemm_mapping(Mapping::LoopCounts &key) {
uint32_t dim_I, dim_J, dim_K;
uint32_t _dim = _config.core_config[key.target_core].core_height;
uint32_t _max_spad_rows = (_config.core_config[key.target_core].spad_size KB) / (_dim * _config.precision * 2);
uint32_t _max_acc_rows = (_config.core_config[key.target_core].accum_spad_size KB) / (_dim * 4 * 2);

assert(_config.core_height==_config.core_width);
assert(_config.core_config[key.target_core].core_height==_config.core_config[key.target_core].core_width);
dim_I = key.N;
dim_J = key.M;
dim_K = key.C;
@@ -116,7 +117,7 @@ void MappingTable::gemm_mapping(Mapping::LoopCounts &key) {
mapping.tile_out_loop = {tile_I, tile_K, tile_J, 1, 1, 1, 1};
mapping.tile_in_loop = {inner_I, inner_K, inner_J, 1, 1, 1, 1};
_mapping_table[key] = mapping;
spdlog::info("[GEMM] spad_size: {} accum_size: {}", _config.spad_size * 1024, _config.accum_spad_size * 1024);
spdlog::info("[GEMM] spad_size: {} accum_size: {}", _config.core_config[key.target_core].spad_size * 1024, _config.core_config[key.target_core].accum_spad_size * 1024);
spdlog::info("[GEMM] required_sram_size: {} required_accum_size: {}", (inner_I+inner_J)*inner_K*_config.precision, (inner_I*inner_J)*_config.precision);
spdlog::info("[GEMM] Used gemmini gemm mapping: Total N:{} C:{} M:{}, " \
"Outer N:{} C:{} M:{}, " \
@@ -396,6 +397,10 @@ Mapping MappingTable::calc_conv_mapping(Mapping::LoopCounts &key) {
int stride, input_dilation, kernel_dilation, padding, kernel_dim;
bool trans_input_3120, trans_weight_0132;
int pool_size, pool_stride, pool_padding;
uint32_t _dim = _config.core_config[key.target_core].core_height;
uint32_t _max_spad_rows = (_config.core_config[key.target_core].spad_size KB) / (_dim * _config.precision * 2);
uint32_t _max_acc_rows = (_config.core_config[key.target_core].accum_spad_size KB) / (_dim * 4 * 2);

batch_size = 1;
out_channels = key.M;
in_channels = key.C;
@@ -522,7 +527,7 @@ Mapping MappingTable::calc_conv_mapping(Mapping::LoopCounts &key) {
// B * O_row * O_col * O_ch
int input_tile_size = args[0] * (args[1]+2*padding) * (args[2]+2*padding) * args[6] * _config.precision;
if (spad_rows <= max_spad_rows && acc_rows <= max_acc_rows &&
((input_tile_size + weight_tile_size) * 3 >> 1) <= (_config.spad_size KB / 2)) {
((input_tile_size + weight_tile_size) * 3 >> 1) <= (_config.core_config[key.target_core].spad_size KB / 2)) {
args[i] = args_candidate[i];
nothing_increased = false;
}
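Note for reviewers: the scratchpad-row bounds that used to be computed once in the MappingTable constructor are now derived per target core. A worked sketch of the same formulas under assumed values (64 KB spad, 16 KB accumulator, 128x128 core, 1-byte precision):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Mirrors the per-core bounds in gemm_mapping()/calc_conv_mapping();
      // the trailing factor of 2 presumably reflects double buffering, and
      // the accumulator entry is fixed at 4 bytes (per the code comment).
      uint32_t dim = 128;             // core_height (== core_width, asserted)
      uint32_t precision = 1;         // bytes per element, assumed
      uint32_t spad_size = 64;        // KB, assumed
      uint32_t accum_spad_size = 16;  // KB, assumed

      uint32_t max_spad_rows = (spad_size * 1024) / (dim * precision * 2);
      uint32_t max_acc_rows = (accum_spad_size * 1024) / (dim * 4 * 2);
      assert(max_spad_rows == 256);
      assert(max_acc_rows == 16);
    }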
1 change: 1 addition & 0 deletions src/Mapping.h
@@ -18,6 +18,7 @@ struct Mapping {
uint32_t P = 1; // Input width
uint32_t Padding = 1; // Padding
uint32_t Stride = 1; // Stride
uint32_t target_core = 0;
bool operator==(const LoopCounts &other) const {
return (N == other.N) && (C == other.C) && (M == other.M) &&
(S == other.S) && (R == other.R) && (Q == other.Q) &&
5 changes: 4 additions & 1 deletion src/Model.cc
@@ -19,6 +19,9 @@ Model::Model(std::string onnx_path, json model_config, SimulationConfig config,
if (_model_config.contains("partition_id")) {
_partition_id = uint32_t(_model_config["partition_id"]);
}
if (_model_config.contains("target_core")) {
_target_core = uint32_t(_model_config["target_core"]);
}
}

Model::Model(json model_config, SimulationConfig config, std::string name)
@@ -106,7 +109,7 @@ void Model::initialize_model(std::vector<std::unique_ptr<Tensor>>& weight_table)


for(auto node_proto : model_proto.graph().node()) {
auto node = OperationFactory::create_operation(this, node_proto);
auto node = OperationFactory::create_operation(this, node_proto, _target_core);
if(node != nullptr) {
int node_id = node->get_id();
_operation_map[node->get_id()] = std::move(node);
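Note for reviewers: "target_core" is a new optional model_config key, defaulting to core 0; it is threaded through OperationFactory::create_operation() so a whole model can be pinned to one core. A sketch of the lookup (the surrounding keys and values are illustrative):

    #include <cassert>
    #include <cstdint>
    #include <nlohmann/json.hpp>

    int main() {
      // Only "target_core" is introduced by this commit; "partition_id"
      // already existed, and the values here are illustrative.
      auto model_config = nlohmann::json::parse(R"({
        "partition_id": 0,
        "target_core": 1
      })");
      uint32_t target_core = 0;  // default, as declared in Model.h below
      if (model_config.contains("target_core"))
        target_core = uint32_t(model_config["target_core"]);
      assert(target_core == 1);
    }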
2 changes: 1 addition & 1 deletion src/Model.h
@@ -35,7 +35,6 @@ class Model {

virtual void initialize_model(std::vector<std::unique_ptr<Tensor>>& weight_table);
virtual void initialize_weight(std::vector<std::unique_ptr<Tensor>>& weight_table);

protected:

uint32_t _id;
@@ -50,6 +49,7 @@
std::vector<Operation*> _executable_layer;
SimulationConfig _config;
uint32_t _partition_id = 0;
uint32_t _target_core = 0;

/* Number of simulating attention block */
int nr_skip = 0; // NR_SKIP == 2 * NR_ATTEN
22 changes: 13 additions & 9 deletions src/SimulationConfig.h
@@ -11,14 +11,10 @@ enum class DramType { SIMPLE, RAMULATOR1, RAMULATOR2 };

enum class IcntType { SIMPLE, BOOKSIM2 };

struct SimulationConfig {
/* Core config */
uint32_t num_cores;
struct CoreConfig {
CoreType core_type;
uint32_t core_freq;
uint32_t core_width;
uint32_t core_height;
uint32_t core_print_interval;

/* Vector config*/
uint32_t vector_process_bit;
@@ -39,6 +35,14 @@
uint32_t sram_width;
uint32_t spad_size;
uint32_t accum_spad_size;
};

struct SimulationConfig {
/* Core config */
uint32_t num_cores;
uint32_t core_freq;
uint32_t core_print_interval;
struct CoreConfig *core_config;

/* DRAM config */
DramType dram_type;
@@ -78,12 +82,12 @@ struct SimulationConfig {
return addr - (addr % dram_req_size);
}

float max_systolic_flops() {
return core_width * core_height * core_freq * 2 * num_cores / 1000; // GFLOPS
float max_systolic_flops(uint32_t id) {
return core_config[id].core_width * core_config[id].core_height * core_freq * 2 * num_cores / 1000; // GFLOPS
}

float max_vector_flops() {
return (vector_process_bit >> 3) / precision * 2 * core_freq / 1000; // GFLOPS
float max_vector_flops(uint32_t id) {
return (core_config[id].vector_process_bit >> 3) / precision * 2 * core_freq / 1000; // GFLOPS
}

float max_dram_bandwidth() {
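Note for reviewers: a quick numeric check of the reworked per-core throughput query, using assumed parameters (a 128x128 core with core_freq 1000, read as MHz). The expression keeps the num_cores factor from the old flat version:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Mirrors SimulationConfig::max_systolic_flops(id): with core_freq in
      // MHz and a MAC counted as 2 FLOPs, dividing by 1000 yields GFLOPS.
      uint32_t core_width = 128, core_height = 128;  // assumed geometry
      uint32_t core_freq = 1000;                     // MHz, assumed
      uint32_t num_cores = 1;
      float gflops =
          core_width * core_height * core_freq * 2.0f * num_cores / 1000;
      assert(gflops == 32768.0f);  // ~32.8 TFLOPS for this hypothetical core
    }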
4 changes: 3 additions & 1 deletion src/Simulator.cc
@@ -12,7 +12,9 @@ Simulator::Simulator(SimulationConfig config, bool language_mode)
: _config(config), _core_cycles(0), _language_mode(language_mode) {
// Create dram object
spdlog::info("Simulator Configuration:");
spdlog::info("Systolic Array Throughput: {} GFLOPS", config.max_systolic_flops());
for (int i=0; i<config.num_cores;i++)
spdlog::info("[Core {}] Systolic Array Throughput: {} GFLOPS, Spad size: {} KB, Accumulator size: {} KB",
i, config.max_systolic_flops(i), config.core_config[i].spad_size, config.core_config[i].accum_spad_size);
spdlog::info("DRAM Bandwidth {} GB/s", config.max_dram_bandwidth());
_core_period = 1000000 / (config.core_freq);
_icnt_period = 1000000 / (config.icnt_freq);
6 changes: 3 additions & 3 deletions src/Sram.cc
@@ -1,12 +1,12 @@
#include "Sram.h"
#define NUM_PORTS 3

Sram::Sram(SimulationConfig config, const cycle_type& core_cycle, bool accum)
Sram::Sram(SimulationConfig config, const cycle_type& core_cycle, bool accum, uint32_t core_id)
: _core_cycle(core_cycle) {
if (!accum) {
_size = config.spad_size KB / 2;
_size = config.core_config[core_id].spad_size KB / 2;
} else {
_size = config.accum_spad_size KB / 2;
_size = config.core_config[core_id].accum_spad_size KB / 2;
}
_data_width = config.dram_req_size;
int precision = config.precision;
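Note for reviewers: the only change to Sram is the explicit core_id, so each scratchpad is sized from its own core's entry. The constructor still halves the configured size — presumably one half per buffer of a double-buffered design. A tiny sketch of the sizing math under an assumed 64 KB spad:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Mirrors `_size = config.core_config[core_id].spad_size KB / 2;`
      // where KB is assumed to expand to `* 1024`, as usage elsewhere
      // in the diff suggests.
      uint32_t spad_size = 64;  // KB, assumed per-core value
      uint32_t size_bytes = spad_size * 1024 / 2;
      assert(size_bytes == 32768);
    }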
2 changes: 1 addition & 1 deletion src/Sram.h
@@ -3,7 +3,7 @@

class Sram {
public:
Sram(SimulationConfig config, const cycle_type& core_cycle, bool accum);
Sram(SimulationConfig config, const cycle_type& core_cycle, bool accum, uint32_t core_id);

bool check_hit(addr_type address, int buffer_id);
bool check_full(int buffer_id);
2 changes: 1 addition & 1 deletion src/SystolicOS.cc
@@ -9,5 +9,5 @@ void SystolicOS::cycle() {
}

cycle_type SystolicOS::get_inst_compute_cycles(std::unique_ptr<Instruction>& inst) {
return _config.core_height + _config.core_width - 2 + inst->size;
return _config.core_config[_id].core_height + _config.core_config[_id].core_width - 2 + inst->size;
}
32 changes: 16 additions & 16 deletions src/SystolicWS.cc
@@ -47,7 +47,7 @@ void SystolicWS::cycle() {
offset = MAX(offset, 4);
if (front->opcode == Opcode::GEMM_PRELOAD) {
// State mul-pre
offset = MAX(offset, _config.core_height);
offset = MAX(offset, _config.core_config[_id].core_height);
_stat_systolic_preload_issue_count++;
}
if (_compute_pipeline.back()->start_cycle+offset < _core_cycle)
@@ -59,7 +59,7 @@
/* Preload weight to systolic array*/
if (front->opcode == Opcode::GEMM_PRELOAD) {
/* Weight preload from buffer latency + weight preload latency */
front->start_cycle += _config.core_height + _config.core_height - 1;
front->start_cycle += _config.core_config[_id].core_height + _config.core_config[_id].core_height - 1;
_stat_systolic_preload_issue_count++;
}
}
@@ -110,7 +110,7 @@ bool SystolicWS::can_issue_compute(std::unique_ptr<Instruction>& inst) {
if(Core::can_issue_compute(inst) == false)
return false;
if (inst->opcode == Opcode::GEMM || inst->opcode == Opcode::GEMM_PRELOAD) {
if (_compute_pipeline.size() >= _config.core_height) {
if (_compute_pipeline.size() >= _config.core_config[_id].core_height) {
return false;
}
} else {
@@ -122,7 +122,7 @@
}

cycle_type SystolicWS::get_inst_compute_cycles(std::unique_ptr<Instruction>& inst) {
return _config.core_height + _config.core_width - 2 + MAX(inst->compute_size, 4);
return _config.core_config[_id].core_height + _config.core_config[_id].core_width - 2 + MAX(inst->compute_size, 4);
}

cycle_type SystolicWS::get_vector_compute_cycles(std::unique_ptr<Instruction>& inst) {
@@ -131,34 +131,34 @@ cycle_type SystolicWS::get_vector_compute_cycles(std::unique_ptr<Instruction>& inst)
cycle_type add_tree, scalar_ops, vector_ops;
switch (inst->opcode) {
case Opcode::LAYERNORM:
add_tree = 2 * add_tree_iter * _config.add_tree_latency;
scalar_ops = 2 * _config.scalar_mul_latency + _config.scalar_sqrt_latency;
add_tree = 2 * add_tree_iter * _config.core_config[_id].add_tree_latency;
scalar_ops = 2 * _config.core_config[_id].scalar_mul_latency + _config.core_config[_id].scalar_sqrt_latency;
// 1 addition, 1 subtraction, 1 division, 2 multiplication.
vector_ops = vec_op_iter * (2 * _config.add_latency + 3 * _config.mul_latency) * inst->tile_m;
vector_ops = vec_op_iter * (2 * _config.core_config[_id].add_latency + 3 * _config.core_config[_id].mul_latency) * inst->tile_m;
return add_tree + scalar_ops + vector_ops;
case Opcode::SOFTMAX:
// 1 add tree, 1 compare tree
add_tree = 2 * add_tree_iter * _config.add_tree_latency * inst->tile_m;
add_tree = 2 * add_tree_iter * _config.core_config[_id].add_tree_latency * inst->tile_m;
vector_ops =
vec_op_iter * (_config.add_latency + _config.exp_latency + _config.mul_latency);
vec_op_iter * (_config.core_config[_id].add_latency + _config.core_config[_id].exp_latency + _config.core_config[_id].mul_latency);
return add_tree + vector_ops;
case Opcode::ADD:
return vec_op_iter * _config.add_latency;
return vec_op_iter * _config.core_config[_id].add_latency;
case Opcode::MUL:
return vec_op_iter * _config.mul_latency;
return vec_op_iter * _config.core_config[_id].mul_latency;
case Opcode::MAC:
return vec_op_iter * _config.mac_latency;
return vec_op_iter * _config.core_config[_id].mac_latency;
case Opcode::SWISH: //TODO: Implement SWISH
case Opcode::GELU:
return vec_op_iter * _config.gelu_latency;
return vec_op_iter * _config.core_config[_id].gelu_latency;
case Opcode::COMP:
return vec_op_iter * 1;
case Opcode::ADDTREE:
return add_tree_iter * _config.add_tree_latency * inst->tile_m;
return add_tree_iter * _config.core_config[_id].add_tree_latency * inst->tile_m;
case Opcode::DIV:
return vec_op_iter * _config.div_latency;
return vec_op_iter * _config.core_config[_id].div_latency;
case Opcode::EXP:
return vec_op_iter * _config.exp_latency;
return vec_op_iter * _config.core_config[_id].exp_latency;

}
spdlog::info("not configured operation. {}", inst->id);
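Note for reviewers: a worked sketch of the per-core systolic timing used throughout this file, with assumed dimensions. It also shows why heterogeneous sizing matters: a small core pays far less fill/drain latency per instruction:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    int main() {
      // Mirrors SystolicWS::get_inst_compute_cycles():
      //   core_height + core_width - 2 + max(compute_size, 4)
      // i.e. array fill/drain plus the instruction's compute steps.
      uint32_t compute_size = 256;
      uint64_t big = 128 + 128 - 2 + std::max<uint32_t>(compute_size, 4);
      uint64_t small = 8 + 8 - 2 + std::max<uint32_t>(compute_size, 4);
      assert(big == 510);    // assumed 128x128 core
      assert(small == 270);  // assumed 8x8 core
    }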