Skip to content

Commit

Permalink
Added a vpu and vec units
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelJSr committed Feb 10, 2025
1 parent e80ee2c commit c67078a
Show file tree
Hide file tree
Showing 13 changed files with 434 additions and 5 deletions.
14 changes: 14 additions & 0 deletions hw/rtl/VX_types.vh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
`define VX_DCR_MPM_CLASS_NONE 0
`define VX_DCR_MPM_CLASS_CORE 1
`define VX_DCR_MPM_CLASS_MEM 2
`define VX_DCR_MPM_CLASS_VEC 3

// User Floating-Point CSRs ///////////////////////////////////////////////////

Expand Down Expand Up @@ -99,6 +100,8 @@
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB8C
`define VX_CSR_MPM_SCRB_WCTL 12'hB0D
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB8D
`define VX_CSR_MPM_SCRB_VEC 12'hB13 // Vector scoreboard
`define VX_CSR_MPM_SCRB_VEC_H 12'hB93
// PERF: memory
`define VX_CSR_MPM_IFETCHES 12'hB0E
`define VX_CSR_MPM_IFETCHES_H 12'hB8E
Expand Down Expand Up @@ -182,6 +185,17 @@
// Machine Performance-monitoring memory counters (class 3) ///////////////////
// <Add your own counters: use addresses hB03..B1F, hB83..hB9F>

// Machine Performance-monitoring vector counters (class 3: VX_DCR_MPM_CLASS_VEC)
// PERF: vector unit
`define VX_CSR_MPM_VEC_READS 12'hB03 // vector reads
`define VX_CSR_MPM_VEC_READS_H 12'hB83
`define VX_CSR_MPM_VEC_WRITES 12'hB04 // vector writes
`define VX_CSR_MPM_VEC_WRITES_H 12'hB84
`define VX_CSR_MPM_VEC_LAT 12'hB05 // vector latency
`define VX_CSR_MPM_VEC_LAT_H 12'hB85
`define VX_CSR_MPM_VEC_ST 12'hB06 // vector stalls
`define VX_CSR_MPM_VEC_ST_H 12'hB86

// Machine Information Registers //////////////////////////////////////////////

`define VX_CSR_MVENDORID 12'hF11
Expand Down
58 changes: 58 additions & 0 deletions runtime/stub/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
uint64_t scrb_lsu = 0;
uint64_t scrb_csrs = 0;
uint64_t scrb_wctl = 0;
#ifdef EXT_V_ENABLE
uint64_t scrb_vpu = 0;
#endif
uint64_t ifetches = 0;
uint64_t loads = 0;
uint64_t stores = 0;
Expand All @@ -212,6 +215,13 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
uint64_t mem_writes = 0;
uint64_t mem_lat = 0;
uint64_t mem_bank_stalls = 0;
#ifdef EXT_V_ENABLE
// PERF: vecunit
uint64_t vec_mem_reads = 0;
uint64_t vec_mem_writes = 0;
uint64_t vec_mem_lat = 0;
uint64_t vec_stall_cycles = 0;
#endif

uint64_t num_cores;
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
Expand Down Expand Up @@ -312,13 +322,25 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_WCTL, core_id, &scrb_wctl_per_core), {
return err;
});
#ifdef EXT_V_ENABLE
uint64_t scrb_vpu_per_core;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_VPU, core_id, &scrb_vpu_per_core), {
return err;
});
#endif
scrb_alu += scrb_alu_per_core;
scrb_fpu += scrb_fpu_per_core;
scrb_lsu += scrb_lsu_per_core;
scrb_csrs += scrb_csrs_per_core;
scrb_wctl += scrb_wctl_per_core;
#ifdef EXT_V_ENABLE
scrb_vpu += scrb_vpu_per_core;
#endif
if (num_cores > 1) {
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_csrs_per_core + scrb_wctl_per_core;
#ifdef EXT_V_ENABLE
scrb_total += scrb_vpu_per_core;
#endif
int scrb_percent_per_core = calcAvgPercent(scrb_stalls_per_core, cycles_per_core);
fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, csrs=%d%%, wctl=%d%%)\n"
, core_id
Expand All @@ -329,6 +351,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
, calcAvgPercent(scrb_lsu_per_core, scrb_total)
, calcAvgPercent(scrb_csrs_per_core, scrb_total)
, calcAvgPercent(scrb_wctl_per_core, scrb_total)
#ifdef EXT_V_ENABLE
, calcAvgPercent(scrb_vpu_per_core, scrb_total)
#endif
);
}
scrb_stalls += scrb_stalls_per_core;
Expand Down Expand Up @@ -555,6 +580,19 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
});
}
} break;
#ifdef EXT_V_ENABLE
case VX_DCR_MPM_CLASS_VEC: {
uint64_t tmp;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_READS, core_id, &tmp), { return err; });
vec_mem_reads += tmp;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_WRITES, core_id, &tmp), { return err; });
vec_mem_writes += tmp;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_LAT, core_id, &tmp), { return err; });
vec_mem_lat += tmp;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_ST, core_id, &tmp), { return err; });
vec_stall_cycles += tmp;
} break;
#endif
default:
break;
}
Expand All @@ -576,6 +614,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
int ifetch_avg_lat = caclAverage(ifetch_lat, ifetches);
int load_avg_lat = caclAverage(load_lat, loads);
uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_csrs + scrb_wctl;
#ifdef EXT_V_ENABLE
scrb_total += scrb_vpu;
#endif
fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent);
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
Expand All @@ -587,6 +628,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
, calcAvgPercent(scrb_lsu, scrb_total)
, calcAvgPercent(scrb_csrs, scrb_total)
, calcAvgPercent(scrb_wctl, scrb_total)
#ifdef EXT_V_ENABLE
, calcAvgPercent(scrb_vpu, scrb_total)
#endif
);
fprintf(stream, "PERF: operands stalls=%ld (%d%%)\n", opds_stalls, opds_percent);
fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
Expand Down Expand Up @@ -637,6 +681,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
fprintf(stream, "PERF: memory bank stalls=%ld (utilization=%d%%)\n", mem_bank_stalls, mem_bank_utilization);
}
} break;
#ifdef EXT_V_ENABLE
case VX_DCR_MPM_CLASS_VEC: {
vec_mem_reads /= num_cores;
vec_mem_writes /= num_cores;
vec_mem_lat /= num_cores;
vec_stall_cycles /= num_cores;
int vec_avg_lat = caclAverage(vec_mem_lat, vec_mem_reads);
int vec_stall_cycles_ratio = calcRatio(vec_stall_cycles, total_cycles);
fprintf(stream, "PERF: vec memory reads=%ld\n", vec_mem_reads);
fprintf(stream, "PERF: vec memory writes=%ld\n", vec_mem_writes);
fprintf(stream, "PERF: vec memory latency=%d cycles\n", vec_avg_lat);
fprintf(stream, "PERF: vec stalls=%ld (%d%%)\n", vec_stall_cycles, vec_stall_cycles_ratio);
} break;
#endif
default:
break;
}
Expand Down
2 changes: 1 addition & 1 deletion sim/simx/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(

# Add V extension sources
ifneq ($(findstring -DEXT_V_ENABLE, $(CONFIGS)),)
SRCS += $(SRC_DIR)/vpu.cpp
SRCS += $(SRC_DIR)/vec_unit.cpp $(SRC_DIR)/vpu.cpp
endif

# Debugging
Expand Down
6 changes: 5 additions & 1 deletion sim/simx/core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ Core::Core(const SimContext& ctx,
uint32_t core_id,
Socket* socket,
const Arch &arch,
const DCRS &dcrs)
const DCRS &dcrs
)
: SimObject(ctx, StrFormat("core%d", core_id))
, icache_req_ports(1, this)
, icache_rsp_ports(1, this)
Expand All @@ -38,6 +39,9 @@ Core::Core(const SimContext& ctx,
, core_id_(core_id)
, socket_(socket)
, arch_(arch)
#ifdef EXT_V_ENABLE
, vec_unit_(VecUnit::Create("vpu", arch))
#endif
, emulator_(arch, dcrs, this)
, ibuffers_(arch.num_warps(), IBUF_SIZE)
, scoreboard_(arch_)
Expand Down
22 changes: 21 additions & 1 deletion sim/simx/core.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
#include "func_unit.h"
#include "mem_coalescer.h"
#include "VX_config.h"
#ifdef EXT_V_ENABLE
#include "vec_unit.h"
#endif

namespace vortex {

Expand All @@ -52,6 +55,9 @@ class Core : public SimObject<Core> {
uint64_t scrb_sfu;
uint64_t scrb_csrs;
uint64_t scrb_wctl;
#ifdef EXT_V_ENABLE
uint64_t scrb_vpu;
#endif
uint64_t ifetches;
uint64_t loads;
uint64_t stores;
Expand All @@ -72,6 +78,9 @@ class Core : public SimObject<Core> {
, scrb_sfu(0)
, scrb_csrs(0)
, scrb_wctl(0)
#ifdef EXT_V_ENABLE
, scrb_vpu(0)
#endif
, ifetches(0)
, loads(0)
, stores(0)
Expand All @@ -90,7 +99,8 @@ class Core : public SimObject<Core> {
uint32_t core_id,
Socket* socket,
const Arch &arch,
const DCRS &dcrs);
const DCRS &dcrs
);

~Core();

Expand Down Expand Up @@ -131,6 +141,12 @@ class Core : public SimObject<Core> {
return mem_coalescers_.at(idx);
}

#ifdef EXT_V_ENABLE
// Returns a mutable reference to the vec_unit_ handle held by this core.
// Emulator's constructor copies this handle while Core is still being
// constructed, which is why vec_unit_ precedes emulator_ in both the member
// declarations and the constructor's initializer list.
VecUnit::Ptr& vec_unit() { return vec_unit_; }
#endif

// Read-only access to the perf_stats_ member.
const PerfStats& perf_stats() const { return perf_stats_; }
Expand All @@ -150,6 +166,10 @@ class Core : public SimObject<Core> {
Socket* socket_;
const Arch& arch_;

#ifdef EXT_V_ENABLE
VecUnit::Ptr vec_unit_;
#endif

Emulator emulator_;

std::vector<IBuffer> ibuffers_;
Expand Down
19 changes: 18 additions & 1 deletion sim/simx/emulator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ Emulator::warp_t::warp_t(const Arch& arch)
: ireg_file(arch.num_threads(), std::vector<Word>(MAX_NUM_REGS))
, freg_file(arch.num_threads(), std::vector<uint64_t>(MAX_NUM_REGS))
#ifdef EXT_V_ENABLE
, vreg_file(MAX_NUM_REGS, std::vector<Byte>(MAX_NUM_REGS))
, vreg_file(MAX_NUM_REGS, std::vector<Byte>(VLEN / 8))
#endif
, uuid(0)
{}
Expand Down Expand Up @@ -96,6 +96,7 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
// In future versions, scratchpad size should be fixed to an appropriate value.
, scratchpad(std::vector<Word>(32 * 32 * 32768))
#ifdef EXT_V_ENABLE
, vec_unit_(core->vec_unit())
, csrs_(arch.num_warps())
#endif
{
Expand Down Expand Up @@ -133,6 +134,10 @@ void Emulator::clear() {
barrier.reset();
}

#ifdef EXT_V_ENABLE
vec_unit_->reset();
#endif

csr_mscratch_ = startup_arg;

stalled_warps_.reset();
Expand Down Expand Up @@ -607,6 +612,18 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
CSR_READ_64(VX_CSR_MPM_LMEM_BANK_ST, lmem_perf.bank_stalls);
}
} break;
#ifdef EXT_V_ENABLE
case VX_DCR_MPM_CLASS_VEC: {
VecUnit::PerfStats vec_perf_stats;
vec_perf_stats += vec_unit_->perf_stats();
switch (addr) {
CSR_READ_64(VX_CSR_MPM_VEC_READS, vec_perf_stats.reads);
CSR_READ_64(VX_CSR_MPM_VEC_WRITES, vec_perf_stats.writes);
CSR_READ_64(VX_CSR_MPM_VEC_LAT, vec_perf_stats.latency);
CSR_READ_64(VX_CSR_MPM_VEC_ST, vec_perf_stats.stalls);
}
} break;
#endif
default: {
std::cout << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
std::abort();
Expand Down
6 changes: 6 additions & 0 deletions sim/simx/emulator.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
#include <stack>
#include <mem.h>
#include "types.h"
#ifdef EXT_V_ENABLE
#include "vec_unit.h"
#endif

namespace vortex {

Expand Down Expand Up @@ -175,7 +178,10 @@ class Emulator {
uint32_t mat_size;
uint32_t tc_size;
uint32_t tc_num;
#ifdef EXT_V_ENABLE
VecUnit::Ptr vec_unit_;
std::vector<std::vector<std::unordered_map<uint32_t, uint32_t>>> csrs_;
#endif
};

}
Expand Down
49 changes: 49 additions & 0 deletions sim/simx/func_unit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -366,3 +366,52 @@ void SfuUnit::tick() {
input.pop();
}
}

///////////////////////////////////////////////////////////////////////////////

#ifdef EXT_V_ENABLE
// Constructs the vector functional unit by forwarding the simulation context,
// the owning core and the fixed unit name "vpu-unit" to the FuncUnit base.
VpuUnit::VpuUnit(const SimContext& ctx, Core* core)
  : FuncUnit(ctx, core, "vpu-unit") {
}

void VpuUnit::tick() {
for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
auto& input = Inputs.at(iw);
if (input.empty())
continue;
//auto& output = Outputs.at(iw);
auto trace = input.front();
//int delay = 2;
switch (trace->vpu_type) {
case VpuType::VSET:
case VpuType::VL:
case VpuType::VS:
case VpuType::ARITHVV:
case VpuType::MULVV:
case VpuType::DIVVV:
case VpuType::ARITHVX:
case VpuType::MULVX:
case VpuType::DIVVX:
case VpuType::ARITHVI:
case VpuType::MULVI:
case VpuType::DIVVI:
case VpuType::ARITHFVV:
case VpuType::MULFVV:
case VpuType::DIVFVV:
case VpuType::ARITHFVX:
case VpuType::MULFVX:
case VpuType::DIVFVX:
case VpuType::ARITHFVI:
case VpuType::MULFVI:
case VpuType::DIVFVI:
default:
std::abort();
}
DT(3, this->name() << ": op=" << trace->vpu_type << ", " << *trace);
if (trace->eop && trace->fetch_stall) {
core_->resume(trace->wid);
}
input.pop();
}
}
#endif
Loading

0 comments on commit c67078a

Please sign in to comment.