diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index 455d42ce1..d4eb31f58 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -35,6 +35,7 @@ `define VX_DCR_MPM_CLASS_NONE 0 `define VX_DCR_MPM_CLASS_CORE 1 `define VX_DCR_MPM_CLASS_MEM 2 +`define VX_DCR_MPM_CLASS_VEC 3 // User Floating-Point CSRs /////////////////////////////////////////////////// @@ -99,6 +100,8 @@ `define VX_CSR_MPM_SCRB_CSRS_H 12'hB8C `define VX_CSR_MPM_SCRB_WCTL 12'hB0D `define VX_CSR_MPM_SCRB_WCTL_H 12'hB8D +`define VX_CSR_MPM_SCRB_VEC 12'hB13 // Vector scoreboard +`define VX_CSR_MPM_SCRB_VEC_H 12'hB93 // PERF: memory `define VX_CSR_MPM_IFETCHES 12'hB0E `define VX_CSR_MPM_IFETCHES_H 12'hB8E @@ -182,6 +185,17 @@ // Machine Performance-monitoring memory counters (class 3) /////////////////// // +// Machine Performance-monitoring vector counters +// PERF: vector unit +`define VX_CSR_MPM_VEC_READS 12'hB03 // vector reads +`define VX_CSR_MPM_VEC_READS_H 12'hB83 +`define VX_CSR_MPM_VEC_WRITES 12'hB04 // vector writes +`define VX_CSR_MPM_VEC_WRITES_H 12'hB84 +`define VX_CSR_MPM_VEC_LAT 12'hB05 // vector latency +`define VX_CSR_MPM_VEC_LAT_H 12'hB85 +`define VX_CSR_MPM_VEC_ST 12'hB06 // vector stalls +`define VX_CSR_MPM_VEC_ST_H 12'hB86 + // Machine Information Registers ////////////////////////////////////////////// `define VX_CSR_MVENDORID 12'hF11 diff --git a/runtime/stub/utils.cpp b/runtime/stub/utils.cpp index 220f916ae..4cfa5b62b 100644 --- a/runtime/stub/utils.cpp +++ b/runtime/stub/utils.cpp @@ -188,6 +188,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { uint64_t scrb_lsu = 0; uint64_t scrb_csrs = 0; uint64_t scrb_wctl = 0; +#ifdef EXT_V_ENABLE + uint64_t scrb_vpu = 0; +#endif uint64_t ifetches = 0; uint64_t loads = 0; uint64_t stores = 0; @@ -212,6 +215,13 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { uint64_t mem_writes = 0; uint64_t mem_lat = 0; uint64_t mem_bank_stalls = 0; +#ifdef EXT_V_ENABLE + // PERF: vecunit + uint64_t vec_mem_reads = 0; 
+ uint64_t vec_mem_writes = 0; + uint64_t vec_mem_lat = 0; + uint64_t vec_stall_cycles = 0; +#endif uint64_t num_cores; CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), { @@ -312,13 +322,25 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_WCTL, core_id, &scrb_wctl_per_core), { return err; }); + #ifdef EXT_V_ENABLE + uint64_t scrb_vpu_per_core; /* vector scoreboard stalls of this core; read through VX_CSR_MPM_SCRB_VEC, the macro this patch adds to VX_types.vh */ + CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_VEC, core_id, &scrb_vpu_per_core), { + return err; + }); + #endif scrb_alu += scrb_alu_per_core; scrb_fpu += scrb_fpu_per_core; scrb_lsu += scrb_lsu_per_core; scrb_csrs += scrb_csrs_per_core; scrb_wctl += scrb_wctl_per_core; + #ifdef EXT_V_ENABLE + scrb_vpu += scrb_vpu_per_core; + #endif if (num_cores > 1) { uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_csrs_per_core + scrb_wctl_per_core; + #ifdef EXT_V_ENABLE + scrb_total += scrb_vpu_per_core; + #endif int scrb_percent_per_core = calcAvgPercent(scrb_stalls_per_core, cycles_per_core); fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, csrs=%d%%, wctl=%d%%)\n" , core_id @@ -329,6 +351,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { , calcAvgPercent(scrb_lsu_per_core, scrb_total) , calcAvgPercent(scrb_csrs_per_core, scrb_total) , calcAvgPercent(scrb_wctl_per_core, scrb_total) + #ifdef EXT_V_ENABLE + , calcAvgPercent(scrb_vpu_per_core, scrb_total) /* NOTE(review): the format string above stops at wctl=%d%% and has no vpu conversion, so this extra argument appears to be ignored and never printed - confirm and extend the format string */ + #endif ); } scrb_stalls += scrb_stalls_per_core; @@ -555,6 +580,19 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { }); } } break; + #ifdef EXT_V_ENABLE + case VX_DCR_MPM_CLASS_VEC: { /* accumulate the four vector-unit counters of this core into the vec_* totals */ + uint64_t tmp; + CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_READS, core_id, &tmp), { return err; }); + vec_mem_reads += tmp; + CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_WRITES, core_id, &tmp), { return err; }); + vec_mem_writes += tmp; + CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_LAT, core_id, &tmp), { 
return err; }); + vec_mem_lat += tmp; + CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_ST, core_id, &tmp), { return err; }); + vec_stall_cycles += tmp; + } break; + #endif default: break; } @@ -576,6 +614,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { int ifetch_avg_lat = caclAverage(ifetch_lat, ifetches); int load_avg_lat = caclAverage(load_lat, loads); uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_csrs + scrb_wctl; + #ifdef EXT_V_ENABLE + scrb_total += scrb_vpu; + #endif fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent); fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent); fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent); @@ -587,6 +628,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { , calcAvgPercent(scrb_lsu, scrb_total) , calcAvgPercent(scrb_csrs, scrb_total) , calcAvgPercent(scrb_wctl, scrb_total) + #ifdef EXT_V_ENABLE + , calcAvgPercent(scrb_vpu, scrb_total) + #endif ); fprintf(stream, "PERF: operands stalls=%ld (%d%%)\n", opds_stalls, opds_percent); fprintf(stream, "PERF: ifetches=%ld\n", ifetches); @@ -637,6 +681,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { fprintf(stream, "PERF: memory bank stalls=%ld (utilization=%d%%)\n", mem_bank_stalls, mem_bank_utilization); } } break; +#ifdef EXT_V_ENABLE + case VX_DCR_MPM_CLASS_VEC: { + vec_mem_reads /= num_cores; + vec_mem_writes /= num_cores; + vec_mem_lat /= num_cores; + vec_stall_cycles /= num_cores; + int vec_avg_lat = caclAverage(vec_mem_lat, vec_mem_reads); + int vec_stall_cycles_ratio = calcRatio(vec_stall_cycles, total_cycles); + fprintf(stream, "PERF: vec memory reads=%ld\n", vec_mem_reads); + fprintf(stream, "PERF: vec memory writes=%ld\n", vec_mem_writes); + fprintf(stream, "PERF: vec memory latency=%d cycles\n", vec_avg_lat); + fprintf(stream, "PERF: vec stalls=%ld (%d%%)\n", vec_stall_cycles, vec_stall_cycles_ratio); + 
} break; +#endif default: break; } diff --git a/sim/simx/Makefile b/sim/simx/Makefile index 83054edc4..9c6743efa 100644 --- a/sim/simx/Makefile +++ b/sim/simx/Makefile @@ -22,7 +22,7 @@ SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $( # Add V extension sources ifneq ($(findstring -DEXT_V_ENABLE, $(CONFIGS)),) - SRCS += $(SRC_DIR)/vpu.cpp + SRCS += $(SRC_DIR)/vec_unit.cpp $(SRC_DIR)/vpu.cpp endif # Debugging diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp index 65609a9aa..f3a3c09f9 100644 --- a/sim/simx/core.cpp +++ b/sim/simx/core.cpp @@ -29,7 +29,8 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, - const DCRS &dcrs) + const DCRS &dcrs + ) : SimObject(ctx, StrFormat("core%d", core_id)) , icache_req_ports(1, this) , icache_rsp_ports(1, this) @@ -38,6 +39,9 @@ Core::Core(const SimContext& ctx, , core_id_(core_id) , socket_(socket) , arch_(arch) +#ifdef EXT_V_ENABLE + , vec_unit_(VecUnit::Create("vpu", arch)) +#endif , emulator_(arch, dcrs, this) , ibuffers_(arch.num_warps(), IBUF_SIZE) , scoreboard_(arch_) diff --git a/sim/simx/core.h b/sim/simx/core.h index 1bd0571bc..1546452d1 100644 --- a/sim/simx/core.h +++ b/sim/simx/core.h @@ -27,6 +27,9 @@ #include "func_unit.h" #include "mem_coalescer.h" #include "VX_config.h" +#ifdef EXT_V_ENABLE +#include "vec_unit.h" +#endif namespace vortex { @@ -52,6 +55,9 @@ class Core : public SimObject { uint64_t scrb_sfu; uint64_t scrb_csrs; uint64_t scrb_wctl; + #ifdef EXT_V_ENABLE + uint64_t scrb_vpu; + #endif uint64_t ifetches; uint64_t loads; uint64_t stores; @@ -72,6 +78,9 @@ class Core : public SimObject { , scrb_sfu(0) , scrb_csrs(0) , scrb_wctl(0) + #ifdef EXT_V_ENABLE + , scrb_vpu(0) + #endif , ifetches(0) , loads(0) , stores(0) @@ -90,7 +99,8 @@ class Core : public SimObject { uint32_t core_id, Socket* socket, const Arch &arch, - const DCRS &dcrs); + const DCRS &dcrs + ); ~Core(); @@ -131,6 +141,12 @@ class Core : public SimObject { return 
mem_coalescers_.at(idx); } +#ifdef EXT_V_ENABLE + VecUnit::Ptr& vec_unit() { + return vec_unit_; + } +#endif + const PerfStats& perf_stats() const { return perf_stats_; } @@ -150,6 +166,10 @@ class Core : public SimObject { Socket* socket_; const Arch& arch_; +#ifdef EXT_V_ENABLE + VecUnit::Ptr vec_unit_; +#endif + Emulator emulator_; std::vector ibuffers_; diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp index ee297279a..b87fe1af6 100644 --- a/sim/simx/emulator.cpp +++ b/sim/simx/emulator.cpp @@ -34,7 +34,7 @@ Emulator::warp_t::warp_t(const Arch& arch) : ireg_file(arch.num_threads(), std::vector(MAX_NUM_REGS)) , freg_file(arch.num_threads(), std::vector(MAX_NUM_REGS)) #ifdef EXT_V_ENABLE - , vreg_file(MAX_NUM_REGS, std::vector(MAX_NUM_REGS)) + , vreg_file(MAX_NUM_REGS, std::vector(VLEN / 8)) #endif , uuid(0) {} @@ -96,6 +96,7 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core) // In future versions, scratchpad size should be fixed to an appropriate value. , scratchpad(std::vector(32 * 32 * 32768)) #ifdef EXT_V_ENABLE + , vec_unit_(core->vec_unit()) , csrs_(arch.num_warps()) #endif { @@ -133,6 +134,10 @@ void Emulator::clear() { barrier.reset(); } +#ifdef EXT_V_ENABLE + vec_unit_->reset(); +#endif + csr_mscratch_ = startup_arg; stalled_warps_.reset(); @@ -607,6 +612,18 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { CSR_READ_64(VX_CSR_MPM_LMEM_BANK_ST, lmem_perf.bank_stalls); } } break; + #ifdef EXT_V_ENABLE + case VX_DCR_MPM_CLASS_VEC: { + VecUnit::PerfStats vec_perf_stats; + vec_perf_stats += vec_unit_->perf_stats(); + switch (addr) { + CSR_READ_64(VX_CSR_MPM_VEC_READS, vec_perf_stats.reads); + CSR_READ_64(VX_CSR_MPM_VEC_WRITES, vec_perf_stats.writes); + CSR_READ_64(VX_CSR_MPM_VEC_LAT, vec_perf_stats.latency); + CSR_READ_64(VX_CSR_MPM_VEC_ST, vec_perf_stats.stalls); + } + } break; + #endif default: { std::cout << "Error: invalid MPM CLASS: value=" << perf_class << std::endl; std::abort(); diff --git 
a/sim/simx/emulator.h b/sim/simx/emulator.h index 980bc8f8a..8b3d0e644 100644 --- a/sim/simx/emulator.h +++ b/sim/simx/emulator.h @@ -19,6 +19,9 @@ #include #include #include "types.h" +#ifdef EXT_V_ENABLE +#include "vec_unit.h" +#endif namespace vortex { @@ -175,7 +178,10 @@ class Emulator { uint32_t mat_size; uint32_t tc_size; uint32_t tc_num; +#ifdef EXT_V_ENABLE + VecUnit::Ptr vec_unit_; std::vector>> csrs_; +#endif }; } diff --git a/sim/simx/func_unit.cpp b/sim/simx/func_unit.cpp index d33a0ac1c..d100f960e 100644 --- a/sim/simx/func_unit.cpp +++ b/sim/simx/func_unit.cpp @@ -366,3 +366,52 @@ void SfuUnit::tick() { input.pop(); } } + +/////////////////////////////////////////////////////////////////////////////// + +#ifdef EXT_V_ENABLE +VpuUnit::VpuUnit(const SimContext& ctx, Core* core) /* forwards ctx and core to FuncUnit under the name "vpu-unit" */ + : FuncUnit(ctx, core, "vpu-unit") +{} + +void VpuUnit::tick() { /* Visits Inputs[0..ISSUE_WIDTH) and looks at the front trace of every non-empty queue. NOTE(review): every case label below is empty and falls through to default, so std::abort() runs for any vpu_type; the DT trace, the core_->resume() call and input.pop() after the switch are therefore unreachable - presumably a placeholder until per-op handling is written, confirm before enabling. */ + for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) { + auto& input = Inputs.at(iw); + if (input.empty()) + continue; + //auto& output = Outputs.at(iw); + auto trace = input.front(); + //int delay = 2; + switch (trace->vpu_type) { + case VpuType::VSET: + case VpuType::VL: + case VpuType::VS: + case VpuType::ARITHVV: + case VpuType::MULVV: + case VpuType::DIVVV: + case VpuType::ARITHVX: + case VpuType::MULVX: + case VpuType::DIVVX: + case VpuType::ARITHVI: + case VpuType::MULVI: + case VpuType::DIVVI: + case VpuType::ARITHFVV: + case VpuType::MULFVV: + case VpuType::DIVFVV: + case VpuType::ARITHFVX: + case VpuType::MULFVX: + case VpuType::DIVFVX: + case VpuType::ARITHFVI: + case VpuType::MULFVI: + case VpuType::DIVFVI: + default: + std::abort(); + } + DT(3, this->name() << ": op=" << trace->vpu_type << ", " << *trace); + if (trace->eop && trace->fetch_stall) { /* wake the warp named by trace->wid when this trace is flagged eop and fetch_stall */ + core_->resume(trace->wid); + } + input.pop(); + } +} +#endif diff --git a/sim/simx/func_unit.h b/sim/simx/func_unit.h index 2250d70c5..1354b7fdf 100644 --- a/sim/simx/func_unit.h +++ b/sim/simx/func_unit.h @@ -113,4 +113,39 @@ class SfuUnit : public FuncUnit { 
void tick(); }; +/////////////////////////////////////////////////////////////////////////////// + +class VpuUnit : public FuncUnit { /* FuncUnit subclass for vector (VPU) traces. NOTE(review): this declaration is unconditional, while the member definitions in func_unit.cpp sit inside #ifdef EXT_V_ENABLE. */ +public: + VpuUnit(const SimContext& ctx, Core*); + + void tick(); +}; + +// Simulate clock cycles depending on instruction type and element width and #lanes +// VSET = 1 cycle +// Vector instructions take the same amount of time as ALU instructions. +// In general there should be fewer instructions overall (hence the SIMD vector speedup). +// But, each vector instruction is bigger, and # of lanes greatly affects execution speed. + +// Whenever we change VL using imm/VSET, we need to keep track of the new VL and SEW. +// By default, VL is set to MAXVL. +// After determining VL, we use VL and #lanes in order to determine overall cycle time. +// For example, for a vector add with VL=4 and #lanes=2, we will probably take 2 cycles, +// since we can only operate on two elements of the vector each cycle (limited by #lanes). +// SEW (element width) likely affects the cycle time, we can probably observe +// ALU operation cycle time in relation to element width to determine this though. + +// The RTL implementation has an unroll and accumulate stage. +// The unroll stage sends vector elements to the appropriate functional unit up to VL, +// limited by the # lanes available. +// The accumulate stage deals with combining the results from the functional units, +// into the destination vector register. +// Which exact pipeline stage does the VPU unroll the vector (decode or execute)? +// Which exact pipeline stage does the VPU accumulate results? + +// How do vector loads and stores interact with the cache? +// How about loading and storing scalars in vector registers? +// How does striding affect loads and stores? 
+ } \ No newline at end of file diff --git a/sim/simx/instr_trace.h b/sim/simx/instr_trace.h index 5ed98d265..f8f305de0 100644 --- a/sim/simx/instr_trace.h +++ b/sim/simx/instr_trace.h @@ -43,6 +43,13 @@ struct SFUTraceData : public ITraceData { SFUTraceData(Word arg1, Word arg2) : arg1(arg1), arg2(arg2) {} }; +struct VPUTraceData : public ITraceData { /* two-word payload attached to a VPU trace, shaped like SFUTraceData above */ + using Ptr = std::shared_ptr; + Word arg1; + Word arg2; + VPUTraceData(Word arg1, Word arg2) : arg1(arg1), arg2(arg2) {} +}; + struct instr_trace_t { public: struct reg_t { @@ -77,7 +84,10 @@ struct instr_trace_t { AluType alu_type; FpuType fpu_type; SfuType sfu_type; - TCUType tcu_type; + #ifdef EXT_V_ENABLE + VpuType vpu_type; + #endif + TCUType tcu_type; }; ITraceData::Ptr data; diff --git a/sim/simx/types.h b/sim/simx/types.h index 220d4b645..b86aec2db 100644 --- a/sim/simx/types.h +++ b/sim/simx/types.h @@ -269,6 +269,68 @@ inline std::ostream &operator<<(std::ostream &os, const SfuType& type) { /////////////////////////////////////////////////////////////////////////////// +enum class VpuType { + VSET, // Set vector length + VL, // Vector load + VS, // Vector store + + // ALU OPERATIONS + ARITHVV, // Vector-vector + MULVV, + DIVVV, + + ARITHVX, // Vector-scalar + MULVX, + DIVVX, + + ARITHVI, // Vector-immediate + MULVI, + DIVVI, + + // FPU OPERATIONS + ARITHFVV, // Vector-vector + MULFVV, + DIVFVV, + + ARITHFVX, // Vector-scalar + MULFVX, + DIVFVX, + + ARITHFVI, // Vector-immediate + MULFVI, + DIVFVI +}; + +inline std::ostream &operator<<(std::ostream &os, const VpuType& type) { /* Writes the enumerator name of type to os and returns os. Every VpuType enumerator has a case, because VpuUnit::tick streams trace->vpu_type for any op; the default branch only fires for a value outside the enum. */ + switch (type) { + case VpuType::VSET: os << "VSET"; break; + case VpuType::VL: os << "VL"; break; + case VpuType::VS: os << "VS"; break; + case VpuType::ARITHVV: os << "ARITHVV"; break; + case VpuType::MULVV: os << "MULVV"; break; + case VpuType::DIVVV: os << "DIVVV"; break; + case VpuType::ARITHVX: os << "ARITHVX"; break; + case VpuType::MULVX: os << "MULVX"; break; + case VpuType::DIVVX: os << "DIVVX"; break; + case VpuType::ARITHVI: os << "ARITHVI"; break; + case VpuType::MULVI: os << "MULVI"; break; + case VpuType::DIVVI: os << "DIVVI"; break; + case VpuType::ARITHFVV: os << "ARITHFVV"; break; + case VpuType::MULFVV: os << "MULFVV"; break; + case VpuType::DIVFVV: os << "DIVFVV"; break; + case VpuType::ARITHFVX: os << "ARITHFVX"; break; + case VpuType::MULFVX: os << "MULFVX"; break; + case VpuType::DIVFVX: os << "DIVFVX"; break; + case VpuType::ARITHFVI: os << "ARITHFVI"; break; + case VpuType::MULFVI: os << "MULFVI"; break; + case VpuType::DIVFVI: os << "DIVFVI"; break; + default: assert(false); + } + return os; +} + +/////////////////////////////////////////////////////////////////////////////// + enum class ArbiterType { Priority, RoundRobin diff --git a/sim/simx/vec_unit.cpp b/sim/simx/vec_unit.cpp new file mode 100644 index 000000000..86c71cd8a --- /dev/null +++ b/sim/simx/vec_unit.cpp @@ -0,0 +1,90 @@ +#ifdef EXT_V_ENABLE + +#include 
"vec_unit.h" + +using namespace vortex; + +class VecUnit::Impl { /* Private implementation object behind VecUnit; at this stage it only keeps the PerfStats counters plus not-yet-used vector state (vreg_file_, vtype_, vl_, vlmax_). */ +public: + Impl(VecUnit* simobject, const Arch& /*arch*/) + : simobject_(simobject) + { + this->clear(); + } + + ~Impl() {} + + void clear() { /* Called from the constructor and from VecUnit::reset(): zeroes the counters and the vtype/vl/vlmax state so none of them is left indeterminate. */ + perf_stats_ = PerfStats(); + vtype_ = vtype_t(); + vl_ = 0; + vlmax_ = 0; + } + + void tick() { + } + +/* + void load(const Instr &instr, uint32_t wid, std::vector &rsdata) { + } + + void store(const Instr &instr, uint32_t wid, std::vector &rsdata) { + } + + void execute(const Instr &instr, uint32_t wid, std::vector &rsdata, std::vector &rddata) { + } +*/ + + const PerfStats& perf_stats() const { + return perf_stats_; + } + +private: + + VecUnit* simobject_; + std::vector> vreg_file_; + vtype_t vtype_; + uint32_t vl_; + Word vlmax_; + PerfStats perf_stats_; +}; + +VecUnit::VecUnit(const SimContext& ctx, + const char* name, + const Arch &arch) + : SimObject(ctx, name) + , Input(this) + , Output(this) + , impl_(new Impl(this, arch)) +{} + +VecUnit::~VecUnit() { /* releases the Impl allocated in the constructor */ + delete impl_; +} + +void VecUnit::reset() { + impl_->clear(); +} + +void VecUnit::tick() { + impl_->tick(); +} + +/* +void VecUnit::load(const Instr &instr, uint32_t wid, std::vector &rsdata) { + return impl_->load(instr, wid, rsdata); +} + +void VecUnit::store(const Instr &instr, uint32_t wid, std::vector &rsdata) { + return impl_->store(instr, wid, rsdata); +} + +void VecUnit::execute(const Instr &instr, uint32_t wid, std::vector &rsdata, std::vector &rddata) { + return impl_->execute(instr, wid, rsdata, rddata); +} +*/ + +const VecUnit::PerfStats& VecUnit::perf_stats() const { + return impl_->perf_stats(); +} +#endif \ No newline at end of file diff --git a/sim/simx/vec_unit.h b/sim/simx/vec_unit.h new file mode 100644 index 000000000..8fbb95d3f --- /dev/null +++ b/sim/simx/vec_unit.h @@ -0,0 +1,87 @@ +#ifdef EXT_V_ENABLE +#pragma once + +#include "arch.h" +#include "instr.h" +#include "instr_trace.h" +#include +#include "types.h" + +namespace vortex { + +struct vtype_t { + uint32_t vill; + uint32_t vma; + uint32_t vta; + uint32_t vsew; + 
uint32_t vlmul; +}; + +union reg_data_t { /* one register value viewed either as Word/WordI/WordF or as a fixed-width integer/float */ + Word u; + WordI i; + WordF f; + float f32; + double f64; + uint32_t u32; + uint64_t u64; + int32_t i32; + int64_t i64; +}; + +class VecUnit : public SimObject { /* Simulation object for the vector unit; its state lives behind the Impl* impl_ pointer declared at the bottom of the class. */ +public: + struct PerfStats { /* four counters, all zero after construction; operator+= adds another PerfStats field by field */ + uint64_t reads; + uint64_t writes; + uint64_t latency; + uint64_t stalls; + + PerfStats() + : reads(0) + , writes(0) + , latency(0) + , stalls(0) + {} + + PerfStats& operator+=(const PerfStats& rhs) { + this->reads += rhs.reads; + this->writes += rhs.writes; + this->latency += rhs.latency; + this->stalls += rhs.stalls; + return *this; + } + }; + + std::vector> MemReqs; + std::vector> MemRsps; + + SimPort Input; + SimPort Output; + + VecUnit(const SimContext& ctx, + const char* name, + const Arch &arch); + + ~VecUnit(); + + void reset(); + + void tick(); + + /* NOTE(review): load, store and execute are only declared here - the matching definitions in vec_unit.cpp are commented out, so a call would fail to link unless they are defined elsewhere. */ void load(const Instr &instr, uint32_t wid, std::vector &rsdata); + + void store(const Instr &instr, uint32_t wid, std::vector &rsdata); + + void execute(const Instr &instr, uint32_t wid, std::vector &rsdata, std::vector &rddata); + + const PerfStats& perf_stats() const; + +private: + + class Impl; + Impl* impl_; +}; + +} +#endif \ No newline at end of file