diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh
index 455d42ce1..d4eb31f58 100644
--- a/hw/rtl/VX_types.vh
+++ b/hw/rtl/VX_types.vh
@@ -35,6 +35,7 @@
 `define VX_DCR_MPM_CLASS_NONE           0
 `define VX_DCR_MPM_CLASS_CORE           1
 `define VX_DCR_MPM_CLASS_MEM            2
+`define VX_DCR_MPM_CLASS_VEC            3
 
 // User Floating-Point CSRs ///////////////////////////////////////////////////
 
@@ -99,6 +100,8 @@
 `define VX_CSR_MPM_SCRB_CSRS_H          12'hB8C
 `define VX_CSR_MPM_SCRB_WCTL            12'hB0D
 `define VX_CSR_MPM_SCRB_WCTL_H          12'hB8D
+`define VX_CSR_MPM_SCRB_VEC             12'hB13 // Vector scoreboard
+`define VX_CSR_MPM_SCRB_VEC_H           12'hB93
 // PERF: memory
 `define VX_CSR_MPM_IFETCHES             12'hB0E
 `define VX_CSR_MPM_IFETCHES_H           12'hB8E
@@ -182,6 +185,17 @@
 // Machine Performance-monitoring memory counters (class 3) ///////////////////
 // <Add your own counters: use addresses hB03..B1F, hB83..hB9F>
 
+// Machine Performance-monitoring vector counters (class 3: VX_DCR_MPM_CLASS_VEC)
+// PERF: vector unit
+`define VX_CSR_MPM_VEC_READS            12'hB03     // vector reads
+`define VX_CSR_MPM_VEC_READS_H          12'hB83
+`define VX_CSR_MPM_VEC_WRITES           12'hB04     // vector writes
+`define VX_CSR_MPM_VEC_WRITES_H         12'hB84
+`define VX_CSR_MPM_VEC_LAT              12'hB05     // vector latency
+`define VX_CSR_MPM_VEC_LAT_H            12'hB85
+`define VX_CSR_MPM_VEC_ST               12'hB06     // vector stalls
+`define VX_CSR_MPM_VEC_ST_H             12'hB86
+
 // Machine Information Registers //////////////////////////////////////////////
 
 `define VX_CSR_MVENDORID                12'hF11
diff --git a/runtime/stub/utils.cpp b/runtime/stub/utils.cpp
index 220f916ae..4cfa5b62b 100644
--- a/runtime/stub/utils.cpp
+++ b/runtime/stub/utils.cpp
@@ -188,6 +188,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
   uint64_t scrb_lsu = 0;
   uint64_t scrb_csrs = 0;
   uint64_t scrb_wctl = 0;
+#ifdef EXT_V_ENABLE
+  uint64_t scrb_vpu = 0;
+#endif
   uint64_t ifetches = 0;
   uint64_t loads = 0;
   uint64_t stores = 0;
@@ -212,6 +215,13 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
   uint64_t mem_writes = 0;
   uint64_t mem_lat = 0;
   uint64_t mem_bank_stalls = 0;
+#ifdef EXT_V_ENABLE
+  // PERF: vecunit
+  uint64_t vec_mem_reads = 0;
+  uint64_t vec_mem_writes = 0;
+  uint64_t vec_mem_lat = 0;
+  uint64_t vec_stall_cycles = 0;
+#endif
 
   uint64_t num_cores;
   CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
@@ -312,13 +322,25 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
         CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_WCTL, core_id, &scrb_wctl_per_core), {
           return err;
         });
+      #ifdef EXT_V_ENABLE
+        uint64_t scrb_vpu_per_core; // vector-scoreboard stall count read back for this core
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_VEC, core_id, &scrb_vpu_per_core), {
+          return err;
+        });
+      #endif
         scrb_alu += scrb_alu_per_core;
         scrb_fpu += scrb_fpu_per_core;
         scrb_lsu += scrb_lsu_per_core;
         scrb_csrs += scrb_csrs_per_core;
         scrb_wctl += scrb_wctl_per_core;
+      #ifdef EXT_V_ENABLE
+        scrb_vpu += scrb_vpu_per_core;
+      #endif
         if (num_cores > 1) {
           uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_csrs_per_core + scrb_wctl_per_core;
+        #ifdef EXT_V_ENABLE
+          scrb_total += scrb_vpu_per_core;
+        #endif
           int scrb_percent_per_core = calcAvgPercent(scrb_stalls_per_core, cycles_per_core);
           fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, csrs=%d%%, wctl=%d%%)\n"
           , core_id
@@ -329,6 +351,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
           , calcAvgPercent(scrb_lsu_per_core, scrb_total)
           , calcAvgPercent(scrb_csrs_per_core, scrb_total)
           , calcAvgPercent(scrb_wctl_per_core, scrb_total)
+        #ifdef EXT_V_ENABLE
+          , calcAvgPercent(scrb_vpu_per_core, scrb_total)
+        #endif
           );
         }
         scrb_stalls += scrb_stalls_per_core;
@@ -555,6 +580,19 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
         });
       }
     } break;
+  #ifdef EXT_V_ENABLE
+    case VX_DCR_MPM_CLASS_VEC: {
+      uint64_t tmp;
+      CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_READS, core_id, &tmp), { return err; });
+			vec_mem_reads += tmp;
+      CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_WRITES, core_id, &tmp), { return err; });
+			vec_mem_writes += tmp;
+      CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_LAT, core_id, &tmp), { return err; });
+			vec_mem_lat += tmp;
+      CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_ST, core_id, &tmp), { return err; });
+			vec_stall_cycles += tmp;
+    } break;
+  #endif
     default:
       break;
     }
@@ -576,6 +614,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
     int ifetch_avg_lat = caclAverage(ifetch_lat, ifetches);
     int load_avg_lat = caclAverage(load_lat, loads);
     uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_csrs + scrb_wctl;
+  #ifdef EXT_V_ENABLE
+    scrb_total += scrb_vpu;
+  #endif
     fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent);
     fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
     fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
@@ -587,6 +628,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
       , calcAvgPercent(scrb_lsu, scrb_total)
       , calcAvgPercent(scrb_csrs, scrb_total)
       , calcAvgPercent(scrb_wctl, scrb_total)
+    #ifdef EXT_V_ENABLE
+      , calcAvgPercent(scrb_vpu, scrb_total)
+    #endif
     );
     fprintf(stream, "PERF: operands stalls=%ld (%d%%)\n", opds_stalls, opds_percent);
     fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
@@ -637,6 +681,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
       fprintf(stream, "PERF: memory bank stalls=%ld (utilization=%d%%)\n", mem_bank_stalls, mem_bank_utilization);
     }
   } break;
+#ifdef EXT_V_ENABLE
+  case VX_DCR_MPM_CLASS_VEC: {
+    vec_mem_reads /= num_cores;
+    vec_mem_writes /= num_cores;
+    vec_mem_lat /= num_cores;
+    vec_stall_cycles /= num_cores;
+    int vec_avg_lat = caclAverage(vec_mem_lat, vec_mem_reads);
+    int vec_stall_cycles_ratio = calcRatio(vec_stall_cycles, total_cycles);
+    fprintf(stream, "PERF: vec memory reads=%ld\n", vec_mem_reads);
+    fprintf(stream, "PERF: vec memory writes=%ld\n", vec_mem_writes);
+    fprintf(stream, "PERF: vec memory latency=%d cycles\n", vec_avg_lat);
+    fprintf(stream, "PERF: vec stalls=%ld (%d%%)\n", vec_stall_cycles, vec_stall_cycles_ratio);
+  } break;
+#endif
   default:
     break;
   }
diff --git a/sim/simx/Makefile b/sim/simx/Makefile
index 83054edc4..9c6743efa 100644
--- a/sim/simx/Makefile
+++ b/sim/simx/Makefile
@@ -22,7 +22,7 @@ SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(
 
 # Add V extension sources
 ifneq ($(findstring -DEXT_V_ENABLE, $(CONFIGS)),)
-  SRCS += $(SRC_DIR)/vpu.cpp
+  SRCS += $(SRC_DIR)/vec_unit.cpp $(SRC_DIR)/vpu.cpp
 endif
 
 # Debugging
diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp
index 65609a9aa..f3a3c09f9 100644
--- a/sim/simx/core.cpp
+++ b/sim/simx/core.cpp
@@ -29,7 +29,8 @@ Core::Core(const SimContext& ctx,
            uint32_t core_id,
            Socket* socket,
            const Arch &arch,
-           const DCRS &dcrs)
+           const DCRS &dcrs
+           )
   : SimObject(ctx, StrFormat("core%d", core_id))
   , icache_req_ports(1, this)
   , icache_rsp_ports(1, this)
@@ -38,6 +39,9 @@ Core::Core(const SimContext& ctx,
   , core_id_(core_id)
   , socket_(socket)
   , arch_(arch)
+#ifdef EXT_V_ENABLE
+  , vec_unit_(VecUnit::Create("vpu", arch))
+#endif
   , emulator_(arch, dcrs, this)
   , ibuffers_(arch.num_warps(), IBUF_SIZE)
   , scoreboard_(arch_)
diff --git a/sim/simx/core.h b/sim/simx/core.h
index 1bd0571bc..1546452d1 100644
--- a/sim/simx/core.h
+++ b/sim/simx/core.h
@@ -27,6 +27,9 @@
 #include "func_unit.h"
 #include "mem_coalescer.h"
 #include "VX_config.h"
+#ifdef EXT_V_ENABLE
+#include "vec_unit.h"
+#endif
 
 namespace vortex {
 
@@ -52,6 +55,9 @@ class Core : public SimObject<Core> {
     uint64_t scrb_sfu;
     uint64_t scrb_csrs;
     uint64_t scrb_wctl;
+  #ifdef EXT_V_ENABLE
+    uint64_t scrb_vpu;
+  #endif
     uint64_t ifetches;
     uint64_t loads;
     uint64_t stores;
@@ -72,6 +78,9 @@ class Core : public SimObject<Core> {
       , scrb_sfu(0)
       , scrb_csrs(0)
       , scrb_wctl(0)
+    #ifdef EXT_V_ENABLE
+      , scrb_vpu(0)
+    #endif
       , ifetches(0)
       , loads(0)
       , stores(0)
@@ -90,7 +99,8 @@ class Core : public SimObject<Core> {
        uint32_t core_id,
        Socket* socket,
        const Arch &arch,
-       const DCRS &dcrs);
+       const DCRS &dcrs
+       );
 
   ~Core();
 
@@ -131,6 +141,12 @@ class Core : public SimObject<Core> {
     return mem_coalescers_.at(idx);
   }
 
+#ifdef EXT_V_ENABLE
+  VecUnit::Ptr& vec_unit() { // exposes this core's vector-unit handle by reference (the Emulator copies it at construction)
+    return vec_unit_;
+  }
+#endif
+
   const PerfStats& perf_stats() const {
     return perf_stats_;
   }
@@ -150,6 +166,10 @@ class Core : public SimObject<Core> {
   Socket* socket_;
   const Arch& arch_;
 
+#ifdef EXT_V_ENABLE
+  VecUnit::Ptr vec_unit_;
+#endif
+
   Emulator emulator_;
 
   std::vector<IBuffer> ibuffers_;
diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp
index ee297279a..b87fe1af6 100644
--- a/sim/simx/emulator.cpp
+++ b/sim/simx/emulator.cpp
@@ -34,7 +34,7 @@ Emulator::warp_t::warp_t(const Arch& arch)
   : ireg_file(arch.num_threads(), std::vector<Word>(MAX_NUM_REGS))
   , freg_file(arch.num_threads(), std::vector<uint64_t>(MAX_NUM_REGS))
 #ifdef EXT_V_ENABLE
-  , vreg_file(MAX_NUM_REGS, std::vector<Byte>(MAX_NUM_REGS))
+  , vreg_file(MAX_NUM_REGS, std::vector<Byte>(VLEN / 8))
 #endif
   , uuid(0)
 {}
@@ -96,6 +96,7 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
     // In future versions, scratchpad size should be fixed to an appropriate value.
     , scratchpad(std::vector<Word>(32 * 32 * 32768))
   #ifdef EXT_V_ENABLE
+    , vec_unit_(core->vec_unit())
     , csrs_(arch.num_warps())
   #endif
 {
@@ -133,6 +134,10 @@ void Emulator::clear() {
     barrier.reset();
   }
 
+#ifdef EXT_V_ENABLE
+  vec_unit_->reset();
+#endif
+
   csr_mscratch_ = startup_arg;
 
   stalled_warps_.reset();
@@ -607,6 +612,18 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
         CSR_READ_64(VX_CSR_MPM_LMEM_BANK_ST, lmem_perf.bank_stalls);
         }
       } break;
+    #ifdef EXT_V_ENABLE
+      case VX_DCR_MPM_CLASS_VEC: {
+        VecUnit::PerfStats vec_perf_stats;
+        vec_perf_stats += vec_unit_->perf_stats();
+        switch (addr) {
+        CSR_READ_64(VX_CSR_MPM_VEC_READS, vec_perf_stats.reads);
+        CSR_READ_64(VX_CSR_MPM_VEC_WRITES, vec_perf_stats.writes);
+        CSR_READ_64(VX_CSR_MPM_VEC_LAT, vec_perf_stats.latency);
+        CSR_READ_64(VX_CSR_MPM_VEC_ST, vec_perf_stats.stalls);
+        }
+      } break;
+    #endif
       default: {
         std::cout << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
         std::abort();
diff --git a/sim/simx/emulator.h b/sim/simx/emulator.h
index 980bc8f8a..8b3d0e644 100644
--- a/sim/simx/emulator.h
+++ b/sim/simx/emulator.h
@@ -19,6 +19,9 @@
 #include <stack>
 #include <mem.h>
 #include "types.h"
+#ifdef EXT_V_ENABLE
+#include "vec_unit.h"
+#endif
 
 namespace vortex {
 
@@ -175,7 +178,10 @@ class Emulator {
   uint32_t mat_size;
   uint32_t tc_size;
   uint32_t tc_num;
+#ifdef EXT_V_ENABLE
+  VecUnit::Ptr vec_unit_;
   std::vector<std::vector<std::unordered_map<uint32_t, uint32_t>>> csrs_;
+#endif
 };
 
 }
diff --git a/sim/simx/func_unit.cpp b/sim/simx/func_unit.cpp
index d33a0ac1c..d100f960e 100644
--- a/sim/simx/func_unit.cpp
+++ b/sim/simx/func_unit.cpp
@@ -366,3 +366,52 @@ void SfuUnit::tick() {
 		input.pop();
 	}
 }
+
+///////////////////////////////////////////////////////////////////////////////
+
+#ifdef EXT_V_ENABLE
+VpuUnit::VpuUnit(const SimContext& ctx, Core* core) // constructs the vector functional unit attached to `core`
+	: FuncUnit(ctx, core, "vpu-unit") // all arguments are forwarded to FuncUnit with the fixed name "vpu-unit"
+{}
+
+void VpuUnit::tick() { // scans each issue slot's input queue and handles the trace at its head
+  for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
+		auto& input = Inputs.at(iw);
+		if (input.empty())
+			continue; // nothing queued on this issue slot
+		//auto& output = Outputs.at(iw); // disabled: no trace is forwarded to an output port yet
+		auto trace = input.front();
+		//int delay = 2; // disabled: no latency is modelled yet
+		switch (trace->vpu_type) { // NOTE(review): no case has a body or a break, so every value reaches std::abort()
+		case VpuType::VSET:
+		case VpuType::VL:
+		case VpuType::VS:
+		case VpuType::ARITHVV:
+		case VpuType::MULVV:
+		case VpuType::DIVVV:
+		case VpuType::ARITHVX:
+		case VpuType::MULVX:
+		case VpuType::DIVVX:
+		case VpuType::ARITHVI:
+		case VpuType::MULVI:
+		case VpuType::DIVVI:
+		case VpuType::ARITHFVV:
+		case VpuType::MULFVV:
+		case VpuType::DIVFVV:
+		case VpuType::ARITHFVX:
+		case VpuType::MULFVX:
+		case VpuType::DIVFVX:
+		case VpuType::ARITHFVI:
+		case VpuType::MULFVI:
+		case VpuType::DIVFVI:
+		default:
+			std::abort(); // terminates the process as soon as any trace is dequeued
+		}
+		DT(3, this->name() << ": op=" << trace->vpu_type << ", " << *trace); // unreachable while the switch above always aborts
+		if (trace->eop && trace->fetch_stall) {
+			core_->resume(trace->wid); // resume the warp when the final (eop) trace had stalled fetch
+		}
+		input.pop();
+	}
+}
+#endif
diff --git a/sim/simx/func_unit.h b/sim/simx/func_unit.h
index 2250d70c5..1354b7fdf 100644
--- a/sim/simx/func_unit.h
+++ b/sim/simx/func_unit.h
@@ -113,4 +113,39 @@ class SfuUnit : public FuncUnit {
 	void tick();
 };
 
+///////////////////////////////////////////////////////////////////////////////
+
+class VpuUnit : public FuncUnit { // functional unit that consumes vector (VPU) instruction traces
+public:
+	VpuUnit(const SimContext& ctx, Core*);
+
+	void tick(); // defined in func_unit.cpp only when EXT_V_ENABLE is set
+};
+
+// Simulate clock cycles depending on instruction type and element width and #lanes
+// VSET = 1 cycle
+// Vector instructions take the same amount of time as ALU instructions.
+// In general there should be fewer instructions overall (hence the SIMD vector speedup).
+// But, each vector instruction is bigger, and # of lanes greatly affects execution speed.
+
+// Whenever we change VL using imm/VSET, we need to keep track of the new VL and SEW.
+// By default, VL is set to MAXVL.
+// After determining VL, we use VL and #lanes in order to determine overall cycle time.
+// For example, for a vector add with VL=4 and #lanes=2, we will probably take 2 cycles,
+// since we can only operate on two elements of the vector each cycle (limited by #lanes).
+// SEW (element width) likely affects the cycle time, we can probably observe
+// ALU operation cycle time in relation to element width to determine this though.
+
+// The RTL implementation has an unroll and accumulate stage.
+// The unroll stage sends vector elements to the appropriate functional unit up to VL,
+// limited by the # lanes available.
+// The accumulate stage deals with combining the results from the functional units,
+// into the destination vector register.
+// Which exact pipeline stage does the VPU unroll the vector (decode or execute)?
+// Which exact pipeline stage does the VPU accumulate results?
+
+// How do vector loads and stores interact with the cache?
+// How about loading and storing scalars in vector registers?
+// How does striding affect loads and stores?
+
 }
\ No newline at end of file
diff --git a/sim/simx/instr_trace.h b/sim/simx/instr_trace.h
index 5ed98d265..f8f305de0 100644
--- a/sim/simx/instr_trace.h
+++ b/sim/simx/instr_trace.h
@@ -43,6 +43,13 @@ struct SFUTraceData : public ITraceData {
   SFUTraceData(Word arg1, Word arg2) : arg1(arg1), arg2(arg2) {}
 };
 
+struct VPUTraceData : public ITraceData { // trace payload holding two Word values, same layout as SFUTraceData
+  using Ptr = std::shared_ptr<VPUTraceData>;
+  Word arg1; // NOTE(review): the meaning of arg1/arg2 is not established in this patch — confirm with the code that fills them
+  Word arg2;
+  VPUTraceData(Word arg1, Word arg2) : arg1(arg1), arg2(arg2) {} // stores both values unchanged
+};
+
 struct instr_trace_t {
 public:
   struct reg_t {
@@ -77,7 +84,10 @@ struct instr_trace_t {
     AluType  alu_type;
     FpuType  fpu_type;
     SfuType  sfu_type;
-    TCUType  tcu_type; 
+  #ifdef EXT_V_ENABLE
+    VpuType  vpu_type;
+  #endif
+    TCUType  tcu_type;
   };
 
   ITraceData::Ptr data;
diff --git a/sim/simx/types.h b/sim/simx/types.h
index 220d4b645..b86aec2db 100644
--- a/sim/simx/types.h
+++ b/sim/simx/types.h
@@ -269,6 +269,71 @@ inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
 
 ///////////////////////////////////////////////////////////////////////////////
 
+// Kinds of vector-unit operations that an instruction trace can carry.
+enum class VpuType {
+  VSET,     // Set vector length
+  VL,       // Vector load
+  VS,       // Vector store
+
+  // ALU OPERATIONS
+  ARITHVV,  // Vector-vector
+  MULVV,
+  DIVVV,
+
+  ARITHVX,  // Vector-scalar
+  MULVX,
+  DIVVX,
+
+  ARITHVI,  // Vector-immediate
+  MULVI,
+  DIVVI,
+
+  // FPU OPERATIONS
+  ARITHFVV,  // Vector-vector
+  MULFVV,
+  DIVFVV,
+
+  ARITHFVX,  // Vector-scalar
+  MULFVX,
+  DIVFVX,
+
+  ARITHFVI,  // Vector-immediate
+  MULFVI,
+  DIVFVI
+};
+
+// Streams the enumerator name of `type` and returns `os`. Every VpuType
+// enumerator is handled; only a value outside the enum reaches the assert.
+inline std::ostream &operator<<(std::ostream &os, const VpuType& type) {
+  switch (type) {
+  case VpuType::VSET:     os << "VSET"; break;
+  case VpuType::VL:       os << "VL"; break;
+  case VpuType::VS:       os << "VS"; break;
+  case VpuType::ARITHVV:  os << "ARITHVV"; break;
+  case VpuType::MULVV:    os << "MULVV"; break;
+  case VpuType::DIVVV:    os << "DIVVV"; break;
+  case VpuType::ARITHVX:  os << "ARITHVX"; break;
+  case VpuType::MULVX:    os << "MULVX"; break;
+  case VpuType::DIVVX:    os << "DIVVX"; break;
+  case VpuType::ARITHVI:  os << "ARITHVI"; break;
+  case VpuType::MULVI:    os << "MULVI"; break;
+  case VpuType::DIVVI:    os << "DIVVI"; break;
+  case VpuType::ARITHFVV: os << "ARITHFVV"; break;
+  case VpuType::MULFVV:   os << "MULFVV"; break;
+  case VpuType::DIVFVV:   os << "DIVFVV"; break;
+  case VpuType::ARITHFVX: os << "ARITHFVX"; break;
+  case VpuType::MULFVX:   os << "MULFVX"; break;
+  case VpuType::DIVFVX:   os << "DIVFVX"; break;
+  case VpuType::ARITHFVI: os << "ARITHFVI"; break;
+  case VpuType::MULFVI:   os << "MULFVI"; break;
+  case VpuType::DIVFVI:   os << "DIVFVI"; break;
+  default: assert(false);
+  }
+  return os;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
 enum class ArbiterType {
   Priority,
   RoundRobin
diff --git a/sim/simx/vec_unit.cpp b/sim/simx/vec_unit.cpp
new file mode 100644
index 000000000..86c71cd8a
--- /dev/null
+++ b/sim/simx/vec_unit.cpp
@@ -0,0 +1,87 @@
+#ifdef EXT_V_ENABLE
+
+#include "vec_unit.h"
+
+using namespace vortex;
+
+class VecUnit::Impl { // private implementation behind VecUnit; at this stage it only owns the perf counters
+public:
+    Impl(VecUnit* simobject, const Arch& /*arch*/) // `arch` is accepted but not used yet
+        : simobject_(simobject)
+    {
+        this->clear();
+    }
+
+    ~Impl() {}
+
+    void clear() { // resets the performance counters; the vector state members are not touched here
+        perf_stats_ = PerfStats();
+    }
+
+    void tick() { // intentionally empty: no timing behaviour is modelled yet
+    }
+
+/*
+    void load(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
+    }
+
+    void store(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
+    }
+
+    void execute(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata, std::vector<reg_data_t> &rddata) {
+    }
+*/
+
+    const PerfStats& perf_stats() const { // read-only access to the accumulated counters
+        return perf_stats_;
+    }
+
+private:
+
+    VecUnit* simobject_;                          // back-pointer to the owning VecUnit (not owned)
+    std::vector<std::vector<Byte>>  vreg_file_;   // starts empty; nothing in this class sizes it yet
+    vtype_t                         vtype_{};     // zero-initialized so the members never hold indeterminate values
+    uint32_t                        vl_ = 0;      // zero until code that sets the vector length is added
+    Word                            vlmax_ = 0;   // zero until code that sets the maximum vector length is added
+    PerfStats perf_stats_;
+};
+
+VecUnit::VecUnit(const SimContext& ctx, // constructs the simulation object and its private Impl
+                 const char* name,
+                 const Arch &arch)
+    : SimObject<VecUnit>(ctx, name)
+    , Input(this)
+    , Output(this)
+    , impl_(new Impl(this, arch)) // raw owning pointer; released by the destructor
+{}
+
+VecUnit::~VecUnit() {
+    delete impl_; // frees the Impl allocated in the constructor
+}
+
+void VecUnit::reset() {
+    impl_->clear(); // Impl::clear() resets the performance counters
+}
+
+void VecUnit::tick() {
+    impl_->tick(); // forwards to Impl::tick(), which is currently empty
+}
+
+/*
+void VecUnit::load(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
+    return impl_->load(instr, wid, rsdata);
+}
+
+void VecUnit::store(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
+    return impl_->store(instr, wid, rsdata);
+}
+
+void VecUnit::execute(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata, std::vector<reg_data_t> &rddata) {
+    return impl_->execute(instr, wid, rsdata, rddata);
+}
+*/
+
+const VecUnit::PerfStats& VecUnit::perf_stats() const {
+    return impl_->perf_stats(); // reference to the counters held by Impl; valid while this VecUnit is alive
+}
+#endif
\ No newline at end of file
diff --git a/sim/simx/vec_unit.h b/sim/simx/vec_unit.h
new file mode 100644
index 000000000..8fbb95d3f
--- /dev/null
+++ b/sim/simx/vec_unit.h
@@ -0,0 +1,87 @@
+#ifdef EXT_V_ENABLE
+#pragma once
+
+#include "arch.h"
+#include "instr.h"
+#include "instr_trace.h"
+#include <simobject.h>
+#include "types.h"
+
+namespace vortex {
+
+struct vtype_t { // NOTE(review): field names mirror the RISC-V V `vtype` CSR fields — confirm the semantics match the spec
+  uint32_t vill;
+  uint32_t vma;
+  uint32_t vta;
+  uint32_t vsew;
+  uint32_t vlmul;
+};
+
+union reg_data_t { // one register value that can be viewed through any of the integer/float members below
+  Word     u;
+  WordI    i;
+  WordF    f;
+  float    f32;
+  double   f64;
+  uint32_t u32;
+  uint64_t u64;
+  int32_t  i32;
+  int64_t  i64;
+};
+
+class VecUnit : public SimObject<VecUnit> { // simulation object for the vector unit; behaviour lives in the private Impl
+public:
+  struct PerfStats { // counters that Emulator::get_csr returns for the VX_CSR_MPM_VEC_* addresses
+    uint64_t reads;
+    uint64_t writes;
+    uint64_t latency;
+    uint64_t stalls;
+
+    PerfStats() // every counter starts at zero
+      : reads(0)
+      , writes(0)
+      , latency(0)
+      , stalls(0)
+    {}
+
+    PerfStats& operator+=(const PerfStats& rhs) { // field-wise accumulation; returns *this
+      this->reads   += rhs.reads;
+      this->writes  += rhs.writes;
+      this->latency += rhs.latency;
+      this->stalls  += rhs.stalls;
+      return *this;
+    }
+  };
+
+  std::vector<SimPort<MemReq>> MemReqs; // NOTE(review): the constructor in vec_unit.cpp does not populate these vectors — confirm who sizes them
+  std::vector<SimPort<MemRsp>> MemRsps;
+
+  SimPort<instr_trace_t*> Input;  // both ports are constructed with `this` in the constructor
+  SimPort<instr_trace_t*> Output;
+
+  VecUnit(const SimContext& ctx,
+          const char* name,
+          const Arch &arch);
+
+  ~VecUnit(); // deletes impl_
+
+  void reset(); // forwards to Impl::clear()
+
+  void tick(); // forwards to Impl::tick()
+
+  void load(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata); // NOTE(review): load/store/execute are only declared; their bodies in vec_unit.cpp are commented out
+
+  void store(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata);
+
+  void execute(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata, std::vector<reg_data_t> &rddata);
+
+  const PerfStats& perf_stats() const; // read-only view of the counters held by Impl
+
+private:
+
+  class Impl;  // defined in vec_unit.cpp
+  Impl* impl_; // raw owning pointer: allocated in the constructor, deleted in the destructor
+};
+
+}
+#endif
\ No newline at end of file