From 929ef1b6e2d1eb726f67bf6c30edb8bcbf7896d2 Mon Sep 17 00:00:00 2001
From: MichaelJSr <miky.srouji@gmail.com>
Date: Mon, 13 Jan 2025 16:45:13 -0800
Subject: [PATCH 1/4] Remove unused EXTV code, clean up code, add pragma once
 to vpu.h

---
 sim/simx/Makefile                             |    2 +-
 sim/simx/arch.h                               |    6 -
 sim/simx/emulator.cpp                         |   12 +-
 sim/simx/execute.cpp                          |   12 +-
 sim/simx/{execute_v.cpp => vpu.cpp}           | 2391 +----------------
 sim/simx/vpu.h                                | 2391 +++++++++++++++++
 tests/riscv/riscv-vector-tests/README         |    2 +-
 tests/riscv/riscv-vector-tests/run-test.sh.in |    3 -
 8 files changed, 2399 insertions(+), 2420 deletions(-)
 rename sim/simx/{execute_v.cpp => vpu.cpp} (55%)
 create mode 100644 sim/simx/vpu.h
diff --git a/sim/simx/Makefile b/sim/simx/Makefile
index d3e726bbe..4b0fa410f 100644
--- a/sim/simx/Makefile
+++ b/sim/simx/Makefile
@@ -22,7 +22,7 @@ SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(
 
 # Add V extension sources
 ifneq ($(findstring -DEXT_V_ENABLE, $(CONFIGS)),)
-  SRCS += $(SRC_DIR)/execute_v.cpp
+  SRCS += $(SRC_DIR)/vpu.cpp
 endif
 
 # Debugging
diff --git a/sim/simx/arch.h b/sim/simx/arch.h
index d68345db6..6becf5c91 100644
--- a/sim/simx/arch.h
+++ b/sim/simx/arch.h
@@ -29,7 +29,6 @@ class Arch {
   uint16_t num_cores_;
   uint16_t num_clusters_;
   uint16_t socket_size_;
-  uint16_t vsize_;
   uint16_t num_barriers_;
   uint64_t local_mem_base_;
 
@@ -40,7 +39,6 @@ class Arch {
     , num_cores_(num_cores)
     , num_clusters_(NUM_CLUSTERS)
     , socket_size_(SOCKET_SIZE)
-    , vsize_(VLEN / 8)
     , num_barriers_(NUM_BARRIERS)
     , local_mem_base_(LMEM_BASE_ADDR)
   {}
@@ -73,10 +71,6 @@ class Arch {
     return socket_size_;
   }
 
-  uint16_t vsize() const {
-    return vsize_;
-  }
-
 };
 
 }
\ No newline at end of file
diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp
index b834a87f2..4bb94915e 100644
--- a/sim/simx/emulator.cpp
+++ b/sim/simx/emulator.cpp
@@ -33,7 +33,7 @@ using namespace vortex;
 Emulator::warp_t::warp_t(const Arch& arch)
   : ireg_file(arch.num_threads(), std::vector<Word>(MAX_NUM_REGS))
   , freg_file(arch.num_threads(), std::vector<uint64_t>(MAX_NUM_REGS))
-  , vreg_file(MAX_NUM_REGS, std::vector<Byte>(arch.vsize()))
+  , vreg_file(MAX_NUM_REGS, std::vector<Byte>(MAX_NUM_REGS))
   , uuid(0)
 {}
 
@@ -77,16 +77,6 @@ void Emulator::warp_t::clear(uint64_t startup_addr) {
     #endif
     }
   }
-
-  for (auto& reg_file : this->vreg_file) {
-    for (auto& reg : reg_file) {
-    #ifndef NDEBUG
-      reg = 0;
-    #else
-      reg = std::rand();
-    #endif
-    }
-  }
 }
 
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp
index 436d43486..86623a00c 100644
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
@@ -932,7 +932,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
     for (uint32_t t = thread_start; t < num_threads; ++t) {
       if (!warp.tmask.test(t))
         continue;
-      uint32_t frm = (func3 == 0x7) ? this->get_csr(VX_CSR_FRM, t, wid) : func3;
+      uint32_t frm = this->get_fpu_rm(func3, t, wid);
       uint32_t fflags = 0;
       switch (func7) {
       case 0x00: { // RV32F: FADD.S
@@ -1247,10 +1247,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
         break;
       }
       }
-      if (fflags) {
-        this->set_csr(VX_CSR_FCSR, this->get_csr(VX_CSR_FCSR, t, wid) | fflags, t, wid);
-        this->set_csr(VX_CSR_FFLAGS, this->get_csr(VX_CSR_FFLAGS, t, wid) | fflags, t, wid);
-      }
+      this->update_fcrs(fflags, t, wid);
     }
     rd_write = true;
     break;
@@ -1304,10 +1301,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
       default:
         break;
       }
-      if (fflags) {
-        this->set_csr(VX_CSR_FCSR, this->get_csr(VX_CSR_FCSR, t, wid) | fflags, t, wid);
-        this->set_csr(VX_CSR_FFLAGS, this->get_csr(VX_CSR_FFLAGS, t, wid) | fflags, t, wid);
-      }
+      this->update_fcrs(fflags, t, wid);
     }
     rd_write = true;
     break;
diff --git a/sim/simx/execute_v.cpp b/sim/simx/vpu.cpp
similarity index 55%
rename from sim/simx/execute_v.cpp
rename to sim/simx/vpu.cpp
index d14338024..63ed8fcc2 100644
--- a/sim/simx/execute_v.cpp
+++ b/sim/simx/vpu.cpp
@@ -1,5 +1,5 @@
 // This is a fork of https://github.com/troibe/vortex/tree/simx-v2-vector
-// The purpose of this fork is to make the simx-v2-vector up to date with master
+// The purpose of this fork is to make simx-v2-vector up to date with master
 // Thanks to Troibe for his amazing work
 
 #include "emulator.h"
@@ -10,2397 +10,10 @@
 #include <math.h>
 #include <rvfloats.h>
 #include <stdlib.h>
+#include "vpu.h"
 
 using namespace vortex;
 
-template <typename T, typename R>
-class Add {
-public:
-  static R apply(T first, T second, R) {
-    return (R)first + (R)second;
-  }
-  static std::string name() { return "Add"; }
-};
-
-template <typename T, typename R>
-class Sub {
-public:
-  static R apply(T first, T second, R) {
-    return (R)second - (R)first;
-  }
-  static std::string name() { return "Sub"; }
-};
-
-template <typename T, typename R>
-class Adc {
-public:
-  static R apply(T first, T second, R third) {
-    return (R)first + (R)second + third;
-  }
-  static std::string name() { return "Adc"; }
-};
-
-template <typename T, typename R>
-class Madc {
-public:
-  static R apply(T first, T second, R third) {
-    return ((R)first + (R)second + third) > (R)std::numeric_limits<T>::max();
-  }
-  static std::string name() { return "Madc"; }
-};
-
-template <typename T, typename R>
-class Sbc {
-public:
-  static R apply(T first, T second, R third) {
-    return (R)second - (R)first - third;
-  }
-  static std::string name() { return "Sbc"; }
-};
-
-template <typename T, typename R>
-class Msbc {
-public:
-  static R apply(T first, T second, R third) {
-    return (R)second < ((R)first + third);
-  }
-  static std::string name() { return "Msbc"; }
-};
-
-template <typename T, typename R>
-class Ssub {
-public:
-  static R apply(T first, T second, uint32_t, uint32_t &vxsat_) {
-    // rounding mode is not relevant for this operation
-    T unclippedResult = second - first;
-    R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits<R>::min(), (T)std::numeric_limits<R>::max());
-    vxsat_ |= clippedResult != unclippedResult;
-    return clippedResult;
-  }
-  static std::string name() { return "Ssub"; }
-};
-
-template <typename T, typename R>
-class Ssubu {
-public:
-  static R apply(T first, T second, uint32_t, uint32_t &vxsat_) {
-    // rounding mode is not relevant for this operation
-    if (first > second) {
-      vxsat_ = true;
-      return 0;
-    } else {
-      vxsat_ = false;
-      return second - first;
-    }
-  }
-  static std::string name() { return "Ssubu"; }
-};
-
-template <typename T, typename R>
-class Sadd {
-public:
-  static R apply(T first, T second, uint32_t, uint32_t &vxsat_) {
-    // rounding mode is not relevant for this operation
-    T unclippedResult = second + first;
-    R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits<R>::min(), (T)std::numeric_limits<R>::max());
-    vxsat_ |= clippedResult != unclippedResult;
-    return clippedResult;
-  }
-  static std::string name() { return "Sadd"; }
-};
-
-template <typename T, typename R>
-class Rsub {
-public:
-  static R apply(T first, T second, R) {
-    return first - second;
-  }
-  static std::string name() { return "Rsub"; }
-};
-
-template <typename T, typename R>
-class Div {
-public:
-  static R apply(T first, T second, R) {
-    // logic taken from scalar div
-    if (first == 0) {
-      return -1;
-    } else if (second == std::numeric_limits<T>::min() && first == T(-1)) {
-      return second;
-    } else {
-      return (R)second / (R)first;
-    }
-  }
-  static std::string name() { return "Div"; }
-};
-
-template <typename T, typename R>
-class Rem {
-public:
-  static R apply(T first, T second, R) {
-    // logic taken from scalar rem
-    if (first == 0) {
-      return second;
-    } else if (second == std::numeric_limits<T>::min() && first == T(-1)) {
-      return 0;
-    } else {
-      return (R)second % (R)first;
-    }
-  }
-  static std::string name() { return "Rem"; }
-};
-
-template <typename T, typename R>
-class Mul {
-public:
-  static R apply(T first, T second, R) {
-    return (R)first * (R)second;
-  }
-  static std::string name() { return "Mul"; }
-};
-
-template <typename T, typename R>
-class Mulsu {
-public:
-  static R apply(T first, T second, R) {
-    R first_ext = zext((R)first, (sizeof(T) * 8));
-    return first_ext * (R)second;
-  }
-  static std::string name() { return "Mulsu"; }
-};
-
-template <typename T, typename R>
-class Mulh {
-public:
-  static R apply(T first, T second, R) {
-    __int128_t first_ext = sext((__int128_t)first, (sizeof(T) * 8));
-    __int128_t second_ext = sext((__int128_t)second, (sizeof(T) * 8));
-    return (first_ext * second_ext) >> (sizeof(T) * 8);
-  }
-  static std::string name() { return "Mulh"; }
-};
-
-template <typename T, typename R>
-class Mulhsu {
-public:
-  static R apply(T first, T second, R) {
-    __int128_t first_ext = zext((__int128_t)first, (sizeof(T) * 8));
-    __int128_t second_ext = sext((__int128_t)second, (sizeof(T) * 8));
-    return (first_ext * second_ext) >> (sizeof(T) * 8);
-  }
-  static std::string name() { return "Mulhsu"; }
-};
-
-template <typename T, typename R>
-class Mulhu {
-public:
-  static R apply(T first, T second, R) {
-    return ((__uint128_t)first * (__uint128_t)second) >> (sizeof(T) * 8);
-  }
-  static std::string name() { return "Mulhu"; }
-};
-
-template <typename T, typename R>
-class Madd {
-public:
-  static R apply(T first, T second, R third) {
-    return ((R)first * third) + (R)second;
-  }
-  static std::string name() { return "Madd"; }
-};
-
-template <typename T, typename R>
-class Nmsac {
-public:
-  static R apply(T first, T second, R third) {
-    return -((R)first * (R)second) + third;
-  }
-  static std::string name() { return "Nmsac"; }
-};
-
-template <typename T, typename R>
-class Macc {
-public:
-  static R apply(T first, T second, R third) {
-    return ((R)first * (R)second) + third;
-  }
-  static std::string name() { return "Macc"; }
-};
-
-template <typename T, typename R>
-class Maccsu {
-public:
-  static R apply(T first, T second, R third) {
-    R first_ext = sext((R)first, (sizeof(T) * 8));
-    R second_ext = zext((R)second, (sizeof(T) * 8));
-    return (first_ext * second_ext) + third;
-  }
-  static std::string name() { return "Maccsu"; }
-};
-
-template <typename T, typename R>
-class Maccus {
-public:
-  static R apply(T first, T second, R third) {
-    R first_ext = zext((R)first, (sizeof(T) * 8));
-    R second_ext = sext((R)second, (sizeof(T) * 8));
-    return (first_ext * second_ext) + third;
-  }
-  static std::string name() { return "Maccus"; }
-};
-
-template <typename T, typename R>
-class Nmsub {
-public:
-  static R apply(T first, T second, R third) {
-    return -((R)first * third) + (R)second;
-  }
-  static std::string name() { return "Nmsub"; }
-};
-
-template <typename T, typename R>
-class Min {
-public:
-  static R apply(T first, T second, R) {
-    return std::min(first, second);
-  }
-  static std::string name() { return "Min"; }
-};
-
-template <typename T, typename R>
-class Max {
-public:
-  static R apply(T first, T second, R) {
-    return std::max(first, second);
-  }
-  static std::string name() { return "Max"; }
-};
-
-template <typename T, typename R>
-class And {
-public:
-  static R apply(T first, T second, R) {
-    return first & second;
-  }
-  static std::string name() { return "And"; }
-};
-
-template <typename T, typename R>
-class Or {
-public:
-  static R apply(T first, T second, R) {
-    return first | second;
-  }
-  static std::string name() { return "Or"; }
-};
-
-template <typename T, typename R>
-class Xor {
-public:
-  static R apply(T first, T second, R) {
-    return first ^ second;
-  }
-  static std::string name() { return "Xor"; }
-};
-
-template <typename T, typename R>
-class Sll {
-public:
-  static R apply(T first, T second, R) {
-    // Only the low lg2(SEW) bits of the shift-amount value are used to control the shift amount.
-    return second << (first & (sizeof(T) * 8 - 1));
-  }
-  static std::string name() { return "Sll"; }
-};
-
-template <typename T, typename R>
-bool bitAt(T value, R pos, R negOffset) {
-  R offsetPos = pos - negOffset;
-  return pos >= negOffset && ((value >> offsetPos) & 0x1);
-}
-
-template <typename T, typename R>
-bool anyBitUpTo(T value, R to, R negOffset) {
-  R offsetTo = to - negOffset;
-  return to >= negOffset && (value & (((R)1 << (offsetTo + 1)) - 1));
-}
-
-template <typename T, typename R>
-bool roundBit(T value, R shiftDown, uint32_t vxrm) {
-  switch (vxrm) {
-  case 0: // round-to-nearest-up
-    return bitAt(value, shiftDown, (R)1);
-  case 1: // round-to-nearest-even
-    return bitAt(value, shiftDown, (R)1) && (anyBitUpTo(value, shiftDown, (R)2) || bitAt(value, shiftDown, (R)0));
-  case 2: // round-down (truncate)
-    return 0;
-  case 3: // round-to-odd
-    return !bitAt(value, shiftDown, (R)0) && anyBitUpTo(value, shiftDown, (R)1);
-  default:
-    std::cout << "Roundoff - invalid value for vxrm: " << vxrm << std::endl;
-    std::abort();
-  }
-}
-
-template <typename T, typename R>
-class SrlSra {
-public:
-  static R apply(T first, T second, R) {
-    // Only the low lg2(SEW) bits of the shift-amount value are used to control the shift amount.
-    return second >> (first & (sizeof(T) * 8 - 1));
-  }
-  static R apply(T first, T second, uint32_t vxrm, uint32_t) {
-    // Saturation is not relevant for this operation
-    // Only the low lg2(SEW) bits of the shift-amount value are used to control the shift amount.
-    T firstValid = first & (sizeof(T) * 8 - 1);
-    return apply(firstValid, second, 0) + roundBit(second, firstValid, vxrm);
-  }
-  static std::string name() { return "SrlSra"; }
-};
-
-template <typename T, typename R>
-class Aadd {
-public:
-  static R apply(T first, T second, uint32_t vxrm, uint32_t) {
-    // Saturation is not relevant for this operation
-    T sum = second + first;
-    return (sum >> 1) + roundBit(sum, 1, vxrm);
-  }
-  static std::string name() { return "Aadd"; }
-};
-
-template <typename T, typename R>
-class Asub {
-public:
-  static R apply(T first, T second, uint32_t vxrm, uint32_t) {
-    // Saturation is not relevant for this operation
-    T difference = second - first;
-    return (difference >> 1) + roundBit(difference, 1, vxrm);
-  }
-  static std::string name() { return "Asub"; }
-};
-
-template <typename T, typename R>
-class Eq {
-public:
-  static R apply(T first, T second, R) {
-    return first == second;
-  }
-  static std::string name() { return "Eq"; }
-};
-
-template <typename T, typename R>
-class Ne {
-public:
-  static R apply(T first, T second, R) {
-    return first != second;
-  }
-  static std::string name() { return "Ne"; }
-};
-
-template <typename T, typename R>
-class Lt {
-public:
-  static R apply(T first, T second, R) {
-    return first > second;
-  }
-  static std::string name() { return "Lt"; }
-};
-
-template <typename T, typename R>
-class Le {
-public:
-  static R apply(T first, T second, R) {
-    return first >= second;
-  }
-  static std::string name() { return "Le"; }
-};
-
-template <typename T, typename R>
-class Gt {
-public:
-  static R apply(T first, T second, R) {
-    return first < second;
-  }
-  static std::string name() { return "Gt"; }
-};
-
-template <typename T, typename R>
-class AndNot {
-public:
-  static R apply(T first, T second, R) {
-    return second & ~first;
-  }
-  static std::string name() { return "AndNot"; }
-};
-
-template <typename T, typename R>
-class OrNot {
-public:
-  static R apply(T first, T second, R) {
-    return second | ~first;
-  }
-  static std::string name() { return "OrNot"; }
-};
-
-template <typename T, typename R>
-class Nand {
-public:
-  static R apply(T first, T second, R) {
-    return ~(second & first);
-  }
-  static std::string name() { return "Nand"; }
-};
-
-template <typename T, typename R>
-class Mv {
-public:
-  static R apply(T first, T, R) {
-    return first;
-  }
-  static std::string name() { return "Mv"; }
-};
-
-template <typename T, typename R>
-class Nor {
-public:
-  static R apply(T first, T second, R) {
-    return ~(second | first);
-  }
-  static std::string name() { return "Nor"; }
-};
-
-template <typename T, typename R>
-class Xnor {
-public:
-  static R apply(T first, T second, R) {
-    return ~(second ^ first);
-  }
-  static std::string name() { return "Xnor"; }
-};
-
-template <typename T, typename R>
-class Fadd {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(R) == 4) {
-      return rv_fadd_s(first, second, frm, &fflags);
-    } else if (sizeof(R) == 8) {
-      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first);
-      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
-      return rv_fadd_d(first_d, second_d, frm, &fflags);
-    } else {
-      std::cout << "Fadd only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fadd"; }
-};
-
-template <typename T, typename R>
-class Fsub {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(R) == 4) {
-      return rv_fsub_s(second, first, frm, &fflags);
-    } else if (sizeof(R) == 8) {
-      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first);
-      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
-      return rv_fsub_d(second_d, first_d, frm, &fflags);
-    } else {
-      std::cout << "Fsub only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fsub"; }
-};
-
-template <typename T, typename R>
-class Fmacc {
-public:
-  static R apply(T first, T second, R third) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(R) == 4) {
-      return rv_fmadd_s(first, second, third, frm, &fflags);
-    } else if (sizeof(R) == 8) {
-      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first);
-      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
-      return rv_fmadd_d(first_d, second_d, third, frm, &fflags);
-    } else {
-      std::cout << "Fmacc only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fmacc"; }
-};
-
-template <typename T, typename R>
-class Fnmacc {
-public:
-  static R apply(T first, T second, R third) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(R) == 4) {
-      return rv_fnmadd_s(first, second, third, frm, &fflags);
-    } else if (sizeof(R) == 8) {
-      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first);
-      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
-      return rv_fnmadd_d(first_d, second_d, third, frm, &fflags);
-    } else {
-      std::cout << "Fnmacc only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fnmacc"; }
-};
-
-template <typename T, typename R>
-class Fmsac {
-public:
-  static R apply(T first, T second, R third) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(R) == 4) {
-      return rv_fmadd_s(first, second, rv_fsgnjn_s(third, third), frm, &fflags);
-    } else if (sizeof(R) == 8) {
-      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first);
-      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
-      return rv_fmadd_d(first_d, second_d, rv_fsgnjn_d(third, third), frm, &fflags);
-    } else {
-      std::cout << "Fmsac only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fmsac"; }
-};
-
-template <typename T, typename R>
-class Fnmsac {
-public:
-  static R apply(T first, T second, R third) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(R) == 4) {
-      return rv_fnmadd_s(first, second, rv_fsgnjn_s(third, third), frm, &fflags);
-    } else if (sizeof(R) == 8) {
-      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first);
-      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
-      return rv_fnmadd_d(first_d, second_d, rv_fsgnjn_d(third, third), frm, &fflags);
-    } else {
-      std::cout << "Fnmsac only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fnmsac"; }
-};
-
-template <typename T, typename R>
-class Fmadd {
-public:
-  static R apply(T first, T second, R third) {
-    if (sizeof(T) == 4 || sizeof(T) == 8) {
-      return Fmacc<T, R>::apply(first, third, second);
-    } else {
-      std::cout << "Fmadd only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fmadd"; }
-};
-
-template <typename T, typename R>
-class Fnmadd {
-public:
-  static R apply(T first, T second, R third) {
-    if (sizeof(T) == 4 || sizeof(T) == 8) {
-      return Fnmacc<T, R>::apply(first, third, second);
-    } else {
-      std::cout << "Fnmadd only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fnmadd"; }
-};
-
-template <typename T, typename R>
-class Fmsub {
-public:
-  static R apply(T first, T second, R third) {
-    if (sizeof(T) == 4 || sizeof(T) == 8) {
-      return Fmsac<T, R>::apply(first, third, second);
-    } else {
-      std::cout << "Fmsub only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fmsub"; }
-};
-
-template <typename T, typename R>
-class Fnmsub {
-public:
-  static R apply(T first, T second, R third) {
-    if (sizeof(T) == 4 || sizeof(T) == 8) {
-      return Fnmsac<T, R>::apply(first, third, second);
-    } else {
-      std::cout << "Fnmsub only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fnmsub"; }
-};
-
-template <typename T, typename R>
-class Fmin {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring rounding modes for now
-    uint32_t fflags = 0;
-    if (sizeof(T) == 4) {
-      return rv_fmin_s(first, second, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_fmin_d(first, second, &fflags);
-    } else {
-      std::cout << "Fmin only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fmin"; }
-};
-
-template <typename T, typename R>
-class Fmax {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring rounding modes for now
-    uint32_t fflags = 0;
-    if (sizeof(T) == 4) {
-      return rv_fmax_s(first, second, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_fmax_d(first, second, &fflags);
-    } else {
-      std::cout << "Fmax only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fmax"; }
-};
-
-template <typename T, typename R>
-class Fsgnj {
-public:
-  static R apply(T first, T second, R) {
-    if (sizeof(T) == 4) {
-      return rv_fsgnj_s(second, first);
-    } else if (sizeof(T) == 8) {
-      return rv_fsgnj_d(second, first);
-    } else {
-      std::cout << "Fsgnj only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fsgnj"; }
-};
-
-template <typename T, typename R>
-class Fsgnjn {
-public:
-  static R apply(T first, T second, R) {
-    if (sizeof(T) == 4) {
-      return rv_fsgnjn_s(second, first);
-    } else if (sizeof(T) == 8) {
-      return rv_fsgnjn_d(second, first);
-    } else {
-      std::cout << "Fsgnjn only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fsgnjn"; }
-};
-
-template <typename T, typename R>
-class Fsgnjx {
-public:
-  static R apply(T first, T second, R) {
-    if (sizeof(T) == 4) {
-      return rv_fsgnjx_s(second, first);
-    } else if (sizeof(T) == 8) {
-      return rv_fsgnjx_d(second, first);
-    } else {
-      std::cout << "Fsgnjx only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fsgnjx"; }
-};
-
-template <typename T, typename R>
-class Fcvt {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(T) == 4) {
-      switch (first) {
-      case 0b00000: // vfcvt.xu.f.v
-        return rv_ftou_s(second, frm, &fflags);
-      case 0b00001: // vfcvt.x.f.v
-        return rv_ftoi_s(second, frm, &fflags);
-      case 0b00010: // vfcvt.f.xu.v
-        return rv_utof_s(second, frm, &fflags);
-      case 0b00011: // vfcvt.f.x.v
-        return rv_itof_s(second, frm, &fflags);
-      case 0b00110: // vfcvt.rtz.xu.f.v
-        return rv_ftou_s(second, 1, &fflags);
-      case 0b00111: // vfcvt.rtz.x.f.v
-        return rv_ftoi_s(second, 1, &fflags);
-      case 0b01000: // vfwcvt.xu.f.v
-        return rv_ftolu_s(second, frm, &fflags);
-      case 0b01001: // vfwcvt.x.f.v
-        return rv_ftol_s(second, frm, &fflags);
-      case 0b01010: // vfwcvt.f.xu.v
-        return rv_utof_d(second, frm, &fflags);
-      case 0b01011: // vfwcvt.f.x.v
-        return rv_itof_d(second, frm, &fflags);
-      case 0b01100: // vfwcvt.f.f.v
-        return rv_ftod(second);
-      case 0b01110: // vfwcvt.rtz.xu.f.v
-        return rv_ftolu_s(second, 1, &fflags);
-      case 0b01111: // vfwcvt.rtz.x.f.v
-        return rv_ftol_s(second, 1, &fflags);
-      default:
-        std::cout << "Fcvt has unsupported value for first: " << first << std::endl;
-        std::abort();
-      }
-    } else if (sizeof(T) == 8) {
-      switch (first) {
-      case 0b00000: // vfcvt.xu.f.v
-        return rv_ftolu_d(second, frm, &fflags);
-      case 0b00001: // vfcvt.x.f.v
-        return rv_ftol_d(second, frm, &fflags);
-      case 0b00010: // vfcvt.f.xu.v
-        return rv_lutof_d(second, frm, &fflags);
-      case 0b00011: // vfcvt.f.x.v
-        return rv_ltof_d(second, frm, &fflags);
-      case 0b00110: // vfcvt.rtz.xu.f.v
-        return rv_ftolu_d(second, 1, &fflags);
-      case 0b00111: // vfcvt.rtz.x.f.v
-        return rv_ftol_d(second, 1, &fflags);
-      case 0b01000: // vfwcvt.xu.f.v
-      case 0b01001: // vfwcvt.x.f.v
-      case 0b01010: // vfwcvt.f.xu.v
-      case 0b01011: // vfwcvt.f.x.v
-      case 0b01100: // vfwcvt.f.f.v
-      case 0b01110: // vfwcvt.rtz.xu.f.v
-      case 0b01111: // vfwcvt.rtz.x.f.v
-        std::cout << "Fwcvt only supports f32" << std::endl;
-        std::abort();
-      default:
-        std::cout << "Fcvt has unsupported value for first: " << first << std::endl;
-        std::abort();
-      }
-    } else {
-      std::cout << "Fcvt only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static R apply(T first, T second, uint32_t vxrm, uint32_t &) { // saturation argument is unused
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    if (sizeof(T) == 8) {
-      switch (first) {
-      case 0b10000: // vfncvt.xu.f.w
-        return rv_ftou_d(second, vxrm, &fflags);
-      case 0b10001: // vfncvt.x.f.w
-        return rv_ftoi_d(second, vxrm, &fflags);
-      case 0b10010: // vfncvt.f.xu.w
-        return rv_lutof_s(second, vxrm, &fflags);
-      case 0b10011: // vfncvt.f.x.w
-        return rv_ltof_s(second, vxrm, &fflags);
-      case 0b10100: // vfncvt.f.f.w
-        return rv_dtof_r(second, vxrm);
-      case 0b10101: // vfncvt.rod.f.f.w
-        return rv_dtof_r(second, 6);
-      case 0b10110: // vfncvt.rtz.xu.f.w
-        return rv_ftou_d(second, 1, &fflags);
-      case 0b10111: // vfncvt.rtz.x.f.w
-        return rv_ftoi_d(second, 1, &fflags);
-      default:
-        std::cout << "Fncvt has unsupported value for first: " << first << std::endl;
-        std::abort();
-      }
-    } else {
-      std::cout << "Fncvt only supports f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fcvt"; }
-};
-
-template <typename T, typename R>
-class Funary1 {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(T) == 4) {
-      switch (first) {
-      case 0b00000: // vfsqrt.v
-        return rv_fsqrt_s(second, frm, &fflags);
-      case 0b00100: // vfrsqrt7.v
-        return rv_frsqrt7_s(second, frm, &fflags);
-      case 0b00101: // vfrec7.v
-        return rv_frecip7_s(second, frm, &fflags);
-      case 0b10000: // vfclass.v
-        return rv_fclss_s(second);
-      default:
-        std::cout << "Funary1 has unsupported value for first: " << first << std::endl;
-        std::abort();
-      }
-    } else if (sizeof(T) == 8) {
-      switch (first) {
-      case 0b00000: // vfsqrt.v
-        return rv_fsqrt_d(second, frm, &fflags);
-      case 0b00100: // vfrsqrt7.v
-        return rv_frsqrt7_d(second, frm, &fflags);
-      case 0b00101: // vfrec7.v
-        return rv_frecip7_d(second, frm, &fflags);
-      case 0b10000: // vfclass.v
-        return rv_fclss_d(second);
-      default:
-        std::cout << "Funary1 has unsupported value for first: " << first << std::endl;
-        std::abort();
-      }
-    } else {
-      std::cout << "Funary1 only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Funary1"; }
-};
-
-template <typename T, typename R>
-class Xunary0 {
-public:
-  static R apply(T, T second, T) {
-    return second;
-  }
-  static std::string name() { return "Xunary0"; }
-};
-
-template <typename T, typename R>
-class Feq {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    if (sizeof(T) == 4) {
-      return rv_feq_s(second, first, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_feq_d(second, first, &fflags);
-    } else {
-      std::cout << "Feq only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Feq"; }
-};
-
-template <typename T, typename R>
-class Fle {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    if (sizeof(T) == 4) {
-      return rv_fle_s(second, first, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_fle_d(second, first, &fflags);
-    } else {
-      std::cout << "Fle only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fle"; }
-};
-
-template <typename T, typename R>
-class Flt {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    if (sizeof(T) == 4) {
-      return rv_flt_s(second, first, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_flt_d(second, first, &fflags);
-    } else {
-      std::cout << "Flt only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Flt"; }
-};
-
-template <typename T, typename R>
-class Fne {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    if (sizeof(T) == 4) {
-      return !rv_feq_s(second, first, &fflags);
-    } else if (sizeof(T) == 8) {
-      return !rv_feq_d(second, first, &fflags);
-    } else {
-      std::cout << "Fne only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fne"; }
-};
-
-template <typename T, typename R>
-class Fgt {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    if (sizeof(T) == 4) {
-      return rv_flt_s(first, second, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_flt_d(first, second, &fflags);
-    } else {
-      std::cout << "Fgt only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fgt"; }
-};
-
-template <typename T, typename R>
-class Fge {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    if (sizeof(T) == 4) {
-      return rv_fle_s(first, second, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_fle_d(first, second, &fflags);
-    } else {
-      std::cout << "Fge only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fge"; }
-};
-
-template <typename T, typename R>
-class Fdiv {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(T) == 4) {
-      return rv_fdiv_s(second, first, frm, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_fdiv_d(second, first, frm, &fflags);
-    } else {
-      std::cout << "Fdiv only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fdiv"; }
-};
-
-template <typename T, typename R>
-class Frdiv {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(T) == 4) {
-      return rv_fdiv_s(first, second, frm, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_fdiv_d(first, second, frm, &fflags);
-    } else {
-      std::cout << "Frdiv only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Frdiv"; }
-};
-
-template <typename T, typename R>
-class Fmul {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(R) == 4) {
-      return rv_fmul_s(first, second, frm, &fflags);
-    } else if (sizeof(R) == 8) {
-      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first);
-      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
-      return rv_fmul_d(first_d, second_d, frm, &fflags);
-    } else {
-      std::cout << "Fmul only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fmul"; }
-};
-
-template <typename T, typename R>
-class Frsub {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(T) == 4) {
-      return rv_fsub_s(first, second, frm, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_fsub_d(first, second, frm, &fflags);
-    } else {
-      std::cout << "Frsub only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Frsub"; }
-};
-
-template <typename T, typename R>
-class Clip {
-public:
-  static R apply(T first, T second, uint32_t vxrm, uint32_t &vxsat_) {
-    // The low lg2(2*SEW) bits of the vector or scalar shift-amount value (e.g., the low 6 bits for a SEW=64-bit to
-    // SEW=32-bit narrowing operation) are used to control the right shift amount, which provides the scaling.
-    R firstValid = first & (sizeof(T) * 8 - 1);
-    T unclippedResult = (second >> firstValid) + roundBit(second, firstValid, vxrm);
-    R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits<R>::min(), (T)std::numeric_limits<R>::max());
-    vxsat_ |= clippedResult != unclippedResult;
-    return clippedResult;
-  }
-  static std::string name() { return "Clip"; }
-};
-
-template <typename T, typename R>
-class Smul {
-public:
-  static R apply(T first, T second, uint32_t vxrm, uint32_t &vxsat_) {
-    R shift = sizeof(R) * 8 - 1;
-    T unshiftedResult = first * second;
-    T unclippedResult = (unshiftedResult >> shift) + roundBit(unshiftedResult, shift, vxrm);
-    R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits<R>::min(), (T)std::numeric_limits<R>::max());
-    vxsat_ |= clippedResult != unclippedResult;
-    return clippedResult;
-  }
-  static std::string name() { return "Smul"; }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-bool isMasked(std::vector<std::vector<Byte>> &vreg_file, uint32_t maskVreg, uint32_t byteI, bool vmask) {
-  auto &mask = vreg_file.at(maskVreg);
-  uint8_t emask = *(uint8_t *)(mask.data() + byteI / 8);
-  uint8_t value = (emask >> (byteI % 8)) & 0x1;
-  DP(4, "Masking enabled: " << +!vmask << " mask element: " << +value);
-  return !vmask && value == 0;
-}
-
-template <typename DT>
-uint32_t getVreg(uint32_t baseVreg, uint32_t byteI) {
-  uint32_t vsew = sizeof(DT) * 8;
-  return (baseVreg + (byteI / (VLEN / vsew))) % 32;
-}
-
-template <typename DT>
-DT &getVregData(std::vector<vortex::Byte> &baseVregVec, uint32_t byteI) {
-  uint32_t vsew = sizeof(DT) * 8;
-  return *(DT *)(baseVregVec.data() + (byteI % (VLEN / vsew)) * vsew / 8);
-}
-
-template <typename DT>
-DT &getVregData(std::vector<std::vector<vortex::Byte>> &vreg_file, uint32_t baseVreg, uint32_t byteI) {
-  auto &vr1 = vreg_file.at(getVreg<DT>(baseVreg, byteI));
-  return getVregData<DT>(vr1, byteI);
-}
-
-template <typename DT>
-void vector_op_vix_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rdest, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  uint32_t vsew = sizeof(DT) * 8;
-  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
-  if (nfields * emul > 8) {
-    std::cout << "NFIELDS * EMUL = " << nfields * lmul << " but it should be <= 8" << std::endl;
-    std::abort();
-  }
-  for (uint32_t i = 0; i < vl * nfields; i++) {
-    if (isMasked(vreg_file, 0, i / nfields, vmask))
-      continue;
-
-    uint32_t nfields_strided = strided ? nfields : 1;
-    Word mem_addr = (base_addr & 0xFFFFFFFC) + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT);
-    Word mem_data = 0;
-    emul_->dcache_read(&mem_data, mem_addr, vsew / 8);
-    DP(4, "Loading data " << mem_data << " from: " << mem_addr << " to vec reg: " << getVreg<DT>(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
-    DT &result = getVregData<DT>(vreg_file, rdest + (i % nfields) * emul, i / nfields);
-    DP(4, "Previous data: " << +result);
-    result = (DT)mem_data;
-  }
-}
-
-void vector_op_vix_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rdest, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_load<uint8_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  case 16:
-    vector_op_vix_load<uint16_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  case 32:
-    vector_op_vix_load<uint32_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  case 64:
-    vector_op_vix_load<uint64_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VLE for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vv_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rdest, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  uint32_t vsew = sizeof(DT) * 8;
-  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
-  if (nfields * emul > 8) {
-    std::cout << "NFIELDS * EMUL = " << nfields * lmul << " but it should be <= 8" << std::endl;
-    std::abort();
-  }
-  for (uint32_t i = 0; i < vl * nfields; i++) {
-    if (isMasked(vreg_file, 0, i / nfields, vmask))
-      continue;
-
-    Word offset = 0;
-    switch (iSew) {
-    case 8:
-      offset = getVregData<uint8_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    case 16:
-      offset = getVregData<uint16_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    case 32:
-      offset = getVregData<uint32_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    case 64:
-      offset = getVregData<uint64_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    default:
-      std::cout << "Unsupported iSew: " << iSew << std::endl;
-      std::abort();
-    }
-
-    Word mem_addr = (base_addr & 0xFFFFFFFC) + offset + (i % nfields) * sizeof(DT);
-    Word mem_data = 0;
-    emul_->dcache_read(&mem_data, mem_addr, vsew / 8);
-    DP(4, "VLUX/VLOX - Loading data " << mem_data << " from: " << mem_addr << " with offset: " << std::dec << offset << " to vec reg: " << getVreg<DT>(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
-    DT &result = getVregData<DT>(vreg_file, rdest + (i % nfields) * emul, i / nfields);
-    DP(4, "Previous data: " << +result);
-    result = (DT)mem_data;
-  }
-}
-
-void vector_op_vv_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_load<uint8_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
-    break;
-  case 16:
-    vector_op_vv_load<uint16_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
-    break;
-  case 32:
-    vector_op_vv_load<uint32_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
-    break;
-  case 64:
-    vector_op_vv_load<uint64_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VLUX/VLOX for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vix_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc3, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  uint32_t vsew = sizeof(DT) * 8;
-  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
-  for (uint32_t i = 0; i < vl * nfields; i++) {
-    if (isMasked(vreg_file, 0, i / nfields, vmask))
-      continue;
-
-    uint32_t nfields_strided = strided ? nfields : 1;
-    Word mem_addr = base_addr + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT);
-    Word mem_data = getVregData<DT>(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields);
-    DP(4, "Storing: " << std::hex << mem_data << " at: " << mem_addr << " from vec reg: " << getVreg<DT>(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
-    emul_->dcache_write(&mem_data, mem_addr, vsew / 8);
-  }
-}
-
-void vector_op_vix_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc3, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_store<uint8_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  case 16:
-    vector_op_vix_store<uint16_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  case 32:
-    vector_op_vix_store<uint32_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  case 64:
-    vector_op_vix_store<uint64_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VSE for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vv_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rsrc3, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  uint32_t vsew = sizeof(DT) * 8;
-  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
-  for (uint32_t i = 0; i < vl * nfields; i++) {
-    if (isMasked(vreg_file, 0, i / nfields, vmask))
-      continue;
-
-    Word offset = 0;
-    switch (iSew) {
-    case 8:
-      offset = getVregData<uint8_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    case 16:
-      offset = getVregData<uint16_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    case 32:
-      offset = getVregData<uint32_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    case 64:
-      offset = getVregData<uint64_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    default:
-      std::cout << "Unsupported iSew: " << iSew << std::endl;
-      std::abort();
-    }
-
-    Word mem_addr = base_addr + offset + (i % nfields) * sizeof(DT);
-    Word mem_data = getVregData<DT>(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields);
-    DP(4, "VSUX/VSOX - Storing: " << std::hex << mem_data << " at: " << mem_addr << " with offset: " << std::dec << offset << " from vec reg: " << getVreg<DT>(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
-    emul_->dcache_write(&mem_data, mem_addr, vsew / 8);
-  }
-}
-
-void vector_op_vv_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rsrc3, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_store<uint8_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
-    break;
-  case 16:
-    vector_op_vv_store<uint16_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
-    break;
-  case 32:
-    vector_op_vv_store<uint32_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
-    break;
-  case 64:
-    vector_op_vv_store<uint64_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VSUX/VSOX for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT>
-void vector_op_vix(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT second = getVregData<DT>(vreg_file, rsrc0, i);
-    DT third = getVregData<DT>(vreg_file, rdest, i);
-    DT result = OP<DT, DT>::apply(first, second, third);
-    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DT>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix<OP, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vix<OP, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vix<OP, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vix<OP, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT>
-void vector_op_vix_carry(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl) {
-  for (uint32_t i = 0; i < vl; i++) {
-    DT second = getVregData<DT>(vreg_file, rsrc0, i);
-    bool third = !isMasked(vreg_file, 0, i, false);
-    DT result = OP<DT, DT>::apply(first, second, third);
-    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DT>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix_carry(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_carry<OP, DT8>(src1, vreg_file, rsrc0, rdest, vl);
-    break;
-  case 16:
-    vector_op_vix_carry<OP, DT16>(src1, vreg_file, rsrc0, rdest, vl);
-    break;
-  case 32:
-    vector_op_vix_carry<OP, DT32>(src1, vreg_file, rsrc0, rdest, vl);
-    break;
-  case 64:
-    vector_op_vix_carry<OP, DT64>(src1, vreg_file, rsrc0, rdest, vl);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX carry for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vix_carry_out(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    DT second = getVregData<DT>(vreg_file, rsrc0, i);
-    bool third = !vmask && !isMasked(vreg_file, 0, i, vmask);
-    bool result = OP<DT, DTR>::apply(first, second, third);
-    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    if (result) {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
-    } else {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
-    }
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64, typename DT128>
-void vector_op_vix_carry_out(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_carry_out<OP, DT8, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vix_carry_out<OP, DT16, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vix_carry_out<OP, DT32, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vix_carry_out<OP, DT64, DT128>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX carry out for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vix_merge(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    DT result = isMasked(vreg_file, 0, i, vmask) ? getVregData<DT>(vreg_file, rsrc0, i) : first;
-    DP(4, "Merge - Choosing result: " << +result);
-    getVregData<DT>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix_merge(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_merge<DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vix_merge<DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vix_merge<DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vix_merge<DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_scalar(DT &dest, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t vsew) {
-  if (rsrc0 != 0) {
-    std::cout << "Vwxunary0/Vwfunary0 has unsupported value for vs2: " << rsrc0 << std::endl;
-    std::abort();
-  }
-  switch (vsew) {
-  case 8:
-    dest = getVregData<uint8_t>(vreg_file, rsrc1, 0);
-    break;
-  case 16:
-    dest = getVregData<uint16_t>(vreg_file, rsrc1, 0);
-    break;
-  case 32:
-    dest = getVregData<uint32_t>(vreg_file, rsrc1, 0);
-    break;
-  case 64:
-    dest = getVregData<uint64_t>(vreg_file, rsrc1, 0);
-    break;
-  default:
-    std::cout << "Failed to execute vmv.x.s/vfmv.f.s for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vix_w(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT second = getVregData<DT>(vreg_file, rsrc0, i);
-    DTR third = getVregData<DTR>(vreg_file, rdest, i);
-    DTR result = OP<DT, DTR>::apply(first, second, third);
-    DP(4, "Widening " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix_w(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_w<OP, DT8, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vix_w<OP, DT16, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vix_w<OP, DT32, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX widening for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix_wx(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix<OP, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vix<OP, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vix<OP, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX widening wx for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vix_n(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT second = getVregData<DT>(vreg_file, rsrc0, i);
-    DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
-    DP(4, "Narrowing " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix_n(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_n<OP, DT16, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 16:
-    vector_op_vix_n<OP, DT32, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 32:
-    vector_op_vix_n<OP, DT64, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX narrowing for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vix_sat(DTR first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT second = getVregData<DTR>(vreg_file, rsrc0, i);
-    DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
-    DP(4, "Saturating " << (OP<DT, DTR>::name()) << "(" << +(DTR)first << ", " << +(DTR)second << ")" << " = " << +(DTR)result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64, typename DT128>
-void vector_op_vix_sat(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_sat<OP, DT16, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 16:
-    vector_op_vix_sat<OP, DT32, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 32:
-    vector_op_vix_sat<OP, DT64, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 64:
-    vector_op_vix_sat<OP, DT128, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX saturating for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix_scale(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_sat<OP, DT8, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 16:
-    vector_op_vix_sat<OP, DT16, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 32:
-    vector_op_vix_sat<OP, DT32, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 64:
-    vector_op_vix_sat<OP, DT64, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX scale for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP>
-void vector_op_vix_ext(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  if (vsew == 16) {
-    switch (src1) {
-    case 0b00110: // vzext.vf2
-      vector_op_vix_w<OP, uint8_t, uint16_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    case 0b00111: // vsext.vf2
-      vector_op_vix_w<OP, int8_t, int16_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    default:
-      std::cout << "Xunary0 has unsupported value for vf: " << src1 << std::endl;
-      std::abort();
-    }
-  } else if (vsew == 32) {
-    switch (src1) {
-    case 0b00100: // vzext.vf4
-      vector_op_vix_w<OP, uint8_t, uint32_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    case 0b00101: // vsext.vf4
-      vector_op_vix_w<OP, int8_t, int32_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    case 0b00110: // vzext.vf2
-      vector_op_vix_w<OP, uint16_t, uint32_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    case 0b00111: // vsext.vf2
-      vector_op_vix_w<OP, int16_t, int32_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    default:
-      std::cout << "Xunary0 has unsupported value for vf: " << src1 << std::endl;
-      std::abort();
-    }
-  } else if (vsew == 64) {
-    switch (src1) {
-    case 0b00010: // vzext.vf8
-      vector_op_vix_w<OP, uint8_t, uint64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    case 0b00011: // vsext.vf8
-      vector_op_vix_w<OP, int8_t, int64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    case 0b00100: // vzext.vf4
-      vector_op_vix_w<OP, uint16_t, uint64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    case 0b00101: // vsext.vf4
-      vector_op_vix_w<OP, int16_t, int64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    case 0b00110: // vzext.vf2
-      vector_op_vix_w<OP, uint32_t, uint64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    case 0b00111: // vsext.vf2
-      vector_op_vix_w<OP, int32_t, int64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    default:
-      std::cout << "Xunary0 has unsupported value for vf: " << src1 << std::endl;
-      std::abort();
-    }
-  } else {
-    std::cout << "Failed to execute Xunary0 for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT>
-void vector_op_vix_mask(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT second = getVregData<DT>(vreg_file, rsrc0, i);
-    bool result = OP<DT, bool>::apply(first, second, 0);
-    DP(4, "Integer/float compare mask " << (OP<DT, bool>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
-    if (result) {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
-    } else {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
-    }
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix_mask(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_mask<OP, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vix_mask<OP, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vix_mask<OP, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vix_mask<OP, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX integer/float compare mask for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vix_slide(Word first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, Word vlmax, uint32_t vmask, bool scalar) {
-  // If vlmax > 0 this means we have a vslidedown instruction, vslideup does not require vlmax
-  bool slideDown = vlmax;
-  uint32_t scalarPos = slideDown ? vl - 1 : 0;
-  // If scalar set is set this means we have a v(f)slide1up or v(f)slide1down instruction,
-  // so first is our scalar value and we need to overwrite it with 1 for later computations
-  if (scalar && vl && !isMasked(vreg_file, 0, scalarPos, vmask)) {
-    DP(4, "Slide - Moving scalar value " << +first << " to position " << +scalarPos);
-    getVregData<DT>(vreg_file, rdest, scalarPos) = first;
-  }
-  first = scalar ? 1 : first;
-
-  for (Word i = slideDown ? 0 : first; i < vl - (scalar && vl && slideDown); i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    __uint128_t iSrc = slideDown ? (__uint128_t)i + (__uint128_t)first : (__uint128_t)i - (__uint128_t)first; // prevent overflows/underflows
-    DT value = (!slideDown || iSrc < vlmax) ? getVregData<DT>(vreg_file, rsrc0, iSrc) : 0;
-    DP(4, "Slide - Moving value " << +value << " from position " << (uint64_t)iSrc << " to position " << +i);
-    getVregData<DT>(vreg_file, rdest, i) = value;
-  }
-}
-
-template <typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix_slide(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, Word vlmax, uint32_t vmask, bool scalar) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_slide<DT8>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask, scalar);
-    break;
-  case 16:
-    vector_op_vix_slide<DT16>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask, scalar);
-    break;
-  case 32:
-    vector_op_vix_slide<DT32>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask, scalar);
-    break;
-  case 64:
-    vector_op_vix_slide<DT64>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask, scalar);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX slide for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vix_gather(Word first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, Word vlmax, uint32_t vmask) {
-  for (Word i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT value = first < vlmax ? getVregData<DT>(vreg_file, rsrc0, first) : 0;
-    DP(4, "Register gather - Moving value " << +value << " from position " << +first << " to position " << +i);
-    getVregData<DT>(vreg_file, rdest, i) = value;
-  }
-}
-
-template <typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix_gather(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, Word vlmax, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_gather<DT8>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask);
-    break;
-  case 16:
-    vector_op_vix_gather<DT16>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask);
-    break;
-  case 32:
-    vector_op_vix_gather<DT32>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask);
-    break;
-  case 64:
-    vector_op_vix_gather<DT64>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX register gather for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT>
-void vector_op_vv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    DT third = getVregData<DT>(vreg_file, rdest, i);
-    DT result = OP<DT, DT>::apply(first, second, third);
-    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DT>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv<OP, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vv<OP, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vv<OP, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vv<OP, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VV for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT>
-void vector_op_vv_carry(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl) {
-  for (uint32_t i = 0; i < vl; i++) {
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    bool third = !isMasked(vreg_file, 0, i, false);
-    DT result = OP<DT, DT>::apply(first, second, third);
-    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DT>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_carry(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_carry<OP, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl);
-    break;
-  case 16:
-    vector_op_vv_carry<OP, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl);
-    break;
-  case 32:
-    vector_op_vv_carry<OP, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl);
-    break;
-  case 64:
-    vector_op_vv_carry<OP, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl);
-    break;
-  default:
-    std::cout << "Failed to execute VV carry for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vv_carry_out(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    bool third = !vmask && !isMasked(vreg_file, 0, i, vmask);
-    bool result = OP<DT, DTR>::apply(first, second, third);
-    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    if (result) {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
-    } else {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
-    }
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64, typename DT128>
-void vector_op_vv_carry_out(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_carry_out<OP, DT8, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vv_carry_out<OP, DT16, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vv_carry_out<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vv_carry_out<OP, DT64, DT128>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VV carry out for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vv_merge(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    uint32_t rsrc = isMasked(vreg_file, 0, i, vmask) ? rsrc1 : rsrc0;
-    DT result = getVregData<DT>(vreg_file, rsrc, i);
-    DP(4, "Merge - Choosing result: " << +result);
-    getVregData<DT>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_merge(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_merge<DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vv_merge<DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vv_merge<DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vv_merge<DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VV for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vv_gather(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, bool ei16, uint32_t vlmax, uint32_t vmask) {
-  for (Word i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    uint32_t first = ei16 ? getVregData<uint16_t>(vreg_file, rsrc0, i) : getVregData<DT>(vreg_file, rsrc0, i);
-    DT value = first < vlmax ? getVregData<DT>(vreg_file, rsrc1, first) : 0;
-    DP(4, "Register gather - Moving value " << +value << " from position " << +first << " to position " << +i);
-    getVregData<DT>(vreg_file, rdest, i) = value;
-  }
-}
-
-template <typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_gather(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, bool ei16, uint32_t vlmax, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_gather<DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, ei16, vlmax, vmask);
-    break;
-  case 16:
-    vector_op_vv_gather<DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, ei16, vlmax, vmask);
-    break;
-  case 32:
-    vector_op_vv_gather<DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, ei16, vlmax, vmask);
-    break;
-  case 64:
-    vector_op_vv_gather<DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, ei16, vlmax, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VV register gather for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vv_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    DTR third = getVregData<DTR>(vreg_file, rdest, i);
-    DTR result = OP<DT, DTR>::apply(first, second, third);
-    DP(4, "Widening " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_w<OP, DT8, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vv_w<OP, DT16, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vv_w<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VV widening for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vv_wv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DTR second = getVregData<DTR>(vreg_file, rsrc1, i);
-    DTR third = getVregData<DTR>(vreg_file, rdest, i);
-    DTR result = OP<DTR, DTR>::apply(first, second, third);
-    DP(4, "Widening wv " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_wv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_wv<OP, DT8, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vv_wv<OP, DT16, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vv_wv<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VV widening wv for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vv_wfv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DTR second = getVregData<DTR>(vreg_file, rsrc1, i);
-    DTR third = getVregData<DTR>(vreg_file, rdest, i);
-    DTR result = OP<DTR, DTR>::apply(rv_ftod(first), second, third);
-    DP(4, "Widening wfv " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_wfv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  if (vsew == 32) {
-    vector_op_vv_wfv<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-  } else {
-    std::cout << "Failed to execute VV widening wfv for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vv_n(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DTR first = getVregData<DTR>(vreg_file, rsrc0, i);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
-    DP(4, "Narrowing " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_n(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_n<OP, DT16, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 16:
-    vector_op_vv_n<OP, DT32, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 32:
-    vector_op_vv_n<OP, DT64, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  default:
-    std::cout << "Failed to execute VV narrowing for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vv_sat(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT first = getVregData<DTR>(vreg_file, rsrc0, i);
-    DT second = getVregData<DTR>(vreg_file, rsrc1, i);
-    DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
-    DP(4, "Saturating " << (OP<DT, DTR>::name()) << "(" << +(DTR)first << ", " << +(DTR)second << ")" << " = " << +(DTR)result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64, typename DT128>
-void vector_op_vv_sat(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_sat<OP, DT16, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 16:
-    vector_op_vv_sat<OP, DT32, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 32:
-    vector_op_vv_sat<OP, DT64, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 64:
-    vector_op_vv_sat<OP, DT128, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  default:
-    std::cout << "Failed to execute VV saturating for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_scale(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_sat<OP, DT8, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 16:
-    vector_op_vv_sat<OP, DT16, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 32:
-    vector_op_vv_sat<OP, DT32, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 64:
-    vector_op_vv_sat<OP, DT64, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  default:
-    std::cout << "Failed to execute VV scale for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT>
-void vector_op_vv_red(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    // use rdest as accumulator
-    if (i == 0) {
-      getVregData<DT>(vreg_file, rdest, 0) = getVregData<DT>(vreg_file, rsrc0, 0);
-    }
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT first = getVregData<DT>(vreg_file, rdest, 0);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    DT result = OP<DT, DT>::apply(first, second, 0);
-    DP(4, "Reduction " << (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
-    getVregData<DT>(vreg_file, rdest, 0) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_red(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_red<OP, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vv_red<OP, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vv_red<OP, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vv_red<OP, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VV reduction for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vv_red_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    // use rdest as accumulator
-    if (i == 0) {
-      getVregData<DTR>(vreg_file, rdest, 0) = getVregData<DTR>(vreg_file, rsrc0, 0);
-    }
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DTR first = getVregData<DTR>(vreg_file, rdest, 0);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    DTR second_w = std::is_signed<DT>() ? sext((DTR)second, sizeof(DT) * 8) : zext((DTR)second, sizeof(DT) * 8);
-    DTR result = OP<DTR, DTR>::apply(first, second_w, 0);
-    DP(4, "Widening reduction " << (OP<DTR, DTR>::name()) << "(" << +first << ", " << +second_w << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, 0) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_red_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_red_w<OP, DT8, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vv_red_w<OP, DT16, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vv_red_w<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VV widening reduction for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vv_red_wf(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    // use rdest as accumulator
-    if (i == 0) {
-      getVregData<DTR>(vreg_file, rdest, 0) = getVregData<DTR>(vreg_file, rsrc0, 0);
-    }
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DTR first = getVregData<DTR>(vreg_file, rdest, 0);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    DTR second_w = rv_ftod(second);
-    DTR result = OP<DTR, DTR>::apply(first, second_w, 0);
-    DP(4, "Float widening reduction " << (OP<DTR, DTR>::name()) << "(" << +first << ", " << +second_w << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, 0) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_red_wf(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  if (vsew == 32) {
-    vector_op_vv_red_wf<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-  } else {
-    std::cout << "Failed to execute VV float widening reduction for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vid(std::vector<std::vector<Byte>> &vreg_file, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DP(4, "Element Index = " << +i);
-    getVregData<DT>(vreg_file, rdest, i) = i;
-  }
-}
-
-void vector_op_vid(std::vector<std::vector<Byte>> &vreg_file, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vid<uint8_t>(vreg_file, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vid<uint16_t>(vreg_file, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vid<uint32_t>(vreg_file, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vid<uint64_t>(vreg_file, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute vector element index for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT>
-void vector_op_vv_mask(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    bool result = OP<DT, bool>::apply(first, second, 0);
-    DP(4, "Integer/float compare mask " << (OP<DT, bool>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
-    if (result) {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
-    } else {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
-    }
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_mask(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_mask<OP, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vv_mask<OP, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vv_mask<OP, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vv_mask<OP, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VV integer/float compare mask for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP>
-void vector_op_vv_mask(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl) {
-  for (uint32_t i = 0; i < vl; i++) {
-    uint8_t firstMask = getVregData<uint8_t>(vreg_file, rsrc0, i / 8);
-    bool first = (firstMask >> (i % 8)) & 0x1;
-    uint8_t secondMask = getVregData<uint8_t>(vreg_file, rsrc1, i / 8);
-    bool second = (secondMask >> (i % 8)) & 0x1;
-    bool result = OP<uint8_t, uint8_t>::apply(first, second, 0) & 0x1;
-    DP(4, "Compare mask bits " << (OP<uint8_t, uint8_t>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
-    if (result) {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
-    } else {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
-    }
-  }
-}
-
-template <typename DT>
-void vector_op_vv_compress(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl) {
-  int currPos = 0;
-  for (uint32_t i = 0; i < vl; i++) {
-    // Special case: use rsrc0 as mask vector register instead of default v0
-    // This instruction is always masked (vmask == 0), but encoded as unmasked (vmask == 1)
-    if (isMasked(vreg_file, rsrc0, i, 0))
-      continue;
-
-    DT value = getVregData<DT>(vreg_file, rsrc1, i);
-    DP(4, "Compression - Moving value " << +value << " from position " << i << " to position " << currPos);
-    getVregData<DT>(vreg_file, rdest, currPos) = value;
-    currPos++;
-  }
-}
-
-template <typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_compress(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_compress<DT8>(vreg_file, rsrc0, rsrc1, rdest, vl);
-    break;
-  case 16:
-    vector_op_vv_compress<DT16>(vreg_file, rsrc0, rsrc1, rdest, vl);
-    break;
-  case 32:
-    vector_op_vv_compress<DT32>(vreg_file, rsrc0, rsrc1, rdest, vl);
-    break;
-  case 64:
-    vector_op_vv_compress<DT64>(vreg_file, rsrc0, rsrc1, rdest, vl);
-    break;
-  default:
-    std::cout << "Failed to execute VV compression for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
 void Emulator::loadVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
   auto &warp = warps_.at(wid);
   auto vmask = instr.getVmask();
diff --git a/sim/simx/vpu.h b/sim/simx/vpu.h
new file mode 100644
index 000000000..3974d2552
--- /dev/null
+++ b/sim/simx/vpu.h
@@ -0,0 +1,2391 @@
+#pragma once
+
+using namespace vortex;
+
+template <typename T, typename R>
+class Add { // Functor: sum of the two operands, each cast to the result type R first.
+public:
+  static R apply(T first, T second, R) { // third argument is ignored
+    return (R)first + (R)second;
+  }
+  static std::string name() { return "Add"; }
+};
+
+template <typename T, typename R>
+class Sub { // Functor: second - first, each cast to R first (note the reversed operand order).
+public:
+  static R apply(T first, T second, R) { // third argument is ignored
+    return (R)second - (R)first;
+  }
+  static std::string name() { return "Sub"; }
+};
+
+template <typename T, typename R>
+class Adc { // Functor: first + second + third, computed in R.
+public:
+  static R apply(T first, T second, R third) { // third is presumably the carry-in — confirm at the call site
+    return (R)first + (R)second + third;
+  }
+  static std::string name() { return "Adc"; }
+};
+
+template <typename T, typename R>
+class Madc { // Functor: true when first + second + third (computed in R) exceeds the maximum value of T.
+public:
+  static R apply(T first, T second, R third) {
+    return ((R)first + (R)second + third) > (R)std::numeric_limits<T>::max(); // NOTE(review): can only be true if R is wider than T — confirm callers guarantee that
+  }
+  static std::string name() { return "Madc"; }
+};
+
+template <typename T, typename R>
+class Sbc { // Functor: second - first - third, computed in R.
+public:
+  static R apply(T first, T second, R third) { // third is presumably the borrow-in — confirm at the call site
+    return (R)second - (R)first - third;
+  }
+  static std::string name() { return "Sbc"; }
+};
+
+template <typename T, typename R>
+class Msbc { // Functor: true when second is smaller than first + third, compared in R.
+public:
+  static R apply(T first, T second, R third) {
+    return (R)second < ((R)first + third); // i.e. second - first - third would go below zero
+  }
+  static std::string name() { return "Msbc"; }
+};
+
+template <typename T, typename R>
+class Ssub { // Functor: second - first (computed in T) clamped to the representable range of R.
+public:
+  static R apply(T first, T second, uint32_t, uint32_t &vxsat_) {
+    // rounding mode is not relevant for this operation
+    T unclippedResult = second - first;
+    R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits<R>::min(), (T)std::numeric_limits<R>::max());
+    vxsat_ |= clippedResult != unclippedResult; // OR-ed in: the flag is only ever set here, never cleared
+    return clippedResult;
+  }
+  static std::string name() { return "Ssub"; }
+};
+
+template <typename T, typename R>
+class Ssubu { // Functor: second - first, returning 0 instead of wrapping when first > second.
+public:
+  static R apply(T first, T second, uint32_t, uint32_t &vxsat_) {
+    // rounding mode is not relevant for this operation
+    if (first > second) {
+      vxsat_ |= 1; // saturated: OR into the flag, as Ssub/Sadd/Clip/Smul do
+      return 0;
+    } else {
+      // vxsat_ is sticky: a non-saturating element must not clear a flag set by an earlier one
+      return second - first;
+    }
+  }
+  static std::string name() { return "Ssubu"; }
+};
+
+template <typename T, typename R>
+class Sadd { // Functor: second + first (computed in T) clamped to the representable range of R.
+public:
+  static R apply(T first, T second, uint32_t, uint32_t &vxsat_) {
+    // rounding mode is not relevant for this operation
+    T unclippedResult = second + first;
+    R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits<R>::min(), (T)std::numeric_limits<R>::max());
+    vxsat_ |= clippedResult != unclippedResult; // OR-ed in: the flag is only ever set here, never cleared
+    return clippedResult;
+  }
+  static std::string name() { return "Sadd"; }
+};
+
+template <typename T, typename R>
+class Rsub { // Functor: first - second, i.e. the operand order opposite to the Sub functor.
+public:
+  static R apply(T first, T second, R) { // third argument is ignored
+    return first - second;
+  }
+  static std::string name() { return "Rsub"; }
+};
+
+template <typename T, typename R>
+class Div { // Functor: second / first with the two special cases below handled without trapping.
+public:
+  static R apply(T first, T second, R) {
+    // logic taken from scalar div: divisor 0 yields -1, and min / -1 yields the dividend unchanged
+    if (first == 0) {
+      return -1;
+    } else if (second == std::numeric_limits<T>::min() && first == T(-1)) {
+      return second;
+    } else {
+      return (R)second / (R)first;
+    }
+  }
+  static std::string name() { return "Div"; }
+};
+
+template <typename T, typename R>
+class Rem { // Functor: second % first with the two special cases below handled without trapping.
+public:
+  static R apply(T first, T second, R) {
+    // logic taken from scalar rem: divisor 0 yields the dividend, and min % -1 yields 0
+    if (first == 0) {
+      return second;
+    } else if (second == std::numeric_limits<T>::min() && first == T(-1)) {
+      return 0;
+    } else {
+      return (R)second % (R)first;
+    }
+  }
+  static std::string name() { return "Rem"; }
+};
+
+template <typename T, typename R>
+class Mul { // Functor: product of the two operands, each cast to R first.
+public:
+  static R apply(T first, T second, R) { // third argument is ignored
+    return (R)first * (R)second;
+  }
+  static std::string name() { return "Mul"; }
+};
+
+template <typename T, typename R>
+class Mulsu { // Functor: product in R where first goes through zext() over its low sizeof(T)*8 bits.
+public:
+  static R apply(T first, T second, R) {
+    R first_ext = zext((R)first, (sizeof(T) * 8)); // presumably zero-extends, i.e. treats first as unsigned — confirm against zext()
+    return first_ext * (R)second; // second is cast to R as-is
+  }
+  static std::string name() { return "Mulsu"; }
+};
+
+template <typename T, typename R>
+class Mulh { // Functor: the product shifted right by sizeof(T)*8, i.e. its upper part.
+public:
+  static R apply(T first, T second, R) {
+    __int128_t first_ext = sext((__int128_t)first, (sizeof(T) * 8)); // both operands go through sext() in 128 bits
+    __int128_t second_ext = sext((__int128_t)second, (sizeof(T) * 8));
+    return (first_ext * second_ext) >> (sizeof(T) * 8);
+  }
+  static std::string name() { return "Mulh"; }
+};
+
+template <typename T, typename R>
+class Mulhsu { // Functor: the product shifted right by sizeof(T)*8, with mixed extension of the operands.
+public:
+  static R apply(T first, T second, R) {
+    __int128_t first_ext = zext((__int128_t)first, (sizeof(T) * 8)); // first goes through zext()
+    __int128_t second_ext = sext((__int128_t)second, (sizeof(T) * 8)); // second goes through sext()
+    return (first_ext * second_ext) >> (sizeof(T) * 8);
+  }
+  static std::string name() { return "Mulhsu"; }
+};
+
+template <typename T, typename R>
+class Mulhu { // Functor: product of both operands as __uint128_t, shifted right by sizeof(T)*8.
+public:
+  static R apply(T first, T second, R) { // third argument is ignored
+    return ((__uint128_t)first * (__uint128_t)second) >> (sizeof(T) * 8);
+  }
+  static std::string name() { return "Mulhu"; }
+};
+
+template <typename T, typename R>
+class Madd { // Functor: (first * third) + second, computed in R.
+public:
+  static R apply(T first, T second, R third) { // here third is a multiplicand and second the addend
+    return ((R)first * third) + (R)second;
+  }
+  static std::string name() { return "Madd"; }
+};
+
+template <typename T, typename R>
+class Nmsac { // Functor: third - (first * second), computed in R.
+public:
+  static R apply(T first, T second, R third) { // third is the value the negated product is added to
+    return -((R)first * (R)second) + third;
+  }
+  static std::string name() { return "Nmsac"; }
+};
+
+template <typename T, typename R>
+class Macc { // Functor: (first * second) + third, computed in R.
+public:
+  static R apply(T first, T second, R third) { // third is the value the product is added to
+    return ((R)first * (R)second) + third;
+  }
+  static std::string name() { return "Macc"; }
+};
+
+template <typename T, typename R>
+class Maccsu { // Functor: (first * second) + third in R, with mixed extension of the multiplicands.
+public:
+  static R apply(T first, T second, R third) {
+    R first_ext = sext((R)first, (sizeof(T) * 8)); // first goes through sext()
+    R second_ext = zext((R)second, (sizeof(T) * 8)); // second goes through zext()
+    return (first_ext * second_ext) + third;
+  }
+  static std::string name() { return "Maccsu"; }
+};
+
+template <typename T, typename R>
+class Maccus { // Functor: (first * second) + third in R; extension roles are swapped relative to Maccsu.
+public:
+  static R apply(T first, T second, R third) {
+    R first_ext = zext((R)first, (sizeof(T) * 8)); // first goes through zext()
+    R second_ext = sext((R)second, (sizeof(T) * 8)); // second goes through sext()
+    return (first_ext * second_ext) + third;
+  }
+  static std::string name() { return "Maccus"; }
+};
+
+template <typename T, typename R>
+class Nmsub { // Functor: second - (first * third), computed in R.
+public:
+  static R apply(T first, T second, R third) { // here third is a multiplicand and second the addend
+    return -((R)first * third) + (R)second;
+  }
+  static std::string name() { return "Nmsub"; }
+};
+
+template <typename T, typename R>
+class Min { // Functor: smaller of the two operands, compared as type T.
+public:
+  static R apply(T first, T second, R) { // third argument is ignored
+    return std::min(first, second);
+  }
+  static std::string name() { return "Min"; }
+};
+
+template <typename T, typename R>
+class Max { // Functor: larger of the two operands, compared as type T.
+public:
+  static R apply(T first, T second, R) { // third argument is ignored
+    return std::max(first, second);
+  }
+  static std::string name() { return "Max"; }
+};
+
+template <typename T, typename R>
+class And { // Functor: bitwise AND of the two operands.
+public:
+  static R apply(T first, T second, R) { // third argument is ignored
+    return first & second;
+  }
+  static std::string name() { return "And"; }
+};
+
+template <typename T, typename R>
+class Or { // Functor: bitwise OR of the two operands.
+public:
+  static R apply(T first, T second, R) { // third argument is ignored
+    return first | second;
+  }
+  static std::string name() { return "Or"; }
+};
+
+template <typename T, typename R>
+class Xor { // Functor: bitwise XOR of the two operands.
+public:
+  static R apply(T first, T second, R) { // third argument is ignored
+    return first ^ second;
+  }
+  static std::string name() { return "Xor"; }
+};
+
+template <typename T, typename R>
+class Sll { // Functor: second shifted left by first, where first is the shift amount.
+public:
+  static R apply(T first, T second, R) {
+    // Only the low lg2(SEW) bits of the shift-amount value are used: first is masked with (bit width of T) - 1.
+    return second << (first & (sizeof(T) * 8 - 1));
+  }
+  static std::string name() { return "Sll"; }
+};
+
+template <typename T, typename R>
+bool bitAt(T value, R pos, R negOffset) { // True when bit (pos - negOffset) of value is set; false whenever pos < negOffset.
+  R offsetPos = pos - negOffset;
+  return pos >= negOffset && ((value >> offsetPos) & 0x1); // the range check short-circuits before offsetPos is used as a shift amount
+}
+
+template <typename T, typename R>
+bool anyBitUpTo(T value, R to, R negOffset) { // True when any of bits 0..(to - negOffset) of value is set; false whenever to < negOffset.
+  R offsetTo = to - negOffset;
+  return to >= negOffset && (value & (((R)1 << (offsetTo + 1)) - 1)); // mask covers bits 0..offsetTo inclusive
+}
+
+template <typename T, typename R>
+bool roundBit(T value, R shiftDown, uint32_t vxrm) { // Rounding increment (0 or 1) for a right shift of value by shiftDown under mode vxrm.
+  switch (vxrm) {
+  case 0: // round-to-nearest-up: the most significant bit that gets shifted out
+    return bitAt(value, shiftDown, (R)1);
+  case 1: // round-to-nearest-even: that bit, and either a lower bit is set or the kept LSB is set
+    return bitAt(value, shiftDown, (R)1) && (anyBitUpTo(value, shiftDown, (R)2) || bitAt(value, shiftDown, (R)0));
+  case 2: // round-down (truncate)
+    return 0;
+  case 3: // round-to-odd: kept LSB is clear and some shifted-out bit is set
+    return !bitAt(value, shiftDown, (R)0) && anyBitUpTo(value, shiftDown, (R)1);
+  default: // any other vxrm value is reported and the process aborts
+    std::cout << "Roundoff - invalid value for vxrm: " << vxrm << std::endl;
+    std::abort();
+  }
+}
+
+template <typename T, typename R>
+class SrlSra { // Functor: second shifted right by first; the 4-argument overload also adds a rounding increment.
+public:
+  static R apply(T first, T second, R) {
+    // Only the low lg2(SEW) bits of the shift-amount value are used: first is masked with (bit width of T) - 1.
+    return second >> (first & (sizeof(T) * 8 - 1));
+  }
+  static R apply(T first, T second, uint32_t vxrm, uint32_t) {
+    // Saturation is not relevant for this operation
+    // Only the low lg2(SEW) bits of the shift-amount value are used to control the shift amount.
+    T firstValid = first & (sizeof(T) * 8 - 1);
+    return apply(firstValid, second, 0) + roundBit(second, firstValid, vxrm); // plain shift via the 3-argument overload, plus roundBit()
+  }
+  static std::string name() { return "SrlSra"; }
+};
+
+template <typename T, typename R>
+class Aadd { // Functor: (second + first) >> 1 plus the roundBit() increment selected by vxrm.
+public:
+  static R apply(T first, T second, uint32_t vxrm, uint32_t) {
+    // Saturation is not relevant for this operation
+    T sum = second + first; // NOTE(review): computed in T — presumably callers pass a T wide enough that this cannot overflow; confirm
+    return (sum >> 1) + roundBit(sum, 1, vxrm);
+  }
+  static std::string name() { return "Aadd"; }
+};
+
+template <typename T, typename R>
+class Asub { // Functor: (second - first) >> 1 plus the roundBit() increment selected by vxrm.
+public:
+  static R apply(T first, T second, uint32_t vxrm, uint32_t) {
+    // Saturation is not relevant for this operation
+    T difference = second - first; // NOTE(review): computed in T — presumably callers pass a T wide enough that this cannot overflow; confirm
+    return (difference >> 1) + roundBit(difference, 1, vxrm);
+  }
+  static std::string name() { return "Asub"; }
+};
+
+template <typename T, typename R>
+class Eq { // Functor: true when the two operands are equal.
+public:
+  static R apply(T first, T second, R) { // third argument is ignored
+    return first == second;
+  }
+  static std::string name() { return "Eq"; }
+};
+
+template <typename T, typename R>
+class Ne { // Functor: true when the two operands differ.
+public:
+  static R apply(T first, T second, R) { // third argument is ignored
+    return first != second;
+  }
+  static std::string name() { return "Ne"; }
+};
+
+template <typename T, typename R>
+class Lt { // Functor: true when second < first.
+public:
+  static R apply(T first, T second, R) {
+    return first > second; // "less than" is evaluated as second < first, hence the flipped operator
+  }
+  static std::string name() { return "Lt"; }
+};
+
+template <typename T, typename R>
+class Le { // Functor: true when second <= first.
+public:
+  static R apply(T first, T second, R) {
+    return first >= second; // "less or equal" is evaluated as second <= first, hence the flipped operator
+  }
+  static std::string name() { return "Le"; }
+};
+
+template <typename T, typename R>
+class Gt { // Functor: true when second > first.
+public:
+  static R apply(T first, T second, R) {
+    return first < second; // "greater than" is evaluated as second > first, hence the flipped operator
+  }
+  static std::string name() { return "Gt"; }
+};
+
+template <typename T, typename R>
+class AndNot { // Functor: second AND the bitwise complement of first.
+public:
+  static R apply(T first, T second, R) { // third argument is ignored
+    return second & ~first;
+  }
+  static std::string name() { return "AndNot"; }
+};
+
+template <typename T, typename R>
+class OrNot { // Functor: second OR the bitwise complement of first.
+public:
+  static R apply(T first, T second, R) { // third argument is ignored
+    return second | ~first;
+  }
+  static std::string name() { return "OrNot"; }
+};
+
+template <typename T, typename R>
+class Nand { // Functor: bitwise complement of (second AND first).
+public:
+  static R apply(T first, T second, R) { // third argument is ignored
+    return ~(second & first);
+  }
+  static std::string name() { return "Nand"; }
+};
+
+template <typename T, typename R>
+class Mv { // Functor: passes first through unchanged (converted to R).
+public:
+  static R apply(T first, T, R) { // second and third arguments are ignored
+    return first;
+  }
+  static std::string name() { return "Mv"; }
+};
+
+template <typename T, typename R>
+class Nor { // Functor: bitwise complement of (second OR first).
+public:
+  static R apply(T first, T second, R) { // third argument is ignored
+    return ~(second | first);
+  }
+  static std::string name() { return "Nor"; }
+};
+
+template <typename T, typename R>
+class Xnor { // Functor: bitwise complement of (second XOR first).
+public:
+  static R apply(T first, T second, R) { // third argument is ignored
+    return ~(second ^ first);
+  }
+  static std::string name() { return "Xnor"; }
+};
+
+template <typename T, typename R>
+class Fadd { // Functor: floating-point add through rv_fadd_s / rv_fadd_d, chosen by sizeof(R).
+public:
+  static R apply(T first, T second, R) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    // ignoring rounding mode for now (frm is always 0)
+    uint32_t frm = 0;
+    if (sizeof(R) == 4) {
+      return rv_fadd_s(first, second, frm, &fflags);
+    } else if (sizeof(R) == 8) {
+      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); // operands that are not already 8 bytes go through rv_ftod
+      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
+      return rv_fadd_d(first_d, second_d, frm, &fflags);
+    } else {
+      std::cout << "Fadd only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fadd"; }
+};
+
+template <typename T, typename R>
+class Fsub { // Functor: floating-point subtract through rv_fsub_s / rv_fsub_d, called with (second, first).
+public:
+  static R apply(T first, T second, R) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    // ignoring rounding mode for now (frm is always 0)
+    uint32_t frm = 0;
+    if (sizeof(R) == 4) {
+      return rv_fsub_s(second, first, frm, &fflags);
+    } else if (sizeof(R) == 8) {
+      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); // operands that are not already 8 bytes go through rv_ftod
+      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
+      return rv_fsub_d(second_d, first_d, frm, &fflags);
+    } else {
+      std::cout << "Fsub only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fsub"; }
+};
+
+template <typename T, typename R>
+class Fmacc { // Functor: rv_fmadd_s / rv_fmadd_d called with (first, second, third), chosen by sizeof(R).
+public:
+  static R apply(T first, T second, R third) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    // ignoring rounding mode for now (frm is always 0)
+    uint32_t frm = 0;
+    if (sizeof(R) == 4) {
+      return rv_fmadd_s(first, second, third, frm, &fflags);
+    } else if (sizeof(R) == 8) {
+      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); // operands that are not already 8 bytes go through rv_ftod
+      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
+      return rv_fmadd_d(first_d, second_d, third, frm, &fflags);
+    } else {
+      std::cout << "Fmacc only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fmacc"; }
+};
+
+template <typename T, typename R>
+class Fnmacc { // Functor: rv_fnmadd_s / rv_fnmadd_d called with (first, second, third), chosen by sizeof(R).
+public:
+  static R apply(T first, T second, R third) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    // ignoring rounding mode for now (frm is always 0)
+    uint32_t frm = 0;
+    if (sizeof(R) == 4) {
+      return rv_fnmadd_s(first, second, third, frm, &fflags);
+    } else if (sizeof(R) == 8) {
+      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); // operands that are not already 8 bytes go through rv_ftod
+      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
+      return rv_fnmadd_d(first_d, second_d, third, frm, &fflags);
+    } else {
+      std::cout << "Fnmacc only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fnmacc"; }
+};
+
+template <typename T, typename R>
+class Fmsac { // Functor: rv_fmadd_* with third replaced by rv_fsgnjn_*(third, third).
+public:
+  static R apply(T first, T second, R third) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    // ignoring rounding mode for now (frm is always 0)
+    uint32_t frm = 0;
+    if (sizeof(R) == 4) {
+      return rv_fmadd_s(first, second, rv_fsgnjn_s(third, third), frm, &fflags); // fsgnjn(x, x) presumably yields -x, giving first*second - third — confirm
+    } else if (sizeof(R) == 8) {
+      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); // operands that are not already 8 bytes go through rv_ftod
+      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
+      return rv_fmadd_d(first_d, second_d, rv_fsgnjn_d(third, third), frm, &fflags);
+    } else {
+      std::cout << "Fmsac only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fmsac"; }
+};
+
+template <typename T, typename R>
+class Fnmsac { // Functor: rv_fnmadd_* with third replaced by rv_fsgnjn_*(third, third).
+public:
+  static R apply(T first, T second, R third) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    // ignoring rounding mode for now (frm is always 0)
+    uint32_t frm = 0;
+    if (sizeof(R) == 4) {
+      return rv_fnmadd_s(first, second, rv_fsgnjn_s(third, third), frm, &fflags); // fsgnjn(x, x) presumably yields -x — confirm against the helper
+    } else if (sizeof(R) == 8) {
+      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); // operands that are not already 8 bytes go through rv_ftod
+      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
+      return rv_fnmadd_d(first_d, second_d, rv_fsgnjn_d(third, third), frm, &fflags);
+    } else {
+      std::cout << "Fnmsac only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fnmsac"; }
+};
+
+template <typename T, typename R>
+class Fmadd { // Functor: forwards to Fmacc with the second and third operands swapped.
+public:
+  static R apply(T first, T second, R third) {
+    if (sizeof(T) == 4 || sizeof(T) == 8) {
+      return Fmacc<T, R>::apply(first, third, second); // third becomes a multiplicand, second the addend
+    } else {
+      std::cout << "Fmadd only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fmadd"; }
+};
+
+template <typename T, typename R>
+class Fnmadd { // Functor: forwards to Fnmacc with the second and third operands swapped.
+public:
+  static R apply(T first, T second, R third) {
+    if (sizeof(T) == 4 || sizeof(T) == 8) {
+      return Fnmacc<T, R>::apply(first, third, second); // third becomes a multiplicand, second the addend
+    } else {
+      std::cout << "Fnmadd only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fnmadd"; }
+};
+
+template <typename T, typename R>
+class Fmsub { // Functor: forwards to Fmsac with the second and third operands swapped.
+public:
+  static R apply(T first, T second, R third) {
+    if (sizeof(T) == 4 || sizeof(T) == 8) {
+      return Fmsac<T, R>::apply(first, third, second); // third becomes a multiplicand, second the accumulated term
+    } else {
+      std::cout << "Fmsub only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fmsub"; }
+};
+
+template <typename T, typename R>
+class Fnmsub { // Functor: forwards to Fnmsac with the second and third operands swapped.
+public:
+  static R apply(T first, T second, R third) {
+    if (sizeof(T) == 4 || sizeof(T) == 8) {
+      return Fnmsac<T, R>::apply(first, third, second); // third becomes a multiplicand, second the accumulated term
+    } else {
+      std::cout << "Fnmsub only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fnmsub"; }
+};
+
+template <typename T, typename R>
+class Fmin { // Functor: floating-point minimum through rv_fmin_s / rv_fmin_d, chosen by sizeof(T).
+public:
+  static R apply(T first, T second, R) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    if (sizeof(T) == 4) {
+      return rv_fmin_s(first, second, &fflags);
+    } else if (sizeof(T) == 8) {
+      return rv_fmin_d(first, second, &fflags);
+    } else {
+      std::cout << "Fmin only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fmin"; }
+};
+
+template <typename T, typename R>
+class Fmax { // Functor: floating-point maximum through rv_fmax_s / rv_fmax_d, chosen by sizeof(T).
+public:
+  static R apply(T first, T second, R) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    if (sizeof(T) == 4) {
+      return rv_fmax_s(first, second, &fflags);
+    } else if (sizeof(T) == 8) {
+      return rv_fmax_d(first, second, &fflags);
+    } else {
+      std::cout << "Fmax only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fmax"; }
+};
+
+template <typename T, typename R>
+class Fsgnj { // Functor: rv_fsgnj_s / rv_fsgnj_d called with (second, first), chosen by sizeof(T).
+public:
+  static R apply(T first, T second, R) {
+    if (sizeof(T) == 4) {
+      return rv_fsgnj_s(second, first); // presumably second's magnitude with first's sign — confirm against the helper
+    } else if (sizeof(T) == 8) {
+      return rv_fsgnj_d(second, first);
+    } else {
+      std::cout << "Fsgnj only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fsgnj"; }
+};
+
+template <typename T, typename R>
+class Fsgnjn { // Functor: rv_fsgnjn_s / rv_fsgnjn_d called with (second, first), chosen by sizeof(T).
+public:
+  static R apply(T first, T second, R) {
+    if (sizeof(T) == 4) {
+      return rv_fsgnjn_s(second, first); // presumably second's magnitude with first's sign inverted — confirm against the helper
+    } else if (sizeof(T) == 8) {
+      return rv_fsgnjn_d(second, first);
+    } else {
+      std::cout << "Fsgnjn only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fsgnjn"; }
+};
+
+template <typename T, typename R>
+class Fsgnjx { // Functor: rv_fsgnjx_s / rv_fsgnjx_d called with (second, first), chosen by sizeof(T).
+public:
+  static R apply(T first, T second, R) {
+    if (sizeof(T) == 4) {
+      return rv_fsgnjx_s(second, first); // presumably second with its sign XOR-ed with first's — confirm against the helper
+    } else if (sizeof(T) == 8) {
+      return rv_fsgnjx_d(second, first);
+    } else {
+      std::cout << "Fsgnjx only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fsgnjx"; }
+};
+
+template <typename T, typename R>
+class Fcvt { // Functor: conversions; first is the selector matched by the case labels, second is the value converted.
+public:
+  static R apply(T first, T second, R) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    // ignoring rounding mode for now (frm is always 0; the rtz cases pass 1 instead)
+    uint32_t frm = 0;
+    if (sizeof(T) == 4) {
+      switch (first) {
+      case 0b00000: // vfcvt.xu.f.v
+        return rv_ftou_s(second, frm, &fflags);
+      case 0b00001: // vfcvt.x.f.v
+        return rv_ftoi_s(second, frm, &fflags);
+      case 0b00010: // vfcvt.f.xu.v
+        return rv_utof_s(second, frm, &fflags);
+      case 0b00011: // vfcvt.f.x.v
+        return rv_itof_s(second, frm, &fflags);
+      case 0b00110: // vfcvt.rtz.xu.f.v
+        return rv_ftou_s(second, 1, &fflags);
+      case 0b00111: // vfcvt.rtz.x.f.v
+        return rv_ftoi_s(second, 1, &fflags);
+      case 0b01000: // vfwcvt.xu.f.v
+        return rv_ftolu_s(second, frm, &fflags);
+      case 0b01001: // vfwcvt.x.f.v
+        return rv_ftol_s(second, frm, &fflags);
+      case 0b01010: // vfwcvt.f.xu.v
+        return rv_utof_d(second, frm, &fflags);
+      case 0b01011: // vfwcvt.f.x.v
+        return rv_itof_d(second, frm, &fflags);
+      case 0b01100: // vfwcvt.f.f.v
+        return rv_ftod(second);
+      case 0b01110: // vfwcvt.rtz.xu.f.v
+        return rv_ftolu_s(second, 1, &fflags);
+      case 0b01111: // vfwcvt.rtz.x.f.v
+        return rv_ftol_s(second, 1, &fflags);
+      default:
+        std::cout << "Fcvt has unsupported value for first: " << first << std::endl;
+        std::abort();
+      }
+    } else if (sizeof(T) == 8) {
+      switch (first) {
+      case 0b00000: // vfcvt.xu.f.v
+        return rv_ftolu_d(second, frm, &fflags);
+      case 0b00001: // vfcvt.x.f.v
+        return rv_ftol_d(second, frm, &fflags);
+      case 0b00010: // vfcvt.f.xu.v
+        return rv_lutof_d(second, frm, &fflags);
+      case 0b00011: // vfcvt.f.x.v
+        return rv_ltof_d(second, frm, &fflags);
+      case 0b00110: // vfcvt.rtz.xu.f.v
+        return rv_ftolu_d(second, 1, &fflags);
+      case 0b00111: // vfcvt.rtz.x.f.v
+        return rv_ftol_d(second, 1, &fflags);
+      case 0b01000: // vfwcvt.xu.f.v
+      case 0b01001: // vfwcvt.x.f.v
+      case 0b01010: // vfwcvt.f.xu.v
+      case 0b01011: // vfwcvt.f.x.v
+      case 0b01100: // vfwcvt.f.f.v
+      case 0b01110: // vfwcvt.rtz.xu.f.v
+      case 0b01111: // vfwcvt.rtz.x.f.v
+        std::cout << "Fwcvt only supports f32" << std::endl;
+        std::abort();
+      default:
+        std::cout << "Fcvt has unsupported value for first: " << first << std::endl;
+        std::abort();
+      }
+    } else {
+      std::cout << "Fcvt only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static R apply(T first, T second, uint32_t vxrm, uint32_t &) { // saturation argument is unused; vxrm is forwarded as the helpers' rounding-mode argument
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    if (sizeof(T) == 8) {
+      switch (first) {
+      case 0b10000: // vfncvt.xu.f.w
+        return rv_ftou_d(second, vxrm, &fflags);
+      case 0b10001: // vfncvt.x.f.w
+        return rv_ftoi_d(second, vxrm, &fflags);
+      case 0b10010: // vfncvt.f.xu.w
+        return rv_lutof_s(second, vxrm, &fflags);
+      case 0b10011: // vfncvt.f.x.w
+        return rv_ltof_s(second, vxrm, &fflags);
+      case 0b10100: // vfncvt.f.f.w
+        return rv_dtof_r(second, vxrm);
+      case 0b10101: // vfncvt.rod.f.f.w
+        return rv_dtof_r(second, 6);
+      case 0b10110: // vfncvt.rtz.xu.f.w
+        return rv_ftou_d(second, 1, &fflags);
+      case 0b10111: // vfncvt.rtz.x.f.w
+        return rv_ftoi_d(second, 1, &fflags);
+      default:
+        std::cout << "Fncvt has unsupported value for first: " << first << std::endl;
+        std::abort();
+      }
+    } else {
+      std::cout << "Fncvt only supports f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fcvt"; }
+};
+
+template <typename T, typename R>
+class Funary1 { // Functor: unary float operations; first is the selector matched by the case labels, second the operand.
+public:
+  static R apply(T first, T second, R) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    // ignoring rounding mode for now (frm is always 0)
+    uint32_t frm = 0;
+    if (sizeof(T) == 4) {
+      switch (first) {
+      case 0b00000: // vfsqrt.v
+        return rv_fsqrt_s(second, frm, &fflags);
+      case 0b00100: // vfrsqrt7.v
+        return rv_frsqrt7_s(second, frm, &fflags);
+      case 0b00101: // vfrec7.v
+        return rv_frecip7_s(second, frm, &fflags);
+      case 0b10000: // vfclass.v
+        return rv_fclss_s(second);
+      default:
+        std::cout << "Funary1 has unsupported value for first: " << first << std::endl;
+        std::abort();
+      }
+    } else if (sizeof(T) == 8) {
+      switch (first) {
+      case 0b00000: // vfsqrt.v
+        return rv_fsqrt_d(second, frm, &fflags);
+      case 0b00100: // vfrsqrt7.v
+        return rv_frsqrt7_d(second, frm, &fflags);
+      case 0b00101: // vfrec7.v
+        return rv_frecip7_d(second, frm, &fflags);
+      case 0b10000: // vfclass.v
+        return rv_fclss_d(second);
+      default:
+        std::cout << "Funary1 has unsupported value for first: " << first << std::endl;
+        std::abort();
+      }
+    } else {
+      std::cout << "Funary1 only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Funary1"; }
+};
+
+template <typename T, typename R>
+class Xunary0 { // Functor: returns second converted to R; first and third are ignored.
+public:
+  static R apply(T, T second, T) { // NOTE(review): third parameter is typed T, whereas the other functors use R — confirm intentional
+    return second;
+  }
+  static std::string name() { return "Xunary0"; }
+};
+
+template <typename T, typename R>
+class Feq { // Functor: floating-point equality through rv_feq_s / rv_feq_d, called with (second, first).
+public:
+  static R apply(T first, T second, R) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    if (sizeof(T) == 4) {
+      return rv_feq_s(second, first, &fflags);
+    } else if (sizeof(T) == 8) {
+      return rv_feq_d(second, first, &fflags);
+    } else {
+      std::cout << "Feq only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Feq"; }
+};
+
+template <typename T, typename R>
+class Fle { // Functor: rv_fle_s / rv_fle_d called with (second, first), i.e. tests second <= first.
+public:
+  static R apply(T first, T second, R) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    if (sizeof(T) == 4) {
+      return rv_fle_s(second, first, &fflags);
+    } else if (sizeof(T) == 8) {
+      return rv_fle_d(second, first, &fflags);
+    } else {
+      std::cout << "Fle only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fle"; }
+};
+
+template <typename T, typename R>
+class Flt { // Functor: rv_flt_s / rv_flt_d called with (second, first), i.e. tests second < first.
+public:
+  static R apply(T first, T second, R) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    if (sizeof(T) == 4) {
+      return rv_flt_s(second, first, &fflags);
+    } else if (sizeof(T) == 8) {
+      return rv_flt_d(second, first, &fflags);
+    } else {
+      std::cout << "Flt only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Flt"; }
+};
+
+template <typename T, typename R>
+class Fne { // Functor: logical negation of rv_feq_s / rv_feq_d called with (second, first).
+public:
+  static R apply(T first, T second, R) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    if (sizeof(T) == 4) {
+      return !rv_feq_s(second, first, &fflags);
+    } else if (sizeof(T) == 8) {
+      return !rv_feq_d(second, first, &fflags);
+    } else {
+      std::cout << "Fne only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fne"; }
+};
+
+template <typename T, typename R>
+class Fgt { // Functor: rv_flt_s / rv_flt_d called with (first, second), i.e. tests second > first.
+public:
+  static R apply(T first, T second, R) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    if (sizeof(T) == 4) {
+      return rv_flt_s(first, second, &fflags); // "greater than" is expressed as less-than with the arguments swapped
+    } else if (sizeof(T) == 8) {
+      return rv_flt_d(first, second, &fflags);
+    } else {
+      std::cout << "Fgt only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fgt"; }
+};
+
+template <typename T, typename R>
+class Fge { // Functor: rv_fle_s / rv_fle_d called with (first, second), i.e. tests second >= first.
+public:
+  static R apply(T first, T second, R) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    if (sizeof(T) == 4) {
+      return rv_fle_s(first, second, &fflags); // "greater or equal" is expressed as less-or-equal with the arguments swapped
+    } else if (sizeof(T) == 8) {
+      return rv_fle_d(first, second, &fflags);
+    } else {
+      std::cout << "Fge only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fge"; }
+};
+
+template <typename T, typename R>
+class Fdiv { // Functor: rv_fdiv_s / rv_fdiv_d called with (second, first), chosen by sizeof(T).
+public:
+  static R apply(T first, T second, R) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    // ignoring rounding mode for now (frm is always 0)
+    uint32_t frm = 0;
+    if (sizeof(T) == 4) {
+      return rv_fdiv_s(second, first, frm, &fflags);
+    } else if (sizeof(T) == 8) {
+      return rv_fdiv_d(second, first, frm, &fflags);
+    } else {
+      std::cout << "Fdiv only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fdiv"; }
+};
+
+template <typename T, typename R>
+class Frdiv { // Functor: rv_fdiv_s / rv_fdiv_d called with (first, second) — argument order opposite to Fdiv.
+public:
+  static R apply(T first, T second, R) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    // ignoring rounding mode for now (frm is always 0)
+    uint32_t frm = 0;
+    if (sizeof(T) == 4) {
+      return rv_fdiv_s(first, second, frm, &fflags);
+    } else if (sizeof(T) == 8) {
+      return rv_fdiv_d(first, second, frm, &fflags);
+    } else {
+      std::cout << "Frdiv only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Frdiv"; }
+};
+
+template <typename T, typename R>
+class Fmul { // Functor: floating-point multiply through rv_fmul_s / rv_fmul_d, chosen by sizeof(R).
+public:
+  static R apply(T first, T second, R) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    // ignoring rounding mode for now (frm is always 0)
+    uint32_t frm = 0;
+    if (sizeof(R) == 4) {
+      return rv_fmul_s(first, second, frm, &fflags);
+    } else if (sizeof(R) == 8) {
+      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); // operands that are not already 8 bytes go through rv_ftod
+      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
+      return rv_fmul_d(first_d, second_d, frm, &fflags);
+    } else {
+      std::cout << "Fmul only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fmul"; }
+};
+
+template <typename T, typename R>
+class Frsub { // Functor: rv_fsub_s / rv_fsub_d called with (first, second) — argument order opposite to Fsub.
+public:
+  static R apply(T first, T second, R) {
+    // ignoring flags for now (fflags is a local that is discarded)
+    uint32_t fflags = 0;
+    // ignoring rounding mode for now (frm is always 0)
+    uint32_t frm = 0;
+    if (sizeof(T) == 4) {
+      return rv_fsub_s(first, second, frm, &fflags);
+    } else if (sizeof(T) == 8) {
+      return rv_fsub_d(first, second, frm, &fflags);
+    } else {
+      std::cout << "Frsub only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Frsub"; }
+};
+
+template <typename T, typename R>
+class Clip { // Functor: second shifted right by first with rounding, then clamped to the range of R.
+public:
+  static R apply(T first, T second, uint32_t vxrm, uint32_t &vxsat_) {
+    // The low lg2(2*SEW) bits of the vector or scalar shift-amount value (e.g., the low 6 bits for a SEW=64-bit to
+    // SEW=32-bit narrowing operation) are used to control the right shift amount, which provides the scaling.
+    R firstValid = first & (sizeof(T) * 8 - 1);
+    T unclippedResult = (second >> firstValid) + roundBit(second, firstValid, vxrm);
+    R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits<R>::min(), (T)std::numeric_limits<R>::max());
+    vxsat_ |= clippedResult != unclippedResult; // OR-ed in: the flag is only ever set here, never cleared
+    return clippedResult;
+  }
+  static std::string name() { return "Clip"; }
+};
+
+template <typename T, typename R>
+class Smul { // Functor: (first * second) shifted right by (bit width of R) - 1 with rounding, then clamped to R.
+public:
+  static R apply(T first, T second, uint32_t vxrm, uint32_t &vxsat_) {
+    R shift = sizeof(R) * 8 - 1;
+    T unshiftedResult = first * second; // NOTE(review): computed in T — presumably T is wide enough to hold the full product; confirm
+    T unclippedResult = (unshiftedResult >> shift) + roundBit(unshiftedResult, shift, vxrm);
+    R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits<R>::min(), (T)std::numeric_limits<R>::max());
+    vxsat_ |= clippedResult != unclippedResult; // OR-ed in: the flag is only ever set here, never cleared
+    return clippedResult;
+  }
+  static std::string name() { return "Smul"; }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Returns true when element `byteI` must be skipped: masking is active
+// (vmask is false) and bit `byteI` of register `maskVreg` is 0.
+// Despite its name, `byteI` is used as a bit index into the mask register.
+bool isMasked(std::vector<std::vector<Byte>> &vreg_file, uint32_t maskVreg, uint32_t byteI, bool vmask) {
+  auto &maskReg = vreg_file.at(maskVreg);
+  // Fetch the byte holding the bit, then isolate the bit itself.
+  uint8_t maskByte = *(uint8_t *)(maskReg.data() + byteI / 8);
+  uint8_t maskBit = (maskByte >> (byteI % 8)) & 0x1;
+  DP(4, "Masking enabled: " << +!vmask << " mask element: " << +maskBit);
+  if (vmask) {
+    return false;
+  }
+  return maskBit == 0;
+}
+
+// Maps element index `byteI` (elements of type DT) of a register group that
+// starts at `baseVreg` to the register number holding it, wrapping modulo 32.
+template <typename DT>
+uint32_t getVreg(uint32_t baseVreg, uint32_t byteI) {
+  uint32_t elemBits = sizeof(DT) * 8;
+  uint32_t elemsPerReg = VLEN / elemBits;
+  uint32_t regOffset = byteI / elemsPerReg;
+  return (baseVreg + regOffset) % 32;
+}
+
+// Returns a reference to the DT-sized element inside a single register's
+// byte storage. `byteI` is an element index; it is reduced modulo the number
+// of DT elements that fit in one register before being turned into a byte offset.
+template <typename DT>
+DT &getVregData(std::vector<vortex::Byte> &baseVregVec, uint32_t byteI) {
+  uint32_t elemBits = sizeof(DT) * 8;
+  uint32_t slot = byteI % (VLEN / elemBits);
+  uint32_t byteOffset = slot * elemBits / 8;
+  return *(DT *)(baseVregVec.data() + byteOffset);
+}
+
+// Returns a reference to element `byteI` (of type DT) of the register group
+// starting at `baseVreg`: the register is selected with getVreg<DT>() and the
+// element is then located inside that register.
+template <typename DT>
+DT &getVregData(std::vector<std::vector<vortex::Byte>> &vreg_file, uint32_t baseVreg, uint32_t byteI) {
+  uint32_t regIndex = getVreg<DT>(baseVreg, byteI);
+  return getVregData<DT>(vreg_file.at(regIndex), byteI);
+}
+
+// Unit-stride / strided (optionally segmented) vector load of DT elements.
+//
+//   vreg_file  vector register file (one byte vector per register)
+//   emul_      emulator used for the data-cache reads
+//   base_addr  base memory address
+//   rdest      first destination register; field f is written to the group at
+//              rdest + f * emul
+//   vl         number of elements (segments) to process
+//   strided    selects how the per-element address is derived from `stride`
+//   stride     byte stride applied per segment (strided) or per flat index
+//   nfields    number of fields per segment
+//   lmul       encoded LMUL; values with bit 2 set map to emul = 1
+//   vmask      non-zero disables masking; otherwise bits of v0 gate each segment
+//
+// Aborts when nfields * emul exceeds 8.
+template <typename DT>
+void vector_op_vix_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rdest, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
+  uint32_t vsew = sizeof(DT) * 8;
+  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
+  if (nfields * emul > 8) {
+    // Print the product that was actually checked (emul, not the raw lmul encoding).
+    std::cout << "NFIELDS * EMUL = " << nfields * emul << " but it should be <= 8" << std::endl;
+    std::abort();
+  }
+  for (uint32_t i = 0; i < vl * nfields; i++) {
+    // The mask is indexed per segment, so all fields of a segment share one bit.
+    if (isMasked(vreg_file, 0, i / nfields, vmask))
+      continue;
+
+    uint32_t nfields_strided = strided ? nfields : 1;
+    // NOTE(review): the base address is masked with 0xFFFFFFFC here while the
+    // store paths use base_addr unmodified - confirm this is intended.
+    Word mem_addr = (base_addr & 0xFFFFFFFC) + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT);
+    // Use a 64-bit scratch buffer: up to 8 bytes are read (vsew = 64), which
+    // must never exceed the size of the variable regardless of the width of Word.
+    uint64_t mem_data = 0;
+    emul_->dcache_read(&mem_data, mem_addr, vsew / 8);
+    DP(4, "Loading data " << mem_data << " from: " << mem_addr << " to vec reg: " << getVreg<DT>(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
+    DT &result = getVregData<DT>(vreg_file, rdest + (i % nfields) * emul, i / nfields);
+    DP(4, "Previous data: " << +result);
+    result = (DT)mem_data;
+  }
+}
+
+// Selects the element type matching `vsew` (in bits) and forwards to the
+// typed unit-stride/strided load. Aborts on an unsupported element width.
+void vector_op_vix_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rdest, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vix_load<uint8_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
+  } else if (vsew == 16) {
+    vector_op_vix_load<uint16_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
+  } else if (vsew == 32) {
+    vector_op_vix_load<uint32_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
+  } else if (vsew == 64) {
+    vector_op_vix_load<uint64_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
+  } else {
+    std::cout << "Failed to execute VLE for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Indexed (optionally segmented) vector load of DT elements.
+//
+//   vreg_file  vector register file (one byte vector per register)
+//   emul_      emulator used for the data-cache reads
+//   base_addr  base memory address
+//   rsrc1      register group holding the per-segment byte offsets
+//   rdest      first destination register; field f is written to the group at
+//              rdest + f * emul
+//   iSew       width in bits of each offset element (8/16/32/64)
+//   vl         number of elements (segments) to process
+//   nfields    number of fields per segment
+//   lmul       encoded LMUL; values with bit 2 set map to emul = 1
+//   vmask      non-zero disables masking; otherwise bits of v0 gate each segment
+//
+// Aborts when nfields * emul exceeds 8 or when iSew is unsupported.
+template <typename DT>
+void vector_op_vv_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rdest, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
+  uint32_t vsew = sizeof(DT) * 8;
+  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
+  if (nfields * emul > 8) {
+    // Print the product that was actually checked (emul, not the raw lmul encoding).
+    std::cout << "NFIELDS * EMUL = " << nfields * emul << " but it should be <= 8" << std::endl;
+    std::abort();
+  }
+  for (uint32_t i = 0; i < vl * nfields; i++) {
+    // The mask is indexed per segment, so all fields of a segment share one bit.
+    if (isMasked(vreg_file, 0, i / nfields, vmask))
+      continue;
+
+    // Fetch the offset for this segment, read as an unsigned value of width iSew.
+    Word offset = 0;
+    switch (iSew) {
+    case 8:
+      offset = getVregData<uint8_t>(vreg_file, rsrc1, i / nfields);
+      break;
+    case 16:
+      offset = getVregData<uint16_t>(vreg_file, rsrc1, i / nfields);
+      break;
+    case 32:
+      offset = getVregData<uint32_t>(vreg_file, rsrc1, i / nfields);
+      break;
+    case 64:
+      offset = getVregData<uint64_t>(vreg_file, rsrc1, i / nfields);
+      break;
+    default:
+      std::cout << "Unsupported iSew: " << iSew << std::endl;
+      std::abort();
+    }
+
+    // NOTE(review): the base address is masked with 0xFFFFFFFC here while the
+    // store paths use base_addr unmodified - confirm this is intended.
+    Word mem_addr = (base_addr & 0xFFFFFFFC) + offset + (i % nfields) * sizeof(DT);
+    // Use a 64-bit scratch buffer: up to 8 bytes are read (vsew = 64), which
+    // must never exceed the size of the variable regardless of the width of Word.
+    uint64_t mem_data = 0;
+    emul_->dcache_read(&mem_data, mem_addr, vsew / 8);
+    DP(4, "VLUX/VLOX - Loading data " << mem_data << " from: " << mem_addr << " with offset: " << std::dec << offset << " to vec reg: " << getVreg<DT>(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
+    DT &result = getVregData<DT>(vreg_file, rdest + (i % nfields) * emul, i / nfields);
+    DP(4, "Previous data: " << +result);
+    result = (DT)mem_data;
+  }
+}
+
+// Selects the element type matching `vsew` (in bits) and forwards to the
+// typed indexed load. Aborts on an unsupported element width.
+void vector_op_vv_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv_load<uint8_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv_load<uint16_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv_load<uint32_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
+  } else if (vsew == 64) {
+    vector_op_vv_load<uint64_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
+  } else {
+    std::cout << "Failed to execute VLUX/VLOX for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Unit-stride / strided (optionally segmented) vector store of DT elements.
+//
+//   vreg_file  vector register file (one byte vector per register)
+//   emul_      emulator used for the data-cache writes
+//   base_addr  base memory address
+//   rsrc3      first source register; field f is read from the group at
+//              rsrc3 + f * emul
+//   vl         number of elements (segments) to process
+//   strided    selects how the per-element address is derived from `stride`
+//   stride     byte stride applied per segment (strided) or per flat index
+//   nfields    number of fields per segment
+//   lmul       encoded LMUL; values with bit 2 set map to emul = 1
+//   vmask      non-zero disables masking; otherwise bits of v0 gate each segment
+template <typename DT>
+void vector_op_vix_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc3, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
+  uint32_t vsew = sizeof(DT) * 8;
+  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
+  for (uint32_t i = 0; i < vl * nfields; i++) {
+    // The mask is indexed per segment, so all fields of a segment share one bit.
+    if (isMasked(vreg_file, 0, i / nfields, vmask))
+      continue;
+
+    uint32_t nfields_strided = strided ? nfields : 1;
+    Word mem_addr = base_addr + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT);
+    // Stage the element in a 64-bit buffer: up to 8 bytes are written
+    // (vsew = 64), so the staging variable must hold the full element and be
+    // at least that large regardless of the width of Word.
+    uint64_t mem_data = getVregData<DT>(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields);
+    DP(4, "Storing: " << std::hex << mem_data << " at: " << mem_addr << " from vec reg: " << getVreg<DT>(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
+    emul_->dcache_write(&mem_data, mem_addr, vsew / 8);
+  }
+}
+
+// Selects the element type matching `vsew` (in bits) and forwards to the
+// typed unit-stride/strided store. Aborts on an unsupported element width.
+void vector_op_vix_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc3, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vix_store<uint8_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
+  } else if (vsew == 16) {
+    vector_op_vix_store<uint16_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
+  } else if (vsew == 32) {
+    vector_op_vix_store<uint32_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
+  } else if (vsew == 64) {
+    vector_op_vix_store<uint64_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
+  } else {
+    std::cout << "Failed to execute VSE for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Indexed (optionally segmented) vector store of DT elements.
+//
+//   vreg_file  vector register file (one byte vector per register)
+//   emul_      emulator used for the data-cache writes
+//   base_addr  base memory address
+//   rsrc1      register group holding the per-segment byte offsets
+//   rsrc3      first source register; field f is read from the group at
+//              rsrc3 + f * emul
+//   iSew       width in bits of each offset element (8/16/32/64)
+//   vl         number of elements (segments) to process
+//   nfields    number of fields per segment
+//   lmul       encoded LMUL; values with bit 2 set map to emul = 1
+//   vmask      non-zero disables masking; otherwise bits of v0 gate each segment
+//
+// Aborts when iSew is unsupported.
+template <typename DT>
+void vector_op_vv_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rsrc3, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
+  uint32_t vsew = sizeof(DT) * 8;
+  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
+  for (uint32_t i = 0; i < vl * nfields; i++) {
+    // The mask is indexed per segment, so all fields of a segment share one bit.
+    if (isMasked(vreg_file, 0, i / nfields, vmask))
+      continue;
+
+    // Fetch the offset for this segment, read as an unsigned value of width iSew.
+    Word offset = 0;
+    switch (iSew) {
+    case 8:
+      offset = getVregData<uint8_t>(vreg_file, rsrc1, i / nfields);
+      break;
+    case 16:
+      offset = getVregData<uint16_t>(vreg_file, rsrc1, i / nfields);
+      break;
+    case 32:
+      offset = getVregData<uint32_t>(vreg_file, rsrc1, i / nfields);
+      break;
+    case 64:
+      offset = getVregData<uint64_t>(vreg_file, rsrc1, i / nfields);
+      break;
+    default:
+      std::cout << "Unsupported iSew: " << iSew << std::endl;
+      std::abort();
+    }
+
+    Word mem_addr = base_addr + offset + (i % nfields) * sizeof(DT);
+    // Stage the element in a 64-bit buffer: up to 8 bytes are written
+    // (vsew = 64), so the staging variable must hold the full element and be
+    // at least that large regardless of the width of Word.
+    uint64_t mem_data = getVregData<DT>(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields);
+    DP(4, "VSUX/VSOX - Storing: " << std::hex << mem_data << " at: " << mem_addr << " with offset: " << std::dec << offset << " from vec reg: " << getVreg<DT>(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
+    emul_->dcache_write(&mem_data, mem_addr, vsew / 8);
+  }
+}
+
+// Selects the element type matching `vsew` (in bits) and forwards to the
+// typed indexed store. Aborts on an unsupported element width.
+void vector_op_vv_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rsrc3, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv_store<uint8_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv_store<uint16_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv_store<uint32_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
+  } else if (vsew == 64) {
+    vector_op_vv_store<uint64_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
+  } else {
+    std::cout << "Failed to execute VSUX/VSOX for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Element-wise vector/scalar operation: for every unmasked element below vl,
+// computes OP(first, rsrc0[i], rdest[i]) and writes the result to rdest[i].
+// Masked-off elements are left untouched.
+template <template <typename DT1, typename DT2> class OP, typename DT>
+void vector_op_vix(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    if (!isMasked(vreg_file, 0, idx, vmask)) {
+      DT second = getVregData<DT>(vreg_file, rsrc0, idx);
+      DT &destElem = getVregData<DT>(vreg_file, rdest, idx);
+      // The previous destination value is the third operand.
+      DT third = destElem;
+      DT result = OP<DT, DT>::apply(first, second, third);
+      DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+      destElem = result;
+    }
+  }
+}
+
+// Picks the element type for `vsew` (in bits) out of DT8..DT64 and runs the
+// typed vector/scalar operation. Aborts on an unsupported element width.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vix<OP, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vix<OP, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vix<OP, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 64) {
+    vector_op_vix<OP, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else {
+    std::cout << "Failed to execute VI/VX for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Vector/scalar operation with carry-in: every element below vl is processed
+// (no element is skipped), and the third operand is the v0 bit for that
+// element (true when the bit is set).
+template <template <typename DT1, typename DT2> class OP, typename DT>
+void vector_op_vix_carry(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    DT second = getVregData<DT>(vreg_file, rsrc0, idx);
+    // With masking forced on, isMasked() is true exactly when the v0 bit is 0.
+    bool third = true;
+    if (isMasked(vreg_file, 0, idx, false)) {
+      third = false;
+    }
+    DT result = OP<DT, DT>::apply(first, second, third);
+    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+    getVregData<DT>(vreg_file, rdest, idx) = result;
+  }
+}
+
+// Picks the element type for `vsew` (in bits) out of DT8..DT64 and runs the
+// typed carry-in operation. Aborts on an unsupported element width.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix_carry(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl) {
+  if (vsew == 8) {
+    vector_op_vix_carry<OP, DT8>(src1, vreg_file, rsrc0, rdest, vl);
+  } else if (vsew == 16) {
+    vector_op_vix_carry<OP, DT16>(src1, vreg_file, rsrc0, rdest, vl);
+  } else if (vsew == 32) {
+    vector_op_vix_carry<OP, DT32>(src1, vreg_file, rsrc0, rdest, vl);
+  } else if (vsew == 64) {
+    vector_op_vix_carry<OP, DT64>(src1, vreg_file, rsrc0, rdest, vl);
+  } else {
+    std::cout << "Failed to execute VI/VX carry for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Vector/scalar carry-out: for every element below vl, evaluates
+// OP<DT, DTR>(first, rsrc0[i], carry_in) as a boolean and stores it as bit i
+// of rdest. carry_in is the v0 bit when masking is active (vmask == 0) and
+// false otherwise.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vix_carry_out(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    DT second = getVregData<DT>(vreg_file, rsrc0, idx);
+    // v0 is only consulted when masking is active.
+    bool third = false;
+    if (!vmask) {
+      third = !isMasked(vreg_file, 0, idx, vmask);
+    }
+    bool result = OP<DT, DTR>::apply(first, second, third);
+    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+    // Set or clear bit idx of the destination mask register.
+    uint8_t &maskByte = getVregData<uint8_t>(vreg_file, rdest, idx / 8);
+    if (result) {
+      maskByte |= 1 << (idx % 8);
+    } else {
+      maskByte &= ~(1 << (idx % 8));
+    }
+  }
+}
+
+// Picks the element type for `vsew` (in bits) and pairs it with the next
+// wider type (DT8->DT16 ... DT64->DT128) before running the typed carry-out
+// operation. Aborts on an unsupported element width.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64, typename DT128>
+void vector_op_vix_carry_out(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vix_carry_out<OP, DT8, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vix_carry_out<OP, DT16, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vix_carry_out<OP, DT32, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 64) {
+    vector_op_vix_carry_out<OP, DT64, DT128>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else {
+    std::cout << "Failed to execute VI/VX carry out for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Merge: for every element below vl, rdest[i] receives rsrc0[i] when the
+// element is masked off and the scalar `first` otherwise.
+template <typename DT>
+void vector_op_vix_merge(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    DT result;
+    if (isMasked(vreg_file, 0, idx, vmask)) {
+      result = getVregData<DT>(vreg_file, rsrc0, idx);
+    } else {
+      result = first;
+    }
+    DP(4, "Merge - Choosing result: " << +result);
+    getVregData<DT>(vreg_file, rdest, idx) = result;
+  }
+}
+
+// Picks the element type for `vsew` (in bits) out of DT8..DT64 and runs the
+// typed merge. Aborts on an unsupported element width.
+template <typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix_merge(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vix_merge<DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vix_merge<DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vix_merge<DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 64) {
+    vector_op_vix_merge<DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else {
+    std::cout << "Failed to execute VI/VX for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Copies element 0 of register rsrc1, read as an unsigned value of width
+// `vsew` bits, into the scalar `dest`. rsrc0 must be 0; any other value, or an
+// unsupported vsew, aborts.
+template <typename DT>
+void vector_op_scalar(DT &dest, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t vsew) {
+  if (rsrc0 != 0) {
+    std::cout << "Vwxunary0/Vwfunary0 has unsupported value for vs2: " << rsrc0 << std::endl;
+    std::abort();
+  }
+  if (vsew == 8) {
+    dest = getVregData<uint8_t>(vreg_file, rsrc1, 0);
+  } else if (vsew == 16) {
+    dest = getVregData<uint16_t>(vreg_file, rsrc1, 0);
+  } else if (vsew == 32) {
+    dest = getVregData<uint32_t>(vreg_file, rsrc1, 0);
+  } else if (vsew == 64) {
+    dest = getVregData<uint64_t>(vreg_file, rsrc1, 0);
+  } else {
+    std::cout << "Failed to execute vmv.x.s/vfmv.f.s for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Widening vector/scalar operation: sources are read as DT, while the
+// destination (also used as the third operand) is read and written as DTR.
+// Masked-off elements are left untouched.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vix_w(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    if (!isMasked(vreg_file, 0, idx, vmask)) {
+      DT second = getVregData<DT>(vreg_file, rsrc0, idx);
+      DTR &destElem = getVregData<DTR>(vreg_file, rdest, idx);
+      // The previous destination value is the third operand.
+      DTR third = destElem;
+      DTR result = OP<DT, DTR>::apply(first, second, third);
+      DP(4, "Widening " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+      destElem = result;
+    }
+  }
+}
+
+// Picks the source type for `vsew` (in bits) and pairs it with the next wider
+// destination type (DT8->DT16, DT16->DT32, DT32->DT64). Only 8/16/32 are
+// accepted; anything else aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix_w(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vix_w<OP, DT8, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vix_w<OP, DT16, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vix_w<OP, DT32, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else {
+    std::cout << "Failed to execute VI/VX widening for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Runs the plain vector/scalar operation on elements of twice the width of
+// `vsew` (8->DT16, 16->DT32, 32->DT64). Only 8/16/32 are accepted; anything
+// else aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix_wx(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vix<OP, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vix<OP, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vix<OP, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else {
+    std::cout << "Failed to execute VI/VX widening wx for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Narrowing vector/scalar operation: the source is read as the wide type DT
+// and OP<DT, DTR> produces the narrow DTR result written to rdest. vxrm and
+// vxsat are forwarded to OP. Masked-off elements are left untouched.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vix_n(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    if (!isMasked(vreg_file, 0, idx, vmask)) {
+      DT second = getVregData<DT>(vreg_file, rsrc0, idx);
+      DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
+      DP(4, "Narrowing " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
+      getVregData<DTR>(vreg_file, rdest, idx) = result;
+    }
+  }
+}
+
+// Picks the destination type for `vsew` (in bits) and pairs it with the next
+// wider source type (DT16->DT8, DT32->DT16, DT64->DT32). Only 8/16/32 are
+// accepted; anything else aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix_n(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  if (vsew == 8) {
+    vector_op_vix_n<OP, DT16, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 16) {
+    vector_op_vix_n<OP, DT32, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 32) {
+    vector_op_vix_n<OP, DT64, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  } else {
+    std::cout << "Failed to execute VI/VX narrowing for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Saturating vector/scalar operation: the source element is read as DTR and
+// converted to the computation type DT, OP<DT, DTR> produces a DTR result
+// that is written to rdest. vxrm and vxsat are forwarded to OP. Masked-off
+// elements are left untouched.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vix_sat(DTR first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    if (!isMasked(vreg_file, 0, idx, vmask)) {
+      DT second = getVregData<DTR>(vreg_file, rsrc0, idx);
+      DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
+      DP(4, "Saturating " << (OP<DT, DTR>::name()) << "(" << +(DTR)first << ", " << +(DTR)second << ")" << " = " << +(DTR)result);
+      getVregData<DTR>(vreg_file, rdest, idx) = result;
+    }
+  }
+}
+
+// Picks the element type for `vsew` (in bits) and pairs it with the next
+// wider computation type (DT16 for 8 ... DT128 for 64) before running the
+// typed saturating operation. Aborts on an unsupported element width.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64, typename DT128>
+void vector_op_vix_sat(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  if (vsew == 8) {
+    vector_op_vix_sat<OP, DT16, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 16) {
+    vector_op_vix_sat<OP, DT32, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 32) {
+    vector_op_vix_sat<OP, DT64, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 64) {
+    vector_op_vix_sat<OP, DT128, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  } else {
+    std::cout << "Failed to execute VI/VX saturating for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Picks the element type for `vsew` (in bits) and runs the typed saturating
+// operation with the same type for computation and result. Aborts on an
+// unsupported element width.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix_scale(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  if (vsew == 8) {
+    vector_op_vix_sat<OP, DT8, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 16) {
+    vector_op_vix_sat<OP, DT16, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 32) {
+    vector_op_vix_sat<OP, DT32, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 64) {
+    vector_op_vix_sat<OP, DT64, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  } else {
+    std::cout << "Failed to execute VI/VX scale for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Integer extension dispatch. `vsew` (in bits) is the destination element
+// width and `src1` selects signedness and source width (vf2/vf4/vf8). The
+// matching widening operation is run with unsigned types for vzext and signed
+// types for vsext. Unsupported selectors or element widths abort.
+template <template <typename DT1, typename DT2> class OP>
+void vector_op_vix_ext(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  // Shared failure path for a selector that is not valid at the current vsew.
+  auto rejectSelector = [&]() {
+    std::cout << "Xunary0 has unsupported value for vf: " << src1 << std::endl;
+    std::abort();
+  };
+
+  switch (vsew) {
+  case 16:
+    if (src1 == 0b00110) { // vzext.vf2
+      vector_op_vix_w<OP, uint8_t, uint16_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    } else if (src1 == 0b00111) { // vsext.vf2
+      vector_op_vix_w<OP, int8_t, int16_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    } else {
+      rejectSelector();
+    }
+    break;
+  case 32:
+    if (src1 == 0b00100) { // vzext.vf4
+      vector_op_vix_w<OP, uint8_t, uint32_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    } else if (src1 == 0b00101) { // vsext.vf4
+      vector_op_vix_w<OP, int8_t, int32_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    } else if (src1 == 0b00110) { // vzext.vf2
+      vector_op_vix_w<OP, uint16_t, uint32_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    } else if (src1 == 0b00111) { // vsext.vf2
+      vector_op_vix_w<OP, int16_t, int32_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    } else {
+      rejectSelector();
+    }
+    break;
+  case 64:
+    if (src1 == 0b00010) { // vzext.vf8
+      vector_op_vix_w<OP, uint8_t, uint64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    } else if (src1 == 0b00011) { // vsext.vf8
+      vector_op_vix_w<OP, int8_t, int64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    } else if (src1 == 0b00100) { // vzext.vf4
+      vector_op_vix_w<OP, uint16_t, uint64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    } else if (src1 == 0b00101) { // vsext.vf4
+      vector_op_vix_w<OP, int16_t, int64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    } else if (src1 == 0b00110) { // vzext.vf2
+      vector_op_vix_w<OP, uint32_t, uint64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    } else if (src1 == 0b00111) { // vsext.vf2
+      vector_op_vix_w<OP, int32_t, int64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    } else {
+      rejectSelector();
+    }
+    break;
+  default:
+    std::cout << "Failed to execute Xunary0 for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Compare-to-mask: for every unmasked element below vl, evaluates
+// OP<DT, bool>(first, rsrc0[i], 0) and stores the boolean as bit i of rdest.
+// Bits of masked-off elements are left untouched.
+template <template <typename DT1, typename DT2> class OP, typename DT>
+void vector_op_vix_mask(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    if (!isMasked(vreg_file, 0, idx, vmask)) {
+      DT second = getVregData<DT>(vreg_file, rsrc0, idx);
+      bool result = OP<DT, bool>::apply(first, second, 0);
+      DP(4, "Integer/float compare mask " << (OP<DT, bool>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
+      // Set or clear bit idx of the destination mask register.
+      uint8_t &maskByte = getVregData<uint8_t>(vreg_file, rdest, idx / 8);
+      if (result) {
+        maskByte |= 1 << (idx % 8);
+      } else {
+        maskByte &= ~(1 << (idx % 8));
+      }
+    }
+  }
+}
+
+// Picks the element type for `vsew` (in bits) out of DT8..DT64 and runs the
+// typed compare-to-mask operation. Aborts on an unsupported element width.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix_mask(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vix_mask<OP, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vix_mask<OP, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vix_mask<OP, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 64) {
+    vector_op_vix_mask<OP, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else {
+    std::cout << "Failed to execute VI/VX integer/float compare mask for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Slide up/down by `first` elements, or by one element with scalar insertion.
+//
+//   first   slide amount, or the scalar to insert when `scalar` is set
+//   vlmax   non-zero selects slide-down (and bounds the source index);
+//           zero selects slide-up
+//   scalar  when set, `first` is written to the vacated end position
+//           (vl - 1 for slide-down, 0 for slide-up) and the slide amount is 1
+//
+// Masked-off elements are left untouched.
+template <typename DT>
+void vector_op_vix_slide(Word first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, Word vlmax, uint32_t vmask, bool scalar) {
+  // A non-zero vlmax means slide-down; slide-up does not need vlmax.
+  bool slidingDown = (vlmax != 0);
+
+  // Scalar variants place the scalar at the end that the slide vacates.
+  if (scalar && vl) {
+    uint32_t scalarSlot = slidingDown ? vl - 1 : 0;
+    if (!isMasked(vreg_file, 0, scalarSlot, vmask)) {
+      DP(4, "Slide - Moving scalar value " << +first << " to position " << +scalarSlot);
+      getVregData<DT>(vreg_file, rdest, scalarSlot) = first;
+    }
+  }
+
+  // From here on only the slide distance matters: 1 for the scalar variants.
+  Word amount = scalar ? 1 : first;
+  // Slide-up starts at `amount`; scalar slide-down stops before the scalar slot.
+  Word begin = slidingDown ? 0 : amount;
+  uint32_t end = vl - (scalar && vl && slidingDown);
+
+  for (Word i = begin; i < end; i++) {
+    if (isMasked(vreg_file, 0, i, vmask))
+      continue;
+
+    // 128-bit arithmetic prevents overflow/underflow of the source index.
+    __uint128_t iSrc = (__uint128_t)i;
+    if (slidingDown) {
+      iSrc += (__uint128_t)amount;
+    } else {
+      iSrc -= (__uint128_t)amount;
+    }
+    // Slide-down sources at or beyond vlmax read as zero.
+    DT value = 0;
+    if (!slidingDown || iSrc < vlmax) {
+      value = getVregData<DT>(vreg_file, rsrc0, iSrc);
+    }
+    DP(4, "Slide - Moving value " << +value << " from position " << (uint64_t)iSrc << " to position " << +i);
+    getVregData<DT>(vreg_file, rdest, i) = value;
+  }
+}
+
+// Picks the element type for `vsew` (in bits) out of DT8..DT64 and runs the
+// typed slide. Aborts on an unsupported element width.
+template <typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix_slide(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, Word vlmax, uint32_t vmask, bool scalar) {
+  if (vsew == 8) {
+    vector_op_vix_slide<DT8>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask, scalar);
+  } else if (vsew == 16) {
+    vector_op_vix_slide<DT16>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask, scalar);
+  } else if (vsew == 32) {
+    vector_op_vix_slide<DT32>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask, scalar);
+  } else if (vsew == 64) {
+    vector_op_vix_slide<DT64>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask, scalar);
+  } else {
+    std::cout << "Failed to execute VI/VX slide for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Register gather with a scalar index: every unmasked element below vl
+// receives rsrc0[first], or 0 when `first` is not below vlmax.
+template <typename DT>
+void vector_op_vix_gather(Word first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, Word vlmax, uint32_t vmask) {
+  for (Word idx = 0; idx < vl; ++idx) {
+    if (!isMasked(vreg_file, 0, idx, vmask)) {
+      // Out-of-range indices yield zero.
+      DT value = 0;
+      if (first < vlmax) {
+        value = getVregData<DT>(vreg_file, rsrc0, first);
+      }
+      DP(4, "Register gather - Moving value " << +value << " from position " << +first << " to position " << +idx);
+      getVregData<DT>(vreg_file, rdest, idx) = value;
+    }
+  }
+}
+
+// Picks the element type for `vsew` (in bits) out of DT8..DT64 and runs the
+// typed register gather. Aborts on an unsupported element width.
+template <typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix_gather(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, Word vlmax, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vix_gather<DT8>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask);
+  } else if (vsew == 16) {
+    vector_op_vix_gather<DT16>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask);
+  } else if (vsew == 32) {
+    vector_op_vix_gather<DT32>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask);
+  } else if (vsew == 64) {
+    vector_op_vix_gather<DT64>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask);
+  } else {
+    std::cout << "Failed to execute VI/VX register gather for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Element-wise vector/vector operation: for every unmasked element below vl,
+// computes OP(rsrc0[i], rsrc1[i], rdest[i]) and writes the result to rdest[i].
+// Masked-off elements are left untouched.
+template <template <typename DT1, typename DT2> class OP, typename DT>
+void vector_op_vv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    if (!isMasked(vreg_file, 0, idx, vmask)) {
+      DT first = getVregData<DT>(vreg_file, rsrc0, idx);
+      DT second = getVregData<DT>(vreg_file, rsrc1, idx);
+      DT &destElem = getVregData<DT>(vreg_file, rdest, idx);
+      // The previous destination value is the third operand.
+      DT third = destElem;
+      DT result = OP<DT, DT>::apply(first, second, third);
+      DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+      destElem = result;
+    }
+  }
+}
+
+// Picks the element type for `vsew` (in bits) out of DT8..DT64 and runs the
+// typed vector/vector operation. Aborts on an unsupported element width.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv<OP, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv<OP, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv<OP, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 64) {
+    vector_op_vv<OP, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else {
+    std::cout << "Failed to execute VV for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Vector/vector operation with carry-in: every element below vl is processed
+// (no element is skipped), and the third operand is the v0 bit for that
+// element (true when the bit is set).
+template <template <typename DT1, typename DT2> class OP, typename DT>
+void vector_op_vv_carry(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    DT first = getVregData<DT>(vreg_file, rsrc0, idx);
+    DT second = getVregData<DT>(vreg_file, rsrc1, idx);
+    // With masking forced on, isMasked() is true exactly when the v0 bit is 0.
+    bool third = true;
+    if (isMasked(vreg_file, 0, idx, false)) {
+      third = false;
+    }
+    DT result = OP<DT, DT>::apply(first, second, third);
+    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+    getVregData<DT>(vreg_file, rdest, idx) = result;
+  }
+}
+
+// Width dispatcher for the carry-in VV operations.
+// Selects the element type that matches vsew (8/16/32/64 -> DT8/DT16/DT32/DT64) and forwards the
+// register indices and vl unchanged to the typed vector_op_vv_carry overload.
+// Any other vsew is reported on stdout and the process is terminated with std::abort().
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_carry(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl) {
+  if (vsew == 8) {
+    vector_op_vv_carry<OP, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl);
+  } else if (vsew == 16) {
+    vector_op_vv_carry<OP, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl);
+  } else if (vsew == 32) {
+    vector_op_vv_carry<OP, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl);
+  } else if (vsew == 64) {
+    vector_op_vv_carry<OP, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl);
+  } else {
+    // Unsupported element width.
+    std::cout << "Failed to execute VV carry for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Carry-out producer: for each idx < vl, the boolean OP::apply(rsrc0[idx], rsrc1[idx], carry_in) is stored in
+// bit idx of rdest (packed 8 per byte); carry_in is true only when vmask == 0 and isMasked() is false for register 0.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vv_carry_out(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    DT lhs = getVregData<DT>(vreg_file, rsrc0, idx);
+    DT rhs = getVregData<DT>(vreg_file, rsrc1, idx);
+    bool carry_in = !vmask && !isMasked(vreg_file, 0, idx, vmask);
+    bool carry_out = OP<DT, DTR>::apply(lhs, rhs, carry_in);
+    DP(4, (OP<DT, DT>::name()) << "(" << +lhs << ", " << +rhs << ", " << +carry_in << ")" << " = " << +carry_out);
+    uint8_t &mask_byte = getVregData<uint8_t>(vreg_file, rdest, idx / 8);
+    const int bit = 1 << (idx % 8);
+    mask_byte = carry_out ? (mask_byte | bit) : (mask_byte & ~bit);
+  }
+}
+
+// Width dispatcher for the carry-out VV operations.
+// For vsew = 8/16/32/64 it instantiates the typed overload with the element type and the next wider
+// type (DT8+DT16, DT16+DT32, DT32+DT64, DT64+DT128), forwarding every other argument unchanged.
+// Any other vsew is reported on stdout and the process is terminated with std::abort().
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64, typename DT128>
+void vector_op_vv_carry_out(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv_carry_out<OP, DT8, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv_carry_out<OP, DT16, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv_carry_out<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 64) {
+    vector_op_vv_carry_out<OP, DT64, DT128>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else {
+    // Unsupported element width.
+    std::cout << "Failed to execute VV carry out for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Merge: rdest[idx] takes rsrc1[idx] when isMasked() flags the element for register 0, otherwise rsrc0[idx].
+template <typename DT>
+void vector_op_vv_merge(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    DT picked = getVregData<DT>(vreg_file, isMasked(vreg_file, 0, idx, vmask) ? rsrc1 : rsrc0, idx);
+    DP(4, "Merge - Choosing result: " << +picked);
+    getVregData<DT>(vreg_file, rdest, idx) = picked;
+  }
+}
+
+// Width dispatcher for the VV merge operation.
+// Selects the element type that matches vsew (8/16/32/64 -> DT8/DT16/DT32/DT64) and forwards the
+// register indices, vl and vmask unchanged to the typed vector_op_vv_merge overload.
+// Any other vsew is reported on stdout and the process is terminated with std::abort().
+template <typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_merge(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv_merge<DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv_merge<DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv_merge<DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 64) {
+    vector_op_vv_merge<DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else {
+    // The message names the merge path so it cannot be confused with the plain VV dispatcher's failure.
+    std::cout << "Failed to execute VV merge for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Register gather: each unmasked rdest[i] = rsrc1[index], or 0 when index >= vlmax; index is rsrc0[i] read as uint16_t (ei16) or DT.
+// The index is kept in 64 bits: a 32-bit temporary truncates DT=64-bit indices >= 2^32 and lets them pass the vlmax check.
+template <typename DT>
+void vector_op_vv_gather(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, bool ei16, uint32_t vlmax, uint32_t vmask) {
+  for (uint32_t i = 0; i < vl; i++) {
+    if (isMasked(vreg_file, 0, i, vmask)) continue;
+    uint64_t first = ei16 ? getVregData<uint16_t>(vreg_file, rsrc0, i) : getVregData<DT>(vreg_file, rsrc0, i);
+    DT value = first < vlmax ? getVregData<DT>(vreg_file, rsrc1, static_cast<uint32_t>(first)) : 0; // cast is safe: first < vlmax
+    DP(4, "Register gather - Moving value " << +value << " from position " << +first << " to position " << +i);
+    getVregData<DT>(vreg_file, rdest, i) = value;
+  }
+}
+
+// Width dispatcher for the VV register gather.
+// Selects the element type that matches vsew (8/16/32/64 -> DT8/DT16/DT32/DT64) and forwards the
+// register indices, vl, ei16, vlmax and vmask unchanged to the typed vector_op_vv_gather overload.
+// Any other vsew is reported on stdout and the process is terminated with std::abort().
+template <typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_gather(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, bool ei16, uint32_t vlmax, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv_gather<DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, ei16, vlmax, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv_gather<DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, ei16, vlmax, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv_gather<DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, ei16, vlmax, vmask);
+  } else if (vsew == 64) {
+    vector_op_vv_gather<DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, ei16, vlmax, vmask);
+  } else {
+    // Unsupported element width.
+    std::cout << "Failed to execute VV register gather for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Widening VV operation: DT sources and the current DTR destination go to OP<DT, DTR>::apply; the DTR result replaces rdest[idx].
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vv_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    if (!isMasked(vreg_file, 0, idx, vmask)) { // elements flagged by isMasked() are left untouched
+      DT lhs = getVregData<DT>(vreg_file, rsrc0, idx);
+      DT rhs = getVregData<DT>(vreg_file, rsrc1, idx);
+      DTR prev = getVregData<DTR>(vreg_file, rdest, idx);
+      DTR wide = OP<DT, DTR>::apply(lhs, rhs, prev);
+      DP(4, "Widening " << (OP<DT, DTR>::name()) << "(" << +lhs << ", " << +rhs << ", " << +prev << ")" << " = " << +wide);
+      getVregData<DTR>(vreg_file, rdest, idx) = wide;
+    }
+  }
+}
+
+// Width dispatcher for widening VV operations: vsew = 8/16/32 pairs the source type with the next
+// wider result type (DT8+DT16, DT16+DT32, DT32+DT64); DT64 is never used as a source width here.
+// Any other vsew is reported on stdout and the process is terminated with std::abort().
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv_w<OP, DT8, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv_w<OP, DT16, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv_w<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else {
+    // Unsupported element width.
+    std::cout << "Failed to execute VV widening for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Widening "wv" operation: rsrc0 is read as DT while rsrc1 and rdest are read as DTR; OP<DTR, DTR>::apply yields the new rdest[idx].
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vv_wv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    if (!isMasked(vreg_file, 0, idx, vmask)) { // elements flagged by isMasked() are left untouched
+      DT narrow = getVregData<DT>(vreg_file, rsrc0, idx);
+      DTR wide_src = getVregData<DTR>(vreg_file, rsrc1, idx);
+      DTR prev = getVregData<DTR>(vreg_file, rdest, idx);
+      DTR out = OP<DTR, DTR>::apply(narrow, wide_src, prev);
+      DP(4, "Widening wv " << (OP<DT, DTR>::name()) << "(" << +narrow << ", " << +wide_src << ", " << +prev << ")" << " = " << +out);
+      getVregData<DTR>(vreg_file, rdest, idx) = out;
+    }
+  }
+}
+
+// Width dispatcher for widening "wv" operations: vsew = 8/16/32 pairs the narrow type with the next
+// wider type (DT8+DT16, DT16+DT32, DT32+DT64); DT64 is never used as the narrow width here.
+// Any other vsew is reported on stdout and the process is terminated with std::abort().
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_wv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv_wv<OP, DT8, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv_wv<OP, DT16, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv_wv<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else {
+    // Unsupported element width.
+    std::cout << "Failed to execute VV widening wv for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Float widening "wfv" operation: rsrc0[idx] (DT) is converted with rv_ftod() before OP<DTR, DTR>::apply combines it with rsrc1/rdest.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vv_wfv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    if (!isMasked(vreg_file, 0, idx, vmask)) { // elements flagged by isMasked() are left untouched
+      DT narrow = getVregData<DT>(vreg_file, rsrc0, idx);
+      DTR wide_src = getVregData<DTR>(vreg_file, rsrc1, idx);
+      DTR prev = getVregData<DTR>(vreg_file, rdest, idx);
+      DTR out = OP<DTR, DTR>::apply(rv_ftod(narrow), wide_src, prev);
+      DP(4, "Widening wfv " << (OP<DT, DTR>::name()) << "(" << +narrow << ", " << +wide_src << ", " << +prev << ")" << " = " << +out);
+      getVregData<DTR>(vreg_file, rdest, idx) = out;
+    }
+  }
+}
+
+// Dispatcher for float widening "wfv": only vsew == 32 (DT32 -> DT64) is handled; anything else is reported and aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_wfv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew != 32) {
+    std::cout << "Failed to execute VV widening wfv for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+  vector_op_vv_wfv<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+}
+
+// Narrowing VV operation: rsrc0[idx] is read as DTR, rsrc1[idx] as DT; OP<DT, DTR>::apply also receives vxrm and may update vxsat.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vv_n(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    if (!isMasked(vreg_file, 0, idx, vmask)) { // elements flagged by isMasked() are left untouched
+      DTR lhs = getVregData<DTR>(vreg_file, rsrc0, idx);
+      DT rhs = getVregData<DT>(vreg_file, rsrc1, idx);
+      DTR out = OP<DT, DTR>::apply(lhs, rhs, vxrm, vxsat);
+      DP(4, "Narrowing " << (OP<DT, DTR>::name()) << "(" << +lhs << ", " << +rhs << ")" << " = " << +out);
+      getVregData<DTR>(vreg_file, rdest, idx) = out;
+    }
+  }
+}
+
+// Width dispatcher for narrowing VV operations: vsew = 8/16/32 instantiates the typed overload with
+// <wider type, vsew-sized type> (DT16+DT8, DT32+DT16, DT64+DT32) and forwards vxrm and vxsat.
+// Any other vsew is reported on stdout and the process is terminated with std::abort().
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_n(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  if (vsew == 8) {
+    vector_op_vv_n<OP, DT16, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 16) {
+    vector_op_vv_n<OP, DT32, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 32) {
+    vector_op_vv_n<OP, DT64, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else {
+    // Unsupported element width.
+    std::cout << "Failed to execute VV narrowing for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Saturating VV operation: both sources are read as DTR and held in DT locals before OP<DT, DTR>::apply(…, vxrm, vxsat) runs.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vv_sat(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    if (!isMasked(vreg_file, 0, idx, vmask)) { // elements flagged by isMasked() are left untouched
+      DT lhs = getVregData<DTR>(vreg_file, rsrc0, idx);
+      DT rhs = getVregData<DTR>(vreg_file, rsrc1, idx);
+      DTR out = OP<DT, DTR>::apply(lhs, rhs, vxrm, vxsat);
+      DP(4, "Saturating " << (OP<DT, DTR>::name()) << "(" << +(DTR)lhs << ", " << +(DTR)rhs << ")" << " = " << +(DTR)out);
+      getVregData<DTR>(vreg_file, rdest, idx) = out;
+    }
+  }
+}
+
+// Width dispatcher for saturating VV operations.
+// For vsew = 8/16/32/64 it instantiates vector_op_vv_sat with <next wider type, vsew-sized type>
+// (DT16+DT8, DT32+DT16, DT64+DT32, DT128+DT64) and forwards vxrm and vxsat unchanged.
+// Any other vsew is reported on stdout and the process is terminated with std::abort().
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64, typename DT128>
+void vector_op_vv_sat(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  if (vsew == 8) {
+    vector_op_vv_sat<OP, DT16, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 16) {
+    vector_op_vv_sat<OP, DT32, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 32) {
+    vector_op_vv_sat<OP, DT64, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 64) {
+    vector_op_vv_sat<OP, DT128, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else {
+    // Unsupported element width.
+    std::cout << "Failed to execute VV saturating for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Width dispatcher for scaling VV operations.
+// Reuses the vector_op_vv_sat element loop, but with the same type for both of its type parameters
+// (DT8+DT8, DT16+DT16, DT32+DT32, DT64+DT64) for vsew = 8/16/32/64; vxrm and vxsat are forwarded.
+// Any other vsew is reported on stdout and the process is terminated with std::abort().
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_scale(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  if (vsew == 8) {
+    vector_op_vv_sat<OP, DT8, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 16) {
+    vector_op_vv_sat<OP, DT16, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 32) {
+    vector_op_vv_sat<OP, DT32, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 64) {
+    vector_op_vv_sat<OP, DT64, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else {
+    // Unsupported element width.
+    std::cout << "Failed to execute VV scale for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Reduction: rdest[0] = OP folded over rsrc0[0] and every unmasked element of rsrc1[0..vl); nothing is written when vl == 0.
+template <template <typename DT1, typename DT2> class OP, typename DT>
+void vector_op_vv_red(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  if (vl == 0)
+    return;
+  // Accumulate in a local and store once: seeding rdest[0] up front would clobber rsrc1[0] (or the register-0 mask bits) whenever rdest aliases them.
+  DT acc = getVregData<DT>(vreg_file, rsrc0, 0);
+  for (uint32_t i = 0; i < vl; i++) {
+    if (isMasked(vreg_file, 0, i, vmask))
+      continue;
+    DT second = getVregData<DT>(vreg_file, rsrc1, i);
+    DT result = OP<DT, DT>::apply(acc, second, 0);
+    DP(4, "Reduction " << (OP<DT, DT>::name()) << "(" << +acc << ", " << +second << ")" << " = " << +result);
+    acc = result;
+  }
+  getVregData<DT>(vreg_file, rdest, 0) = acc;
+}
+
+// Width dispatcher for VV reductions.
+// Selects the element type that matches vsew (8/16/32/64 -> DT8/DT16/DT32/DT64) and forwards the
+// register indices, vl and vmask unchanged to the typed vector_op_vv_red overload.
+// Any other vsew is reported on stdout and the process is terminated with std::abort().
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_red(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv_red<OP, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv_red<OP, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv_red<OP, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 64) {
+    vector_op_vv_red<OP, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else {
+    // Unsupported element width.
+    std::cout << "Failed to execute VV reduction for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Widening reduction: rdest[0] (DTR) = OP over rsrc0[0] (DTR) and each unmasked rsrc1[i] (DT) sign-/zero-extended to DTR; no write when vl == 0.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vv_red_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  if (vl == 0)
+    return;
+  // Accumulate in a local and store once: seeding rdest[0] up front would clobber the low rsrc1 elements (or the register-0 mask bits) when rdest aliases them.
+  DTR acc = getVregData<DTR>(vreg_file, rsrc0, 0);
+  for (uint32_t i = 0; i < vl; i++) {
+    if (isMasked(vreg_file, 0, i, vmask))
+      continue;
+    DT second = getVregData<DT>(vreg_file, rsrc1, i);
+    DTR second_w = std::is_signed<DT>() ? sext((DTR)second, sizeof(DT) * 8) : zext((DTR)second, sizeof(DT) * 8);
+    DTR result = OP<DTR, DTR>::apply(acc, second_w, 0);
+    DP(4, "Widening reduction " << (OP<DTR, DTR>::name()) << "(" << +acc << ", " << +second_w << ")" << " = " << +result);
+    acc = result;
+  }
+  getVregData<DTR>(vreg_file, rdest, 0) = acc;
+}
+
+// Width dispatcher for widening VV reductions: vsew = 8/16/32 pairs the source type with the next
+// wider accumulator type (DT8+DT16, DT16+DT32, DT32+DT64); DT64 is never used as a source width here.
+// Any other vsew is reported on stdout and the process is terminated with std::abort().
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_red_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv_red_w<OP, DT8, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv_red_w<OP, DT16, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv_red_w<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else {
+    // Unsupported element width.
+    std::cout << "Failed to execute VV widening reduction for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Float widening reduction: rdest[0] (DTR) = OP over rsrc0[0] (DTR) and each unmasked rsrc1[i] (DT) converted with rv_ftod(); no write when vl == 0.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vv_red_wf(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  if (vl == 0)
+    return;
+  // Accumulate in a local and store once: seeding rdest[0] up front would clobber the low rsrc1 elements (or the register-0 mask bits) when rdest aliases them.
+  DTR acc = getVregData<DTR>(vreg_file, rsrc0, 0);
+  for (uint32_t i = 0; i < vl; i++) {
+    if (isMasked(vreg_file, 0, i, vmask))
+      continue;
+    DT second = getVregData<DT>(vreg_file, rsrc1, i);
+    DTR second_w = rv_ftod(second);
+    DTR result = OP<DTR, DTR>::apply(acc, second_w, 0);
+    DP(4, "Float widening reduction " << (OP<DTR, DTR>::name()) << "(" << +acc << ", " << +second_w << ")" << " = " << +result);
+    acc = result;
+  }
+  getVregData<DTR>(vreg_file, rdest, 0) = acc;
+}
+
+// Dispatcher for float widening reductions: only vsew == 32 (DT32 -> DT64) is handled; anything else is reported and aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_red_wf(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew != 32) {
+    std::cout << "Failed to execute VV float widening reduction for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+  vector_op_vv_red_wf<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+}
+
+// Element index write: every element idx < vl that isMasked() does not flag receives its own index, converted to DT.
+template <typename DT>
+void vector_op_vid(std::vector<std::vector<Byte>> &vreg_file, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    if (!isMasked(vreg_file, 0, idx, vmask)) {
+      DP(4, "Element Index = " << +idx);
+      getVregData<DT>(vreg_file, rdest, idx) = idx;
+    }
+  }
+}
+
+// Width dispatcher for the element-index write: forwards to vector_op_vid<uintN_t> for vsew = 8/16/32/64.
+// Any other vsew is reported on stdout and the process is terminated with std::abort().
+// NOTE(review): this non-template definition appears to live in a header (vpu.h per the patch layout), so it
+// is declared inline to avoid duplicate-symbol link errors if that header is included by several translation units.
+inline void vector_op_vid(std::vector<std::vector<Byte>> &vreg_file, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vid<uint8_t>(vreg_file, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vid<uint16_t>(vreg_file, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vid<uint32_t>(vreg_file, rdest, vl, vmask);
+  } else if (vsew == 64) {
+    vector_op_vid<uint64_t>(vreg_file, rdest, vl, vmask);
+  } else {
+    // Unsupported element width.
+    std::cout << "Failed to execute vector element index for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Compare-to-mask: for every element idx < vl that isMasked() does not flag, evaluates
+// OP<DT, bool>::apply(rsrc0[idx], rsrc1[idx], 0) and writes the boolean into bit idx of rdest
+// (bits are packed 8 per byte, so bit idx lives in byte idx / 8 at position idx % 8).
+template <template <typename DT1, typename DT2> class OP, typename DT>
+void vector_op_vv_mask(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    if (!isMasked(vreg_file, 0, idx, vmask)) {
+      DT lhs = getVregData<DT>(vreg_file, rsrc0, idx);
+      DT rhs = getVregData<DT>(vreg_file, rsrc1, idx);
+      bool hit = OP<DT, bool>::apply(lhs, rhs, 0);
+      DP(4, "Integer/float compare mask " << (OP<DT, bool>::name()) << "(" << +lhs << ", " << +rhs << ")" << " = " << +hit);
+      uint8_t &mask_byte = getVregData<uint8_t>(vreg_file, rdest, idx / 8);
+      const int bit = 1 << (idx % 8);
+      mask_byte = hit ? (mask_byte | bit) : (mask_byte & ~bit);
+    }
+  }
+}
+
+// Width dispatcher for VV compare-to-mask operations.
+// Selects the element type that matches vsew (8/16/32/64 -> DT8/DT16/DT32/DT64) and forwards the
+// register indices, vl and vmask unchanged to the typed vector_op_vv_mask overload.
+// Any other vsew is reported on stdout and the process is terminated with std::abort().
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_mask(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv_mask<OP, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv_mask<OP, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv_mask<OP, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 64) {
+    vector_op_vv_mask<OP, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else {
+    // Unsupported element width.
+    std::cout << "Failed to execute VV integer/float compare mask for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Mask-bit logic: combines bit idx of rsrc0 with bit idx of rsrc1 through OP<uint8_t, uint8_t>::apply,
+// keeps only the lowest bit of the answer and writes it to bit idx of rdest, for every idx < vl.
+// Mask bits are packed 8 per byte; no element is skipped because this overload takes no vmask.
+template <template <typename DT1, typename DT2> class OP>
+void vector_op_vv_mask(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl) {
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    const uint32_t byte_idx = idx / 8;
+    const uint32_t shift = idx % 8;
+    bool lhs = (getVregData<uint8_t>(vreg_file, rsrc0, byte_idx) >> shift) & 0x1;
+    bool rhs = (getVregData<uint8_t>(vreg_file, rsrc1, byte_idx) >> shift) & 0x1;
+    bool bit_out = OP<uint8_t, uint8_t>::apply(lhs, rhs, 0) & 0x1;
+    DP(4, "Compare mask bits " << (OP<uint8_t, uint8_t>::name()) << "(" << +lhs << ", " << +rhs << ")" << " = " << +bit_out);
+    uint8_t &dst_byte = getVregData<uint8_t>(vreg_file, rdest, byte_idx);
+    dst_byte = bit_out ? (dst_byte | (1 << shift)) : (dst_byte & ~(1 << shift));
+  }
+}
+
+// Compress: packs the rsrc1 elements selected by the rsrc0 mask into consecutive rdest slots starting at 0.
+// The mask comes from rsrc0 rather than the default v0, and isMasked() is always called with vmask == 0
+// because the instruction is always masked even though it is encoded as unmasked (vmask == 1).
+template <typename DT>
+void vector_op_vv_compress(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl) {
+  int out_pos = 0;
+  for (uint32_t idx = 0; idx < vl; ++idx) {
+    if (!isMasked(vreg_file, rsrc0, idx, 0)) {
+      DT value = getVregData<DT>(vreg_file, rsrc1, idx);
+      DP(4, "Compression - Moving value " << +value << " from position " << idx << " to position " << out_pos);
+      getVregData<DT>(vreg_file, rdest, out_pos) = value;
+      ++out_pos;
+    }
+  }
+}
+
+// Width dispatcher for the VV compress operation.
+// Selects the element type that matches vsew (8/16/32/64 -> DT8/DT16/DT32/DT64) and forwards the
+// register indices and vl unchanged to the typed vector_op_vv_compress overload.
+// Any other vsew is reported on stdout and the process is terminated with std::abort().
+template <typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_compress(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl) {
+  if (vsew == 8) {
+    vector_op_vv_compress<DT8>(vreg_file, rsrc0, rsrc1, rdest, vl);
+  } else if (vsew == 16) {
+    vector_op_vv_compress<DT16>(vreg_file, rsrc0, rsrc1, rdest, vl);
+  } else if (vsew == 32) {
+    vector_op_vv_compress<DT32>(vreg_file, rsrc0, rsrc1, rdest, vl);
+  } else if (vsew == 64) {
+    vector_op_vv_compress<DT64>(vreg_file, rsrc0, rsrc1, rdest, vl);
+  } else {
+    // Unsupported element width.
+    std::cout << "Failed to execute VV compression for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
diff --git a/tests/riscv/riscv-vector-tests/README b/tests/riscv/riscv-vector-tests/README
index bf75d2675..78af65edf 100644
--- a/tests/riscv/riscv-vector-tests/README
+++ b/tests/riscv/riscv-vector-tests/README
@@ -11,7 +11,7 @@ XLEN=64 ./run-test.sh
 
 ## Adding a new testcase
 
-The source code for the vector extension can be found in `sim/simx/execute_vector.cpp`.
+The source code for the vector extension can be found in `sim/simx/vpu.cpp`.
 If you add support for a new vector instruction please go to `run-test.sh` and it to the default testcases.
 This will ensure your instruction is included in the regression test suite.
 
diff --git a/tests/riscv/riscv-vector-tests/run-test.sh.in b/tests/riscv/riscv-vector-tests/run-test.sh.in
index 68b4b6563..11ebcf313 100755
--- a/tests/riscv/riscv-vector-tests/run-test.sh.in
+++ b/tests/riscv/riscv-vector-tests/run-test.sh.in
@@ -1,7 +1,4 @@
 #!/bin/bash
-VLEN=${VLEN:-256}
-XLEN=${XLEN:-32}
-
 RISCV_TOOLCHAIN_PATH=${RISCV_TOOLCHAIN_PATH:-$TOOLDIR"/riscv"$XLEN"-gnu-toolchain"}
 
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

From cb491ddb53eb9cda6f9900eabf1faa3f6d468aaf Mon Sep 17 00:00:00 2001
From: MichaelJSr <miky.srouji@gmail.com>
Date: Mon, 13 Jan 2025 18:01:06 -0800
Subject: [PATCH 2/4] test

Revert "test"

This reverts commit 393e347c2faba260f1469667596e22dc2aa16553.

From a2cfeffcfe10c3bfec074b0bdbcdef46775b4042 Mon Sep 17 00:00:00 2001
From: MichaelJSr <miky.srouji@gmail.com>
Date: Mon, 13 Jan 2025 17:46:23 -0800
Subject: [PATCH 3/4] Added ifndef statements for the vector extension anywhere
 they didn't exist already

Added ifndef statements for the vector extension anywhere they didn't exist already

more ifdef statements

more ifdef

Update decode.cpp

Update decode.cpp

Update decode.cpp
---
 sim/simx/Makefile      |  2 +-
 sim/simx/decode.cpp    | 32 +++++++++++++++++++++++++++++++-
 sim/simx/emulator.cpp  | 14 ++++++++++++++
 sim/simx/emulator.h    |  8 +++++++-
 sim/simx/execute.cpp   | 18 ++++++++++++------
 sim/simx/instr.h       | 25 ++++++++++++++++++++-----
 sim/simx/main.cpp      |  2 ++
 sim/simx/processor.cpp |  2 ++
 sim/simx/types.h       |  8 ++++++--
 sim/simx/vpu.cpp       |  2 ++
 sim/simx/vpu.h         |  2 ++
 11 files changed, 99 insertions(+), 16 deletions(-)

diff --git a/sim/simx/Makefile b/sim/simx/Makefile
index 4b0fa410f..83054edc4 100644
--- a/sim/simx/Makefile
+++ b/sim/simx/Makefile
@@ -17,7 +17,7 @@ CXXFLAGS += $(CONFIGS)
 LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
 LDFLAGS += -Wl,-rpath,$(THIRD_PARTY_DIR)/ramulator -L$(THIRD_PARTY_DIR)/ramulator -lramulator
 
-SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
+SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
 SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(SRC_DIR)/core.cpp $(SRC_DIR)/emulator.cpp $(SRC_DIR)/decode.cpp $(SRC_DIR)/execute.cpp $(SRC_DIR)/func_unit.cpp $(SRC_DIR)/cache_sim.cpp $(SRC_DIR)/mem_sim.cpp $(SRC_DIR)/local_mem.cpp $(SRC_DIR)/mem_coalescer.cpp $(SRC_DIR)/dcrs.cpp $(SRC_DIR)/types.cpp
 
 # Add V extension sources
diff --git a/sim/simx/decode.cpp b/sim/simx/decode.cpp
index a4c0bb2ad..55b83daf3 100644
--- a/sim/simx/decode.cpp
+++ b/sim/simx/decode.cpp
@@ -47,7 +47,9 @@ static const std::unordered_map<Opcode, InstType> sc_instTable = {
   {Opcode::FMSUB,   InstType::R4},
   {Opcode::FMNMADD, InstType::R4},
   {Opcode::FMNMSUB, InstType::R4},
+#ifdef EXT_V_ENABLE
   {Opcode::VSET,    InstType::V},
+#endif
   {Opcode::EXT1,    InstType::R},
   {Opcode::EXT2,    InstType::R4},
   {Opcode::R_W,     InstType::R},
@@ -373,7 +375,9 @@ static const char* op_string(const Instr &instr) {
   case Opcode::FMSUB:   return func2 ? "FMSUB.D" : "FMSUB.S";
   case Opcode::FMNMADD: return func2 ? "FNMADD.D" : "FNMADD.S";
   case Opcode::FMNMSUB: return func2 ? "FNMSUB.D" : "FNMSUB.S";
+#ifdef EXT_V_ENABLE
   case Opcode::VSET:    return "VSET";
+#endif
   case Opcode::EXT1:
     switch (func7) {
     case 0:
@@ -405,6 +409,7 @@ static const char* op_string(const Instr &instr) {
   }
 }
 
+#ifdef EXT_V_ENABLE
 inline void print_vec_attr(std::ostream &os, const Instr &instr) {
   uint32_t mask = instr.getVattrMask();
   if (mask & vattr_vlswidth)
@@ -432,6 +437,7 @@ inline void print_vec_attr(std::ostream &os, const Instr &instr) {
   if (mask & vattr_vediv)
     os << ", ediv:" << instr.getVediv();
 }
+#endif
 
 namespace vortex {
 std::ostream &operator<<(std::ostream &os, const Instr &instr) {
@@ -453,6 +459,7 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) {
     if (sep++ != 0) { os << ", "; } else { os << " "; }
     os << "0x" << std::hex << instr.getImm() << std::dec;
   }
+#ifdef EXT_V_ENABLE
   if (instr.getOpcode() == Opcode::SYS && instr.getFunc3() >= 5) {
     // CSRs with immediate values
     if (sep++ != 0) { os << ", "; } else { os << " "; }
@@ -462,6 +469,7 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) {
   if (instr.getVattrMask() != 0) {
     print_vec_attr(os, instr);
   }
+#endif
   return os;
 }
 }
@@ -473,9 +481,11 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
 
   auto func2 = (code >> shift_func2) & mask_func2;
   auto func3 = (code >> shift_func3) & mask_func3;
-  auto func6 = (code >> shift_func6) & mask_func6;
   auto func7 = (code >> shift_func7) & mask_func7;
+#ifdef EXT_V_ENABLE
+  auto func6 = (code >> shift_func6) & mask_func6;
   __unused(func6);
+#endif
 
   auto rd  = (code >> shift_rd)  & mask_reg;
   auto rs1 = (code >> shift_rs1) & mask_reg;
@@ -489,11 +499,13 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
   }
 
   auto iType = op_it->second;
+#ifdef EXT_V_ENABLE
   if (op == Opcode::FL || op == Opcode::FS) {
     if (func3 != 0x2 && func3 != 0x3) {
       iType = InstType::V;
     }
   }
+#endif
 
   switch (iType) {
   case InstType::R:
@@ -582,7 +594,9 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
       instr->addSrcReg(rs2, RegType::Integer);
       break;
     }
+  #ifdef EXT_V_ENABLE
     instr->setFunc3(func3);
+  #endif
     instr->setFunc7(func7);
     break;
 
@@ -591,7 +605,9 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
     case Opcode::TCU: {
       instr->setDestReg(rs1, RegType::Integer);
       instr->addSrcReg(rs1, RegType::Integer);
+    #ifdef EXT_V_ENABLE
       instr->setFunc3(func3);
+    #endif
       instr->setFunc7(func7);
       auto imm = code >> shift_rs2;
       instr->setImm(sext(imm, width_i_imm));
@@ -601,7 +617,9 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
     case Opcode::JALR:
       instr->setDestReg(rd, RegType::Integer);
       instr->addSrcReg(rs1, RegType::Integer);
+    #ifdef EXT_V_ENABLE
       instr->setFunc3(func3);
+    #endif
       if (func3 == 0x1 || func3 == 0x5) {
         // Shift instructions
         auto shamt = rs2; // uint5
@@ -622,19 +640,25 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
     case Opcode::FL: {
       instr->setDestReg(rd, (op == Opcode::FL) ? RegType::Float : RegType::Integer);
       instr->addSrcReg(rs1, RegType::Integer);
+    #ifdef EXT_V_ENABLE
       instr->setFunc3(func3);
+    #endif
       auto imm = code >> shift_rs2;
       instr->setImm(sext(imm, width_i_imm));
     } break;
     case Opcode::FENCE:
+    #ifdef EXT_V_ENABLE
       instr->setFunc3(func3);
+    #endif
       instr->setImm(code >> shift_rs2);
       break;
     case Opcode::SYS:
       if (func3 != 0) {
         // CSR instructions
         instr->setDestReg(rd, RegType::Integer);
+      #ifdef EXT_V_ENABLE
         instr->setFunc3(func3);
+      #endif
         if (func3 < 5) {
           instr->addSrcReg(rs1, RegType::Integer);
         } else {
@@ -655,7 +679,9 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
   case InstType::S: {
     instr->addSrcReg(rs1, RegType::Integer);
     instr->addSrcReg(rs2, (op == Opcode::FS) ? RegType::Float : RegType::Integer);
+  #ifdef EXT_V_ENABLE
     instr->setFunc3(func3);
+  #endif
     auto imm = (func7 << width_reg) | rd;
     instr->setImm(sext(imm, width_i_imm));
   } break;
@@ -663,7 +689,9 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
   case InstType::B: {
     instr->addSrcReg(rs1, RegType::Integer);
     instr->addSrcReg(rs2, RegType::Integer);
+  #ifdef EXT_V_ENABLE
     instr->setFunc3(func3);
+  #endif
     auto bit_11   = rd & 0x1;
     auto bits_4_1 = rd >> 1;
     auto bit_10_5 = func7 & 0x3f;
@@ -695,7 +723,9 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
     instr->addSrcReg(rs2, RegType::Float);
     instr->addSrcReg(rs3, RegType::Float);
     instr->setFunc2(func2);
+  #ifdef EXT_V_ENABLE
     instr->setFunc3(func3);
+  #endif
   } break;
 
 #ifdef EXT_V_ENABLE
diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp
index 4bb94915e..7abec98c5 100644
--- a/sim/simx/emulator.cpp
+++ b/sim/simx/emulator.cpp
@@ -33,7 +33,9 @@ using namespace vortex;
 Emulator::warp_t::warp_t(const Arch& arch)
   : ireg_file(arch.num_threads(), std::vector<Word>(MAX_NUM_REGS))
   , freg_file(arch.num_threads(), std::vector<uint64_t>(MAX_NUM_REGS))
+#ifdef EXT_V_ENABLE
   , vreg_file(MAX_NUM_REGS, std::vector<Byte>(MAX_NUM_REGS))
+#endif
   , uuid(0)
 {}
 
@@ -43,9 +45,11 @@ void Emulator::warp_t::clear(uint64_t startup_addr) {
   this->uuid = 0;
   this->fcsr = 0;
 
+#ifdef EXT_V_ENABLE
   this->vtype = {0, 0, 0, 0, 0};
   this->vl = 0;
   this->vlmax = 0;
+#endif
 
   for (auto& reg_file : this->ireg_file) {
     for (auto& reg : reg_file) {
@@ -68,6 +72,7 @@ void Emulator::warp_t::clear(uint64_t startup_addr) {
     }
   }
 
+#ifdef EXT_V_ENABLE
   for (auto& reg_file : this->vreg_file) {
     for (auto& reg : reg_file) {
     #ifndef NDEBUG
@@ -77,6 +82,7 @@ void Emulator::warp_t::clear(uint64_t startup_addr) {
     #endif
     }
   }
+#endif
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -92,13 +98,17 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
     // considered to be big enough to hold input tiles for one output tile.
     // In future versions, scratchpad size should be fixed to an appropriate value.
     , scratchpad(std::vector<Word>(32 * 32 * 32768))
+  #ifdef EXT_V_ENABLE
     , csrs_(arch.num_warps())
+  #endif
 {
   std::srand(50);
 
+#ifdef EXT_V_ENABLE
   for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
     csrs_.at(i).resize(arch.num_threads());
   }
+#endif
 
   this->clear();
 }
@@ -480,6 +490,7 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
   case VX_CSR_FRM:        return (warps_.at(wid).fcsr >> 5);
   case VX_CSR_FCSR:       return warps_.at(wid).fcsr;
 
+#ifdef EXT_V_ENABLE
   // Vector CRSs
   case VX_CSR_VSTART:
     return csrs_.at(wid).at(tid)[VX_CSR_VSTART];
@@ -504,6 +515,7 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
     return csrs_.at(wid).at(tid)[VX_CSR_VTIME];
   case VX_CSR_VINSTRET:
     return csrs_.at(wid).at(tid)[VX_CSR_VINSTRET];
+#endif
 
   case VX_CSR_MHARTID:    return (core_->id() * arch_.num_warps() + wid) * arch_.num_threads() + tid;
   case VX_CSR_THREAD_ID:  return tid;
@@ -621,6 +633,7 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) {
     csr_mscratch_ = value;
     break;
 
+#ifdef EXT_V_ENABLE
   // Vector CRSs
   case VX_CSR_VSTART:
     csrs_.at(wid).at(tid)[VX_CSR_VSTART] = value;
@@ -642,6 +655,7 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) {
     csrs_.at(wid).at(tid)[VX_CSR_VTYPE] = value;
     break;
   case VX_CSR_VLENB: // read only, set to VLEN / 8
+#endif
 
   case VX_CSR_SATP:
   #ifdef VM_ENABLE
diff --git a/sim/simx/emulator.h b/sim/simx/emulator.h
index 144ff2a93..d5f6669dc 100644
--- a/sim/simx/emulator.h
+++ b/sim/simx/emulator.h
@@ -81,6 +81,7 @@ class Emulator {
     bool        fallthrough;
   };
 
+#ifdef EXT_V_ENABLE
   struct vtype_t {
     uint32_t vill;
     uint32_t vma;
@@ -88,6 +89,7 @@ class Emulator {
     uint32_t vsew;
     uint32_t vlmul;
   };
+#endif
 
   union reg_data_t {
     Word     u;
@@ -109,12 +111,14 @@ class Emulator {
     ThreadMask                        tmask;
     std::vector<std::vector<Word>>    ireg_file;
     std::vector<std::vector<uint64_t>>freg_file;
-    std::vector<std::vector<Byte>>    vreg_file;
     std::stack<ipdom_entry_t>         ipdom_stack;
     Byte                              fcsr;
+  #ifdef EXT_V_ENABLE
+    std::vector<std::vector<Byte>>    vreg_file;
     vtype_t                           vtype;
     uint32_t                          vl;
     Word                              vlmax;
+  #endif
     uint32_t                          uuid;
   };
 
@@ -173,7 +177,9 @@ class Emulator {
   uint32_t mat_size;
   uint32_t tc_size;
   uint32_t tc_num;
+#ifdef EXT_V_ENABLE
   std::vector<std::vector<std::unordered_map<uint32_t, uint32_t>>> csrs_;
+#endif
 };
 
 }
diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp
index 86623a00c..aae018fc5 100644
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
@@ -25,7 +25,9 @@
 #include "emulator.h"
 #include "instr.h"
 #include "core.h"
+#ifdef EXT_V_ENABLE
 #include "processor_impl.h"
+#endif
 #include "VX_types.h"
 
 using namespace vortex;
@@ -117,8 +119,10 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
         }
         DPN(2, "}" << std::endl);
         break;
+    #ifdef EXT_V_ENABLE
       case RegType::Vector:
         break;
+    #endif
       default:
         break;
       }
@@ -707,11 +711,12 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
         }
       }
       rd_write = true;
-    } else {
-    #ifdef EXT_V_ENABLE
+    }
+  #ifdef EXT_V_ENABLE
+    else {
       this->loadVector(instr, wid, rsdata);
-    #endif
     }
+  #endif
     break;
   }
   case Opcode::S:
@@ -744,11 +749,12 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
           std::abort();
         }
       }
-    } else {
-    #ifdef EXT_V_ENABLE
+    }
+  #ifdef EXT_V_ENABLE
+    else {
       this->storeVector(instr, wid, rsdata);
-    #endif
     }
+  #endif
     break;
   }
   case Opcode::AMO: {
diff --git a/sim/simx/instr.h b/sim/simx/instr.h
index 88b9f5cd3..bbd853b6b 100644
--- a/sim/simx/instr.h
+++ b/sim/simx/instr.h
@@ -42,8 +42,10 @@ enum class Opcode {
   // RV64 Standard Extension
   R_W       = 0x3b,
   I_W       = 0x1b,
+#ifdef EXT_V_ENABLE
   // Vector Extension
   VSET      = 0x57,
+#endif
   // Custom Extensions
   EXT1      = 0x0b,
   EXT2      = 0x2b,
@@ -58,7 +60,9 @@ enum class InstType {
   B,
   U,
   J,
+#ifdef EXT_V_ENABLE
   V,
+#endif
   R4
 };
 
@@ -138,6 +142,7 @@ class Instr {
     , rdest_(0)
     , func2_(0)
     , func3_(0)
+  #ifdef EXT_V_ENABLE
     , func6_(0)
     , func7_(0)
     , vmask_(0)
@@ -152,8 +157,9 @@ class Instr {
     , vta_(0)
     , vma_(0)
     , vediv_(0)
-    , vattr_mask_(0) {
-    for (uint32_t i = 0; i < MAX_REG_SOURCES; ++i) {
+    , vattr_mask_(0)
+    #endif
+    { for (uint32_t i = 0; i < MAX_REG_SOURCES; ++i) {
        rsrc_type_[i] = RegType::None;
        rsrc_[i] = 0;
     }
@@ -183,9 +189,11 @@ class Instr {
   void setImm(uint32_t imm) { has_imm_ = true; imm_ = imm; }
 
   void setFunc2(uint32_t func2) { func2_ = func2; }
+  void setFunc7(uint32_t func7) { func7_ = func7; }
+
+#ifdef EXT_V_ENABLE
   void setFunc3(uint32_t func3) { func3_ = func3; }
   void setFunc6(uint32_t func6) { func6_ = func6; }
-  void setFunc7(uint32_t func7) { func7_ = func7; }
 
   // Attributes for Vector instructions
   void setVlsWidth(uint32_t width) { vlsWidth_ = width; vattr_mask_ |= vattr_vlswidth; }
@@ -200,6 +208,7 @@ class Instr {
   void setVta(uint32_t vta) { vta_ = vta; vattr_mask_ |= vattr_vta; }
   void setVma(uint32_t vma) { vma_ = vma; vattr_mask_ |= vattr_vma; }
   void setVediv(uint32_t ediv) { vediv_ = 1 << ediv; vattr_mask_ |= vattr_vediv; }
+#endif
 
   Opcode   getOpcode() const { return opcode_; }
 
@@ -215,9 +224,11 @@ class Instr {
 
   uint32_t getFunc2() const { return func2_; }
   uint32_t getFunc3() const { return func3_; }
-  uint32_t getFunc6() const { return func6_; }
   uint32_t getFunc7() const { return func7_; }
 
+#ifdef EXT_V_ENABLE
+  uint32_t getFunc6() const { return func6_; }
+
   uint32_t getVlsWidth() const { return vlsWidth_; }
   uint32_t getVmop() const { return vMop_; }
   uint32_t getVumop() const { return vUmop_; }
@@ -231,6 +242,7 @@ class Instr {
   uint32_t getVma() const { return vma_; }
   uint32_t getVediv() const { return vediv_; }
   uint32_t getVattrMask() const { return vattr_mask_; }
+#endif
 
 private:
 
@@ -248,9 +260,11 @@ class Instr {
   uint32_t rdest_;
   uint32_t func2_;
   uint32_t func3_;
-  uint32_t func6_;
   uint32_t func7_;
 
+#ifdef EXT_V_ENABLE
+  uint32_t func6_;
+
   // Vector
   uint32_t vmask_;
   uint32_t vlsWidth_;
@@ -265,6 +279,7 @@ class Instr {
   uint32_t vma_;
   uint32_t vediv_;
   uint32_t vattr_mask_;
+#endif
 
   friend std::ostream &operator<<(std::ostream &, const Instr&);
 };
diff --git a/sim/simx/main.cpp b/sim/simx/main.cpp
index 3df8b0e1a..d6ed15a25 100644
--- a/sim/simx/main.cpp
+++ b/sim/simx/main.cpp
@@ -120,7 +120,9 @@ int main(int argc, char **argv) {
 #endif
     // run simulation
     // vector test exitcode is a special case
+  #ifdef EXT_V_ENABLE
     if (vector_test) return processor.run();
+  #endif
     // else continue as normal
     processor.run();
 
diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp
index 96fc49df9..a11351d03 100644
--- a/sim/simx/processor.cpp
+++ b/sim/simx/processor.cpp
@@ -127,7 +127,9 @@ int ProcessorImpl::run() {
         done = false;
         continue;
       }
+    #ifdef EXT_V_ENABLE
       exitcode |= cluster->get_exitcode();
+    #endif
     }
     perf_mem_latency_ += perf_mem_pending_reads_;
   } while (!done);
diff --git a/sim/simx/types.h b/sim/simx/types.h
index 00b895968..7b16dcfde 100644
--- a/sim/simx/types.h
+++ b/sim/simx/types.h
@@ -84,8 +84,10 @@ enum class RegType {
   None,
   Integer,
   Float,
-  Count,
-  Vector
+#ifdef EXT_V_ENABLE
+  Vector,
+#endif
+  Count
 };
 
 inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
@@ -93,7 +95,9 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
   case RegType::None: break;
   case RegType::Integer: os << "x"; break;
   case RegType::Float:   os << "f"; break;
+#ifdef EXT_V_ENABLE
   case RegType::Vector:  os << "v"; break;
+#endif
   default: assert(false);
   }
   return os;
diff --git a/sim/simx/vpu.cpp b/sim/simx/vpu.cpp
index 63ed8fcc2..3a70560ec 100644
--- a/sim/simx/vpu.cpp
+++ b/sim/simx/vpu.cpp
@@ -2,6 +2,7 @@
 // The purpose of this fork is to make simx-v2-vector up to date with master
 // Thanks to Troibe for his amazing work
 
+#ifdef EXT_V_ENABLE
 #include "emulator.h"
 #include "instr.h"
 #include "processor_impl.h"
@@ -2477,3 +2478,4 @@ void Emulator::executeVector(const Instr &instr, uint32_t wid, std::vector<reg_d
     std::abort();
   }
 }
+#endif
\ No newline at end of file
diff --git a/sim/simx/vpu.h b/sim/simx/vpu.h
index 3974d2552..9ea9ec389 100644
--- a/sim/simx/vpu.h
+++ b/sim/simx/vpu.h
@@ -1,3 +1,4 @@
+#ifdef EXT_V_ENABLE
 #pragma once
 
 using namespace vortex;
@@ -2389,3 +2390,4 @@ void vector_op_vv_compress(std::vector<std::vector<Byte>> &vreg_file, uint32_t r
     std::abort();
   }
 }
+#endif
\ No newline at end of file

From 6d27575db340e575284c46908ee888ca2961f55e Mon Sep 17 00:00:00 2001
From: MichaelJSr <miky.srouji@gmail.com>
Date: Tue, 14 Jan 2025 21:56:39 -0800
Subject: [PATCH 4/4] Revert some of "Added ifndef statements for the vector
 extension anywhere they didn't exist already"

---
 sim/simx/decode.cpp | 30 +-----------------------------
 sim/simx/emulator.h |  8 +-------
 sim/simx/instr.h    | 25 +++++--------------------
 sim/simx/types.h    |  8 ++------
 4 files changed, 9 insertions(+), 62 deletions(-)

diff --git a/sim/simx/decode.cpp b/sim/simx/decode.cpp
index 55b83daf3..b57893daa 100644
--- a/sim/simx/decode.cpp
+++ b/sim/simx/decode.cpp
@@ -47,9 +47,7 @@ static const std::unordered_map<Opcode, InstType> sc_instTable = {
   {Opcode::FMSUB,   InstType::R4},
   {Opcode::FMNMADD, InstType::R4},
   {Opcode::FMNMSUB, InstType::R4},
-#ifdef EXT_V_ENABLE
   {Opcode::VSET,    InstType::V},
-#endif
   {Opcode::EXT1,    InstType::R},
   {Opcode::EXT2,    InstType::R4},
   {Opcode::R_W,     InstType::R},
@@ -375,9 +373,7 @@ static const char* op_string(const Instr &instr) {
   case Opcode::FMSUB:   return func2 ? "FMSUB.D" : "FMSUB.S";
   case Opcode::FMNMADD: return func2 ? "FNMADD.D" : "FNMADD.S";
   case Opcode::FMNMSUB: return func2 ? "FNMSUB.D" : "FNMSUB.S";
-#ifdef EXT_V_ENABLE
   case Opcode::VSET:    return "VSET";
-#endif
   case Opcode::EXT1:
     switch (func7) {
     case 0:
@@ -409,7 +405,6 @@ static const char* op_string(const Instr &instr) {
   }
 }
 
-#ifdef EXT_V_ENABLE
 inline void print_vec_attr(std::ostream &os, const Instr &instr) {
   uint32_t mask = instr.getVattrMask();
   if (mask & vattr_vlswidth)
@@ -437,7 +432,6 @@ inline void print_vec_attr(std::ostream &os, const Instr &instr) {
   if (mask & vattr_vediv)
     os << ", ediv:" << instr.getVediv();
 }
-#endif
 
 namespace vortex {
 std::ostream &operator<<(std::ostream &os, const Instr &instr) {
@@ -481,11 +475,9 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
 
   auto func2 = (code >> shift_func2) & mask_func2;
   auto func3 = (code >> shift_func3) & mask_func3;
-  auto func7 = (code >> shift_func7) & mask_func7;
-#ifdef EXT_V_ENABLE
   auto func6 = (code >> shift_func6) & mask_func6;
+  auto func7 = (code >> shift_func7) & mask_func7;
   __unused(func6);
-#endif
 
   auto rd  = (code >> shift_rd)  & mask_reg;
   auto rs1 = (code >> shift_rs1) & mask_reg;
@@ -499,13 +491,11 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
   }
 
   auto iType = op_it->second;
-#ifdef EXT_V_ENABLE
   if (op == Opcode::FL || op == Opcode::FS) {
     if (func3 != 0x2 && func3 != 0x3) {
       iType = InstType::V;
     }
   }
-#endif
 
   switch (iType) {
   case InstType::R:
@@ -594,9 +584,7 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
       instr->addSrcReg(rs2, RegType::Integer);
       break;
     }
-  #ifdef EXT_V_ENABLE
     instr->setFunc3(func3);
-  #endif
     instr->setFunc7(func7);
     break;
 
@@ -605,9 +593,7 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
     case Opcode::TCU: {
       instr->setDestReg(rs1, RegType::Integer);
       instr->addSrcReg(rs1, RegType::Integer);
-    #ifdef EXT_V_ENABLE
       instr->setFunc3(func3);
-    #endif
       instr->setFunc7(func7);
       auto imm = code >> shift_rs2;
       instr->setImm(sext(imm, width_i_imm));
@@ -617,9 +603,7 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
     case Opcode::JALR:
       instr->setDestReg(rd, RegType::Integer);
       instr->addSrcReg(rs1, RegType::Integer);
-    #ifdef EXT_V_ENABLE
       instr->setFunc3(func3);
-    #endif
       if (func3 == 0x1 || func3 == 0x5) {
         // Shift instructions
         auto shamt = rs2; // uint5
@@ -640,25 +624,19 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
     case Opcode::FL: {
       instr->setDestReg(rd, (op == Opcode::FL) ? RegType::Float : RegType::Integer);
       instr->addSrcReg(rs1, RegType::Integer);
-    #ifdef EXT_V_ENABLE
       instr->setFunc3(func3);
-    #endif
       auto imm = code >> shift_rs2;
       instr->setImm(sext(imm, width_i_imm));
     } break;
     case Opcode::FENCE:
-    #ifdef EXT_V_ENABLE
       instr->setFunc3(func3);
-    #endif
       instr->setImm(code >> shift_rs2);
       break;
     case Opcode::SYS:
       if (func3 != 0) {
         // CSR instructions
         instr->setDestReg(rd, RegType::Integer);
-      #ifdef EXT_V_ENABLE
         instr->setFunc3(func3);
-      #endif
         if (func3 < 5) {
           instr->addSrcReg(rs1, RegType::Integer);
         } else {
@@ -679,9 +657,7 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
   case InstType::S: {
     instr->addSrcReg(rs1, RegType::Integer);
     instr->addSrcReg(rs2, (op == Opcode::FS) ? RegType::Float : RegType::Integer);
-  #ifdef EXT_V_ENABLE
     instr->setFunc3(func3);
-  #endif
     auto imm = (func7 << width_reg) | rd;
     instr->setImm(sext(imm, width_i_imm));
   } break;
@@ -689,9 +665,7 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
   case InstType::B: {
     instr->addSrcReg(rs1, RegType::Integer);
     instr->addSrcReg(rs2, RegType::Integer);
-  #ifdef EXT_V_ENABLE
     instr->setFunc3(func3);
-  #endif
     auto bit_11   = rd & 0x1;
     auto bits_4_1 = rd >> 1;
     auto bit_10_5 = func7 & 0x3f;
@@ -723,9 +697,7 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
     instr->addSrcReg(rs2, RegType::Float);
     instr->addSrcReg(rs3, RegType::Float);
     instr->setFunc2(func2);
-  #ifdef EXT_V_ENABLE
     instr->setFunc3(func3);
-  #endif
   } break;
 
 #ifdef EXT_V_ENABLE
diff --git a/sim/simx/emulator.h b/sim/simx/emulator.h
index d5f6669dc..144ff2a93 100644
--- a/sim/simx/emulator.h
+++ b/sim/simx/emulator.h
@@ -81,7 +81,6 @@ class Emulator {
     bool        fallthrough;
   };
 
-#ifdef EXT_V_ENABLE
   struct vtype_t {
     uint32_t vill;
     uint32_t vma;
@@ -89,7 +88,6 @@ class Emulator {
     uint32_t vsew;
     uint32_t vlmul;
   };
-#endif
 
   union reg_data_t {
     Word     u;
@@ -111,14 +109,12 @@ class Emulator {
     ThreadMask                        tmask;
     std::vector<std::vector<Word>>    ireg_file;
     std::vector<std::vector<uint64_t>>freg_file;
+    std::vector<std::vector<Byte>>    vreg_file;
     std::stack<ipdom_entry_t>         ipdom_stack;
     Byte                              fcsr;
-  #ifdef EXT_V_ENABLE
-    std::vector<std::vector<Byte>>    vreg_file;
     vtype_t                           vtype;
     uint32_t                          vl;
     Word                              vlmax;
-  #endif
     uint32_t                          uuid;
   };
 
@@ -177,9 +173,7 @@ class Emulator {
   uint32_t mat_size;
   uint32_t tc_size;
   uint32_t tc_num;
-#ifdef EXT_V_ENABLE
   std::vector<std::vector<std::unordered_map<uint32_t, uint32_t>>> csrs_;
-#endif
 };
 
 }
diff --git a/sim/simx/instr.h b/sim/simx/instr.h
index bbd853b6b..88b9f5cd3 100644
--- a/sim/simx/instr.h
+++ b/sim/simx/instr.h
@@ -42,10 +42,8 @@ enum class Opcode {
   // RV64 Standard Extension
   R_W       = 0x3b,
   I_W       = 0x1b,
-#ifdef EXT_V_ENABLE
   // Vector Extension
   VSET      = 0x57,
-#endif
   // Custom Extensions
   EXT1      = 0x0b,
   EXT2      = 0x2b,
@@ -60,9 +58,7 @@ enum class InstType {
   B,
   U,
   J,
-#ifdef EXT_V_ENABLE
   V,
-#endif
   R4
 };
 
@@ -142,7 +138,6 @@ class Instr {
     , rdest_(0)
     , func2_(0)
     , func3_(0)
-  #ifdef EXT_V_ENABLE
     , func6_(0)
     , func7_(0)
     , vmask_(0)
@@ -157,9 +152,8 @@ class Instr {
     , vta_(0)
     , vma_(0)
     , vediv_(0)
-    , vattr_mask_(0)
-    #endif
-    { for (uint32_t i = 0; i < MAX_REG_SOURCES; ++i) {
+    , vattr_mask_(0) {
+    for (uint32_t i = 0; i < MAX_REG_SOURCES; ++i) {
        rsrc_type_[i] = RegType::None;
        rsrc_[i] = 0;
     }
@@ -189,11 +183,9 @@ class Instr {
   void setImm(uint32_t imm) { has_imm_ = true; imm_ = imm; }
 
   void setFunc2(uint32_t func2) { func2_ = func2; }
-  void setFunc7(uint32_t func7) { func7_ = func7; }
-
-#ifdef EXT_V_ENABLE
   void setFunc3(uint32_t func3) { func3_ = func3; }
   void setFunc6(uint32_t func6) { func6_ = func6; }
+  void setFunc7(uint32_t func7) { func7_ = func7; }
 
   // Attributes for Vector instructions
   void setVlsWidth(uint32_t width) { vlsWidth_ = width; vattr_mask_ |= vattr_vlswidth; }
@@ -208,7 +200,6 @@ class Instr {
   void setVta(uint32_t vta) { vta_ = vta; vattr_mask_ |= vattr_vta; }
   void setVma(uint32_t vma) { vma_ = vma; vattr_mask_ |= vattr_vma; }
   void setVediv(uint32_t ediv) { vediv_ = 1 << ediv; vattr_mask_ |= vattr_vediv; }
-#endif
 
   Opcode   getOpcode() const { return opcode_; }
 
@@ -224,10 +215,8 @@ class Instr {
 
   uint32_t getFunc2() const { return func2_; }
   uint32_t getFunc3() const { return func3_; }
-  uint32_t getFunc7() const { return func7_; }
-
-#ifdef EXT_V_ENABLE
   uint32_t getFunc6() const { return func6_; }
+  uint32_t getFunc7() const { return func7_; }
 
   uint32_t getVlsWidth() const { return vlsWidth_; }
   uint32_t getVmop() const { return vMop_; }
@@ -242,7 +231,6 @@ class Instr {
   uint32_t getVma() const { return vma_; }
   uint32_t getVediv() const { return vediv_; }
   uint32_t getVattrMask() const { return vattr_mask_; }
-#endif
 
 private:
 
@@ -260,10 +248,8 @@ class Instr {
   uint32_t rdest_;
   uint32_t func2_;
   uint32_t func3_;
-  uint32_t func7_;
-
-#ifdef EXT_V_ENABLE
   uint32_t func6_;
+  uint32_t func7_;
 
   // Vector
   uint32_t vmask_;
@@ -279,7 +265,6 @@ class Instr {
   uint32_t vma_;
   uint32_t vediv_;
   uint32_t vattr_mask_;
-#endif
 
   friend std::ostream &operator<<(std::ostream &, const Instr&);
 };
diff --git a/sim/simx/types.h b/sim/simx/types.h
index 7b16dcfde..00b895968 100644
--- a/sim/simx/types.h
+++ b/sim/simx/types.h
@@ -84,10 +84,8 @@ enum class RegType {
   None,
   Integer,
   Float,
-#ifdef EXT_V_ENABLE
-  Vector,
-#endif
-  Count
+  Count,
+  Vector
 };
 
 inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
@@ -95,9 +93,7 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
   case RegType::None: break;
   case RegType::Integer: os << "x"; break;
   case RegType::Float:   os << "f"; break;
-#ifdef EXT_V_ENABLE
   case RegType::Vector:  os << "v"; break;
-#endif
   default: assert(false);
   }
   return os;