diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv new file mode 100644 index 0000000000..b49315637f --- /dev/null +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -0,0 +1,494 @@ +module mvu_4sx4u #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + + int unsigned VERSION = 1, + bit SIGNED_ACTIVATIONS = 0, + bit FORCE_BEHAVIORAL = 0 +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][3:0] w, // signed weights + input logic [SIMD-1:0][3:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS) + + // Ouput + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + // for verilator always use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + logic [1:5] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if(en) L <= { last, L[1:4] }; + end + assign vld = L[5]; + + // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism + localparam int unsigned D[4:0] = '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; // Lane offsets + + localparam int unsigned PIPE_COUNT = (PE+3)/4; + for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes + + localparam int unsigned PE_BEG = 4*c; + localparam int unsigned PE_END = PE < 4*(c+1)? PE : 4*(c+1); + localparam int unsigned PE_REM = 4*(c+1) - PE_END; + + uwire [57:0] p3[SIMD]; + uwire signed [ 1:0] h3[SIMD][3]; + for(genvar s = 0; s < SIMD; s++) begin : genSIMD + + // Input Lane Assembly + uwire [17:0] bb = { {(14){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] }; + logic [29:0] aa; + logic [26:0] dd; + logic [ 1:0] xx[3:1]; + if(1) begin : blkVectorize + uwire [3:0] ww[PE_END - PE_BEG]; + for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin + assign ww[pe] = w[PE_BEG + pe][s]; + if(pe) begin + if(BEHAVIORAL) assign xx[pe + PE_REM] = zero? 0 : ww[pe] * a[s]; +`ifndef VERILATOR + else begin + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[pe + PE_REM][1]), + .O5(xx[pe + PE_REM][0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end +`endif + end + end + always_comb begin + dd = '0; + aa = '0; + for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin + dd[D[pe + PE_REM]+:3] = ww[pe]; + aa[D[pe + PE_REM]+ 3] = ww[pe][3]; + end + end + end : blkVectorize + + uwire [47:0] pp; + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. + if(BEHAVIORAL) begin : genBehav + // Stage #1: Input Refine + logic signed [17:0] B1 = 0; + always_ff @(posedge clk) begin + if(zero) B1 <= 0; + else if(en) B1 <= bb; + end + + logic signed [26:0] AD1 = 0; + always_ff @(posedge clk) begin + if(rst) AD1 <= 0; + else if(en) AD1 <= dd - aa; + end + + // Stage #2: Multiply + logic signed [45:0] M2 = 0; + always_ff @(posedge clk) begin + if(rst) M2 <= 0; + else if(en) M2 <= +// synthesis translate off + (B1 === '0) || (AD1 === '0)? 
0 : +// synthesis translate on + B1 * AD1; + end + + // Stage #3: Accumulate + logic signed [47:0] P3 = 0; + always_ff @(posedge clk) begin + if(rst) P3 <= 0; + else if(en) P3 <= M2 + (L[3]? 0 : P3); + end + + assign pp = P3; + end : genBehav +`ifndef VERILATOR + else begin : genDSP + localparam logic [6:0] OPMODE_INVERSION = 7'b010_01_01; + uwire [6:0] opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 }; + case(VERSION) + 1: DSP48E1 #( + // Feature Control Attributes: Data Path Selection + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .USE_DPORT("TRUE"), // Select D port usage (TRUE or FALSE) + .USE_MULT("MULTIPLY"), // Select multiplier usage ("MULTIPLY", "DYNAMIC", or "NONE") + .USE_SIMD("ONE48"), // SIMD selection ("ONE48", "TWO24", "FOUR12") + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // "NO_RESET", "RESET_MATCH", "RESET_NOT_MATCH" + .MASK('1), // 48-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 48-bit pattern match for pattern detect + .SEL_MASK("MASK"), // "C", "MASK", "ROUNDING_MODE1", "ROUNDING_MODE2" + .SEL_PATTERN("PATTERN"), // Select pattern value ("PATTERN" or "C") + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect ("PATDET" or "NO_PATDET") + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0, 1 or 2) + .ADREG(1), // Number of pipeline stages for pre-adder (0 or 1) + .ALUMODEREG(0), // Number of pipeline stages for ALUMODE (0 or 1) + .AREG(0), // Number of pipeline stages for A (0, 1 or 2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0, 1 or 2) + .BREG(1), // Number of pipeline stages for B (0, 1 or 2) + .CARRYINREG(0), // Number of pipeline stages for CARRYIN (0 or 1) + .CARRYINSELREG(0), // Number of pipeline stages for CARRYINSEL (0 or 1) + .CREG(0), // Number of pipeline stages for C (0 or 1) + .DREG(0), // Number of pipeline stages for D (0 or 1) + .INMODEREG(0), // Number of pipeline stages for INMODE (0 or 1) + .MREG(1), // Number of multiplier pipeline stages (0 or 1) + .OPMODEREG(1), // Number of pipeline stages for OPMODE (0 or 1) + .PREG(1) // Number of pipeline stages for P (0 or 1) + ) dsp ( + // Cascade: 30-bit (each) output: Cascade Ports + .ACOUT(), // 30-bit output: A port cascade output + .BCOUT(), // 18-bit output: B port cascade output + .CARRYCASCOUT(), // 1-bit output: Cascade carry output + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade output + .PCOUT(), // 48-bit output: Cascade output + + // Control: 1-bit (each) output: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc output + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect output + .PATTERNDETECT(), // 1-bit output: Pattern detect output + .UNDERFLOW(), // 1-bit output: Underflow in add/acc output + + // Data: 4-bit (each) output: Data Ports + .CARRYOUT(), // 4-bit output: Carry output + .P(pp), // 48-bit output: Primary data output + + // Cascade: 30-bit (each) input: Cascade Ports + .ACIN('x), // 30-bit input: A cascade data input + .BCIN('x), // 18-bit input: B cascade input + .CARRYCASCIN('x), // 1-bit input: Cascade carry input + .MULTSIGNIN('x), // 1-bit input: Multiplier sign input + .PCIN('x), // 48-bit input: P cascade input + + // Control: 4-bit (each) input: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock 
input + .ALUMODE('0), // 4-bit input: ALU control input + .CARRYINSEL('0), // 3-bit input: Carry select input + .INMODE(5'b01100), // 5-bit input: INMODE control input + .OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input + + // Data: 30-bit (each) input: Data Ports + .A(aa), // 30-bit input: A data input + .B(bb), // 18-bit input: B data input + .C('x), // 48-bit input: C data input + .CARRYIN('0), // 1-bit input: Carry input signal + .D(dd), // 25-bit input: D data input + + // Reset/Clock Enable: 1-bit (each) input: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable input for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable input for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable input for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable input for ALUMODERE + .CEB1('0), // 1-bit input: Clock enable input for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable input for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable input for CREG + .CECARRYIN('0), // 1-bit input: Clock enable input for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable input for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable input for DREG + .CEINMODE('0), // 1-bit input: Clock enable input for INMODEREG + .CEM(en), // 1-bit input: Clock enable input for MREG + .CEP(en), // 1-bit input: Clock enable input for PREG + .RSTA('0), // 1-bit input: Reset input for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + 2: DSP48E2 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND('0), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE48"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_48_96"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
+ .MASK('1), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + .IS_RSTB_INVERTED('0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(1), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(0), // Pipeline stages for A (0-2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(1), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(0), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(1) // Number of pipeline stages for P (0-1) + ) dsp ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(), // 58-bit output: Cascade output + + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN('x), // 58-bit input: P cascade + + // Control inputs: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .INMODE(5'b01100), // 5-bit input: INMODE control + .OPMODE({ 2'b00, opmode }), // 9-bit input: Operation mode + + // Data inputs: Data Ports + .A(aa), // 34-bit input: A data + .B(bb), // 24-bit input: B data + 
.C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D(dd), // 27-bit input: D data + + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1('0), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(en), // 1-bit input: Clock enable for PREG + .RSTA('0), // 1-bit input: Reset for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + default: initial begin + $error("Unknown version DSP48E%0d.", VERSION); + $finish; + end + endcase + end : genDSP +`endif + + // External Canary Pipeline + logic [1:0] X1[3:1] = '{ default: 0 }; + logic [1:0] X2[3:1] = '{ default: 0 }; + logic [1:0] X3[3:1] = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) begin + X1 <= '{ default: 0 }; + X2 <= '{ default: 0 }; + X3 <= '{ default: 0 }; + end + else if(en) begin + X1 <= xx; + X2 <= X1; + foreach(X3[i]) begin + X3[i] <= X2[i] + (L[3]? 2'h0 : pp[D[i]+:2]); + end + end + end + + // Derive actual cross-lane overflows + for(genvar i = 0; i < 3; i++) begin + assign h3[s][i] = pp[D[i+1]+:2] - X3[i+1]; + end + assign p3[s] = pp; + + end : genSIMD + + // Stage #4: Cross-SIMD Reduction + + // Count leaves reachable from each node + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop + + uwire signed [ACCU_WIDTH -1:0] up4; + uwire signed [ACCU_WIDTH -8:0] hi4[3]; + uwire [$clog2(SIMD)+7:0] lo4[3]; + for(genvar i = 0; i < 4; i++) begin + localparam int unsigned LO_WIDTH = D[i+1] - D[i]; + localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; + + // Conclusive high part accumulation + if(i >= PE_REM && i < 3) begin : genHi + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) Hi4 <= (L[4]? 
0 : Hi4) + tree[0]; + end + assign hi4[i] = Hi4; + end : genHi + else if (i < 3) begin : genHiZero + assign hi4[i] = '0; + end : genHiZero + + // Conclusive low part accumulation + if(i >= PE_REM) begin : blkLo + // Adder Tree across all SIMD low contributions + localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); + uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + logic [ROOT_WIDTH-1:0] Lo4 = 0; + always_ff @(posedge clk) begin + if(rst) Lo4 <= 0; + else if(en) Lo4 <= tree[0]; + end + + if(i == 3) assign up4 = Lo4; + else assign lo4[i] = Lo4; + end : blkLo + else begin : blkLoZero + assign lo4[i] = '0; + end : blkLoZero + + end + + // Stage #5: Resolve lane totals + logic signed [3:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) Res5 <= '{ default: 0 }; + else if(en) begin + Res5[3] <= up4 - hi4[2]; + Res5[2] <= $signed({ hi4[2], {(D[3] - D[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1]; + Res5[1] <= $signed({ hi4[1], {(D[2] - D[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0]; + Res5[0] <= $signed({ hi4[0], {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] }); + end + end + + // Output + for(genvar pe = PE_BEG; pe < PE_END; pe++) begin + assign p[pe] = Res5[pe - PE_BEG + PE_REM]; + end + + end : genPipes + +endmodule : mvu_4sx4u diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv new file mode 100644 index 0000000000..f3cde9dea9 --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -0,0 +1,492 @@ +module mvu_8sx8u_dsp48 #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + + int unsigned VERSION = 1, + bit SIGNED_ACTIVATIONS = 0, + bit FORCE_BEHAVIORAL = 0 +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] w, // signed weights + input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS) + + // Ouput + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + // for verilator always use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + logic [1:5] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if(en) L <= { last, L[1:4] }; + end + assign vld = L[5]; + + // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism + localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH; + localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets + + localparam int unsigned PIPE_COUNT = (PE+1)/2; + for(genvar c = 0; c < PIPE_COUNT; c++) 
begin : genPipes + + localparam int unsigned PE_BEG = 2*c; + localparam int unsigned PE_END = PE < 2*(c+1)? PE : 2*(c+1); + localparam int unsigned PE_REM = 2*(c+1) - PE_END; + + uwire [57:0] p3[SIMD]; + uwire signed [ 1:0] h3[SIMD]; + for(genvar s = 0; s < SIMD; s++) begin : genSIMD + + // Input Lane Assembly + uwire [17:0] bb = { {(18-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; + logic [29:0] aa; + logic [26:0] dd; + logic [ 1:0] xx; + if(1) begin : blkVectorize + uwire [WEIGHT_WIDTH-1:0] ww[PE_END - PE_BEG]; + for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin + assign ww[pe] = w[PE_BEG + pe][s]; + if(pe) begin + if(BEHAVIORAL) assign xx = zero? 0 : ww[pe] * a[s]; +`ifndef VERILATOR + else begin + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[1]), + .O5(xx[0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end +`endif + end + end + always_comb begin + dd = '0; + aa = '0; + for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin + dd[D[pe + PE_REM] +: WEIGHT_WIDTH-1] = ww[pe]; + aa[D[pe + PE_REM] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; + end + end + end : blkVectorize + + uwire [47:0] pp; + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. + if(BEHAVIORAL) begin : genBehav + // Stage #1: Input Refine + logic signed [17:0] B1 = 0; + always_ff @(posedge clk) begin + if(zero) B1 <= 0; + else if(en) B1 <= bb; + end + + logic signed [26:0] AD1 = 0; + always_ff @(posedge clk) begin + if(rst) AD1 <= 0; + else if(en) AD1 <= dd - aa; + end + + // Stage #2: Multiply + logic signed [45:0] M2 = 0; + always_ff @(posedge clk) begin + if(rst) M2 <= 0; + else if(en) M2 <= +// synthesis translate off + (B1 === '0) || (AD1 === '0)? 0 : +// synthesis translate on + B1 * AD1; + end + + // Stage #3: Accumulate + logic signed [47:0] P3 = 0; + always_ff @(posedge clk) begin + if(rst) P3 <= 0; + else if(en) P3 <= M2 + (L[3]? 
0 : P3); + end + + assign pp = P3; + end : genBehav +`ifndef VERILATOR + else begin : genDSP + localparam logic [6:0] OPMODE_INVERSION = 7'b010_01_01; + uwire [6:0] opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 }; + case(VERSION) + 1: DSP48E1 #( + // Feature Control Attributes: Data Path Selection + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .USE_DPORT("TRUE"), // Select D port usage (TRUE or FALSE) + .USE_MULT("MULTIPLY"), // Select multiplier usage ("MULTIPLY", "DYNAMIC", or "NONE") + .USE_SIMD("ONE48"), // SIMD selection ("ONE48", "TWO24", "FOUR12") + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // "NO_RESET", "RESET_MATCH", "RESET_NOT_MATCH" + .MASK('1), // 48-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 48-bit pattern match for pattern detect + .SEL_MASK("MASK"), // "C", "MASK", "ROUNDING_MODE1", "ROUNDING_MODE2" + .SEL_PATTERN("PATTERN"), // Select pattern value ("PATTERN" or "C") + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect ("PATDET" or "NO_PATDET") + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0, 1 or 2) + .ADREG(1), // Number of pipeline stages for pre-adder (0 or 1) + .ALUMODEREG(0), // Number of pipeline stages for ALUMODE (0 or 1) + .AREG(0), // Number of pipeline stages for A (0, 1 or 2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0, 1 or 2) + .BREG(1), // Number of pipeline stages for B (0, 1 or 2) + .CARRYINREG(0), // Number of pipeline stages for CARRYIN (0 or 1) + .CARRYINSELREG(0), // Number of pipeline stages for CARRYINSEL (0 or 1) + .CREG(0), // Number of pipeline stages for C (0 or 1) + .DREG(0), // Number of pipeline stages for D (0 or 1) + .INMODEREG(0), // Number of pipeline stages for INMODE (0 or 1) + .MREG(1), // Number of multiplier pipeline stages (0 or 1) + .OPMODEREG(1), // Number of pipeline stages for OPMODE (0 or 1) + .PREG(1) // Number of pipeline stages for P (0 or 1) + ) dsp ( + // Cascade: 30-bit (each) output: Cascade Ports + .ACOUT(), // 30-bit output: A port cascade output + .BCOUT(), // 18-bit output: B port cascade output + .CARRYCASCOUT(), // 1-bit output: Cascade carry output + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade output + .PCOUT(), // 48-bit output: Cascade output + + // Control: 1-bit (each) output: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc output + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect output + .PATTERNDETECT(), // 1-bit output: Pattern detect output + .UNDERFLOW(), // 1-bit output: Underflow in add/acc output + + // Data: 4-bit (each) output: Data Ports + .CARRYOUT(), // 4-bit output: Carry output + .P(pp), // 48-bit output: Primary data output + + // Cascade: 30-bit (each) input: Cascade Ports + .ACIN('x), // 30-bit input: A cascade data input + .BCIN('x), // 18-bit input: B cascade input + .CARRYCASCIN('x), // 1-bit input: Cascade carry input + .MULTSIGNIN('x), // 1-bit input: Multiplier sign input + .PCIN('x), // 48-bit input: P cascade input + + // Control: 4-bit (each) input: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock input + .ALUMODE('0), // 4-bit input: ALU control input + .CARRYINSEL('0), // 3-bit input: Carry select input + .INMODE(5'b01100), // 5-bit input: INMODE control input + .OPMODE(opmode ^ 
OPMODE_INVERSION), // 7-bit input: Operation mode input + + // Data: 30-bit (each) input: Data Ports + .A(aa), // 30-bit input: A data input + .B(bb), // 18-bit input: B data input + .C('x), // 48-bit input: C data input + .CARRYIN('0), // 1-bit input: Carry input signal + .D(dd), // 25-bit input: D data input + + // Reset/Clock Enable: 1-bit (each) input: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable input for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable input for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable input for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable input for ALUMODERE + .CEB1('0), // 1-bit input: Clock enable input for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable input for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable input for CREG + .CECARRYIN('0), // 1-bit input: Clock enable input for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable input for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable input for DREG + .CEINMODE('0), // 1-bit input: Clock enable input for INMODEREG + .CEM(en), // 1-bit input: Clock enable input for MREG + .CEP(en), // 1-bit input: Clock enable input for PREG + .RSTA('0), // 1-bit input: Reset input for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + 2: DSP48E2 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND('0), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE48"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_48_96"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
+ .MASK('1), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + .IS_RSTB_INVERTED('0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(1), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(0), // Pipeline stages for A (0-2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(1), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(0), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(1) // Number of pipeline stages for P (0-1) + ) dsp ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(), // 58-bit output: Cascade output + + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN('x), // 58-bit input: P cascade + + // Control inputs: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .INMODE(5'b01100), // 5-bit input: INMODE control + .OPMODE({ 2'b00, opmode }), // 9-bit input: Operation mode + + // Data inputs: Data Ports + .A(aa), // 34-bit input: A data + .B(bb), // 24-bit input: B data + 
.C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D(dd), // 27-bit input: D data + + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1('0), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(en), // 1-bit input: Clock enable for PREG + .RSTA('0), // 1-bit input: Reset for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + default: initial begin + $error("Unknown version DSP48E%0d.", VERSION); + $finish; + end + endcase + end : genDSP +`endif + + // External Canary Pipeline + logic [1:0] X1 = '{ default: 0 }; + logic [1:0] X2 = '{ default: 0 }; + logic [1:0] X3 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) begin + X1 <= '{ default: 0 }; + X2 <= '{ default: 0 }; + X3 <= '{ default: 0 }; + end + else if(en) begin + X1 <= xx; + X2 <= X1; + X3 <= X2 + (L[3]? 2'h0 : pp[D[1]+:2]); + end + end + + // Derive actual cross-lane overflows + assign h3[s] = pp[D[1]+:2] - X3; + + assign p3[s] = pp; + + end : genSIMD + + // Stage #4: Cross-SIMD Reduction + + // Count leaves reachable from each node + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop + + uwire signed [ACCU_WIDTH -1:0] up4; + uwire signed [ACCU_WIDTH -SINGLE_PROD_WIDTH:0] hi4; + uwire [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0] lo4; + + // Conclusive high part accumulation + if(PE_REM == 0) begin : genHi + localparam int unsigned HI_WIDTH = ACCU_WIDTH - D[1]; + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) Hi4 <= (L[4]? 
0 : Hi4) + tree[0]; + end + assign hi4 = Hi4; + end : genHi + else begin : genHiZero + assign hi4 = '0; + end : genHiZero + + for(genvar i = 0; i < 2; i++) begin + localparam int unsigned LO_WIDTH = D[i+1] - D[i]; + // Conclusive low part accumulation + if(i >= PE_REM) begin : blkLo + // Adder Tree across all SIMD low contributions + localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); + uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + logic [ROOT_WIDTH-1:0] Lo4 = 0; + always_ff @(posedge clk) begin + if(rst) Lo4 <= 0; + else if(en) Lo4 <= tree[0]; + end + + if(i == 1) assign up4 = Lo4; + else assign lo4 = Lo4; + end : blkLo + else begin : blkLoZero + assign lo4 = '0; + end : blkLoZero + + end + + // Stage #5: Resolve lane totals + logic signed [1:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) Res5 <= '{ default: 0 }; + else if(en) begin + Res5[1] <= up4 - hi4; + Res5[0] <= $signed({ hi4, {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4 }); + end + end + + // Output + for(genvar pe = PE_BEG; pe < PE_END; pe++) begin + assign p[pe] = Res5[pe - PE_BEG + PE_REM]; + end + + end : genPipes + +endmodule : mvu_8sx8u_dsp48 diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v new file mode 100644 index 0000000000..e15f77fbae --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v @@ -0,0 +1,93 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Verilog AXI-lite wrapper for MVU. 
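+ * @details Wraps the mvu_8sx9_axi core with AXI-Stream weight, input and output
+ *          interfaces. Parameters of the form $NAME$ are template placeholders,
+ *          presumably substituted at code-generation time.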
+ *****************************************************************************/ + +module $MODULE_NAME_AXI_WRAPPER$ #( + parameter MW = $MW$, + parameter MH = $MH$, + parameter PE = $PE$, + parameter SIMD = $SIMD$, + parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, + parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, + parameter ACCU_WIDTH = $ACCU_WIDTH$, + parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, + parameter SEGMENTLEN = $SEGMENTLEN$, + parameter RAM_STYLE = "$IBUF_RAM_STYLE$", + + // Safely deducible parameters + parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, + parameter OUTPUT_LANES = PE, + parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 +)( + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + + // Weight Stream + input [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input s_axis_weights_tvalid, + output s_axis_weights_tready, + + // Input Stream + input [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input s_axis_input_tvalid, + output s_axis_input_tready, + + // Output Stream + output [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, + output m_axis_output_tvalid, + input m_axis_output_tready +); + +mvu_8sx9_axi #( + .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE) + ) inst ( + .ap_clk(ap_clk), + .ap_rst_n(ap_rst_n), + .s_axis_weights_tdata(s_axis_weights_tdata), + .s_axis_weights_tvalid(s_axis_weights_tvalid), + .s_axis_weights_tready(s_axis_weights_tready), + .s_axis_input_tdata(s_axis_input_tdata), + .s_axis_input_tvalid(s_axis_input_tvalid), + .s_axis_input_tready(s_axis_input_tready), + .m_axis_output_tdata(m_axis_output_tdata), + .m_axis_output_tvalid(m_axis_output_tvalid), + .m_axis_output_tready(m_axis_output_tready) +); + +endmodule : $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv new file mode 100644 index 0000000000..53cf71fd5f --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv @@ -0,0 +1,430 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP58. + *****************************************************************************/ + +module mvu_vvu_8sx9_dsp58 #( + bit IS_MVU, + int unsigned PE, + int unsigned SIMD, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) + bit FORCE_BEHAVIORAL = 0, + + localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD, + localparam int unsigned WEIGHT_ELEMENTS = PE*SIMD + ) + ( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic [WEIGHT_ELEMENTS-1:0][WEIGHT_WIDTH-1:0] w, // weights + input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations + + // Ouput + output logic vld, + output logic [PE-1:0][ACCU_WIDTH-1:0] p + ); + // for verilator always use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; + +//-------------------- Declare global signals --------------------\\ + localparam int unsigned CHAINLEN = (SIMD+2)/3; + localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length + localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE; + uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN]; + uwire [23:0] b_in_i [PE][CHAINLEN]; + uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator + +//-------------------- Shift register for opmode select signal --------------------\\ + localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) + logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). 
Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) + + always_ff @(posedge clk) begin + if(rst) L <= '{default: 0}; + else if(en) begin + L[1+MAX_PIPELINE_STAGES] <= last; + L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES]; + end + end + assign vld = L[0]; + +//-------------------- Shift register for ZERO flag --------------------\\ + logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) + + if (MAX_PIPELINE_STAGES > 1) begin : genZreg + always_ff @(posedge clk) begin + if (rst) Z <= '{default: 0}; + else if(en) begin + Z[0] <= zero; + if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3]; + end + end + end; + +//-------------------- Buffer for input activations --------------------\\ + localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; + for (genvar k=0; k1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3; + + if (EXTERNAL_PREGS > 0) begin : genExternalPregAct + logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0}; + always_ff @(posedge clk) begin + if (rst) A <= '{default: 0}; + else if(en) begin + A[EXTERNAL_PREGS-1] <= + // synthesis translate_off + zero ? '1 : + // synthesis translate_on + a[SIMD*k + 3*i +: LANES_OCCUPIED]; + if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; + end + end + for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3; + + if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight + logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0}; + always_ff @(posedge clk) begin + if (rst) B <= '{default: 0}; + else if (en) begin + B[i][EXTERNAL_PREGS-1] <= +// synthesis translate_off + zero ? '1 : +// synthesis translate_on + //w[i][3*j +: LANES_OCCUPIED]; + w[SIMD*i+3*j +: LANES_OCCUPIED]; + if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; + end + end + for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin + assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] }; + end : genBin + for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero + assign b_in_i[i][j][8*k +: 8] = 8'b0; + end : genBinZero + end : genExternalPregWeight + else begin : genInpDSPWeight + for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin + assign b_in_i[i][j][8*k +: 8] = +// synthesis translate_off + zero ? '1 : +// synthesis translate_on + //PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; + PAD_BITS_WEIGHT == 0 ? w[SIMD*i+3*j+k] : { {PAD_BITS_WEIGHT{w[SIMD*i+3*j+k][WEIGHT_WIDTH-1]}}, w[SIMD*i+3*j+k] }; + end : genBin + for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero + assign b_in_i[i][j][8*k +: 8] = 8'b0; + end : genBinZero + end : genInpDSPWeight + end : genWeightSIMD + end : genWeightPE + +//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\ + for (genvar i=0; i0 ? 2 : 1; // 1 : 0 + localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1; + localparam bit FIRST = j == 0; + localparam bit LAST = j == CHAINLEN-1; + uwire [57:0] pp; + + if (LAST) begin : genPOUT + assign p[i] = pp[ACCU_WIDTH-1:0]; + end + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. 
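+			// Summary sketch of the per-slice compute realized below (DSP58 INT8 mode, 3 MACs/DSP):
+			//   M = sum over k=0..2 of $signed(A[9*k +: 9]) * $signed(B[8*k +: 8])   (gated to 0 by the delayed zero flag)
+			//   P = M                        -- first slice of a chain
+			//   P = M + PCIN                 -- intermediate slices
+			//   P = (L? 0 : P) + M [+ PCIN]  -- last slice, accumulating until the delayed last flag arrives
+			// The behavioral model in genBehav mirrors this mapping one-to-one.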
+ if(BEHAVIORAL) begin : genBehav + // Stage #1: Input A/B + logic signed [33:0] Areg [INTERNAL_PREGS]; + always_ff @(posedge clk) begin + if (rst) Areg <= '{ default : 0}; + else if (en) begin + Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }; + if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0]; + end + end + logic signed [23:0] Breg [INTERNAL_PREGS]; + always_ff @(posedge clk) begin + if (rst) Breg <= '{ default : 0}; + else if (en) begin + Breg[0] <= b_in_i[i][j]; + if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0]; + end + end + + // Stage #2: Multiply-Accumulate + logic signed [57:0] Mreg; + logic InmodeZero = 0; + always_ff @(posedge clk) begin + if (rst) InmodeZero <= 0; + else if (en) InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero ); + end + always_ff @(posedge clk) begin + if (rst) Mreg <= 0; + else if (en) begin + automatic logic signed [57:0] m = 0; + for (int k = 0; k < 3; k++) begin + m = m + (InmodeZero ? 0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8])); + end + Mreg <= m; + end + end + + // Stage #3: Accumulate + logic signed [57:0] Preg; + logic Opmode = 0; + if (FIRST && !LAST) begin : genFirst + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg; + end + end + else assign Preg = Mreg; + end + else if (FIRST && LAST) begin : genSingle + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg; + end + end + else if (!FIRST && LAST) begin : genLast + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1]; + end + end + else begin : genMid + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg + pcout[i][j-1]; + end + end + else assign Preg = Mreg + pcout[i][j-1]; + end + assign pp = Preg; + assign pcout[i][j] = Preg; + end : genBehav +`ifndef VERILATOR + else begin: genDSP + DSP58 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("A"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for + // legacy mode. + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND(58'h000000000000000), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
+ .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK + .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE + .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE + .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 + FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN + 2'b01, // Y : M + 2'b01 // X: M + }), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA + .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC + .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM + .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(0), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) + .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(1), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(PREG), // Number of pipeline stages for P (0-1) + .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). + ) + DSP58_inst ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(pcout[i][j]), // 58-bit output: Cascade output + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN(FIRST ? 
'x : pcout[i][j-1]), // 58-bit input: P cascade + // Control inputs: Control Inputs/Status Bits + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .CLK(clk), // 1-bit input: Clock + .INMODE({ + INTERNAL_PREGS==2 ? 1'b0 : 1'b1, + 2'b00, + TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, + INTERNAL_PREGS==2 ? 1'b0 : 1'b1 + }), // 5-bit input: INMODE control + .NEGATE('0), // 3-bit input: Negates the input of the multiplier + .OPMODE({ + LAST ? {1'b0, L[1]} : 2'b00, + 7'b000_0000 + }), // 9-bit input: Operation mode + // Data inputs: Data Ports + .A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }), // 34-bit input: A data + .B(b_in_i[i][j]), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D('x), // 27-bit input: D data + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. + .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG + .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD('0), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(PREG && en), // 1-bit input: Clock enable for PREG + .RSTA(rst), // 1-bit input: Reset for AREG + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTB(rst), // 1-bit input: Reset for BREG + .RSTC('0), // 1-bit input: Reset for CREG + .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTD('0), // 1-bit input: Reset for DREG and ADREG + .RSTINMODE(rst), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(PREG && rst) // 1-bit input: Reset for PREG + ); + end : genDSP +`endif + end : genDSPChain + end : genDSPPE + +endmodule : mvu_vvu_8sx9_dsp58 diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv new file mode 100644 index 0000000000..d40c5e1b10 --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -0,0 +1,383 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper. + * @details + * The following compute cores are supported: + * - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, + * (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP, + * [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP, + * 'unconstrained' LUT-based MVU and VVU. + * Folding hints: + * - PE scaling should divide MH. + * - SIMD scaling should divide MW. + * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to + * impact critical paths more than PE scaling. PE scaling implies a + * bigger fanout on the input activations. + * - Full unfolding along MH (PE=MH) results in no replay buffer instantiated + *****************************************************************************/ + +module mvu_vvu_axi #( + bit IS_MVU, + parameter COMPUTE_CORE, + int unsigned MW, + int unsigned MH, + int unsigned PE, + int unsigned SIMD, + int unsigned SEGMENTLEN = 0, + + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + + bit PUMPED_COMPUTE = 0, // requires an even SIMD % 2 == 0 + bit FORCE_BEHAVIORAL = 0, + bit M_REG_LUT = 1, + + // Safely deducible parameters + localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7)/8 * 8, + localparam int unsigned OUTPUT_STREAM_WIDTH = PE*ACCU_WIDTH, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7)/8 * 8, + localparam bit SIMD_UNEVEN = SIMD % 2 +)( + // Global Control + input logic ap_clk, + input logic ap_clk2x, // synchronous, double-speed clock; only used for PUMPED_COMPUTE + input logic ap_rst_n, + + // Weight Stream + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, + output logic s_axis_weights_tready, + + // Input Stream + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, + output logic s_axis_input_tready, + + // Output Stream + output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, + output logic m_axis_output_tvalid, + input logic m_axis_output_tready +); + +//-------------------- Parameter sanity checks --------------------\\ + initial begin + if (MW % SIMD != 0) begin + $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); + $finish; + end + if (MH % PE != 0) begin + $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); + $finish; + end + if (WEIGHT_WIDTH > 8) begin + $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); + $finish; + end + if (ACTIVATION_WIDTH > 8) begin + if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin + $error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH); + $finish; + end + end + if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin + if (SEGMENTLEN == 0) begin + $warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + end + if (SEGMENTLEN > (SIMD+2)/3) begin + $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + $finish; + end + end + if (!IS_MVU) begin + if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin + $error("VVU only supported on DSP58 or LUT-based implementation"); + $finish; + end + end + + // //- Pumping Constraints --------- + // if(PUMPED_COMPUTE) begin + // if(SIMD % 2 != 0) begin + // $error("Odd SIMD=%0d is incompatible with pumped compute.", SIMD); + // $finish; + // end + // end + end + + uwire clk = ap_clk; + uwire clk2x = ap_clk2x; + uwire rst = !ap_rst_n; + + //- Replay to Accommodate Neuron Fold ----------------------------------- + typedef logic [(IS_MVU? 1:PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t; + uwire mvu_flatin_t amvau; + uwire alast; + uwire afin; + uwire avld; + uwire ardy; + + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NF = MH/PE; + replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvu_flatin_t))) activation_replay ( + .clk, .rst, + .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)), + .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) + ); + + //- Unflatten inputs into structured matrices --------------------------- + localparam int unsigned ACT_PE = IS_MVU? 
1 : PE; + typedef logic [PE -1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] mvu_w_t; + typedef logic [ACT_PE-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_a_t; + + uwire mvu_w_t mvu_w = s_axis_weights_tdata; + + //- Conditional Activations Layout Adjustment for VVU + uwire mvu_a_t amvau_i; + if (IS_MVU || (PE == 1)) begin : genMVUInput + assign amvau_i = amvau; + end : genMVUInput + else begin : genVVUInput + // The input stream will have the channels interleaved for VVU when PE>1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i) + for(genvar pe = 0; pe < ACT_PE; pe++) begin + for(genvar simd = 0; simd < SIMD; simd++) begin + assign amvau_i[pe][simd] = amvau[simd*ACT_PE+pe]; + end + end + end : genVVUInput + + //- Flow Control Bracket around Compute Core ---------------------------- + uwire en; + uwire istb = avld && s_axis_weights_tvalid; + assign ardy = en && s_axis_weights_tvalid; + assign s_axis_weights_tready = en && avld; + + //- Conditionally Pumped DSP Compute ------------------------------------ + typedef logic [PE-1:0][ACCU_WIDTH-1:0] dsp_p_t; + uwire ovld; + uwire dsp_p_t odat; + if(1) begin : blkDsp + localparam int unsigned EFFECTIVE_SIMD = SIMD_UNEVEN && PUMPED_COMPUTE ? SIMD+1 : SIMD; + localparam int unsigned DSP_SIMD = EFFECTIVE_SIMD/(PUMPED_COMPUTE+1); + typedef logic [PE -1:0][DSP_SIMD-1:0][WEIGHT_WIDTH -1:0] dsp_w_t; + typedef logic [ACT_PE-1:0][DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0] dsp_a_t; + + uwire dsp_clk; + uwire dsp_en; + + uwire dsp_last; + uwire dsp_zero; + uwire dsp_w_t dsp_w; + uwire dsp_a_t dsp_a; + + uwire dsp_vld; + uwire dsp_p_t dsp_p; + + if(!PUMPED_COMPUTE) begin : genUnpumpedCompute + assign dsp_clk = clk; + assign dsp_en = en; + + assign dsp_last = alast && avld; + assign dsp_zero = !istb; + assign dsp_w = mvu_w; + assign dsp_a = amvau_i; + + assign ovld = dsp_vld; + assign odat = dsp_p; + end : genUnpumpedCompute + else begin : genPumpedCompute + assign dsp_clk = clk2x; + + // Identify second fast cycle just before active slow clock edge + logic Active = 0; + if(1) begin : blkActive + uwire clk_lut[2]; // Put some LUT delay on the input from the fast clock net + (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut0(.O(clk_lut[0]), .I0(clk)); + (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut1(.O(clk_lut[1]), .I0(clk_lut[0])); + always_ff @(posedge clk2x) Active <= clk_lut[1]; + end : blkActive + + // The input for a slow cycle is split across two fast cycles along the SIMD dimension. + // - Both fast cycles are controlled by the same enable state. + // - A zero cycle is duplicated across both fast cycles. + // - The last flag must be restricted to the second fast cycle. + + dsp_w_t W = 'x; + for(genvar pe = 0; pe < PE; pe++) begin : genPERegW + + uwire [2*DSP_SIMD-1:0][WEIGHT_WIDTH-1:0] w; + for(genvar i = 0; i < SIMD; i++) assign w[i] = mvu_w[pe][i]; + for(genvar i = SIMD; i < 2*DSP_SIMD; i++) assign w[i] = 0; + + always_ff @(posedge clk2x) begin + if(rst) W[pe] <= 'x; + else if(en) W[pe] <= w[(Active? 
DSP_SIMD : 0) +: DSP_SIMD]; + end + + end : genPERegW + + dsp_a_t A = 'x; + for(genvar pe = 0; pe < ACT_PE; pe++) begin : genPERegA + + uwire [2*DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0] a; + for(genvar i = 0; i < SIMD; i++) assign a[i] = amvau_i[pe][i]; + for(genvar i = SIMD; i < 2*DSP_SIMD; i++) assign a[i] = 0; + + always_ff @(posedge clk2x) begin + if(rst) A[pe] <= 'x; + else if(en) A[pe] <= a[(Active? DSP_SIMD : 0) +: DSP_SIMD]; + end + + end : genPERegA + + logic Zero = 1; + logic Last = 0; + always_ff @(posedge clk2x) begin + if(rst) begin + Zero <= 1; + Last <= 0; + end + else if(en) begin + Zero <= !istb; + Last <= alast && avld && Active; + end + end + + assign dsp_en = en; + assign dsp_last = Last; + assign dsp_zero = Zero; + assign dsp_w = W; + assign dsp_a = A; + + // Since no two consecutive last cycles will ever be asserted on the input, + // valid outputs will also always be spaced by, at least, one other cycle. + // We can always hold a captured output for two cycles to allow the slow + // clock to pick it up. + logic Vld = 0; + dsp_p_t P = 'x; + always_ff @(posedge clk2x) begin + if(rst) begin + Vld <= 0; + P <= 'x; + end + else if(en) begin + if(dsp_vld) P <= dsp_p; + Vld <= dsp_vld || (Vld && !Active); + end + end + assign ovld = Vld; + assign odat = P; + + end : genPumpedCompute + + case(COMPUTE_CORE) + "mvu_vvu_8sx9_dsp58": + mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + "mvu_4sx4u": + mvu_4sx4u #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + "mvu_8sx8u_dsp48": + mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + "mvu_vvu_lut": + mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + default: initial begin + $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); + $finish; + end + endcase + + end : blkDsp + +//-------------------- Output register slice --------------------\\ + // Make `en`computation independent from external inputs. + // Drive all outputs from registers. 
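+  // Register slice structure (a reading of the code below): A is a one-deep
+  // side-step (skid) register that catches odat whenever B cannot accept it;
+  // B is the registered AXI output. Since en = A.rdy, the compute core is
+  // stalled only while both A and B are occupied, i.e. after one beat of
+  // backpressure has already been absorbed.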
+ struct packed { + logic rdy; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } A = '{ rdy: 1, default: 'x }; // side-step register used when encountering backpressure + struct packed { + logic vld; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } B = '{ vld: 0, default: 'x }; // ultimate output register + + assign en = A.rdy; + uwire b_load = !B.vld || m_axis_output_tready; + + always_ff @(posedge clk) begin + if(rst) begin + A <= '{ rdy: 1, default: 'x }; + B <= '{ vld: 0, default: 'x }; + end + else begin + if(A.rdy) A.dat <= odat; + A.rdy <= (A.rdy && !ovld) || b_load; + + if(b_load) begin + B <= '{ + vld: ovld || !A.rdy, + dat: A.rdy? odat : A.dat + }; + end + end + end + assign m_axis_output_tvalid = B.vld; + // Why would we need a sign extension here potentially creating a higher signal load into the next FIFO? + // These extra bits should never be used. Why not 'x them out? + assign m_axis_output_tdata = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat}; + +endmodule : mvu_vvu_axi diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v new file mode 100644 index 0000000000..11949dec24 --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -0,0 +1,97 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Verilog AXI-lite wrapper for MVU & VVU. 
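+ * @details Code-generation template around mvu_vvu_axi; the $...$ placeholders
+ *          are expected to be substituted with concrete values by the FINN
+ *          build flow before synthesis.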
+ *****************************************************************************/ + +module $MODULE_NAME_AXI_WRAPPER$ #( + parameter IS_MVU = $IS_MVU$, + parameter COMPUTE_CORE = "$COMPUTE_CORE$", + parameter PUMPED_COMPUTE = $PUMPED_COMPUTE$, + parameter MW = $MW$, + parameter MH = $MH$, + parameter PE = $PE$, + parameter SIMD = $SIMD$, + parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, + parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, + parameter ACCU_WIDTH = $ACCU_WIDTH$, + parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, + parameter SEGMENTLEN = $SEGMENTLEN$, + parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, + + // Safely deducible parameters + parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, + parameter OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 +)( + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk2x CLK" *) + input ap_clk2x, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + + // Weight Stream + input [WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, + input weights_V_TVALID, + output weights_V_TREADY, + // Input Stream + input [INPUT_STREAM_WIDTH_BA-1:0] in0_V_TDATA, + input in0_V_TVALID, + output in0_V_TREADY, + // Output Stream + output [OUTPUT_STREAM_WIDTH_BA-1:0] out_V_TDATA, + output out_V_TVALID, + input out_V_TREADY +); + +mvu_vvu_axi #( + .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) + ) inst ( + .ap_clk(ap_clk), + .ap_clk2x(ap_clk2x), + .ap_rst_n(ap_rst_n), + .s_axis_weights_tdata(weights_V_TDATA), + .s_axis_weights_tvalid(weights_V_TVALID), + .s_axis_weights_tready(weights_V_TREADY), + .s_axis_input_tdata(in0_V_TDATA), + .s_axis_input_tvalid(in0_V_TVALID), + .s_axis_input_tready(in0_V_TREADY), + .m_axis_output_tdata(out_V_TDATA), + .m_axis_output_tvalid(out_V_TVALID), + .m_axis_output_tready(out_V_TREADY) +); + +endmodule // $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv new file mode 100644 index 0000000000..3e2766f63d --- /dev/null +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -0,0 +1,181 @@ +/****************************************************************************** + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Replay buffer for counted sequences on an AXI-lite stream. + * @author Thomas B. Preußer + *****************************************************************************/ + +module replay_buffer #( + int unsigned LEN, // Sequence length + int unsigned REP, // Sequence replay count + int unsigned W // Data width +)( + input logic clk, + input logic rst, + + input logic [W-1:0] idat, + input logic ivld, + output logic irdy, + + output logic [W-1:0] odat, + output logic olast, + output logic ofin, + output logic ovld, + input logic ordy +); + + if(LEN == 0) initial begin + $error("%m: Illegal zero sequence LEN."); + $finish; + end + if(REP == 0) initial begin + $error("%m: Illegal zero REP count."); + $finish; + end + + // Track position in Sequence + uwire last_item; + uwire shift; + if(LEN == 1) assign last_item = 1; + else begin + typedef logic [$clog2(LEN)-1:0] count_t; + count_t Count = 0; + logic Last = 0; + always_ff @(posedge clk) begin + if(rst) begin + Count <= 0; + Last <= 0; + end + else if(shift) begin + Count <= Count + (Last? 2**$clog2(LEN)-LEN+1 : 1); + Last <= (((LEN-2) & ~Count) == 0) && ((LEN&1) || !Last); + end + end + assign last_item = Last; + end + + if(REP == 1) begin + assign shift = ivld && ordy; + + assign irdy = ordy; + assign odat = idat; + assign olast = last_item; + assign ofin = last_item; + assign ovld = ivld; + end + else begin + + // Track Repetitions + uwire last_rep; + if(1) begin : blkRep + typedef logic [$clog2(REP)-1:0] rep_t; + rep_t RepCnt = 0; + logic RepLst = 0; + always_ff @(posedge clk) begin + if(rst) begin + RepCnt <= 0; + RepLst <= 0; + end + else if(last_item && shift) begin + RepCnt <= RepCnt + (RepLst? 2**$clog2(REP)-REP+1 : 1); + RepLst <= (((REP-2) & ~RepCnt) == 0) && ((REP&1) || !RepLst); + end + end + assign last_rep = RepLst; + end : blkRep + + localparam int unsigned AWIDTH = LEN < 2? 
1 : $clog2(LEN); + typedef logic [AWIDTH :0] ptr_t; // pointers with additional generational MSB + typedef logic [W -1:0] data_t; + + // Output Registers + data_t ODat; + logic OVld = 0; + logic OLst = 'x; + logic OFin = 'x; + assign odat = ODat; + assign olast = OLst; + assign ofin = OFin; + assign ovld = OVld; + + // Buffer Memory Management + data_t Mem[2**AWIDTH]; + ptr_t WP = 0; // Write Pointer + ptr_t RP = 0; // Read Pointer + ptr_t FP = 0; // Free Pointer + + // Operational Guards + // Occupancy: WP-FP + // WP-FP < 2**AWIDTH -> writing allowed + // - increments WP + // Availability: WP-RP + // WP-RP > 0 -> reading allowed + // - increments RP, last in sequence rewinds to FP for non-final repetition + // - increments FP in last repetition + assign irdy = !((WP-FP) >> AWIDTH); + + uwire wr = irdy && ivld; + uwire rd = !OVld || ordy; + always_ff @(posedge clk) begin + if(wr) Mem[WP[AWIDTH-1:0]] <= idat; + if(rd) ODat <= Mem[RP[AWIDTH-1:0]]; + end + + uwire vld = (RP != WP); + assign shift = rd && vld; + always_ff @(posedge clk) begin + if(rst) begin + WP <= 0; + RP <= 0; + FP <= 0; + + OVld <= 0; + OLst <= 'x; + OFin <= 'x; + end + else begin + if(wr) WP <= WP + 1; + if(rd) begin + if(vld) begin + automatic logic rewind = last_item && !last_rep; + RP <= RP + (rewind? 2**(AWIDTH+1)-LEN+1 : 1); + FP <= FP + last_rep; + end + + OVld <= vld; + OLst <= last_item; + OFin <= last_rep && last_item; + end + end + end + + end + +endmodule : replay_buffer diff --git a/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv new file mode 100644 index 0000000000..c8bfe5370a --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv @@ -0,0 +1,165 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU core compute kernel. 
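+ * @details Drives std::randomize()-generated activations and weights into the
+ *          mvu_8sx9 core and compares every produced output row against a
+ *          golden reference computed in check_output().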
+ *****************************************************************************/ + +module mvu_8sx9_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MH = 256; + localparam int unsigned PE = 16; + localparam int unsigned MW = 600; + localparam int unsigned SIMD = 60; + localparam int unsigned SEGMENTLEN = 4; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + typedef logic signed [PE-1:0][57:0] output_t; + typedef output_t output_vector_t [NF]; + + function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); + automatic output_vector_t res = '{default: 0}; + for (int j = 0; j 1) && !rst; + end + + // Compare computed output against golden output when vld flag is raised by DUT + always_ff @(posedge clk iff (vld && en)) begin + foreach(p[i]) begin + assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + NF_CNT += 1; + end + + // Instantiate DUT + mvu_8sx9 #( + .PE(PE), + .SIMD(SIMD), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .SEGMENTLEN(SEGMENTLEN) + ) + dut ( + .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p + ); + +endmodule : mvu_8sx9_tb diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv new file mode 100644 index 0000000000..08e8679214 --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -0,0 +1,241 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU AXI-lite interface wrapper. + *****************************************************************************/ + +module mvu_axi_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam bit IS_MVU = 1; + localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58"; + localparam int unsigned MW = 36; + localparam int unsigned MH = 4; + localparam int unsigned SIMD = 36; + localparam int unsigned PE = 4; + localparam int unsigned SEGMENTLEN = 2.0; + localparam bit FORCE_BEHAVIORAL = 1; + localparam bit M_REG_LUT = 1; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 4; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 0; + // Simulation constants + localparam int unsigned NF = IS_MVU ? MH/PE : 1; + localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE); + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = ((IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - (IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n = 0; + initial begin + repeat(16) @(posedge clk); + ap_rst_n <= 1; + end + + uwire ap_clk = clk; + + // Generate activations + typedef logic [(IS_MVU ? 
1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } activations; + + initial begin + activations.vld = 0; + activations.dat = 'X; + @(posedge clk iff ap_rst_n); + + for (int i=0; i= 0; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = 'X; + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i 1 ? $signed(a[i/SIMD/PE][i % (SIMD*PE)]) : $signed(a[i/SIMD/PE][(i)%(SIMD*PE)]) ) * $signed(w[0][i/SIMD/PE][i/PE][i%SIMD]); + // else + // res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : + // $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[i/SIMD/PE][i % (SIMD*PE)]}) : $signed({1'b0, a[i/SIMD/PE][i%(SIMD*PE)]}) ) * $signed(w[0][i/SIMD][0][i%SIMD]); + // end + // end + // The input stream will have the channels interleaved for VVU when PE>1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + for (int i = 0; i < NF; i++) begin + for (int j = 0; j < SF; j++) begin + for (int k = 0; k < PE; k++) begin + for (int l = 0; l < SIMD; l++) begin + if (SIGNED_ACTIVATIONS) + res[i][k] = IS_MVU ? $signed(res[i][k]) + $signed(a[j][l]) * $signed(w[i][j][k][l]) : + $signed(res[i][k]) + $signed(a[j][k + l*PE]) * $signed(w[i][j][k][l]); + else + res[i][k] = IS_MVU ? $signed(res[i][k]) + $signed({1'b0, a[j][l]}) * $signed(w[i][j][k][l]) : + $signed(res[i][k]) + $signed({1'b0, a[j][k + l*PE]}) * $signed(w[i][j][k][l]); + end + end + end + end + return res; + endfunction : check_output; + + output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS); + + int unsigned NF_CNT = 0; + initial begin + outputs.rdy = 0; + while (NF_CNT < NF) begin + // Loop until both rdy & vld are asserted + do begin + outputs.rdy <= $urandom()%7 >= 0; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_vvu_axi #( + .IS_MVU(IS_MVU), + .COMPUTE_CORE(COMPUTE_CORE), + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), + .M_REG_LUT(M_REG_LUT) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + +endmodule : mvu_axi_tb diff --git a/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv new file mode 100644 index 0000000000..108980c497 --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv @@ -0,0 +1,142 @@ +module mvu_dsp58_tb; + + localparam int unsigned N = 1000; + + localparam int unsigned MW = 12; + localparam int unsigned MH = 4; + localparam int unsigned PE = 2; + localparam int unsigned SIMD = 6; + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 8; + localparam int unsigned ACCU_WIDTH = 24; + + //- Global Control ------------------ + logic clk = 1; + logic clk2x = 1; + always #5ns clk = !clk; + always #2.5ns clk2x = !clk2x; + + logic rst = 1; + initial begin + repeat(8) @(posedge clk); + rst <= 0; + end + + //- DUTs ---------------------------- + + // Weight Stream + logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] s_axis_weights_tdata; + logic s_axis_weights_tvalid[2]; + uwire s_axis_weights_tready[2]; + + // Input Stream + logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] s_axis_input_tdata; + logic s_axis_input_tvalid[2]; + uwire s_axis_input_tready[2]; + + // Output Stream + uwire [PE-1:0][ACCU_WIDTH-1:0] m_axis_output_tdata[2]; + uwire m_axis_output_tvalid[2]; + logic m_axis_output_tready[2]; + + for(genvar i = 0; i < 2; i++) begin : genDUTs + mvu_vvu_axi #( + .IS_MVU(1), + .COMPUTE_CORE("mvu_vvu_8sx9_dsp58"), + .MW(MW), .MH(MH), + .PE(PE), .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .PUMPED_COMPUTE(i) + ) dut ( + .ap_clk(clk), .ap_clk2x(clk2x), .ap_rst_n(!rst), + .s_axis_weights_tdata, .s_axis_weights_tvalid(s_axis_weights_tvalid[i]), .s_axis_weights_tready(s_axis_weights_tready[i]), + .s_axis_input_tdata, .s_axis_input_tvalid (s_axis_input_tvalid [i]), .s_axis_input_tready (s_axis_input_tready [i]), + .m_axis_output_tdata(m_axis_output_tdata[i]), .m_axis_output_tvalid (m_axis_output_tvalid [i]), .m_axis_output_tready (m_axis_output_tready [i]) + ); + end : genDUTs + + + //- Stimuli ------------------------- + + // Weight Feed + initial begin + s_axis_weights_tvalid = '{ default: 0 }; + s_axis_weights_tdata = 'x; + @(posedge clk iff !rst); + + repeat(N * (MH/PE)*(MW/SIMD)) begin + automatic type(s_axis_weights_tdata) weights; + std::randomize(weights); + s_axis_weights_tdata <= weights; + s_axis_weights_tvalid <= '{ default: 1 }; + fork + begin + @(posedge clk iff s_axis_weights_tready[0]); + s_axis_weights_tvalid[0] <= 0; + end + begin + @(posedge clk iff s_axis_weights_tready[1]); + 
s_axis_weights_tvalid[1] <= 0; + end + join + end + end + + // Input Feed + initial begin + s_axis_input_tvalid = '{ default: 0 }; + s_axis_input_tdata = 'x; + @(posedge clk iff !rst); + + repeat(N * (MW/SIMD)) begin + automatic type(s_axis_input_tdata) in; + std::randomize(in); + s_axis_input_tdata <= in; + s_axis_input_tvalid <= '{ default: 1 }; + fork + begin + @(posedge clk iff s_axis_input_tready[0]); + s_axis_input_tvalid[0] <= 0; + end + begin + @(posedge clk iff s_axis_input_tready[1]); + s_axis_input_tvalid[1] <= 0; + end + join + end + end + + // Output Capture and Comparison + initial begin + m_axis_output_tready = '{ default: 0 }; + @(posedge clk iff !rst); + + repeat(N * (MH/PE)) begin + automatic type(m_axis_output_tdata) res; + m_axis_output_tready <= '{ default: 1 }; + fork + begin + @(posedge clk iff m_axis_output_tvalid[0]); + m_axis_output_tready[0] <= 0; + res[0] = m_axis_output_tdata[0]; + end + begin + @(posedge clk iff m_axis_output_tvalid[1]); + m_axis_output_tready[1] <= 0; + res[1] = m_axis_output_tdata[1]; + end + join + assert(res[0] == res[1]) else begin + $error("Output mismatch: %0x <=> %0x", res[0], res[1]); + $stop; + end + while($urandom()%7 < MW/SIMD) @(posedge clk); // Occassional backpressure + end + + $display("Test completed."); + $finish; + end + +endmodule : mvu_dsp58_tb diff --git a/finn-rtllib/mvu/tb/replay_buffer_tb.sv b/finn-rtllib/mvu/tb/replay_buffer_tb.sv new file mode 100644 index 0000000000..5581354e0e --- /dev/null +++ b/finn-rtllib/mvu/tb/replay_buffer_tb.sv @@ -0,0 +1,130 @@ +/****************************************************************************** + * Copyright (C) 2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for replay_buffer module. + * @author Thomas B. 
Preußer + *****************************************************************************/ + +module replay_buffer_tb; + + // Global Control + logic clk = 0; + always #5ns clk = !clk; + uwire rst = 0; + + // DUT Geometries + localparam int unsigned DIMS[3] = '{ 7, 8, 10 }; + localparam int unsigned W = 8; + typedef logic [W-1:0] data_t; + + bit [2**$size(DIMS)-1:0] done = 0; + always_comb begin + if(&done) begin + $display("Test completed."); + $finish; + end + end + + // Parallel DUT Instantiations + for(genvar r = 0; r < $size(DIMS); r++) begin + for(genvar l = 0; l < $size(DIMS); l++) begin + localparam int unsigned REP = DIMS[r]; + localparam int unsigned LEN = DIMS[l]; + + data_t idat; + logic ivld; + uwire irdy; + + uwire data_t odat; + uwire olast; + uwire ofin; + uwire ovld; + logic ordy; + + replay_buffer #(.LEN(LEN), .REP(REP), .W(W)) dut ( + .clk, .rst, + .idat, .ivld, .irdy, + .odat, .olast, .ofin, .ovld, .ordy + ); + + // Input Feed: 0, 1, ..., 10*LEN-1 + initial begin + idat = 'x; + ivld = 0; + @(posedge clk iff !rst); + + for(int unsigned i = 0; i < 10*LEN; i++) begin + idat <= i; + ivld <= 1; + @(posedge clk iff irdy); + idat <= 'x; + ivld <= 0; + while($urandom()%(REP-1) != 0) @(posedge clk); + end + end + + // Output Check + initial begin + automatic int unsigned base = 0; + + ordy = 0; + @(posedge clk iff !rst); + + for(int unsigned k = 0; k < 10; k++) begin + for(int unsigned j = 0; j < REP; j++) begin + for(int unsigned i = 0; i < LEN; i++) begin + ordy <= 1; + @(posedge clk iff ovld); + assert(odat == base+i) else begin + $error("#%0d.%0d: Data mismatch: %0d instead of %0d.", r, l, odat, base+i); + $stop; + end + assert(olast == (i == LEN-1)) else begin + $error("#%0d.%0d: Last mismatch.", r, l); + $stop; + end + assert(ofin == ((i == LEN-1) && (j == REP-1))) else begin + $error("#%0d.%0d: Fin mismatch.", r, l); + $stop; + end + + ordy <= 0; + while($urandom()%13 == 0) @(posedge clk); + end + end + base += LEN; + end + + done[$size(DIMS)*r + l] <= 1; + end + end + end + +endmodule : replay_buffer_tb diff --git a/finn-rtllib/mvu/tb/vvu_axi_tb.sv b/finn-rtllib/mvu/tb/vvu_axi_tb.sv new file mode 100644 index 0000000000..fbb45845e1 --- /dev/null +++ b/finn-rtllib/mvu/tb/vvu_axi_tb.sv @@ -0,0 +1,227 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU AXI-lite interface wrapper. + *****************************************************************************/ + +module vvu_axi_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam bit IS_MVU = 0; + localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58"; + localparam int unsigned MW = 25; // Kernel*Kernel + localparam int unsigned MH = 4; // Channels + localparam int unsigned SIMD = 25; // MW%SIMD == 0 + localparam int unsigned PE = 2; // MH%PE == 0 + localparam int unsigned SEGMENTLEN = 3.0; + localparam bit FORCE_BEHAVIORAL = 1; + localparam bit M_REG_LUT = 1; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 4; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = (PE*SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - PE*SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n = 0; + initial begin + repeat(16) @(posedge clk); + ap_rst_n <= 1; + end + + uwire ap_clk = clk; + + // Generate activations + typedef logic [PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[NF*SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } activations; + + initial begin + activations.vld = 0; + activations.dat = 'X; + @(posedge clk iff ap_rst_n); + + for (int i=0; i= 0; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = 'X; + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] 
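+ // For example, with PE=2 and SIMD=3 the flat order is
+ // (S0,P0),(S0,P1),(S1,P0),(S1,P1),(S2,P0),(S2,P1), i.e. element (s,p) sits at
+ // flat index p + s*PE, matching the a[...][k + l*PE] indexing used below.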
+ // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + for (int i = 0; i < NF; i++) begin + for (int j = 0; j < SF; j++) begin + for (int k = 0; k < PE; k++) begin + for (int l = 0; l < SIMD; l++) begin + if (SIGNED_ACTIVATIONS) + res[i][k] = $signed(res[i][k]) + $signed(a[i*SF+j][k + l*PE]) * $signed(w[i][j][k][l]); + else + res[i][k] = $signed(res[i][k]) + $signed({1'b0, a[i*SF+j][k + l*PE]}) * $signed(w[i][j][k][l]); + end + end + end + end + return res; + endfunction : check_output; + + output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS); + + int unsigned NF_CNT = 0; + initial begin + outputs.rdy = 0; + while (NF_CNT < NF) begin + // Loop until both rdy & vld are asserted + do begin + outputs.rdy <= $urandom()%7 >= 0; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_vvu_axi #( + .IS_MVU(IS_MVU), + .COMPUTE_CORE(COMPUTE_CORE), + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), + .M_REG_LUT(M_REG_LUT) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + +endmodule : vvu_axi_tb diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index e4fed05731..85b7d61ce5 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -116,11 +116,13 @@ class VerificationStepType(str, Enum): "step_tidy_up", "step_streamline", "step_convert_to_hls", + "step_specialize_layers", "step_create_dataflow_partition", "step_target_fps_parallelization", "step_apply_folding_config", "step_minimize_bit_width", "step_generate_estimate_reports", + "step_specialize_to_rtl", "step_hls_codegen", "step_hls_ipgen", "step_set_fifo_depths", @@ -138,6 +140,7 @@ class VerificationStepType(str, Enum): "step_tidy_up", "step_streamline", "step_convert_to_hls", + "step_specialize_layers", "step_create_dataflow_partition", "step_target_fps_parallelization", "step_apply_folding_config", diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 54ba7e4ea1..b74dc7adc5 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -53,6 +53,7 @@ 
from shutil import copy import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers import finn.transformation.streamline.absorb as absorb from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -472,6 +473,12 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig return model +def step_specialize_layers(model: ModelWrapper, cfg: DataflowBuildConfig): + """Convert HW custom-ops into custom-ops suitable for FPGA implementation either with HLS or RTL backend.""" + model = model.transform(SpecializeLayers()) + return model + + def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig): """Tighten the weight and accumulator bit widths for each layer.""" if cfg.minimize_bit_width: @@ -569,7 +576,12 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) if cfg.folding_config_file is not None: - model = model.transform(ApplyConfig(cfg.folding_config_file)) + model = model.transform( + ApplyConfig( + cfg.folding_config_file, + node_filter=lambda x: x.op_type == "StreamingFIFO", + ) + ) # extract the final configuration and save it as json hw_attrs = [ @@ -829,6 +841,7 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig): "step_apply_folding_config": step_apply_folding_config, "step_minimize_bit_width": step_minimize_bit_width, "step_generate_estimate_reports": step_generate_estimate_reports, + "step_specialize_layers": step_specialize_layers, "step_hls_codegen": step_hls_codegen, "step_hls_ipgen": step_hls_ipgen, "step_set_fifo_depths": step_set_fifo_depths, diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index d4c9904fe1..1f2c2740bb 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -64,7 +64,6 @@ custom_op["Thresholding"] = Thresholding custom_op["VectorVectorActivation"] = VectorVectorActivation custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition - custom_op["AddStreams"] = AddStreams custom_op["ChannelwiseOp"] = ChannelwiseOp custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py index 5206ee3a06..aa3631a240 100644 --- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -33,6 +33,7 @@ from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from pyverilator.util.axi_utils import toggle_clk, reset_rtlsim # ONNX i/o tensor shape assumptions for MatrixVectorActivation: # input 0 is the input tensor, shape (.., i_size) = (..., MW) @@ -54,6 +55,84 @@ def get_nodeattr_types(self): my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs + def lut_estimation(self): + """Calculates resource estimations for LUTs based on: + - FINN-R: An End-to-End Deep-Learning Framework for Fast + Exploration of Quantized Neural Networks + - M. Blott, T. B. Preusser, N. J. Fraser, G. 
Gambardella, K. O'Brien, + Y. Umuroglu, M. Leeser and K. Vissers + - 12. Sep 2018 + """ + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + MW = self.get_nodeattr("MW") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + # determine tdt with input and weight data types + idt = self.get_input_datatype() + A = idt.bitwidth() + # parameters from experiments in paper mentioned above + c0 = 300 + c1 = 1.1 + c2 = 0 + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "decoupled" and mstyle == "distributed") or ( + mmode == "const" and self.calc_wmem() <= 128 + ): + c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) + + # multiplication + res_type = self.get_nodeattr("resType") + if res_type == "dsp": + mult_luts = 0 + else: + mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) + # adder tree + addertree_luts = (W + A) * (2 * Q - 1) + # accumulator + acc_datatype = self.get_accumulator_datatype() + # if accDataType is not set, then it will default to INT32, which would + # be a large overestimate in most (if not all) cases. In this scenario, + # we would use the minimum accumulator as determined by the data types + # bound, derived in https://arxiv.org/abs/2301.13376 + alpha = math.log(MW, 2) + W + A - 1 - int(idt.signed()) + acc_bits = min( + acc_datatype.bitwidth(), + np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), + ) + acc_luts = acc_bits + # thresholds and threshold comparators + thr_luts = 0 + comp_luts = 0 + noact = self.get_nodeattr("noActivation") + tmem_style = self.get_nodeattr("ram_style_thresholds") + if (noact == 0) and (tmem_style == "distributed"): + odt = self.get_output_datatype() + B = odt.bitwidth() + thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64) + comp_luts = (2**B - 1) * acc_bits + + return int( + c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 + ) + + def dsp_estimation(self): + # multiplication + P = self.get_nodeattr("PE") + res_type = self.get_nodeattr("resType") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + idt = self.get_input_datatype() + A = idt.bitwidth() + if res_type == "dsp": + mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling + else: + mult_dsp = 0 + return int(mult_dsp) + def get_template_param_values(self): """Returns the template parameter values according to input, output and weight data types.""" @@ -416,6 +495,7 @@ def execute_node(self, context, graph): mem_mode = self.get_nodeattr("mem_mode") node = self.onnx_node + # TODO ensure codegen dir exists if mode == "cppsim": code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") elif mode == "rtlsim": @@ -433,6 +513,7 @@ def execute_node(self, context, graph): for inputs in node.input: # it is assumed that the first input of the node is the data input # the second input are the weights + # the third input are the thresholds if in_ind == 0: assert ( str(context[inputs].dtype) == "float32" @@ -440,7 +521,12 @@ def execute_node(self, context, graph): not float32 as expected.""" expected_inp_shape = self.get_folded_input_shape() reshaped_input = context[inputs].reshape(expected_inp_shape) - export_idt = self.get_input_datatype() + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() # make copy before saving the 
array reshaped_input = reshaped_input.copy() np.save( @@ -468,11 +554,15 @@ def execute_node(self, context, graph): sim = self.get_rtlsim() nbits = self.get_instream_width() inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - self.reset_rtlsim(sim) - self.toggle_clk(sim) - if mem_mode in ["external", "decoupled"]: + reset_rtlsim(sim) + toggle_clk(sim) + if mem_mode == "external" or mem_mode == "decoupled": wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() + # we have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) io_dict = { @@ -489,6 +579,7 @@ def execute_node(self, context, graph): out_npy_path = "{}/output.npy".format(code_gen_dir) out_shape = self.get_folded_output_shape() rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + # load and reshape output output = np.load(out_npy_path) oshape = self.get_normal_output_shape() @@ -497,7 +588,19 @@ def execute_node(self, context, graph): else: raise Exception( """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to "rtlsim" """.format( + has to be set to one of the following value ("cppsim", "rtlsim")""".format( mode ) ) + + def instantiate_ip(self, cmd): + # instantiate the HLS IP + vlnv = self.get_nodeattr("ip_vlnv") + node_name = self.onnx_node.name + if self.get_nodeattr("mem_mode") == "decoupled": + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (vlnv, node_name, node_name) + ) + else: + cmd.append("create_bd_cell -type ip -vlnv %s %s" % (vlnv, self.onnx_node.name)) \ No newline at end of file diff --git a/src/finn/custom_op/fpgadataflow/hwcustomop.py b/src/finn/custom_op/fpgadataflow/hwcustomop.py index bf89bcc0b4..f62cf1af8a 100644 --- a/src/finn/custom_op/fpgadataflow/hwcustomop.py +++ b/src/finn/custom_op/fpgadataflow/hwcustomop.py @@ -204,15 +204,11 @@ def get_op_and_param_counts(self): def reset_rtlsim(self, sim): """Sets reset input in pyverilator to zero, toggles the clock and set it back to one""" - sim.io.ap_rst_n = 0 - sim.io.ap_clk = 1 - sim.io.ap_clk = 0 - sim.io.ap_rst_n = 1 + reset_rtlsim(sim) def toggle_clk(self, sim): """Toggles the clock input in pyverilator once.""" - sim.io.ap_clk = 1 - sim.io.ap_clk = 0 + toggle_clk(sim) def rtlsim(self, sim, inp, inp2=None): """Runs the pyverilator simulation by passing the input values to the simulation, diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 463a4effa8..28c0c24c09 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -118,6 +118,8 @@ def get_nodeattr_types(self): # vector through the accelerator. This will get rid of any old # weight data from the weight FIFOs. 
"runtime_writeable_weights": ("i", False, 0, {0, 1}), + # Flag to specify whether RTL-based or HLS-based implementation is preferred + "preferred_backend": ("s", False, "rtl", {"hls", "rtl"}) } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -150,11 +152,13 @@ def execute_node(self, context, graph): odt_is_bipolar = self.get_nodeattr("outputDataType") == "BIPOLAR" out_scale = 2 if odt_is_bipolar else 1 out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal") - # NHWC to NCHW for multithreshold node - result = result.transpose((0, 3, 1, 2)) + if result.ndim == 4: + # NHWC to NCHW for multithreshold node + result = result.transpose((0, 3, 1, 2)) result = multithreshold(result, mvau_thr, out_scale, out_bias) - # NCHW to NHWC - result = result.transpose((0, 2, 3, 1)) + if result.ndim == 4: + # NCHW to NHWC + result = result.transpose((0, 2, 3, 1)) context[node.output[0]] = result @@ -436,84 +440,6 @@ def uram_efficiency_estimation(self): uram_est_capacity = uram_est * 72 * 4096 return wbits / uram_est_capacity - def lut_estimation(self): - """Calculates resource estimations for LUTs based on: - - FINN-R: An End-to-End Deep-Learning Framework for Fast - Exploration of Quantized Neural Networks - - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, - Y. Umuroglu, M. Leeser and K. Vissers - - 12. Sep 2018 - """ - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - MW = self.get_nodeattr("MW") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - # determine tdt with input and weight data types - idt = self.get_input_datatype() - A = idt.bitwidth() - # parameters from experiments in paper mentioned above - c0 = 300 - c1 = 1.1 - c2 = 0 - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if (mmode == "decoupled" and mstyle == "distributed") or ( - mmode == "const" and self.calc_wmem() <= 128 - ): - c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) - - # multiplication - res_type = self.get_nodeattr("resType") - if res_type == "dsp": - mult_luts = 0 - else: - mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) - # adder tree - addertree_luts = (W + A) * (2 * Q - 1) - # accumulator - acc_datatype = self.get_accumulator_datatype() - # if accDataType is not set, then it will default to INT32, which would - # be a large overestimate in most (if not all) cases. 
In this scenario, - # we would use the minimum accumulator as determined by the data types - # bound, derived in https://arxiv.org/abs/2301.13376 - alpha = math.log(MW, 2) + W + A - 1 - int(idt.signed()) - acc_bits = min( - acc_datatype.bitwidth(), - np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), - ) - acc_luts = acc_bits - # thresholds and threshold comparators - thr_luts = 0 - comp_luts = 0 - noact = self.get_nodeattr("noActivation") - tmem_style = self.get_nodeattr("ram_style_thresholds") - if (noact == 0) and (tmem_style == "distributed"): - odt = self.get_output_datatype() - B = odt.bitwidth() - thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64) - comp_luts = (2**B - 1) * acc_bits - - return int( - c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 - ) - - def dsp_estimation(self): - # multiplication - P = self.get_nodeattr("PE") - res_type = self.get_nodeattr("resType") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - idt = self.get_input_datatype() - A = idt.bitwidth() - if res_type == "dsp": - mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling - else: - mult_dsp = 0 - return int(mult_dsp) - def get_exp_cycles(self): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") @@ -953,12 +879,9 @@ def code_generation_ipi(self): "create_bd_intf_pin -mode Slave " "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) ) - # instantiate the hls ip - cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (self.get_nodeattr("ip_vlnv"), node_name, node_name) - ) - + # Instantiate either the HLS or RTL IP depending on operator + self.instantiate_ip(cmd) + # instantiate a streamer and connect it to the HLS IP strm_vlnv = "amd.com:finn:memstream:1.0" strm_inst = node_name + "_wstrm" @@ -1029,7 +952,7 @@ def code_generation_ipi(self): cmd.append("save_bd_design") elif mem_mode == "const" or mem_mode == "external": # base class impl sufficient for const/external modes - return super().code_generation_ipi() + self.instantiate_ip(cmd) else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") return cmd diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py new file mode 100644 index 0000000000..fcab06658c --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -0,0 +1,1086 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import math +import numpy as np +import os +import textwrap +import warnings +from qonnx.core.datatype import DataType +from qonnx.util.basic import ( + calculate_matvec_accumulator_range, + interleave_matrix_outer_dim_from_partitions, + roundup_to_integer_multiple, +) + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.data_packing import ( + npy_to_rtlsim_input, + pack_innermost_dim_as_hex_string, + rtlsim_output_to_npy, +) + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +# ONNX i/o tensor shape assumptions for MatrixVectorActivation: +# input 0 is the input tensor, shape (.., i_size) = (..., MW) +# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH) +# (optional) input 2 is the thresholds tensor, shape (o_size, n_thres) +# output 0 is the output tensor, shape (.., o_size) = (..., MH) +# the ... here can be any shape (representing groups of vectors) + + +class MatrixVectorActivation_rtl(HLSCustomOp): + """Class that corresponds to finn-rtl Matrix Vector Unit.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + "PE": ("i", True, 0), + "SIMD": ("i", True, 0), + "MW": ("i", True, 0), + "MH": ("i", True, 0), + "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}), + # FINN DataTypes for inputs, weights, outputs + "inputDataType": ("s", True, ""), + "weightDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + # FINN DataType for accumulator -- auto-computed and updated + "accDataType": ("s", False, "INT32"), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + # memory mode for the FC weights + # const -- embedded weights, default, long compile/synth times + # decoupled -- streaming weights with weight streamer packaged inside IP + # external -- streaming weights with external streamer + "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}), + # FPGA resource type for memories in decoupled mode + # auto -- let Vivado decide + # block -- use BRAM + # distributed -- use LUTRAM + # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1 + # see also https://www.xilinx.com/support/answers/38070.html + "ram_style": ( + "s", + False, + "auto", + {"auto", "block", "distributed", "ultra"}, + ), + # (mem_mode = decoupled only) whether weights will be writable through + # an AXI-lite interface during runtime + # 1 for enabled, 0 for disabled. 
+ # see finn-rtllib/memstream/doc/README for more about the memory + # address map used for writable weights + # IMPORTANT: After using AXI lite to either read or write the weights, + # always "flush" the accelerator by first passing a dummy input + # vector through the accelerator. This will get rid of any old + # weight data from the weight FIFOs. + "runtime_writeable_weights": ("i", False, 0, {0, 1}), + # attribute to save top module name - not user configurable + "gen_top_module": ("s", False, ""), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def calc_wmem(self): + """Calculates and returns WMEM.""" + mw = self.get_nodeattr("MW") + mh = self.get_nodeattr("MH") + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + assert mh % pe == 0, "Requirement MH divisable by PE is violated." + assert mw % simd == 0, "Requirement MW divisable by SIMD is violated." + wmem = mw * mh // (pe * simd) + return wmem + + def calc_tmem(self): + """Calculates and returns TMEM.""" + return 0 + + def make_shape_compatible_op(self, model): + oshape = self.get_normal_output_shape() + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + # set output datatype from property + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + # TODO collect automatically from get_nodeattr_types + try: + self.get_nodeattr("executable_path") + self.get_nodeattr("resType") + self.get_nodeattr("MW") + self.get_nodeattr("MH") + self.get_nodeattr("SIMD") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType") + self.get_nodeattr("weightDataType") + self.get_nodeattr("outputDataType") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append( + """The required MatrixVectorActivation attributes do not exist.""" + ) + + num_of_inputs = len(self.onnx_node.input) + if num_of_inputs != 2: + info_messages.append( + "RTL-based MatrixVectorActivation expects two inputs " + "(weights and activation), but got {} inputs.".format( + len(self.onnx_node.input) + ) + ) + + mem_mode = self.get_nodeattr("mem_mode") + + if mem_mode not in ["decoupled", "external"]: + info_messages.append( + "RTL-based MVU only supports decoupled or external weights." 
+ ) + + if self.get_nodeattr("resType") == "lut": + info_messages.append( + "RTL-based MVU only supports DSP-based implementation" + ) + + return info_messages + + def uram_estimation(self): + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") + omega = (D_in * D_out) / (Q * P) + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle != "ultra") + or (mmode == "external") + ): + return 0 + width_multiplier = math.ceil(mem_width / 72) + depth_multiplier = math.ceil(omega / 4096) + return width_multiplier * depth_multiplier + + def bram_estimation(self): + """Calculates resource estimation for BRAM based on: + - FINN-R: An End-to-End Deep-Learning Framework for Fast + Exploration of Quantized Neural Networks + - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, + Y. Umuroglu, M. Leeser and K. Vissers + - 12. Sep 2018 + """ + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") + omega = (D_in * D_out) / (Q * P) + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) + or (mmode == "external") + ): + return 0 + # assuming SDP mode RAMB18s (see UG573 Table 1-10) + # assuming decoupled (RTL) memory + if mem_width == 1: + return math.ceil(omega / 16384) + elif mem_width == 2: + return math.ceil(omega / 8192) + elif mem_width <= 4: + return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) + elif mem_width <= 9: + return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 9)) + elif mem_width <= 18 or omega > 512: + return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 18)) + else: + return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36)) + + def bram_efficiency_estimation(self): + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") + bram16_est = self.bram_estimation() + if bram16_est == 0: + return 1 + wbits = W * D_in * D_out + bram16_est_capacity = bram16_est * 36 * 512 + return wbits / bram16_est_capacity + + def uram_efficiency_estimation(self): + """Function for URAM efficiency estimation: actual parameter storage + needed divided by the allocated URAM storage (from estimation)""" + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") + uram_est = self.uram_estimation() + if uram_est == 0: + return 1 + wbits = W * D_in * D_out + uram_est_capacity = uram_est * 72 * 4096 + return wbits / uram_est_capacity + +# TODO: fix lut estimations + def lut_estimation(self): + """Calculates resource estimations for LUTs based on: + - FINN-R: An End-to-End Deep-Learning Framework for Fast + Exploration of Quantized Neural Networks + - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, + Y. Umuroglu, M. Leeser and K. Vissers + - 12. 
Sep 2018 + """ + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + MW = self.get_nodeattr("MW") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + # determine tdt with input and weight data types + idt = self.get_input_datatype() + A = idt.bitwidth() + # parameters from experiments in paper mentioned above + c0 = 300 + c1 = 1.1 + c2 = 0 + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "decoupled" and mstyle == "distributed") or ( + mmode == "const" and self.calc_wmem() <= 128 + ): + c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) + + # multiplication + res_type = self.get_nodeattr("resType") + if res_type == "dsp": + mult_luts = 0 + else: + mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) + # adder tree + addertree_luts = (W + A) * (2 * Q - 1) + # accumulator + acc_bits = W + A + np.ceil(math.log(MW, 2)) + acc_luts = acc_bits + + return int(c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2) + +# TODO: fix DSP estimations --> depends on fpga_part + def dsp_estimation(self): + # multiplication + # mvu_8sx9 (DSP58): ceil(SIMD/3) + # mvu_4sx4u (DSP48/DSP58): ceil(PE/4) + # mvu_8sx8u (DSP48): ceil(PE/2) + # mvu_lut: 0 + P = self.get_nodeattr("PE") + res_type = self.get_nodeattr("resType") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + idt = self.get_input_datatype() + A = idt.bitwidth() + if res_type == "dsp": + mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling + else: + mult_dsp = 0 + return int(mult_dsp) + +# TODO: fix exp_cycles estimations --> depends on fpga_part and clk + def get_exp_cycles(self): + # mvu_8sx9 (DSP58): + # 2 (replay_buffer) + ceil(chainlen/seglen) + 2 (MREG, PREG) + 2 (output reg slice) + # + MW/SIMD * MH/PE + # mvu_4sx4u (DSP48/DSP58) / mvu_8sx8u (DSP48): + # 3 (IN_REG, MREG, PREG) + 2 (replay_buffer) + 2 (output reg slice) + 1 (adder tree SIMD) + 1 (output lane) + # + MW/SIMD * MH/PE + # mvu_lut: + # 2 (replay_buffer) + 1 OR 2 (no MREG OR MREG) + 2 (output reg slice) + # + MW/SIMD * MH/PE + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + num_inp_vec = self.get_nodeattr("numInputVectors") + mh = self.get_nodeattr("MH") + mw = self.get_nodeattr("MW") + # since mmv != 1 is not supported yet, we set mmv for now to 1 + mmv = 1 + exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv + return int(exp_cycles) + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + # when performing FIFO insertion on an FC layer with ext weights, the ind + # parameter can be > 0 (referring to the weights) so handle that here + if ind == 0: + return DataType[self.get_nodeattr("inputDataType")] + elif ind == 1: + return DataType[self.get_nodeattr("weightDataType")] + else: + raise Exception("Undefined input ind for this layer type") + + def get_weight_datatype(self): + """Returns FINN DataType of weights.""" + return DataType[self.get_nodeattr("weightDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_instream_width(self, ind=0): + i_bits = self.get_input_datatype().bitwidth() + assert ( + i_bits <= 9 + ), "RTL-based MVAU only supports activations with bit-width up to 9-bits" + in_width = i_bits * self.get_nodeattr("SIMD") + return in_width + + def get_outstream_width(self, ind=0): + o_bits = self.get_output_datatype().bitwidth() + 
out_width = o_bits * self.get_nodeattr("PE") + return out_width + + def get_weightstream_width(self): + """Returns weight stream width. Used in decoupled or external mode.""" + if ( + self.get_nodeattr("mem_mode") == "decoupled" + or self.get_nodeattr("mem_mode") == "external" + ): + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + wp = self.get_weight_datatype().bitwidth() + assert ( + wp <= 8 + ), "RTL-based MVAU only supports weights with bit-width up to 8-bits" + w_width = pe * simd * wp + return w_width + else: + return 0 + + def get_weightstream_width_padded(self): + """Returns weight stream width padded to a multiple of 8. This is required + by the AXI Stream spec. Used in decoupled mode.""" + weight_width = self.get_weightstream_width() + return roundup_to_integer_multiple(weight_width, 8) + + def get_ap_int_max_w(self): + # base class impl (max of inp/out stream widths) + max_of_io = super().get_ap_int_max_w() + # decoupled mode weight stream + weightstream = self.get_weightstream_width() + # single PE weight entry + weight_bits = self.get_weight_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + single_pe_w = simd * weight_bits + return max([weightstream, max_of_io, single_pe_w]) + + def get_folded_input_shape(self, ind=0): + mw = self.get_nodeattr("MW") + mh = self.get_nodeattr("MH") + simd = self.get_nodeattr("SIMD") + pe = self.get_nodeattr("PE") + sf = mw // simd + nf = mh // pe + vecs = list(self.get_nodeattr("numInputVectors")) + + if ind == 0: + # calculate shape of input 0 + folded_input_shape = tuple(vecs + [sf, simd]) + elif ind == 1 and self.get_nodeattr("mem_mode") == "external": + # calculate shape of input 1 (weights) + folded_input_shape = tuple(vecs + [sf * nf, simd * pe]) + else: + raise Exception("Undefined input shape for requested input") + + return folded_input_shape + + def get_folded_output_shape(self, ind=0): + mh = self.get_nodeattr("MH") + pe = self.get_nodeattr("PE") + nf = mh // pe + vecs = list(self.get_nodeattr("numInputVectors")) + folded_output_shape = tuple(vecs + [nf, pe]) + return folded_output_shape + + def get_normal_input_shape(self, ind=0): + mw = self.get_nodeattr("MW") + vecs = list(self.get_nodeattr("numInputVectors")) + normal_input_shape = tuple(vecs + [mw]) + return normal_input_shape + + def get_normal_output_shape(self, ind=0): + mh = self.get_nodeattr("MH") + vecs = list(self.get_nodeattr("numInputVectors")) + normal_output_shape = tuple(vecs + [mh]) + return normal_output_shape + + def get_number_output_values(self): + nf = np.prod(self.get_folded_output_shape()[:-1]) + return nf + + def get_hls_compatible_weight_tensor(self, orig_weight_matrix): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure MH % PE == 0 and MW % SIMD == 0 + * for bipolar {-1,+1} weights, convert to binary {0, 1} + * interleave rows between PEs + * reshape into (1, PE, WMEM, SIMD) and return + """ + mw = self.get_nodeattr("MW") + mh = self.get_nodeattr("MH") + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + wmem = self.calc_wmem() + assert orig_weight_matrix.shape == ( + mw, + mh, + ), """Weights matrix doesn't + have expected shape (mw, mh)""" + assert mw % simd == 0, "Requirement MW divisible by SIMD is violated." + assert mh % pe == 0, "Requirement MH divisible by PE is violated." 
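+ # illustration of the row interleaving performed below (hypothetical sizes): + # with MH = 4 and PE = 2, output rows of the transposed matrix are assigned + # round-robin, i.e. PE 0 gets rows {0, 2} and PE 1 gets rows {1, 3}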
+ # start by transposing the original weight matrix, since ONNX and + # finn-hlslib use different assumptions + # ONNX uses (in_features, out_features) and matmul(x, W) + # finn-hlslib uses (out_features, in_features) and matmul(W, x) + ret = orig_weight_matrix.T + # interleave rows between PEs and reshape + # distribute rows between PEs + ret = interleave_matrix_outer_dim_from_partitions(ret, pe) + # create SIMD as innermost dimension and add a dummy outer dim + ret = ret.reshape(1, pe, wmem, simd) + # reverse the SIMD dimension + ret = np.flip(ret, axis=-1) + return ret + + def minimize_accumulator_width(self, model): + weights = model.get_initializer(self.onnx_node.input[1]) + idt = self.get_input_datatype() + # calculate minimum and maximum values of accumulator + (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt) + if acc_min < 0: + if abs(acc_min) > acc_max: + adt = DataType.get_smallest_possible(acc_min) + else: + adt = DataType.get_smallest_possible(-acc_max - 1) + else: + adt = DataType.get_smallest_possible(acc_max) + # Note: we are interested in simply the width of the output dot product. + # Padding the actual output stream to a multiple of 8-bits is done in + # the RTL component + self.set_nodeattr("accDataType", adt.name) + # for no-activation nodes, output dt = acc dt + self.set_nodeattr("outputDataType", adt.name) + return DataType[self.get_nodeattr("accDataType")] + + def make_weight_file(self, weights, weight_file_mode, weight_file_name): + """Produce a file containing given weights in appropriate format for this + layer. This file can be used for either synthesis or run-time reconfig + of weights. + + Arguments: + * weights : numpy array with weights to be put into the file + * weight_file_mode : one of {hls_header, decoupled_verilog_dat, + decoupled_runtime} + * weight_file_name : filename for the weight file to be generated + """ + # convert weights into hlslib-compatible format + weight_tensor = self.get_hls_compatible_weight_tensor(weights) + export_wdt = self.get_weight_datatype() + if "decoupled" in weight_file_mode: + # create a weight stream for various flavors of decoupled mode: + # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD) + weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3)) + # reverse SIMD flip for saving weights in .npy + weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1) + # PE flip for saving weights in .dat + weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2) + # reshape weight tensor (simd_flipped and pe_flipped) to desired shape + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + # simd_flipped + weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape( + 1, -1, pe * simd + ) + weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy() + # flipped + weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape( + 1, -1, pe * simd + ) + weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy() + if weight_file_mode == "decoupled_npy": + # save weight stream into npy for cppsim + np.save(weight_file_name, weight_tensor_simd_flipped) + elif weight_file_mode == "decoupled_verilog_dat": + # convert weight values into hexstring + weight_width = self.get_weightstream_width() + # pad to nearest 4 bits to get hex strings + weight_width_padded = roundup_to_integer_multiple(weight_width, 4) + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" + ) + # add 
zeroes to pad out file to 1024 entries + weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_stream.copy() + with open(weight_file_name, "w") as f: + for val in weight_stream: + f.write(val + "\n") + elif weight_file_mode == "decoupled_runtime": + # memstream axi-lite interface will map each mem line to + # one or multiple 32-bit words + weight_width = self.get_weightstream_width() + words_per_memwidth = 2 ** math.ceil(math.log2(weight_width / 32)) + if words_per_memwidth < 1: + words_per_memwidth = 1 + weight_width_padded = words_per_memwidth * 32 + # first, pack and ensure padding to 32 bits + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" + ) + weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_stream.copy() + with open(weight_file_name, "w") as f: + for val in weight_stream: + # split into groups of 8 hex digits (= 32 bits) + words_32b = textwrap.wrap(val, 8) + words_32b.reverse() + for word_32b in words_32b: + f.write(word_32b + "\n") + else: + raise Exception("Unknown/unsupported weight_file_mode") + + else: + raise Exception("Unknown/unsupported weight_file_mode") + + def generate_params(self, model, path): + mem_mode = self.get_nodeattr("mem_mode") + code_gen_dir = path + # weights, if not external + weights = model.get_initializer(self.onnx_node.input[1]) + if mem_mode == "decoupled" or mem_mode == "external": + weight_filename_sim = "{}/weights.npy".format(code_gen_dir) + # save decoupled weights for cppsim + self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) + if mem_mode == "decoupled": + # also save weights as Verilog .dat file + # This file will be ignored when synthesizing UltraScale memory. + weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) + self.make_weight_file( + weights, "decoupled_verilog_dat", weight_filename_rtl + ) + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") + node = self.onnx_node + + if mode == "cppsim": + raise Exception( + "cppsim not possible for RTL MVAU, please set exec_mode to rtlsim" + ) + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following values ("cppsim", "rtlsim")""".format( + mode + ) + ) + + # create a npy file for each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + export_idt = self.get_input_datatype() + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for MatrixVectorActivation_rtl") + in_ind += 1 + + if mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + if mem_mode in ["external", "decoupled"]: + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + wei = npy_to_rtlsim_input( + "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits + ) + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to "rtlsim" """.format( + mode + ) + ) + + def code_generation_ipgen(self, model, fpgapart, clk): + """Normally: Generates C++ code and tcl script for IP generation. 
+ Here: Generates (System-)Verilog code for IP generation.""" + self.generate_hdl(model, fpgapart, clk) + + def ipgen_singlenode_code(self): + """Normally: Builds the bash script for IP generation.""" + pass + + def code_generation_cppsim(self, model): + """Normally: Generates C++ code for simulation (cppsim).""" + pass + + def compile_singlenode_code(self): + pass + + def global_includes(self): + pass + + def defines(self, var): + pass + + def read_npy_data(self): + pass + + def strm_decl(self): + pass + + def docompute(self): + pass + + def dataoutstrm(self): + pass + + def save_as_npy(self): + pass + + def blackboxfunction(self): + pass + + def pragmas(self): + pass + + def code_generation_ipi(self): + cmd = [] + # add streamer if needed + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled": + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if self.get_nodeattr("ram_style") == "ultra": + assert ( + runtime_writable == 1 + ), "Layer with URAM weights must have runtime_writeable_weights=1" + node_name = self.onnx_node.name + sname = self.hls_sname() + # create a hierarchy for this layer, with the same port names + clk_name = self.get_verilog_top_module_intf_names()["clk"][0] + rst_name = self.get_verilog_top_module_intf_names()["rst"][0] + dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] + din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] + cmd.append("create_bd_cell -type hier %s" % node_name) + cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) + cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) + cmd.append( + "create_bd_intf_pin -mode Master " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" + % (node_name, dout_name) + ) + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) + ) + # instantiate the RTL block + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + rtllib_dir + "mvu_vvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append( + "create_bd_cell -type hier -reference %s /%s/%s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + self.onnx_node.name, + ) + ) + + # instantiate a streamer and connect it to the HLS IP + strm_vlnv = "amd.com:finn:memstream:1.0" + strm_inst = node_name + "_wstrm" + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (strm_vlnv, node_name, strm_inst) + ) + cmd.append( + "set_property -dict [list " + "CONFIG.DEPTH {%d} " + "CONFIG.WIDTH {%d} " + "CONFIG.INIT_FILE {%s} " + "CONFIG.RAM_STYLE {%s} " + "] [get_bd_cells /%s/%s]" + % ( + self.calc_wmem(), + self.get_weightstream_width_padded(), + self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat", + self.get_nodeattr("ram_style"), + node_name, + strm_inst, + ) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] " + "[get_bd_intf_pins %s/%s/weights_%s]" + % (node_name, strm_inst, node_name, node_name, sname) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]" + % (node_name, rst_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins 
%s/%s] [get_bd_pins %s/%s/ap_clk]" + % (node_name, clk_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, rst_name, node_name, node_name, rst_name) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, clk_name, node_name, node_name, clk_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, din_name, node_name, node_name, din_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, dout_name, node_name, node_name, dout_name) + ) + if runtime_writable: + # expose axi lite interface for writeable weights + axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0] + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" + % (node_name, axilite_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, axilite_name, node_name, strm_inst, axilite_name) + ) + # TODO calculate and pass in segment size here + cmd.append("assign_bd_address") + cmd.append("save_bd_design") + elif mem_mode == "external": + # instantiate the RTL block + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + rtllib_dir + "mvu_vvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append( + "create_bd_cell -type module -reference %s %s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + ) + ) + else: + raise Exception("Unrecognized mem_mode for MatrixVectorActivation") + return cmd + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append( + ("weights_" + sname, self.get_weightstream_width_padded()) + ) + if mem_mode == "decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names + + def get_op_and_param_counts(self): + in_features = self.get_nodeattr("MW") + out_features = self.get_nodeattr("MH") + weight_bits = self.get_weight_datatype().bitwidth() + inp_bits = self.get_input_datatype().bitwidth() + num_inp_vec = self.get_nodeattr("numInputVectors") + num_repetitions = int(np.prod(num_inp_vec)) + mac_count = in_features * out_features * num_repetitions + # cannonicalize op type: highest bitwidth operand first s.t. + # e.g. 
mac_8bx4b and mac_4bx8b don't appear as two different op types + bw1 = min(inp_bits, weight_bits) + bw2 = max(inp_bits, weight_bits) + mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) + weight_param_type = "param_weight_%db" % (weight_bits) + weight_count = in_features * out_features + ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} + return ret_dict + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [ + 0 for i in range(num_w_reps * n_weight_inps) + ] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + + def _resolve_segment_len(self, clk): + # Insert pipeline registers in the DSP58 chain to meet target clock frequency + # ~0.741 ns seems the worst-case delay through first DSP + # ~0.605 ns seems to be (on average) delay for all subsequent DSPs + # clk >= (critical_path_dsps - 1) * 0.605 + 0.741 + assert (clk > 0.741), "Infeasible clk target of {} ns has been set, consider lowering the targeted clock frequency!".format(clk) + critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1) + max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3) + dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len + return dsp_chain_len + + def _resolve_impl_style(self, fpgapart): + # Based on target device and activation/weight-width, choose the + # supported RTL compute core + + assert self.get_nodeattr("resType") != "lut", "LUT-based RTL-MVU implementation currently not supported! Please change resType for {}".format(self.onnx_node.name) + + act_width = self.get_input_datatype(0).bitwidth() + weight_width = self.get_input_datatype(1).bitwidth() + is_versal = ( + fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] + or fpgapart[0:5] == "xqrvc" + ) + + if is_versal: + return "mvu_vvu_8sx9_dsp58" + else: + if act_width == 4 and weight_width == 4: + return "mvu_4sx4u" + else: + return "mvu_8sx8u_dsp48" + + def generate_hdl(self, model, fpgapart, clk): + # Generate params as part of IP preparation + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + self.generate_params(model, code_gen_dir) + + template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk) + # add general parameters to dictionary + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [ + self.get_verilog_top_module_name() + ] + # save top module name so we can refer to it after this node has been renamed + # (e.g. 
by GiveUniqueNodeNames(prefix) during MakeZynqProject) + self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) + + # apply code generation to template + with open(template_path, "r") as f: + template_wrapper = f.read() + for key in code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(code_gen_dict[key]) + template_wrapper = template_wrapper.replace(key, code_gen_line) + with open( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + "w", + ) as f: + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(0))) + with open( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v" + ), + "w", + ) as f: + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(1))) + + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def prepare_codegen_default(self, fpgapart, clk): + template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v" + + code_gen_dict = {} + code_gen_dict["$IS_MVU$"] = [str(1)] + code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)] + code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))] + code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))] + code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] + code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))] + code_gen_dict["$ACTIVATION_WIDTH$"] = [ + str(self.get_input_datatype(0).bitwidth()) + ] + code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())] + code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())] + code_gen_dict["$SIGNED_ACTIVATIONS$"] = ( + [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] + ) + code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] + + return template_path, code_gen_dict + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # Path to (System-)Verilog files used by top-module & path to top-module + verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"] + verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"] + + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + + return sim diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py index 914c033584..28e08aa445 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/__init__.py +++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py @@ -34,6 +34,8 @@ StreamingDataWidthConverter_rtl, ) from finn.custom_op.fpgadataflow.rtl.streamingfifo_rtl import StreamingFIFO_rtl +from finn.custom_op.fpgadataflow.rtl.matrixvectoractivation_rtl import MatrixVectorActivation_rtl +from finn.custom_op.fpgadataflow.rtl.vectorvectoractivation_rtl import 
VectorVectorActivation_rtl custom_op = dict() @@ -43,3 +45,5 @@ custom_op["FMPadding_rtl"] = FMPadding_rtl custom_op["StreamingDataWidthConverter_rtl"] = StreamingDataWidthConverter_rtl custom_op["StreamingFIFO_rtl"] = StreamingFIFO_rtl +custom_op["MatrixVectorActivation_rtl"] = MatrixVectorActivation_rtl +custom_op["VectorVectorActivation_rtl"] = VectorVectorActivation_rtl diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 1c316e1285..f797e3d841 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -48,13 +48,12 @@ def is_external_input(model, node, i): # True only if input is unconnected and has no initializer # Only esception is second input of FC layers when mem_mode is external node_inst = getCustomOp(node) - op_type = node.op_type producer = model.find_producer(node.input[i]) if producer is None: if model.get_initializer(node.input[i]) is None: return True else: - if op_type.startswith("MatrixVectorActivation"): + if node.op_type == "MatrixVectorActivation": if node_inst.get_nodeattr("mem_mode") == "external": return True return False @@ -103,6 +102,7 @@ def __init__(self, fpgapart, clk_ns, ip_name="finn_design", vitis=False, signatu # keep track of top-level interface names self.intf_names = { "clk": [], + "clk2x": [], "rst": [], "s_axis": [], "m_axis": [], @@ -110,10 +110,19 @@ def __init__(self, fpgapart, clk_ns, ip_name="finn_design", vitis=False, signatu "axilite": [], } + def _is_double_pumped(self, node): + try: + pumped_compute = getCustomOp(node).get_nodeattr("pumpedCompute") + return pumped_compute==1 + except: + return False + def connect_clk_rst(self, node): inst_name = node.name node_inst = getCustomOp(node) clock_intf_name = node_inst.get_verilog_top_module_intf_names()["clk"][0] + if self._is_double_pumped(node): + clock2x_intf_name = node_inst.get_verilog_top_module_intf_names()["clk2x"][0] reset_intf_name = node_inst.get_verilog_top_module_intf_names()["rst"][0] # make clock and reset external, if they aren't already if not self.clock_reset_are_external: @@ -128,6 +137,22 @@ def connect_clk_rst(self, node): self.clock_reset_are_external = True self.intf_names["clk"] = ["ap_clk"] self.intf_names["rst"] = ["ap_rst_n"] + # make clk2x external, if it isn't already and connect clk and reset + elif self._is_double_pumped(node) and not self.clock2x_is_external: + self.connect_cmds.append( + "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, clock2x_intf_name) + ) + self.connect_cmds.append("set_property name ap_clk2x [get_bd_ports ap_clk2x_0]") + self.clock2x_is_external = True + self.intf_names["clk2x"] = ["ap_clk2x"] + self.connect_cmds.append( + "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/%s]" + % (inst_name, reset_intf_name) + ) + self.connect_cmds.append( + "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/%s]" + % (inst_name, clock_intf_name) + ) # otherwise connect clock and reset else: self.connect_cmds.append( @@ -138,6 +163,11 @@ def connect_clk_rst(self, node): "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/%s]" % (inst_name, clock_intf_name) ) + if self._is_double_pumped(node): + self.connect_cmds.append( + "connect_bd_net [get_bd_ports ap_clk2x] [get_bd_pins %s/%s]" + % (inst_name, clock2x_intf_name) + ) def connect_axi(self, node): inst_name = node.name @@ -285,7 +315,7 @@ def apply(self, model): ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/memstream") if 
self.signature: ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/axi_info") - if model.graph.node[0].op_type not in ["StreamingFIFO", "IODMA_hls"]: + if model.graph.node[0].op_type not in ["StreamingFIFO", "IODMA"]: warnings.warn( """First node is not StreamingFIFO or IODMA. You may experience incorrect stitched-IP rtlsim or hardware @@ -377,6 +407,7 @@ def apply(self, model): fclk_hz = fclk_mhz * 1000000 model.set_metadata_prop("clk_ns", str(self.clk_ns)) tcl.append("set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk]" % round(fclk_hz)) + tcl.append("set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk2x]" % round(2*fclk_hz)) tcl.append("validate_bd_design") tcl.append("save_bd_design") # create wrapper hdl (for rtlsim later on) diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 62457f164a..871919f3f2 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -31,6 +31,7 @@ from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.core.datatype import DataType from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles @@ -80,11 +81,12 @@ class SetFolding(Transformation): unfolded before SIMD is increased """ - def __init__(self, target_cycles_per_frame=1000, mvau_wwidth_max=36, two_pass_relaxation=True): + def __init__(self, target_cycles_per_frame=1000, mvau_wwidth_max=36, two_pass_relaxation=True, fpga_part=None): super().__init__() self.target_cycles_per_frame = target_cycles_per_frame self.mvau_wwidth_max = mvau_wwidth_max self.two_pass_relaxation = two_pass_relaxation + self.fpga_part = fpga_part def optimize_attribute_val(self, node_inst, max_val, attr_name): node_inst.set_nodeattr(attr_name, 1) @@ -95,6 +97,10 @@ def optimize_attribute_val(self, node_inst, max_val, attr_name): # finish if target met break + def _is_versal(self, fpga_part): + assert fpga_part is not None, "Please specify the target FPGA part so that an efficient folding configuration can be chosen for RTL-based MVU/VVU" + return fpga_part[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpga_part[0:5] == "xqrvc" + def apply(self, model): graph = model.graph # these ops use PE parallelism, up to a max value of NumChannels @@ -112,20 +118,20 @@ def apply(self, model): simd_ops = [ "DownSampler", "FMPadding_Batch", - "FMPadding_Pixel", + "FMPadding_Batch_rtl", "ConvolutionInputGenerator", "ConvolutionInputGenerator1D", "ConvolutionInputGenerator_rtl", ] # these ops are preceded by depthwise SWG and have special behavior, # as explained in the SetFolding docstring - depthwise_op_exceptions = ["VectorVectorActivation", "Pool_Batch"] + depthwise_op_exceptions = ["VectorVectorActivation", "VectorVectorActivation_rtl", "Pool_Batch"] for node in graph.node: if not is_fpgadataflow_node(node): continue op_type = node.op_type node_inst = getCustomOp(node) - if op_type.startswith("MatrixVectorActivation"): + if op_type == "MatrixVectorActivation": max_simd = node_inst.get_nodeattr("MW") max_pe = node_inst.get_nodeattr("MH") node_inst.set_nodeattr("PE", 1) @@ -149,6 +155,37 @@ def apply(self, model): break # increase PE until target met or reached max_pe self.optimize_attribute_val(node_inst, max_pe, "PE") + if op_type == 
"MatrixVectorActivation_rtl": + max_simd = node_inst.get_nodeattr("MW") + max_pe = node_inst.get_nodeattr("MH") + node_inst.set_nodeattr("PE", 1) + node_inst.set_nodeattr("SIMD", 1) + # Depending on the board and the layer's config, either the + # SIMD or PE folding dimension would be preferred to enable efficient DSP-packing + act_width = DataType[node_inst.get_nodeattr("inputDataType")].bitwidth() + weight_width = DataType[node_inst.get_nodeattr("weightDataType")].bitwidth() + is_versal = self._is_versal(self.fpga_part) + is_dsp48 = act_width < 5 and weight_width < 5 or not(is_versal) + preferred_folding_dimension = "PE" if is_dsp48 else "SIMD" + preferred_folding_max = max_pe if is_dsp48 else max_simd + second_folding_dimension = "SIMD" if is_dsp48 else "PE" + second_folding_max = max_simd if is_dsp48 else max_pe + for fold_val in divisors(preferred_folding_max): + prev_fold_val = node_inst.get_nodeattr(preferred_folding_dimension) + node_inst.set_nodeattr(preferred_folding_dimension, fold_val) + cyc = node_inst.get_exp_cycles() + if cyc < self.target_cycles_per_frame: + # finish if target met + break + if ( + node_inst.get_weight_datatype().bitwidth() * node_inst.get_nodeattr(preferred_folding_dimension) + > self.mvau_wwidth_max + ): + # revert if we've gone above width threshold + node_inst.set_nodeattr(preferred_folding_dimension, prev_fold_val) + break + # increase SIMD until target met or reached max_simd + self.optimize_attribute_val(node_inst, second_folding_max, second_folding_dimension) elif op_type in pe_ops: max_pe = node_inst.get_nodeattr("NumChannels") self.optimize_attribute_val(node_inst, max_pe, "PE") @@ -157,37 +194,44 @@ def apply(self, model): self.optimize_attribute_val(node_inst, max_pe, "PE") elif op_type in depthwise_op_exceptions: # init/reset SIMD of VVAU - if op_type == "VectorVectorActivation": - node_inst.set_nodeattr("SIMD", 1) + is_hls_vvu_or_pool = op_type in ["VectorVectorActivation", "Pool_Batch"] max_pe = node_inst.get_nodeattr("Channels") - self.optimize_attribute_val(node_inst, max_pe, "PE") - # increase SIMD for VVAU once PE is exhausted - pe = node_inst.get_nodeattr("PE") + max_simd = np.prod(node_inst.get_nodeattr("Kernel")) if op_type.startswith("VectorVectorActivation") else 0 + preferred_folding_dimension = "PE" if is_hls_vvu_or_pool else "SIMD" + preferred_folding_max = max_pe if is_hls_vvu_or_pool else max_simd + second_folding_dimension = "SIMD" if is_hls_vvu_or_pool else "PE" + second_folding_max = max_simd if is_hls_vvu_or_pool else max_pe + if op_type.startswith("VectorVectorActivation"): + node_inst.set_nodeattr(second_folding_dimension, 1) + self.optimize_attribute_val(node_inst, preferred_folding_max, preferred_folding_dimension) + # increase SIMD(/PE) for VVAU once PE(/SIMD) is exhausted + fold_val = node_inst.get_nodeattr(preferred_folding_dimension) cyc = node_inst.get_exp_cycles() if ( - op_type == "VectorVectorActivation" - and pe == max_pe + op_type.startswith("VectorVectorActivation") + and fold_val == preferred_folding_max and cyc > self.target_cycles_per_frame ): - max_simd = np.prod(node_inst.get_nodeattr("Kernel")) - self.optimize_attribute_val(node_inst, max_simd, "SIMD") - # also set the folding of the upsteam DW SWU + self.optimize_attribute_val(node_inst, second_folding_max, second_folding_dimension) + # also set the folding of the upsteam DW SWU (in case of HLS-based VVU) # which must be identical to this node swu_node = model.find_producer(node.input[0]) if swu_node.op_type.startswith("ConvolutionInputGenerator"): 
swu_node_inst = getCustomOp(swu_node) - swu_node_inst.set_nodeattr("SIMD", pe) # enable parallel_window mode of RTL SWG if needed if swu_node.op_type == "ConvolutionInputGenerator_rtl": if ( - op_type == "VectorVectorActivation" + op_type.startswith("VectorVectorActivation") and node_inst.get_nodeattr("SIMD") > 1 ): swu_node_inst.set_nodeattr("parallel_window", 1) + swu_node_inst.set_nodeattr("SIMD", max_pe) else: swu_node_inst.set_nodeattr("parallel_window", 0) + pe = node_inst.get_nodeattr("PE") + swu_node_inst.set_nodeattr("SIMD", pe) else: - if op_type == "VectorVectorActivation": + if op_type.startswith("VectorVectorActivation"): ksize = np.prod(node_inst.get_nodeattr("Kernel")) elif op_type == "Pool_Batch": ksize = node_inst.get_nodeattr("KernelSize") diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py new file mode 100644 index 0000000000..5061282695 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py @@ -0,0 +1,191 @@ +# Copyright (c) 2023, AMD +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+import numpy as np
+from qonnx.transformation.base import Transformation
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.core.datatype import DataType
+from onnx import helper
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.general import GiveUniqueNodeNames
+from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth
+
+class InferRTLMatrixVectorActivation(Transformation):
+    """Convert (HLS-based) MatrixVectorActivation layers to specialized RTL layers if supported."""
+
+    def __init__(self):
+        super().__init__()
+
+    def _is_rtl_variant_compatible(self, n):
+        no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1
+        act_width_in_range = (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8) or (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0)
+        weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8
+        folding_supported = (getCustomOp(n).get_nodeattr("MH") % getCustomOp(n).get_nodeattr("PE") == 0) and (getCustomOp(n).get_nodeattr("MW") % getCustomOp(n).get_nodeattr("SIMD") == 0)
+
+        if (no_activation and act_width_in_range and weight_width_in_range and folding_supported):
+            return True
+        else:
+            return False
+
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "MatrixVectorActivation":
+                preferred_in_rtl = getCustomOp(n).get_nodeattr("preferred_backend") == "rtl"
+                supported_in_rtl = self._is_rtl_variant_compatible(n)
+                if (preferred_in_rtl and supported_in_rtl):
+                    mvau_input = n.input[0]
+                    mvau_weight = n.input[1]
+                    mvau_output = n.output[0]
+                    inputDataType = getCustomOp(n).get_nodeattr("inputDataType")
+                    weightDataType = getCustomOp(n).get_nodeattr("weightDataType")
+                    outputDataType = getCustomOp(n).get_nodeattr("outputDataType")
+                    numInputVectors = getCustomOp(n).get_nodeattr("numInputVectors")
+                    mw = getCustomOp(n).get_nodeattr("MW")
+                    mh = getCustomOp(n).get_nodeattr("MH")
+                    simd = getCustomOp(n).get_nodeattr("SIMD")
+                    pe = getCustomOp(n).get_nodeattr("PE")
+                    mem_mode = getCustomOp(n).get_nodeattr("mem_mode")
+                    ram_style = getCustomOp(n).get_nodeattr("ram_style")
+                    resType = getCustomOp(n).get_nodeattr("resType")
+                    runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights")
+
+                    new_node = helper.make_node(
+                        "MatrixVectorActivation_rtl",
+                        [mvau_input, mvau_weight],
+                        [mvau_output],
+                        domain="finn.custom_op.fpgadataflow",
+                        backend="fpgadataflow",
+                        MW=mw,
+                        MH=mh,
+                        SIMD=simd,
+                        PE=pe,
+                        inputDataType=inputDataType,
+                        weightDataType=weightDataType,
+                        outputDataType=outputDataType,
+                        numInputVectors=numInputVectors,
+                        mem_mode=mem_mode,
+                        resType=resType,
+                        name=n.name + "_rtl",
+                        ram_style=ram_style,
+                        runtime_writeable_weights=runtime_writeable_weights
+                    )
+                    graph.node.insert(node_ind, new_node)
+                    # remove old node
+                    graph.node.remove(n)
+                    graph_modified = True
+
+        if graph_modified:
+            model = model.transform(MinimizeAccumulatorWidth())
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+            model = model.transform(GiveUniqueNodeNames())
+
+        return (model, graph_modified)
+
+class InferRTLVectorVectorActivation(Transformation):
+    """Convert (HLS-based) VectorVectorActivation layers to specialized RTL layers if supported."""
+
+    def __init__(self):
+        super().__init__()
+
+    def _is_rtl_variant_compatible(self, n):
+        no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1
+        act_width_in_range = (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8) or (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0)
+        weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8
+        folding_supported = (getCustomOp(n).get_nodeattr("Channels") % getCustomOp(n).get_nodeattr("PE") == 0) and (np.prod(getCustomOp(n).get_nodeattr("Kernel")) % getCustomOp(n).get_nodeattr("SIMD") == 0)
+
+        if (no_activation and act_width_in_range and weight_width_in_range and folding_supported):
+            return True
+        else:
+            return False
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "VectorVectorActivation":
+                preferred_in_rtl = getCustomOp(n).get_nodeattr("preferred_backend") == "rtl"
+                supported_in_rtl = self._is_rtl_variant_compatible(n)
+                if (preferred_in_rtl and supported_in_rtl):
+                    vvau_input = n.input[0]
+                    vvau_weight = n.input[1]
+                    vvau_output = n.output[0]
+                    inputDataType = getCustomOp(n).get_nodeattr("inputDataType")
+                    weightDataType = getCustomOp(n).get_nodeattr("weightDataType")
+                    outputDataType = getCustomOp(n).get_nodeattr("outputDataType")
+                    pe = getCustomOp(n).get_nodeattr("PE")
+                    simd = getCustomOp(n).get_nodeattr("SIMD")
+                    dim = getCustomOp(n).get_nodeattr("Dim")
+                    channels = getCustomOp(n).get_nodeattr("Channels")
+                    kernel = getCustomOp(n).get_nodeattr("Kernel")
+                    resType = getCustomOp(n).get_nodeattr("resType")
+                    mem_mode = getCustomOp(n).get_nodeattr("mem_mode")
+                    runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights")
+                    ram_style = getCustomOp(n).get_nodeattr("ram_style")
+
+                    new_node = helper.make_node(
+                        "VectorVectorActivation_rtl",
+                        [vvau_input, vvau_weight],
+                        [vvau_output],
+                        domain="finn.custom_op.fpgadataflow",
+                        backend="fpgadataflow",
+                        name=n.name + "_rtl",
+                        PE=pe,
+                        SIMD=simd,
+                        Dim=dim,
+                        Channels=channels,
+                        Kernel=kernel,
+                        resType=resType,
+                        inputDataType=inputDataType,
+                        weightDataType=weightDataType,
+                        outputDataType=outputDataType,
+                        mem_mode=mem_mode,
+                        runtime_writeable_weights=runtime_writeable_weights,
+                        ram_style=ram_style
+                    )
+                    graph.node.insert(node_ind, new_node)
+                    # remove old node
+                    graph.node.remove(n)
+                    graph_modified = True
+
+        if graph_modified:
+            model = model.transform(MinimizeAccumulatorWidth())
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+            model = model.transform(GiveUniqueNodeNames())
+
+        return (model, graph_modified)
\ No newline at end of file
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index b296dad827..db065fec42 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -653,7 +653,15 @@ def test_set_fifo_depths(self, topology, wbits, abits, board):
         prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + board)
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         test_fpga_part = get_build_env(board, target_clk_ns)["part"]
-        model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns))
+        if topology == "cnv" and wbits == 2 and abits == 2 and board == "Pynq-Z1":
+            # Enabling swg_exception for this single
test case. Disabling the exception results in + # a design that exceeds the resources of the Pynq-Z1 board. In future this should be + # revisited and handled correctly as the swg_exception is poorly justified. + model = model.transform( + InsertAndSetFIFODepths(test_fpga_part, target_clk_ns, swg_exception=True) + ) + else: + model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns)) fifo_layers = model.get_nodes_by_op_type("StreamingFIFO") assert len(fifo_layers) > 0 model.save(get_checkpoint_name(topology, wbits, abits, "fifodepth_" + board)) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py new file mode 100644 index 0000000000..45b33b24e8 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py @@ -0,0 +1,189 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+import pytest
+
+import numpy as np
+import os
+import pickle
+from onnx import TensorProto, helper
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import ApplyConfig, GiveUniqueNodeNames, GiveReadableTensorNames
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
+
+import finn.core.onnx_exec as oxe
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+
+build_dir = os.environ["FINN_BUILD_DIR"]
+
+
+def make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W):
+    matmul_node = helper.make_node("MatMul", ["ifm", "weights"], ["ofm"])
+    graph = helper.make_graph(nodes=[matmul_node], name="matmul_graph", inputs=[ifm], outputs=[ofm])
+
+    model = qonnx_make_model(graph, producer_name="fclayer-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("ifm", idt)
+    model.set_tensor_datatype("weights", wdt)
+    model.set_tensor_datatype(
+        "ofm", DataType["INT32"]
+    )  # At this step, the MatMul layer does not optimize the bit-width of the output datatype
+    model.set_initializer("weights", W)
+    # model.set_tensor_layout("ifm", DataLayout.NHWC)
+
+    return model
+
+def prepare_inputs(input_tensor):
+    return {"global_in": input_tensor}
+
+@pytest.mark.parametrize("mh", [4])
+# @pytest.mark.parametrize("mw", [36])
+@pytest.mark.parametrize("mw", [18])
+# @pytest.mark.parametrize("pe", [1,2,4,8])
+@pytest.mark.parametrize("pe", [2])
+# @pytest.mark.parametrize("simd", [1,3,6,9,18,36])
+@pytest.mark.parametrize("simd", [6])
+#@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
+@pytest.mark.parametrize("idt", [DataType["UINT8"]])
+#@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]])
+@pytest.mark.parametrize("wdt", [DataType["INT8"]])
+#@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"])
+#@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"])
+@pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S"])
+@pytest.mark.parametrize("segmentlen", [1])
+@pytest.mark.fpgadataflow
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen):
+    # Synthesis constants
+    clk_ns = 5
+    # Create test input vector (produced by SWG)
+    ofm_shape = (5, 5)
+    ofm_h, ofm_w = ofm_shape
+    ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw])
+    ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh))
+    W = gen_finn_dt_tensor(wdt, (mw, mh))
+    # np.save("weights.npy", W)
+    ##
+    # W = np.load("weights.npy")
+    model = make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W)
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+
+    model.save(build_dir + "/matmul.onnx")
+
+    # Create input data and obtain golden reference output
+    A = gen_finn_dt_tensor(model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in"))
+    # np.save("activations.npy", A)
+    ##
+    # A = np.load("activations.npy")
+    input_dict = prepare_inputs(A)
+
+    # Execute ONNX model
+    output_matmul = oxe.execute_onnx(model, input_dict)["global_out"]
+
+    with open(build_dir + "/onnx_output.pkl", "wb") as f:
+        pickle.dump(output_matmul, f)
+
+    # Create MVAU (HLS)
+    model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled"))
+    model = model.transform(GiveUniqueNodeNames())
+
+    # Apply folding (i.e. specify to use DSPs)
+    folding_config = {
+        "Defaults": {},
+        "MatrixVectorActivation_0": {
+            "PE" : pe,
+            "SIMD" : simd,
+            "mem_mode" : "decoupled",
+            "ram_style" : "auto",
+            "resType" : "dsp",
+            "preferred_backend" : "rtl"
+        }
+    }
+    model = model.transform(ApplyConfig(folding_config))
+    model.save(build_dir+"/mvau_hls.onnx")
+
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(PrepareIP(part, clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+    for n in model.graph.node:
+        getCustomOp(n).set_nodeattr("rtlsim_trace", "mvu_trace_hls.vcd")
+    output_mvau_hls = oxe.execute_onnx(model, input_dict)["global_out"]
+
+    # Apply convert-to-rtl step
+    model = model.transform(to_rtl.InferRTLMatrixVectorActivation())
+    model = model.transform(GiveUniqueNodeNames())
+    for n in model.graph.node:
+        if n.op_type == "MatrixVectorActivation_rtl":
+            getCustomOp(n).set_nodeattr("pumpedCompute", 0)
+    model.save(build_dir+"/mvau_rtl.onnx")
+
+    # Reset rtlsim_so and ip-related paths such that new Pyverilator SO and IP is generated
+    for n in model.graph.node:
+        getCustomOp(n).set_nodeattr("rtlsim_so", "")
+        getCustomOp(n).set_nodeattr("code_gen_dir_ipgen", "")
+        getCustomOp(n).set_nodeattr("ipgen_path", "")
+        getCustomOp(n).set_nodeattr("ip_path", "")
+        getCustomOp(n).set_nodeattr("rtlsim_trace", "mvu_trace_rtl.vcd")
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(PrepareIP(part, clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+    output_mvau_rtl = oxe.execute_onnx(model, input_dict)["global_out"]
+
+    with open(build_dir + "/mvau_rtl_output.pkl", "wb") as f:
+        pickle.dump(output_mvau_rtl, f)
+
+    with open(build_dir + "/hls_output.pkl", "wb") as f:
+        pickle.dump(output_mvau_hls, f)
+
+    with open(build_dir + "/rtl_output.pkl", "wb") as f:
+        pickle.dump(output_mvau_rtl, f)
+
+    # model = model.transform(PrepareIP(part, clk_ns))
+    # model = model.transform(HLSSynthIP())
+    # model = model.transform(CreateStitchedIP(fpgapart=part, clk_ns=clk_ns, vitis=True))
+    # model.save(build_dir+"/stitched_ip.onnx")
+
+    #assert (output_mvau_hls == output_mvau_rtl).all()
+    assert (output_matmul == output_mvau_rtl).all()
+    # assert (output_mvau_hls.size > 0)
diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py
new file mode 100644
index 0000000000..25fad308ee
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py
@@ -0,0 +1,234 @@
+# Copyright (C) 2022, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import numpy as np +import os +import pickle +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import ( + ApplyConfig, + GiveReadableTensorNames, + GiveUniqueNodeNames, +) +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model + +import finn.core.onnx_exec as oxe +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl +from finn.transformation.fpgadataflow.create_dataflow_partition import ( + CreateDataflowPartition, +) +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths + +# import qonnx.core.data_layout as DataLayout + +build_dir = os.environ["FINN_BUILD_DIR"] + + +def make_single_dw_conv_modelwrapper(conv_config, idt, wdt): + kernel_size, in_feature_dim, in_chn = conv_config + stride = 1 + pad = 0 + + out_feature_dim = compute_conv_output_dim(in_feature_dim, kernel_size, stride, pad) + group = out_chn = in_chn + + conv_param_shape = [out_chn, 1, kernel_size, kernel_size] + input_shape = [1, in_chn, in_feature_dim, in_feature_dim] + output_shape = [1, out_chn, out_feature_dim, out_feature_dim] + + conv_config = {} + conv_config["dilations"] = [1, 1] + conv_config["group"] = group + conv_config["kernel_shape"] = [kernel_size, kernel_size] + conv_config["pads"] = [pad, pad, pad, pad] + conv_config["strides"] = [stride, stride] + + ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, input_shape) + ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, output_shape) + weights = [helper.make_tensor_value_info("weights", TensorProto.FLOAT, conv_param_shape)] + + modelproto = 
qonnx_make_model( + helper.make_graph( + name="conv_test", + inputs=[ifm], + outputs=[ofm], + value_info=weights, + nodes=[helper.make_node("Conv", ["ifm", "weights"], ["ofm"], **conv_config)], + ) + ) + + model = ModelWrapper(modelproto) + model.set_tensor_datatype("ifm", idt) + model.set_tensor_datatype("weights", wdt) + model.set_initializer("weights", gen_finn_dt_tensor(wdt, conv_param_shape)) + + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + + return model + + +def prepare_inputs(input_tensor): + return {"global_in": input_tensor} + + +@pytest.mark.parametrize("kernel_size", [3]) +@pytest.mark.parametrize("in_feature_dim", [5]) +@pytest.mark.parametrize("in_chn", [4]) +@pytest.mark.parametrize("idt", [DataType["INT8"]]) +# @pytest.mark.parametrize("idt", [DataType["UINT8"]]) +@pytest.mark.parametrize("wdt", [DataType["INT6"]]) +@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"]) +@pytest.mark.parametrize("segmentlen", [1]) +@pytest.mark.parametrize("pe", [1, 2, 4]) +@pytest.mark.parametrize("simd", [1, 3, 9]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_vvau_rtl( + kernel_size, in_feature_dim, in_chn, idt, wdt, part, segmentlen, pe, simd +): + # Create depthwise-separable convolution + conv_config = (kernel_size, in_feature_dim, in_chn) + model = make_single_dw_conv_modelwrapper(conv_config, idt, wdt) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model.save(build_dir + "/dw_conv.onnx") + + # Obtain golden reference output + golden_in = gen_finn_dt_tensor( + model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in") + ) + input_dict = prepare_inputs(golden_in) + golden_out = oxe.execute_onnx(model, input_dict, return_full_exec_context=True) + with open(build_dir + "/onnx_dws_conv.pkl", "wb") as f: + pickle.dump(golden_out, f) + + # Convert to HLS custom-op first + model = model.transform(LowerConvsToMatMul()) + model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True)) + model = model.transform(to_hls.InferVectorVectorActivation()) + model = model.transform(MinimizeAccumulatorWidth()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model.save(build_dir + "/hls_vvau.onnx") + + # Apply folding (i.e. 
specify to use DSPs)
+    folding_config = {
+        "Defaults": {},
+        "ConvolutionInputGenerator_rtl_0": {"SIMD": 4, "parallel_window": 1},
+        "VectorVectorActivation_0": {
+            "PE": pe,
+            "SIMD": simd,
+            "mem_mode": "decoupled",
+            "ram_style": "auto",
+            "resType": "dsp",
+            "preferred_backend": "rtl",
+        },
+    }
+    model = model.transform(ApplyConfig(folding_config))
+    model.save(build_dir + "/hls_vvau_folded.onnx")
+
+    # Obtain second reference from HLS-based VVAU layer
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(PrepareIP(part, 5))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+    conv_hls_out = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)
+    with open(build_dir + "/hls_vvau_folded_output.pkl", "wb") as f:
+        pickle.dump(conv_hls_out, f)
+
+    # Stitched-IP RTLsim
+    model = model.transform(CreateDataflowPartition(partition_model_dir=build_dir))
+    model.save(build_dir + "/ip-stitched.onnx")
+    partition_model_path = getCustomOp(
+        model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    ).get_nodeattr("model")
+    partitioned_model = ModelWrapper(partition_model_path)
+    # FIFOs needed for stitched-ip RTLsim, DWC needed for VVU operating on SIMD parallelism
+    partitioned_model = partitioned_model.transform(InsertAndSetFIFODepths(part, 5))
+    partitioned_model = partitioned_model.transform(PrepareIP(part, 5))
+    partitioned_model = partitioned_model.transform(HLSSynthIP())
+    partitioned_model.save(build_dir + "/partitioned_model.onnx")
+    partitioned_model = partitioned_model.transform(CreateStitchedIP(part, 5))
+    partitioned_model.save(partition_model_path)
+    partitioned_model.set_metadata_prop("rtlsim_trace", build_dir + "/hls-vvu.vcd")
+    # set top-level prop for stitched-ip rtlsim and launch
+    partitioned_model.set_metadata_prop("exec_mode", "rtlsim")
+    # transpose input since we're now simulating HW layers (NCHW --> NHWC)
+    input_dict["global_in"] = np.transpose(input_dict["global_in"], (0, 2, 3, 1))
+    stitched_ip_out = oxe.execute_onnx(partitioned_model, input_dict, return_full_exec_context=True)
+    with open(build_dir + "/stitched_ip_output.pkl", "wb") as f:
+        pickle.dump(stitched_ip_out, f)
+
+    # Apply convert-to-rtl step
+    partitioned_model = partitioned_model.transform(to_rtl.InferRTLVectorVectorActivation())
+    partitioned_model = partitioned_model.transform(GiveUniqueNodeNames())
+    partitioned_model = partitioned_model.transform(GiveReadableTensorNames())
+    partitioned_model = partitioned_model.transform(PrepareIP(part, 5))
+    partitioned_model = partitioned_model.transform(HLSSynthIP())
+    partitioned_model = partitioned_model.transform(CreateStitchedIP(part, 5))
+    partitioned_model.save(build_dir + "/partition_rtl_vvau.onnx")
+    partitioned_model.set_metadata_prop("rtlsim_trace", build_dir + "/rtl-vvu.vcd")
+    # Reset rtlsim_so path to re-generate Pyverilator sim object
+    partitioned_model.set_metadata_prop("rtlsim_so", "")
+    # set top-level prop for stitched-ip rtlsim and launch
+    partitioned_model.set_metadata_prop("exec_mode", "rtlsim")
+    vvu_rtl_out = oxe.execute_onnx(partitioned_model, input_dict, return_full_exec_context=True)
+    with open(build_dir + "/rtl_vvau_output.pkl", "wb") as f:
+        pickle.dump(vvu_rtl_out, f)
+
+    golden_ret = golden_out["global_out"]
+    # transpose hardware-generated outputs NHWC -> NCHW to be comparable
+    vvu_rtl_ret = vvu_rtl_out["global_out"].transpose(0, 3, 1, 2)
+    hls_ret = stitched_ip_out["global_out"].transpose(0, 3, 1, 2)
+
+    assert (
+        vvu_rtl_ret == golden_ret
+    ).all(), "Output of ONNX model not matching output of stitched-IP RTL model!"
+    assert (
+        vvu_rtl_ret == hls_ret
+    ).all(), "Output of stitched-IP HLS model not matching output of stitched-IP RTL model!"
diff --git a/tests/fpgadataflow/test_split_large_fifos.py b/tests/fpgadataflow/test_split_large_fifos.py
index 3061696a68..653e1e7896 100644
--- a/tests/fpgadataflow/test_split_large_fifos.py
+++ b/tests/fpgadataflow/test_split_large_fifos.py
@@ -54,7 +54,7 @@ def fetch_test_model(topology, wbits=2, abits=2):
 def get_folding_cfg(depth=65536):
     cfg = dict()
     cfg["Defaults"] = dict()
-    for i in range(3):
+    for i in range(4):
         key = "StreamingFIFO_" + str(i)
         cfg[key] = {"depth": depth, "ram_style": "auto", "impl_style": "vivado"}
     return cfg
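
The folding heuristic that the set_folding.py changes above apply to MatrixVectorActivation_rtl nodes is easier to follow outside the diff. The sketch below is a minimal, standalone restatement of that heuristic; pick_folding_dims and its arguments are illustrative names chosen here and are not part of the FINN API.

# Minimal sketch (not FINN code): restates the preferred-folding-dimension
# heuristic used by SetFolding for RTL-based MVU layers.

def is_versal(fpga_part):
    # Same prefix check as SetFolding._is_versal; "xqrvc" is a five-character
    # prefix, hence the separate comparison.
    return fpga_part[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpga_part[0:5] == "xqrvc"


def pick_folding_dims(fpga_part, act_width, weight_width, max_pe, max_simd):
    """Return (preferred_dim, preferred_max, second_dim, second_max)."""
    # DSP48-style packing applies on non-Versal parts, and also on Versal when
    # both operands are narrower than 5 bits; in that case PE is folded first.
    # Otherwise (wider operands on Versal) SIMD is folded first.
    is_dsp48 = (act_width < 5 and weight_width < 5) or not is_versal(fpga_part)
    if is_dsp48:
        return "PE", max_pe, "SIMD", max_simd
    return "SIMD", max_simd, "PE", max_pe


# Example: an INT8xINT8 MVAU on a Versal part is folded along SIMD first.
print(pick_folding_dims("xcvc1902-vsva2197-2MP-e-S", 8, 8, max_pe=64, max_simd=576))
# -> ('SIMD', 576, 'PE', 64)

SetFolding then walks the divisors of the preferred dimension's maximum until the cycle target or mvau_wwidth_max is hit, and only afterwards grows the second dimension.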