diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml index e2ba47ec29..91104653f6 100644 --- a/.github/workflows/quicktest-dev-pr.yml +++ b/.github/workflows/quicktest-dev-pr.yml @@ -22,4 +22,5 @@ jobs: export FINN_ROOT=$(pwd) export FINN_BUILD_DIR=/tmp/finn_gha export FINN_INST_NAME=finn_gha + export FINN_SKIP_XRT_DOWNLOAD=1 ./run-docker.sh quicktest diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 575a60c69d..81eb96b44f 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -44,3 +44,6 @@ sphinx: python: install: - requirements: docs/requirements.txt + +formats: + - pdf diff --git a/AUTHORS.rst b/AUTHORS.rst index 5a11497fc8..5ad2b26ac2 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -34,3 +34,6 @@ Contributors * Shashwat Khandelwal (@shashwat1198) * Ian Colbert (@i-colbert) * Rachit Garg (@rstar900) +* Christoph Berganski (@iksnagreb) +* Jonas Kuehle (@vopade) +* Aditya S (@Adityasrinivas24) diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index 2ceb1f4195..5126ed3ff4 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -31,6 +31,8 @@ FROM ubuntu:jammy-20230126 LABEL maintainer="Jakoba Petri-Koenig , Yaman Umuroglu " ARG XRT_DEB_VERSION="xrt_202220.2.14.354_22.04-amd64-xrt" +ARG SKIP_XRT +ARG LOCAL_XRT WORKDIR /workspace @@ -78,15 +80,19 @@ RUN cd verilator && \ make install # install XRT -RUN wget https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb -O /tmp/$XRT_DEB_VERSION.deb -RUN apt install -y /tmp/$XRT_DEB_VERSION.deb -RUN rm /tmp/$XRT_DEB_VERSION.deb +RUN if [ -z "$LOCAL_XRT" ] && [ -z "$SKIP_XRT" ];then \ + wget -U 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17' "https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb" -O /tmp/$XRT_DEB_VERSION.deb; fi + +COPY requirements.txt $XRT_DEB_VERSION.* /tmp/ + +RUN if [ -z "$SKIP_XRT" ];then \ + apt install -y /tmp/$XRT_DEB_VERSION.deb && \ + rm /tmp/$XRT_DEB_VERSION.deb; fi # versioned Python package requirements for FINN compiler # these are given in requirements.txt -COPY requirements.txt . -RUN pip install -r requirements.txt -RUN rm requirements.txt +RUN pip install -r /tmp/requirements.txt +RUN rm /tmp/requirements.txt # install PyTorch RUN pip install torch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116 @@ -126,6 +132,9 @@ RUN pip install tokenize-rt==4.2.1 # pyverilator RUN pip install tclwrapper==0.0.1 +# assure that we have the right setuptools version +RUN pip install setuptools==68.2.2 + # extra environment variables for FINN compiler ENV VIVADO_IP_CACHE "/tmp/vivado_ip_cache" diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index 61c8f78665..c7500bcaa6 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -86,7 +86,7 @@ if [ -f "$VITIS_PATH/settings64.sh" ];then source $XILINX_XRT/setup.sh gecho "Found XRT at $XILINX_XRT" else - recho "XRT not found on $XILINX_XRT, did the installation fail?" + recho "XRT not found on $XILINX_XRT, did you skip the download or did the installation fail?" exit -1 fi else diff --git a/docs/finn/faq.rst b/docs/finn/faq.rst index 70c2f24ed2..0d643feba3 100644 --- a/docs/finn/faq.rst +++ b/docs/finn/faq.rst @@ -81,7 +81,7 @@ Which data layout do FINN-generated accelerators use? Big-endian? Little-endian? 
If you need to do this manually, first examine how the `FINN PYNQ Python drivers `_ do this – notice how the input data is first reshaped to create the “folded input shape” that reflects the word size of the first layer based on how much it was parallelized, then data packing is applied to obtain a raw byte array (with some reversals going on) that can be - fed directly to the hardware. Another example of this is the `npy_to_rtlsim_input `_ function, which converts npy arrays to lists of Python arbitrary-precision integers that we feed into pyverilator for rtl simulation. + fed directly to the hardware. Another example of this is the `npy_to_rtlsim_input `_ function, which converts npy arrays to lists of Python arbitrary-precision integers that we feed into pyverilator for rtl simulation. Why does FIFO sizing take so long for my network? Is something wrong? The automatic FIFO sizing in FINN can take quite long. It unfortunately doesn’t really parallelize on multiple cores since diff --git a/fetch-repos.sh b/fetch-repos.sh index 073c052d67..a4fc124fa4 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -27,9 +27,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -QONNX_COMMIT="fd61cfeebbdaba351abf7e9d54cd785d7776fa4f" -FINN_EXP_COMMIT="de99347e936d51715f5356a1b6c64e37b91c23c2" -BREVITAS_COMMIT="84f42259ec869eb151af4cb8a8b23ad925f493db" +QONNX_COMMIT="2281a777d84aa5cbd7469085c2e534fb4a03ccf9" +FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851" +BREVITAS_COMMIT="d4834bd2a0fad3c1fbc0ff7e1346d5dcb3797ea4" PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3" diff --git a/finn-rtllib/fifo/hdl/Q_srl.v b/finn-rtllib/fifo/hdl/Q_srl.v index 11cef604e0..0b01973163 100644 --- a/finn-rtllib/fifo/hdl/Q_srl.v +++ b/finn-rtllib/fifo/hdl/Q_srl.v @@ -74,7 +74,8 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount); parameter depth = 16; // - greatest #items in queue (2 <= depth <= 256) parameter width = 16; // - width of data (i_d, o_d) - parameter addrwidth = $clog2(depth); + localparam countwidth = $clog2(depth + 1); + localparam addrwidth = $clog2(depth); input clock; input reset; @@ -89,10 +90,10 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount); input o_r; // - output stream ready wire o_b; // - output stream back-pressure - output [addrwidth:0] count; // - output number of elems in queue - output [addrwidth:0] maxcount; // - maximum observed count since reset + output [countwidth-1:0] count; // - output number of elems in queue + output [countwidth-1:0] maxcount; // - maximum observed count since reset - reg [addrwidth:0] maxcount_reg; // - maximum count seen until now + reg [countwidth-1:0] maxcount_reg; // - maximum count seen until now reg [addrwidth-1:0] addr, addr_, a_; // - SRL16 address // for data output reg shift_en_; // - SRL16 shift enable @@ -183,58 +184,58 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount); end // always @ (posedge clock or negedge reset) always @* begin // - combi always - srlo_ <= 'bx; - shift_en_o_ <= 1'bx; - shift_en_ <= 1'bx; - addr_ <= 'bx; - state_ <= 2'bx; + srlo_ = 'bx; + shift_en_o_ = 1'bx; + shift_en_ = 1'bx; + addr_ = 'bx; + state_ = 2'bx; case (state) state_empty: begin // - (empty, will not produce) if (i_v) begin // - empty & i_v => consume - srlo_ <= 
i_d; - shift_en_o_ <= 1; - shift_en_ <= 1'bx; - addr_ <= 0; - state_ <= state_one; + srlo_ = i_d; + shift_en_o_ = 1; + shift_en_ = 1'bx; + addr_ = 0; + state_ = state_one; end else begin // - empty & !i_v => idle - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 1'bx; - addr_ <= 0; - state_ <= state_empty; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 1'bx; + addr_ = 0; + state_ = state_empty; end end state_one: begin // - (contains one) if (i_v && o_b) begin // - one & i_v & o_b => consume - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 1; - addr_ <= 0; - state_ <= state_more; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 1; + addr_ = 0; + state_ = state_more; end else if (i_v && !o_b) begin // - one & i_v & !o_b => cons+prod - srlo_ <= i_d; - shift_en_o_ <= 1; - shift_en_ <= 1; - addr_ <= 0; - state_ <= state_one; + srlo_ = i_d; + shift_en_o_ = 1; + shift_en_ = 1; + addr_ = 0; + state_ = state_one; end else if (!i_v && o_b) begin // - one & !i_v & o_b => idle - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 1'bx; - addr_ <= 0; - state_ <= state_one; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 1'bx; + addr_ = 0; + state_ = state_one; end else if (!i_v && !o_b) begin // - one & !i_v & !o_b => produce - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 1'bx; - addr_ <= 0; - state_ <= state_empty; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 1'bx; + addr_ = 0; + state_ = state_empty; end end // case: state_one @@ -243,60 +244,60 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount); // - (full, will not consume) // - (full here if depth==2) if (o_b) begin // - full & o_b => idle - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 0; - addr_ <= addr; - state_ <= state_more; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 0; + addr_ = addr; + state_ = state_more; end else begin // - full & !o_b => produce - srlo_ <= srl[addr]; - shift_en_o_ <= 1; - shift_en_ <= 0; -// addr_ <= addr-1; -// state_ <= state_more; - addr_ <= addr_zero_ ? 0 : addr-1; - state_ <= addr_zero_ ? state_one : state_more; + srlo_ = srl[addr]; + shift_en_o_ = 1; + shift_en_ = 0; +// addr_ = addr-1; +// state_ = state_more; + addr_ = addr_zero_ ? 0 : addr-1; + state_ = addr_zero_ ? state_one : state_more; end end else begin // - (mid: neither empty nor full) if (i_v && o_b) begin // - mid & i_v & o_b => consume - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 1; - addr_ <= addr+1; - state_ <= state_more; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 1; + addr_ = addr+1; + state_ = state_more; end else if (i_v && !o_b) begin // - mid & i_v & !o_b => cons+prod - srlo_ <= srl[addr]; - shift_en_o_ <= 1; - shift_en_ <= 1; - addr_ <= addr; - state_ <= state_more; + srlo_ = srl[addr]; + shift_en_o_ = 1; + shift_en_ = 1; + addr_ = addr; + state_ = state_more; end else if (!i_v && o_b) begin // - mid & !i_v & o_b => idle - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 0; - addr_ <= addr; - state_ <= state_more; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 0; + addr_ = addr; + state_ = state_more; end else if (!i_v && !o_b) begin // - mid & !i_v & !o_b => produce - srlo_ <= srl[addr]; - shift_en_o_ <= 1; - shift_en_ <= 0; - addr_ <= addr_zero_ ? 0 : addr-1; - state_ <= addr_zero_ ? state_one : state_more; + srlo_ = srl[addr]; + shift_en_o_ = 1; + shift_en_ = 0; + addr_ = addr_zero_ ? 0 : addr-1; + state_ = addr_zero_ ? 
state_one : state_more;
	      end
	   end // else: !if(addr_full)
	end // case: state_more
	default: begin
-	   srlo_    <= 'bx;
-	   shift_en_o_ <= 1'bx;
-	   shift_en_ <= 1'bx;
-	   addr_    <= 'bx;
-	   state_   <= 2'bx;
+	   srlo_     = 'bx;
+	   shift_en_o_ = 1'bx;
+	   shift_en_  = 1'bx;
+	   addr_     = 'bx;
+	   state_    = 2'bx;
	end // case: default
      endcase // case(state)
diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 0ac2628ee5..1f6e97281e 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -34,10 +34,13 @@ module mvu_4sx4u #(
 	int unsigned  PE,
 	int unsigned  SIMD,
+	int unsigned  WEIGHT_WIDTH,
+	int unsigned  ACTIVATION_WIDTH,
 	int unsigned  ACCU_WIDTH,
-	int unsigned  VERSION = 1,
+	int unsigned  VERSION = 1,	// Version 1 (DSP48E1) *must* commit to NARROW_WEIGHTS
 	bit SIGNED_ACTIVATIONS = 0,
+	bit NARROW_WEIGHTS     = 0,	// Weights from [-7:7] rather than [-8:7]
 	bit FORCE_BEHAVIORAL   = 0
 )(
 	// Global Control
@@ -48,8 +51,8 @@ module mvu_4sx4u #(
 	// Input
 	input	logic last,
 	input	logic zero,	// ignore current inputs and force this partial product to zero
-	input	logic signed [PE-1:0][SIMD-1:0][3:0]  w,	// signed weights
-	input	logic [SIMD-1:0][3:0]  a,	// unsigned activations (override by SIGNED_ACTIVATIONS)
+	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH -1:0]  w,	// signed weights
+	input	logic [SIMD-1:0][ACTIVATION_WIDTH-1:0]  a,	// unsigned activations (override by SIGNED_ACTIVATIONS)
 	// Ouput
 	output	logic vld,
@@ -62,6 +65,55 @@ module mvu_4sx4u #(
 `endif
 	FORCE_BEHAVIORAL;
+	//-----------------------------------------------------------------------
+	// Determine Lane Configuration
+	initial begin
+		if(!NARROW_WEIGHTS && (VERSION == 1)) begin
+			$error("%m: Need NARROW_WEIGHTS for DSP48E1.");
+			$finish;
+		end
+	end
+
+	/**
+	 * Lane Slicing
+	 * Assumptions:
+	 * - Internal lane widths differ, at most, by a single bit.
+	 * - The rightmost lane (#0) has the maximum internal width.
+	 * - The leftmost lane (#3) extends into the wide DSP accumulation path and
+	 *   is constrained by ACCU_WIDTH rather than the next lane. It doesn't have
+	 *   an external high extension.
+	 * - The one but leftmost lane (#2) has the minimum internal width and, hence,
+	 *   the maximum external high extension.
+	 */
+	typedef int unsigned  lane_offset_v[4:0];
+	function lane_offset_v sliceLanes();
+		unique case(VERSION)
+		1: begin
+			return	NARROW_WEIGHTS?
+				lane_offset_v'{ ACCU_WIDTH+21, 21, 14, 7, 0 } :
+				lane_offset_v'{ 0, 0, 0, 0, 0 };	// not supported
+		end
+		2: begin
+			return	NARROW_WEIGHTS?
+				lane_offset_v'{ ACCU_WIDTH+23, 23, 16, 8, 0 } :
+				lane_offset_v'{ ACCU_WIDTH+22, 22, 15, 8, 0 };
+		end
+		endcase
+	endfunction : sliceLanes
+	localparam lane_offset_v  OFFSETS = sliceLanes();
+
+	function int unsigned lo_width(input int unsigned  i);
+		return  OFFSETS[i+1] - OFFSETS[i];
+	endfunction : lo_width
+	function int unsigned hi_width(input int unsigned  i);
+		return  1 + $clog2(2**(ACCU_WIDTH-lo_width(i)-1)+SIMD);
+	endfunction : hi_width
+	localparam int unsigned  LO_WIDTH_MAX = OFFSETS[1] - OFFSETS[0];
+	localparam int unsigned  HI_WIDTH_MAX = hi_width(2);
+
+	localparam int unsigned  A_WIDTH = 23 + 2*VERSION;	// Width of A datapath
+
+	// Compute the count of descendants for all nodes in the reduction trees.
typedef int unsigned leave_load_t[2*SIMD-1]; function leave_load_t init_leave_loads(); automatic leave_load_t res; @@ -79,8 +131,6 @@ module mvu_4sx4u #( assign vld = L[5]; // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism - localparam int unsigned D[4:0] = '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; // Lane offsets - localparam int unsigned PIPE_COUNT = (PE+3)/4; for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes @@ -88,20 +138,20 @@ module mvu_4sx4u #( localparam int unsigned PE_END = PE < 4*(c+1)? PE : 4*(c+1); localparam int unsigned PE_REM = 4*(c+1) - PE_END; - uwire [57:0] p3[SIMD]; + uwire [47:0] p3[SIMD]; uwire signed [ 1:0] h3[SIMD][3]; for(genvar s = 0; s < SIMD; s++) begin : genSIMD // Input Lane Assembly - uwire [17:0] bb = { {(14){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] }; + uwire [17:0] bb = { {(18-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; logic [29:0] aa; logic [26:0] dd; logic [ 1:0] xx[3:1]; if(1) begin : blkVectorize - uwire [3:0] ww[PE_END - PE_BEG]; + uwire signed [3:0] ww[PE_END - PE_BEG]; for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin - assign ww[pe] = w[PE_BEG + pe][s]; - if(pe) begin + assign ww[pe] = $signed(w[PE_BEG + pe][s]); + if(pe > 0) begin if(BEHAVIORAL) assign xx[pe + PE_REM] = zero? 0 : ww[pe] * a[s]; `ifndef VERILATOR else begin @@ -123,8 +173,19 @@ module mvu_4sx4u #( dd = '0; aa = '0; for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin - dd[D[pe + PE_REM]+:3] = ww[pe]; - aa[D[pe + PE_REM]+ 3] = ww[pe][3]; + automatic int unsigned ofs = OFFSETS[pe + PE_REM]; + dd[ofs+:3] = ww[pe]; + assert(!NARROW_WEIGHTS || rst || !en || zero || (ww[pe] != -8)) else begin + $warning("%m: Weight of -8 violates NARROW_WEIGHTS commitment."); + end + + // The sign of the weights are generally put on the subtracted A port. + // However, when coinciding with the actual sign bit position of the + // multiplier input path, it also goes onto the D input. This prevents + // sign extensions that may happen when a DSP primitive is auto-promoted + // to a newer generation. + if(ofs+3 == A_WIDTH-1) dd[ofs+3] = ww[pe][3]; + else aa[ofs+3] = ww[pe][3]; end end end : blkVectorize @@ -135,6 +196,7 @@ module mvu_4sx4u #( // rst can be only applied to AD and zero only to B // with the same effect as zeroing both. if(BEHAVIORAL) begin : genBehav + // Stage #1: Input Refine logic signed [17:0] B1 = 0; always_ff @(posedge clk) begin @@ -142,7 +204,7 @@ module mvu_4sx4u #( else if(en) B1 <= bb; end - logic signed [26:0] AD1 = 0; + logic signed [A_WIDTH-1:0] AD1 = 0; always_ff @(posedge clk) begin if(rst) AD1 <= 0; else if(en) AD1 <= dd - aa; @@ -429,14 +491,14 @@ module mvu_4sx4u #( X1 <= xx; X2 <= X1; foreach(X3[i]) begin - X3[i] <= X2[i] + (L[3]? 2'h0 : pp[D[i]+:2]); + X3[i] <= X2[i] + (L[3]? 2'h0 : pp[OFFSETS[i]+:2]); end end end // Derive actual cross-lane overflows for(genvar i = 0; i < 3; i++) begin - assign h3[s][i] = pp[D[i+1]+:2] - X3[i+1]; + assign h3[s][i] = pp[OFFSETS[i+1]+:2] - X3[i+1]; end assign p3[s] = pp; @@ -445,48 +507,59 @@ module mvu_4sx4u #( // Stage #4: Cross-SIMD Reduction // Count leaves reachable from each node - localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? 
init_leave_loads() : '{ default: 1 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop - uwire signed [ACCU_WIDTH -1:0] up4; - uwire signed [ACCU_WIDTH -8:0] hi4[3]; - uwire [$clog2(SIMD)+7:0] lo4[3]; + uwire signed [ACCU_WIDTH-1:0] up4; + uwire signed [ HI_WIDTH_MAX-1:0] hi4[3]; + uwire [$clog2(SIMD)+LO_WIDTH_MAX-1:0] lo4[3]; for(genvar i = 0; i < 4; i++) begin - localparam int unsigned LO_WIDTH = D[i+1] - D[i]; - localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; // Conclusive high part accumulation - if(i >= PE_REM && i < 3) begin : genHi - // Adder Tree across all SIMD high contributions, each from [-1:1] - uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; - for(genvar n = 0; n < SIMD-1; n++) begin - // Sum truncated to actual maximum bit width at this node - uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); - assign tree[n] = s; - end + if(i < 3) begin : genHi + if(i < PE_REM) assign hi4[i] = '0; + else begin + localparam int unsigned HI_WIDTH = hi_width(i); + + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + assign tree[n] = s; + end + + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) begin + automatic logic signed [HI_WIDTH:0] h = $signed(L[4]? 0 : Hi4) + $signed(tree[0]); + assert(h[HI_WIDTH] == h[HI_WIDTH-1]) else begin + $error("%m: Accumulation overflow for ACCU_WIDTH=%0d", ACCU_WIDTH); + $stop; + end + Hi4 <= h; + end + end + assign hi4[i] = Hi4; - // High Sideband Accumulation - logic signed [HI_WIDTH-1:0] Hi4 = 0; - always_ff @(posedge clk) begin - if(rst) Hi4 <= 0; - else if(en) Hi4 <= (L[4]? 
0 : Hi4) + $signed(tree[0]); end - assign hi4[i] = Hi4; end : genHi - else if (i < 3) begin : genHiZero - assign hi4[i] = '0; - end : genHiZero - // Conclusive low part accumulation - if(i >= PE_REM) begin : blkLo + // Conclusive low part accumulation (all unsigned arithmetic) + if(i < PE_REM) assign lo4[i] = '0; + else begin : genLo + localparam int unsigned LO_WIDTH = lo_width(i); + // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][OFFSETS[i]+:LO_WIDTH]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); - uwire [NODE_WIDTH-1:0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; assign tree[n] = s; end @@ -498,10 +571,7 @@ module mvu_4sx4u #( if(i == 3) assign up4 = Lo4; else assign lo4[i] = Lo4; - end : blkLo - else begin : blkLoZero - assign lo4[i] = '0; - end : blkLoZero + end : genLo end @@ -511,9 +581,9 @@ module mvu_4sx4u #( if(rst) Res5 <= '{ default: 0 }; else if(en) begin Res5[3] <= up4 - hi4[2]; - Res5[2] <= $signed({ hi4[2], {(D[3] - D[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1]; - Res5[1] <= $signed({ hi4[1], {(D[2] - D[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0]; - Res5[0] <= $signed({ hi4[0], {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] }); + Res5[2] <= $signed({ hi4[2], {(lo_width(2)){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1]; + Res5[1] <= $signed({ hi4[1], {(lo_width(1)){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0]; + Res5[0] <= $signed({ hi4[0], {(lo_width(0)){1'b0}} }) + $signed({ 1'b0, lo4[0] }); end end diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index fbf48784f0..dabb36647e 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -34,9 +34,9 @@ module mvu_8sx8u_dsp48 #( int unsigned PE, int unsigned SIMD, - int unsigned ACCU_WIDTH, - int unsigned ACTIVATION_WIDTH, int unsigned WEIGHT_WIDTH, + int unsigned ACTIVATION_WIDTH, + int unsigned ACCU_WIDTH, int unsigned VERSION = 1, bit SIGNED_ACTIVATIONS = 0, @@ -72,6 +72,10 @@ module mvu_8sx8u_dsp48 #( return res; endfunction : init_leave_loads + function int unsigned sum_width(input int unsigned n, input int unsigned w); + return w <= 16? $clog2(1 + n*(2**w - 1)) : w + $clog2(n); + endfunction : sum_width + // Pipeline for last indicator flag logic [1:5] L = '0; always_ff @(posedge clk) begin @@ -91,7 +95,7 @@ module mvu_8sx8u_dsp48 #( localparam int unsigned PE_END = PE < 2*(c+1)? PE : 2*(c+1); localparam int unsigned PE_REM = 2*(c+1) - PE_END; - uwire [57:0] p3[SIMD]; + uwire [47:0] p3[SIMD]; uwire signed [ 1:0] h3[SIMD]; for(genvar s = 0; s < SIMD; s++) begin : genSIMD @@ -445,15 +449,32 @@ module mvu_8sx8u_dsp48 #( // Stage #4: Cross-SIMD Reduction // Count leaves reachable from each node - localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop - - uwire signed [ACCU_WIDTH -1:0] up4; - uwire signed [ACCU_WIDTH -SINGLE_PROD_WIDTH:0] hi4; - uwire [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0] lo4; + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? 
init_leave_loads() : '{ default: 0 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop + + // Range of Cross-lane Contribution Tracked in Hi4 + /* + * - Assumption: ACCU_WIDTH bounds right lane value at any point in time. + * - The value x beyond the lane boundary is hence bounded by: + * -2^(w-1) <= x <= 2^(w-1)-1 with w = ACCU_WIDTH - D[1] + * - This value decomposes into the tracked overflow h and the overflow l + * from the low SIMD lane reduction with: + * 0 <= l <= SIMD + * - From x = l + h follows: + * h = x - l + * -2^(w-1) - SIMD <= h <= 2^(w-1)-1 + * - This required bit width of the two's complement representation of this + * signed value is determined by its lower bound to be at least: + * 1 + $clog2(2^(w-1)+SIMD) + */ + localparam int unsigned HI_WIDTH = 1 + ($clog2(SIMD) < ACCU_WIDTH-D[1]? ACCU_WIDTH-D[1] : $clog2(2**(ACCU_WIDTH-D[1]-1)+SIMD)); + + uwire signed [ACCU_WIDTH -1:0] up4; + uwire signed [HI_WIDTH -1:0] hi4; + uwire [$clog2(SIMD)+D[1]-1:0] lo4; // Conclusive high part accumulation if(PE_REM == 0) begin : genHi - localparam int unsigned HI_WIDTH = ACCU_WIDTH - D[1]; + // Adder Tree across all SIMD high contributions, each from [-1:1] uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; @@ -466,8 +487,15 @@ module mvu_8sx8u_dsp48 #( // High Sideband Accumulation logic signed [HI_WIDTH-1:0] Hi4 = 0; always_ff @(posedge clk) begin - if(rst) Hi4 <= 0; - else if(en) Hi4 <= (L[4]? 0 : Hi4) + $signed(tree[0]); + if(rst) Hi4 <= 0; + else if(en) begin + automatic logic signed [HI_WIDTH:0] h = $signed(L[4]? 0 : Hi4) + $signed(tree[0]); + assert(h[HI_WIDTH] == h[HI_WIDTH-1]) else begin + $error("%m: Accumulation overflow for ACCU_WIDTH=%0d", ACCU_WIDTH); + $stop; + end + Hi4 <= h; + end end assign hi4 = Hi4; end : genHi @@ -479,14 +507,14 @@ module mvu_8sx8u_dsp48 #( localparam int unsigned LO_WIDTH = D[i+1] - D[i]; // Conclusive low part accumulation if(i >= PE_REM) begin : blkLo - // Adder Tree across all SIMD low contributions - localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + // Adder Tree across all SIMD low contributions (all unsigned arithmetic) + localparam int unsigned ROOT_WIDTH = sum_width(SIMD, LO_WIDTH); uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node - localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); - uwire [NODE_WIDTH-1:0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + localparam int unsigned NODE_WIDTH = sum_width(LEAVE_LOAD[n], LO_WIDTH); + uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; assign tree[n] = s; end diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv index 3bbc7051b9..2734f37cf3 100644 --- a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv +++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv @@ -35,9 +35,10 @@ module mvu_vvu_8sx9_dsp58 #( bit IS_MVU, int unsigned PE, int unsigned SIMD, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - int unsigned ACCU_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACTIVATION_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) bit FORCE_BEHAVIORAL = 0, diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 
6498530113..0ee84b2f79 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -55,6 +55,7 @@ module mvu_vvu_axi #( int unsigned ACTIVATION_WIDTH, int unsigned WEIGHT_WIDTH, int unsigned ACCU_WIDTH, + bit NARROW_WEIGHTS = 0, bit SIGNED_ACTIVATIONS = 0, bit PUMPED_COMPUTE = 0, @@ -299,22 +300,45 @@ module mvu_vvu_axi #( case(COMPUTE_CORE) "mvu_vvu_8sx9_dsp58": - mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + mvu_vvu_8sx9_dsp58 #( + .IS_MVU(IS_MVU), + .PE(PE), .SIMD(DSP_SIMD), + .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) + ) core ( .clk(dsp_clk), .rst, .en(dsp_en), .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), .vld(dsp_vld), .p(dsp_p) ); - "mvu_4sx4u": - mvu_4sx4u #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + "mvu_4sx4u_dsp48e1": + mvu_4sx4u #( + .PE(PE), .SIMD(DSP_SIMD), + .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS), + .VERSION(1), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) + ) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + "mvu_4sx4u_dsp48e2": + mvu_4sx4u #( + .PE(PE), .SIMD(DSP_SIMD), + .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS), + .VERSION(2), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) + ) core ( .clk(dsp_clk), .rst, .en(dsp_en), .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), .vld(dsp_vld), .p(dsp_p) ); "mvu_8sx8u_dsp48": - mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + mvu_8sx8u_dsp48 #( + .PE(PE), .SIMD(DSP_SIMD), + .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) + ) core ( .clk(dsp_clk), .rst, .en(dsp_en), .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), .vld(dsp_vld), .p(dsp_p) diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index 50c15c1b02..4edf676008 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -42,6 +42,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, parameter ACCU_WIDTH = $ACCU_WIDTH$, + parameter NARROW_WEIGHTS = $NARROW_WEIGHTS$, parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, parameter SEGMENTLEN = $SEGMENTLEN$, parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, @@ -77,7 +78,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( mvu_vvu_axi #( .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), 
.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .NARROW_WEIGHTS(NARROW_WEIGHTS), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) ) inst ( .ap_clk(ap_clk), diff --git a/finn-rtllib/mvu/tb/mvu_3sx3u_tb.sv b/finn-rtllib/mvu/tb/mvu_3sx3u_tb.sv new file mode 100644 index 0000000000..783218e08c --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_3sx3u_tb.sv @@ -0,0 +1,165 @@ +module mvu_3sx3u_tb; + + localparam int unsigned ROUNDS = 157; + + localparam int unsigned MH = 32; + localparam int unsigned MW = 60; + localparam int unsigned PE = 1; + localparam int unsigned SIMD = 1; + + localparam int unsigned ACTIVATION_WIDTH = 3; + localparam int unsigned WEIGHT_WIDTH = 3; + localparam int unsigned ACCU_WIDTH = 16; + + + //----------------------------------------------------------------------- + // Global Control + logic clk = 1; + always #5ns clk = !clk; + + logic rst = 1; + initial begin + repeat(16) @(posedge clk); + rst <= 0; + end + + //----------------------------------------------------------------------- + // DUT + logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] s_axis_weights_tdata; + logic s_axis_weights_tvalid; + uwire s_axis_weights_tready; + + logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] s_axis_input_tdata; + logic s_axis_input_tvalid; + uwire s_axis_input_tready; + + uwire [PE-1:0][ACCU_WIDTH-1:0] m_axis_output_tdata; + uwire m_axis_output_tvalid; + logic m_axis_output_tready; + + mvu_vvu_axi #( + .IS_MVU(1), + .COMPUTE_CORE("mvu_4sx4u_dsp48e2"), + .MH(MH), .MW(MW), + .PE(PE), .SIMD(SIMD), + + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH) + //int unsigned SEGMENTLEN = 0, + //bit FORCE_BEHAVIORAL = 0, + ) dut ( + .ap_clk(clk), .ap_clk2x('x), .ap_rst_n(!rst), + .s_axis_weights_tdata, .s_axis_weights_tvalid, .s_axis_weights_tready, + .s_axis_input_tdata, .s_axis_input_tvalid, .s_axis_input_tready, + .m_axis_output_tdata, .m_axis_output_tvalid, .m_axis_output_tready + ); + + //----------------------------------------------------------------------- + // Stimuli + + //- Infinite Weight Feed ------------ + typedef logic signed [WEIGHT_WIDTH-1:0] weights_t[MH][MW]; + function weights_t calc_WEIGHTS(); + automatic weights_t ret; + std::randomize(ret); + return ret; + endfunction : calc_WEIGHTS + weights_t WEIGHTS = calc_WEIGHTS(); + + initial begin + s_axis_weights_tdata = 'x; + s_axis_weights_tvalid = 0; + @(posedge clk iff !rst); + + forever begin + for(int unsigned h = 0; h < MH; h+=PE) begin + for(int unsigned w = 0; w < MW; w+=SIMD) begin + for(int unsigned pe = 0; pe < PE; pe++) begin + for(int unsigned simd = 0; simd < SIMD; simd++) begin + s_axis_weights_tdata[pe][simd] <= WEIGHTS[h+pe][w+simd]; + end + end + s_axis_weights_tvalid <= 1; + @(posedge clk iff s_axis_weights_tready); + s_axis_weights_tvalid <= 0; + s_axis_weights_tdata <= 'x; + end + end + end + end + + //- Input Feed and Reference Computation + typedef logic [PE-1:0][ACCU_WIDTH-1:0] outvec_t; + outvec_t Q_ref[$] = {}; + + initial begin + s_axis_input_tdata = 'x; + s_axis_input_tvalid = 0; + @(posedge clk iff !rst); + + repeat(ROUNDS) begin : blkRounds + automatic logic [MH-1:0][ACCU_WIDTH-1:0] accus = '{ default: 0 }; + + for(int unsigned w = 0; w < MW; w+=SIMD) begin : blkSF + for(int unsigned simd = 0; simd < SIMD; simd++) begin : blkSIMD + automatic logic [ACTIVATION_WIDTH-1:0] act = $urandom(); + for(int unsigned h = 0; h < MH; h++) begin : blkMH + automatic logic signed [ACCU_WIDTH-1:0] prod = WEIGHTS[h][w+simd] * $signed({1'b0, 
act}); + accus[h] += prod; + end : blkMH + s_axis_input_tdata[simd] <= act; + end : blkSIMD + s_axis_input_tvalid <= 1; + @(posedge clk iff s_axis_input_tready); + s_axis_input_tvalid <= 0; + s_axis_input_tdata <= 'x; + end : blkSF + + for(int unsigned h = 0; h < MH; h+=PE) begin + Q_ref.push_back(accus[h+:PE]); + end + + end : blkRounds + end + + //- Output Checker + initial begin + automatic int timeout = 0; + + m_axis_output_tready = 0; + @(posedge clk iff !rst); + + m_axis_output_tready <= 1; + while(timeout < MW/SIMD+16) begin + @(posedge clk); + if(!m_axis_output_tvalid) timeout++; + else begin + automatic outvec_t exp; + + assert(Q_ref.size()) else begin + $error("Spurious output."); + $stop; + end + + exp = Q_ref.pop_front(); + assert(m_axis_output_tdata === exp) else begin + $error("Mismatched output %p instead of %p.", m_axis_output_tdata, exp); + $stop; + end + + timeout = 0; + end + end + m_axis_output_tready <= 0; + + assert(Q_ref.size() == 0) else begin + $error("Missing output."); + $stop; + end + + $display("Test completed."); + $finish; + end + +endmodule : mvu_3sx3u_tb diff --git a/finn-rtllib/mvu/tb/mvu_accu_tb.dat b/finn-rtllib/mvu/tb/mvu_accu_tb.dat new file mode 100644 index 0000000000..7e102ab6ab --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_accu_tb.dat @@ -0,0 +1,192 @@ +9 +4 +d +9 +2 +a +d +7 +9 +7 +b +4 +4 +7 +0 +0 +c +9 +9 +1 +9 +0 +a +0 +5 +5 +7 +7 +2 +6 +7 +9 +0 +0 +9 +7 +7 +c +7 +9 +7 +1 +2 +0 +f +7 +1 +7 +f +7 +1 +7 +1 +6 +6 +9 +e +f +e +a +6 +1 +7 +9 +d +a +7 +7 +f +4 +7 +f +9 +f +9 +1 +9 +f +7 +3 +4 +1 +1 +0 +d +c +d +b +9 +9 +f +7 +0 +5 +e +6 +7 +e +7 +1 +7 +0 +e +3 +c +4 +9 +7 +9 +9 +d +e +c +1 +f +7 +0 +7 +1 +7 +d +0 +7 +e +a +1 +9 +4 +b +7 +9 +0 +a +e +6 +7 +2 +9 +0 +9 +0 +9 +1 +9 +0 +0 +7 +2 +7 +1 +5 +9 +1 +9 +6 +7 +c +1 +9 +d +9 +f +c +9 +9 +9 +b +b +9 +f +9 +5 +1 +3 +0 +9 +0 +9 +2 +a +9 +0 +f +0 +7 +0 +a +7 +3 +e +5 +7 diff --git a/finn-rtllib/mvu/tb/mvu_accu_tb.sv b/finn-rtllib/mvu/tb/mvu_accu_tb.sv new file mode 100644 index 0000000000..ceeb31194c --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_accu_tb.sv @@ -0,0 +1,162 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). 
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU core compute kernel. + *****************************************************************************/ + +module mvu_accu_tb; + + localparam IS_MVU = 1; + localparam COMPUTE_CORE = "mvu_8sx8u_dsp48"; + localparam PUMPED_COMPUTE = 0; + localparam MW = 6; + localparam MH = 32; + localparam PE = 1; + localparam SIMD = 1; + localparam ACTIVATION_WIDTH = 8; + localparam WEIGHT_WIDTH = 4; + localparam NARROW_WEIGHTS = 1; + localparam SIGNED_ACTIVATIONS = 1; + localparam SEGMENTLEN = 1; + localparam FORCE_BEHAVIORAL = 0; + + // Safely deducible parameters + localparam WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8; + localparam INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8; + + // Global Control + logic clk = 0; + always #5ns clk = !clk; + logic rst = 1; + initial begin + repeat(16) @(posedge clk); + rst <= 0; + end + + logic [WEIGHT_WIDTH-1:0] WeightMem[MH*MW]; + initial $readmemh("mvu_accu_tb.dat", WeightMem); + + // Shared Input Feed + logic [INPUT_STREAM_WIDTH_BA-1:0] in_TDATA; + logic in_TVALID[2]; + uwire in_TREADY[2]; + initial begin + in_TDATA = 'x; + in_TVALID = '{ default: 0 }; + @(posedge clk iff !rst); + + repeat(2161*MW) begin + automatic logic [ACTIVATION_WIDTH-1:0] a = $urandom(); + in_TDATA <= a; + in_TVALID <= '{ default: 1 }; + fork + begin + @(posedge clk iff in_TREADY[0]); + in_TVALID[0] <= 0; + end + begin + @(posedge clk iff in_TREADY[1]); + in_TVALID[1] <= 0; + end + join + end + + repeat(MH*MW) @(posedge clk); + $display("Test completed."); + $finish; + end + + // DUTs + localparam int unsigned ACCU_WIDTHS[2] = '{ 16, 32 }; + int OutQ[2][$]; + for(genvar i = 0; i < $size(ACCU_WIDTHS); i++) begin : genDUTs + localparam int unsigned ACCU_WIDTH = ACCU_WIDTHS[i]; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Private Weight Feed + logic [WEIGHT_STREAM_WIDTH_BA-1:0] weights_TDATA; + logic weights_TVALID; + uwire weights_TREADY; + initial begin + weights_TDATA = 'x; + weights_TVALID = 0; + @(posedge clk iff !rst); + + weights_TVALID <= 1; + forever begin + for(int unsigned i = 0; i < MH*MW; i++) begin + weights_TDATA <= WeightMem[i]; + @(posedge clk iff weights_TREADY); + end + end + end + + // Private Output Capture into Queue + uwire signed [OUTPUT_STREAM_WIDTH_BA-1:0] out_TDATA; + uwire out_TVALID; + uwire out_TREADY = !rst; + always_ff @(posedge clk iff !rst) begin + if(out_TVALID) OutQ[i].push_back(out_TDATA); + end + + // Actual DUT Instance + mvu_vvu_axi #( + .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .NARROW_WEIGHTS(NARROW_WEIGHTS), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) + ) dut ( + .ap_clk(clk), + .ap_clk2x(1'b0), + .ap_rst_n(!rst), + .s_axis_weights_tdata(weights_TDATA), + .s_axis_weights_tvalid(weights_TVALID), + .s_axis_weights_tready(weights_TREADY), + .s_axis_input_tdata(in_TDATA), + .s_axis_input_tvalid(in_TVALID[i]), + .s_axis_input_tready(in_TREADY[i]), + .m_axis_output_tdata(out_TDATA), + .m_axis_output_tvalid(out_TVALID), + .m_axis_output_tready(out_TREADY) + ); + end : 
genDUTs + + // Output Equivalence Checker + always_ff @(posedge clk) begin + if(OutQ[0].size && OutQ[1].size) begin + automatic int unsigned y0 = OutQ[0].pop_front(); + automatic int unsigned y1 = OutQ[1].pop_front(); + assert(y0 == y1) else begin + $error("Output Mismatch: %0d vs. %0d", y0, y1); + $stop; + end + end + end + +endmodule : mvu_accu_tb diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv index 4ed7b4bf5f..f16c40db34 100644 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -37,18 +37,18 @@ module mvu_axi_tb(); // Matrix & parallelism config localparam bit IS_MVU = 1; localparam string COMPUTE_CORE = "mvu_4sx4u"; - localparam int unsigned MW = 120; - localparam int unsigned MH = 40; - localparam int unsigned SIMD = 20; - localparam int unsigned PE = 10; - localparam int unsigned SEGMENTLEN = 2.0; - localparam bit FORCE_BEHAVIORAL = 1; + localparam int unsigned MW = 96; + localparam int unsigned MH = 32; + localparam int unsigned SIMD = 48; + localparam int unsigned PE = 16; + localparam int unsigned SEGMENTLEN = 2; + localparam bit FORCE_BEHAVIORAL = 0; localparam bit M_REG_LUT = 1; // Bit-width config localparam int unsigned ACTIVATION_WIDTH = 4; localparam int unsigned WEIGHT_WIDTH = 4; localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); - localparam bit SIGNED_ACTIVATIONS = 0; + localparam bit SIGNED_ACTIVATIONS = 1; // Simulation constants localparam int unsigned NF = MH/PE; localparam int unsigned SF = MW/SIMD; @@ -70,7 +70,7 @@ module mvu_axi_tb(); uwire ap_clk = clk; - // Generate activations + // Generate shared Activations typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; typedef activation_t activation_vector_t[SF]; @@ -82,148 +82,174 @@ module mvu_axi_tb(); activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); - struct { - activation_t dat; - logic vld; - logic rdy; - } activations; - - initial begin - activations.vld = 0; - activations.dat = 'X; - @(posedge clk iff ap_rst_n); - - for (int i=0; i= 0; - @(posedge clk); - end while (!(activations.vld === 1 && activations.rdy === 1)); + // Run parallel instances across DSP versions and NARROW_WEIGHTS + bit [2:1][1:0] done = { 2: 2'b00, 1: 2'b01 }; // [ver][narrow] + always_comb begin + if(&done) begin + $display("Test completed."); + $finish; end - - activations.vld <= 0; - activations.dat <= 'x; end - // Generate weights - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; - - function weight_matrix_t init_WEIGHTS; - automatic weight_matrix_t res; - std::randomize(res); - return res; - endfunction : init_WEIGHTS; - - weight_matrix_t WEIGHTS = init_WEIGHTS(); - - struct { - weight_t dat; - logic vld; - logic rdy; - } weights; - - initial begin - weights.vld = 0; - weights.dat = 'X; - @(posedge clk iff ap_rst_n); - - weights.vld <= 1; - for (int i=0; i1 - // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] 
- // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: - // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to - // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) - for (int i = 0; i < NF; i++) begin - for (int j = 0; j < SF; j++) begin - for (int k = 0; k < PE; k++) begin - for (int l = 0; l < SIMD; l++) begin - if (SIGNED_ACTIVATIONS) - res[i][k] = $signed(res[i][k]) + $signed(a[j][l]) * $signed(w[i][j][k][l]); - else - res[i][k] = $signed(res[i][k]) + $signed({1'b0, a[j][l]}) * $signed(w[i][j][k][l]); + // Instance-specifc Weights (may be narrow) + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + if(narrow) begin // increment all weights of -8 + for(int unsigned nf = 0; nf < NF; nf++) begin + for(int unsigned sf = 0; sf < SF; sf++) begin + for(int unsigned pe = 0; pe < PE; pe++) begin + for(int unsigned simd = 0; simd < SIMD; simd++) begin + if(res[nf][sf][pe][simd] == (1 << (WEIGHT_WIDTH-1))) begin + res[nf][sf][pe][simd]++; + end + end + end end end end + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + // Weight Feed + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = 'X; + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for(int unsigned i = 0; i < NF; i++) begin + for(int unsigned j = 0; j < SF; j++) begin + weights.dat <= WEIGHTS[i][j]; + @(posedge clk iff weights.rdy); + end + end + weights.vld <= 0; + weights.dat <= 'x; end - return res; - endfunction : check_output; - output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS); - - int unsigned NF_CNT = 0; - initial begin - outputs.rdy = 0; - while (NF_CNT < NF) begin - // Loop until both rdy & vld are asserted - do begin - outputs.rdy <= $urandom()%7 >= 0; - @(posedge clk iff ap_rst_n); - end while (!(outputs.rdy === 1 && outputs.vld === 1)); - - // Compare produced outputs against golden outputs - foreach(outputs.dat[i]) begin - assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin - $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - $stop; + // Function to compute golden output + // a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0] + // a: [SF][PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] + // w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] + typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t; + typedef output_t output_vector_t [NF]; + + struct { + output_t dat; + logic vld; + logic rdy; + } outputs; + + function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); + automatic output_vector_t res = '{default: 0}; + // The input stream will have the channels interleaved for VVU when PE>1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] 
+ // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + for (int i = 0; i < NF; i++) begin + for (int j = 0; j < SF; j++) begin + for (int k = 0; k < PE; k++) begin + for (int l = 0; l < SIMD; l++) begin + if (SIGNED_ACTIVATIONS) + res[i][k] = $signed(res[i][k]) + $signed(a[j][l]) * $signed(w[i][j][k][l]); + else + res[i][k] = $signed(res[i][k]) + $signed({1'b0, a[j][l]}) * $signed(w[i][j][k][l]); + end + end + end + end + return res; + endfunction : check_output; + + output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS); + initial begin + outputs.rdy = 0; + @(posedge clk iff ap_rst_n); + + for(int unsigned nf = 0; nf < NF; nf++) begin + while($urandom()%13 == 0) @(posedge clk); + outputs.rdy <= 1; + @(posedge clk iff outputs.vld); + outputs.rdy <= 0; + + // Compare produced outputs against golden outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[nf][i])) begin + $display(">>> [t=%0t] Test succeeded (nf=%0d)! Computed / GOLDEN = %0d / %0d", $time, nf, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[nf][i])); + end + else begin + $error(">>> [t=%0t] TEST failed (nf=%0d)! Computed / GOLDEN = %0d / %0d", $time, nf, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[nf][i])); + $stop; + end end end - NF_CNT += 1; + done[ver][narrow] = 1; end - $finish; - end - - // Instantiate DUT - mvu_vvu_axi #( - .IS_MVU(IS_MVU), - .COMPUTE_CORE(COMPUTE_CORE), - .MW(MW), - .MH(MH), - .PE(PE), - .SIMD(SIMD), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), - .M_REG_LUT(M_REG_LUT) - ) - dut ( - .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), - .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), - .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), - .m_axis_output_tready(outputs.rdy) - ); + // Instantiate DUT + mvu_vvu_axi #( + .IS_MVU(IS_MVU), + .COMPUTE_CORE(ver == 1? 
"mvu_4sx4u_dsp48e1" : "mvu_4sx4u_dsp48e2"), + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .NARROW_WEIGHTS(narrow), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), + .M_REG_LUT(M_REG_LUT) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + + end : genNarrowWide + end : genVersion endmodule : mvu_axi_tb diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv index 5c7182b214..39756e5c2b 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -39,8 +39,9 @@ *****************************************************************************/ module thresholding_axi #( - int unsigned N, // output precision - int unsigned K, // input/threshold precision + int unsigned N, // output precision + int unsigned WI, // input precision + int unsigned WT, // threshold precision int unsigned C = 1, // Channels int unsigned PE = 1, // Processing Parallelism, requires C = k*PE @@ -96,7 +97,7 @@ module thresholding_axi #( //- AXI Stream - Input -------------- output logic s_axis_tready, input logic s_axis_tvalid, - input logic [((PE*K+7)/8)*8-1:0] s_axis_tdata, + input logic [((PE*WI+7)/8)*8-1:0] s_axis_tdata, //- AXI Stream - Output ------------- input logic m_axis_tready, @@ -109,13 +110,13 @@ module thresholding_axi #( uwire cfg_en; uwire cfg_we; uwire [ADDR_BITS-3:0] cfg_a; - uwire [K -1:0] cfg_d; + uwire [WT -1:0] cfg_d; uwire cfg_rack; - uwire [K -1:0] cfg_q; + uwire [WT -1:0] cfg_q; if(USE_AXILITE) begin uwire [ADDR_BITS-1:0] cfg_a0; - axi4lite_if #(.ADDR_WIDTH(ADDR_BITS), .DATA_WIDTH(32), .IP_DATA_WIDTH(K)) axi ( + axi4lite_if #(.ADDR_WIDTH(ADDR_BITS), .DATA_WIDTH(32), .IP_DATA_WIDTH(WT)) axi ( .aclk(ap_clk), .aresetn(ap_rst_n), .awready(s_axilite_AWREADY), .awvalid(s_axilite_AWVALID), .awaddr(s_axilite_AWADDR), .awprot('x), @@ -143,10 +144,42 @@ module thresholding_axi #( assign cfg_d = 'x; end + //----------------------------------------------------------------------- + // Cast Inputs into Threshold Data Type + uwire [PE-1:0][WT-1:0] idat; + for(genvar pe = 0; pe < PE; pe++) begin + if(WT == WI) begin : genCopy + assign idat[pe] = s_axis_tdata[pe*WI+:WI]; + end : genCopy + else begin + initial begin + if(FPARG) begin + $error("%m: Can't cast floating-point type."); + $finish; + end + end + + if(WT > WI) begin : genWiden + assign idat[pe] = { {(WT-WI){SIGNED? s_axis_tdata[(pe+1)*WI-1] : 1'b0}}, s_axis_tdata[pe*WI+:WI] }; + end : genWiden + else begin : genNarrow + // Saturate for clipping inputs + if(!SIGNED) begin + assign idat[pe] = |s_axis_tdata[pe*WI+WT+:WI-WT]? '1 : s_axis_tdata[pe*WI+:WT]; + end + else begin + assign idat[pe] = + (s_axis_tdata[pe*WI+WT+:WI-WT] == '1) || (s_axis_tdata[pe*WI+WT+:WI-WT] == '0)? 
s_axis_tdata[pe*WI+:WT] : + {s_axis_tdata[(pe+1)*WI-1], {(WT-1){!s_axis_tdata[(pe+1)*WI-1]}}}; + end + end : genNarrow + end + end + //----------------------------------------------------------------------- // Kernel Implementation thresholding #( - .N(N), .K(K), .C(C), .PE(PE), + .N(N), .K(WT), .C(C), .PE(PE), .SIGNED(SIGNED), .FPARG(FPARG), .BIAS(BIAS), .THRESHOLDS_PATH(THRESHOLDS_PATH), .USE_CONFIG(USE_AXILITE), .DEPTH_TRIGGER_URAM(DEPTH_TRIGGER_URAM), .DEPTH_TRIGGER_BRAM(DEPTH_TRIGGER_BRAM), @@ -157,7 +190,7 @@ module thresholding_axi #( .cfg_en, .cfg_we, .cfg_a, .cfg_d, .cfg_rack, .cfg_q, - .irdy(s_axis_tready), .ivld(s_axis_tvalid), .idat(s_axis_tdata), + .irdy(s_axis_tready), .ivld(s_axis_tvalid), .idat, .ordy(m_axis_tready), .ovld(m_axis_tvalid), .odat(m_axis_tdata) ); diff --git a/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v index f35db156f6..49a1f2bd8b 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v +++ b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v @@ -33,8 +33,9 @@ */ module $MODULE_NAME_AXI_WRAPPER$ #( - parameter N = $N$, // output precision - parameter K = $M$, // input/threshold precision + parameter N = $N$, // output precision + parameter WI = $WI$, // input precision + parameter WT = $WT$, // threshold precision parameter C = $C$, // Channels parameter PE = $PE$, // Processing Parallelism, requires C = k*PE @@ -87,7 +88,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( //- AXI Stream - Input -------------- output in0_V_TREADY, input in0_V_TVALID, - input [((PE*K+7)/8)*8-1:0] in0_V_TDATA, + input [((PE*WI+7)/8)*8-1:0] in0_V_TDATA, //- AXI Stream - Output ------------- input out_V_TREADY, @@ -96,7 +97,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( ); thresholding_axi #( - .N(N), .K(K), .C(C), .PE(PE), + .N(N), .WI(WI), .WT(WT), .C(C), .PE(PE), .SIGNED(SIGNED), .FPARG(FPARG), .BIAS(BIAS), diff --git a/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv b/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv index 429fb7776f..cfd875f5c4 100644 --- a/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv +++ b/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv @@ -110,7 +110,7 @@ module thresholding_axi_tb #( uwire ovld; uwire [PE-1:0][N-1:0] odat; - thresholding_axi #(.N(N), .K(K), .C(C), .PE(PE), .SIGNED(0), .USE_AXILITE(1)) dut ( + thresholding_axi #(.N(N), .WI(K), .WT(K), .C(C), .PE(PE), .SIGNED(0), .USE_AXILITE(1)) dut ( .ap_clk(clk), .ap_rst_n(!rst), // Configuration diff --git a/notebooks/advanced/2_custom_op.ipynb b/notebooks/advanced/2_custom_op.ipynb index bdd2976412..4c80c0263b 100644 --- a/notebooks/advanced/2_custom_op.ipynb +++ b/notebooks/advanced/2_custom_op.ipynb @@ -649,7 +649,7 @@ "metadata": {}, "outputs": [], "source": [ - "# run with FINN's execute_onnx, custom node will use c++ execution\n", + "# run with FINN's execute_onnx, custom node will use C++ execution\n", "new_op_inst.set_nodeattr(\"exec_mode\", \"c++\")\n", "ret = execute_onnx(mixedop_graph_new, inp_dict)\n", "ret" diff --git a/notebooks/advanced/3_folding.ipynb b/notebooks/advanced/3_folding.ipynb index 8c7b97d6c6..e9527a2ef7 100644 --- a/notebooks/advanced/3_folding.ipynb +++ b/notebooks/advanced/3_folding.ipynb @@ -159,6 +159,7 @@ "metadata": {}, "outputs": [], "source": [ + "from functools import partial\n", "from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer\n", "from finn.analysis.fpgadataflow.res_estimation import res_estimation" ] @@ -216,7 +217,7 @@ 
"metadata": {}, "outputs": [], "source": [ - "res_dict = model.analysis(res_estimation)\n", + "res_dict = model.analysis(partial(res_estimation, fpgapart=\"xc7z020clg400-1\"))\n", "res_dict" ] }, @@ -363,7 +364,7 @@ "metadata": {}, "outputs": [], "source": [ - "res_dict_updated = model.analysis(res_estimation)\n", + "res_dict_updated = model.analysis(partial(res_estimation, fpgapart=\"xc7z020clg400-1\"))\n", "res_dict_updated" ] }, @@ -567,7 +568,7 @@ "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n", "\n", "model_updated = model_updated.transform(InsertDWC())\n", - "model_updated = model_updated.transform(SpecializeLayers())\n", + "model_updated = model_updated.transform(SpecializeLayers(\"xc7z020clg400-1\"))\n", "model_updated = model_updated.transform(GiveUniqueNodeNames())" ] }, @@ -596,7 +597,7 @@ "outputs": [], "source": [ "model_dwc = ModelWrapper(\"cybsec_DWC.onnx\")\n", - "res_dict_dwc = model_dwc.analysis(res_estimation)\n", + "res_dict_dwc = model_dwc.analysis(partial(res_estimation, fpgapart=\"xc7z020clg400-1\"))\n", "res_dict_dwc" ] }, diff --git a/notebooks/advanced/4_advanced_builder_settings.ipynb b/notebooks/advanced/4_advanced_builder_settings.ipynb index 5139377342..4a0f2bc695 100644 --- a/notebooks/advanced/4_advanced_builder_settings.ipynb +++ b/notebooks/advanced/4_advanced_builder_settings.ipynb @@ -199,7 +199,7 @@ "id": "d746eff3", "metadata": {}, "source": [ - "After each FINN builder step, the graph is saved as .onnx file. In the cell above we sort the intermediate models by time in descending order (`ls -t -r`) to visualize the builder flow. As you can see after the conversion to the FINN-ONNX format (`step_qonnx_to_finn`), the graph is prepared by tidy up and streamlining (`step_tidy_up` and `step_streamline`) and then the high level nodes are converted to HW abstraction layers (`step_convert_to_hw`). Then there is a partition created from all layers that were converted to HW layers (`step_create_dataflow_partition`), then we convert each of the HW abstraction layers into an HLS or RTL variant (`step_specialize_layers`). Afterwards optimizations are applied (`step_target_fps_parallelization`, `step_apply_folding_config` and `step_minimize_bit_width`). In the final step of this example we generate resource and performance reports for the network (`step_generate_estimate_reports`). Use the code below to investigate the network after each step." + "After each FINN builder step, the graph is saved as an .onnx file. In the cell above we sort the intermediate models by time in descending order (`ls -t -r`) to visualize the builder flow. As you can see after the conversion to the FINN-ONNX format (`step_qonnx_to_finn`), the graph is prepared by tidy up and streamlining (`step_tidy_up` and `step_streamline`) and then the high level nodes are converted to HW abstraction layers (`step_convert_to_hw`). Then there is a partition created from all layers that were converted to HW layers (`step_create_dataflow_partition`), then we convert each of the HW abstraction layers into an HLS or RTL variant (`step_specialize_layers`). Afterwards optimizations are applied (`step_target_fps_parallelization`, `step_apply_folding_config` and `step_minimize_bit_width`). In the final step of this example we generate resource and performance reports for the network (`step_generate_estimate_reports`). Use the code below to investigate the network after each step." 
] }, { @@ -218,7 +218,7 @@ "id": "bccebd0d", "metadata": {}, "source": [ - "The analysis of these .onnx files can help us identifying points in the flow in which we might need to intervene and provide the compiler with additional information. When investigating the network after the conversion to HW layers, we can see that there are layers that were not converted. We can see this by clicking on the different nodes. HW layers have the module `finn.custom_op.fpgadataflow`." + "The analysis of these .onnx files can help us identify points in the flow in which we might need to intervene and provide the compiler with additional information. When investigating the network after the conversion to HW layers, we can see that there are layers that were not converted. We can see this by clicking on the different nodes. HW layers have the module `finn.custom_op.fpgadataflow`." ] }, { @@ -361,7 +361,7 @@ "id": "2809f6a7", "metadata": {}, "source": [ - "Each steps gets the model (`model: ModelWrapper`) and the build configuration (`cfg: DataflowBuildConfig`) as input arguments. Then a certain sequence of transformations is applied to the model. In some of the steps, verification can be run to ensure that the applied transformations have not changed the behaviour of the network. In the end the modified model is returned." + "Each steps gets the model (`model: ModelWrapper`) and the build configuration (`cfg: DataflowBuildConfig`) as input arguments. Then a certain sequence of transformations is applied to the model. In some of the steps, verification can be run to ensure that the applied transformations have not changed the behaviour of the network. In the end, the modified model is returned." ] }, { @@ -993,7 +993,7 @@ "id": "fd1519fe", "metadata": {}, "source": [ - "In the following part of the tutorial, we will use the auto generated json file as starting point to create two new json files which explore the `ram_style` attribute. We will use one of the generated reports from the FINN builder to see the impact of these changes.\n", + "In the following part of the tutorial, we will use the auto generated json file as a starting point to create two new json files which explore the `ram_style` attribute. We will use one of the generated reports from the FINN builder to see the impact of these changes.\n", "For that, we will extract the total resources from the *estimate_layer_resources.json* report in the following cell." ] }, @@ -1254,7 +1254,7 @@ "id": "97f87780", "metadata": {}, "source": [ - "The initial implementation already had a high utilization of BRAM, but the estimations went now up to ~500 BRAMs while the LUT count went down to ~99k." + "The initial implementation already had a high utilization of BRAM, but the estimations now went up to ~500 BRAMs while the LUT count went down to ~99k." ] }, { @@ -1278,7 +1278,7 @@ "id": "f7012b9a", "metadata": {}, "source": [ - "In this section, we will have a peak into additional builder arguments the FINN compiler exposes. We will not be able to cover all but you will be able to have a look at a list and we encourage you to take your time to look into the different options there are to customize the FINN builder configuration." + "In this section, we will have a peek into additional builder arguments the FINN compiler exposes. We will not be able to cover all but you will be able to have a look at a list and we encourage you to take your time to look into the different options there are to customize the FINN builder configuration." 
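One quick way to get such a list is a small sketch that simply introspects the `DataflowBuildConfig` dataclass, independent of any particular build:

import dataclasses
import finn.builder.build_dataflow_config as build_cfg

# each dataclass field corresponds to one builder argument; print its name and default value
for field in dataclasses.fields(build_cfg.DataflowBuildConfig):
    print(field.name, "->", field.default)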
] }, { @@ -1302,7 +1302,7 @@ "id": "308d52ba", "metadata": {}, "source": [ - "Earlier in the tutorial, we had a look at how build steps are written. When investigating the `step_tidy_up`, we can see that before the changed model is returned a verification step can be run. In the case of `step_tidy_up` it is the step `\"initial python\"` that can be initiated by setting `VerificationStepType.TIDY_UP_PYTHON`." + "Earlier in the tutorial, we had a look at how build steps are written. When investigating the `step_tidy_up`, we can see that before the changed model is returned, a verification step can be run. In the case of `step_tidy_up` it is the step `\"initial python\"` that can be initiated by setting `VerificationStepType.TIDY_UP_PYTHON`." ] }, { @@ -1536,7 +1536,7 @@ "source": [ "There are attributes that come from the dataclasses-json class: `to_dict`, `to_json`, `schema`, `from_json`, `from_dict`. This class is used for the implementation of the FINN builder. In this tutorial, we are mainly interested in the FINN specific arguments. \n", "\n", - "Some of these arguments we have seen already in the Cybersecurity notebook and in this notebook, e.g. `target_fps`, `fpga_part` and `folding_config_file`. In the code of the FINN builder, the function of each builder argument is documents, you can have a look [here](https://github.com/Xilinx/finn/blob/dev/src/finn/builder/build_dataflow_config.py#L155) and scroll through the available builder arguments." + "Some of these arguments we have seen already in the Cybersecurity notebook and in this notebook, e.g. `target_fps`, `fpga_part` and `folding_config_file`. In the code of the FINN builder, the function of each builder argument is documented, you can have a look [here](https://github.com/Xilinx/finn/blob/dev/src/finn/builder/build_dataflow_config.py#L155) and scroll through the available builder arguments." ] }, { @@ -1602,7 +1602,7 @@ "id": "c249f141", "metadata": {}, "source": [ - "This concludes the advanced builder settings tutorial. Below you can find code that can help you investigating more of the builder arguments and invoking the whole flow to generate a bitfile." + "This concludes the advanced builder settings tutorial. Below you can find code that can help you in investigating more of the builder arguments and invoking the whole flow to generate a bitfile." 
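A minimal sketch of such an end-to-end invocation could look as follows; the model file name, board and output directory are placeholders rather than values taken from this tutorial:

import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg

# placeholder configuration; point the model file, board and output_dir at your own setup
cfg = build_cfg.DataflowBuildConfig(
    output_dir="output_bitfile_build",
    target_fps=10000,
    synth_clk_period_ns=10.0,
    board="Pynq-Z1",
    shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
    generate_outputs=[build_cfg.DataflowOutputType.BITFILE],
)
build.build_dataflow_cfg("model.onnx", cfg)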
] }, { diff --git a/notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb b/notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb index 5c2f10310f..1eaaeb138a 100644 --- a/notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb +++ b/notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb @@ -277,7 +277,7 @@ "metadata": {}, "outputs": [], "source": [ - "model = ModelWrapper(export_onnx_path_cleaned)\n", + "model = ModelWrapper(export_onnx_path_converted)\n", "input_dict = {\"global_in\": nph.to_array(input_tensor)}\n", "output_dict = oxe.execute_onnx(model, input_dict)\n", "produced_finn = output_dict[list(output_dict.keys())[0]]\n", diff --git a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb index 3141d54ddf..8b8cff8ee9 100644 --- a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb @@ -282,6 +282,12 @@ "metadata": {}, "outputs": [], "source": [ + "from finn.util.basic import pynq_part_map\n", + "# change this if you have a different PYNQ board, see list above\n", + "pynq_board = \"Pynq-Z1\"\n", + "fpga_part = pynq_part_map[pynq_board]\n", + "target_clk_ns = 10\n", + "\n", "import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw\n", "from finn.transformation.fpgadataflow.create_dataflow_partition import (\n", " CreateDataflowPartition,\n", @@ -314,7 +320,7 @@ "# save the dataflow partition with a different name for easier access\n", "# and specialize the layers to HLS variants\n", "dataflow_model = ModelWrapper(dataflow_model_filename)\n", - "dataflow_model = dataflow_model.transform(SpecializeLayers())\n", + "dataflow_model = dataflow_model.transform(SpecializeLayers(fpga_part))\n", "dataflow_model.save(build_dir + \"/end2end_cnv_w1a1_dataflow_model.onnx\")" ] }, @@ -432,12 +438,9 @@ "metadata": {}, "outputs": [], "source": [ - "test_pynq_board = \"Pynq-Z1\"\n", - "target_clk_ns = 10\n", - "\n", "from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild\n", "model = ModelWrapper(build_dir+\"/end2end_cnv_w1a1_folded.onnx\")\n", - "model = model.transform(ZynqBuild(platform = test_pynq_board, period_ns = target_clk_ns))" + "model = model.transform(ZynqBuild(platform = pynq_board, period_ns = target_clk_ns))" ] }, { diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb index bbaa74dbff..675ba23d2d 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb @@ -547,6 +547,36 @@ "thresh_node_inst.set_nodeattr(\"preferred_impl_style\", \"hls\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll define two helper variables that describe the Xilinx FPGA part name and the PYNQ board name that we are targeting." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# print the names of the supported PYNQ boards\n", + "from finn.util.basic import pynq_part_map\n", + "print(pynq_part_map.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# change this if you have a different PYNQ board, see list above\n", + "pynq_board = \"Pynq-Z1\"\n", + "fpga_part = pynq_part_map[pynq_board]\n", + "target_clk_ns = 10" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -561,7 +591,7 @@ "outputs": [], "source": [ "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n", - "model = model.transform(SpecializeLayers())\n", + "model = model.transform(SpecializeLayers(fpga_part))\n", "\n", "model.save(build_dir+\"/tfc_w1_a1_specialize_layers.onnx\")\n", "showInNetron(build_dir+\"/tfc_w1_a1_specialize_layers.onnx\")" @@ -687,32 +717,7 @@ "source": [ "## 3. Hardware Build \n", "\n", - "We're finally ready to start generating hardware from our network. Depending on whether you want to target a Zynq or Alveo platform, FINN offers two transformations to build the accelerator, integrate into an appropriate shell and build a bitfile. These are `ZynqBuild` and `VitisBuild` for Zynq and Alveo, respectively. In this notebook we'll demonstrate the `ZynqBuild` as these boards are more common and it's much faster to complete bitfile generation for the smaller FPGAs found on them.\n", - "\n", - "As we will be dealing with FPGA synthesis tools in these tasks, we'll define two helper variables that describe the Xilinx FPGA part name and the PYNQ board name that we are targeting." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# print the names of the supported PYNQ boards\n", - "from finn.util.basic import pynq_part_map\n", - "print(pynq_part_map.keys())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# change this if you have a different PYNQ board, see list above\n", - "pynq_board = \"Pynq-Z1\"\n", - "fpga_part = pynq_part_map[pynq_board]\n", - "target_clk_ns = 10" + "We're finally ready to start generating hardware from our network. Depending on whether you want to target a Zynq or Alveo platform, FINN offers two transformations to build the accelerator, integrate into an appropriate shell and build a bitfile. These are `ZynqBuild` and `VitisBuild` for Zynq and Alveo, respectively. In this notebook we'll demonstrate the `ZynqBuild` as these boards are more common and it's much faster to complete bitfile generation for the smaller FPGAs found on them." 
] }, { diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb index a07a8d2254..aacd12ef05 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb @@ -396,7 +396,7 @@ "child_model = child_model.transform(InsertDWC()) \n", "child_model = child_model.transform(InsertFIFO(create_shallow_fifos=True))\n", "# DWC and FIFOs need to be specialized to either HLS or RTL variants\n", - "child_model = child_model.transform(SpecializeLayers())\n", + "child_model = child_model.transform(SpecializeLayers(test_fpga_part))\n", "child_model.save(build_dir + \"/test.onnx\");\n", "child_model = child_model.transform(GiveUniqueNodeNames())\n", "child_model = child_model.transform(PrepareIP(test_fpga_part, target_clk_ns))\n", diff --git a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb index da037050bb..3f8d65497b 100644 --- a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb +++ b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb @@ -53,7 +53,7 @@ " * [(Option 1) Train the Model from Scratch](#train_scratch)\n", " * [(Option 2) Load Pre-Trained Parameters](#load_pretrained)\n", "* [Network Surgery Before Export](#network_surgery)\n", - "* [Export to QONNX and Conversion to FINN-ONNX](#export_qonnx)" + "* [Export to QONNX](#export_qonnx)" ] }, { @@ -194,7 +194,7 @@ "source": [ "# Define a PyTorch Device \n", "\n", - "GPUs can significantly speed-up training of deep neural networks. We check for availability of a GPU and if so define it as target device." + "GPUs can significantly speed-up training of deep neural networks. We check for availability of a GPU and if so define it as the target device." ] }, { @@ -667,12 +667,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Export to QONNX and Conversion to FINN-ONNX \n", + "# Export to QONNX \n", "\n", "\n", "[ONNX](https://onnx.ai/) is an open format built to represent machine learning models, and the FINN compiler expects an ONNX model as input. We'll now export our network into ONNX to be imported and used in FINN for the next notebooks. Note that the particular ONNX representation used for FINN differs from standard ONNX, you can read more about this [here](https://finn.readthedocs.io/en/latest/internals.html#intermediate-representation-finn-onnx).\n", "\n", - "You can see below how we export a trained network in Brevitas into a FINN-compatible ONNX representation (QONNX). QONNX is the format we can export from Brevitas, to feed it into the FINN compiler, we will need to make a conversion to the FINN-ONNX format which is the intermediate representation the compiler works on. The conversion of the FINN-ONNX format is a FINN compiler transformation and to be able to apply it to our model, we will need to wrap it into [ModelWrapper](https://finn.readthedocs.io/en/latest/internals.html#modelwrapper). This is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. Then we can call the conversion function to obtain the model in FINN-ONNX format." + "You can see below how we export a trained network in Brevitas into a FINN-compatible ONNX representation (QONNX). 
QONNX is the format we can export from Brevitas, to feed it into the FINN compiler, we will need to make a conversion to the FINN-ONNX format which is the intermediate representation the compiler works on. This will be done in the next notebook. For now, we simply export and save the QONNX model." ] }, { @@ -707,13 +707,6 @@ "# clean-up\n", "qonnx_cleanup(ready_model_filename, out_file=ready_model_filename)\n", "\n", - "# ModelWrapper\n", - "model = ModelWrapper(ready_model_filename)\n", - "# Setting the input datatype explicitly because it doesn't get derived from the export function\n", - "model.set_tensor_datatype(model.graph.input[0].name, DataType[\"BIPOLAR\"])\n", - "model = model.transform(ConvertQONNXtoFINN())\n", - "model.save(ready_model_filename)\n", - "\n", "print(\"Model saved to %s\" % ready_model_filename)" ] }, @@ -721,16 +714,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## View the Exported ONNX in Netron\n", + "## View the Exported QONNX in Netron\n", "\n", - "Let's examine the exported ONNX model with [Netron](https://github.com/lutzroeder/netron), which is a visualizer for neural networks and allows interactive investigation of network properties. For example, you can click on the individual nodes and view the properties. Particular things of note:\n", + "Let's examine the exported QONNX model with [Netron](https://github.com/lutzroeder/netron), which is a visualizer for neural networks and allows interactive investigation of network properties. For example, you can click on the individual nodes and view the properties. Particular things of note:\n", "\n", - "* The input tensor \"0\" is annotated with `quantization: finn_datatype: BIPOLAR`\n", "* The input preprocessing (x + 1) / 2 is exported as part of the network (initial `Add` and `Div` layers)\n", - "* Brevitas `QuantLinear` layers are exported to ONNX as `MatMul`. We've exported the padded version; shape of the first MatMul node's weight parameter is 600x64\n", - "* The weight parameters (second inputs) for MatMul nodes are annotated with `quantization: finn_datatype: INT2`\n", - "* The quantized activations are exported as `MultiThreshold` nodes with `domain=qonnx.custom_op.general`\n", - "* There's a final `MultiThreshold` node with threshold=0 to produce the final bipolar output (this is the `qnt_output` from `CybSecMLPForExport`" + "* Brevitas `QuantLinear` layers are exported to QONNX as `Gemm`. We've exported the padded version; shape of the first `Gemm` node's weight parameter is 600x64\n", + "* The quantized activations are exported as `Quant` nodes with `domain=qonnx.custom_op.general`\n", + "* The weight parameters (second inputs) for the `Gemm` node can also be viewed by opening up the producer `Quant` node, scrolling down to the `Inputs` section and pressing the plus sign to the right of the first input parameter. 
For the first `Quant` node, this would be the parameter named `Quant_0_param0`\n", + "* The bitwidth of the weights is also shown as the 4th value in the `Quant` node (here 2), meaning that we quantize to 2 bits in total.\n", + "* There's a final `BipolarQuant` node with a single input and output value to produce the final bipolar output (this is the `qnt_output` from `CybSecMLPForExport`)" ] }, { diff --git a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb index 33b64e11c0..70f1acae0a 100644 --- a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb +++ b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb @@ -12,7 +12,7 @@ "\n", "**Also remember to 'close and halt' any other FINN notebooks, since Netron visualizations use the same port.**\n", "\n", - "In this notebook we will show how to import the network we trained in Brevitas and verify it in the FINN compiler. \n", + "In this notebook we will show how to import the network we trained in Brevitas, convert it from the QONNX format to FINN-ONNX, going over the differences and, lastly, verify it in the FINN compiler. \n", "This verification process can actually be done at various stages in the compiler [as explained in this notebook](../bnn-pynq/tfc_end2end_verification.ipynb) but for this example we'll only consider the first step: verifying the exported high-level FINN-ONNX model.\n", "Another goal of this notebook is to introduce you to the concept of *graph transformations* -- we'll be applying some transformations to the graph to make it executable for verification. \n", "Once this model is sucessfully verified, we'll generate an FPGA accelerator from it in the next notebook." @@ -41,7 +41,7 @@ "source": [ "## Outline\n", "-------------\n", - "1. [Import model into FINN with ModelWrapper](#brevitas_import_visualization)\n", + "1. [Convert model from QONNX to FINN-ONNX](#brevitas_import_visualization)\n", "2. [Network preparations: Tidy-up transformations](#network_preparations)\n", "3. [Load the dataset and Brevitas model](#load_dataset) \n", "4. [Compare FINN and Brevitas execution](#compare_brevitas)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# 1. Import model into FINN with ModelWrapper \n", "\n", - "Now that we have the model in .onnx format, we can work with it using FINN. To import it into FINN, we'll use the [`ModelWrapper`](https://finn.readthedocs.io/en/latest/source_code/finn.core.html#qonnx.core.modelwrapper.ModelWrapper). It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model." + "# 1. Convert model from QONNX to FINN-ONNX \n", "\n", + "\n", + "Even though the input to finn is the QONNX format, an IR called FINN-ONNX is used inside the compiler. In this part of the notebook, we show how to convert QONNX to FINN-ONNX and explain the key differences between the three representations: ONNX, QONNX and FINN-ONNX.\n", + "\n", + "QONNX and FINN-ONNX are extensions to the standard ONNX format. Currently, ONNX provides only limited support for expressing quantization, while QONNX and FINN-ONNX provide fully flexible quantization support. However, the way in which they do so differs: QONNX provides special node types called `Quant` which ingest weights or previous node output streams as inputs to produce quantized output streams. 
Meanwhile, FINN-ONNX uses tensor annotation to express quantization and has a special node type called `MultiThreshold`, which implements quantization on the activation data path.\n", + "\n", + "Beyond this, there are other node types which differ in FINN-ONNX as opposed to QONNX. Thus we need a conversion function, which we will explore in more detail shortly.\n", + "\n", + "Lastly, we want to emphasize that we use the uppercase naming (ONNX, QONNX, FINN-ONNX) for the intermediate representations (IR), while the lower case naming (onnx, qonnx, finn) is usually used to refer to the compiler toolkits themselves.\n", + "\n", + "\n", + "## 1.1 Using ModelWrapper to load and observe a model\n", + "We first load the model which we prepared in the last notebook by using the\n", + "[`ModelWrapper`](https://finn.readthedocs.io/en/latest/source_code/finn.core.html#qonnx.core.modelwrapper.ModelWrapper). It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model." ] }, { @@ -64,17 +76,79 @@ "source": [ "import os\n", "from qonnx.core.modelwrapper import ModelWrapper\n", + "from qonnx.core.datatype import DataType\n", + "from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN\n", "\n", "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/end2end_example/cybersecurity\"\n", "ready_model_filename = model_dir + \"/cybsec-mlp-ready.onnx\"\n", - "model_for_sim = ModelWrapper(ready_model_filename)" + "\n", + "# ModelWrapper\n", + "model = ModelWrapper(ready_model_filename)\n", + "\n", + "print(\"Model loaded from %s\" % ready_model_filename)" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To import the model into FINN, we will need to use the `ConvertQONNXtoFINN` transformation. But before that, let us use some of the member functions exposed by `ModelWrapper` to see what kind of information we can extract from it and have a baseline to compare to when we do call the `ConvertQONNXtoFINN` transformation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dir(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Many of these helper functions relate to extracting information about the structure and properties of the ONNX model. You can find out more about examining and manipulating ONNX models programmatically in [this tutorial](../../basics/0_how_to_work_with_onnx.ipynb), but we'll show a few basic functions here. For instance, we can extract the shape and datatype annotation for various tensors in the graph, as well as information related to the operation types associated with each node. We will do this now."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from qonnx.core.datatype import DataType\n", + "\n", + "in_tensor_name = model.graph.input[0].name\n", + "out_tensor_name = model.graph.output[0].name\n", + "print(\"Input tensor name: %s\" % in_tensor_name)\n", + "print(\"Output tensor name: %s\" % out_tensor_name)\n", + "model_in_shape = model.get_tensor_shape(in_tensor_name)\n", + "model_out_shape = model.get_tensor_shape(out_tensor_name)\n", + "print(\"Input tensor shape: %s\" % str(model_in_shape))\n", + "print(\"Output tensor shape: %s\" % str(model_out_shape))\n", + "model_in_dt = model.get_tensor_datatype(in_tensor_name)\n", + "model_out_dt = model.get_tensor_datatype(out_tensor_name)\n", + "print(\"Input tensor datatype: %s\" % str(model_in_dt.name))\n", + "print(\"Output tensor datatype: %s\" % str(model_out_dt.name))\n", + "print(\"List of node operator types in the graph: \")\n", + "print([x.op_type for x in model.graph.node])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's have a look at some of the member functions exposed by `ModelWrapper` to see what kind of information we can extract from it." + "Note that the input and output tensors are (as of yet) marked as float32 values, even though we know they are binary. The output datatype will get inferred when we call the `ConvertQONNXtoFINN` transformation, which internally features an `InferDataTypes` transformation, while we will adjust the input manually with the `set_tensor_datatype` function." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.2 Converting from QONNX to FINN-ONNX using ConvertQONNXtoFINN\n", + "\n", + "At this point, we would like to move from the QONNX IR onto the FINN-ONNX IR. We can do this by using the `ConvertQONNXtoFINN()` function on a QONNX model."
] }, { @@ -101,27 +179,51 @@ "source": [ "from qonnx.core.datatype import DataType\n", "\n", - "finnonnx_in_tensor_name = model_for_sim.graph.input[0].name\n", - "finnonnx_out_tensor_name = model_for_sim.graph.output[0].name\n", - "print(\"Input tensor name: %s\" % finnonnx_in_tensor_name)\n", - "print(\"Output tensor name: %s\" % finnonnx_out_tensor_name)\n", - "finnonnx_model_in_shape = model_for_sim.get_tensor_shape(finnonnx_in_tensor_name)\n", - "finnonnx_model_out_shape = model_for_sim.get_tensor_shape(finnonnx_out_tensor_name)\n", - "print(\"Input tensor shape: %s\" % str(finnonnx_model_in_shape))\n", - "print(\"Output tensor shape: %s\" % str(finnonnx_model_out_shape))\n", - "finnonnx_model_in_dt = model_for_sim.get_tensor_datatype(finnonnx_in_tensor_name)\n", - "finnonnx_model_out_dt = model_for_sim.get_tensor_datatype(finnonnx_out_tensor_name)\n", - "print(\"Input tensor datatype: %s\" % str(finnonnx_model_in_dt.name))\n", - "print(\"Output tensor datatype: %s\" % str(finnonnx_model_out_dt.name))\n", + "in_tensor_name = model.graph.input[0].name\n", + "out_tensor_name = model.graph.output[0].name\n", + "print(\"Input tensor name: %s\" % in_tensor_name)\n", + "print(\"Output tensor name: %s\" % out_tensor_name)\n", + "model_in_shape = model.get_tensor_shape(in_tensor_name)\n", + "model_out_shape = model.get_tensor_shape(out_tensor_name)\n", + "print(\"Input tensor shape: %s\" % str(model_in_shape))\n", + "print(\"Output tensor shape: %s\" % str(model_out_shape))\n", + "model_in_dt = model.get_tensor_datatype(in_tensor_name)\n", + "model_out_dt = model.get_tensor_datatype(out_tensor_name)\n", + "print(\"Input tensor datatype: %s\" % str(model_in_dt.name))\n", + "print(\"Output tensor datatype: %s\" % str(model_out_dt.name))\n", "print(\"List of node operator types in the graph: \")\n", - "print([x.op_type for x in model_for_sim.graph.node])" + "print([x.op_type for x in model.graph.node])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that the input and output tensor datatypes now correctly show `BIPOLAR` while the operator types have also heavily changed compared to the QONNX version. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`ConvertQONNXtoFINN` internally called many transformations which change the operators in such a manner and we can actually peek at the source code to see them using the `showSrc` function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from finn.util.visualization import showSrc\n", + "showSrc(ConvertQONNXtoFINN.apply)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Note that the output tensor is (as of yet) marked as a float32 value, even though we know the output is binary. This will be automatically inferred by the compiler in the next step when we run the `InferDataTypes` transformation." + "As we can see, `ConvertQONNXtoFINN` turned `Gemm` operation into `MatMuls` using the `GemmToMatMul()` transform and turned `Quant` nodes into `MultiThreshold` nodes using the `ConvertQuantActToMultiThreshold()` transform to name a few. However, these nodes do need further transformations before they can be turned into FPGA operators." 
] }, { @@ -146,15 +248,15 @@ "from qonnx.transformation.infer_datatypes import InferDataTypes\n", "from qonnx.transformation.fold_constants import FoldConstants\n", "\n", - "model_for_sim = model_for_sim.transform(InferShapes())\n", - "model_for_sim = model_for_sim.transform(FoldConstants())\n", - "model_for_sim = model_for_sim.transform(GiveUniqueNodeNames())\n", - "model_for_sim = model_for_sim.transform(GiveReadableTensorNames())\n", - "model_for_sim = model_for_sim.transform(InferDataTypes())\n", - "model_for_sim = model_for_sim.transform(RemoveStaticGraphInputs())\n", + "model = model.transform(InferShapes())\n", + "model = model.transform(FoldConstants())\n", + "model = model.transform(GiveUniqueNodeNames())\n", + "model = model.transform(GiveReadableTensorNames())\n", + "model = model.transform(InferDataTypes())\n", + "model = model.transform(RemoveStaticGraphInputs())\n", "\n", "verif_model_filename = model_dir + \"/cybsec-mlp-verification.onnx\"\n", - "model_for_sim.save(verif_model_filename)" + "model.save(verif_model_filename)" ] }, { @@ -309,22 +411,22 @@ "import finn.core.onnx_exec as oxe\n", "\n", "def inference_with_finn_onnx(current_inp):\n", - " finnonnx_in_tensor_name = model_for_sim.graph.input[0].name\n", - " finnonnx_model_in_shape = model_for_sim.get_tensor_shape(finnonnx_in_tensor_name)\n", - " finnonnx_out_tensor_name = model_for_sim.graph.output[0].name\n", + " in_tensor_name = model.graph.input[0].name\n", + " model_in_shape = model.get_tensor_shape(in_tensor_name)\n", + " out_tensor_name = model.graph.output[0].name\n", " # convert input to numpy for FINN\n", " current_inp = current_inp.detach().numpy()\n", " # add padding and re-scale to bipolar\n", " current_inp = np.pad(current_inp, [(0, 0), (0, 7)])\n", " current_inp = 2*current_inp-1\n", " # reshape to expected input (add 1 for batch dimension)\n", - " current_inp = current_inp.reshape(finnonnx_model_in_shape)\n", + " current_inp = current_inp.reshape(model_in_shape)\n", " # create the input dictionary\n", - " input_dict = {finnonnx_in_tensor_name : current_inp} \n", + " input_dict = {in_tensor_name : current_inp} \n", " # run with FINN's execute_onnx\n", - " output_dict = oxe.execute_onnx(model_for_sim, input_dict)\n", + " output_dict = oxe.execute_onnx(model, input_dict)\n", " #get the output tensor\n", - " finn_output = output_dict[finnonnx_out_tensor_name] \n", + " finn_output = output_dict[out_tensor_name] \n", " return finn_output" ] }, diff --git a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb index 73cd25cf20..28702d0286 100644 --- a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb +++ b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb @@ -78,7 +78,7 @@ "### Configuring the Board and FPGA Part \n", "\n", "* `fpga_part`: Xilinx FPGA part to be used for synthesis, can be left unspecified to be inferred from `board` below, or specified explicitly for e.g. out-of-context synthesis.\n", - "* `board`: target Xilinx Zynq or Alveo board for generating accelerators integrated into a shell. See the `pynq_part_map` and `alveo_part_map` dicts in [this file](https://github.com/Xilinx/finn-base/blob/dev/src/finn/util/basic.py#L41) for a list of possible boards.\n", + "* `board`: target Xilinx Zynq or Alveo board for generating accelerators integrated into a shell. 
See the `pynq_part_map` and `alveo_part_map` dicts in [this file](https://github.com/Xilinx/finn/blob/dev/src/finn/util/basic.py#L39) for a list of possible boards.\n", "* `shell_flow_type`: the target [shell flow type](https://finn-dev.readthedocs.io/en/latest/source_code/finn.builder.html#finn.builder.build_dataflow_config.ShellFlowType), only needed for generating full bitfiles where the FINN design is integrated into a shell (so only needed if `BITFILE` is selected) \n", "\n", "### Configuring the Performance \n", diff --git a/requirements.txt b/requirements.txt index c2973f9432..1683695576 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,16 +5,15 @@ gspread==3.6.0 importlib-resources==6.1.0 ipython==8.12.2 numpy==1.24.1 -onnx==1.13.0 +onnx==1.17.0 onnxoptimizer -onnxruntime==1.16.1 +onnxruntime==1.18.1 pre-commit==3.3.2 protobuf==3.20.3 psutil==5.9.4 pyscaffold==4.4 scipy==1.10.1 setupext-janitor>=1.1.2 -setuptools==68.2.2 sigtools==4.0.1 toposort==1.7.0 vcdvcd==1.0.5 diff --git a/run-docker.sh b/run-docker.sh index e732492728..ec55299f6c 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -100,6 +100,9 @@ SCRIPTPATH=$(dirname "$SCRIPT") : ${NVIDIA_VISIBLE_DEVICES=""} : ${DOCKER_BUILDKIT="1"} : ${FINN_SINGULARITY=""} +: ${FINN_SKIP_XRT_DOWNLOAD=""} +: ${FINN_XRT_PATH=""} +: ${FINN_DOCKER_NO_CACHE="0"} DOCKER_INTERACTIVE="" @@ -140,7 +143,7 @@ elif [ "$1" = "build_custom" ]; then DOCKER_INTERACTIVE="-it" #FINN_HOST_BUILD_DIR=$BUILD_DATAFLOW_DIR/build gecho "Running build_custom: $BUILD_CUSTOM_DIR/$FLOW_NAME.py" - DOCKER_CMD="python -mpdb -cc -cq $FLOW_NAME.py" + DOCKER_CMD="python -mpdb -cc -cq $FLOW_NAME.py ${@:4}" elif [ -z "$1" ]; then gecho "Running container only" DOCKER_CMD="bash" @@ -181,14 +184,31 @@ if [ "$FINN_SKIP_DEP_REPOS" = "0" ]; then ./fetch-repos.sh fi +# If xrt path given, copy .deb file to this repo +# Be aware that we assume a certain name of the xrt deb version +if [ -d "$FINN_XRT_PATH" ];then + cp $FINN_XRT_PATH/$XRT_DEB_VERSION.deb . + export LOCAL_XRT=1 +fi + +if [ "$FINN_DOCKER_NO_CACHE" = "1" ]; then + FINN_DOCKER_BUILD_EXTRA+="--no-cache " +fi + # Build the FINN Docker image if [ "$FINN_DOCKER_PREBUILT" = "0" ] && [ -z "$FINN_SINGULARITY" ]; then # Need to ensure this is done within the finn/ root folder: OLD_PWD=$(pwd) cd $SCRIPTPATH - docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA . + docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --build-arg SKIP_XRT=$FINN_SKIP_XRT_DOWNLOAD --build-arg LOCAL_XRT=$LOCAL_XRT --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA . cd $OLD_PWD fi + +# Remove local xrt.deb file from repo +if [ ! 
-z "$LOCAL_XRT" ];then + rm $XRT_DEB_VERSION.deb +fi + # Launch container with current directory mounted # important to pass the --init flag here for correct Vivado operation, see: # https://stackoverflow.com/questions/55733058/vivado-synthesis-hangs-in-docker-container-spawned-by-jenkins @@ -211,6 +231,9 @@ DOCKER_EXEC+="-e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS " # Workaround for FlexLM issue, see: # https://community.flexera.com/t5/InstallAnywhere-Forum/Issues-when-running-Xilinx-tools-or-Other-vendor-tools-in-docker/m-p/245820#M10647 DOCKER_EXEC+="-e LD_PRELOAD=/lib/x86_64-linux-gnu/libudev.so.1 " +# Workaround for running multiple Vivado instances simultaneously, see: +# https://adaptivesupport.amd.com/s/article/63253?language=en_US +DOCKER_EXEC+="-e XILINX_LOCAL_USER_DATA=no " if [ "$FINN_DOCKER_RUN_AS_ROOT" = "0" ] && [ -z "$FINN_SINGULARITY" ];then DOCKER_EXEC+="-v /etc/group:/etc/group:ro " DOCKER_EXEC+="-v /etc/passwd:/etc/passwd:ro " @@ -250,6 +273,36 @@ if [ ! -z "$FINN_XILINX_PATH" ];then DOCKER_EXEC+="-e ALVEO_TARGET_DIR=$ALVEO_TARGET_DIR " fi fi + +# This part is used for internal ci for finn-examples +# if using build verification for finn-examples ci, set up the necessary Docker variables +if [ "$VERIFICATION_EN" = 1 ]; then + if [ -z "$FINN_EXAMPLES_ROOT" ]; then + recho "FINN_EXAMPLES_ROOT path has not been set." + recho "Please set FINN_EXAMPLES_ROOT path to enable verification." + exit -1 + elif [ ! -d "${FINN_EXAMPLES_ROOT}/ci" ]; then + recho "ci folder not found in ${FINN_EXAMPLES_ROOT}." + recho "Please ensure the FINN-examples repo has been set up correctly, and FINN_EXAMPLES_ROOT path is set correctly, to enable verification." + exit -1 + elif [ -z "$VERIFICATION_IO" ]; then + recho "VERIFICATION_IO paths has not been set." + recho "Please ensure the path to the input and expected output files has been set correctly to eneable verification." + exit -1 + elif [ ! -d "$VERIFICATION_IO" ]; then + recho "${VERIFICATION_IO} is not a directory." + recho "Please ensure the VERIFICATION_IO path has been set to the directory containing the input and expected output files for verification." 
+ exit -1 + else + DOCKER_EXEC+="-e VERIFICATION_EN=$VERIFICATION_EN " + DOCKER_EXEC+="-e FINN_EXAMPLES_ROOT=$FINN_EXAMPLES_ROOT " + DOCKER_EXEC+="-e VERIFICATION_IO=$VERIFICATION_IO " + FINN_DOCKER_EXTRA+="-v $FINN_EXAMPLES_ROOT/ci:$FINN_EXAMPLES_ROOT/ci " + FINN_DOCKER_EXTRA+="-v $VERIFICATION_IO:$VERIFICATION_IO " + fi +fi + + DOCKER_EXEC+="$FINN_DOCKER_EXTRA " if [ -z "$FINN_SINGULARITY" ];then diff --git a/src/finn/analysis/fpgadataflow/post_synth_res.py b/src/finn/analysis/fpgadataflow/post_synth_res.py index 7b65b60fa7..f7a3e6e2ba 100644 --- a/src/finn/analysis/fpgadataflow/post_synth_res.py +++ b/src/finn/analysis/fpgadataflow/post_synth_res.py @@ -58,14 +58,13 @@ def post_synth_res(model, override_synth_report_filename=None): else: raise Exception("Please run synthesis first") - # TODO build these indices based on table headers instead of harcoding restype_to_ind_default = { "LUT": 2, "SRL": 5, "FF": 6, "BRAM_36K": 7, "BRAM_18K": 8, - "DSP48": 9, + "DSP": 10, } restype_to_ind_vitis = { "LUT": 4, @@ -74,13 +73,36 @@ def post_synth_res(model, override_synth_report_filename=None): "BRAM_36K": 9, "BRAM_18K": 10, "URAM": 11, - "DSP48": 12, + "DSP": 12, } - if model.get_metadata_prop("platform") == "alveo": - restype_to_ind = restype_to_ind_vitis + # format: (human_readable_name_in_report, canonical_name) + res_types_to_search = [ + ("Total LUTs", "LUT"), + ("SRLs", "SRL"), + ("FFs", "FF"), + ("RAMB36", "BRAM_36K"), + ("RAMB18", "BRAM_18K"), + ("URAM", "URAM"), + ("DSP Blocks", "DSP"), + ] + + # try to infer resource type to table index by + # looking at the names in headings + header_row = root.findall(".//*[@contents='Instance']/..") + if header_row != []: + headers = [x.attrib["contents"] for x in list(header_row[0])] + restype_to_ind = {} + for res_type_name, res_type in res_types_to_search: + if res_type_name in headers: + restype_to_ind[res_type] = headers.index(res_type_name) else: - restype_to_ind = restype_to_ind_default + # could not infer resource types from header + # fall back to default indices + if model.get_metadata_prop("platform") == "alveo": + restype_to_ind = restype_to_ind_vitis + else: + restype_to_ind = restype_to_ind_default def get_instance_stats(inst_name): row = root.findall(".//*[@contents='%s']/.." % inst_name) diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py index a6be1f1f53..fb12eed837 100644 --- a/src/finn/analysis/fpgadataflow/res_estimation.py +++ b/src/finn/analysis/fpgadataflow/res_estimation.py @@ -31,7 +31,7 @@ from finn.util.fpgadataflow import is_hls_node, is_rtl_node -def res_estimation(model): +def res_estimation(model, fpgapart): """Estimates the resources needed for the given model. Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames transformation) prior to calling this analysis pass to ensure all nodes are @@ -43,12 +43,12 @@ def res_estimation(model): for node in model.graph.node: if is_hls_node(node) or is_rtl_node(node): inst = registry.getCustomOp(node) - res_dict[node.name] = inst.node_res_estimation() + res_dict[node.name] = inst.node_res_estimation(fpgapart) return res_dict -def res_estimation_complete(model): +def res_estimation_complete(model, fpgapart): """Estimates the resources needed for the given model and all values for resource-related switches. 
Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames @@ -66,21 +66,21 @@ def res_estimation_complete(model): orig_restype = inst.get_nodeattr("resType") res_dict[node.name] = [] inst.set_nodeattr("resType", "dsp") - res_dict[node.name].append(inst.node_res_estimation()) + res_dict[node.name].append(inst.node_res_estimation(fpgapart)) inst.set_nodeattr("resType", "lut") - res_dict[node.name].append(inst.node_res_estimation()) + res_dict[node.name].append(inst.node_res_estimation(fpgapart)) inst.set_nodeattr("resType", orig_restype) elif op_type.startswith("ConvolutionInputGenerator"): orig_ramstyle = inst.get_nodeattr("ram_style") res_dict[node.name] = [] inst.set_nodeattr("ram_style", "block") - res_dict[node.name].append(inst.node_res_estimation()) + res_dict[node.name].append(inst.node_res_estimation(fpgapart)) inst.set_nodeattr("ram_style", "distributed") - res_dict[node.name].append(inst.node_res_estimation()) + res_dict[node.name].append(inst.node_res_estimation(fpgapart)) inst.set_nodeattr("ram_style", "ultra") - res_dict[node.name].append(inst.node_res_estimation()) + res_dict[node.name].append(inst.node_res_estimation(fpgapart)) inst.set_nodeattr("ram_style", orig_ramstyle) else: - res_dict[node.name] = [inst.node_res_estimation()] + res_dict[node.name] = [inst.node_res_estimation(fpgapart)] return res_dict diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index e35c1cd346..d6437a2e5c 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -35,7 +35,7 @@ from typing import Any, List, Optional from finn.transformation.fpgadataflow.vitis_build import VitisOptStrategy -from finn.util.basic import alveo_default_platform, alveo_part_map, pynq_part_map +from finn.util.basic import alveo_default_platform, part_map class AutoFIFOSizingMethod(str, Enum): @@ -96,6 +96,8 @@ class VerificationStepType(str, Enum): STREAMLINED_PYTHON = "streamlined_python" #: verify after step_apply_folding_config, using C++ for each HLS node FOLDED_HLS_CPPSIM = "folded_hls_cppsim" + #: verify after step_hw_ipgen + NODE_BY_NODE_RTLSIM = "node_by_node_rtlsim" #: verify after step_create_stitched_ip, using stitched-ip Verilog STITCHED_IP_RTLSIM = "stitched_ip_rtlsim" @@ -368,11 +370,10 @@ def _resolve_driver_platform(self): def _resolve_fpga_part(self): if self.fpga_part is None: # lookup from part map if not specified - if self.shell_flow_type == ShellFlowType.VIVADO_ZYNQ: - return pynq_part_map[self.board] - elif self.shell_flow_type == ShellFlowType.VITIS_ALVEO: - return alveo_part_map[self.board] - else: + try: + fpga_part = part_map[self.board] + return fpga_part + except KeyError: raise Exception("Couldn't resolve fpga_part for " + self.board) else: # return as-is when explicitly specified diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 443d2df54c..ab2280554c 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -34,6 +34,7 @@ import warnings from copy import deepcopy from distutils.dir_util import copy_tree +from functools import partial from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount @@ -62,6 +63,7 @@ aggregate_dict_keys, op_and_param_counts, ) +from finn.analysis.fpgadataflow.post_synth_res import post_synth_res from 
finn.analysis.fpgadataflow.res_estimation import ( res_estimation, res_estimation_complete, @@ -119,6 +121,7 @@ ) from finn.transformation.streamline import Streamline from finn.transformation.streamline.reorder import MakeMaxPoolNHWC +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.util.basic import ( get_rtlsim_trace_depth, pyverilate_get_liveness_threshold_cycles, @@ -277,7 +280,7 @@ def step_qonnx_to_finn(model: ModelWrapper, cfg: DataflowBuildConfig): ) if VerificationStepType.QONNX_TO_FINN_PYTHON in cfg._resolve_verification_steps(): - verify_step(model, cfg, "qonnx_to_finn_python", need_parent=False) + verify_step(model, cfg, "finn_onnx_python", need_parent=False) return model @@ -469,11 +472,15 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig estimate_layer_cycles = model.analysis(exp_cycles_per_layer) with open(report_dir + "/estimate_layer_cycles.json", "w") as f: json.dump(estimate_layer_cycles, f, indent=2) - estimate_layer_resources = model.analysis(res_estimation) + estimate_layer_resources = model.analysis( + partial(res_estimation, fpgapart=cfg._resolve_fpga_part()) + ) estimate_layer_resources["total"] = aggregate_dict_keys(estimate_layer_resources) with open(report_dir + "/estimate_layer_resources.json", "w") as f: json.dump(estimate_layer_resources, f, indent=2) - estimate_layer_resources_complete = model.analysis(res_estimation_complete) + estimate_layer_resources_complete = model.analysis( + partial(res_estimation_complete, fpgapart=cfg._resolve_fpga_part()) + ) with open(report_dir + "/estimate_layer_config_alternatives.json", "w") as f: json.dump(estimate_layer_resources_complete, f, indent=2) # need to call AnnotateCycles before dataflow_performance @@ -497,6 +504,7 @@ def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig): if cfg.minimize_bit_width: model = model.transform(MinimizeWeightBitWidth()) model = model.transform(MinimizeAccumulatorWidth()) + model = model.transform(RoundAndClipThresholds()) # make sure the changed datatypes are propagated through the network model = model.transform(InferDataTypes()) return model @@ -521,6 +529,11 @@ def step_hw_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig): estimate_layer_resources_hls = model.analysis(hls_synth_res_estimation) with open(report_dir + "/estimate_layer_resources_hls.json", "w") as f: json.dump(estimate_layer_resources_hls, f, indent=2) + + if VerificationStepType.NODE_BY_NODE_RTLSIM in cfg._resolve_verification_steps(): + model = model.transform(PrepareRTLSim()) + model = model.transform(SetExecMode("rtlsim")) + verify_step(model, cfg, "node_by_node_rtlsim", need_parent=True) return model @@ -540,7 +553,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): if cfg.auto_fifo_depths: if cfg.auto_fifo_strategy == "characterize": model = model.transform(InsertDWC()) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(cfg._resolve_fpga_part())) model = model.transform(GiveUniqueNodeNames()) model = model.transform( PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) @@ -558,7 +571,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): create_shallow_fifos=True, ) ) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(cfg._resolve_fpga_part())) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) elif cfg.auto_fifo_strategy == 
"largefifo_rtlsim": @@ -590,7 +603,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): # need to make sure all FIFOs are created so that their depth can be # set by ApplyConfig, so create_shallow_fifos=True model = model.transform(InsertFIFO(create_shallow_fifos=True)) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(cfg._resolve_fpga_part())) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) if cfg.folding_config_file is not None: @@ -655,7 +668,7 @@ def step_create_stitched_ip(model: ModelWrapper, cfg: DataflowBuildConfig): estimate_network_performance = verify_model.analysis(dataflow_performance) prev_liveness = pyverilate_get_liveness_threshold_cycles() os.environ["LIVENESS_THRESHOLD"] = str( - int(estimate_network_performance["critical_path_cycles"]) + int(estimate_network_performance["critical_path_cycles"] * 1.1) ) if cfg.verify_save_rtlsim_waveforms: report_dir = cfg.output_dir + "/report" @@ -801,6 +814,11 @@ def step_synthesize_bitfile(model: ModelWrapper, cfg: DataflowBuildConfig): model.get_metadata_prop("vivado_synth_rpt"), report_dir + "/post_synth_resources.xml", ) + + post_synth_resources = model.analysis(post_synth_res) + with open(report_dir + "/post_synth_resources.json", "w") as f: + json.dump(post_synth_resources, f, indent=2) + vivado_pynq_proj_dir = model.get_metadata_prop("vivado_pynq_proj") timing_rpt = ( "%s/finn_zynq_link.runs/impl_1/top_wrapper_timing_summary_routed.rpt" @@ -825,6 +843,10 @@ def step_synthesize_bitfile(model: ModelWrapper, cfg: DataflowBuildConfig): model.get_metadata_prop("vivado_synth_rpt"), report_dir + "/post_synth_resources.xml", ) + + post_synth_resources = model.analysis(post_synth_res) + with open(report_dir + "/post_synth_resources.json", "w") as f: + json.dump(post_synth_resources, f, indent=2) else: raise Exception("Unrecognized shell_flow_type: " + str(cfg.shell_flow_type)) print("Bitfile written into " + bitfile_dir) diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py index 96f49069c7..1fb4940fb4 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import numpy as np +import warnings from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper @@ -141,6 +142,27 @@ def infer_node_datatype(self, model): node = self.onnx_node # data type stays the same dtype = model.get_tensor_datatype(node.input[0]) + + # Test for changing input datatype + if dtype != self.get_nodeattr("inputDataType"): + # Issue a warning message + warnings.warn( + f"{node.name}: inputDataType changing from" + f" {self.get_nodeattr('inputDataType')} to {dtype}" + ) + # Set the new datatype attribute + self.set_nodeattr("inputDataType", dtype.name) + + # Test for changing output datatype + if dtype != self.get_nodeattr("outputDataType"): + # Issue a warning message + warnings.warn( + f"{node.name}: outputDataType changing from" + f" {self.get_nodeattr('outputDataType')} to {dtype}" + ) + # Set the new datatype attribute + self.set_nodeattr("outputDataType", dtype.name) + # Propagate the datatype through the model graph model.set_tensor_datatype(node.output[0], dtype) def verify_node(self): diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py index 94f8cc0845..cae1c30eb6 100644 --- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -118,7 +118,7 @@ def lut_estimation(self): c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 ) - def dsp_estimation(self): + def dsp_estimation(self, fpgapart): # multiplication P = self.get_nodeattr("PE") res_type = self.get_nodeattr("resType") diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py index d1f58d3e87..4619a1756b 100644 --- a/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py @@ -78,12 +78,6 @@ def strm_decl(self): self.get_instream_width(), self.hls_sname(), self.hls_sname() ) ) - if self.needs_lcm(): - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> intermediate ("intermediate");'.format( - self.get_iowidth_lcm() - ) - ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream> out_{} ("out_{}");'.format( self.get_outstream_width(), self.hls_sname(), self.hls_sname() diff --git a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py index 3e10b640c5..f9ba68e6b6 100644 --- a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py @@ -112,7 +112,7 @@ def lut_estimation(self): c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 ) - def dsp_estimation(self): + def dsp_estimation(self, fpgapart): # multiplication P = self.get_nodeattr("PE") res_type = self.get_nodeattr("resType") diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py index d8210fd684..4677960ea8 100644 --- a/src/finn/custom_op/fpgadataflow/hlsbackend.py +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -54,6 +54,8 @@ def get_nodeattr_types(self): "code_gen_dir_cppsim": ("s", False, ""), "executable_path": ("s", False, ""), "res_hls": ("s", False, ""), + # temporary node attribute to keep track of interface style of 
hls ops + "cpp_interface": ("s", False, "packed", {"packed", "hls_vector"}), } def get_all_verilog_paths(self): @@ -206,7 +208,13 @@ def code_generation_cppsim(self, model): self.dataoutstrm() self.save_as_npy() - template = templates.docompute_template + if self.get_nodeattr("cpp_interface") == "hls_vector": + self.timeout_value() + self.timeout_condition() + self.timeout_read_stream() + template = templates.docompute_template_timeout + else: + template = templates.docompute_template for key in self.code_gen_dict: # transform list into long string separated by '\n' @@ -371,24 +379,40 @@ def read_npy_data(self): if dtype == DataType["BIPOLAR"]: # use binary for bipolar storage dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits elem_hls_type = dtype.get_hls_datatype_str() npy_type = "float" npy_in = "%s/input_0.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), + + cpp_interface = self.get_nodeattr("cpp_interface") + + if cpp_interface == "packed": + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + else: + folded_shape = self.get_folded_input_shape() + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2vectorstream<%s, %s, %d>("%s", in0_%s, false);' + % ( + elem_hls_type, + npy_type, + folded_shape[-1], + npy_in, + self.hls_sname(), + ) ) - ) def strm_decl(self): """Function to generate the commands for the stream declaration in c++, @@ -422,27 +446,43 @@ def dataoutstrm(self): if dtype == DataType["BIPOLAR"]: # use binary for bipolar storage dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits elem_hls_type = dtype.get_hls_datatype_str() npy_type = "float" npy_out = "%s/output.npy" % code_gen_dir oshape = self.get_folded_output_shape() oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] + cpp_interface = self.get_nodeattr("cpp_interface") + + if cpp_interface == "packed": + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + ) + ] + else: + folded_shape = self.get_folded_output_shape() + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'vectorstream2npy<%s, %s, %d>(strm, %s, "%s");' + % ( + elem_hls_type, + npy_type, + folded_shape[-1], + oshape_cpp_str, + npy_out, + ) + ] def save_as_npy(self): """Function to generate the commands for saving data in .npy file in c++""" @@ -474,3 +514,17 @@ def get_ap_int_max_w(self): ret = max([instream, outstream]) assert ret <= 8191, "AP_INT_MAX_W=%d is larger than allowed maximum of 
8191" % ret return ret + + def timeout_value(self): + """Set timeout value for HLS functions defined for one clock cycle""" + self.code_gen_dict["$TIMEOUT_VALUE$"] = ["1000"] + + def timeout_condition(self): + """Set timeout condition for HLS functions defined for one clock cycle""" + self.code_gen_dict["$TIMEOUT_CONDITION$"] = ["out_{}.empty()".format(self.hls_sname())] + + def timeout_read_stream(self): + """Set reading output stream procedure for HLS functions defined for one clock cycle""" + self.code_gen_dict["$TIMEOUT_READ_STREAM$"] = [ + "strm << out_{}.read();".format(self.hls_sname()) + ] diff --git a/src/finn/custom_op/fpgadataflow/hwcustomop.py b/src/finn/custom_op/fpgadataflow/hwcustomop.py index 57c0fec067..b40b8f3074 100644 --- a/src/finn/custom_op/fpgadataflow/hwcustomop.py +++ b/src/finn/custom_op/fpgadataflow/hwcustomop.py @@ -136,7 +136,7 @@ def get_rtlsim(self): sim = PyVerilator(rtlsim_so) return sim - def node_res_estimation(self): + def node_res_estimation(self, fpgapart): """Returns summarized resource estimation of BRAMs and LUTs of the node as a dictionary.""" ret = dict() @@ -145,7 +145,7 @@ def node_res_estimation(self): ret["LUT"] = self.lut_estimation() ret["URAM"] = self.uram_estimation() ret["URAM_efficiency"] = self.uram_efficiency_estimation() - ret["DSP"] = self.dsp_estimation() + ret["DSP"] = self.dsp_estimation(fpgapart) return ret def bram_efficiency_estimation(self): @@ -173,7 +173,7 @@ def lut_estimation(self): HWCustomOp class but has to be filled by every node""" return 0 - def dsp_estimation(self): + def dsp_estimation(self, fpgapart): """Function for DSP resource estimation, is member function of HWCustomOp class but has to be filled by every node""" return 0 diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 7bbe4c04e9..8f0a987bce 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -130,6 +130,8 @@ def get_nodeattr_types(self): def execute_node(self, context, graph): node = self.onnx_node in_act = context[node.input[0]] + # ensure that shape is compatible + in_act = in_act.reshape(self.get_normal_input_shape()) mvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0] mvau_w = np_helper.to_array(mvau_w_init) # Matrix multiplication @@ -159,8 +161,8 @@ def execute_node(self, context, graph): if result.ndim == 4: # NCHW to NHWC result = result.transpose((0, 2, 3, 1)) - - context[node.output[0]] = result + oshape = context[node.output[0]].shape + context[node.output[0]] = result.reshape(oshape) def verify_node(self): info_messages = [] diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py index d48b3a918d..d9ab501117 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -32,7 +32,7 @@ from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.basic import get_dsp_block, get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy try: @@ -55,10 +55,7 @@ def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - 
my_attrs = { - # Flag to indicate if Versal device is targeted - "is_versal": ("i", False, 0, {0, 1}), - } + my_attrs = {} my_attrs.update(MVAU.get_nodeattr_types(self)) my_attrs.update(RTLBackend.get_nodeattr_types(self)) return my_attrs @@ -137,11 +134,12 @@ def execute_node(self, context, graph): def lut_estimation(self): return 0 - def dsp_estimation(self): + def dsp_estimation(self, fpgapart): # multiplication P = self.get_nodeattr("PE") Q = self.get_nodeattr("SIMD") - if self.get_nodeattr("is_versal"): + dsp_block = get_dsp_block(fpgapart) + if dsp_block == "DSP58": mult_dsp = P * np.ceil(Q / 3) else: mult_dsp = np.ceil(P / 4) * Q @@ -161,14 +159,24 @@ def instantiate_ip(self, cmd): ] for f in sourcefiles: cmd.append("add_files -norecurse %s" % (f)) - cmd.append( - "create_bd_cell -type hier -reference %s /%s/%s" - % ( - self.get_nodeattr("gen_top_module"), - self.onnx_node.name, - self.onnx_node.name, + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "internal_decoupled": + cmd.append( + "create_bd_cell -type hier -reference %s /%s/%s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + self.onnx_node.name, + ) + ) + else: + cmd.append( + "create_bd_cell -type hier -reference %s %s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + ) ) - ) def _resolve_segment_len(self, clk): # Insert pipeline registers in the DSP58 chain to meet target clock frequency @@ -186,7 +194,7 @@ def _resolve_segment_len(self, clk): dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len return dsp_chain_len - def _resolve_impl_style(self, fpgapart): + def _resolve_impl_style(self, dsp_block): # Based on target device and activation/weight-width, choose the # supported RTL compute core assert ( @@ -198,15 +206,18 @@ def _resolve_impl_style(self, fpgapart): act_width = self.get_input_datatype(0).bitwidth() weight_width = self.get_input_datatype(1).bitwidth() - is_versal_family = self.get_nodeattr("is_versal") - if is_versal_family: - return "mvu_vvu_8sx9_dsp58" + if dsp_block == "DSP58": + if act_width <= 4 and weight_width <= 4: + return "mvu_4sx4u_dsp48e2" + else: + return "mvu_vvu_8sx9_dsp58" else: - act_width = self.get_input_datatype(0).bitwidth() - weight_width = self.get_input_datatype(1).bitwidth() - if (act_width == 4 and weight_width == 4) and not (is_versal_family): - return "mvu_4sx4u" + if act_width <= 4 and weight_width <= 4: + if dsp_block == "DSP48E1": + return "mvu_4sx4u_dsp48e1" + elif dsp_block == "DSP48E2": + return "mvu_4sx4u_dsp48e2" else: return "mvu_8sx8u_dsp48" @@ -216,6 +227,11 @@ def generate_hdl(self, model, fpgapart, clk): self.generate_params(model, code_gen_dir) template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk) + # determine if weights are narrow range and add parameter to code gen dict + weights = model.get_initializer(self.onnx_node.input[1]) + wdt = self.get_weight_datatype() + narrow_weights = 0 if np.min(weights) == wdt.min() else 1 + code_gen_dict["$NARROW_WEIGHTS$"] = str(narrow_weights) # add general parameters to dictionary code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] # save top module name so we can refer to it after this node has been renamed @@ -248,9 +264,10 @@ def generate_hdl(self, model, fpgapart, clk): def prepare_codegen_default(self, fpgapart, clk): template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v" + dsp_block = get_dsp_block(fpgapart) code_gen_dict = {} code_gen_dict["$IS_MVU$"] = 
[str(1)] - code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)] + code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(dsp_block)] code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))] code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))] code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py index dfae607622..f8f27cb647 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py @@ -95,7 +95,7 @@ def generate_hdl(self, model, fpgapart, clk): code_gen_dict["$TOP_MODULE_NAME$"] = topname # make instream width a multiple of 8 for axi interface in_width = self.get_instream_width_padded() - count_width = int(self.get_nodeattr("depth") - 1).bit_length() + count_width = int(self.get_nodeattr("depth")).bit_length() code_gen_dict["$COUNT_RANGE$"] = "[{}:0]".format(count_width - 1) code_gen_dict["$IN_RANGE$"] = "[{}:0]".format(in_width - 1) code_gen_dict["$OUT_RANGE$"] = "[{}:0]".format(in_width - 1) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 67b41d0165..d1e9387b1b 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -167,41 +167,61 @@ def prepare_codegen_rtl_values(self, model): their key value(s) in the RTL template files""" code_gen_dict = {} - # TODO check for sortedness and size here? thresholds = model.get_initializer(self.onnx_node.input[1]) bias = self.get_nodeattr("ActVal") # activation bias value output_data_type = self.get_nodeattr("outputDataType") # output precision input_data_type = self.get_nodeattr("inputDataType") # input/threshold precision o_bitwidth = DataType[output_data_type].bitwidth() + t_path = self.get_nodeattr("code_gen_dir_ipgen") + if self.get_nodeattr("runtime_writeable_weights") == 1: + thresh_file_name = f"{t_path}/memblock.dat" + self.make_weight_file(thresholds, "decoupled", thresh_file_name) + # The RTL expects 2^N-1 thresholds, but narrow range quantization will result in - # one less threshold, prepending a dummy threshold and reducing bias by 1 to compensate. + # one less threshold, prepending a dummy threshold (minimal possible value determined by + # input data type) and decrease the bias by 1. 
+ # Additionally, increase number of threshold steps to reflect new shape expected_thresholds = 2**o_bitwidth - 1 n_thres_steps = self.get_nodeattr("numSteps") - if expected_thresholds != n_thres_steps and DataType[input_data_type].signed() is not True: - min_val = np.amin(thresholds, axis=1) - thresholds = np.insert(thresholds, 0, min_val, axis=1) - bias = bias - 1 + wdt = self.get_weight_datatype() + if expected_thresholds != n_thres_steps: + if DataType[output_data_type].signed(): + min_val = wdt.min() + thresholds = np.insert(thresholds, 0, min_val, axis=1) + bias = bias - 1 + # TODO: temporary fix for unsigned narrow quantization + else: + max_val = wdt.max() + if max_val > DataType[input_data_type].max(): + thresholds = np.insert(thresholds, len(thresholds[0]), max_val, axis=1) + else: + max_val = max_val + 1 + # increase wdt + if not wdt.signed(): + wdt = DataType.get_smallest_possible(max_val) + else: + wdt = DataType.get_smallest_possible(-max_val - 1) + thresholds = np.insert(thresholds, len(thresholds[0]), max_val, axis=1) + n_thres_steps += 1 # add dummy dimension as final dimension (that's what gets packed with next call) - thresholds = np.expand_dims(thresholds, axis=-1) - wdt = self.get_weight_datatype() + t_expand = np.expand_dims(thresholds, axis=-1) bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 4) t_packed = pack_innermost_dim_as_hex_string( - thresholds, + t_expand, wdt, bw_hexdigit, prefix="", ) - t_path = self.get_nodeattr("code_gen_dir_ipgen") pe = self.get_nodeattr("PE") num_channels = self.get_nodeattr("NumChannels") # number of channels # If a single threshold value is found, broadcast the value - expected_shape = (num_channels, n_thres_steps) - if t_packed.shape == (1, 1): - t_packed = np.broadcast_to(t_packed, expected_shape) + if t_packed.shape[0] == 1: + t_packed = np.broadcast_to(t_packed, (pe, expected_thresholds)) + num_channels = pe channel_fold = int(num_channels / pe) @@ -235,9 +255,10 @@ def prepare_codegen_rtl_values(self, model): i_bitwidth = DataType[input_data_type].bitwidth() code_gen_dict["$N$"] = [str(o_bitwidth)] # output precision - convert bitwidth to string - code_gen_dict["$M$"] = [ - str(i_bitwidth) - ] # input/threshold precision - convert bitwidth to string + code_gen_dict["$WT$"] = [ + str(wdt.bitwidth()) + ] # threshold precision - convert bitwidth to string + code_gen_dict["$WI$"] = [str(i_bitwidth)] # input precision - convert bitwidth to string code_gen_dict["$C$"] = [str(num_channels)] # number of channels code_gen_dict["$BIAS$"] = [str(bias)] # activation bias value code_gen_dict["$PE$"] = [str(pe)] # requires C = M*PE @@ -255,7 +276,6 @@ def prepare_codegen_rtl_values(self, model): o_bits = 1 + math.ceil( math.log2(-bias if -bias >= 2 ** (o_bitwidth - 1) else 2**o_bitwidth + bias) ) - code_gen_dict["$O_BITS$"] = [str(int(o_bits))] rt_weights = self.get_nodeattr("runtime_writeable_weights") @@ -322,10 +342,6 @@ def generate_hdl(self, model, fpgapart, clk): # by PyVerilator and IPI generation self.set_nodeattr("gen_top_module", code_gen_dict["$TOP_MODULE$"][0]) - weights = model.get_initializer(self.onnx_node.input[1]) - weights_fname = f"{code_gen_dir}/memblock.dat" - self.make_weight_file(weights, "decoupled", weights_fname) - for rtl_file_path in self.get_rtl_file_paths(): # read in original RTL template file template_data = self.get_rtl_template_data(rtl_file_path) @@ -513,27 +529,46 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): * weight_file_name : filename for the weight file to be 
generated """ - threshold_tensor = self.get_hw_compatible_threshold_tensor(weights) - tdt = self.get_weight_datatype() - assert np.vectorize(tdt.allowed)( - threshold_tensor - ).all(), "Thresholds can't be expressed with type %s" % str(tdt) - + thresholds = weights pe = self.get_nodeattr("PE") ch = self.get_nodeattr("NumChannels") + output_data_type = self.get_nodeattr("outputDataType") # output precision + o_bitwidth = DataType[output_data_type].bitwidth() + # The RTL expects 2^N-1 thresholds, but narrow range quantization will result in + # one less threshold, prepending a dummy threshold (minimal possible value determined by + # input data type) and decrease the bias by 1. + # Additionally, increase number of threshold steps to reflect new shape + expected_thresholds = 2**o_bitwidth - 1 n_thres_steps = self.get_nodeattr("numSteps") + wdt = self.get_weight_datatype() + if expected_thresholds != n_thres_steps: + if DataType[output_data_type].signed(): + min_val = wdt.min() + thresholds = np.insert(thresholds, 0, min_val, axis=1) + # TODO: temporary fix for unsigned narrow quantization + else: + max_val = wdt.max() + if max_val > self.get_input_datatype().max(): + thresholds = np.insert(thresholds, len(thresholds[0]), max_val, axis=1) + else: + max_val = max_val + 1 + # increase wdt + if not wdt.signed(): + wdt = DataType.get_smallest_possible(max_val) + else: + wdt = DataType.get_smallest_possible(-max_val - 1) + thresholds = np.insert(thresholds, len(thresholds[0]), max_val, axis=1) + n_thres_steps += 1 # If a single threshold value is found, broadcast the value - n_thres_steps = self.get_nodeattr("numSteps") - expected_shape = (ch, n_thres_steps) - if weights.shape == (1, 1): - weights = np.broadcast_to(weights, expected_shape) - - width_padded = roundup_to_integer_multiple(weights.shape[1], 4) - weight_padded = np.zeros((weights.shape[0], width_padded)) - weight_padded[: weights.shape[0], :n_thres_steps] = weights - weight_stream = [] - wdt = self.get_weight_datatype() + if thresholds.shape[0] == 1: + thresholds = np.broadcast_to(thresholds, (pe, expected_thresholds)) + ch = pe + + width_padded = roundup_to_integer_multiple(thresholds.shape[1], 2**o_bitwidth) + thresh_padded = np.zeros((thresholds.shape[0], width_padded)) + thresh_padded[: thresholds.shape[0], :n_thres_steps] = thresholds + thresh_stream = [] bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 32) padding = np.zeros(width_padded, dtype=np.int32) @@ -542,18 +577,18 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): for fold in range(cf): for c in range(2 ** (pe - 1).bit_length()): if (c == 0 or c % pe != 0) and c < pe: - for w in weight_padded[chan_ind]: - w_packed = pack_innermost_dim_as_hex_string( - [w], wdt, bw_hexdigit, prefix="" + for t in thresh_padded[chan_ind]: + t_packed = pack_innermost_dim_as_hex_string( + [t], wdt, bw_hexdigit, prefix="" ).item() - weight_stream.append(w_packed) + thresh_stream.append(t_packed) chan_ind += 1 else: for z in padding: - w_packed = pack_innermost_dim_as_hex_string( + t_packed = pack_innermost_dim_as_hex_string( [z], wdt, bw_hexdigit, prefix="" ).item() - weight_stream.append(w_packed) + thresh_stream.append(t_packed) with open(weight_file_name, "w") as f: - for val in weight_stream: + for val in thresh_stream: f.write(val + "\n") diff --git a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py index 27fc9f10a1..32943d86cf 100644 --- 
a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py @@ -33,9 +33,8 @@ from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.basic import get_rtlsim_trace_depth, is_versal, make_build_dir from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -from finn.util.fpgadataflow import is_versal try: from pyverilator import PyVerilator @@ -143,7 +142,7 @@ def execute_node(self, context, graph): def lut_estimation(self): return 0 - def dsp_estimation(self): + def dsp_estimation(self, fpgapart): P = self.get_nodeattr("PE") Q = self.get_nodeattr("SIMD") return int(P * np.ceil(Q / 3)) @@ -162,14 +161,25 @@ def instantiate_ip(self, cmd): ] for f in sourcefiles: cmd.append("add_files -norecurse %s" % (f)) - cmd.append( - "create_bd_cell -type hier -reference %s /%s/%s" - % ( - self.get_nodeattr("gen_top_module"), - self.onnx_node.name, - self.onnx_node.name, + + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "internal_decoupled": + cmd.append( + "create_bd_cell -type hier -reference %s /%s/%s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + self.onnx_node.name, + ) + ) + else: + cmd.append( + "create_bd_cell -type hier -reference %s %s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + ) ) - ) def generate_hdl(self, model, fpgapart, clk): # Generate params as part of IP preparation @@ -177,6 +187,11 @@ def generate_hdl(self, model, fpgapart, clk): self.generate_params(model, code_gen_dir) template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk) + # determine if weights are narrow range and add parameter to code gen dict + weights = model.get_initializer(self.onnx_node.input[1]) + wdt = self.get_weight_datatype() + narrow_weights = 0 if np.min(weights) == wdt.min() else 1 + code_gen_dict["$NARROW_WEIGHTS$"] = str(narrow_weights) # add general parameters to dictionary code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] # save top module name so we can refer to it after this node has been renamed diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index e5787bfd2a..ddc1d1f99a 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -33,6 +33,7 @@ #define AP_INT_MAX_W $AP_INT_MAX_W$ #include "cnpy.h" #include "npy2apintstream.hpp" +#include "npy2vectorstream.hpp" #include #include "bnn-library.h" @@ -59,6 +60,51 @@ """ +# template for single node execution with timeout (for single clock hls operations) +docompute_template_timeout = """ +#define AP_INT_MAX_W $AP_INT_MAX_W$ +#include "cnpy.h" +#include "npy2apintstream.hpp" +#include "npy2vectorstream.hpp" +#include +#include "bnn-library.h" + +// includes for network parameters +$GLOBALS$ + +// defines for network parameters +$DEFINES$ + +int main(){ +$PRAGMAS$ + +$STREAMDECLARATIONS$ + +$READNPYDATA$ + +unsigned timeout = 0; +while(timeout < $TIMEOUT_VALUE$){ + +$DOCOMPUTE$ + +if($TIMEOUT_CONDITION$){ +timeout++; +} + +else{ +$TIMEOUT_READ_STREAM$ +timeout = 0; +} +} + +$DATAOUTSTREAM$ + +$SAVEASCNPY$ + +} + +""" + # templates for single node ip generation # cpp file diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py index 
363c1572cf..cda2412617 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding.py +++ b/src/finn/custom_op/fpgadataflow/thresholding.py @@ -242,6 +242,7 @@ def execute_node(self, context, graph): node = self.onnx_node inp_values = context[node.input[0]] th_val = context[node.input[1]] + out_bias = self.get_nodeattr("ActVal") # MT expects inputs to be in the shape (N,C,H,W) or (N, C) # if 4D then input values in context are (N,H,W,C) and need to # be transposed. @@ -249,7 +250,7 @@ def execute_node(self, context, graph): is_4d = len(inp_values.shape) == 4 if is_4d: inp_values = np.transpose(inp_values, (0, 3, 1, 2)) - y = multithreshold(inp_values, th_val) + y = multithreshold(inp_values, th_val, out_bias=out_bias) if is_4d: y = y.transpose(0, 2, 3, 1) act = DataType[self.get_nodeattr("outputDataType")] diff --git a/src/finn/qnn-data/build_dataflow/build.py b/src/finn/qnn-data/build_dataflow/build.py index 58d566a6e6..6cc7ff2419 100644 --- a/src/finn/qnn-data/build_dataflow/build.py +++ b/src/finn/qnn-data/build_dataflow/build.py @@ -61,6 +61,7 @@ build_cfg.VerificationStepType.TIDY_UP_PYTHON, build_cfg.VerificationStepType.STREAMLINED_PYTHON, build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, + build_cfg.VerificationStepType.NODE_BY_NODE_RTLSIM, build_cfg.VerificationStepType.STITCHED_IP_RTLSIM, ], save_intermediate_models=True, diff --git a/src/finn/qnn-data/build_dataflow/dataflow_build_config.json b/src/finn/qnn-data/build_dataflow/dataflow_build_config.json index 8165055fd5..2c1db458dd 100644 --- a/src/finn/qnn-data/build_dataflow/dataflow_build_config.json +++ b/src/finn/qnn-data/build_dataflow/dataflow_build_config.json @@ -13,6 +13,7 @@ "initial_python", "streamlined_python", "folded_hls_cppsim", + "node_by_node_rtlsim", "stitched_ip_rtlsim" ], "generate_outputs": [ diff --git a/src/finn/qnn-data/cpp/npy2vectorstream.hpp b/src/finn/qnn-data/cpp/npy2vectorstream.hpp new file mode 100644 index 0000000000..cd26401ebf --- /dev/null +++ b/src/finn/qnn-data/cpp/npy2vectorstream.hpp @@ -0,0 +1,72 @@ +#include +#include "cnpy.h" +#include "hls_stream.h" +#include "ap_int.h" +#include +#include +#include + +#ifdef DEBUG +#define DEBUG_NPY2VECTORSTREAM(x) std::cout << "[npy2vectorstream] " << x << std::endl; +#define DEBUG_VECTORSTREAM2NPY(x) std::cout << "[vectorstream2npy] " << x << std::endl; +#else +#define DEBUG_NPY2VECTORSTREAM(x) ; +#define DEBUG_VECTORSTREAM2NPY(x) ; +#endif + +template +void npy2vectorstream(const char * npy_path, hls::stream> & out_stream, bool reverse_inner = true, size_t numReps = 1) { + for (size_t rep = 0; rep < numReps; rep++) { + cnpy::NpyArray arr = cnpy::npy_load(npy_path); + DEBUG_NPY2VECTORSTREAM("word_size " << arr.word_size << " num_vals " << arr.num_vals) + if (arr.word_size != sizeof(NpyT)) { + throw "Npy array word size and specified NpyT size do not match"; + } + NpyT* loaded_data = arr.data(); + size_t outer_dim_elems = 1; + for (size_t dim = 0; dim < arr.shape.size() - 1; dim++) { + outer_dim_elems *= arr.shape[dim]; + } + size_t inner_dim_elems = arr.shape[arr.shape.size() - 1]; + DEBUG_NPY2VECTORSTREAM("n_outer " << outer_dim_elems << " n_inner " << inner_dim_elems) + for (size_t outer_elem = 0; outer_elem < outer_dim_elems; outer_elem++) { + hls::vector vec; + for (size_t ii = 0; ii < inner_dim_elems; ii++) { + NpyT elemNpy = loaded_data[outer_elem * inner_dim_elems + ii]; + ElemT elem = loaded_data[outer_elem * inner_dim_elems + ii]; + DEBUG_NPY2VECTORSTREAM("npy2 elem = " << elem << ", loaded data = " << loaded_data[outer_elem * 
inner_dim_elems + ii]) + vec[ii] = elem; + } + out_stream << vec; + } + } +} + +template +void vectorstream2npy(hls::stream> & in_stream, const std::vector & shape, const char * npy_path, bool reverse_inner = false, size_t numReps = 1, size_t multi_pixel_out = 1) { + for(size_t rep = 0; rep < numReps; rep++) { + std::vector data_to_save; + size_t outer_dim_elems = 1; + for(size_t dim = 0; dim < shape.size()-1; dim++) { + outer_dim_elems *= shape[dim]; + } + size_t inner_dim_elems = shape[shape.size()-1] / multi_pixel_out; + DEBUG_VECTORSTREAM2NPY("n_outer " << outer_dim_elems << " n_inner " << inner_dim_elems << " n_multi_pixel_out " << multi_pixel_out) + for(size_t outer_elem = 0; outer_elem < outer_dim_elems; outer_elem++) { + for(size_t ii_multi_pixel_out = 0; ii_multi_pixel_out < multi_pixel_out; ii_multi_pixel_out++) { + // loop over multi_pixel_out blocks of inner_dim_elems separately, + // so that reverse_inner is not applied across multiple pixels + hls::vector elems; + in_stream >> elems; + for(size_t ii = 0; ii < inner_dim_elems; ii++) { + size_t i = ii_multi_pixel_out*inner_dim_elems; + i += reverse_inner ? inner_dim_elems-ii-1 : ii; + NpyT npyt = (NpyT) elems[i]; + DEBUG_VECTORSTREAM2NPY("elems[i] = " << elems[i] << ", NpyT = " << npyt) + data_to_save.push_back(npyt); + } + } + } + cnpy::npy_save(npy_path, &data_to_save[0], shape, "w"); + } +} diff --git a/src/finn/qnn-data/test_ext_weights/specialize_layers_config.json b/src/finn/qnn-data/test_ext_weights/specialize_layers_config.json new file mode 100644 index 0000000000..3218c2d89a --- /dev/null +++ b/src/finn/qnn-data/test_ext_weights/specialize_layers_config.json @@ -0,0 +1,30 @@ +{ + "Defaults": {}, + "Thresholding_0": { + "preferred_impl_style": "rtl" + }, + "MVAU_0": { + "preferred_impl_style": "rtl" + }, + "Thresholding_1": { + "preferred_impl_style": "rtl" + }, + "MVAU_1": { + "preferred_impl_style": "hls" + }, + "Thresholding_2": { + "preferred_impl_style": "rtl" + }, + "MVAU_2": { + "preferred_impl_style": "rtl" + }, + "Thresholding_3": { + "preferred_impl_style": "rtl" + }, + "MVAU_3": { + "preferred_impl_style": "rtl" + }, + "LabelSelect_0": { + "preferred_impl_style": "hls" + } +} diff --git a/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json b/src/finn/qnn-data/test_ext_weights/tfc-w2a2-extw.json similarity index 66% rename from src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json rename to src/finn/qnn-data/test_ext_weights/tfc-w2a2-extw.json index 9fe22443dc..29484e2940 100644 --- a/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json +++ b/src/finn/qnn-data/test_ext_weights/tfc-w2a2-extw.json @@ -3,22 +3,31 @@ "Thresholding_rtl_0": { "PE": 49 }, - "MVAU_hls_0": { + "MVAU_rtl_0": { "PE": 16, "SIMD": 49, "ram_style": "block" }, - "MVAU_hls_1": { + "Thresholding_rtl_1": { + "PE": 16 + }, + "MVAU_hls_0": { "PE": 8, "SIMD": 8, "mem_mode": "external" }, - "MVAU_hls_2": { + "Thresholding_rtl_2": { + "PE": 8 + }, + "MVAU_rtl_1": { "PE": 8, "SIMD": 8, "mem_mode": "external" }, - "MVAU_hls_3": { + "Thresholding_rtl_3": { + "PE": 8 + }, + "MVAU_rtl_2": { "PE": 10, "SIMD": 8, "ram_style": "distributed" diff --git a/src/finn/transformation/fpgadataflow/annotate_resources.py b/src/finn/transformation/fpgadataflow/annotate_resources.py index f07a5186d5..ee2da2094c 100644 --- a/src/finn/transformation/fpgadataflow/annotate_resources.py +++ b/src/finn/transformation/fpgadataflow/annotate_resources.py @@ -26,8 +26,8 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING 
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import qonnx.custom_op.registry as registry +from functools import partial from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation @@ -49,15 +49,16 @@ class AnnotateResources(Transformation): chosen mode (e.g. HLSSynthIP for hls) was previously run. """ - def __init__(self, mode, override_res_dict=None): + def __init__(self, mode, fpgapart, override_res_dict=None): super().__init__() self.mode = mode + self.fpgapart = fpgapart self.res_dict = override_res_dict def apply(self, model): graph = model.graph if self.mode == "estimate": - res_fxn = res_estimation + res_fxn = partial(res_estimation, fpgapart=self.fpgapart) elif self.mode == "hls": res_fxn = hls_synth_res_estimation elif self.mode == "synth": @@ -77,7 +78,9 @@ def apply(self, model): # recurse into model to manually annotate per-layer resources sdp_model_filename = getCustomOp(node).get_nodeattr("model") sdp_model = ModelWrapper(sdp_model_filename) - sdp_model = sdp_model.transform(AnnotateResources(self.mode, self.res_dict)) + sdp_model = sdp_model.transform( + AnnotateResources(self.mode, self.fpgapart, self.res_dict) + ) sdp_dict = sdp_model.get_metadata_prop("res_total_" + self.mode) sdp_dict = eval(sdp_dict) # save transformed model diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 7aa28999de..d45be1afb9 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -208,10 +208,16 @@ def apply(self, model): thl_in_shape = model.get_tensor_shape(thl_input) thl_thres_shape = model.get_tensor_shape(thl_threshold) idt = model.get_tensor_datatype(thl_input) - + tdt = model.get_tensor_datatype(thl_threshold) # skip conversion for layers with float input if not idt.is_integer(): continue + assert tdt.is_integer(), ( + node.name + + """: MultiThreshold cannot be converted + because thresholds are float type. 
Input data type is integer, + please run RoundAndClipThresholds to convert thresholds to integer.""" + ) # check layout of inputs/outputs, and convert if needed # check layout and convert if necessary @@ -262,7 +268,7 @@ def apply(self, model): PE=pe, numSteps=thl_thres_shape[1], inputDataType=idt.name, - weightDataType=idt.name, + weightDataType=tdt.name, outputDataType=odt.name, numInputVectors=list(thl_in_shape[:-1]), ActVal=actval, @@ -532,65 +538,119 @@ def apply(self, model): graph = model.graph node_ind = 0 graph_modified = False + # check first if global input is split + successors = model.find_consumers(graph.input[0].name) + dt = model.get_tensor_datatype(graph.input[0].name) + if successors is not None and len(successors) >= 2 and dt.is_integer(): + output_tensor = graph.input[0].name + n_outputs = len(successors) + dt = model.get_tensor_datatype(output_tensor) + + # create clone tensors + out_shape = model.get_tensor_shape(output_tensor) + out_tensor_clones = [] + for i in range(n_outputs): + clone = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape + ) + model.graph.value_info.append(clone) + out_tensor_clones += [clone.name] + + num_ch = int(out_shape[-1]) + vecs = out_shape[:-1] + + # create node with no parallelization first + pe = 1 + + dup_node = helper.make_node( + "DuplicateStreams", + [output_tensor], + out_tensor_clones, + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=num_ch, + PE=pe, + inputDataType=dt.name, + numInputVectors=vecs, + NumOutputStreams=n_outputs, + outFIFODepths=[2] * n_outputs, + name="DuplicateStreams_" + output_tensor, + ) + + graph.node.insert(0, dup_node) + + # connect successors to out tensor clone + clone_idx = 0 + for successor in successors: + for i, succ_input in enumerate(successor.input): + if succ_input == output_tensor: + successor.input[i] = out_tensor_clones[clone_idx] + clone_idx += 1 + # if one node has multiple connections to the same output + # find_direct_successors will return one node per input + # so break the inner loop will result in correct behaviour + break + graph_modified = True + for node in graph.node: node_ind += 1 - successors = model.find_consumers(node.output[0]) - if successors is not None and len(successors) >= 2: - output_tensor = node.output[0] - n_outputs = len(successors) + for output_tensor in node.output: + successors = model.find_consumers(output_tensor) + if successors is not None and len(successors) >= 2: + n_outputs = len(successors) - dt = model.get_tensor_datatype(output_tensor) + dt = model.get_tensor_datatype(output_tensor) - # skip conversion for layers with float input - if not dt.is_integer(): - continue + # skip conversion for layers with float input + if not dt.is_integer(): + continue - # create clone tensors - out_shape = model.get_tensor_shape(output_tensor) - out_tensor_clones = [] - for i in range(n_outputs): - clone = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape - ) - model.graph.value_info.append(clone) - out_tensor_clones += [clone.name] + # create clone tensors + out_shape = model.get_tensor_shape(output_tensor) + out_tensor_clones = [] + for i in range(n_outputs): + clone = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape + ) + model.graph.value_info.append(clone) + out_tensor_clones += [clone.name] - num_ch = int(out_shape[-1]) - vecs = out_shape[:-1] + num_ch = int(out_shape[-1]) + vecs = out_shape[:-1] - # create 
node with no parallelization first - pe = 1 + # create node with no parallelization first + pe = 1 - dup_node = helper.make_node( - "DuplicateStreams", - [output_tensor], - out_tensor_clones, - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=num_ch, - PE=pe, - inputDataType=dt.name, - numInputVectors=vecs, - NumOutputStreams=n_outputs, - outFIFODepths=[2] * n_outputs, - name="DuplicateStreams_" + node.name, - ) + dup_node = helper.make_node( + "DuplicateStreams", + [output_tensor], + out_tensor_clones, + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=num_ch, + PE=pe, + inputDataType=dt.name, + numInputVectors=vecs, + NumOutputStreams=n_outputs, + outFIFODepths=[2] * n_outputs, + name="DuplicateStreams_" + node.name, + ) - graph.node.insert(node_ind, dup_node) + graph.node.insert(node_ind, dup_node) - # connect successors to out tensor clone - clone_idx = 0 - for successor in successors: - for i, succ_input in enumerate(successor.input): - if succ_input == output_tensor: - successor.input[i] = out_tensor_clones[clone_idx] - clone_idx += 1 - # if one node has multiple connections to the same output - # find_direct_successors will return one node per input - # so break the inner loop will result in correct behaviour - break + # connect successors to out tensor clone + clone_idx = 0 + for successor in successors: + for i, succ_input in enumerate(successor.input): + if succ_input == output_tensor: + successor.input[i] = out_tensor_clones[clone_idx] + clone_idx += 1 + # if one node has multiple connections to the same output + # find_direct_successors will return one node per input + # so break the inner loop will result in correct behaviour + break - graph_modified = True + graph_modified = True if graph_modified: model = model.transform(SortGraph()) @@ -1200,8 +1260,8 @@ def apply(self, model): class InferStreamingEltwise(Transformation): - """Convert eltwise Sub or Sub -> Abs to StreamingEltwise layer - with SubEltwise or AbsDiffEltwise op.""" + """Convert eltwise Add, Sub or Sub -> Abs to StreamingEltwise layer + with AddEltwise, SubEltwise or AbsDiffEltwise op.""" def apply(self, model): graph = model.graph @@ -1209,7 +1269,7 @@ def apply(self, model): graph_modified = False for node in graph.node: node_ind += 1 - if node.op_type == "Sub": + if node.op_type in ["Sub", "Add"]: in0 = node.input[0] in1 = node.input[1] result = node.output[0] @@ -1233,14 +1293,15 @@ def apply(self, model): if not (idt0.is_integer() and idt1.is_integer()): continue - eltwiseOp = "Sub" + eltwiseOp = node.op_type nodes_to_remove = [node] - # look for a downstream Abs node - res_consumer = model.find_consumer(result) - if (res_consumer is not None) and (res_consumer.op_type == "Abs"): - eltwiseOp = "AbsDiff" - result = res_consumer.output[0] - nodes_to_remove.append(res_consumer) + if node.op_type == "Sub": + # look for a downstream Abs node + res_consumer = model.find_consumer(result) + if (res_consumer is not None) and (res_consumer.op_type == "Abs"): + eltwiseOp = "AbsDiff" + result = res_consumer.output[0] + nodes_to_remove.append(res_consumer) # check layout and convert if necessary in0_layout = model.get_tensor_layout(in0) @@ -1441,6 +1502,9 @@ def apply(self, model): if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is None: mm_input = n.input[0] mm_weight = n.input[1] + # if mm_weight is not constant, skip node + if model.get_initializer(n.input[1]) is None: + continue mm_output = n.output[0] mm_in_shape = 
model.get_tensor_shape(mm_input) mm_out_shape = model.get_tensor_shape(mm_output) diff --git a/src/finn/transformation/fpgadataflow/infer_pixel_padding_deconv.py b/src/finn/transformation/fpgadataflow/infer_pixel_padding_deconv.py index 8dbf7071fc..e1dcf1dde5 100644 --- a/src/finn/transformation/fpgadataflow/infer_pixel_padding_deconv.py +++ b/src/finn/transformation/fpgadataflow/infer_pixel_padding_deconv.py @@ -2,8 +2,7 @@ import warnings from onnx import TensorProto, helper from qonnx.transformation.base import Transformation -from qonnx.transformation.lower_convs_to_matmul import _auto_pad_to_explicit_padding -from qonnx.util.basic import get_by_name +from qonnx.util.basic import auto_pad_to_explicit_padding, get_by_name class InferPixelPaddingDeconv(Transformation): @@ -61,7 +60,7 @@ def apply(self, model): # use specified padding pad = get_by_name(n.attribute, "pads").ints else: - pad = _auto_pad_to_explicit_padding( + pad = auto_pad_to_explicit_padding( auto_pad, ifm_dim_h, ifm_dim_w, diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index 33cc3e86d3..b56c8b74ea 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -26,7 +26,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from onnx import TensorProto from onnx import helper as oh from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation @@ -110,12 +109,15 @@ def apply(self, model): # determine shape for dwc dwc_shape = n0.get_normal_output_shape() - # determine dtype for dwc + # determine FINN dtype for dwc dtype = n0.get_output_datatype() + # determine onnx tensor dtype for dwc + n0_otensor = model.get_tensor_valueinfo(output_name) + n0_tensor_dtype = n0_otensor.type.tensor_type.elem_type dwc_output_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), - TensorProto.FLOAT, + n0_tensor_dtype, dwc_shape, ) graph.value_info.append(dwc_output_tensor) diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index 9df193efcf..9ed0f51cd4 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -29,7 +29,6 @@ import numpy as np import warnings -from onnx import TensorProto from onnx import helper as oh from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation @@ -114,6 +113,8 @@ def apply(self, model): # determine fifo node attributes fld_shape = n0.get_folded_output_shape() dtype = n0.get_output_datatype() + n0_otensor = model.get_tensor_valueinfo(output_name) + n0_tensor_dtype = n0_otensor.type.tensor_type.elem_type # check if folded_shape of output of first node and # input of the second node is equal @@ -145,7 +146,7 @@ def apply(self, model): # or unless create_shallow_fifos is specified fifo_output_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), - TensorProto.FLOAT, + n0_tensor_dtype, n0.get_normal_output_shape(), ) graph.value_info.append(fifo_output_tensor) @@ -196,13 +197,15 @@ def apply(self, model): fld_shape = n0.get_folded_input_shape(inp_ind) n_shape = n0.get_normal_input_shape(inp_ind) dtype = n0.get_input_datatype(inp_ind) + n0_itensor = model.get_tensor_valueinfo(graph_in_name) + n0_tensor_dtype = 
n0_itensor.type.tensor_type.elem_type fifo_depth = n0.get_nodeattr("inFIFODepths")[inp_ind] if fifo_depth > 2 or self.create_shallow_fifos: # create fifo node fifo_output_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), - TensorProto.FLOAT, + n0_tensor_dtype, n0.get_normal_input_shape(inp_ind), ) graph.value_info.append(fifo_output_tensor) @@ -256,14 +259,16 @@ def apply(self, model): fld_shape = n0.get_folded_output_shape(out_ind) n_shape = n0.get_normal_output_shape(out_ind) dtype = n0.get_output_datatype(out_ind) + n0_otensor = model.get_tensor_valueinfo(graph_out_name) + n0_tensor_dtype = n0_otensor.type.tensor_type.elem_type fifo_depth = n0.get_nodeattr("outFIFODepths")[out_ind] if fifo_depth > 2 or self.create_shallow_fifos: # create fifo node fifo_input_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), - TensorProto.FLOAT, - n0.get_normal_output_shape(), + n0_tensor_dtype, + n0.get_normal_output_shape(out_ind), ) graph.value_info.append(fifo_input_tensor) model.set_tensor_datatype(fifo_input_tensor.name, dtype) @@ -289,7 +294,7 @@ def apply(self, model): graph.node.append(fifo_node) # set fifo output tensor as new input tensor of second node - final_node.output[0] = fifo_input_tensor.name + final_node.output[out_ind] = fifo_input_tensor.name else: warnings.warn( """Output FIFO for %s has depth %d and won't diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index fc2047b08e..63ce2d3cbf 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -322,7 +322,7 @@ def apply(self, model): prep_transforms = [ InsertIODMA(self.axi_port_width), InsertDWC(), - SpecializeLayers(), + SpecializeLayers(self.fpga_part), Floorplan(), CreateDataflowPartition(partition_model_dir=self.partition_model_dir), ] @@ -338,7 +338,7 @@ def apply(self, model): dataflow_model_filename = sdp_node.get_nodeattr("model") kernel_model = ModelWrapper(dataflow_model_filename) kernel_model = kernel_model.transform(InsertFIFO()) - kernel_model = kernel_model.transform(SpecializeLayers()) + kernel_model = kernel_model.transform(SpecializeLayers(self.fpga_part)) kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix)) kernel_model.save(dataflow_model_filename) kernel_model = kernel_model.transform(PrepareIP(self.fpga_part, self.period_ns)) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index e71d6c23a4..dbcadd1df5 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -26,18 +26,18 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
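Aside: the Zynq and Vitis build flows in this patch now hand the target part into SpecializeLayers instead of relying on a default. A minimal sketch of the updated call pattern, assuming a VCK190 part string and a placeholder partition model file (both are illustrative, not taken from this patch):

    from qonnx.core.modelwrapper import ModelWrapper
    from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
    from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers

    fpga_part = "xcvc1902-vsva2197-2MP-e-S"        # example part (VCK190); any part_map entry works
    model = ModelWrapper("kernel_partition.onnx")  # placeholder filename
    model = model.transform(InsertFIFO())
    # fpgapart is now a required constructor argument; there is no default anymore
    model = model.transform(SpecializeLayers(fpga_part))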
+import numpy as np import warnings from onnx import helper -from qonnx.core.datatype import DataType from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from finn.custom_op.fpgadataflow.hls import custom_op as hls_variants from finn.custom_op.fpgadataflow.rtl import custom_op as rtl_variants -from finn.util.fpgadataflow import is_versal +from finn.util.basic import get_dsp_block, is_versal -def _determine_impl_style(node, fpgapart): +def _determine_impl_style(node, fpgapart, model): optype = node.op_type # check if there is an HLS or RTL variant or both @@ -45,8 +45,8 @@ def _determine_impl_style(node, fpgapart): rtl_variant = optype + "_rtl" in rtl_variants.keys() # check if user has specified a preferred_impl_style - inst = getCustomOp(node) - impl_style = inst.get_nodeattr("preferred_impl_style") + node_inst = getCustomOp(node) + impl_style = node_inst.get_nodeattr("preferred_impl_style") # if impl_style not set, for "simple" layers always try # to use rtl variant if available @@ -55,23 +55,19 @@ def _determine_impl_style(node, fpgapart): return _dwc_determine_impl_style(node) if rtl_variant: if optype == "MVAU": - inp_width_fit = ( - DataType[getCustomOp(node).get_nodeattr("inputDataType")].bitwidth() >= 4 - ) - weight_width_fit = ( - DataType[getCustomOp(node).get_nodeattr("weightDataType")].bitwidth() >= 4 - ) - if inp_width_fit and weight_width_fit and _mvu_rtl_possible(node): + idt = node_inst.get_input_datatype() + wdt = node_inst.get_weight_datatype() + inp_width_fit = idt.bitwidth() >= 4 + weight_width_fit = wdt.bitwidth() >= 4 + if inp_width_fit and weight_width_fit and _mvu_rtl_possible(node, fpgapart, model): return "rtl" else: return "hls" elif optype == "VVAU": - inp_width_fit = ( - DataType[getCustomOp(node).get_nodeattr("inputDataType")].bitwidth() >= 4 - ) - weight_width_fit = ( - DataType[getCustomOp(node).get_nodeattr("weightDataType")].bitwidth() >= 4 - ) + idt = node_inst.get_input_datatype() + wdt = node_inst.get_weight_datatype() + inp_width_fit = idt.bitwidth() >= 4 + weight_width_fit = wdt.bitwidth() >= 4 if inp_width_fit and weight_width_fit and _vvu_rtl_possible(node, fpgapart): return "rtl" else: @@ -136,7 +132,7 @@ def _determine_impl_style(node, fpgapart): # user setting can be fulfilled return "rtl" elif optype == "MVAU": - if _mvu_rtl_possible(node): + if _mvu_rtl_possible(node, fpgapart, model): return "rtl" else: warn_str = """There is no RTL variant for %s. The node will automatically be @@ -232,31 +228,43 @@ def _swg_hls_possible(node): return False -def _mvu_rtl_possible(n): +def _mvu_rtl_possible(n, fpgapart, model): # Checks whether RTL-based MVU is supported # Currently, for DSP48 we only support computations up to # 8sx8u (8-bit signed weights x 8-bit (un)signed activations) - # and for DSP58 we support up to 8sx9s. Next to that, - # embedded thresholding functionality is not supported and - # neither binaryxnormode computation. 
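Aside: the implementation-style decision above keys off the DSP block family of the target part via get_dsp_block, which this patch adds to finn.util.basic further below. A rough illustration of the mapping, using example part strings (the strings themselves are assumptions, not taken from this patch):

    from finn.util.basic import get_dsp_block

    get_dsp_block("xcvc1902-vsva2197-2MP-e-S")   # Versal           -> "DSP58"
    get_dsp_block("xc7z020clg400-1")             # Zynq-7000        -> "DSP48E1"
    get_dsp_block("xczu3eg-sbva484-1-e")         # Zynq UltraScale+ -> "DSP48E2"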
- inp_width_in_range = ( - DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8 - ) or ( - DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 - and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0 - ) - weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 - signed_weights = DataType[getCustomOp(n).get_nodeattr("weightDataType")].min() < 0 - no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 - not_binaryxnor_mode = getCustomOp(n).get_nodeattr("binaryXnorMode") == 0 + # and for DSP58 we support up to 8sx9s. + # Please note, DSP48E1 does only support narrow range for weights + # Next to that, embedded thresholding functionality is not supported + # and neither binaryxnormode computation. + node_inst = getCustomOp(n) + # first check if no Activation or binary xnor mode and return False + # immediately if one of them is True + no_activation = node_inst.get_nodeattr("noActivation") == 0 + not_binaryxnor_mode = node_inst.get_nodeattr("binaryXnorMode") == 1 + if no_activation or not_binaryxnor_mode: + return False - return ( - inp_width_in_range - and weight_width_in_range - and signed_weights - and no_activation - and not_binaryxnor_mode - ) + # check if weights are signed, if not return False + wdt = node_inst.get_weight_datatype() + if not wdt.signed(): + return False + + # check which dsp block is available on fpga + dsp_block = get_dsp_block(fpgapart) + # check if weights are narrow + weights = model.get_initializer(n.input[1]) + narrow_weights = False if np.min(weights) == wdt.min() else True + # if non narrow weights and only DSP48E1 available return False + if not narrow_weights and dsp_block == "DSP48E1": + return False + + # if none of the above constraints have been triggered + # we now check if input and weight data types are in range + idt = node_inst.get_input_datatype() + inp_width_in_range = (idt.bitwidth() <= 8) or (idt.bitwidth() == 9 and idt.signed()) + weight_width_in_range = wdt.bitwidth() <= 8 + + return inp_width_in_range and weight_width_in_range def _vvu_rtl_possible(n, fpgapart): @@ -264,30 +272,25 @@ def _vvu_rtl_possible(n, fpgapart): # Currently, we only support RTL-VVU on DSP58 up to 8sx9s inputs # (8-bit signed weights x (9-bit signed OR 8-bit (un)signed) activations). # Next to that, embedded thresholding functionality is not supported. 
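Aside: the _mvu_rtl_possible check above treats weights as narrow-range when their minimum never reaches the datatype's most negative value, which is the condition DSP48E1 needs. A small illustration of that test with made-up INT4 weight values:

    import numpy as np
    from qonnx.core.datatype import DataType

    wdt = DataType["INT4"]                  # signed 4-bit weights, range [-8, 7]
    weights = np.array([[-7, 3], [5, -2]])  # illustrative values
    narrow_weights = False if np.min(weights) == wdt.min() else True
    # np.min(weights) is -7, not -8, so the weights are narrow-range and the
    # RTL MVU remains an option even on DSP48E1 parts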
- in_width_in_range = ( - DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8 - ) or ( - DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 - and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0 - ) - weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 - signed_weights = DataType[getCustomOp(n).get_nodeattr("weightDataType")].min() < 0 - is_versal_family = is_versal(fpgapart) - no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 + node_inst = getCustomOp(n) + if not node_inst.get_nodeattr("noActivation"): + return False + if not is_versal(fpgapart): + return False + + idt = node_inst.get_input_datatype() + wdt = node_inst.get_weight_datatype() + in_width_in_range = (idt.bitwidth() <= 8) or (idt.bitwidth() == 9 and idt.min() < 0) + weight_width_in_range = wdt.bitwidth() <= 8 + signed_weights = wdt.min() < 0 - return ( - in_width_in_range - and weight_width_in_range - and signed_weights - and is_versal_family - and no_activation - ) + return in_width_in_range and weight_width_in_range and signed_weights class SpecializeLayers(Transformation): """Specialize all layers to either HLS or RTL variants""" - def __init__(self, fpgapart=""): + def __init__(self, fpgapart): super().__init__() self.fpgapart = fpgapart @@ -300,7 +303,7 @@ def apply(self, model): if not node.domain == "finn.custom_op.fpgadataflow": continue node_ind += 1 - impl_style = _determine_impl_style(node, self.fpgapart) + impl_style = _determine_impl_style(node, self.fpgapart, model) optype = node.op_type + "_" + impl_style new_node = helper.make_node( @@ -313,9 +316,6 @@ def apply(self, model): for attribute in node.attribute: if attribute.name != "preferred_impl_style": new_node.attribute.append(attribute) - if new_node.op_type == "MVAU_rtl": - is_versal_family = is_versal(self.fpgapart) - getCustomOp(new_node).set_nodeattr("is_versal", is_versal_family) graph.node.insert(node_ind, new_node) # remove old nodes graph.node.remove(node) diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py index da7624b8ff..157d81cf35 100644 --- a/src/finn/transformation/fpgadataflow/vitis_build.py +++ b/src/finn/transformation/fpgadataflow/vitis_build.py @@ -383,7 +383,7 @@ def __init__( def apply(self, model): _check_vitis_envvars() # prepare at global level, then break up into kernels - prep_transforms = [InsertIODMA(512), InsertDWC(), SpecializeLayers()] + prep_transforms = [InsertIODMA(512), InsertDWC(), SpecializeLayers(self.fpga_part)] for trn in prep_transforms: model = model.transform(trn) model = model.transform(GiveUniqueNodeNames()) @@ -405,7 +405,7 @@ def apply(self, model): dataflow_model_filename = sdp_node.get_nodeattr("model") kernel_model = ModelWrapper(dataflow_model_filename) kernel_model = kernel_model.transform(InsertFIFO()) - kernel_model = kernel_model.transform(SpecializeLayers()) + kernel_model = kernel_model.transform(SpecializeLayers(self.fpga_part)) kernel_model = kernel_model.transform(RemoveUnusedTensors()) kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix)) kernel_model.save(dataflow_model_filename) diff --git a/src/finn/transformation/qonnx/qonnx_activation_handlers.py b/src/finn/transformation/qonnx/qonnx_activation_handlers.py index 451ba52c29..e328d7dc77 100644 --- a/src/finn/transformation/qonnx/qonnx_activation_handlers.py +++ b/src/finn/transformation/qonnx/qonnx_activation_handlers.py @@ -416,6 +416,13 @@ def 
_calculate_thresholds(self): # ToDo: The index 1 needs to be changed to -1 for the channels last format num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[cdim] + + assert ( + thresholds.shape[0] == 1 or thresholds.shape[ + 0] == num_output_channels + ), """Quant node cannot be converted to MultiThreshold because only + per tensor or per channel quantization supported.""" + final_shape = (num_output_channels, num_thresholds) if thresholds.shape != final_shape: thresholds = np.broadcast_to(thresholds, final_shape) @@ -578,6 +585,11 @@ def _calculate_thresholds(self): # ToDo: The index 1 needs to be changed to -1 for the channels last format num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[cdim] + assert ( + thresholds.shape[0] == 1 or thresholds.shape[0] == num_output_channels + ), """Quant node cannot be converted to MultiThreshold because only + per tensor or per channel quantization supported.""" + final_shape = (num_output_channels, num_thresholds) if thresholds.shape != final_shape: thresholds = np.broadcast_to(thresholds, final_shape) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 4cfc4cfff7..ac71659a9c 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -29,6 +29,7 @@ import numpy as np import qonnx.core.data_layout as DataLayout import warnings +from copy import deepcopy from onnx import TensorProto from onnx import helper as oh from qonnx.core.datatype import DataType @@ -668,6 +669,10 @@ def apply(self, model): # if initializer is not scalar, skip if np.prod(init0.shape) != 1: continue + if model.is_fork_node(prod0): + model = model.transform(MoveOpPastFork(prod0.op_type)) + # topology modified, "ask" ModelWrapper to apply this transform again + return (model, True) # Flatten input if required if len(init0.shape) > 0: init0 = init0.flatten()[0] @@ -740,6 +745,12 @@ def apply(self, model): elif producer is not None and producer.op_type == "Transpose": perms = list(get_by_name(producer.attribute, "perm").ints) if perms == [0, 3, 1, 2]: + # check if the producer is a fork node + # (need to move it past the fork before this transform) + if model.is_fork_node(producer): + model = model.transform(MoveTransposePastFork()) + # topology modified, "ask" ModelWrapper to apply this transform again + return (model, True) ceil_mode = get_by_name(n.attribute, "ceil_mode") if ceil_mode is not None: ceil_mode = ceil_mode.i @@ -791,6 +802,12 @@ def apply(self, model): if producer is not None and producer.op_type == "Transpose": perms = list(get_by_name(producer.attribute, "perm").ints) if perms == [0, 3, 1, 2]: + # check if the producer is a fork node + # (need to move it past the fork before this transform) + if model.is_fork_node(producer): + model = model.transform(MoveTransposePastFork()) + # topology modified, "ask" ModelWrapper to apply this transform again + return (model, True) old_value = model.get_initializer(n.input[scales_ind]) new_value = np.array( [old_value[idx] for idx in (0, 2, 3, 1)], @@ -840,10 +857,9 @@ class MoveOpPastFork(Transformation): can be merged with nodes in the branches """ - def __init__(self, op_name_list, get_attrs_fxn=lambda x: {}): + def __init__(self, op_name_list): super().__init__() self.ops_to_move = op_name_list - self.get_attrs_fxn = get_attrs_fxn def apply(self, model): graph = model.graph @@ -886,11 +902,9 @@ def apply(self, model): new_param_name = model.make_new_valueinfo_name() 
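            # note (illustrative aside): the lines below clone the forked node with
            # deepcopy so that every attribute of the original op is carried over
            # automatically (e.g. the perm of a Transpose), which is why the
            # get_attrs_fxn hook could be dropped from MoveOpPastFork above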
new_inp_list = [n.input[0], new_param_name] model.set_initializer(new_param_name, op_init_param) - attrs = self.get_attrs_fxn(n) - # TODO use copy of original node instead to get attrs? - new_node = oh.make_node( - n.op_type, new_inp_list, [new_output_tensor_name], **attrs - ) + new_node = deepcopy(n) + new_node.input[:] = new_inp_list + new_node.output[:] = [new_output_tensor_name] graph.node.insert(node_ind, new_node) node_ind += 1 @@ -928,7 +942,7 @@ def __init__(self): class MoveTransposePastFork(MoveOpPastFork): def __init__(self): - super().__init__(["Transpose"], lambda x: {"perm": get_by_name(x.attribute, "perm").ints}) + super().__init__(["Transpose"]) class MoveMaxPoolPastMultiThreshold(Transformation): diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py index 2666242730..312db404ac 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx +# Copyright (C) 2022-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -26,22 +27,14 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# Need numpy for modifying the onnx graph tensors, which are numpy style arrays import numpy as np - -# QONNX wrapper of ONNX model graphs +from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper - -# QONNX graph transformation base class +from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation - -# Transformation running qonnx datatype inference from qonnx.transformation.infer_datatypes import InferDataTypes -# Rounds and clips thresholds to integer values if the node inputs are integer, -# respecting range, representability and data type (promotion) of the container -# data type class RoundAndClipThresholds(Transformation): """For MultiThreshold nodes operating on integer inputs, round up thresholds values to the nearest integer. Additionally, if the input @@ -50,29 +43,19 @@ class RoundAndClipThresholds(Transformation): annotation). 
Runs InferDataTypes() afterward to propagate any changes to the quantization data types.""" - # Applies the transform to a whole model graph def apply(self, model: ModelWrapper): # noqa - # Get the model graph out of the model wrapper object graph = model.graph - # Keep track of whether the graph has been modified graph_modified = False - # Iterate all nodes in the graph keeping track of the index for index, node in enumerate(graph.node): - # Applies to initializer tensors of MultiThreshold operations - if node.op_type == "MultiThreshold": - # Try to get the thresholds initializer tensor + op_type = node.op_type + if op_type == "MultiThreshold" or op_type.startswith("Thresholding"): thresholds = model.get_initializer(node.input[1]) - # There might be no constant thresholds stored as initializer - # tensor inside the model if thresholds is None: - # Nothing we can do, skip to the next node continue - # Get the data type of the inputs to this operation dtype = model.get_tensor_datatype(node.input[0]) # This transformation only applies to thresholding operations # operating on integer inputs if not dtype.is_integer(): - # Nothing we can do, skip to the next node continue # Round thresholds up to nearest integer and clip thresholds # outside the input range @@ -80,24 +63,24 @@ def apply(self, model: ModelWrapper): # noqa # introduce extra inaccuracies due to large integers not being # exactly representable in floating-point representation. # See for example: np.ceil(np.float32(16777217)) == 16777216 - # fmt: off - new_thresholds = np.clip( - np.ceil(thresholds), dtype.min(), dtype.max() - ) - # fmt: on + new_thresholds = np.clip(np.ceil(thresholds), dtype.min(), dtype.max() + 1) # Convert back to the preferred float32 container type - # Note: np.clip might have promoted the thresholds to float64 - # TODO: Maybe consider an int64 container type for thresholds - # rounded to integer? Need to check all other transformations - # and code generation through the whole FINN and QONNX stack - # first, as these probably assume a float32 container type. 
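The clipping bound changes from dtype.max() to dtype.max() + 1 so that a threshold lying strictly above the representable input range keeps meaning "never fires" instead of collapsing onto dtype.max(), where an input equal to the maximum would still trigger it; this is also why the thresholds may need one extra bit, handled a few lines further down. A small worked example, assuming an INT8 input datatype with range -128..127:

    import numpy as np

    thresholds = np.array([-300.2, 0.4, 127.3, 500.0])
    int8_min, int8_max = -128, 127

    # old: clip to max -> the last two thresholds land on 127 and fire for x == 127
    old = np.clip(np.ceil(thresholds), int8_min, int8_max)      # [-128., 1., 127., 127.]
    # new: clip to max + 1 -> they stay at 128 and can never fire
    new = np.clip(np.ceil(thresholds), int8_min, int8_max + 1)  # [-128., 1., 128., 128.]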
new_thresholds = new_thresholds.astype(np.float32) # Insert the rounded and clipped thresholds back into the model model.set_initializer(node.input[1], new_thresholds) - # The rounded and clipped thresholds now fit into the input data - # type - model.set_tensor_datatype(node.input[1], dtype) - # Test whether the new thresholds actually differ from the old + # The rounded and clipped thresholds now fit into a data type + # that is one bit bigger than the input datatype + # Determine new max_value + max_val = dtype.max() + 1 + if not dtype.signed(): + tdt = DataType.get_smallest_possible(max_val) + else: + tdt = DataType.get_smallest_possible(-(max_val) - 1) + model.set_tensor_datatype(node.input[1], tdt) + # If hw op we need to set the weight data type attribute as well + if op_type.startswith("Thresholding"): + inst = getCustomOp(node) + inst.set_nodeattr("weightDataType", tdt.name) # ones if np.any(new_thresholds != thresholds): # Track the graph has been modified to inform the transform @@ -107,9 +90,5 @@ def apply(self, model: ModelWrapper): # noqa # Immediately exit here to propagate the data type changes # before considering the next node break - # Some data types might have changed, do one pass of data type inference - # to propagate these changes through the graph model = model.transform(InferDataTypes()) - # Return the transformed model and indicate whether the graph actually - # has been transformed to exhaustively apply this transformation again. return model, graph_modified diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 1995d9f06a..5eb72194ea 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -81,6 +81,7 @@ part_map = {**pynq_part_map, **alveo_part_map} part_map["VEK280"] = "xcve2802-vsvh1760-2MP-e-S" part_map["VCK190"] = "xcvc1902-vsva2197-2MP-e-S" +part_map["V80"] = "xcv80-lsva4737-2MHP-e-s" def get_rtlsim_trace_depth(): @@ -288,3 +289,20 @@ def memutil(req_mem_spec, primitive_spec): eff = (req_width * req_depth) / (count * prim_width * prim_depth) waste = (count * prim_width * prim_depth) - (req_width * req_depth) return (count, eff, waste) + + +def is_versal(fpgapart): + """Returns whether board is part of the Versal family""" + return fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpgapart[0:5] in [ + "xqrvc", + "xcv80", + ] + + +def get_dsp_block(fpgapart): + if is_versal(fpgapart): + return "DSP58" + elif fpgapart[2] == "7": + return "DSP48E1" + else: + return "DSP48E2" diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py index 7698850029..6a72d38058 100644 --- a/src/finn/util/data_packing.py +++ b/src/finn/util/data_packing.py @@ -295,9 +295,14 @@ def npy_to_rtlsim_input(input_file, input_dtype, pad_to_nbits, reverse_inner=Tru inp = np.load(input_file) else: raise Exception("input_file must be ndarray or filename for .npy") - if inp.shape[-1] == 1 and input_dtype.is_integer(): + if ( + inp.shape[-1] == 1 + and input_dtype.is_integer() + and input_dtype.get_canonical_name() != "BIPOLAR" + ): + mask = (1 << input_dtype.bitwidth()) - 1 packed_data = inp.flatten().astype(input_dtype.to_numpy_dt()) - packed_data = [int(x) for x in packed_data] + packed_data = [int(x) & mask for x in packed_data] else: packed_data = pack_innermost_dim_as_hex_string( inp, input_dtype, pad_to_nbits, reverse_inner=reverse_inner diff --git a/src/finn/util/fpgadataflow.py b/src/finn/util/fpgadataflow.py index 3d3d343cd4..aae438fac2 100644 --- a/src/finn/util/fpgadataflow.py +++ b/src/finn/util/fpgadataflow.py @@ 
-69,11 +69,3 @@ def is_rtl_node(node): is_node = True return is_node - - -def is_versal(fpgapart): - """Returns whether board is part of the Versal family""" - return ( - fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] - or fpgapart[0:5] == "xqrvc" - ) diff --git a/tests/brevitas/test_brevitas_debug.py b/tests/brevitas/test_brevitas_debug.py index d6879a727b..3d059a6856 100644 --- a/tests/brevitas/test_brevitas_debug.py +++ b/tests/brevitas/test_brevitas_debug.py @@ -35,6 +35,7 @@ import os import torch from brevitas.export import export_qonnx +from brevitas.quant_tensor import _unpack_quant_tensor from pkgutil import get_data from qonnx.core.modelwrapper import ModelWrapper from qonnx.util.cleanup import cleanup as qonnx_cleanup @@ -90,7 +91,7 @@ def test_brevitas_debug(QONNX_FINN_conversion): else: assert len(names_common) == 8 for dbg_name in names_common: - tensor_pytorch = dbg_hook.values[dbg_name].value.detach().numpy() + tensor_pytorch = _unpack_quant_tensor(dbg_hook.values[dbg_name]).detach().numpy() tensor_finn = output_dict[dbg_name] assert np.isclose(tensor_finn, tensor_pytorch, atol=1e-5).all() os.remove(finn_onnx) diff --git a/tests/brevitas/test_brevitas_fc.py b/tests/brevitas/test_brevitas_fc.py index 842d099f57..a7a73a5ed4 100644 --- a/tests/brevitas/test_brevitas_fc.py +++ b/tests/brevitas/test_brevitas_fc.py @@ -45,8 +45,6 @@ from finn.util.basic import make_build_dir from finn.util.test import get_test_model_trained -export_onnx_path = make_build_dir("test_brevitas_fc_") - @pytest.mark.brevitas_export # act bits @@ -61,6 +59,7 @@ def test_brevitas_fc_onnx_export_and_exec(size, wbits, abits): if wbits > abits: pytest.skip("No wbits > abits cases at the moment") nname = "%s_%dW%dA" % (size, wbits, abits) + export_onnx_path = make_build_dir("test_brevitas_fc_") finn_onnx = export_onnx_path + "/%s.onnx" % nname fc = get_test_model_trained(size, wbits, abits) ishape = (1, 1, 28, 28) diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 556ba1d187..0d3418624a 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -94,8 +94,8 @@ MakeMaxPoolNHWC, MoveScalarLinearPastInvariants, ) +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.util.basic import get_finn_root, make_build_dir, test_board_map -from finn.util.fpgadataflow import is_fpgadataflow_node from finn.util.pytorch import ToTensor from finn.util.test import ( execute_parent, @@ -131,12 +131,12 @@ def fold_tfc(model): fcl_inst.set_nodeattr("SIMD", simd) fcl_inst.set_nodeattr("ram_style", ramstyle) fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") + fcl_inst.set_nodeattr("resType", "lut") # set parallelism for input quantizer to be same as first layer's SIMD inp_qnt_node = model.get_nodes_by_op_type("Thresholding_rtl")[0] inp_qnt = getCustomOp(inp_qnt_node) inp_qnt.set_nodeattr("PE", 49) - # TODO: update PYNQ driver to support runtime writeable weights for RTL Thresholding - # inp_qnt.set_nodeattr("runtime_writeable_weights", 1) + inp_qnt.set_nodeattr("runtime_writeable_weights", 1) return model @@ -156,6 +156,7 @@ def fold_lfc(model): fcl_inst.set_nodeattr("ram_style", ramstyle) fcl_inst.set_nodeattr("runtime_writeable_weights", 1) fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") + fcl_inst.set_nodeattr("resType", "lut") # set parallelism for input quantizer to be same as first layer's SIMD inp_qnt_node = model.get_nodes_by_op_type("Thresholding_rtl")[0] inp_qnt 
= getCustomOp(inp_qnt_node) @@ -182,12 +183,14 @@ def fold_cnv_large(model): fcl_inst.set_nodeattr("PE", pe) fcl_inst.set_nodeattr("SIMD", simd) fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") + fcl_inst.set_nodeattr("resType", "lut") - swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") + swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl") for i in range(len(swg_layers)): swg_inst = getCustomOp(swg_layers[i]) simd = folding[i][1] swg_inst.set_nodeattr("SIMD", simd) + swg_inst.set_nodeattr("ram_style", "distributed") return model @@ -198,11 +201,11 @@ def fold_cnv_small(model): (8, 3, "distributed"), (16, 16, "distributed"), (8, 16, "auto"), - (8, 16, "block"), + (8, 16, "distributed"), (4, 8, "auto"), (1, 8, "auto"), - (1, 2, "distributed"), - (2, 2, "block"), + (1, 2, "block"), + (2, 2, "auto"), (5, 1, "distributed"), ] for fcl, (pe, simd, ramstyle) in zip(fc_layers, folding): @@ -211,12 +214,18 @@ def fold_cnv_small(model): fcl_inst.set_nodeattr("SIMD", simd) fcl_inst.set_nodeattr("ram_style", ramstyle) fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") + fcl_inst.set_nodeattr("resType", "lut") - swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") + swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl") for i in range(len(swg_layers)): swg_inst = getCustomOp(swg_layers[i]) simd = folding[i][1] swg_inst.set_nodeattr("SIMD", simd) + swg_inst.set_nodeattr("ram_style", "distributed") + inp_qnt_node = model.get_nodes_by_op_type("Thresholding_rtl")[0] + inp_qnt = getCustomOp(inp_qnt_node) + inp_qnt.set_nodeattr("depth_trigger_uram", 32000) + inp_qnt.set_nodeattr("depth_trigger_bram", 32000) return model @@ -597,16 +606,10 @@ def test_convert_to_hw_layers(self, topology, wbits, abits, board): assert len(model.get_nodes_by_op_type(op_type)) == exp_count def test_specialize_layers(self, topology, wbits, abits, board): + build_data = get_build_env(board, target_clk_ns) prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "convert_to_hw_layers") model = load_test_checkpoint_or_skip(prev_chkpt_name) - # set preferred impl style to hls for all layers - force_hls_boards = ["Pynq-Z1", "U250"] - if topology == "cnv" and wbits == 2 and abits == 2 and board in force_hls_boards: - for node in model.graph.node: - if is_fpgadataflow_node(node): - inst = getCustomOp(node) - inst.set_nodeattr("preferred_impl_style", "hls") - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(build_data["part"])) model = model.transform(GiveUniqueNodeNames()) model.save(get_checkpoint_name(topology, wbits, abits, "specialize_layers")) exp_layer_counts = { @@ -636,19 +639,9 @@ def test_specialize_layers(self, topology, wbits, abits, board): ("StreamingMaxPool_hls", 2), ("LabelSelect_hls", 1), ], - "cnv-2-2": [ - ("Transpose", 1), - ("Thresholding_hls", 1), - ("ConvolutionInputGenerator_hls", 6), - ("MVAU_hls", 9), - ("StreamingMaxPool_hls", 2), - ("LabelSelect_hls", 1), - ], } if topology == "tfc" and wbits == 1 and abits == 1: exp_key = "tfc-1-1" - elif topology == "cnv" and wbits == 2 and abits == 2 and board in force_hls_boards: - exp_key = "cnv-2-2" else: exp_key = topology exp_layer_counts = exp_layer_counts[exp_key] @@ -680,6 +673,7 @@ def test_minimize_bit_width(self, topology, wbits, abits, board): model = load_test_checkpoint_or_skip(prev_chkpt_name) model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(MinimizeWeightBitWidth()) + model = 
model.transform(RoundAndClipThresholds()) curr_chkpt_name = get_checkpoint_name(topology, wbits, abits, "minimize_bit_width") model.save(curr_chkpt_name) @@ -706,7 +700,7 @@ def test_ipgen(self, topology, wbits, abits, board): build_data = get_build_env(board, target_clk_ns) if build_data["kind"] == "alveo" and ("VITIS_PATH" not in os.environ): pytest.skip("VITIS_PATH not set") - prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "fold") + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "minimize_bit_width") model = load_test_checkpoint_or_skip(prev_chkpt_name) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(build_data["part"], target_clk_ns)) @@ -719,8 +713,8 @@ def test_set_fifo_depths(self, topology, wbits, abits, board): prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + board) model = load_test_checkpoint_or_skip(prev_chkpt_name) test_fpga_part = get_build_env(board, target_clk_ns)["part"] - if topology == "cnv" and wbits == 2 and abits == 2 and board == "Pynq-Z1": - # Enabling swg_exception for this single test case. Disabling the exception results in + if topology == "cnv" and abits == 2 and board == "Pynq-Z1": + # Enabling swg_exception for these test cases. Disabling the exception results in # a design that exceeds the resources of the Pynq-Z1 board. In future this should be # revisited and handled correctly as the swg_exception is poorly justified. model = model.transform( @@ -740,7 +734,7 @@ def test_ipstitch_rtlsim(self, topology, wbits, abits, board): model = load_test_checkpoint_or_skip(prev_chkpt_name) test_fpga_part = get_build_env(board, target_clk_ns)["part"] model = model.transform(InsertDWC()) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(AnnotateCycles()) perf = model.analysis(dataflow_performance) @@ -807,7 +801,7 @@ def test_build(self, topology, wbits, abits, board): prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "fifodepth_" + board) model = load_test_checkpoint_or_skip(prev_chkpt_name) model = model.transform(build_data["build_fxn"]) - model = model.transform(AnnotateResources("synth")) + model = model.transform(AnnotateResources("synth", build_data["part"])) model.save(get_checkpoint_name(topology, wbits, abits, "build_" + board)) @pytest.mark.slow diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py index cbf89c2eae..4c52277970 100644 --- a/tests/end2end/test_end2end_mobilenet_v1.py +++ b/tests/end2end/test_end2end_mobilenet_v1.py @@ -58,11 +58,15 @@ import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb import finn.transformation.streamline.reorder as reorder +from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.core.onnx_exec import execute_onnx +from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.create_dataflow_partition import ( CreateDataflowPartition, ) +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.minimize_accumulator_width import ( MinimizeAccumulatorWidth, ) @@ -70,14 +74,22 @@ MinimizeWeightBitWidth, ) 
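A recurring change in this and the following test files is that SpecializeLayers now takes the target FPGA part, so the HW-to-HLS/RTL specialization can make part-aware choices (for example DSP58-based RTL variants on Versal, per the new get_dsp_block helper). A hedged usage sketch with the Zynq-7020 part used by most of the unit tests below:

    from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers

    test_fpga_part = "xc7z020clg400-1"
    model = model.transform(SpecializeLayers(test_fpga_part))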
from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.set_fifo_depths import ( + InsertAndSetFIFODepths, + RemoveShallowFIFOs, + SplitLargeFIFOs, +) from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline from finn.transformation.streamline.collapse_repeated import CollapseRepeatedMul from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds -from finn.util.basic import alveo_default_platform, alveo_part_map, get_finn_root +from finn.util.basic import get_finn_root from finn.util.pytorch import NormalizePreProc +from finn.util.pyverilator import verilator_fifosim from finn.util.test import ( crop_center, get_test_model_trained, @@ -87,11 +99,9 @@ build_dir = os.environ["FINN_BUILD_DIR"] -test_board = "U250" -test_platform = alveo_default_platform[test_board] -test_fpga_part = alveo_part_map[test_board] +# Select Versal device such that RTL VVU (i.e. DSP58) can be enabled +fpga_part = "xcvm1802-vsvd1760-2MP-e-S" target_clk_ns = 3 -large_fifo_ram_style = "ultra" extra_fold = 1 first_layer_res_type = "dsp" @@ -218,7 +228,6 @@ def test_end2end_mobilenet_lowering(): @pytest.mark.end2end -@pytest.mark.xfail def test_end2end_mobilenet_convert_to_hw_layers(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_lowered.onnx") model = model.transform(to_hw.InferPool()) @@ -237,38 +246,50 @@ def test_end2end_mobilenet_convert_to_hw_layers(): @pytest.mark.end2end def test_end2end_mobilenet_specialize_layers(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_hw_layers.onnx") - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(fpga_part)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) model.save(build_dir + "/end2end_mobilenet_specialize_layers.onnx") @pytest.mark.end2end -def test_end2end_mobilenet_folding(): +def test_end2end_mobilenet_create_dataflow_partition(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_specialize_layers.onnx") + parent_model = model.transform(CreateDataflowPartition()) + parent_model.save(build_dir + "/end2end_mobilenet_dataflow_parent.onnx") + sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] + sdp_node = getCustomOp(sdp_node) + dataflow_model_filename = sdp_node.get_nodeattr("model") + dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename) + dataflow_model = dataflow_model.transform(RemoveUnusedTensors()) + dataflow_model.save(build_dir + "/end2end_mobilenet_dataflow_model.onnx") + + +@pytest.mark.end2end +def test_end2end_mobilenet_folding(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_dataflow_model.onnx") # optional extra folding to use fewer resources # applied while setting the attributes on each node assert extra_fold in [1, 2, 4] # set up folding for the conv layers impl'd by MVAUs # each value is PE for a layer - fc_layers = model.get_nodes_by_op_type("MVAU_hls") - fc_layers += model.get_nodes_by_op_type("MVAU_rtl") + fc_layers = model.get_nodes_by_op_type("MVAU_rtl") # each tuple is (PE, SIMD, ram_style) 
for a layer folding = [ - (32, 3, "block"), + (16, 3, "block"), + (8, 16, "distributed"), + (8, 16, "distributed"), + (16, 16, "distributed"), + (8, 16, "distributed"), + (16, 16, "distributed"), + (8, 16, "block"), + (16, 16, "block"), (16, 16, "block"), (16, 16, "block"), - (32, 16, "block"), (16, 16, "block"), - (32, 16, "block"), (16, 16, "block"), - (32, 16, "block"), - (32, 16, "block"), - (32, 16, "block"), - (32, 16, "block"), - (32, 16, "block"), + (8, 16, "block"), (16, 16, "block"), - (32, 16, "block"), (4, 4, "block"), ] for fcl, (pe, simd, ramstyle) in zip(fc_layers, folding): @@ -276,26 +297,46 @@ def test_end2end_mobilenet_folding(): fcl_inst.set_nodeattr("PE", pe // extra_fold) fcl_inst.set_nodeattr("SIMD", simd) fcl_inst.set_nodeattr("ram_style", ramstyle) - # first layer uses 8-bit weights & activations - # control its compute resource type explicitly - getCustomOp(fc_layers[0]).set_nodeattr("resType", first_layer_res_type) # set up folding for the depthwise conv layers impl'd by VVAUs # each value is PE for a layer - vvau_layers = model.get_nodes_by_op_type("VVAU_hls") - vvau_layers += model.get_nodes_by_op_type("VVAU_rtl") - folding = [32, 32, 64, 16, 32, 8, 16, 16, 16, 16, 16, 4, 8] - for vvau, pe in zip(vvau_layers, folding): + vvau_layers = model.get_nodes_by_op_type("VVAU_rtl") + pe_simd_fold = [ + [16, 3], + [8, 3], + [16, 3], + [4, 3], + [8, 3], + [2, 3], + [4, 3], + [4, 3], + [4, 3], + [4, 3], + [4, 3], + [1, 3], + [2, 3], + ] + for vvau, pe_simd in zip(vvau_layers, pe_simd_fold): + pe, simd = pe_simd vvau_inst = getCustomOp(vvau) vvau_inst.set_nodeattr("PE", pe // extra_fold) + vvau_inst.set_nodeattr("SIMD", simd) # set SIMD in preceeding ConvInputGen to same value convinputgen = model.find_direct_predecessors(vvau)[0] convinputgen_inst = getCustomOp(convinputgen) convinputgen_inst.set_nodeattr("SIMD", pe // extra_fold) + # Enable parallel_window mode for SIMD parallelism VVU + convinputgen_inst.set_nodeattr("parallel_window", 1) # set SIMD in preceeding FMPadding to same value padding = model.find_direct_predecessors(convinputgen)[0] - if padding.op_type == "FMPadding_hls": + if padding.op_type == "FMPadding_rtl": padding_inst = getCustomOp(padding) padding_inst.set_nodeattr("SIMD", pe // extra_fold) + # Set folding Thresholding layers + thresholding_layers = model.get_nodes_by_op_type("Thresholding_rtl") + folding = [2, 2, 4, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + for thresholding, pe in zip(thresholding_layers, folding): + thresholding_inst = getCustomOp(thresholding) + thresholding_inst.set_nodeattr("PE", pe) # adjust final pooling layer + its inpgen pool_node = model.get_nodes_by_op_type("Pool_hls")[0] pool_inst = getCustomOp(pool_node) @@ -312,29 +353,17 @@ def test_end2end_mobilenet_minimize_bit_width(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_folded.onnx") model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(MinimizeWeightBitWidth()) - model = model.save(build_dir + "/end2end_mobilenet_minimize_bitwidth.onnx") - - -@pytest.mark.end2end -def test_end2end_mobilenet_create_dataflow_partition(): - model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_minimize_bitwidth.onnx") - parent_model = model.transform(CreateDataflowPartition()) - parent_model.save(build_dir + "/end2end_mobilenet_dataflow_parent.onnx") - sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] - sdp_node = getCustomOp(sdp_node) - dataflow_model_filename = 
sdp_node.get_nodeattr("model") - dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename) - dataflow_model = dataflow_model.transform(RemoveUnusedTensors()) - dataflow_model.save(build_dir + "/end2end_mobilenet_dataflow_model.onnx") + model = model.transform(RoundAndClipThresholds()) + model.save(build_dir + "/end2end_mobilenet_minimize_bitwidth.onnx") @pytest.mark.slow @pytest.mark.vivado @pytest.mark.end2end -@pytest.mark.xfail def test_end2end_mobilenet_cppsim(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_minimize_bitwidth.onnx") x = np.load(build_dir + "/end2end_mobilenet_input.npy") + x = x.transpose(0, 2, 3, 1) # Convert NCHW to NHWC inp_name = model.graph.input[0].name out_name = model.graph.output[0].name inp_dict = {inp_name: x} @@ -358,7 +387,159 @@ def test_end2end_mobilenet_cppsim(): # check result with golden values golden = np.load(build_dir + "/end2end_mobilenet_golden_top5.npy") - golden_prob = np.load(build_dir + "/end2end_mobilenet_golden_top5_prob.npy") + # golden_prob = np.load(build_dir + "/end2end_mobilenet_golden_top5_prob.npy") assert (golden == res_cppsim).all() - assert np.isclose(golden_prob, res_cppsim_prob).all() + # assert np.isclose(golden_prob, res_cppsim_prob[0, 0, 0, :5]).all() + + +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.end2end +def test_end2end_mobilenet_ipgen(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_minimize_bitwidth.onnx") + model = model.transform(PrepareIP(fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model.save(build_dir + "/end2end_mobilenet_hw_ipgen.onnx") + + +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.end2end +def test_end2end_mobilenet_rtlsim(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_hw_ipgen.onnx") + # use critical path estimate to set rtlsim liveness threshold + # (very conservative) + model = model.transform(AnnotateCycles()) + estimate_network_performance = model.analysis(dataflow_performance) + os.environ["LIVENESS_THRESHOLD"] = str( + int(estimate_network_performance["critical_path_cycles"]) + ) + x = np.load(build_dir + "/end2end_mobilenet_input.npy") + x = x.transpose(0, 2, 3, 1) # Convert NCHW to NHWC + inp_name = model.graph.input[0].name + out_name = model.graph.output[0].name + inp_dict = {inp_name: x} + # rtlsim + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareRTLSim()) + model.save(build_dir + "/end2end_mobilenet_rtlsim.onnx") + ret_rtlsim = execute_onnx(model, inp_dict, True) + res_rtlsim = ret_rtlsim[out_name] + np.save(build_dir + "/end2end_mobilenet_result_rtlsim.npy", res_rtlsim) + a0 = np.load(build_dir + "/end2end_mobilenet_topk_scale.npy") + res_rtlsim_prob = ret_rtlsim[model.graph.node[-2].output[0]] * a0 + np.save(build_dir + "/end2end_mobilenet_result_rtlsim_prob.npy", res_rtlsim_prob) + + # check result with golden values + golden = np.load(build_dir + "/end2end_mobilenet_golden_top5.npy") + # golden_prob = np.load(build_dir + "/end2end_mobilenet_golden_top5_prob.npy") + + assert (golden == res_rtlsim).all() + # assert np.isclose(golden_prob, res_rtlsim_prob[0, 0, 0, :5]).all() + + +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.end2end +def test_end2end_mobilenet_set_fifo_depths(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_hw_ipgen.onnx") + model = model.transform( + InsertAndSetFIFODepths( + fpga_part, + target_clk_ns, + swg_exception=False, + vivado_ram_style="auto", + force_python_sim=False, + ) + ) 
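Both the cppsim and rtlsim MobileNet tests above now transpose the stored input from NCHW to NHWC before feeding it to the model, presumably because the executed dataflow partition expects channels-last input. A tiny sketch of that conversion, with an illustrative input shape:

    import numpy as np

    x_nchw = np.random.rand(1, 3, 224, 224).astype(np.float32)
    x_nhwc = x_nchw.transpose(0, 2, 3, 1)   # shape (1, 224, 224, 3), channels-last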
+ # perform FIFO splitting and shallow FIFO removal only after the final config + # json file has been written. otherwise, since these transforms may add/remove + # FIFOs, we get name mismatch problems when trying to reuse the final config. + model = model.transform(SplitLargeFIFOs()) + model = model.transform(RemoveShallowFIFOs()) + # after FIFOs are ready to go, call PrepareIP and HLSSynthIP again + # this will only run for the new nodes (e.g. FIFOs and DWCs) + model = model.transform(PrepareIP(fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model.save(build_dir + "/end2end_mobilenet_set_fifo_depths.onnx") + + +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.end2end +def test_end2end_mobilenet_stitched_ip(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_set_fifo_depths.onnx") + model = model.transform( + CreateStitchedIP( + fpga_part, + target_clk_ns, + vitis=False, + signature=None, + ) + ) + model.save(build_dir + "/end2end_mobilenet_stitched_ip.onnx") + + +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.end2end +def test_end2end_mobilenet_stitched_ip_rtlsim(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_stitched_ip.onnx") + # use critical path estimate to set rtlsim liveness threshold + # (very conservative) + model = model.transform(AnnotateCycles()) + estimate_network_performance = model.analysis(dataflow_performance) + os.environ["LIVENESS_THRESHOLD"] = str( + int(estimate_network_performance["critical_path_cycles"]) + ) + # Prepare input + x = np.load(build_dir + "/end2end_mobilenet_input.npy") + x = x.transpose(0, 2, 3, 1) # Convert NCHW to NHWC + inp_name = model.graph.input[0].name + out_name = model.graph.output[0].name + inp_dict = {inp_name: x} + + # set top-level prop for stitched-ip rtlsim and launch + model.set_metadata_prop("exec_mode", "rtlsim") + ret_rtlsim_ip = execute_onnx(model, inp_dict, True) + res_rtlsim_ip = ret_rtlsim_ip[out_name] + np.save(build_dir + "/end2end_mobilenet_result_rtlsim_ip.npy", res_rtlsim_ip) + a0 = np.load(build_dir + "/end2end_mobilenet_topk_scale.npy") + res_rtlsim_ip_prob = ret_rtlsim_ip[model.graph.node[-2].output[0]] * a0 + np.save(build_dir + "/end2end_mobilenet_result_cppsim_prob.npy", res_rtlsim_ip_prob) + + # check result with golden values + golden = np.load(build_dir + "/end2end_mobilenet_golden_top5.npy") + # golden_prob = np.load(build_dir + "/end2end_mobilenet_golden_top5_prob.npy") + + assert (golden == res_rtlsim_ip).all() + # assert np.isclose(golden_prob, res_rtlsim_ip_prob[0, 0, 0, :5]).all() + + +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.end2end +def test_end2end_mobilenet_rtlsim_performance(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_stitched_ip.onnx") + report_dir = build_dir + "/report" + os.makedirs(report_dir, exist_ok=True) + # multi-in/out streams currently not supported in our C++ verilator driver + rtlsim_bs = 1 + + rtlsim_perf_dict = verilator_fifosim(model, rtlsim_bs) + # keep keys consistent between the Python and C++-styles + cycles = rtlsim_perf_dict["cycles"] + clk_ns = float(model.get_metadata_prop("clk_ns")) + fclk_mhz = 1 / (clk_ns * 0.001) + runtime_s = (cycles * clk_ns) * (10**-9) + rtlsim_perf_dict["runtime[ms]"] = runtime_s * 1000 + rtlsim_perf_dict["throughput[images/s]"] = rtlsim_bs / runtime_s + rtlsim_perf_dict["fclk[mhz]"] = fclk_mhz + for key, val in rtlsim_perf_dict.items(): + if "max_count" in key: + del rtlsim_perf_dict[key] + # estimate stable-state throughput based on 
latency+throughput + rtlsim_perf_dict["stable_throughput[images/s]"] = rtlsim_perf_dict["throughput[images/s]"] + + model.save(build_dir + "/end2end_mobilenet_rtlsim_performance.onnx") diff --git a/tests/end2end/test_ext_weights.py b/tests/end2end/test_ext_weights.py index bac343bedf..29d2f58e66 100644 --- a/tests/end2end/test_ext_weights.py +++ b/tests/end2end/test_ext_weights.py @@ -60,7 +60,7 @@ def get_checkpoint_name(step): # checkpoint for build step is an entire dir return build_dir + "/end2end_ext_weights_build" elif step == "download": - return onnx_dir_local + "/tfc-w1a1.onnx" + return onnx_dir_local + "/tfc-w2a2.onnx" else: # other checkpoints are onnx files return build_dir + "/end2end_ext_weights_%s.onnx" % (step) @@ -82,14 +82,17 @@ def test_end2end_ext_weights_build(): model_file = get_checkpoint_name("download") load_test_checkpoint_or_skip(model_file) test_data = os.environ["FINN_ROOT"] + "/src/finn/qnn-data/test_ext_weights" - folding_config_file = test_data + "/tfc-w1a1-extw.json" + folding_config_file = test_data + "/tfc-w2a2-extw.json" + specialize_layers_config_file = test_data + "/specialize_layers_config.json" output_dir = make_build_dir("test_end2end_ext_weights_build") cfg = build.DataflowBuildConfig( output_dir=output_dir, verbose=True, + standalone_thresholds=True, folding_config_file=folding_config_file, + specialize_layers_config_file=specialize_layers_config_file, synth_clk_period_ns=target_clk_ns, - board="Pynq-Z1", + board="ZCU104", shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, generate_outputs=[ build_cfg.DataflowOutputType.ESTIMATE_REPORTS, diff --git a/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py index c5d0281203..6d3929109f 100644 --- a/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py @@ -143,10 +143,10 @@ def test_convert_to_hw_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_m inst.set_nodeattr("preferred_impl_style", "hls") if depthwise is True: new_model = new_model.transform(to_hw.InferVectorVectorActivation()) - new_model = new_model.transform(SpecializeLayers()) + new_model = new_model.transform(SpecializeLayers("xc7z020clg400-1")) else: new_model = new_model.transform(to_hw.InferQuantizedMatrixVectorActivation()) - new_model = new_model.transform(SpecializeLayers()) + new_model = new_model.transform(SpecializeLayers("xc7z020clg400-1")) # set folding parameters for MVAU if new_model.get_nodes_by_op_type("MVAU_hls"): fc_node = new_model.get_nodes_by_op_type("MVAU_hls")[0] diff --git a/tests/fpgadataflow/test_convert_to_hw_channelwise_layer.py b/tests/fpgadataflow/test_convert_to_hw_channelwise_layer.py index 4b063f8505..ac02008ff2 100644 --- a/tests/fpgadataflow/test_convert_to_hw_channelwise_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_channelwise_layer.py @@ -121,7 +121,7 @@ def test_convert_to_hw_channelwise_layer(pdt, idt, onnx_op_name, scalar_param, e assert (y_produced == y_expected).all() assert model.graph.node[1].op_type == "ChannelwiseOp" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) diff --git a/tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py b/tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py index f7b3c55c2a..f9b5dff56c 100755 --- a/tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py +++ 
b/tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py @@ -204,7 +204,7 @@ def test_convert_to_hw_conv_fc_transition(conv_config, depthwise, use_reshape): if is_fpgadataflow_node(node): inst = getCustomOp(node) inst.set_nodeattr("preferred_impl_style", "hls") - new_model = new_model.transform(SpecializeLayers()) + new_model = new_model.transform(SpecializeLayers("xc7z020clg400-1")) new_model = new_model.transform(GiveUniqueNodeNames()) new_model = new_model.transform(InferDataLayouts()) diff --git a/tests/fpgadataflow/test_convert_to_hw_conv_layer.py b/tests/fpgadataflow/test_convert_to_hw_conv_layer.py index 61f8af7806..122997e412 100644 --- a/tests/fpgadataflow/test_convert_to_hw_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_conv_layer.py @@ -131,10 +131,10 @@ def test_convert_to_hw_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode inst.set_nodeattr("preferred_impl_style", "hls") if depthwise is True: new_model = new_model.transform(to_hw.InferVectorVectorActivation()) - new_model = new_model.transform(SpecializeLayers()) + new_model = new_model.transform(SpecializeLayers("xc7z020clg400-1")) else: new_model = new_model.transform(to_hw.InferQuantizedMatrixVectorActivation()) - new_model = new_model.transform(SpecializeLayers()) + new_model = new_model.transform(SpecializeLayers("xc7z020clg400-1")) # set folding parameters for MVAU if new_model.get_nodes_by_op_type("MVAU_hls"): fc_node = new_model.get_nodes_by_op_type("MVAU_hls")[0] diff --git a/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py index 71f383ca23..4b8668c7b3 100644 --- a/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py @@ -111,7 +111,7 @@ def test_convert_to_hw_layers_cnv_w1a1(fused_activation): if is_fpgadataflow_node(node): inst = getCustomOp(node) inst.set_nodeattr("preferred_impl_style", "hls") - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) for node in model.graph.node: if node.op_type == "MVAU_hls": inst = getCustomOp(node) diff --git a/tests/fpgadataflow/test_convert_to_hw_layers_fc.py b/tests/fpgadataflow/test_convert_to_hw_layers_fc.py index 746ded9074..94fafae6b7 100644 --- a/tests/fpgadataflow/test_convert_to_hw_layers_fc.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_fc.py @@ -82,7 +82,7 @@ def test_convert_to_hw_layers_tfc_w1a1(): model = model.transform(absorb.AbsorbMulIntoMultiThreshold()) model = model.transform(RoundAndClipThresholds()) model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) fc0 = model.graph.node[2] assert fc0.op_type.startswith("MVAU") assert model.get_tensor_shape(fc0.input[0]) == [1, 784] @@ -154,7 +154,7 @@ def test_convert_to_hw_layers_tfc_w1a2(): model = model.transform(GiveReadableTensorNames()) model = model.transform(Streamline()) model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) fc0 = model.graph.node[2] assert fc0.op_type.startswith("MVAU") diff --git a/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py b/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py index 6c83f10617..6a22f39cdc 100644 --- a/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py +++ 
b/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py @@ -210,7 +210,7 @@ def test_convert_to_hw_layers_synthetic(ch, ifmdim, idt): output_hw = oxe.execute_onnx(model, input_dict, True) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) # check topology status diff --git a/tests/fpgadataflow/test_convert_to_hw_pool_batch.py b/tests/fpgadataflow/test_convert_to_hw_pool_batch.py index d532cf345e..e155053b8b 100644 --- a/tests/fpgadataflow/test_convert_to_hw_pool_batch.py +++ b/tests/fpgadataflow/test_convert_to_hw_pool_batch.py @@ -186,7 +186,7 @@ def test_convert_to_hw_pool(idt, odt, pool_config, ifm_ch, pe, op_type, exec_mod inst.set_nodeattr("preferred_impl_style", "hls") y_produced = oxe.execute_onnx(new_model, input_dict)["outp"] assert (y_produced == y_expected).all() - new_model = new_model.transform(SpecializeLayers()) + new_model = new_model.transform(SpecializeLayers("xc7z020clg400-1")) # Folding for n in new_model.graph.node: diff --git a/tests/fpgadataflow/test_convert_to_hw_thresholding.py b/tests/fpgadataflow/test_convert_to_hw_thresholding.py deleted file mode 100755 index 63cb5986e1..0000000000 --- a/tests/fpgadataflow/test_convert_to_hw_thresholding.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright (C) 2024, Advanced Micro Devices, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
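Stepping back to test_end2end_mobilenet_rtlsim_performance above: throughput is derived from the verilator cycle count and the clock period. A worked example of that arithmetic with hypothetical numbers, plus a safer way to drop the FIFO max_count entries (deleting keys from a dict while iterating over dict.items() raises a RuntimeError in Python 3 once an entry is actually removed):

    rtlsim_bs = 1
    cycles = 250_000                      # hypothetical cycle count from verilator_fifosim
    clk_ns = 3.0                          # clock period in ns
    runtime_s = cycles * clk_ns * 1e-9    # 0.00075 s
    throughput = rtlsim_bs / runtime_s    # ~1333 images/s
    fclk_mhz = 1 / (clk_ns * 0.001)       # ~333.3 MHz

    perf = {"cycles": cycles, "fifo0_max_count": 42}   # illustrative dict contents
    for key in list(perf.keys()):          # snapshot the keys before deleting any
        if "max_count" in key:
            del perf[key]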
- -import pytest - -import numpy as np -from onnx import TensorProto, helper -from qonnx.core.datatype import DataType -from qonnx.core.modelwrapper import ModelWrapper -from qonnx.custom_op.general.multithreshold import multithreshold -from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.transformation.infer_datatypes import InferDataTypes -from qonnx.transformation.infer_shapes import InferShapes -from qonnx.util.basic import gen_finn_dt_tensor - -import finn.core.onnx_exec as oxe -from finn.transformation.fpgadataflow.convert_to_hw_layers import InferThresholdingLayer -from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers - -test_fpga_part = "xczu3eg-sbva484-1-e" -target_clk_ns = 5 - - -# Helper functions -def sort_thresholds_increasing(thresholds): - return np.sort(thresholds, axis=1) - - -def prepare_inputs(input_tensor): - return {"inp": input_tensor} - - -# n = batch, c = channel, h = height, w = width of feature map -# Standard = NCHW; FINN = NHWC -# Convert from NHWC(FINN) to NCHW(Standard) -def layout_FINN2NCHW(data): - return np.transpose(data, (0, 3, 1, 2)) - - -# Convert from NCHW(Standard) to NHWC(FINN) -def layout_NCHW2FINN(data): - return np.transpose(data, (0, 2, 3, 1)) - - -def generate_random_threshold_values(input_data_type, num_input_channels, num_steps): - return np.random.randint( - input_data_type.min(), - input_data_type.max() + 1, - (num_input_channels, num_steps), - ).astype(np.float32) - - -def generate_pe_value(fold, num_input_channels): - if fold == -1: - fold = num_input_channels - pe = num_input_channels // fold - assert num_input_channels % pe == 0 - return pe - - -def make_single_multithresholding_modelwrapper( - thresholds, - pe, - input_data_type, - output_data_type, - activation_bias, - num_input_vecs, -): - NumChannels = thresholds.shape[0] - - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, num_input_vecs + [NumChannels]) - outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, num_input_vecs + [NumChannels]) - - node_inp_list = ["inp", "thresh"] - - Multithresholding_node = helper.make_node( - "MultiThreshold", - node_inp_list, - ["outp"], - domain="qonnx.custom_op.general", - out_dtype=output_data_type.name, - out_bias=float(activation_bias), - out_scale=1.0, - ) - - graph = helper.make_graph( - nodes=[Multithresholding_node], - name="multithresholding_graph", - inputs=[inp], - outputs=[outp], - ) - - model = helper.make_model(graph, producer_name="multithresholding-model") - model = ModelWrapper(model) - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - model = model.transform(GiveUniqueNodeNames()) - - model.set_tensor_datatype("inp", input_data_type) - model.set_tensor_datatype("outp", output_data_type) - - model.set_tensor_datatype("thresh", input_data_type) - model.set_initializer("thresh", thresholds) - return model - - -# N.B. 
Fold values where C % PE != 0 fail -@pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]]) -@pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) -@pytest.mark.parametrize("fold", [-1, 1, 2, 4, 6]) -@pytest.mark.parametrize("num_input_channels", [16]) -@pytest.mark.parametrize("impl_style", ["hls", "rtl"]) -@pytest.mark.fpgadataflow -@pytest.mark.vivado -def test_convert_multithreshold_to_hardware( - impl_style, - activation, - input_data_type, - fold, - num_input_channels, -): - # Handle inputs to the test - pe = generate_pe_value(fold, num_input_channels) - num_steps = activation.get_num_possible_values() - 1 - - # Other non-input parameters - num_input_vecs = [1, 2, 2] - output_data_type = activation - if output_data_type == DataType["BIPOLAR"]: - activation_bias = 0 - else: - activation_bias = output_data_type.min() - - # Generate random thresholds and sort in ascending order - thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps) - - # provide non-decreasing/ascending thresholds - thresholds = sort_thresholds_increasing(thresholds) - - # Make a Multithreshold graph and convert to thresholding binary search node - model = make_single_multithresholding_modelwrapper( - thresholds, - pe, - input_data_type, - output_data_type, - activation_bias, - num_input_vecs, - ) - - model = model.transform(InferThresholdingLayer()) - - # Perform functional validation of the InferThresholdingLayer transform - x = gen_finn_dt_tensor(input_data_type, tuple(num_input_vecs + [num_input_channels])) - - x_nchw = layout_FINN2NCHW(x) - y_expected = multithreshold(x_nchw, thresholds) - - # convert back to NHWC for comparison to hw outputs - y_expected = layout_NCHW2FINN(y_expected) - if activation == DataType["BIPOLAR"]: - # binary to bipolar - y_expected = 2 * y_expected - 1 - else: - # signed offset - y_expected += activation.min() - - input_dict = prepare_inputs(x) - y_produced = oxe.execute_onnx(model, input_dict)["outp"] - - assert (y_produced == y_expected).all() - - # Transform to the specified implementation style, either the - # RTL or HLS according to test parameters - node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0] - inst = getCustomOp(node) - inst.set_nodeattr("preferred_impl_style", impl_style) - model = model.transform(SpecializeLayers()) - model = model.transform(InferShapes()) - assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py index b8242df933..f684931478 100644 --- a/tests/fpgadataflow/test_depthwise_convolution.py +++ b/tests/fpgadataflow/test_depthwise_convolution.py @@ -182,7 +182,7 @@ def test_depthwise_conv_hw_cppsim(act, pe, k, stride, padding): new_model = model.transform(InferConvInpGen()) new_model = new_model.transform(InferVectorVectorActivation()) - new_model = new_model.transform(SpecializeLayers()) + new_model = new_model.transform(SpecializeLayers("xc7z020clg400-1")) # set SIMD in ConvInputGen node and PE in VVAU node for n in new_model.graph.node: @@ -226,7 +226,7 @@ def test_depthwise_conv_hw_rtlsim(act, pe, k, stride, padding): new_model = model.transform(InferConvInpGen()) new_model = new_model.transform(InferVectorVectorActivation()) - new_model = new_model.transform(SpecializeLayers()) + new_model = new_model.transform(SpecializeLayers("xc7z020clg400-1")) # set SIMD in ConvInputGen node and PE in VVAU node for n in 
new_model.graph.node: diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py index 338204c0c7..e5f9659665 100644 --- a/tests/fpgadataflow/test_fifosizing.py +++ b/tests/fpgadataflow/test_fifosizing.py @@ -70,7 +70,6 @@ def test_fifosizing_linear(method, topology): synth_clk_period_ns=10.0, board="Pynq-Z1", rtlsim_batch_size=100 if topology == "tfc" else 2, - shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, generate_outputs=[ build_cfg.DataflowOutputType.ESTIMATE_REPORTS, build_cfg.DataflowOutputType.STITCHED_IP, diff --git a/tests/fpgadataflow/test_fpgadataflow_addstreams.py b/tests/fpgadataflow/test_fpgadataflow_addstreams.py index 530d94e13b..484cbbe04a 100644 --- a/tests/fpgadataflow/test_fpgadataflow_addstreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_addstreams.py @@ -116,7 +116,7 @@ def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode): y_produced = oxe.execute_onnx(model, input_dict)["outp"] assert (y_produced == y_expected).all(), "Execution of hw layer failed" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py index d5fa7c779f..2ad49ae58b 100644 --- a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py +++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py @@ -139,7 +139,7 @@ def test_fpgadataflow_channelwise_ops(idt, act, pdt, nf, ich, func, vecs, exec_m assert (y_produced == y_expected).all(), "HW layer execution failed" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) diff --git a/tests/fpgadataflow/test_fpgadataflow_checksum.py b/tests/fpgadataflow/test_fpgadataflow_checksum.py index 34a48996c9..817d13e13d 100644 --- a/tests/fpgadataflow/test_fpgadataflow_checksum.py +++ b/tests/fpgadataflow/test_fpgadataflow_checksum.py @@ -176,7 +176,7 @@ def test_fpgadataflow_checksum(): # rtlsim model = model.transform(InsertFIFO(True)) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) diff --git a/tests/fpgadataflow/test_fpgadataflow_concat.py b/tests/fpgadataflow/test_fpgadataflow_concat.py index b52b14fca3..25c738d049 100644 --- a/tests/fpgadataflow/test_fpgadataflow_concat.py +++ b/tests/fpgadataflow/test_fpgadataflow_concat.py @@ -98,7 +98,7 @@ def test_fpgadataflow_concat(exec_mode, idt): assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow" ret = execute_onnx(model, inp_dict) assert (ret[oname] == exp_out).all() - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) assert model.graph.node[0].op_type == "StreamingConcat_hls" assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow.hls" if exec_mode == "cppsim": @@ -141,11 +141,11 @@ def test_fpgadataflow_concat_stitchedip(): model = model.transform(InferConcatLayer()) assert model.graph.node[0].op_type == "StreamingConcat" assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(fpga_part)) assert model.graph.node[0].op_type 
== "StreamingConcat_hls" assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow.hls" model = model.transform(InsertFIFO(create_shallow_fifos=True)) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(fpga_part)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(fpga_part, clk_ns)) model = model.transform(HLSSynthIP()) diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index 45ca74fbea..dc5dc0c02a 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -189,7 +189,7 @@ def test_fpgadataflow_slidingwindow( # set impl_style inst = getCustomOp(model.get_nodes_by_op_type("ConvolutionInputGenerator")[0]) inst.set_nodeattr("preferred_impl_style", impl_style) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) # set simd inst = getCustomOp(model.graph.node[0]) inst.set_nodeattr("SIMD", simd) diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py index 6c0712b7b0..26ce8f5f0e 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py @@ -41,11 +41,13 @@ from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes -from qonnx.transformation.lower_convs_to_matmul import ( - LowerConvsToMatMul, - _auto_pad_to_explicit_padding, +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.util.basic import ( + auto_pad_to_explicit_padding, + gen_finn_dt_tensor, + get_by_name, + qonnx_make_model, ) -from qonnx.util.basic import gen_finn_dt_tensor, get_by_name, qonnx_make_model import finn.core.onnx_exec as oxe import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw @@ -69,11 +71,11 @@ def create_conv_model(idim_h, idim_w, ifm, k, stride, ofm, idt, wdt, pad_mode, d group = ifm if depthwise else 1 group_str = str(group) ishp = (1, ifm, idim_h, idim_w) - pad_0 = _auto_pad_to_explicit_padding(pad_mode, idim_h, idim_w, k, k, stride, stride, 2) + pad_0 = auto_pad_to_explicit_padding(pad_mode, idim_h, idim_w, k, k, stride, stride, 2) int_dim_h = compute_conv_output_dim(idim_h, k, stride, total_pad=pad_0[0] + pad_0[2]) int_dim_w = compute_conv_output_dim(idim_w, k, stride, total_pad=pad_0[1] + pad_0[3]) - pad_1 = _auto_pad_to_explicit_padding(pad_mode, int_dim_h, int_dim_w, k, k, stride, stride, 2) + pad_1 = auto_pad_to_explicit_padding(pad_mode, int_dim_h, int_dim_w, k, k, stride, stride, 2) odim_h = compute_conv_output_dim(int_dim_h, k, stride, total_pad=pad_1[0] + pad_1[2]) odim_w = compute_conv_output_dim(int_dim_w, k, stride, total_pad=pad_1[1] + pad_1[3]) oshp = (1, ifm, odim_h, odim_w) if depthwise else (1, ofm, odim_h, odim_w) @@ -253,7 +255,7 @@ def test_fpgadataflow_conv_dynamic(cfg): model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) model = model.transform(to_hw.InferVectorVectorActivation()) model = model.transform(absorb.AbsorbConsecutiveTransposes()) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) parent_model = 
model.transform(CreateDataflowPartition()) sdp_inst = getCustomOp(parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]) model = ModelWrapper(sdp_inst.get_nodeattr("model")) @@ -281,7 +283,7 @@ def test_fpgadataflow_conv_dynamic(cfg): getCustomOp(comp_node).set_nodeattr("PE", 4) model = model.transform(InsertDWC()) model = model.transform(InsertFIFO(create_shallow_fifos=True)) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) @@ -523,11 +525,11 @@ def test_fpgadataflow_slidingwindow_rtl_dynamic( dw=dw, ) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) # Simulate using stitched-ip-rtlsim so we can use existing infrastructure # that supports hook functions to re-program configuration before rtlsim model = model.transform(InsertFIFO(True)) # required for proper simulation - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) model = model.transform(HLSSynthIP()) diff --git a/tests/fpgadataflow/test_fpgadataflow_deconv.py b/tests/fpgadataflow/test_fpgadataflow_deconv.py index f1fc989066..16cf7481f2 100644 --- a/tests/fpgadataflow/test_fpgadataflow_deconv.py +++ b/tests/fpgadataflow/test_fpgadataflow_deconv.py @@ -169,7 +169,7 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, y_produced = oxe.execute_onnx(model, input_dict)["outp"] assert (y_produced == y_expected).all() - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(MinimizeAccumulatorWidth()) for n in model.graph.node: diff --git a/tests/fpgadataflow/test_fpgadataflow_downsampler.py b/tests/fpgadataflow/test_fpgadataflow_downsampler.py index 25717a4152..fb9d52eb51 100644 --- a/tests/fpgadataflow/test_fpgadataflow_downsampler.py +++ b/tests/fpgadataflow/test_fpgadataflow_downsampler.py @@ -131,7 +131,7 @@ def test_fpgadataflow_downsampler(is_1d, flip_1d, exec_mode): assert len(model.get_nodes_by_op_type("DownSampler")) == 1 y_produced = execute_onnx(model, idict)["out0"] assert (y_produced == y_expected).all() - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) if exec_mode == "cppsim": model = model.transform(SetExecMode("cppsim")) model = model.transform(PrepareCppSim()) diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py index 62b9265466..7ac9cbe3fb 100644 --- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py @@ -128,7 +128,7 @@ def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, n_dupl, exec_mode, y = output_dict["outp%d" % i] assert (y == expected_y).all(), "HW layer execution failed" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py index 7152d32a7b..6b79a39ed5 100644 --- a/tests/fpgadataflow/test_fpgadataflow_dwc.py +++ 
b/tests/fpgadataflow/test_fpgadataflow_dwc.py @@ -47,7 +47,7 @@ from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -def make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype): +def make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype, impl_style): inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape) @@ -63,6 +63,7 @@ def make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype): inWidth=inWidth, outWidth=outWidth, dataType=str(finn_dtype.name), + preferred_impl_style=impl_style, ) graph = helper.make_graph(nodes=[DWC_node], name="dwc_graph", inputs=[inp], outputs=[outp]) @@ -86,17 +87,17 @@ def prepare_inputs(input_tensor, dt): ([1, 24], 6, 4, DataType["INT2"]), ([1, 24], 4, 6, DataType["INT2"]), ([1, 4], 2, 4, DataType["BIPOLAR"]), - ([1, 2, 8], 2, 4, DataType["BIPOLAR"]), ([1, 4], 4, 2, DataType["INT2"]), ([1, 2, 8], 4, 4, DataType["INT2"]), ([1, 2, 8], 8, 16, DataType["INT2"]), ], ) @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.parametrize("impl_style", ["hls", "rtl"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_dwc(config, exec_mode): +def test_fpgadataflow_dwc(config, exec_mode, impl_style): shape, inWidth, outWidth, finn_dtype = config test_fpga_part = "xc7z020clg400-1" @@ -104,7 +105,7 @@ def test_fpgadataflow_dwc(config, exec_mode): x = gen_finn_dt_tensor(finn_dtype, shape) input_dict = prepare_inputs(x, finn_dtype) - model = make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype) + model = make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype, impl_style) # verify abstraction level execution y = oxe.execute_onnx(model, input_dict)["outp"] assert ( @@ -113,7 +114,7 @@ def test_fpgadataflow_dwc(config, exec_mode): input values anymore.""" assert y.shape == tuple(shape), """The output shape is incorrect.""" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(GiveUniqueNodeNames()) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -136,19 +137,17 @@ def test_fpgadataflow_dwc(config, exec_mode): @pytest.mark.parametrize( "config", [ - ([1, 24], 6, 4, DataType["INT2"]), - ([1, 24], 4, 6, DataType["INT2"]), ([1, 4], 2, 4, DataType["BIPOLAR"]), - ([1, 2, 8], 2, 4, DataType["BIPOLAR"]), ([1, 4], 4, 2, DataType["INT2"]), ([1, 2, 8], 4, 4, DataType["INT2"]), ([1, 2, 8], 8, 16, DataType["INT2"]), ], ) +@pytest.mark.parametrize("impl_style", ["hls", "rtl"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_dwc_stitched_rtlsim(config): +def test_fpgadataflow_dwc_stitched_rtlsim(config, impl_style): shape, inWidth, outWidth, finn_dtype = config test_fpga_part = "xc7z020clg400-1" @@ -157,10 +156,10 @@ def test_fpgadataflow_dwc_stitched_rtlsim(config): x = gen_finn_dt_tensor(finn_dtype, shape) input_dict = prepare_inputs(x, finn_dtype) - model = make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype) - model = model.transform(SpecializeLayers()) + model = make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype, impl_style) + model = model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(InsertFIFO(create_shallow_fifos=True)) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(GiveUniqueNodeNames()) model = 
model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) diff --git a/tests/fpgadataflow/test_fpgadataflow_eltwise.py b/tests/fpgadataflow/test_fpgadataflow_eltwise.py index fbfcc8e28b..996477f28f 100644 --- a/tests/fpgadataflow/test_fpgadataflow_eltwise.py +++ b/tests/fpgadataflow/test_fpgadataflow_eltwise.py @@ -114,7 +114,7 @@ def test_fpgadataflow_eltwise(dt0, ch, fold, do_abs, exec_mode): y_produced = execute_onnx(model, idict)["out0"] assert (y_produced == y_expected).all(), exec_mode + " failed" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) assert len(model.graph.node) == 1 assert model.graph.node[0].op_type == "StreamingEltwise_hls" diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py index 1719da1454..f628a0e7af 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fifo.py +++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py @@ -96,7 +96,7 @@ def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype): input_dict = prepare_inputs(x, finn_dtype) model = make_single_fifo_modelwrapper(Shape, depth, folded_shape, finn_dtype) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py index 45cc265ac7..87e3267186 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py +++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py @@ -135,7 +135,7 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style): assert y_produced.shape == expected_oshape assert (y_produced == y_expected).all(), "HW layer execution failed" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(InferShapes()) model = model.transform(SetExecMode(mode)) diff --git a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py index 9c2802aade..cca4bb7e8e 100644 --- a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py +++ b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py @@ -112,7 +112,7 @@ def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode, impl_style) assert (y == expected_y).all(), "HW layer verification failed" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py index 98ded66ca7..83ab2ddcaf 100644 --- a/tests/fpgadataflow/test_fpgadataflow_labelselect.py +++ b/tests/fpgadataflow/test_fpgadataflow_labelselect.py @@ -118,7 +118,7 @@ def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode, impl_style): assert soft_verify_topk(x, y, k), "HW layer execution failed" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) diff --git a/tests/fpgadataflow/test_fpgadataflow_lookup.py b/tests/fpgadataflow/test_fpgadataflow_lookup.py index cb15fa3ae5..d5aadc33d4 100644 --- a/tests/fpgadataflow/test_fpgadataflow_lookup.py +++ b/tests/fpgadataflow/test_fpgadataflow_lookup.py @@ 
-131,7 +131,7 @@ def test_fpgadataflow_lookup(edt, embedding_cfg, exec_mode): ret_hw = execute_onnx(model, {iname: itensor}) assert (exp_out == ret_hw[oname]).all() # call transformation to convert abstraction layer into HLS layer - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xczu3eg-sbva484-1-e")) assert model.graph.node[0].op_type == "Lookup_hls" if exec_mode == "cppsim": model = model.transform(GiveUniqueNodeNames()) @@ -174,7 +174,7 @@ def test_fpgadataflow_lookup_external(): assert (model.get_initializer(ename) == embeddings).all() model = model.transform(InferLookupLayer()) assert model.graph.node[0].op_type == "Lookup" - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(fpga_part)) assert model.graph.node[0].op_type == "Lookup_hls" assert model.graph.node[0].input[0] == iname assert model.graph.node[0].input[1] == ename diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index 2a22f3fc41..1ec77f4eec 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -312,7 +312,7 @@ def test_fpgadataflow_mvau_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): inst.set_nodeattr("mem_mode", mem_mode) # Note: only HLS-based MVAU layers execute CPPsim inst.set_nodeattr("preferred_impl_style", "hls") - model = model.transform(SpecializeLayers("xc7z020clg400-1")) + model = model.transform(SpecializeLayers("xczu7ev-ffvc1156-2-e")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(SetExecMode("cppsim")) model = model.transform(PrepareCppSim()) @@ -423,10 +423,10 @@ def test_fpgadataflow_mvau_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): y_expected = y.reshape(oshape) # TODO split up into several dependent tests -- need to check how this # works for parametrized tests... - model = model.transform(SpecializeLayers("xc7z020clg400-1")) + model = model.transform(SpecializeLayers("xczu7ev-ffvc1156-2-e")) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 5)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) y_produced = oxe.execute_onnx(model, input_dict)["outp"] @@ -531,12 +531,12 @@ def test_fpgadataflow_mvau_large_depth_decoupled_mode_rtlsim( y_expected = y.reshape(oshape) # TODO split up into several dependent tests -- need to check how this # works for parametrized tests... 
- model = model.transform(SpecializeLayers("xc7z020clg400-1")) + model = model.transform(SpecializeLayers("xczu7ev-ffvc1156-2-e")) model = model.transform(MinimizeWeightBitWidth()) model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 5)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) y_produced = oxe.execute_onnx(model, input_dict)["outp"] @@ -611,12 +611,12 @@ def test_mvau_fifocharacterize_rtlsim( inst.set_nodeattr("preferred_impl_style", preferred_impl_style) total_fold = nf * sf exp_total_cycles = total_fold + 10 - model = model.transform(SpecializeLayers("xc7z020clg400-1")) + model = model.transform(SpecializeLayers("xczu7ev-ffvc1156-2-e")) model = model.transform(MinimizeWeightBitWidth()) model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 5)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) model = model.transform(DeriveCharacteristic(exp_total_cycles)) @@ -635,17 +635,19 @@ def test_mvau_fifocharacterize_rtlsim( @pytest.mark.parametrize("mh", [18]) @pytest.mark.parametrize("mw", [128]) -@pytest.mark.parametrize("pe", [1, 6, 9, 18]) -@pytest.mark.parametrize("simd", [1, 4, 16, 64, 128]) +@pytest.mark.parametrize("pe", [1, 9, 18]) +@pytest.mark.parametrize("simd", [1, 64, 128]) @pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) @pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT8"]]) -@pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e"]) +@pytest.mark.parametrize( + "part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e", "xc7z020clg400-1"] +) @pytest.mark.parametrize("clk_ns", [1.66, 4]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_rtl_mvau(mh, mw, pe, simd, idt, wdt, part, clk_ns): - if part == "xcku3p-ffva676-1-e" and clk_ns != 1.66: + if part != "xcvc1902-vsva2197-2MP-e-S" and clk_ns != 1.66: pytest.skip( """Skip test for varying clk for devices other than Versal, since this variable only affects DSP58s""" @@ -657,6 +659,9 @@ def test_fpgadataflow_rtl_mvau(mh, mw, pe, simd, idt, wdt, part, clk_ns): ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw]) ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh)) W = gen_finn_dt_tensor(wdt, (mw, mh)) + # if 7 series, force weights to narrow range + if part == "xc7z020clg400-1": + W = np.clip(W, wdt.min() + 1, wdt.max()) model = make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) diff --git a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py index 1bc2d9d59e..d81936f7e5 100644 --- a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py +++ b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py @@ -28,6 +28,7 @@ import pytest +from functools import partial from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper @@ -40,6 +41,8 @@ ) from 
finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +test_fpga_part = "xczu3eg-sbva484-1-e" + def check_two_dict_for_equality(dict1, dict2): for key in dict1: @@ -96,9 +99,9 @@ def test_res_estimate(): model.set_tensor_datatype("outp", odt) model.set_tensor_datatype("weights", wdt) - model.transform(SpecializeLayers()) + model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(GiveUniqueNodeNames()) - prod_resource_estimation = model.analysis(res_estimation) + prod_resource_estimation = model.analysis(partial(res_estimation, fpgapart=test_fpga_part)) expect_resource_estimation = { "MVAU_hls_0": { "BRAM_18K": 0, @@ -115,7 +118,9 @@ def test_res_estimate(): ), """The produced output of the res_estimation analysis pass is not equal to the expected one""" - prod_resource_estimation = model.analysis(res_estimation_complete) + prod_resource_estimation = model.analysis( + partial(res_estimation_complete, fpgapart=test_fpga_part) + ) expect_resource_estimation = { "MVAU_hls_0": [ { diff --git a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py index 0df7181a60..c520fb50fc 100644 --- a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py +++ b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py @@ -146,7 +146,7 @@ def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, pe, ceil y_produced = oxe.execute_onnx(model, input_dict)["outp"] assert (y_produced == y_expected).all() - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xczu3eg-sbva484-1-e")) # Ensure PE value is set streamingmaxpool_node = model.get_nodes_by_op_type("StreamingMaxPool_hls")[0] diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index a6e7e41596..2079fe7fc5 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -29,38 +29,43 @@ import pytest import numpy as np -import os from onnx import TensorProto, helper -from pyverilator.util.axi_utils import axilite_read, axilite_write from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper -from qonnx.custom_op.general.multithreshold import multithreshold from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation -from finn.core.rtlsim_exec import rtlsim_exec from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.convert_to_hw_layers import InferThresholdingLayer from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from 
finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 -def generate_random_threshold_values(input_data_type, num_input_channels, num_steps): +def generate_random_threshold_values( + data_type, num_input_channels, num_steps, narrow=False, per_tensor=False +): + if per_tensor: + num_input_channels = 1 + if narrow: + num_steps -= 1 + return np.random.randint( - input_data_type.min(), - input_data_type.max() + 1, + data_type.min(), + data_type.max() + 1, (num_input_channels, num_steps), ).astype(np.float32) @@ -69,76 +74,93 @@ def sort_thresholds_increasing(thresholds): return np.sort(thresholds, axis=1) -# n = batch, c = channel, h = height, w = width of feature map -# Standard = NCHW; FINN = NHWC -# Convert from NHWC(FINN) to NCHW(Standard) -def layout_FINN2NCHW(data): - return np.transpose(data, (0, 3, 1, 2)) - - -# Convert from NCHW(Standard) to NHWC(FINN) -def layout_NCHW2FINN(data): - return np.transpose(data, (0, 2, 3, 1)) - - -def make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp_vecs): - NumChannels = T.shape[0] - - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, n_inp_vecs + [NumChannels]) - outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, n_inp_vecs + [NumChannels]) +def make_single_multithresholding_modelwrapper( + thresholds, + input_data_type, + threshold_data_type, + output_data_type, + activation_bias, + num_input_vecs, + num_channels, +): + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, num_input_vecs + [num_channels]) + thresh = helper.make_tensor_value_info("thresh", TensorProto.FLOAT, thresholds.shape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, num_input_vecs + [num_channels]) node_inp_list = ["inp", "thresh"] - Thresholding_node = helper.make_node( - "Thresholding", + Multithresholding_node = helper.make_node( + "MultiThreshold", node_inp_list, ["outp"], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=NumChannels, - numSteps=T.shape[1], - inputDataType=idt.name, - weightDataType=idt.name, # will be set by MinimizeAccumulatorWidth - outputDataType=odt.name, - ActVal=actval, - numInputVectors=n_inp_vecs, - preferred_impl_style=impl_style, + domain="qonnx.custom_op.general", + out_dtype=output_data_type.name, + out_bias=float(activation_bias), + out_scale=1.0, + data_layout="NHWC", ) + graph = helper.make_graph( - nodes=[Thresholding_node], - name="thresholding_graph", + nodes=[Multithresholding_node], + name="multithresholding_graph", inputs=[inp], outputs=[outp], + value_info=[thresh], ) - model = qonnx_make_model(graph, producer_name="thresholding-model") + model = helper.make_model(graph, producer_name="multithresholding-model") model = ModelWrapper(model) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(GiveUniqueNodeNames()) - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) + model.set_tensor_datatype("inp", input_data_type) + model.set_tensor_datatype("outp", output_data_type) - model.set_tensor_datatype("thresh", idt) - model.set_initializer("thresh", T) + model.set_tensor_datatype("thresh", threshold_data_type) + model.set_initializer("thresh", thresholds) return model -# activation: None or DataType 
-@pytest.mark.parametrize("act", [DataType["INT4"], DataType["BIPOLAR"]]) -# input datatype -@pytest.mark.parametrize("idt", [DataType["INT16"], DataType["UINT16"]]) -# folding, -1 is maximum possible -@pytest.mark.parametrize("nf", [-1, 2, 1]) -# number of input features -@pytest.mark.parametrize("ich", [16]) -# execution mode +@pytest.mark.parametrize("num_input_channels", [6, 16]) +@pytest.mark.parametrize( + "num_input_vecs", + [ + [1], + [1, 2, 2], + ], +) +@pytest.mark.parametrize("activation", [DataType["UINT4"], DataType["INT4"], DataType["BIPOLAR"]]) +@pytest.mark.parametrize( + "idt_tdt_cfg", + [ + (DataType["INT8"], DataType["INT25"]), + (DataType["UINT5"], DataType["UINT8"]), + ], +) +@pytest.mark.parametrize("fold", [-1, 1, 2]) +@pytest.mark.parametrize("narrow", [True, False]) +@pytest.mark.parametrize("per_tensor", [True, False]) +@pytest.mark.parametrize("impl_style", ["hls", "rtl"]) @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) -# memory mode @pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) -@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) +@pytest.mark.parametrize("round_thresh", [True, False]) @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow -def test_fpgadataflow_thresholding(impl_style, idt, act, nf, ich, exec_mode, mem_mode): +def test_fpgadataflow_thresholding( + num_input_channels, + num_input_vecs, + activation, + idt_tdt_cfg, + fold, + narrow, + per_tensor, + impl_style, + exec_mode, + mem_mode, + round_thresh, +): # the mem_mode parameter can only be used for the hls thresholding # so the test will only be executed once for impl_style=rtl and once skipped # when the mem_mode is varied. Otherwise, the same test configuration would always @@ -147,66 +169,76 @@ def test_fpgadataflow_thresholding(impl_style, idt, act, nf, ich, exec_mode, mem pytest.skip( "Skip, because test is identical to impl_style=rtl and mem_mode=internal_embedded" ) - if nf == -1: - nf = ich - pe = ich // nf - n_inp_vecs = [1, 2, 2] - assert ich % pe == 0 - - # generate input data, data layout is NHWC for FINN - x = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ich])) - - odt = act - n_steps = act.get_num_possible_values() - 1 - - # Generate random, non-decreasing thresholds - thresholds = generate_random_threshold_values(idt, ich, n_steps) - - thresholds = sort_thresholds_increasing(thresholds) - - if odt == DataType["BIPOLAR"]: - actval = 0 + if narrow and activation == DataType["BIPOLAR"]: + pytest.skip("Narrow needs to be false with bipolar activation.") + input_data_type, threshold_data_type = idt_tdt_cfg + num_steps = activation.get_num_possible_values() - 1 + + if fold == -1: + fold = num_input_channels + pe = num_input_channels // fold + if num_input_channels % pe != 0: + pytest.skip("Invalid folding configuration. 
Skipping test.") + + output_data_type = activation + if activation == DataType["BIPOLAR"]: + activation_bias = 0 else: - actval = odt.min() + activation_bias = activation.min() + if narrow and activation.signed(): + activation_bias += 1 - # Build DUT - model = make_single_thresholding_modelwrapper( - impl_style, thresholds, idt, odt, actval, n_inp_vecs + # Generate random thresholds and sort in ascending order + thresholds = generate_random_threshold_values( + threshold_data_type, num_input_channels, num_steps, narrow, per_tensor ) - # Expected Reference output - # multithreshold util fxn wants NCHW input, not NHWC - x_nchw = layout_FINN2NCHW(x) - y = multithreshold(x_nchw, thresholds) + # provide non-decreasing/ascending thresholds + thresholds = sort_thresholds_increasing(thresholds) - # convert back to NHWC for comparison to hw outputs - y = layout_NCHW2FINN(y) - if act == DataType["BIPOLAR"]: - # binary to bipolar - y = 2 * y - 1 - else: - # signed offset - y += act.min() + # Make a Multithreshold graph and convert to thresholding binary search node + model = make_single_multithresholding_modelwrapper( + thresholds, + input_data_type, + threshold_data_type, + output_data_type, + activation_bias, + num_input_vecs, + num_input_channels, + ) - oshape = model.get_tensor_shape("outp") - y_expected = y.reshape(oshape) + # calculate reference output + x = gen_finn_dt_tensor(input_data_type, tuple(num_input_vecs + [num_input_channels])) - # package input data as dictionary - input_dict = {"inp": x} + input_dict = {model.graph.input[0].name: x} + y_expected = oxe.execute_onnx(model, input_dict)[model.graph.output[0].name] - # execute DUT - y_produced = oxe.execute_onnx(model, input_dict)["outp"] + if output_data_type == DataType["BIPOLAR"]: + # binary to bipolar + y_expected = 2 * y_expected - 1 - y_produced = y_produced.reshape(y_expected.shape) + model = model.transform(InferThresholdingLayer()) + # Perform functional validation of the InferThresholdingLayer transform + y_produced = oxe.execute_onnx(model, input_dict)[model.graph.output[0].name] assert (y_produced == y_expected).all() - model = model.transform(SpecializeLayers()) - # Make sure that SpecializeLayers did not default to HLS implementation unexpectedly + # Transform to the specified implementation style, either the + # RTL or HLS according to test parameters + node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0] + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", impl_style) + model = model.transform(SpecializeLayers(test_fpga_part)) + model = model.transform(InferShapes()) assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) - node = model.graph.node[0] + + node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0] inst = getCustomOp(node) inst.set_nodeattr("PE", pe) + if round_thresh is True: + model = model.transform(RoundAndClipThresholds()) + model = model.transform(GiveUniqueNodeNames()) + if impl_style == "hls": inst.set_nodeattr("mem_mode", mem_mode) @@ -215,19 +247,12 @@ def test_fpgadataflow_thresholding(impl_style, idt, act, nf, ich, exec_mode, mem model = model.transform(CompileCppSim()) model = model.transform(SetExecMode("cppsim")) elif exec_mode == "rtlsim": - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(SetExecMode("rtlsim")) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) - else: - 
raise Exception("Unknown exec_mode") - - # execute model - y_produced = oxe.execute_onnx(model, input_dict)["outp"] - - y_produced = y_produced.reshape(y_expected.shape) + y_produced = oxe.execute_onnx(model, input_dict)[model.graph.output[0].name] assert (y_produced == y_expected).all() if exec_mode == "rtlsim": @@ -241,219 +266,3 @@ def test_fpgadataflow_thresholding(impl_style, idt, act, nf, ich, exec_mode, mem exp_cycles = exp_cycles_dict[node.name] assert np.isclose(exp_cycles, cycles_rtlsim, atol=15) assert exp_cycles != 0 - - -@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) -# configuration (ch, pe) -@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3), (8, 4)]) -@pytest.mark.fpgadataflow -@pytest.mark.vivado -def test_runtime_thresholds_read(impl_style, cfg): - """Read back threshold weights during runtime - - 1. Create random initial weights T - 2. Execute model - 3. Read back weights via AXI - 4. Compare with initial weights T - """ - ch = cfg[0] - pe = cfg[1] - n_inp_vecs = [1, 2, 2] - hls_mem_mode = "internal_decoupled" - act = DataType["INT4"] - idt = DataType["INT16"] - odt = act - n_steps = act.get_num_possible_values() - 1 - np.random.seed(2) - T = np.random.randint(idt.min(), idt.max() + 1, (ch, n_steps)).astype(np.float32) - # provide non-decreasing thresholds - T = np.sort(T, axis=1) - - if odt == DataType["BIPOLAR"]: - actval = 0 - else: - actval = odt.min() - - model = make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp_vecs) - model = model.transform(SpecializeLayers()) - - # Make sure that specialize layer did not default to HLS implementation - assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) - - node = model.get_nodes_by_op_type(f"Thresholding_{impl_style}")[0] - op_inst = getCustomOp(node) - op_inst.set_nodeattr("PE", pe) - if impl_style == "hls": - op_inst.set_nodeattr("mem_mode", hls_mem_mode) - op_inst.set_nodeattr("runtime_writeable_weights", 1) - - dat_fname = f"old_weights_{cfg}.dat" - op_inst.make_weight_file(T, "decoupled_runtime", dat_fname) - with open(dat_fname, "r") as f: - old_weight_stream = f.read().strip() - os.remove(dat_fname) - old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n")) - old_weight_stream = list(old_weight_stream) - # need to create stitched IP for runtime weight testing - model = model.transform(InsertFIFO(True)) - model = model.transform(SpecializeLayers()) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) - model = model.transform(HLSSynthIP()) - model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) - model = model.transform(PrepareRTLSim()) - model.set_metadata_prop("exec_mode", "rtlsim") - # add two copies of the input tensor as the first one is just used to - # "flush out" the pipeline (as mvau already starts receiving old weights while - # we read/write new ones and reads seem to cause a disturbance too) - # generate input data - in_tensor = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ch])) - in_tensor = np.tile(in_tensor, (2, 1, 1, 1)) - - exec_ctx = {"inp": in_tensor} - extracted_weight_stream = [] - - def read_weights(sim): - addr = 0 - for i in range(len(old_weight_stream)): - extracted_weight_stream.append(axilite_read(sim, addr, basename="s_axilite_0_")) - addr += 4 - - rtlsim_exec(model, exec_ctx, pre_hook=read_weights) - - # Validate the AXI Read weights - assert extracted_weight_stream == old_weight_stream - - y = exec_ctx["outp"][0] - - # multithreshold 
util fxn wants NCHW input, not NHWC - expected = multithreshold(np.transpose(in_tensor, (0, 3, 1, 2)), T) - # convert back to NHWC for comparison to hw outputs - expected = np.transpose(expected, (0, 2, 3, 1))[1] - - if act == DataType["BIPOLAR"]: - # binary to bipolarW - expected = 2 * expected - 1 - else: - # signed offset - expected += act.min() - - # Validate the output is as expected - assert (y == expected).all() - - -@pytest.mark.parametrize("impl_style", ["hls", "rtl"]) -# configuration (ch, pe) -@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3), (8, 4)]) -@pytest.mark.fpgadataflow -@pytest.mark.vivado -def test_runtime_thresholds_write(impl_style, cfg): - """Write threshold weights during runtime - - 1. Create random initial weights T_init - 2. Create model with initial weights - 3. Create new set of weights T_write - 4. Write T_write using AXI bus - 5. Read back using AXI bus to T_read - 6. Compare T_write and T_read - 7. Validate outputs with expected vectors - """ - ch = cfg[0] - pe = cfg[1] - - n_inp_vecs = [1, 2, 2] - hls_mem_mode = "internal_decoupled" - act = DataType["INT4"] - idt = DataType["INT16"] - - odt = act - n_steps = act.get_num_possible_values() - 1 - np.random.seed(2) - T_init = np.random.randint(idt.min(), idt.max() + 1, (ch, n_steps)).astype(np.float32) - # provide non-decreasing thresholds - T_init = np.sort(T_init, axis=1) - - if odt == DataType["BIPOLAR"]: - actval = 0 - else: - actval = odt.min() - - model = make_single_thresholding_modelwrapper(impl_style, T_init, idt, odt, actval, n_inp_vecs) - model = model.transform(SpecializeLayers()) - - # Validate that specialize layer did not default to HLS implementation - assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) - - op_inst = getCustomOp(model.graph.node[0]) - op_inst.set_nodeattr("PE", pe) - if impl_style == "hls": - op_inst.set_nodeattr("mem_mode", hls_mem_mode) - op_inst.set_nodeattr("runtime_writeable_weights", 1) - - # Make new weights for runtime write - np.random.seed(4) - T_write = np.random.randint(idt.min(), idt.max() + 1, (ch, n_steps)).astype(np.float32) - # provide non-decreasing thresholds - T_write = np.sort(T_write, axis=1) - - dat_fname = f"T_write_{cfg}.dat" # distinguish fname per paramter for distributed testing - op_inst.make_weight_file(T_write, "decoupled_runtime", dat_fname) - with open(dat_fname, "r") as f: - T_write_stream = f.read().strip() - os.remove(dat_fname) - - T_write_stream = map(lambda x: int(x, 16), T_write_stream.split("\n")) - T_write_stream = list(T_write_stream) - - # need to create stitched IP for runtime weight testing - model = model.transform(InsertFIFO(True)) - model = model.transform(SpecializeLayers()) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) - model = model.transform(HLSSynthIP()) - model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) - model = model.transform(PrepareRTLSim()) - model.set_metadata_prop("exec_mode", "rtlsim") - # add two copies of the input tensor as the first one is just used to - # "flush out" the pipeline (as mvau already starts receiving old weights while - # we read/write new ones and reads seem to cause a disturbance too) - # generate input data - in_tensor = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ch])) - in_tensor = np.tile(in_tensor, (2, 1, 1, 1)) - - exec_ctx_write = {"inp": in_tensor} - - def write_weights(sim): - addr = 0 - for nw in T_write_stream: - axilite_write(sim, addr, nw, 
basename="s_axilite_0_") - addr += 4 - - T_read_stream = [] - - def read_weights(sim): - addr = 0 - for i in range(len(T_write_stream)): - T_read_stream.append(axilite_read(sim, addr, basename="s_axilite_0_")) - addr += 4 - - rtlsim_exec(model, exec_ctx_write, pre_hook=write_weights, post_hook=read_weights) - - y = exec_ctx_write["outp"][1] - - assert T_read_stream == T_write_stream - - # multithreshold util fxn wants NCHW input, not NHWC - expected = multithreshold(np.transpose(in_tensor, (0, 3, 1, 2)), T_write) - # convert back to NHWC for comparison to hw outputs - expected = np.transpose(expected, (0, 2, 3, 1))[1] - - if act == DataType["BIPOLAR"]: - # binary to bipolarW - expected = 2 * expected - 1 - else: - # signed offset - expected += act.min() - - # Validate the output is as expected - assert (y == expected).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py new file mode 100644 index 0000000000..e6175ac58b --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py @@ -0,0 +1,340 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest + +import numpy as np +import os +from onnx import TensorProto, helper +from pyverilator.util.axi_utils import axilite_read, axilite_write +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.multithreshold import multithreshold +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model + +from finn.core.rtlsim_exec import rtlsim_exec +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers + +test_fpga_part = "xczu3eg-sbva484-1-e" +target_clk_ns = 5 + + +def generate_random_threshold_values( + data_type, num_input_channels, num_steps, narrow=False, per_tensor=False +): + if per_tensor: + num_input_channels = 1 + if narrow: + num_steps -= 1 + + return np.random.randint( + data_type.min(), + data_type.max() + 1, + (num_input_channels, num_steps), + ).astype(np.float32) + + +def sort_thresholds_increasing(thresholds): + return np.sort(thresholds, axis=1) + + +# n = batch, c = channel, h = height, w = width of feature map +# Standard = NCHW; FINN = NHWC +# Convert from NHWC(FINN) to NCHW(Standard) +def layout_FINN2NCHW(data): + return np.transpose(data, (0, 3, 1, 2)) + + +# Convert from NCHW(Standard) to NHWC(FINN) +def layout_NCHW2FINN(data): + return np.transpose(data, (0, 2, 3, 1)) + + +def make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp_vecs, num_ch): + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, n_inp_vecs + [num_ch]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, n_inp_vecs + [num_ch]) + + node_inp_list = ["inp", "thresh"] + + Thresholding_node = helper.make_node( + "Thresholding", + node_inp_list, + ["outp"], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=num_ch, + numSteps=T.shape[1], + inputDataType=idt.name, + weightDataType=idt.name, # will be set by MinimizeAccumulatorWidth + outputDataType=odt.name, + ActVal=actval, + numInputVectors=n_inp_vecs, + preferred_impl_style=impl_style, + ) + graph = helper.make_graph( + nodes=[Thresholding_node], + name="thresholding_graph", + inputs=[inp], + outputs=[outp], + ) + + model = qonnx_make_model(graph, producer_name="thresholding-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + + model.set_tensor_datatype("thresh", idt) + model.set_initializer("thresh", T) + return model + + +@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) +@pytest.mark.parametrize( + "idt_act_cfg", [(DataType["INT16"], DataType["INT4"]), (DataType["UINT8"], DataType["UINT4"])] +) +# configuration (ch, pe) +@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 6)]) +@pytest.mark.parametrize("narrow", [True, False]) +@pytest.mark.parametrize("per_tensor", [True, False]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +def test_runtime_thresholds_read(impl_style, idt_act_cfg, cfg, narrow, per_tensor): + """Read back threshold weights during runtime + + 1. Create random initial weights T + 2. Execute model + 3. 
Read back weights via AXI + 4. Compare with initial weights T + """ + ch = cfg[0] + pe = cfg[1] + n_inp_vecs = [1, 2, 2] + hls_mem_mode = "internal_decoupled" + act = idt_act_cfg[1] + idt = idt_act_cfg[0] + odt = act + n_steps = act.get_num_possible_values() - 1 + # Generate random thresholds and sort in ascending order + T = generate_random_threshold_values(idt, ch, n_steps, narrow, per_tensor) + + # provide non-decreasing/ascending thresholds + T = sort_thresholds_increasing(T) + + actval = act.min() + if narrow and act.signed(): + actval += 1 + + model = make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp_vecs, ch) + model = model.transform(SpecializeLayers(test_fpga_part)) + + # Make sure that specialize layer did not default to HLS implementation + assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) + + node = model.get_nodes_by_op_type(f"Thresholding_{impl_style}")[0] + op_inst = getCustomOp(node) + op_inst.set_nodeattr("PE", pe) + if impl_style == "hls": + op_inst.set_nodeattr("mem_mode", hls_mem_mode) + op_inst.set_nodeattr("runtime_writeable_weights", 1) + + dat_fname = f"old_weights_{cfg}.dat" + op_inst.make_weight_file(T, "decoupled_runtime", dat_fname) + with open(dat_fname, "r") as f: + old_weight_stream = f.read().strip() + os.remove(dat_fname) + old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n")) + old_weight_stream = list(old_weight_stream) + # need to create stitched IP for runtime weight testing + model = model.transform(InsertFIFO(True)) + model = model.transform(SpecializeLayers(test_fpga_part)) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + model = model.transform(PrepareRTLSim()) + model.set_metadata_prop("exec_mode", "rtlsim") + # add two copies of the input tensor as the first one is just used to + # "flush out" the pipeline (as mvau already starts receiving old weights while + # we read/write new ones and reads seem to cause a disturbance too) + # generate input data + in_tensor = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ch])) + in_tensor = np.tile(in_tensor, (2, 1, 1, 1)) + + exec_ctx = {"inp": in_tensor} + extracted_weight_stream = [] + + def read_weights(sim): + addr = 0 + for i in range(len(old_weight_stream)): + extracted_weight_stream.append(axilite_read(sim, addr, basename="s_axilite_0_")) + addr += 4 + + rtlsim_exec(model, exec_ctx, pre_hook=read_weights) + + # Validate the AXI Read weights + assert extracted_weight_stream == old_weight_stream + + y = exec_ctx["outp"][0] + + # multithreshold util fxn wants NCHW input, not NHWC + expected = multithreshold(np.transpose(in_tensor, (0, 3, 1, 2)), T) + # convert back to NHWC for comparison to hw outputs + expected = np.transpose(expected, (0, 2, 3, 1))[1] + + # signed offset + expected += actval + + # Validate the output is as expected + assert (y == expected).all() + + +@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) +@pytest.mark.parametrize( + "idt_act_cfg", [(DataType["INT16"], DataType["INT4"]), (DataType["UINT8"], DataType["UINT4"])] +) +# configuration (ch, pe) +@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 6)]) +@pytest.mark.parametrize("narrow", [True, False]) +@pytest.mark.parametrize("per_tensor", [True, False]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +def test_runtime_thresholds_write(impl_style, idt_act_cfg, cfg, narrow, 
per_tensor): + """Write threshold weights during runtime + + 1. Create random initial weights T_init + 2. Create model with initial weights + 3. Create new set of weights T_write + 4. Write T_write using AXI bus + 5. Read back using AXI bus to T_read + 6. Compare T_write and T_read + 7. Validate outputs with expected vectors + """ + ch = cfg[0] + pe = cfg[1] + + n_inp_vecs = [1, 2, 2] + hls_mem_mode = "internal_decoupled" + act = idt_act_cfg[1] + idt = idt_act_cfg[0] + + odt = act + n_steps = act.get_num_possible_values() - 1 + # Generate random thresholds and sort in ascending order + T_init = generate_random_threshold_values(idt, ch, n_steps, narrow, per_tensor) + + # provide non-decreasing/ascending thresholds + T_init = sort_thresholds_increasing(T_init) + + actval = act.min() + if narrow and act.signed(): + actval += 1 + + model = make_single_thresholding_modelwrapper( + impl_style, T_init, idt, odt, actval, n_inp_vecs, ch + ) + model = model.transform(SpecializeLayers(test_fpga_part)) + + # Validate that specialize layer did not default to HLS implementation + assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) + + op_inst = getCustomOp(model.graph.node[0]) + op_inst.set_nodeattr("PE", pe) + if impl_style == "hls": + op_inst.set_nodeattr("mem_mode", hls_mem_mode) + op_inst.set_nodeattr("runtime_writeable_weights", 1) + + # Make new weights for runtime write + T_write = generate_random_threshold_values(idt, ch, n_steps, narrow, per_tensor) + # provide non-decreasing/ascending thresholds + T_write = sort_thresholds_increasing(T_write) + + dat_fname = f"T_write_{cfg}.dat" # distinguish fname per parameter for distributed testing + op_inst.make_weight_file(T_write, "decoupled_runtime", dat_fname) + with open(dat_fname, "r") as f: + T_write_stream = f.read().strip() + os.remove(dat_fname) + + T_write_stream = map(lambda x: int(x, 16), T_write_stream.split("\n")) + T_write_stream = list(T_write_stream) + + # need to create stitched IP for runtime weight testing + model = model.transform(InsertFIFO(True)) + model = model.transform(SpecializeLayers(test_fpga_part)) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + model = model.transform(PrepareRTLSim()) + model.set_metadata_prop("exec_mode", "rtlsim") + # add two copies of the input tensor as the first one is just used to + # "flush out" the pipeline (as mvau already starts receiving old weights while + # we read/write new ones and reads seem to cause a disturbance too) + # generate input data + in_tensor = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ch])) + in_tensor = np.tile(in_tensor, (2, 1, 1, 1)) + + exec_ctx_write = {"inp": in_tensor} + + def write_weights(sim): + addr = 0 + for nw in T_write_stream: + axilite_write(sim, addr, nw, basename="s_axilite_0_") + addr += 4 + + T_read_stream = [] + + def read_weights(sim): + addr = 0 + for i in range(len(T_write_stream)): + T_read_stream.append(axilite_read(sim, addr, basename="s_axilite_0_")) + addr += 4 + + rtlsim_exec(model, exec_ctx_write, pre_hook=write_weights, post_hook=read_weights) + + y = exec_ctx_write["outp"][1] + + assert T_read_stream == T_write_stream + + # multithreshold util fxn wants NCHW input, not NHWC + expected = multithreshold(np.transpose(in_tensor, (0, 3, 1, 2)), T_write) + # convert back to NHWC for comparison to hw outputs + expected = np.transpose(expected, (0, 2, 
3, 1))[1] + + # signed offset + expected += actval + + # Validate the output is as expected + assert (y == expected).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_upsampler.py b/tests/fpgadataflow/test_fpgadataflow_upsampler.py index b0da767eaa..4539917878 100644 --- a/tests/fpgadataflow/test_fpgadataflow_upsampler.py +++ b/tests/fpgadataflow/test_fpgadataflow_upsampler.py @@ -174,7 +174,7 @@ def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d test_result = output_dict[model.graph.output[0].name] output_matches = np.isclose(golden_result, test_result, atol=atol).all() - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) # Prep sim if exec_mode == "cppsim": diff --git a/tests/fpgadataflow/test_runtime_weights.py b/tests/fpgadataflow/test_runtime_weights.py index 3e7822a077..4ca61578c3 100644 --- a/tests/fpgadataflow/test_runtime_weights.py +++ b/tests/fpgadataflow/test_runtime_weights.py @@ -70,7 +70,7 @@ def test_runtime_weights_single_layer(): } layer_spec_list = [layer_spec] model = hls_random_mlp_maker(layer_spec_list) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) fcl = model.get_nodes_by_op_type("MVAU_hls")[0] op_inst = getCustomOp(fcl) op_inst.set_nodeattr("mem_mode", "internal_decoupled") @@ -83,7 +83,7 @@ def test_runtime_weights_single_layer(): old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n")) old_weight_stream = list(old_weight_stream) model = model.transform(InsertFIFO(True)) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(test_fpga_part)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) diff --git a/tests/transformation/streamline/test_round_thresholds.py b/tests/transformation/streamline/test_round_thresholds.py index 63375598a0..6de82e6750 100644 --- a/tests/transformation/streamline/test_round_thresholds.py +++ b/tests/transformation/streamline/test_round_thresholds.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx, Inc. +# Copyright (C) 2022-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -26,32 +27,15 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# fmt: off -# Disable formatter. This is deliberately formatted to stay within 80 characters -# per line. Black, however, formats some lines going beyond this. - -# Testing framework import pytest -# Use numpy for python execution / computing the ground truth expected values import numpy as np - -# Utility types and function for creating onnx nodes and graphs from onnx import TensorProto, helper - -# QONNX data types like INT25 from qonnx.core.datatype import DataType - -# QONNX wrapper of ONNX model graphs from qonnx.core.modelwrapper import ModelWrapper - -# Generate random tensors of QONNX/FINN data types for testing from qonnx.util.basic import gen_finn_dt_tensor -# Execution of onnx graphs within FINN import finn.core.onnx_exec as oxe - -# The transformation to be tested from finn.transformation.streamline import RoundAndClipThresholds @@ -59,173 +43,194 @@ # data type combinations with purely integer inputs. 
Without proper rounding, # this tests only the clipping, range and type-casting behavior of the # transformation. -@pytest.mark.parametrize("i_dtype", [ - # Explanation for selecting these test configurations: - # 1. Below 24-bit thresholds we will not observe any interesting rounding - # behavior, as all integers < 2^24 can be exactly represented in 32-bit - # floating-point. Thus, we test thresholds at 25-bit signed integers and - # generate test inputs slightly above and below this. - # 2. We want to test out-of-range clipping of thresholds, in particular - # clipping of the negative portion of signed thresholds. Thus, we only - # generate signed thresholds, but test with signed and unsigned - # inputs of smaller, larger and equal range. - # 3. Testing proper floating-point thresholds requires a separate test-case - "INT23", "UINT23", "INT24", "UINT24", "INT25", "UINT25", "INT26", "UINT26" -]) -@pytest.mark.parametrize("o_dtype", [ - # Explanation for selecting these test configurations: - # 1. Outputs of MultiThreshold are typically much smaller bit-width than the - # inputs and thresholds. - # 2. However, with randomly samples thresholds from a rather large range due - # to the selected input bit-widths (see above), we risk not adequately - # covering the input range if we sample too few thresholds. The number of - # thresholds sampled depends on the bit-width of the output, thus we use - # rather high bit-width for testing. - # 3. For a "real" model, the quantization procedure *should* take care of - # adequately covering the true input range. - "INT8", "UINT8" -]) -@pytest.mark.parametrize("n_elems", [ - # Explanation for selecting these test configurations: - # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4 - # 2. Large test case 256, hopefully amplifying any rarely occurring errors - 1, 2, 3, 4, 256 -]) +@pytest.mark.parametrize( + "i_dtype", + [ + # Explanation for selecting these test configurations: + # 1. Below 24-bit thresholds we will not observe any interesting rounding + # behavior, as all integers < 2^24 can be exactly represented in 32-bit + # floating-point. Thus, we test thresholds at 25-bit signed integers and + # generate test inputs slightly above and below this. + # 2. We want to test out-of-range clipping of thresholds, in particular + # clipping of the negative portion of signed thresholds. Thus, we only + # generate signed thresholds, but test with signed and unsigned + # inputs of smaller, larger and equal range. + # 3. Testing proper floating-point thresholds requires a separate test-case + "INT23", + "UINT23", + "INT24", + "UINT24", + "INT25", + "UINT25", + "INT26", + "UINT26", + ], +) +@pytest.mark.parametrize( + "o_dtype", + [ + # Explanation for selecting these test configurations: + # 1. Outputs of MultiThreshold are typically much smaller bit-width than the + # inputs and thresholds. + # 2. However, with randomly samples thresholds from a rather large range due + # to the selected input bit-widths (see above), we risk not adequately + # covering the input range if we sample too few thresholds. The number of + # thresholds sampled depends on the bit-width of the output, thus we use + # rather high bit-width for testing. + # 3. For a "real" model, the quantization procedure *should* take care of + # adequately covering the true input range. + "INT8", + "UINT8", + ], +) +@pytest.mark.parametrize( + "n_elems", + [ + # Explanation for selecting these test configurations: + # 1. 
Small edge cases and quickly running through tests: 1, 2, 3, 4 + # 2. Large test case 256, hopefully amplifying any rarely occurring errors + 1, + 2, + 3, + 4, + 256, + ], +) +@pytest.mark.streamline def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems): - # Convert string representation of data type to onnx DataType i_dtype = DataType[i_dtype] t_dtype = DataType["INT25"] # Note: Matches configuration above o_dtype = DataType[o_dtype] # noqa: Duplicate model setup code - # Create a dummy MultiThreshold operation to be tested node = helper.make_node( - # Op-Type of the node "MultiThreshold", - # MultiThreshold is implemented under the qonnx domain domain="qonnx.custom_op.general", - # List the names of the input tensors inputs=["inp", "thresholds"], - # List the names of the output tensors outputs=["out"], - # The CustomOp needs to know the data type of the output to be produced - out_dtype=str(o_dtype) + out_dtype=str(o_dtype), + out_bias=float(o_dtype.min()), ) - # Number of threshold values required to produce outputs of type o_dtype n_thresholds = o_dtype.get_num_possible_values() - 1 - # Create tensor value infos for all input/output tensors involved inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems]) out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, n_elems]) - # Create a tensor value info for the thresholds parameter tensor - # Note: Number of thresholds is determined by the output data type thresholds = helper.make_tensor_value_info( "thresholds", TensorProto.FLOAT, [n_elems, n_thresholds] ) - # Combine node and tensor value infos into an onnx graph graph = helper.make_graph([node], "thresholds", [inp, thresholds], [out]) - # Wrap the model graph in a ModelWrapper container model = ModelWrapper(helper.make_model(graph)) - # Sample random tensors of the configured input data type + inp = gen_finn_dt_tensor(i_dtype, [1, n_elems]) - # Generate sorted thresholds for each of the input channels + inp[0][0] = i_dtype.max() thresholds = np.sort(gen_finn_dt_tensor(t_dtype, [n_elems, n_thresholds])) - # Set data type annotations for the input and thresholds tensor model.set_tensor_datatype("inp", i_dtype) # noqa: Duplicate model execution model.set_tensor_datatype("thresholds", t_dtype) model.set_tensor_datatype("out", o_dtype) - # Set the thresholds as initializer input to the model model.set_initializer("thresholds", thresholds) + # Execute the model before running the RoundAndClipThresholds transformation out_expected = oxe.execute_onnx(model, {"inp": inp})["out"] - # Before rounding the threshold data type must be as annotated assert model.get_tensor_datatype("thresholds") == t_dtype - # Run the transformation to be tested + model = model.transform(RoundAndClipThresholds()) + # After this transformation, the thresholds and output data type should be # inferred correctly - assert model.get_tensor_datatype("thresholds") == i_dtype + if not i_dtype.signed(): + new_tdt = DataType.get_smallest_possible(i_dtype.max() + 1) + else: + new_tdt = DataType.get_smallest_possible(-(i_dtype.max() + 1) - 1) + assert model.get_tensor_datatype("thresholds") == new_tdt assert model.get_tensor_datatype("out") == o_dtype + # After this transformation, the container type used to store the thresholds # values must be float32. No other type-cast or type promotion may happen. 
assert model.get_initializer("thresholds").dtype == np.float32 + # After rounding, all thresholds must be integers represented as float32 - assert all( - x.is_integer() for x in model.get_initializer("thresholds").flatten() - ) + assert all(x.is_integer() for x in model.get_initializer("thresholds").flatten()) + # Execute the model after running the RoundAndClipThresholds transformation out_produced = oxe.execute_onnx(model, {"inp": inp})["out"] - # Compare the results before and after: This is the pure integer test-case - # and no actual rounding should happen, thus the rounded operation should - # produce outputs exactly equal. + assert np.all(out_produced == out_expected) # Tests the RoundAndClipThresholds transformation under various input, output # data type combinations with purely integer inputs. This test case tests actual # rounding of floating-point thresholds. -@pytest.mark.parametrize("i_dtype", [ - # Explanation for selecting these test configurations: - # 1. Below 24-bit thresholds we will not observe any interesting rounding - # behavior, as all integers < 2^24 can be exactly represented in 32-bit - # floating-point. Thus, we test thresholds at 25-bit signed integers and - # generate test inputs slightly above and below this. - # 2. We want to test out-of-range clipping of thresholds, in particular - # clipping of the negative portion of signed thresholds. Thus, we only - # generate signed thresholds, but test with signed and unsigned - # inputs of smaller, larger and equal range. - # 3. Testing proper floating-point thresholds requires a separate test-case - "INT23", "UINT23", "INT24", "UINT24", "INT25", "UINT25", "INT26", "UINT26" -]) -@pytest.mark.parametrize("o_dtype", [ - # Explanation for selecting these test configurations: - # 1. Outputs of MultiThreshold are typically much smaller bit-width than the - # inputs and thresholds. - # 2. However, with randomly samples thresholds from a rather large range due - # to the selected input bit-widths (see above), we risk not adequately - # covering the input range if we sample too few thresholds. The number of - # thresholds sampled depends on the bit-width of the output, thus we use - # rather high bit-width for testing. - # 3. For a "real" model, the quantization procedure *should* take care of - # adequately covering the true input range. - "INT8", "UINT8" -]) -@pytest.mark.parametrize("n_elems", [ - # Explanation for selecting these test configurations: - # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4 - # 2. Large test case 256, hopefully amplifying any rarely occurring errors - 1, 2, 3, 4, 256 -]) +@pytest.mark.parametrize( + "i_dtype", + [ + # Explanation for selecting these test configurations: + # 1. Below 24-bit thresholds we will not observe any interesting rounding + # behavior, as all integers < 2^24 can be exactly represented in 32-bit + # floating-point. Thus, we test thresholds at 25-bit signed integers and + # generate test inputs slightly above and below this. + # 2. We want to test out-of-range clipping of thresholds, in particular + # clipping of the negative portion of signed thresholds. Thus, we only + # generate signed thresholds, but test with signed and unsigned + # inputs of smaller, larger and equal range. + # 3. 
Testing proper floating-point thresholds requires a separate test-case + "INT23", + "UINT23", + "INT24", + "UINT24", + "INT25", + "UINT25", + "INT26", + "UINT26", + ], +) +@pytest.mark.parametrize( + "o_dtype", + [ + # Explanation for selecting these test configurations: + # 1. Outputs of MultiThreshold are typically much smaller bit-width than the + # inputs and thresholds. + # 2. However, with randomly samples thresholds from a rather large range due + # to the selected input bit-widths (see above), we risk not adequately + # covering the input range if we sample too few thresholds. The number of + # thresholds sampled depends on the bit-width of the output, thus we use + # rather high bit-width for testing. + # 3. For a "real" model, the quantization procedure *should* take care of + # adequately covering the true input range. + "INT8", + "UINT8", + ], +) +@pytest.mark.parametrize( + "n_elems", + [ + # Explanation for selecting these test configurations: + # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4 + # 2. Large test case 256, hopefully amplifying any rarely occurring errors + 1, + 2, + 3, + 4, + 256, + ], +) +@pytest.mark.streamline def test_round_and_clip_thresholds_floats(i_dtype, o_dtype, n_elems): - # Convert string representation of data type to onnx DataType i_dtype = DataType[i_dtype] t_dtype = DataType["FLOAT32"] o_dtype = DataType[o_dtype] # noqa: Duplicate model setup code - # Create a dummy MultiThreshold operation to be tested node = helper.make_node( - # Op-Type of the node "MultiThreshold", - # MultiThreshold is implemented under the qonnx domain domain="qonnx.custom_op.general", - # List the names of the input tensors inputs=["inp", "thresholds"], - # List the names of the output tensors outputs=["out"], - # The CustomOp needs to know the data type of the output to be produced - out_dtype=str(o_dtype) + out_dtype=str(o_dtype), ) - # Number of threshold values required to produce outputs of type o_dtype n_thresholds = o_dtype.get_num_possible_values() - 1 - # Create tensor value infos for all input/output tensors involved inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems]) out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, n_elems]) - # Create a tensor value info for the thresholds parameter tensor - # Note: Number of thresholds is determined by the output data type thresholds = helper.make_tensor_value_info( "thresholds", TensorProto.FLOAT, [n_elems, n_thresholds] ) - # Combine node and tensor value infos into an onnx graph graph = helper.make_graph([node], "thresholds", [inp, thresholds], [out]) - # Wrap the model graph in a ModelWrapper container model = ModelWrapper(helper.make_model(graph)) - # Sample random tensors of the configured input data type + inp = gen_finn_dt_tensor(i_dtype, [1, n_elems]) # Draw uniformly random prototype thresholds in [0,+1] range thresholds = np.random.rand(n_elems, n_thresholds) @@ -238,30 +243,28 @@ def test_round_and_clip_thresholds_floats(i_dtype, o_dtype, n_elems): model.set_tensor_datatype("inp", i_dtype) # noqa: Duplicate model execution model.set_tensor_datatype("thresholds", t_dtype) model.set_tensor_datatype("out", o_dtype) - # Set the thresholds as initializer input to the model model.set_initializer("thresholds", thresholds) + # Execute the model before running the RoundAndClipThresholds transformation out_expected = oxe.execute_onnx(model, {"inp": inp})["out"] # Before rounding the threshold data type must be as annotated assert 
model.get_tensor_datatype("thresholds") == t_dtype - # Run the transformation to be tested + model = model.transform(RoundAndClipThresholds()) - # After this transformation, the thresholds and output data type should be - # inferred correctly - assert model.get_tensor_datatype("thresholds") == i_dtype + + if not i_dtype.signed(): + new_tdt = DataType.get_smallest_possible(i_dtype.max() + 1) + else: + new_tdt = DataType.get_smallest_possible(-(i_dtype.max() + 1) - 1) + assert model.get_tensor_datatype("thresholds") == new_tdt assert model.get_tensor_datatype("out") == o_dtype + # After this transformation, the container type used to store the thresholds # values must be float32. No other type-cast or type promotion may happen. assert model.get_initializer("thresholds").dtype == np.float32 # After rounding, all thresholds must be integers represented as float32 - assert all( - x.is_integer() for x in model.get_initializer("thresholds").flatten() - ) - # Execute the model after running the RoundAndClipThresholds transformation + assert all(x.is_integer() for x in model.get_initializer("thresholds").flatten()) + out_produced = oxe.execute_onnx(model, {"inp": inp})["out"] - # Compare the results before and after: This is the floating-point test with - # actual rounding, this the transformed result may only be equal within some - # tolerance. - # Hm, never observed this to be relevant. For all test configurations, exact - # equality seems to hold, probably due to only integer inputs being tested. + assert np.allclose(out_produced, out_expected, atol=1.0e-3) diff --git a/tests/transformation/streamline/test_streamline_cnv.py b/tests/transformation/streamline/test_streamline_cnv.py index 8a91a49278..9e206c843a 100644 --- a/tests/transformation/streamline/test_streamline_cnv.py +++ b/tests/transformation/streamline/test_streamline_cnv.py @@ -50,8 +50,6 @@ from finn.util.basic import make_build_dir from finn.util.test import get_test_model_trained -export_onnx_path = make_build_dir("test_streamline_cnv_") - @pytest.mark.streamline # act bits @@ -64,6 +62,7 @@ def test_streamline_cnv(size, wbits, abits): if wbits > abits: pytest.skip("No wbits > abits cases at the moment") nname = "%s_%dW%dA" % (size, wbits, abits) + export_onnx_path = make_build_dir("test_streamline_cnv_") finn_onnx = export_onnx_path + "/%s.onnx" % nname fc = get_test_model_trained(size, wbits, abits) export_qonnx(fc, torch.randn(1, 3, 32, 32), finn_onnx) diff --git a/tests/transformation/streamline/test_streamline_fc.py b/tests/transformation/streamline/test_streamline_fc.py index edc4a96fe2..9ce2f2ab65 100644 --- a/tests/transformation/streamline/test_streamline_fc.py +++ b/tests/transformation/streamline/test_streamline_fc.py @@ -52,8 +52,6 @@ from finn.util.basic import make_build_dir from finn.util.test import get_test_model_trained -export_onnx_path = make_build_dir("test_streamline_fc_") - @pytest.mark.streamline # act bits @@ -68,6 +66,7 @@ def test_streamline_fc(size, wbits, abits): if wbits > abits: pytest.skip("No wbits > abits cases at the moment") nname = "%s_%dW%dA" % (size, wbits, abits) + export_onnx_path = make_build_dir("test_streamline_fc_") finn_onnx = export_onnx_path + "/%s.onnx" % nname fc = get_test_model_trained(size, wbits, abits) export_qonnx(fc, torch.randn(1, 1, 28, 28), finn_onnx) diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py index c8f80a8e1b..c8787b4098 100644 --- a/tests/util/test_build_dataflow.py +++ b/tests/util/test_build_dataflow.py @@ -64,6 +64,7 @@ def 
test_end2end_build_dataflow_directory(): assert os.path.isfile(output_dir + "/bitfile/finn-accel.hwh") assert os.path.isfile(output_dir + "/report/post_synth_resources.xml") assert os.path.isfile(output_dir + "/report/post_route_timing.rpt") + assert os.path.isfile(output_dir + "/report/post_synth_resources.json") # verification outputs verif_batchsize = np.load(target_dir + "/input.npy").shape[0] for i in range(verif_batchsize): @@ -71,5 +72,6 @@ def test_end2end_build_dataflow_directory(): assert os.path.isfile(verify_out_dir + f"/verify_initial_python_{i}_SUCCESS.npy") assert os.path.isfile(verify_out_dir + f"/verify_streamlined_python_{i}_SUCCESS.npy") assert os.path.isfile(verify_out_dir + f"/verify_folded_hls_cppsim_{i}_SUCCESS.npy") + assert os.path.isfile(verify_out_dir + f"/verify_node_by_node_rtlsim_{i}_SUCCESS.npy") assert os.path.isfile(verify_out_dir + f"/verify_stitched_ip_rtlsim_{i}_SUCCESS.npy") assert os.path.isfile(output_dir + f"/report/verify_rtlsim_{i}.vcd") diff --git a/tests/util/test_data_packing_hls.py b/tests/util/test_data_packing.py similarity index 79% rename from tests/util/test_data_packing_hls.py rename to tests/util/test_data_packing.py index b95bcd5d42..a718f171e2 100644 --- a/tests/util/test_data_packing_hls.py +++ b/tests/util/test_data_packing.py @@ -36,7 +36,7 @@ from qonnx.util.basic import gen_finn_dt_tensor from finn.util.basic import make_build_dir -from finn.util.data_packing import numpy_to_hls_code +from finn.util.data_packing import npy_to_rtlsim_input, numpy_to_hls_code @pytest.mark.util @@ -141,3 +141,42 @@ def remove_all_whitespace(s): eB = """{{ap_uint<4>("0xf", 16), ap_uint<4>("0xf", 16)}, {ap_uint<4>("0x7", 16), ap_uint<4>("0xd", 16)}};""" assert remove_all_whitespace(ret) == remove_all_whitespace(eB) + + +@pytest.mark.util +@pytest.mark.parametrize( + "dtype", + [ + DataType["BINARY"], + DataType["BIPOLAR"], + DataType["TERNARY"], + DataType["INT2"], + DataType["INT7"], + DataType["INT8"], + DataType["INT22"], + DataType["INT32"], + DataType["UINT7"], + DataType["UINT8"], + DataType["UINT15"], + DataType["FIXED<9,6>"], + DataType["FLOAT32"], + ], +) +def test_npy_to_rtlsim_input(dtype): + # check if slow and fast data packing produce the same non-sign-extended input for rtlsim + # fast mode is triggered for certain data types if last (SIMD) dim = 1 + inp_fast = gen_finn_dt_tensor(dtype, (1, 8, 8, 8 // 1, 1)) # N H W FOLD SIMD + inp_slow = inp_fast.reshape((1, 8, 8, 8 // 2, 2)) # N H W FOLD SIMD + + output_fast = npy_to_rtlsim_input(inp_fast, dtype, 1 * dtype.bitwidth()) + output_slow = npy_to_rtlsim_input(inp_slow, dtype, 2 * dtype.bitwidth()) + + output_slow_split = [] + for x in output_slow: + # least significant bits = first element: + output_slow_split.append(x & ((1 << dtype.bitwidth()) - 1)) + # remaining bits = second element: + output_slow_split.append(x >> dtype.bitwidth()) + + assert all([(x >> dtype.bitwidth()) == 0 for x in output_fast]), "extraneous bits detected" + assert np.all(output_fast == output_slow_split), "different behavior of packing modes detected" diff --git a/tests/util/test_hls_vector.py b/tests/util/test_hls_vector.py new file mode 100644 index 0000000000..35d9b1b2fc --- /dev/null +++ b/tests/util/test_hls_vector.py @@ -0,0 +1,117 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pytest + +import numpy as np +import os +import shutil +import subprocess +from qonnx.core.datatype import DataType +from qonnx.util.basic import gen_finn_dt_tensor + +from finn.util.basic import make_build_dir + + +@pytest.mark.util +@pytest.mark.parametrize( + "dtype", + [ + DataType["BINARY"], + DataType["UINT8"], + DataType["INT32"], + DataType["FIXED<9,6>"], + DataType["FLOAT32"], + ], +) +@pytest.mark.parametrize("test_shape", [(1, 2, 4), (1, 1, 64), (2, 64)]) +@pytest.mark.vivado +def test_npy2vectorstream(test_shape, dtype): + ndarray = gen_finn_dt_tensor(dtype, test_shape) + test_dir = make_build_dir(prefix="test_npy2vectorstream_") + shape = ndarray.shape + elem_hls_type = dtype.get_hls_datatype_str() + vLen = shape[-1] + npy_in = test_dir + "/in.npy" + npy_out = test_dir + "/out.npy" + # restrict the np datatypes we can handle + npyt_to_ct = { + "float32": "float", + "float64": "double", + "int8": "int8_t", + "int32": "int32_t", + "int64": "int64_t", + "uint8": "uint8_t", + "uint32": "uint32_t", + "uint64": "uint64_t", + } + npy_type = npyt_to_ct[str(ndarray.dtype)] + shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") + test_app_string = [] + test_app_string += ["#include "] + test_app_string += ["#define AP_INT_MAX_W 8191"] + test_app_string += ['#include "ap_int.h"'] + test_app_string += ['#include "stdint.h"'] + test_app_string += ['#include "hls_stream.h"'] + test_app_string += ['#include "hls_vector.h"'] + test_app_string += ['#include "cnpy.h"'] + test_app_string += ['#include "npy2vectorstream.hpp"'] + test_app_string += ["int main(int argc, char *argv[]) {"] + test_app_string += ["hls::stream<hls::vector<%s, %d>> teststream;" % (elem_hls_type, vLen)] + test_app_string += [ + 'npy2vectorstream<%s, %s, %d>("%s", teststream);' % (elem_hls_type, npy_type, vLen, npy_in) + ] + test_app_string += [ + 'vectorstream2npy<%s, %s, %d>(teststream, %s, "%s");' + % (elem_hls_type, npy_type, vLen, shape_cpp_str, npy_out) + ] + test_app_string += ["return 0;"] + test_app_string += ["}"] + with open(test_dir + "/test.cpp", "w") as f:
f.write("\n".join(test_app_string)) + cmd_compile = """ +g++ -o test_npy2vectorstream test.cpp $FINN_ROOT/deps/cnpy/cnpy.cpp \ +-I$FINN_ROOT/deps/cnpy/ -I{}/include -I$FINN_ROOT/src/finn/qnn-data/cpp \ +--std=c++14 -lz """.format( + os.environ["HLS_PATH"] + ) + with open(test_dir + "/compile.sh", "w") as f: + f.write(cmd_compile) + compile = subprocess.Popen(["sh", "compile.sh"], stdout=subprocess.PIPE, cwd=test_dir) + (stdout, stderr) = compile.communicate() + # make copy before saving the array + ndarray = ndarray.copy() + np.save(npy_in, ndarray) + execute = subprocess.Popen("./test_npy2vectorstream", stdout=subprocess.PIPE, cwd=test_dir) + (stdout, stderr) = execute.communicate() + produced = np.load(npy_out) + success = (produced == ndarray).all() + # only delete generated code if test has passed + # useful for debug otherwise + if success: + shutil.rmtree(test_dir) + assert success diff --git a/tutorials/fpga_flow/README.md b/tutorials/fpga_flow/README.md index 2aaad0423b..71f2a2a625 100644 --- a/tutorials/fpga_flow/README.md +++ b/tutorials/fpga_flow/README.md @@ -25,20 +25,29 @@ This demo was created using Vivado 2022.1. Prior to running, insure the following prerequisites have been met: - Install FINN and prerequisites. The [Getting Started](https://finn.readthedocs.io/en/latest/getting_started.html#quickstart) section of the FINN documentation might be helpful for this. - Ensure you have the `FINN_XILINX_PATH` and `FINN_XILINX_VERSION` env variables set appropriately for your install. For example: -> export FINN_XILINX_PATH=/opt/Xilinx -> export FINN_XILINX_VERSION=2022.1 +```shell +export FINN_XILINX_PATH=/opt/Xilinx +export FINN_XILINX_VERSION=2022.1 +``` + - Set the env variable for your `finn` install top directory (where you cloned the FINN compiler repo): -> export FINN_ROOT=/home/foo/finn +```shell +export FINN_ROOT=/home/foo/finn +``` Then, change to `finn` install directory and invoke the build as follows: -> cd ${FINN_ROOT} -> ./run-docker.sh build_custom ${FINN_ROOT}/tutorials/fpga_flow/ +```shell +cd ${FINN_ROOT} +./run-docker.sh build_custom ${FINN_ROOT}/tutorials/fpga_flow/ +``` Alternatively, since the tutorials folder is already part of the FINN compiler installation, you can invoke it from within the Docker container: -> cd ${FINN_ROOT} -> ./run-docker.sh -> cd tutorials/fpga_flow -> python build.py +```shell +cd ${FINN_ROOT} +./run-docker.sh +cd tutorials/fpga_flow +python build.py +``` The build should finish in about 10 minutes, and the FINN docker will close on success. @@ -59,12 +68,14 @@ The build should finish in about 10 minutes, and the FINN docker will close on s ### Examine the Stitched IP Navigate to the stitched IP project directory: - -> cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/stitched_ip +```shell +cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/stitched_ip +``` And, open the project: - -> vivado finn_vivado_stitch_proj.xpr +```shell +vivado finn_vivado_stitch_proj.xpr +``` Explore the IPI board design and note the interfaces. @@ -89,9 +100,10 @@ them under `${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/sim`. Let's ex the FINN compiler. Used for launching the testbench simulation. 
You can now launch the simulation as follows: - -> cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/sim -> vivado -mode gui -source make_sim_proj.tcl +```shell +cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/sim +vivado -mode gui -source make_sim_proj.tcl +``` The simulation should complete with: diff --git a/tutorials/fpga_flow/folding_config.json b/tutorials/fpga_flow/folding_config.json index 642200d02b..bf94f8058d 100644 --- a/tutorials/fpga_flow/folding_config.json +++ b/tutorials/fpga_flow/folding_config.json @@ -1,30 +1,29 @@ { "Defaults": {}, - "Thresholding_Batch_0": { - "PE": 49, - "ram_style": "block" + "Thresholding_rtl_0": { + "PE": 49 }, - "MatrixVectorActivation_0": { + "MVAU_hls_0": { "PE": 16, "SIMD": 49, "ram_style": "block" }, - "MatrixVectorActivation_1": { + "MVAU_hls_1": { "PE": 8, "SIMD": 8, "ram_style": "auto" }, - "MatrixVectorActivation_2": { + "MVAU_hls_2": { "PE": 8, "SIMD": 8, "ram_style": "auto" }, - "MatrixVectorActivation_3": { + "MVAU_hls_3": { "PE": 10, "SIMD": 8, "ram_style": "distributed" }, - "LabelSelect_Batch_0": { + "LabelSelect_hls_0": { "PE": 1 } }
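Aside on the RoundAndClipThresholds test changes above: the test comments rely on two facts worth spelling out, namely that every integer of magnitude up to 2^24 is exactly representable in 32-bit floating point (which is why rounding only becomes observable for the 25- and 26-bit configurations), and that the transformation is expected to re-annotate the thresholds with the smallest datatype covering one step beyond the input range. The minimal sketch below illustrates both points; it is not part of the patch, uses only numpy and the qonnx DataType API that the tests themselves import, and the INT24 value is an arbitrary example chosen for illustration.

```python
# Illustrative aside, not part of the patch above.
import numpy as np
from qonnx.core.datatype import DataType

# Integers with magnitude up to 2^24 are exact in float32, so threshold
# rounding only becomes observable beyond that point. Hence the tests probe
# INT23/INT24 (still exact) against INT25/INT26 (rounding visible).
assert np.float32(2**24 + 1) == np.float32(2**24)  # 2^24 + 1 collapses to 2^24
assert np.float32(2**23 + 1) != np.float32(2**23)  # below the limit, still exact

# The tests expect thresholds to be clipped to one step beyond the input range
# and then annotated with the smallest datatype able to hold that value,
# mirroring the assertion pattern used in the updated test cases.
i_dtype = DataType["INT24"]  # example input datatype, chosen for illustration
if not i_dtype.signed():
    new_tdt = DataType.get_smallest_possible(i_dtype.max() + 1)
else:
    new_tdt = DataType.get_smallest_possible(-(i_dtype.max() + 1) - 1)
assert new_tdt == DataType["INT25"]  # one extra bit to hold the clip boundary
```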