From 3002e6239903af9f9f9444ef3fbbb8935ba2bb92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Tue, 20 Sep 2022 11:08:59 +0100
Subject: [PATCH 001/235] HDL for new thresholding by binary search.

---
 finn-rtllib/thresholding/hdl/thresholding.sv  | 153 ++++++++++++++
 .../thresholding/hdl/thresholding_axi.sv      | 198 ++++++++++++++++++
 .../hdl/thresholding_axi_wrapper.v            | 122 +++++++++++
 3 files changed, 473 insertions(+)
 create mode 100644 finn-rtllib/thresholding/hdl/thresholding.sv
 create mode 100644 finn-rtllib/thresholding/hdl/thresholding_axi.sv
 create mode 100644 finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v

diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv
new file mode 100644
index 0000000000..93ccdc51c5
--- /dev/null
+++ b/finn-rtllib/thresholding/hdl/thresholding.sv
@@ -0,0 +1,153 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Pipelined thresholding by binary search.
+ * @author	Thomas B. Preußer <tpreusse@amd.com>
+ *
+ * @description
+ *  Produces the N-bit count of those among 2^N-1 thresholds that are not
+ *  larger than the corresponding input:
+ *     y = Σ(T_i <= x)
+ *  The result is computed by binary search. The runtime-configurable
+ *  thresholds must be written in ascending order:
+ *     i < j => T_i < T_j
+ *  The design supports channel folding allowing each input to be processed
+ *  with respect to a selectable set of thresholds. The corresponding
+ *  threshold configuration relies on a channel address prefix. Inputs are
+ *  accompanied by a channel selector.
+ *****************************************************************************/
+module thresholding #(	// Pipelined binary search: N stages, each deciding one result bit
+	int unsigned  N,  // output precision (also the number of pipeline stages)
+	int unsigned  M,  // input/threshold precision
+	int unsigned  C,  // number of channels
+
+	localparam int unsigned  C_BITS = C < 2? 1 : $clog2(C)	// channel selector width, at least one bit
+)(
+	// Global Control
+	input	logic  clk,
+	input	logic  rst,	// synchronous: marks every pipeline stage as invalid
+
+	// Threshold Configuration
+	input	logic  twe,	// write enable
+	input	logic [$clog2(C)+N-1:0]  twa,	// write address: channel prefix above the N-bit threshold index
+	input	logic [          M-1:0]  twd,	// write data: threshold value
+
+	// Clock Enable for Stream Processing
+	input	logic  en,	// low freezes pipeline registers and threshold reads; writes are not gated by it
+
+	// Input Stream
+	input	logic  ivld,
+	input	logic [C_BITS-1:0]  icnl,	// Ignored for C == 1
+	input	logic [M     -1:0]  idat,
+
+	// Output Stream
+	output	logic  ovld,
+	output	logic [C_BITS-1:0]  ocnl,	// channel selector carried along with the result
+	output	logic [N     -1:0]  odat	// assembled result, N enabled clock edges after its input
+);
+
+	// Pipeline Links & Feed
+	typedef struct packed {
+		logic               vld;	// Valid data identification
+		logic [C_BITS-1:0]  cnl;	// Channel
+		logic [M     -1:0]  val;	// Original input value
+		logic [0:N-1]       res;	// Assembling result with valid prefix [0:stage] after stage #stage
+	} pipe_t;
+	uwire pipe_t  pipe[0:N];	// pipe[i] is the input of stage i, pipe[N] the module output
+	assign	pipe[0] = pipe_t'{ vld: ivld, cnl: icnl, val: idat, res: {N{1'bx}} };	// Feed original input
+
+	// Stages: 0, 1, ..., N-1
+	uwire [0:N-1]  tws = (twa[N-1:0]+1) & ~twa[N-1:0];   // Write Select per stage by address suffix: one-hot at the lowest zero bit of the index, none for all-ones
+	for(genvar  stage = 0; stage < N; stage++) begin : genStages
+
+		// Threshold Memory
+		uwire [M-1:0]  thresh;	// threshold compared against the item held in State
+		if(1) begin : blkUpdate
+
+			// Write control: local select from global address
+			uwire  we = twe && tws[stage];
+			if((C == 1) && (stage == 0)) begin	// lone first-stage threshold: plain register, nothing to address
+				logic [M-1:0]  Thresh = 'x;
+				always_ff @(posedge clk) begin
+					if(rst)      Thresh <= 'x;	// reset discards the configured value
+					else if(we)  Thresh <= twd;
+				end
+				assign  thresh = Thresh;
+			end
+			else begin
+				logic [M-1:0]  Threshs[C * 2**stage];	// 2^stage entries for each channel
+				uwire [$clog2(C)+stage-1:0]  wa = twa[$left(twa):N-stage];	// channel prefix and leading 'stage' bits of the index
+				uwire [$clog2(C)+stage-1:0]  ra;	// channel and the result bits decided by earlier stages
+				if(C > 1)  assign  ra[stage+:C_BITS] = pipe[stage].cnl;
+				if(stage)  assign  ra[stage-1:0]     = pipe[stage].res[0:stage-1];
+
+				// Write
+				always_ff @(posedge clk) begin
+					if(we)  Threshs[wa] <= twd;
+				end
+
+				// Read: registered on the same edge and enable as State below
+				logic [M-1:0]  RdReg;
+				always_ff @(posedge clk) begin
+					if(en)  RdReg <= Threshs[ra];
+				end
+				assign	thresh = RdReg;
+			end
+
+		end : blkUpdate
+
+		// Pipeline regs simply copying the input
+		pipe_t  State = '{ vld: 0, cnl: 'x, val: 'x, res: 'x };
+		always_ff @(posedge clk) begin
+			if(rst)      State <= '{ vld: 0, cnl: 'x, val: 'x, res: 'x };
+			else if(en)  State <= pipe[stage];
+		end
+
+		// Assemble pipeline data
+		logic [0:N-1]  res;
+		always_comb begin
+			res        = State.res;
+			res[stage] = thresh <= State.val;	// Patch in next result bit
+		end
+		assign	pipe[stage+1] = '{
+			vld: State.vld,
+			cnl: State.cnl,
+			val: State.val,
+			res: res
+		};
+
+	end : genStages
+
+	// Output
+	assign	ovld = pipe[N].vld;
+	assign	ocnl = pipe[N].cnl;
+	assign	odat = pipe[N].res;	// res[0], the bit decided first, lands in the MSB
+
+endmodule : thresholding
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
new file mode 100644
index 0000000000..71e54c5ca0
--- /dev/null
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
@@ -0,0 +1,198 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	All-AXI interface adapter for thresholding module.
+ * @author	Thomas B. Preußer <tpreusse@amd.com>
+ *****************************************************************************/
+
+module thresholding_axi #(	// AXI-Lite configuration and AXI-Stream data shell around the thresholding core
+	int unsigned  N,	// output precision
+	int unsigned  M,	// input/threshold precision
+	int unsigned  C		// Channels
+)(
+	//- Global Control ------------------
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,	// active-low; only evaluated on clock edges
+
+	//- AXI Lite ------------------------
+	// Writing
+	input	logic                    s_axilite_AWVALID,
+	output	logic                    s_axilite_AWREADY,
+	input	logic [$clog2(C)+N-1:0]  s_axilite_AWADDR,	// forwarded unshifted as threshold address; NOTE(review): AXI addresses are byte-based - confirm masters issue threshold indices
+
+	input	logic         s_axilite_WVALID,
+	output	logic         s_axilite_WREADY,
+	input	logic [31:0]  s_axilite_WDATA,	// only bits [M-1:0] are stored
+	input	logic [ 3:0]  s_axilite_WSTRB,	// not evaluated
+
+	output	logic        s_axilite_BVALID,
+	input	logic        s_axilite_BREADY,
+	output	logic [1:0]  s_axilite_BRESP,
+
+	// Reading
+	input	logic        s_axilite_ARVALID,
+	output	logic        s_axilite_ARREADY,
+	input	logic [0:0]  s_axilite_ARADDR,	// not evaluated
+
+	output	logic         s_axilite_RVALID,
+	input	logic         s_axilite_RREADY,
+	output	logic [31:0]  s_axilite_RDATA,
+	output	logic [ 1:0]  s_axilite_RRESP,
+
+	//- AXI Stream - Input --------------
+	output	logic  s_axis_tready,
+	input	logic  s_axis_tvalid,
+	input	logic [((M+7)/8)*8-1:0]  s_axis_tdata,	// M bits rounded up to whole bytes
+
+	//- AXI Stream - Output -------------
+	input	logic  m_axis_tready,
+	output	logic  m_axis_tvalid,
+	output	logic [((N+7)/8)*8-1:0]  m_axis_tdata	// N bits rounded up to whole bytes
+);
+	//- Global Control ------------------------------------------------------
+	uwire  clk = ap_clk;
+	uwire  rst = !ap_rst_n;
+
+	//- AXI Lite: Threshold Configuration -----------------------------------
+	uwire  twe;
+	uwire [$clog2(C)+N-1:0]  twa;
+	uwire [          M-1:0]  twd;
+	if(1) begin : blkAxiLite
+		logic  WABusy = 0;	// a write address has been latched
+		logic  WDBusy = 0;	// write data has been latched
+		logic [$clog2(C)+N-1:0]  Addr = 'x;
+		logic [          M-1:0]  Data = 'x;
+
+		assign	twe = WABusy && WDBusy;	// asserted from both parts being latched until the response is accepted
+		assign	twa = Addr;
+		assign	twd = Data;
+
+		uwire  clr_wr = rst || (twe && s_axilite_BREADY);	// reset or write response handshake
+		always_ff @(posedge clk) begin : blockName
+			if(clr_wr) begin
+				WABusy <= 0;
+				Addr <= 'x;
+				WDBusy <= 0;
+				Data <= 'x;
+			end
+			else begin
+				if(!WABusy) begin	// address channel free: latch whenever AWVALID
+					WABusy <= s_axilite_AWVALID;
+					Addr   <= s_axilite_AWADDR[$clog2(C)+N-1:0];
+				end
+				if(!WDBusy) begin	// data channel free: latch whenever WVALID
+					WDBusy <= s_axilite_WVALID;
+					Data   <= s_axilite_WDATA[M-1:0];
+				end
+			end
+		end
+		assign	s_axilite_AWREADY = !WABusy;
+		assign	s_axilite_WREADY  = !WDBusy;
+		assign	s_axilite_BVALID  = WABusy && WDBusy;
+		assign	s_axilite_BRESP   = '0; // OK
+
+		// Answer all reads with '1
+		logic  RValid =  0;	// a read response is pending
+		uwire  clr_rd = rst || (RValid && s_axilite_RREADY);	// reset or read response handshake
+		always_ff @(posedge clk) begin
+			if(clr_rd)        RValid <=  0;
+			else if(!RValid)  RValid <= s_axilite_ARVALID;
+		end
+		assign	s_axilite_ARREADY = !RValid;
+		assign	s_axilite_RVALID  = RValid;
+		assign	s_axilite_RDATA   = '1;
+		assign	s_axilite_RRESP   = '0; // OK
+
+	end : blkAxiLite
+
+	//- IO-Sandwich with two-stage output buffer for containing a local enable
+	uwire  en;
+	uwire [N-1:0]  odat;
+	uwire  ovld;
+	if(1) begin : blkOutputDecouple
+		typedef struct {
+			logic          vld;
+			logic [N-1:0]  dat;
+		} buf_t;
+		buf_t  Buf[2] = '{ default: '{ vld: 0, dat: 'x } };	// Buf[1]: output register, Buf[0]: overflow slot behind it
+		always_ff @(posedge clk) begin
+			if(rst)  Buf <= '{ default: '{ vld: 0, dat: 'x } };
+			else begin
+				if(!Buf[1].vld || m_axis_tready) begin	// output register empty or being drained
+					Buf[1] <= '{
+						vld: Buf[0].vld || ovld,
+						dat: Buf[0].vld? Buf[0].dat : odat	// overflow slot goes first to keep the order
+					};
+				end
+				Buf[0].vld <= Buf[1].vld && !m_axis_tready && (Buf[0].vld || ovld);	// hold or catch data while the output is blocked
+				if(!Buf[0].vld)  Buf[0].dat <= odat;
+			end
+		end
+		assign	en = !Buf[0].vld;	// core and input stall while the overflow slot is occupied
+
+		assign	m_axis_tvalid = Buf[1].vld;
+		assign	m_axis_tdata  = Buf[1].dat;
+
+	end : blkOutputDecouple
+
+	localparam int unsigned  C_BITS = C < 2? 1 : $clog2(C);
+	uwire  ivld = s_axis_tvalid;
+	uwire [C_BITS-1:0]  icnl;
+	uwire [M     -1:0]  idat = s_axis_tdata[M-1:0];	// padding bits above M are dropped
+	assign	s_axis_tready = en;
+	if(C == 1)  assign  icnl = 'x;	// selector is a don't-care for a single channel
+	else begin
+		logic [C_BITS-1:0]  Chnl = 0;	// channel of the current input, cycling 0 .. C-1
+		logic               Last = 0;	// set while Chnl sits on the final channel
+		uwire  inc = ivld && en;	// an input transfer takes place
+		uwire  clr = rst || (Last && inc);	// wrap to channel 0 after the final one
+		always_ff @(posedge clk) begin
+			if(clr) begin
+				Chnl <= 0;
+				Last <= 0;
+			end
+			else if(inc) begin
+				Chnl <= Chnl + 1;
+				Last <= (~Chnl & (C-2)) == 0;	// first true at Chnl == C-2, i.e. when stepping onto C-1
+			end
+		end
+		assign	icnl = Chnl;
+	end
+
+	// Core Thresholding Module
+	thresholding #(.N(N), .M(M), .C(C)) core (
+		.clk, .rst,
+		.twe, .twa, .twd,
+		.en,
+		.ivld, .icnl, .idat,
+		.ovld, .ocnl(), .odat	// the output channel tag is left unconnected
+	);
+
+endmodule : thresholding_axi
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
new file mode 100644
index 0000000000..bb6b17b32f
--- /dev/null
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
@@ -0,0 +1,122 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	IPI-compatible Verilog wrapper for thresholding_axi module.
+ * @author	Thomas B. Preußer <tpreusse@amd.com>
+ *****************************************************************************/
+
+module thresholding_axi_wrapper #(	// Port-for-port shell around thresholding_axi; none of the parameters has a default, so all must be set on instantiation
+	parameter  N,	// output precision
+	parameter  M,	// input/threshold precision
+	parameter  C,	// Channels
+	parameter  C_BITS // channel address bits, expected to equal $clog2(C); only sizes s_axilite_AWADDR here
+)(
+	//- Global Control ------------------
+	input	ap_clk,
+	input	ap_rst_n,
+
+	//- AXI Lite ------------------------
+	// Writing
+	input	                s_axilite_AWVALID,
+	output	                s_axilite_AWREADY,
+	input	[C_BITS+N-1:0]  s_axilite_AWADDR,	// inner port is [$clog2(C)+N-1:0] wide
+
+	input	        s_axilite_WVALID,
+	output	        s_axilite_WREADY,
+	input	[31:0]  s_axilite_WDATA,
+	input	[ 3:0]  s_axilite_WSTRB,
+
+	output	       s_axilite_BVALID,
+	input	       s_axilite_BREADY,
+	output	[1:0]  s_axilite_BRESP,
+
+	// Reading
+	input	       s_axilite_ARVALID,
+	output	       s_axilite_ARREADY,
+	input	[0:0]  s_axilite_ARADDR,
+
+	output	        s_axilite_RVALID,
+	input	        s_axilite_RREADY,
+	output	[31:0]  s_axilite_RDATA,
+	output	[ 1:0]  s_axilite_RRESP,
+
+	//- AXI Stream - Input --------------
+	output	s_axis_tready,
+	input	s_axis_tvalid,
+	input	[((M+7)/8)*8-1:0]  s_axis_tdata,	// M bits rounded up to whole bytes
+
+	//- AXI Stream - Output -------------
+	input	m_axis_tready,
+	output	m_axis_tvalid,
+	output	[((N+7)/8)*8-1:0]  m_axis_tdata	// N bits rounded up to whole bytes
+);
+
+	thresholding_axi #(.N(N), .M(M), .C(C)) inst (	// C_BITS is not passed on: the inner module derives its widths from C
+		//- Global Control ------------------
+		.ap_clk(ap_clk),
+		.ap_rst_n(ap_rst_n),
+
+		//- AXI Lite ------------------------
+		// Writing
+		.s_axilite_AWVALID(s_axilite_AWVALID),
+		.s_axilite_AWREADY(s_axilite_AWREADY),
+		.s_axilite_AWADDR(s_axilite_AWADDR),
+
+		.s_axilite_WVALID(s_axilite_WVALID),
+		.s_axilite_WREADY(s_axilite_WREADY),
+		.s_axilite_WDATA(s_axilite_WDATA),
+		.s_axilite_WSTRB(s_axilite_WSTRB),
+
+		.s_axilite_BVALID(s_axilite_BVALID),
+		.s_axilite_BREADY(s_axilite_BREADY),
+		.s_axilite_BRESP(s_axilite_BRESP),
+
+		// Reading
+		.s_axilite_ARVALID(s_axilite_ARVALID),
+		.s_axilite_ARREADY(s_axilite_ARREADY),
+		.s_axilite_ARADDR(s_axilite_ARADDR),
+
+		.s_axilite_RVALID(s_axilite_RVALID),
+		.s_axilite_RREADY(s_axilite_RREADY),
+		.s_axilite_RDATA(s_axilite_RDATA),
+		.s_axilite_RRESP(s_axilite_RRESP),
+
+		//- AXI Stream - Input --------------
+		.s_axis_tready(s_axis_tready),
+		.s_axis_tvalid(s_axis_tvalid),
+		.s_axis_tdata(s_axis_tdata),
+
+		//- AXI Stream - Output -------------
+		.m_axis_tready(m_axis_tready),
+		.m_axis_tvalid(m_axis_tvalid),
+		.m_axis_tdata(m_axis_tdata)
+	);
+
+endmodule // thresholding_axi_wrapper (end labels are SystemVerilog-only; this file is compiled as Verilog)

From 3c92c2fc460fb5e45fdb0dfcc0b92c572ae65ce7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Tue, 20 Sep 2022 13:33:01 +0100
Subject: [PATCH 002/235] IP core support files for thresholding module.

---
 finn-rtllib/thresholding/component.xml        | 817 ++++++++++++++++++
 .../xgui/thresholding_axi_wrapper_v1_0.tcl    |  74 ++
 2 files changed, 891 insertions(+)
 create mode 100644 finn-rtllib/thresholding/component.xml
 create mode 100644 finn-rtllib/thresholding/xgui/thresholding_axi_wrapper_v1_0.tcl

diff --git a/finn-rtllib/thresholding/component.xml b/finn-rtllib/thresholding/component.xml
new file mode 100644
index 0000000000..0a56f93316
--- /dev/null
+++ b/finn-rtllib/thresholding/component.xml
@@ -0,0 +1,817 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<spirit:component xmlns:xilinx="http://www.xilinx.com" xmlns:spirit="http://www.spiritconsortium.org/XMLSchema/SPIRIT/1685-2009" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  <spirit:vendor>amd.com</spirit:vendor>
+  <spirit:library>user</spirit:library>
+  <spirit:name>thresholding_axi_wrapper</spirit:name>
+  <spirit:version>1.0</spirit:version>
+  <spirit:busInterfaces>
+    <spirit:busInterface>
+      <spirit:name>m_axis</spirit:name>
+      <spirit:busType spirit:vendor="xilinx.com" spirit:library="interface" spirit:name="axis" spirit:version="1.0"/>
+      <spirit:abstractionType spirit:vendor="xilinx.com" spirit:library="interface" spirit:name="axis_rtl" spirit:version="1.0"/>
+      <spirit:master/>
+      <spirit:portMaps>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>TDATA</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>m_axis_tdata</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>TVALID</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>m_axis_tvalid</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>TREADY</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>m_axis_tready</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+      </spirit:portMaps>
+    </spirit:busInterface>
+    <spirit:busInterface>
+      <spirit:name>s_axis</spirit:name>
+      <spirit:busType spirit:vendor="xilinx.com" spirit:library="interface" spirit:name="axis" spirit:version="1.0"/>
+      <spirit:abstractionType spirit:vendor="xilinx.com" spirit:library="interface" spirit:name="axis_rtl" spirit:version="1.0"/>
+      <spirit:slave/>
+      <spirit:portMaps>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>TDATA</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axis_tdata</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>TVALID</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axis_tvalid</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>TREADY</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axis_tready</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+      </spirit:portMaps>
+    </spirit:busInterface>
+    <spirit:busInterface>
+      <spirit:name>s_axilite</spirit:name>
+      <spirit:busType spirit:vendor="xilinx.com" spirit:library="interface" spirit:name="aximm" spirit:version="1.0"/>
+      <spirit:abstractionType spirit:vendor="xilinx.com" spirit:library="interface" spirit:name="aximm_rtl" spirit:version="1.0"/>
+      <spirit:slave>
+        <spirit:memoryMapRef spirit:memoryMapRef="s_axilite"/>
+      </spirit:slave>
+      <spirit:portMaps>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>AWADDR</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axilite_AWADDR</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>AWVALID</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axilite_AWVALID</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>AWREADY</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axilite_AWREADY</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>WDATA</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axilite_WDATA</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>WSTRB</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axilite_WSTRB</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>WVALID</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axilite_WVALID</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>WREADY</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axilite_WREADY</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>BRESP</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axilite_BRESP</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>BVALID</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axilite_BVALID</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>BREADY</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axilite_BREADY</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>ARADDR</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axilite_ARADDR</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>ARVALID</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axilite_ARVALID</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>ARREADY</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axilite_ARREADY</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>RDATA</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axilite_RDATA</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>RRESP</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axilite_RRESP</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>RVALID</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axilite_RVALID</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>RREADY</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>s_axilite_RREADY</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+      </spirit:portMaps>
+    </spirit:busInterface>
+    <spirit:busInterface>
+      <spirit:name>ap_rst_n</spirit:name>
+      <spirit:busType spirit:vendor="xilinx.com" spirit:library="signal" spirit:name="reset" spirit:version="1.0"/>
+      <spirit:abstractionType spirit:vendor="xilinx.com" spirit:library="signal" spirit:name="reset_rtl" spirit:version="1.0"/>
+      <spirit:slave/>
+      <spirit:portMaps>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>RST</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>ap_rst_n</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+      </spirit:portMaps>
+      <spirit:parameters>
+        <spirit:parameter>
+          <spirit:name>POLARITY</spirit:name>
+          <spirit:value spirit:id="BUSIFPARAM_VALUE.AP_RST_N.POLARITY" spirit:choiceRef="choice_list_74b5137e">ACTIVE_LOW</spirit:value>
+        </spirit:parameter>
+      </spirit:parameters>
+    </spirit:busInterface>
+    <spirit:busInterface>
+      <spirit:name>ap_clk</spirit:name>
+      <spirit:busType spirit:vendor="xilinx.com" spirit:library="signal" spirit:name="clock" spirit:version="1.0"/>
+      <spirit:abstractionType spirit:vendor="xilinx.com" spirit:library="signal" spirit:name="clock_rtl" spirit:version="1.0"/>
+      <spirit:slave/>
+      <spirit:portMaps>
+        <spirit:portMap>
+          <spirit:logicalPort>
+            <spirit:name>CLK</spirit:name>
+          </spirit:logicalPort>
+          <spirit:physicalPort>
+            <spirit:name>ap_clk</spirit:name>
+          </spirit:physicalPort>
+        </spirit:portMap>
+      </spirit:portMaps>
+      <spirit:parameters>
+        <spirit:parameter>
+          <spirit:name>ASSOCIATED_RESET</spirit:name>
+          <spirit:value spirit:id="BUSIFPARAM_VALUE.AP_CLK.ASSOCIATED_RESET">ap_rst_n</spirit:value>
+        </spirit:parameter>
+        <spirit:parameter>
+          <spirit:name>ASSOCIATED_BUSIF</spirit:name>
+          <spirit:value spirit:id="BUSIFPARAM_VALUE.AP_CLK.ASSOCIATED_BUSIF">m_axis:s_axis:s_axilite</spirit:value>
+        </spirit:parameter>
+      </spirit:parameters>
+    </spirit:busInterface>
+  </spirit:busInterfaces>
+  <spirit:memoryMaps>
+    <spirit:memoryMap>
+      <spirit:name>s_axilite</spirit:name>
+      <spirit:displayName>s_axilite</spirit:displayName>
+      <spirit:addressBlock>
+        <spirit:name>reg0</spirit:name>
+        <spirit:displayName>reg0</spirit:displayName>
+        <spirit:baseAddress spirit:format="bitString" spirit:bitStringLength="1">0x0</spirit:baseAddress>
+        <spirit:range spirit:format="long" spirit:resolve="dependent" spirit:dependency="pow(2,((spirit:decode(id(&apos;MODELPARAM_VALUE.C_BITS&apos;)) + spirit:decode(id(&apos;MODELPARAM_VALUE.N&apos;))) - 1) - 0 + 1)" spirit:minimum="4096" spirit:rangeType="long">4096</spirit:range>
+        <spirit:width spirit:format="long">32</spirit:width>
+        <spirit:usage>register</spirit:usage>
+      </spirit:addressBlock>
+    </spirit:memoryMap>
+  </spirit:memoryMaps>
+  <spirit:model>
+    <spirit:views>
+      <spirit:view>
+        <spirit:name>xilinx_anylanguagesynthesis</spirit:name>
+        <spirit:displayName>Synthesis</spirit:displayName>
+        <spirit:envIdentifier>:vivado.xilinx.com:synthesis</spirit:envIdentifier>
+        <spirit:language>Verilog</spirit:language>
+        <spirit:modelName>thresholding_axi_wrapper</spirit:modelName>
+        <spirit:fileSetRef>
+          <spirit:localName>xilinx_anylanguagesynthesis_view_fileset</spirit:localName>
+        </spirit:fileSetRef>
+        <spirit:parameters>
+          <spirit:parameter>
+            <spirit:name>viewChecksum</spirit:name>
+            <spirit:value>5cc8f7a9</spirit:value>
+          </spirit:parameter>
+        </spirit:parameters>
+      </spirit:view>
+      <spirit:view>
+        <spirit:name>xilinx_xpgui</spirit:name>
+        <spirit:displayName>UI Layout</spirit:displayName>
+        <spirit:envIdentifier>:vivado.xilinx.com:xgui.ui</spirit:envIdentifier>
+        <spirit:fileSetRef>
+          <spirit:localName>xilinx_xpgui_view_fileset</spirit:localName>
+        </spirit:fileSetRef>
+        <spirit:parameters>
+          <spirit:parameter>
+            <spirit:name>viewChecksum</spirit:name>
+            <spirit:value>c456596c</spirit:value>
+          </spirit:parameter>
+        </spirit:parameters>
+      </spirit:view>
+    </spirit:views>
+    <spirit:ports>
+      <spirit:port>
+        <spirit:name>ap_clk</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>ap_rst_n</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axilite_AWVALID</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axilite_AWREADY</spirit:name>
+        <spirit:wire>
+          <spirit:direction>out</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axilite_AWADDR</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:vector>
+            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="((spirit:decode(id(&apos;MODELPARAM_VALUE.C_BITS&apos;)) + spirit:decode(id(&apos;MODELPARAM_VALUE.N&apos;))) - 1)">3</spirit:left>
+            <spirit:right spirit:format="long">0</spirit:right>
+          </spirit:vector>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic_vector</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axilite_WVALID</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axilite_WREADY</spirit:name>
+        <spirit:wire>
+          <spirit:direction>out</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axilite_WDATA</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:vector>
+            <spirit:left spirit:format="long">31</spirit:left>
+            <spirit:right spirit:format="long">0</spirit:right>
+          </spirit:vector>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic_vector</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axilite_WSTRB</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:vector>
+            <spirit:left spirit:format="long">3</spirit:left>
+            <spirit:right spirit:format="long">0</spirit:right>
+          </spirit:vector>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic_vector</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">1</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axilite_BVALID</spirit:name>
+        <spirit:wire>
+          <spirit:direction>out</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axilite_BREADY</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axilite_BRESP</spirit:name>
+        <spirit:wire>
+          <spirit:direction>out</spirit:direction>
+          <spirit:vector>
+            <spirit:left spirit:format="long">1</spirit:left>
+            <spirit:right spirit:format="long">0</spirit:right>
+          </spirit:vector>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic_vector</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axilite_ARVALID</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axilite_ARREADY</spirit:name>
+        <spirit:wire>
+          <spirit:direction>out</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axilite_ARADDR</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:vector>
+            <spirit:left spirit:format="long">0</spirit:left>
+            <spirit:right spirit:format="long">0</spirit:right>
+          </spirit:vector>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic_vector</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axilite_RVALID</spirit:name>
+        <spirit:wire>
+          <spirit:direction>out</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axilite_RREADY</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axilite_RDATA</spirit:name>
+        <spirit:wire>
+          <spirit:direction>out</spirit:direction>
+          <spirit:vector>
+            <spirit:left spirit:format="long">31</spirit:left>
+            <spirit:right spirit:format="long">0</spirit:right>
+          </spirit:vector>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic_vector</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axilite_RRESP</spirit:name>
+        <spirit:wire>
+          <spirit:direction>out</spirit:direction>
+          <spirit:vector>
+            <spirit:left spirit:format="long">1</spirit:left>
+            <spirit:right spirit:format="long">0</spirit:right>
+          </spirit:vector>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic_vector</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axis_tready</spirit:name>
+        <spirit:wire>
+          <spirit:direction>out</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axis_tvalid</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>s_axis_tdata</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:vector>
+            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="((((spirit:decode(id(&apos;MODELPARAM_VALUE.M&apos;)) + 7) / 8) * 8) - 1)">15</spirit:left>
+            <spirit:right spirit:format="long">0</spirit:right>
+          </spirit:vector>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic_vector</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>m_axis_tready</spirit:name>
+        <spirit:wire>
+          <spirit:direction>in</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+          <spirit:driver>
+            <spirit:defaultValue spirit:format="long">1</spirit:defaultValue>
+          </spirit:driver>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>m_axis_tvalid</spirit:name>
+        <spirit:wire>
+          <spirit:direction>out</spirit:direction>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+        </spirit:wire>
+      </spirit:port>
+      <spirit:port>
+        <spirit:name>m_axis_tdata</spirit:name>
+        <spirit:wire>
+          <spirit:direction>out</spirit:direction>
+          <spirit:vector>
+            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="((((spirit:decode(id(&apos;MODELPARAM_VALUE.N&apos;)) + 7) / 8) * 8) - 1)">7</spirit:left>
+            <spirit:right spirit:format="long">0</spirit:right>
+          </spirit:vector>
+          <spirit:wireTypeDefs>
+            <spirit:wireTypeDef>
+              <spirit:typeName>std_logic_vector</spirit:typeName>
+              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
+            </spirit:wireTypeDef>
+          </spirit:wireTypeDefs>
+        </spirit:wire>
+      </spirit:port>
+    </spirit:ports>
+    <spirit:modelParameters>
+      <spirit:modelParameter xsi:type="spirit:nameValueTypeType">
+        <spirit:name>N</spirit:name>
+        <spirit:displayName>N</spirit:displayName>
+        <spirit:value spirit:format="long" spirit:resolve="generated" spirit:id="MODELPARAM_VALUE.N">4</spirit:value>
+      </spirit:modelParameter>
+      <spirit:modelParameter>
+        <spirit:name>M</spirit:name>
+        <spirit:displayName>M</spirit:displayName>
+        <spirit:value spirit:format="long" spirit:resolve="generated" spirit:id="MODELPARAM_VALUE.M">16</spirit:value>
+      </spirit:modelParameter>
+      <spirit:modelParameter>
+        <spirit:name>C</spirit:name>
+        <spirit:displayName>C</spirit:displayName>
+        <spirit:value spirit:format="long" spirit:resolve="generated" spirit:id="MODELPARAM_VALUE.C">1</spirit:value>
+      </spirit:modelParameter>
+      <spirit:modelParameter>
+        <spirit:name>C_BITS</spirit:name>
+        <spirit:displayName>C Bits</spirit:displayName>
+        <spirit:value spirit:format="long" spirit:resolve="generated" spirit:id="MODELPARAM_VALUE.C_BITS">0</spirit:value>
+      </spirit:modelParameter>
+    </spirit:modelParameters>
+  </spirit:model>
+  <spirit:choices>
+    <spirit:choice>
+      <spirit:name>choice_list_74b5137e</spirit:name>
+      <spirit:enumeration>ACTIVE_HIGH</spirit:enumeration>
+      <spirit:enumeration>ACTIVE_LOW</spirit:enumeration>
+    </spirit:choice>
+  </spirit:choices>
+  <spirit:fileSets>
+    <spirit:fileSet>
+      <spirit:name>xilinx_anylanguagesynthesis_view_fileset</spirit:name>
+      <spirit:file>
+        <spirit:name>hdl/thresholding.sv</spirit:name>
+        <spirit:fileType>systemVerilogSource</spirit:fileType>
+      </spirit:file>
+      <spirit:file>
+        <spirit:name>hdl/thresholding_axi.sv</spirit:name>
+        <spirit:fileType>systemVerilogSource</spirit:fileType>
+      </spirit:file>
+      <spirit:file>
+        <spirit:name>hdl/thresholding_axi_wrapper.v</spirit:name>
+        <spirit:fileType>verilogSource</spirit:fileType>
+        <spirit:userFileType>CHECKSUM_2ec027ae</spirit:userFileType>
+      </spirit:file>
+    </spirit:fileSet>
+    <spirit:fileSet>
+      <spirit:name>xilinx_xpgui_view_fileset</spirit:name>
+      <spirit:file>
+        <spirit:name>xgui/thresholding_axi_wrapper_v1_0.tcl</spirit:name>
+        <spirit:fileType>tclSource</spirit:fileType>
+        <spirit:userFileType>CHECKSUM_c456596c</spirit:userFileType>
+        <spirit:userFileType>XGUI_VERSION_2</spirit:userFileType>
+      </spirit:file>
+    </spirit:fileSet>
+  </spirit:fileSets>
+  <spirit:description>thresholding_axi_wrapper_v1_0</spirit:description>
+  <spirit:parameters>
+    <spirit:parameter>
+      <spirit:name>N</spirit:name>
+      <spirit:displayName>N</spirit:displayName>
+      <spirit:value spirit:format="long" spirit:resolve="user" spirit:id="PARAM_VALUE.N">4</spirit:value>
+    </spirit:parameter>
+    <spirit:parameter>
+      <spirit:name>M</spirit:name>
+      <spirit:displayName>M</spirit:displayName>
+      <spirit:value spirit:format="long" spirit:resolve="user" spirit:id="PARAM_VALUE.M">16</spirit:value>
+    </spirit:parameter>
+    <spirit:parameter>
+      <spirit:name>C</spirit:name>
+      <spirit:displayName>C</spirit:displayName>
+      <spirit:value spirit:format="long" spirit:resolve="user" spirit:id="PARAM_VALUE.C">1</spirit:value>
+    </spirit:parameter>
+    <spirit:parameter>
+      <spirit:name>C_BITS</spirit:name>
+      <spirit:displayName>C_BITS</spirit:displayName>
+      <spirit:value spirit:format="long" spirit:resolve="user" spirit:id="PARAM_VALUE.C_BITS">0</spirit:value>
+    </spirit:parameter>
+    <spirit:parameter>
+      <spirit:name>Component_Name</spirit:name>
+      <spirit:value spirit:resolve="user" spirit:id="PARAM_VALUE.Component_Name" spirit:order="1">thresholding_axi_wrapper_v1_0</spirit:value>
+    </spirit:parameter>
+  </spirit:parameters>
+  <spirit:vendorExtensions>
+    <xilinx:coreExtensions>
+      <xilinx:supportedFamilies>
+        <xilinx:family xilinx:lifeCycle="Production">virtex7</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">qvirtex7</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">versal</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">kintex7</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">kintex7l</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">qkintex7</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">qkintex7l</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">akintex7</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">artix7</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">artix7l</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">aartix7</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">qartix7</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">zynq</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">qzynq</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">azynq</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">spartan7</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">aspartan7</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">virtexu</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">zynquplus</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">virtexuplus</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">virtexuplusHBM</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">virtexuplus58g</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">kintexuplus</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">artixuplus</xilinx:family>
+        <xilinx:family xilinx:lifeCycle="Production">kintexu</xilinx:family>
+      </xilinx:supportedFamilies>
+      <xilinx:taxonomies>
+        <xilinx:taxonomy>/UserIP</xilinx:taxonomy>
+      </xilinx:taxonomies>
+      <xilinx:displayName>thresholding_axi_wrapper_v1_0</xilinx:displayName>
+      <xilinx:definitionSource>package_project</xilinx:definitionSource>
+      <xilinx:vendorDisplayName>AMD</xilinx:vendorDisplayName>
+      <xilinx:coreRevision>2</xilinx:coreRevision>
+      <xilinx:coreCreationDateTime>2022-09-20T12:31:16Z</xilinx:coreCreationDateTime>
+    </xilinx:coreExtensions>
+    <xilinx:packagingInfo>
+      <xilinx:xilinxVersion>2022.1</xilinx:xilinxVersion>
+      <xilinx:checksum xilinx:scope="busInterfaces" xilinx:value="e262c422"/>
+      <xilinx:checksum xilinx:scope="memoryMaps" xilinx:value="d6ddd21a"/>
+      <xilinx:checksum xilinx:scope="fileGroups" xilinx:value="3302678a"/>
+      <xilinx:checksum xilinx:scope="ports" xilinx:value="c5010d89"/>
+      <xilinx:checksum xilinx:scope="hdlParameters" xilinx:value="db16a7df"/>
+      <xilinx:checksum xilinx:scope="parameters" xilinx:value="17d48459"/>
+    </xilinx:packagingInfo>
+  </spirit:vendorExtensions>
+</spirit:component>
diff --git a/finn-rtllib/thresholding/xgui/thresholding_axi_wrapper_v1_0.tcl b/finn-rtllib/thresholding/xgui/thresholding_axi_wrapper_v1_0.tcl
new file mode 100644
index 0000000000..02c373e8f2
--- /dev/null
+++ b/finn-rtllib/thresholding/xgui/thresholding_axi_wrapper_v1_0.tcl
@@ -0,0 +1,74 @@
+# Definitional proc to organize widgets for parameters.
+proc init_gui { IPINST } {
+  ipgui::add_param $IPINST -name "Component_Name"
+  #Adding Page
+  set Page_0 [ipgui::add_page $IPINST -name "Page 0"]
+  set C [ipgui::add_param $IPINST -name "C" -parent ${Page_0}]
+  set_property tooltip {Channel Count} ${C}
+  set C_BITS [ipgui::add_param $IPINST -name "C_BITS" -parent ${Page_0}]
+  set_property tooltip {Must be clog2(C)} ${C_BITS}
+  set M [ipgui::add_param $IPINST -name "M" -parent ${Page_0}]
+  set_property tooltip {Input Precision} ${M}
+  set N [ipgui::add_param $IPINST -name "N" -parent ${Page_0}]
+  set_property tooltip {Output Precision} ${N}
+
+
+}
+
+proc update_PARAM_VALUE.C { PARAM_VALUE.C } {
+	# Procedure called to update C when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.C { PARAM_VALUE.C } {
+	# Procedure called to validate C
+	return true
+}
+
+proc update_PARAM_VALUE.C_BITS { PARAM_VALUE.C_BITS } {
+	# Procedure called to update C_BITS when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.C_BITS { PARAM_VALUE.C_BITS } {
+	# Procedure called to validate C_BITS
+	return true
+}
+
+proc update_PARAM_VALUE.M { PARAM_VALUE.M } {
+	# Procedure called to update M when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.M { PARAM_VALUE.M } {
+	# Procedure called to validate M
+	return true
+}
+
+proc update_PARAM_VALUE.N { PARAM_VALUE.N } {
+	# Procedure called to update N when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.N { PARAM_VALUE.N } {
+	# Procedure called to validate N
+	return true
+}
+
+
+proc update_MODELPARAM_VALUE.N { MODELPARAM_VALUE.N PARAM_VALUE.N } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.N}] ${MODELPARAM_VALUE.N}
+}
+
+proc update_MODELPARAM_VALUE.M { MODELPARAM_VALUE.M PARAM_VALUE.M } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.M}] ${MODELPARAM_VALUE.M}
+}
+
+proc update_MODELPARAM_VALUE.C { MODELPARAM_VALUE.C PARAM_VALUE.C } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.C}] ${MODELPARAM_VALUE.C}
+}
+
+proc update_MODELPARAM_VALUE.C_BITS { MODELPARAM_VALUE.C_BITS PARAM_VALUE.C_BITS } {
+	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+	set_property value [get_property value ${PARAM_VALUE.C_BITS}] ${MODELPARAM_VALUE.C_BITS}
+}
+

From 09c6da9fc27c3897d3a9cb7423a3e21978f17c2c Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 21 Sep 2022 15:36:12 +0100
Subject: [PATCH 003/235] [thresholding] FINN-44: Add skeleton class for
 Thresholding_Bin_Search (the RTL version; no HLS support is required for this class).

The following functions have been removed when compared to the original Thresholding_Batch class:
    - get_weightstream_width_padded()
        needed for cppsim
    - get_ap_int_max_w()
        needed for cppsim
    - get_template_param_values()
        needed for cppsim (note: an empty stub returning dict() is still defined in this commit)
    - get_hls_compatible_threshold_tensor()
        needed for cppsim/hlslib
    - get_verilog_top_module_intf_names()
        already have TOP verilog module interface names I think
    - get_op_and_param_counts()
        not used anywhere
    - ipgen_extra_directives()
        needed for cppsim/hlslib

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../thresholding_binary_search.py             | 159 ++++++++++++++++++
 1 file changed, 159 insertions(+)
 create mode 100755 src/finn/custom_op/fpgadataflow/thresholding_binary_search.py

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
new file mode 100755
index 0000000000..0e1916706b
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -0,0 +1,159 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+
+"""@package thresholding_binary_search
+- ONNX i/o tensor shape assumptions for Thresholding:
+- input 0 is the input tensor, shape (..., NumChannels)
+- input 1 is the threshold tensor, shape (NumChannels, n_thres)
+- output 0 is the output tensor, shape (..., NumChannels) - same as input
+- the '...' here can be any shape (representing groups of vectors)
+
+This module creates an RTL IP, HLS is not supported. See 'thresholding_batch'
+for a HLS equivalent.
+"""
+
+
+class Thresholding_Bin_Search(HLSCustomOp):
+    """Class that corresponds to finn-rtllib 'thresholding' function."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        return {}
+
+    def calc_tmem(self):
+        return 0
+
+    def make_shape_compatible_op(self, model):
+        return []
+
+    def infer_node_datatype(self, model):
+        return
+
+    def verify_node(self):
+        return []
+
+    def bram_estimation(self):
+        return 0
+
+    def lut_estimation(self):
+        return 0
+
+    def get_input_datatype(self):
+        return None
+
+    def get_output_datatype(self):
+        return None
+
+    def get_weight_datatype(self):
+        return None
+
+    def minimize_accumulator_width(self, model):
+        return None
+
+    def get_instream_width(self):
+        return 0
+
+    def get_outstream_width(self):
+        return 0
+
+    def get_weightstream_width(self):
+        return 0
+
+    def get_folded_input_shape(self):
+        return tuple([] + [])
+
+    def get_folded_output_shape(self):
+        return tuple([] + [])
+
+    def get_normal_input_shape(self):
+        return tuple([] + [])
+
+    def get_normal_output_shape(self):
+        return tuple([] + [])
+
+    def get_number_output_values(self):
+        return 0
+
+    def get_exp_cycles(self):
+        return 0
+
+    def get_template_param_values(self):
+        return dict()
+
+    def make_weight_file(self, weights, weight_file_mode, weight_file_name):
+        """Produce a file containing given weights (thresholds) in appropriate
+        format for this layer. This file can be used for either synthesis or
+        run-time reconfig of weights.
+
+        Arguments:
+        * weights : numpy array with weights to be put into the file
+        * weight_file_mode : one of {hls_header, decoupled_verilog_dat,
+          decoupled_runtime}
+        * weight_file_name : filename for the weight file to be generated
+        """
+        return
+
+    def generate_params(self, model, path):
+        return
+
+    def execute_node(self, context, graph):
+        return
+
+    def code_generation_ipi(self):
+        return []
+
+    def global_includes(self):
+        pass
+
+    def defines(self, var):
+        pass
+
+    def read_npy_data(self):
+        pass
+
+    def strm_decl(self):
+        pass
+
+    def docompute(self):
+        pass
+
+    def dataoutstrm(self):
+        pass
+
+    def save_as_npy(self):
+        pass
+
+    def blackboxfunction(self):
+        pass
+
+    def pragmas(self):
+        pass

From 1dde2479f65de6cd8bce0be7091189c5b2d313c1 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 21 Sep 2022 15:52:58 +0100
Subject: [PATCH 004/235] [thresholding] FINN-44: Update custom_op's __init__
 to pick up new Thresholding_Bin_Search class

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 src/finn/custom_op/fpgadataflow/__init__.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index e5eb483a00..65fbd6e20c 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -58,6 +58,9 @@
 from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO
 from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch
 from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch
+from finn.custom_op.fpgadataflow.thresholding_binary_search import (
+    Thresholding_Bin_Search,
+)
 from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker
 from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour_Batch
 from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation
@@ -79,6 +82,7 @@
 custom_op["Pool_Batch"] = Pool_Batch
 custom_op["FMPadding_Batch"] = FMPadding_Batch
 custom_op["Thresholding_Batch"] = Thresholding_Batch
+custom_op["Thresholding_Binary_search"] = Thresholding_Bin_Search
 custom_op["AddStreams_Batch"] = AddStreams_Batch
 custom_op["LabelSelect_Batch"] = LabelSelect_Batch
 custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch

From 95082d3ce1f518494910b5444da05722fa8db09c Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 21 Sep 2022 19:01:19 +0100
Subject: [PATCH 005/235] [thresholding] FINN-44: Add initial node attributes
 for Thresholding binary search class

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../thresholding_binary_search.py             | 40 ++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 0e1916706b..97d8e0b281 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -47,7 +47,45 @@ def __init__(self, onnx_node):
         super().__init__(onnx_node)
 
     def get_nodeattr_types(self):
-        return {}
+        my_attrs = {
+            # parallelization; channels thresholded per cycle
+            "PE": ("i", True, 0),
+            # number of channels (each may have different thresholds)
+            "NumChannels": ("i", True, 0),
+            # number of steps in thresholding function. Used only in decoupled mode
+            "numSteps": ("i", True, 1),
+            # string defining memory type
+            "ram_style": ("s", False, "distributed", {"distributed", "block"}),
+            # FINN DataTypes for inputs, outputs
+            "inputDataType": ("s", True, ""),
+            "weightDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+            # input and output FIFO depths
+            "inFIFODepth": ("i", False, 0),
+            "outFIFODepth": ("i", False, 0),
+            # number of input vectors, examples:
+            # [1] is a single vector (like a FC layer with batch=1)
+            # [4] is four vectors (like a FC layer with batch=4)
+            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+            "numInputVectors": ("ints", False, [1]),
+            # memory mode for the thresholds
+            # const -- embedded thresholds, default
+            # decoupled -- streaming thresholds with streamer packaged inside IP
+            "mem_mode": ("s", False, "const", {"const", "decoupled"}),
+            # (mem_mode = decoupled only) whether weights (thresholds) will be
+            # writable through an AXI-lite interface during runtime
+            # 1 for enabled, 0 for disabled.
+            # see finn-rtllib/memstream/doc/README for more about the memory
+            # address map used for writable weights
+            # IMPORTANT: After using AXI lite to either read or write the weights,
+            # always "flush" the accelerator by first passing a dummy input
+            # vector through the accelerator. This will get rid of any old
+            # weight data from the weight FIFOs.
+            "runtime_writeable_weights": ("i", False, 0, {0, 1}),
+            "gen_top_module": ("s", False, ""),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
 
     def calc_tmem(self):
         return 0

From 72832be6caeefdb895a911988ba5ee77d7d2813f Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 21 Sep 2022 19:02:30 +0100
Subject: [PATCH 006/235] [thresholding] FINN-44: Add calc_tmem() method

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 97d8e0b281..6195a26afb 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -88,7 +88,9 @@ def get_nodeattr_types(self):
         return my_attrs
 
     def calc_tmem(self):
-        return 0
+        num_channels = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        return num_channels // pe
 
     def make_shape_compatible_op(self, model):
         return []

From 0d4e3bea27fce23864729663411a80c6734ed402 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 21 Sep 2022 19:06:07 +0100
Subject: [PATCH 007/235] [thresholding] FINN-44: Add methods for retrieving
 input/output/weight data types

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../custom_op/fpgadataflow/thresholding_binary_search.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 6195a26afb..50a3ce5b6b 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -26,6 +26,8 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+from qonnx.core.datatype import DataType
+
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 
 """@package thresholding_binary_search
@@ -108,13 +110,14 @@ def lut_estimation(self):
         return 0
 
     def get_input_datatype(self):
-        return None
+        return DataType[self.get_nodeattr("inputDataType")]
 
     def get_output_datatype(self):
-        return None
+        return DataType[self.get_nodeattr("outputDataType")]
 
     def get_weight_datatype(self):
-        return None
+        """The term 'weights' and 'thresholds' are used interchangably in this class."""
+        return DataType[self.get_nodeattr("weightDataType")]
 
     def minimize_accumulator_width(self, model):
         return None

From 28568c6777d64adaa9d16f9bc58c3eda96fd7dbc Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 21 Sep 2022 19:09:24 +0100
Subject: [PATCH 008/235] [thresholding] FINN-44: Add methods for retrieving
 node input/output shapes

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../fpgadataflow/thresholding_binary_search.py  | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 50a3ce5b6b..ee74f28485 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -132,16 +132,25 @@ def get_weightstream_width(self):
         return 0
 
     def get_folded_input_shape(self):
-        return tuple([] + [])
+        fold = self.calc_tmem()
+        pe = self.get_nodeattr("PE")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        folded_input_shape = tuple(vecs + [fold, pe])
+        return folded_input_shape
 
     def get_folded_output_shape(self):
-        return tuple([] + [])
+        # same shape as input
+        return self.get_folded_input_shape()
 
     def get_normal_input_shape(self):
-        return tuple([] + [])
+        num_channels = self.get_nodeattr("NumChannels")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        normal_input_shape = tuple(vecs + [num_channels])
+        return normal_input_shape
 
     def get_normal_output_shape(self):
-        return tuple([] + [])
+        # same shape as input
+        return self.get_normal_input_shape()
 
     def get_number_output_values(self):
         return 0

From 280870d25864781b2ce3683a10824049d19f9bff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Mon, 24 Oct 2022 14:58:32 +0100
Subject: [PATCH 009/235] Thresholding over signed inputs.

---
 finn-rtllib/thresholding/hdl/thresholding.sv | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv
index 93ccdc51c5..9deeac458c 100644
--- a/finn-rtllib/thresholding/hdl/thresholding.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding.sv
@@ -64,8 +64,8 @@ module thresholding #(
 
 	// Input Stream
 	input	logic  ivld,
-	input	logic [C_BITS-1:0]  icnl,	// Ignored for C == 1
-	input	logic [M     -1:0]  idat,
+	input	logic        [C_BITS-1:0]  icnl,	// Ignored for C == 1
+	input	logic signed [M     -1:0]  idat,
 
 	// Output Stream
 	output	logic  ovld,
@@ -75,10 +75,10 @@ module thresholding #(
 
 	// Pipeline Links & Feed
 	typedef struct packed {
-		logic               vld;	// Valid data identification
-		logic [C_BITS-1:0]  cnl;	// Channel
-		logic [M     -1:0]  val;	// Original input value
-		logic [0:N-1]       res;	// Assembling result with valid prefix [0:stage] after stage #stage
+		logic                      vld;	// Valid data identification
+		logic        [C_BITS-1:0]  cnl;	// Channel
+		logic signed [M     -1:0]  val;	// Original input value
+		logic        [0:N-1]       res;	// Assembling result with valid prefix [0:stage] after stage #stage
 	} pipe_t;
 	uwire pipe_t  pipe[0:N];
 	assign	pipe[0] = pipe_t'{ vld: ivld, cnl: icnl, val: idat, res: {N{1'bx}} };	// Feed original input
@@ -88,13 +88,13 @@ module thresholding #(
 	for(genvar  stage = 0; stage < N; stage++) begin : genStages
 
 		// Threshold Memory
-		uwire [M-1:0]  thresh;
+		uwire signed [M-1:0]  thresh;
 		if(1) begin : blkUpdate
 
 			// Write control: local select from global address
 			uwire  we = twe && tws[stage];
 			if((C == 1) && (stage == 0)) begin
-				logic [M-1:0]  Thresh = 'x;
+				logic signed [M-1:0]  Thresh = 'x;
 				always_ff @(posedge clk) begin
 					if(rst)      Thresh <= 'x;
 					else if(we)  Thresh <= twd;
@@ -102,7 +102,7 @@ module thresholding #(
 				assign  thresh = Thresh;
 			end
 			else begin
-				logic [M-1:0]  Threshs[C * 2**stage];
+				logic signed [M-1:0]  Threshs[C * 2**stage];
 				uwire [$clog2(C)+stage-1:0]  wa = twa[$left(twa):N-stage];
 				uwire [$clog2(C)+stage-1:0]  ra;
 				if(C > 1)  assign  ra[stage+:C_BITS] = pipe[stage].cnl;
@@ -114,7 +114,7 @@ module thresholding #(
 				end
 
 				// Read
-				logic [M-1:0]  RdReg;
+				logic signed [M-1:0]  RdReg;
 				always_ff @(posedge clk) begin
 					if(en)  RdReg <= Threshs[ra];
 				end

From 2bf1a21e463297a885b1a7a40ab78fb2deeb2d52 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Mon, 24 Oct 2022 15:38:22 +0100
Subject: [PATCH 010/235] Introduce an optional threshold output bias.

---
 finn-rtllib/thresholding/hdl/thresholding.sv | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv
index 9deeac458c..cea93e40ab 100644
--- a/finn-rtllib/thresholding/hdl/thresholding.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding.sv
@@ -48,7 +48,12 @@ module thresholding #(
 	int unsigned  M,  // input/threshold precision
 	int unsigned  C,  // number of channels
 
-	localparam int unsigned  C_BITS = C < 2? 1 : $clog2(C)
+	int  BIAS = 0,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
+
+	localparam int unsigned  C_BITS = C < 2? 1 : $clog2(C),
+	localparam int unsigned  O_BITS = BIAS <= 0?
+		/* unsigned */ $clog2(2**N-BIAS) :
+		/* signed */ 1+$clog2(BIAS >= 2**(N-1)? BIAS : 2**N-BIAS)
 )(
 	// Global Control
 	input	logic  clk,
@@ -70,7 +75,7 @@ module thresholding #(
 	// Output Stream
 	output	logic  ovld,
 	output	logic [C_BITS-1:0]  ocnl,
-	output	logic [N     -1:0]  odat
+	output	logic [O_BITS-1:0]  odat
 );
 
 	// Pipeline Links & Feed
@@ -148,6 +153,6 @@ module thresholding #(
 	// Output
 	assign	ovld = pipe[N].vld;
 	assign	ocnl = pipe[N].cnl;
-	assign	odat = pipe[N].res;
+	assign	odat = pipe[N].res - BIAS;
 
 endmodule : thresholding

From 4c7b5acd24cf88716fdfdc1dac8d8cc2c2ece44e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Tue, 25 Oct 2022 06:17:14 +0100
Subject: [PATCH 011/235] Exposing the thresholding bias through the AXI
 adapter.

---
 finn-rtllib/thresholding/hdl/thresholding_axi.sv | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
index 71e54c5ca0..a20952c33b 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
@@ -36,6 +36,12 @@ module thresholding_axi #(
 	int unsigned  N,	// output precision
 	int unsigned  M,	// input/threshold precision
 	int unsigned  C		// Channels
+
+	int  BIAS = 0,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
+
+	localparam int unsigned  O_BITS = BIAS <= 0?
+		/* unsigned */ $clog2(2**N-BIAS) :
+		/* signed */ 1+$clog2(BIAS >= 2**(N-1)? BIAS : 2**N-BIAS)
 )(
 	//- Global Control ------------------
 	input	logic  ap_clk,
@@ -74,7 +80,7 @@ module thresholding_axi #(
 	//- AXI Stream - Output -------------
 	input	logic  m_axis_tready,
 	output	logic  m_axis_tvalid,
-	output	logic [((N+7)/8)*8-1:0]  m_axis_tdata
+	output	logic [((O_BITS+7)/8)*8-1:0]  m_axis_tdata
 );
 	//- Global Control ------------------------------------------------------
 	uwire  clk = ap_clk;
@@ -134,12 +140,12 @@ module thresholding_axi #(
 
 	//- IO-Sandwich with two-stage output buffer for containing a local enable
 	uwire  en;
-	uwire [N-1:0]  odat;
+	uwire [O_BITS-1:0]  odat;
 	uwire  ovld;
 	if(1) begin : blkOutputDecouple
 		typedef struct {
 			logic          vld;
-			logic [N-1:0]  dat;
+			logic [O_BITS-1:0]  dat;
 		} buf_t;
 		buf_t  Buf[2] = '{ default: '{ vld: 0, dat: 'x } };
 		always_ff @(posedge clk) begin
@@ -187,7 +193,7 @@ module thresholding_axi #(
 	end
 
 	// Core Thresholding Module
-	thresholding #(.N(N), .M(M), .C(C)) core (
+	thresholding #(.N(N), .M(M), .C(C), .BIAS(BIAS)) core (
 		.clk, .rst,
 		.twe, .twa, .twd,
 		.en,

From 7663d3f60c445ad595a193eb6b493b4f65b2f921 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Tue, 25 Oct 2022 11:55:19 +0100
Subject: [PATCH 012/235] Have thresholding wrapper pass on bias parameter and
 compute derived ones.

---
 .../thresholding/hdl/thresholding_axi_wrapper.v       | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
index bb6b17b32f..b5c65e5879 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
@@ -36,7 +36,12 @@ module thresholding_axi_wrapper #(
 	parameter  N,	// output precision
 	parameter  M,	// input/threshold precision
 	parameter  C,	// Channels
-	parameter  C_BITS //= $clog2(C)
+	parameter  BIAS = 0,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
+
+	localparam  C_BITS = $clog2(C),
+	localparam  O_BITS = BIAS <= 0?
+		/* unsigned */ $clog2(2**N-BIAS) :
+		/* signed */ 1+$clog2(BIAS >= 2**(N-1)? BIAS : 2**N-BIAS)
 )(
 	//- Global Control ------------------
 	input	ap_clk,
@@ -75,10 +80,10 @@ module thresholding_axi_wrapper #(
 	//- AXI Stream - Output -------------
 	input	m_axis_tready,
 	output	m_axis_tvalid,
-	output	[((N+7)/8)*8-1:0]  m_axis_tdata
+	output	[((O_BITS+7)/8)*8-1:0]  m_axis_tdata
 );
 
-	thresholding_axi #(.N(N), .M(M), .C(C)) inst (
+	thresholding_axi #(.N(N), .M(M), .C(C), .BIAS(BIAS)) inst (
 		//- Global Control ------------------
 		.ap_clk(ap_clk),
 		.ap_rst_n(ap_rst_n),

From 55e2eacd4b554456bb980f7518f9c79d7be3104d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Tue, 25 Oct 2022 15:53:11 +0100
Subject: [PATCH 013/235] Fix typo.

---
 finn-rtllib/thresholding/hdl/thresholding_axi.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
index a20952c33b..6b869ba303 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
@@ -35,7 +35,7 @@
 module thresholding_axi #(
 	int unsigned  N,	// output precision
 	int unsigned  M,	// input/threshold precision
-	int unsigned  C		// Channels
+	int unsigned  C,	// Channels
 
 	int  BIAS = 0,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
 

From fa5d71aaf2b4ba3340aa8e07e23d90bf45bee32d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Tue, 25 Oct 2022 16:58:08 +0100
Subject: [PATCH 014/235] Abandon IPI support files.

---
 finn-rtllib/thresholding/component.xml        | 817 ------------------
 .../xgui/thresholding_axi_wrapper_v1_0.tcl    |  74 --
 2 files changed, 891 deletions(-)
 delete mode 100644 finn-rtllib/thresholding/component.xml
 delete mode 100644 finn-rtllib/thresholding/xgui/thresholding_axi_wrapper_v1_0.tcl

diff --git a/finn-rtllib/thresholding/component.xml b/finn-rtllib/thresholding/component.xml
deleted file mode 100644
index 0a56f93316..0000000000
--- a/finn-rtllib/thresholding/component.xml
+++ /dev/null
@@ -1,817 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<spirit:component xmlns:xilinx="http://www.xilinx.com" xmlns:spirit="http://www.spiritconsortium.org/XMLSchema/SPIRIT/1685-2009" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
-  <spirit:vendor>amd.com</spirit:vendor>
-  <spirit:library>user</spirit:library>
-  <spirit:name>thresholding_axi_wrapper</spirit:name>
-  <spirit:version>1.0</spirit:version>
-  <spirit:busInterfaces>
-    <spirit:busInterface>
-      <spirit:name>m_axis</spirit:name>
-      <spirit:busType spirit:vendor="xilinx.com" spirit:library="interface" spirit:name="axis" spirit:version="1.0"/>
-      <spirit:abstractionType spirit:vendor="xilinx.com" spirit:library="interface" spirit:name="axis_rtl" spirit:version="1.0"/>
-      <spirit:master/>
-      <spirit:portMaps>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>TDATA</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>m_axis_tdata</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>TVALID</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>m_axis_tvalid</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>TREADY</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>m_axis_tready</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-      </spirit:portMaps>
-    </spirit:busInterface>
-    <spirit:busInterface>
-      <spirit:name>s_axis</spirit:name>
-      <spirit:busType spirit:vendor="xilinx.com" spirit:library="interface" spirit:name="axis" spirit:version="1.0"/>
-      <spirit:abstractionType spirit:vendor="xilinx.com" spirit:library="interface" spirit:name="axis_rtl" spirit:version="1.0"/>
-      <spirit:slave/>
-      <spirit:portMaps>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>TDATA</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axis_tdata</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>TVALID</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axis_tvalid</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>TREADY</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axis_tready</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-      </spirit:portMaps>
-    </spirit:busInterface>
-    <spirit:busInterface>
-      <spirit:name>s_axilite</spirit:name>
-      <spirit:busType spirit:vendor="xilinx.com" spirit:library="interface" spirit:name="aximm" spirit:version="1.0"/>
-      <spirit:abstractionType spirit:vendor="xilinx.com" spirit:library="interface" spirit:name="aximm_rtl" spirit:version="1.0"/>
-      <spirit:slave>
-        <spirit:memoryMapRef spirit:memoryMapRef="s_axilite"/>
-      </spirit:slave>
-      <spirit:portMaps>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>AWADDR</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axilite_AWADDR</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>AWVALID</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axilite_AWVALID</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>AWREADY</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axilite_AWREADY</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>WDATA</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axilite_WDATA</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>WSTRB</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axilite_WSTRB</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>WVALID</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axilite_WVALID</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>WREADY</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axilite_WREADY</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>BRESP</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axilite_BRESP</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>BVALID</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axilite_BVALID</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>BREADY</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axilite_BREADY</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>ARADDR</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axilite_ARADDR</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>ARVALID</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axilite_ARVALID</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>ARREADY</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axilite_ARREADY</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>RDATA</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axilite_RDATA</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>RRESP</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axilite_RRESP</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>RVALID</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axilite_RVALID</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>RREADY</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>s_axilite_RREADY</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-      </spirit:portMaps>
-    </spirit:busInterface>
-    <spirit:busInterface>
-      <spirit:name>ap_rst_n</spirit:name>
-      <spirit:busType spirit:vendor="xilinx.com" spirit:library="signal" spirit:name="reset" spirit:version="1.0"/>
-      <spirit:abstractionType spirit:vendor="xilinx.com" spirit:library="signal" spirit:name="reset_rtl" spirit:version="1.0"/>
-      <spirit:slave/>
-      <spirit:portMaps>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>RST</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>ap_rst_n</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-      </spirit:portMaps>
-      <spirit:parameters>
-        <spirit:parameter>
-          <spirit:name>POLARITY</spirit:name>
-          <spirit:value spirit:id="BUSIFPARAM_VALUE.AP_RST_N.POLARITY" spirit:choiceRef="choice_list_74b5137e">ACTIVE_LOW</spirit:value>
-        </spirit:parameter>
-      </spirit:parameters>
-    </spirit:busInterface>
-    <spirit:busInterface>
-      <spirit:name>ap_clk</spirit:name>
-      <spirit:busType spirit:vendor="xilinx.com" spirit:library="signal" spirit:name="clock" spirit:version="1.0"/>
-      <spirit:abstractionType spirit:vendor="xilinx.com" spirit:library="signal" spirit:name="clock_rtl" spirit:version="1.0"/>
-      <spirit:slave/>
-      <spirit:portMaps>
-        <spirit:portMap>
-          <spirit:logicalPort>
-            <spirit:name>CLK</spirit:name>
-          </spirit:logicalPort>
-          <spirit:physicalPort>
-            <spirit:name>ap_clk</spirit:name>
-          </spirit:physicalPort>
-        </spirit:portMap>
-      </spirit:portMaps>
-      <spirit:parameters>
-        <spirit:parameter>
-          <spirit:name>ASSOCIATED_RESET</spirit:name>
-          <spirit:value spirit:id="BUSIFPARAM_VALUE.AP_CLK.ASSOCIATED_RESET">ap_rst_n</spirit:value>
-        </spirit:parameter>
-        <spirit:parameter>
-          <spirit:name>ASSOCIATED_BUSIF</spirit:name>
-          <spirit:value spirit:id="BUSIFPARAM_VALUE.AP_CLK.ASSOCIATED_BUSIF">m_axis:s_axis:s_axilite</spirit:value>
-        </spirit:parameter>
-      </spirit:parameters>
-    </spirit:busInterface>
-  </spirit:busInterfaces>
-  <spirit:memoryMaps>
-    <spirit:memoryMap>
-      <spirit:name>s_axilite</spirit:name>
-      <spirit:displayName>s_axilite</spirit:displayName>
-      <spirit:addressBlock>
-        <spirit:name>reg0</spirit:name>
-        <spirit:displayName>reg0</spirit:displayName>
-        <spirit:baseAddress spirit:format="bitString" spirit:bitStringLength="1">0x0</spirit:baseAddress>
-        <spirit:range spirit:format="long" spirit:resolve="dependent" spirit:dependency="pow(2,((spirit:decode(id(&apos;MODELPARAM_VALUE.C_BITS&apos;)) + spirit:decode(id(&apos;MODELPARAM_VALUE.N&apos;))) - 1) - 0 + 1)" spirit:minimum="4096" spirit:rangeType="long">4096</spirit:range>
-        <spirit:width spirit:format="long">32</spirit:width>
-        <spirit:usage>register</spirit:usage>
-      </spirit:addressBlock>
-    </spirit:memoryMap>
-  </spirit:memoryMaps>
-  <spirit:model>
-    <spirit:views>
-      <spirit:view>
-        <spirit:name>xilinx_anylanguagesynthesis</spirit:name>
-        <spirit:displayName>Synthesis</spirit:displayName>
-        <spirit:envIdentifier>:vivado.xilinx.com:synthesis</spirit:envIdentifier>
-        <spirit:language>Verilog</spirit:language>
-        <spirit:modelName>thresholding_axi_wrapper</spirit:modelName>
-        <spirit:fileSetRef>
-          <spirit:localName>xilinx_anylanguagesynthesis_view_fileset</spirit:localName>
-        </spirit:fileSetRef>
-        <spirit:parameters>
-          <spirit:parameter>
-            <spirit:name>viewChecksum</spirit:name>
-            <spirit:value>5cc8f7a9</spirit:value>
-          </spirit:parameter>
-        </spirit:parameters>
-      </spirit:view>
-      <spirit:view>
-        <spirit:name>xilinx_xpgui</spirit:name>
-        <spirit:displayName>UI Layout</spirit:displayName>
-        <spirit:envIdentifier>:vivado.xilinx.com:xgui.ui</spirit:envIdentifier>
-        <spirit:fileSetRef>
-          <spirit:localName>xilinx_xpgui_view_fileset</spirit:localName>
-        </spirit:fileSetRef>
-        <spirit:parameters>
-          <spirit:parameter>
-            <spirit:name>viewChecksum</spirit:name>
-            <spirit:value>c456596c</spirit:value>
-          </spirit:parameter>
-        </spirit:parameters>
-      </spirit:view>
-    </spirit:views>
-    <spirit:ports>
-      <spirit:port>
-        <spirit:name>ap_clk</spirit:name>
-        <spirit:wire>
-          <spirit:direction>in</spirit:direction>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>ap_rst_n</spirit:name>
-        <spirit:wire>
-          <spirit:direction>in</spirit:direction>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axilite_AWVALID</spirit:name>
-        <spirit:wire>
-          <spirit:direction>in</spirit:direction>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-          <spirit:driver>
-            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
-          </spirit:driver>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axilite_AWREADY</spirit:name>
-        <spirit:wire>
-          <spirit:direction>out</spirit:direction>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axilite_AWADDR</spirit:name>
-        <spirit:wire>
-          <spirit:direction>in</spirit:direction>
-          <spirit:vector>
-            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="((spirit:decode(id(&apos;MODELPARAM_VALUE.C_BITS&apos;)) + spirit:decode(id(&apos;MODELPARAM_VALUE.N&apos;))) - 1)">3</spirit:left>
-            <spirit:right spirit:format="long">0</spirit:right>
-          </spirit:vector>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic_vector</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-          <spirit:driver>
-            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
-          </spirit:driver>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axilite_WVALID</spirit:name>
-        <spirit:wire>
-          <spirit:direction>in</spirit:direction>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-          <spirit:driver>
-            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
-          </spirit:driver>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axilite_WREADY</spirit:name>
-        <spirit:wire>
-          <spirit:direction>out</spirit:direction>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axilite_WDATA</spirit:name>
-        <spirit:wire>
-          <spirit:direction>in</spirit:direction>
-          <spirit:vector>
-            <spirit:left spirit:format="long">31</spirit:left>
-            <spirit:right spirit:format="long">0</spirit:right>
-          </spirit:vector>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic_vector</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-          <spirit:driver>
-            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
-          </spirit:driver>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axilite_WSTRB</spirit:name>
-        <spirit:wire>
-          <spirit:direction>in</spirit:direction>
-          <spirit:vector>
-            <spirit:left spirit:format="long">3</spirit:left>
-            <spirit:right spirit:format="long">0</spirit:right>
-          </spirit:vector>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic_vector</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-          <spirit:driver>
-            <spirit:defaultValue spirit:format="long">1</spirit:defaultValue>
-          </spirit:driver>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axilite_BVALID</spirit:name>
-        <spirit:wire>
-          <spirit:direction>out</spirit:direction>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axilite_BREADY</spirit:name>
-        <spirit:wire>
-          <spirit:direction>in</spirit:direction>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-          <spirit:driver>
-            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
-          </spirit:driver>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axilite_BRESP</spirit:name>
-        <spirit:wire>
-          <spirit:direction>out</spirit:direction>
-          <spirit:vector>
-            <spirit:left spirit:format="long">1</spirit:left>
-            <spirit:right spirit:format="long">0</spirit:right>
-          </spirit:vector>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic_vector</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axilite_ARVALID</spirit:name>
-        <spirit:wire>
-          <spirit:direction>in</spirit:direction>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-          <spirit:driver>
-            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
-          </spirit:driver>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axilite_ARREADY</spirit:name>
-        <spirit:wire>
-          <spirit:direction>out</spirit:direction>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axilite_ARADDR</spirit:name>
-        <spirit:wire>
-          <spirit:direction>in</spirit:direction>
-          <spirit:vector>
-            <spirit:left spirit:format="long">0</spirit:left>
-            <spirit:right spirit:format="long">0</spirit:right>
-          </spirit:vector>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic_vector</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-          <spirit:driver>
-            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
-          </spirit:driver>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axilite_RVALID</spirit:name>
-        <spirit:wire>
-          <spirit:direction>out</spirit:direction>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axilite_RREADY</spirit:name>
-        <spirit:wire>
-          <spirit:direction>in</spirit:direction>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-          <spirit:driver>
-            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
-          </spirit:driver>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axilite_RDATA</spirit:name>
-        <spirit:wire>
-          <spirit:direction>out</spirit:direction>
-          <spirit:vector>
-            <spirit:left spirit:format="long">31</spirit:left>
-            <spirit:right spirit:format="long">0</spirit:right>
-          </spirit:vector>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic_vector</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axilite_RRESP</spirit:name>
-        <spirit:wire>
-          <spirit:direction>out</spirit:direction>
-          <spirit:vector>
-            <spirit:left spirit:format="long">1</spirit:left>
-            <spirit:right spirit:format="long">0</spirit:right>
-          </spirit:vector>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic_vector</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axis_tready</spirit:name>
-        <spirit:wire>
-          <spirit:direction>out</spirit:direction>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axis_tvalid</spirit:name>
-        <spirit:wire>
-          <spirit:direction>in</spirit:direction>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>s_axis_tdata</spirit:name>
-        <spirit:wire>
-          <spirit:direction>in</spirit:direction>
-          <spirit:vector>
-            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="((((spirit:decode(id(&apos;MODELPARAM_VALUE.M&apos;)) + 7) / 8) * 8) - 1)">15</spirit:left>
-            <spirit:right spirit:format="long">0</spirit:right>
-          </spirit:vector>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic_vector</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-          <spirit:driver>
-            <spirit:defaultValue spirit:format="long">0</spirit:defaultValue>
-          </spirit:driver>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>m_axis_tready</spirit:name>
-        <spirit:wire>
-          <spirit:direction>in</spirit:direction>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-          <spirit:driver>
-            <spirit:defaultValue spirit:format="long">1</spirit:defaultValue>
-          </spirit:driver>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>m_axis_tvalid</spirit:name>
-        <spirit:wire>
-          <spirit:direction>out</spirit:direction>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-        </spirit:wire>
-      </spirit:port>
-      <spirit:port>
-        <spirit:name>m_axis_tdata</spirit:name>
-        <spirit:wire>
-          <spirit:direction>out</spirit:direction>
-          <spirit:vector>
-            <spirit:left spirit:format="long" spirit:resolve="dependent" spirit:dependency="((((spirit:decode(id(&apos;MODELPARAM_VALUE.N&apos;)) + 7) / 8) * 8) - 1)">7</spirit:left>
-            <spirit:right spirit:format="long">0</spirit:right>
-          </spirit:vector>
-          <spirit:wireTypeDefs>
-            <spirit:wireTypeDef>
-              <spirit:typeName>std_logic_vector</spirit:typeName>
-              <spirit:viewNameRef>xilinx_anylanguagesynthesis</spirit:viewNameRef>
-            </spirit:wireTypeDef>
-          </spirit:wireTypeDefs>
-        </spirit:wire>
-      </spirit:port>
-    </spirit:ports>
-    <spirit:modelParameters>
-      <spirit:modelParameter xsi:type="spirit:nameValueTypeType">
-        <spirit:name>N</spirit:name>
-        <spirit:displayName>N</spirit:displayName>
-        <spirit:value spirit:format="long" spirit:resolve="generated" spirit:id="MODELPARAM_VALUE.N">4</spirit:value>
-      </spirit:modelParameter>
-      <spirit:modelParameter>
-        <spirit:name>M</spirit:name>
-        <spirit:displayName>M</spirit:displayName>
-        <spirit:value spirit:format="long" spirit:resolve="generated" spirit:id="MODELPARAM_VALUE.M">16</spirit:value>
-      </spirit:modelParameter>
-      <spirit:modelParameter>
-        <spirit:name>C</spirit:name>
-        <spirit:displayName>C</spirit:displayName>
-        <spirit:value spirit:format="long" spirit:resolve="generated" spirit:id="MODELPARAM_VALUE.C">1</spirit:value>
-      </spirit:modelParameter>
-      <spirit:modelParameter>
-        <spirit:name>C_BITS</spirit:name>
-        <spirit:displayName>C Bits</spirit:displayName>
-        <spirit:value spirit:format="long" spirit:resolve="generated" spirit:id="MODELPARAM_VALUE.C_BITS">0</spirit:value>
-      </spirit:modelParameter>
-    </spirit:modelParameters>
-  </spirit:model>
-  <spirit:choices>
-    <spirit:choice>
-      <spirit:name>choice_list_74b5137e</spirit:name>
-      <spirit:enumeration>ACTIVE_HIGH</spirit:enumeration>
-      <spirit:enumeration>ACTIVE_LOW</spirit:enumeration>
-    </spirit:choice>
-  </spirit:choices>
-  <spirit:fileSets>
-    <spirit:fileSet>
-      <spirit:name>xilinx_anylanguagesynthesis_view_fileset</spirit:name>
-      <spirit:file>
-        <spirit:name>hdl/thresholding.sv</spirit:name>
-        <spirit:fileType>systemVerilogSource</spirit:fileType>
-      </spirit:file>
-      <spirit:file>
-        <spirit:name>hdl/thresholding_axi.sv</spirit:name>
-        <spirit:fileType>systemVerilogSource</spirit:fileType>
-      </spirit:file>
-      <spirit:file>
-        <spirit:name>hdl/thresholding_axi_wrapper.v</spirit:name>
-        <spirit:fileType>verilogSource</spirit:fileType>
-        <spirit:userFileType>CHECKSUM_2ec027ae</spirit:userFileType>
-      </spirit:file>
-    </spirit:fileSet>
-    <spirit:fileSet>
-      <spirit:name>xilinx_xpgui_view_fileset</spirit:name>
-      <spirit:file>
-        <spirit:name>xgui/thresholding_axi_wrapper_v1_0.tcl</spirit:name>
-        <spirit:fileType>tclSource</spirit:fileType>
-        <spirit:userFileType>CHECKSUM_c456596c</spirit:userFileType>
-        <spirit:userFileType>XGUI_VERSION_2</spirit:userFileType>
-      </spirit:file>
-    </spirit:fileSet>
-  </spirit:fileSets>
-  <spirit:description>thresholding_axi_wrapper_v1_0</spirit:description>
-  <spirit:parameters>
-    <spirit:parameter>
-      <spirit:name>N</spirit:name>
-      <spirit:displayName>N</spirit:displayName>
-      <spirit:value spirit:format="long" spirit:resolve="user" spirit:id="PARAM_VALUE.N">4</spirit:value>
-    </spirit:parameter>
-    <spirit:parameter>
-      <spirit:name>M</spirit:name>
-      <spirit:displayName>M</spirit:displayName>
-      <spirit:value spirit:format="long" spirit:resolve="user" spirit:id="PARAM_VALUE.M">16</spirit:value>
-    </spirit:parameter>
-    <spirit:parameter>
-      <spirit:name>C</spirit:name>
-      <spirit:displayName>C</spirit:displayName>
-      <spirit:value spirit:format="long" spirit:resolve="user" spirit:id="PARAM_VALUE.C">1</spirit:value>
-    </spirit:parameter>
-    <spirit:parameter>
-      <spirit:name>C_BITS</spirit:name>
-      <spirit:displayName>C_BITS</spirit:displayName>
-      <spirit:value spirit:format="long" spirit:resolve="user" spirit:id="PARAM_VALUE.C_BITS">0</spirit:value>
-    </spirit:parameter>
-    <spirit:parameter>
-      <spirit:name>Component_Name</spirit:name>
-      <spirit:value spirit:resolve="user" spirit:id="PARAM_VALUE.Component_Name" spirit:order="1">thresholding_axi_wrapper_v1_0</spirit:value>
-    </spirit:parameter>
-  </spirit:parameters>
-  <spirit:vendorExtensions>
-    <xilinx:coreExtensions>
-      <xilinx:supportedFamilies>
-        <xilinx:family xilinx:lifeCycle="Production">virtex7</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">qvirtex7</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">versal</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">kintex7</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">kintex7l</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">qkintex7</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">qkintex7l</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">akintex7</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">artix7</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">artix7l</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">aartix7</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">qartix7</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">zynq</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">qzynq</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">azynq</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">spartan7</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">aspartan7</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">virtexu</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">zynquplus</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">virtexuplus</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">virtexuplusHBM</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">virtexuplus58g</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">kintexuplus</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">artixuplus</xilinx:family>
-        <xilinx:family xilinx:lifeCycle="Production">kintexu</xilinx:family>
-      </xilinx:supportedFamilies>
-      <xilinx:taxonomies>
-        <xilinx:taxonomy>/UserIP</xilinx:taxonomy>
-      </xilinx:taxonomies>
-      <xilinx:displayName>thresholding_axi_wrapper_v1_0</xilinx:displayName>
-      <xilinx:definitionSource>package_project</xilinx:definitionSource>
-      <xilinx:vendorDisplayName>AMD</xilinx:vendorDisplayName>
-      <xilinx:coreRevision>2</xilinx:coreRevision>
-      <xilinx:coreCreationDateTime>2022-09-20T12:31:16Z</xilinx:coreCreationDateTime>
-    </xilinx:coreExtensions>
-    <xilinx:packagingInfo>
-      <xilinx:xilinxVersion>2022.1</xilinx:xilinxVersion>
-      <xilinx:checksum xilinx:scope="busInterfaces" xilinx:value="e262c422"/>
-      <xilinx:checksum xilinx:scope="memoryMaps" xilinx:value="d6ddd21a"/>
-      <xilinx:checksum xilinx:scope="fileGroups" xilinx:value="3302678a"/>
-      <xilinx:checksum xilinx:scope="ports" xilinx:value="c5010d89"/>
-      <xilinx:checksum xilinx:scope="hdlParameters" xilinx:value="db16a7df"/>
-      <xilinx:checksum xilinx:scope="parameters" xilinx:value="17d48459"/>
-    </xilinx:packagingInfo>
-  </spirit:vendorExtensions>
-</spirit:component>
diff --git a/finn-rtllib/thresholding/xgui/thresholding_axi_wrapper_v1_0.tcl b/finn-rtllib/thresholding/xgui/thresholding_axi_wrapper_v1_0.tcl
deleted file mode 100644
index 02c373e8f2..0000000000
--- a/finn-rtllib/thresholding/xgui/thresholding_axi_wrapper_v1_0.tcl
+++ /dev/null
@@ -1,74 +0,0 @@
-# Definitional proc to organize widgets for parameters.
-proc init_gui { IPINST } {
-  ipgui::add_param $IPINST -name "Component_Name"
-  #Adding Page
-  set Page_0 [ipgui::add_page $IPINST -name "Page 0"]
-  set C [ipgui::add_param $IPINST -name "C" -parent ${Page_0}]
-  set_property tooltip {Channel Count} ${C}
-  set C_BITS [ipgui::add_param $IPINST -name "C_BITS" -parent ${Page_0}]
-  set_property tooltip {Must be clog2(C)} ${C_BITS}
-  set M [ipgui::add_param $IPINST -name "M" -parent ${Page_0}]
-  set_property tooltip {Input Precision} ${M}
-  set N [ipgui::add_param $IPINST -name "N" -parent ${Page_0}]
-  set_property tooltip {Output Precision} ${N}
-
-
-}
-
-proc update_PARAM_VALUE.C { PARAM_VALUE.C } {
-	# Procedure called to update C when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.C { PARAM_VALUE.C } {
-	# Procedure called to validate C
-	return true
-}
-
-proc update_PARAM_VALUE.C_BITS { PARAM_VALUE.C_BITS } {
-	# Procedure called to update C_BITS when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.C_BITS { PARAM_VALUE.C_BITS } {
-	# Procedure called to validate C_BITS
-	return true
-}
-
-proc update_PARAM_VALUE.M { PARAM_VALUE.M } {
-	# Procedure called to update M when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.M { PARAM_VALUE.M } {
-	# Procedure called to validate M
-	return true
-}
-
-proc update_PARAM_VALUE.N { PARAM_VALUE.N } {
-	# Procedure called to update N when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.N { PARAM_VALUE.N } {
-	# Procedure called to validate N
-	return true
-}
-
-
-proc update_MODELPARAM_VALUE.N { MODELPARAM_VALUE.N PARAM_VALUE.N } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.N}] ${MODELPARAM_VALUE.N}
-}
-
-proc update_MODELPARAM_VALUE.M { MODELPARAM_VALUE.M PARAM_VALUE.M } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.M}] ${MODELPARAM_VALUE.M}
-}
-
-proc update_MODELPARAM_VALUE.C { MODELPARAM_VALUE.C PARAM_VALUE.C } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.C}] ${MODELPARAM_VALUE.C}
-}
-
-proc update_MODELPARAM_VALUE.C_BITS { MODELPARAM_VALUE.C_BITS PARAM_VALUE.C_BITS } {
-	# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
-	set_property value [get_property value ${PARAM_VALUE.C_BITS}] ${MODELPARAM_VALUE.C_BITS}
-}
-

From 174c0ffe1d0614dd14013de1b073469d79c9191e Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Tue, 15 Nov 2022 17:59:23 +0000
Subject: [PATCH 015/235] [thresholding] allow for positive and negative bias
 values

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 finn-rtllib/thresholding/hdl/thresholding.sv            | 4 ++--
 finn-rtllib/thresholding/hdl/thresholding_axi.sv        | 2 +-
 finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv
index cea93e40ab..a99c752e17 100644
--- a/finn-rtllib/thresholding/hdl/thresholding.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding.sv
@@ -51,7 +51,7 @@ module thresholding #(
 	int  BIAS = 0,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
 
 	localparam int unsigned  C_BITS = C < 2? 1 : $clog2(C),
-	localparam int unsigned  O_BITS = BIAS <= 0?
+	localparam int unsigned  O_BITS = BIAS > 0?
 		/* unsigned */ $clog2(2**N-BIAS) :
 		/* signed */ 1+$clog2(BIAS >= 2**(N-1)? BIAS : 2**N-BIAS)
 )(
@@ -153,6 +153,6 @@ module thresholding #(
 	// Output
 	assign	ovld = pipe[N].vld;
 	assign	ocnl = pipe[N].cnl;
-	assign	odat = pipe[N].res - BIAS;
+	assign	odat = pipe[N].res + BIAS;
 
 endmodule : thresholding
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
index 6b869ba303..795683da1d 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
@@ -39,7 +39,7 @@ module thresholding_axi #(
 
 	int  BIAS = 0,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
 
-	localparam int unsigned  O_BITS = BIAS <= 0?
+	localparam int unsigned  O_BITS = BIAS > 0?
 		/* unsigned */ $clog2(2**N-BIAS) :
 		/* signed */ 1+$clog2(BIAS >= 2**(N-1)? BIAS : 2**N-BIAS)
 )(
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
index b5c65e5879..6bfc2f57a4 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
@@ -39,7 +39,7 @@ module thresholding_axi_wrapper #(
 	parameter  BIAS = 0,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
 
 	localparam  C_BITS = $clog2(C),
-	localparam  O_BITS = BIAS <= 0?
+	localparam  O_BITS = BIAS > 0?
 		/* unsigned */ $clog2(2**N-BIAS) :
 		/* signed */ 1+$clog2(BIAS >= 2**(N-1)? BIAS : 2**N-BIAS)
 )(

From 2ec20e5cab8c821d7dc6d652564e85eb1bc84b6b Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Tue, 15 Nov 2022 18:00:52 +0000
Subject: [PATCH 016/235] [thresholding] pass bias from top module to
 thresholding.sv core

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 finn-rtllib/thresholding/hdl/thresholding.sv            | 2 +-
 finn-rtllib/thresholding/hdl/thresholding_axi.sv        | 2 +-
 finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv
index a99c752e17..f9763af96c 100644
--- a/finn-rtllib/thresholding/hdl/thresholding.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding.sv
@@ -48,7 +48,7 @@ module thresholding #(
 	int unsigned  M,  // input/threshold precision
 	int unsigned  C,  // number of channels
 
-	int  BIAS = 0,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
+	int BIAS,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
 
 	localparam int unsigned  C_BITS = C < 2? 1 : $clog2(C),
 	localparam int unsigned  O_BITS = BIAS > 0?
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
index 795683da1d..e4f3feac3f 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
@@ -37,7 +37,7 @@ module thresholding_axi #(
 	int unsigned  M,	// input/threshold precision
 	int unsigned  C,	// Channels
 
-	int  BIAS = 0,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
+	int BIAS,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
 
 	localparam int unsigned  O_BITS = BIAS > 0?
 		/* unsigned */ $clog2(2**N-BIAS) :
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
index 6bfc2f57a4..1b5921d8ba 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
@@ -36,7 +36,7 @@ module thresholding_axi_wrapper #(
 	parameter  N,	// output precision
 	parameter  M,	// input/threshold precision
 	parameter  C,	// Channels
-	parameter  BIAS = 0,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
+	int BIAS = 0,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
 
 	localparam  C_BITS = $clog2(C),
 	localparam  O_BITS = BIAS > 0?

From 861614837dd187dc58ab24af0b5d0cd2050c76e6 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Tue, 15 Nov 2022 18:07:56 +0000
Subject: [PATCH 017/235] [thresholding] pass O_BITS from top module to
 thresholding.sv core

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 finn-rtllib/thresholding/hdl/thresholding.sv            | 4 +---
 finn-rtllib/thresholding/hdl/thresholding_axi.sv        | 6 ++----
 finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v | 4 ++--
 3 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv
index f9763af96c..04116e995c 100644
--- a/finn-rtllib/thresholding/hdl/thresholding.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding.sv
@@ -51,9 +51,7 @@ module thresholding #(
 	int BIAS,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
 
 	localparam int unsigned  C_BITS = C < 2? 1 : $clog2(C),
-	localparam int unsigned  O_BITS = BIAS > 0?
-		/* unsigned */ $clog2(2**N-BIAS) :
-		/* signed */ 1+$clog2(BIAS >= 2**(N-1)? BIAS : 2**N-BIAS)
+	int unsigned O_BITS
 )(
 	// Global Control
 	input	logic  clk,
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
index e4f3feac3f..a7eec445e0 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
@@ -39,9 +39,7 @@ module thresholding_axi #(
 
 	int BIAS,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
 
-	localparam int unsigned  O_BITS = BIAS > 0?
-		/* unsigned */ $clog2(2**N-BIAS) :
-		/* signed */ 1+$clog2(BIAS >= 2**(N-1)? BIAS : 2**N-BIAS)
+	int unsigned O_BITS
 )(
 	//- Global Control ------------------
 	input	logic  ap_clk,
@@ -193,7 +191,7 @@ module thresholding_axi #(
 	end
 
 	// Core Thresholding Module
-	thresholding #(.N(N), .M(M), .C(C), .BIAS(BIAS)) core (
+	thresholding #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS)) core (
 		.clk, .rst,
 		.twe, .twa, .twd,
 		.en,
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
index 1b5921d8ba..5c43a70445 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
@@ -39,7 +39,7 @@ module thresholding_axi_wrapper #(
 	int BIAS = 0,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
 
 	localparam  C_BITS = $clog2(C),
-	localparam  O_BITS = BIAS > 0?
+	parameter  O_BITS = BIAS > 0?
 		/* unsigned */ $clog2(2**N-BIAS) :
 		/* signed */ 1+$clog2(BIAS >= 2**(N-1)? BIAS : 2**N-BIAS)
 )(
@@ -83,7 +83,7 @@ module thresholding_axi_wrapper #(
 	output	[((O_BITS+7)/8)*8-1:0]  m_axis_tdata
 );
 
-	thresholding_axi #(.N(N), .M(M), .C(C), .BIAS(BIAS)) inst (
+	thresholding_axi #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS)) inst (
 		//- Global Control ------------------
 		.ap_clk(ap_clk),
 		.ap_rst_n(ap_rst_n),

From 275abaddee9504360c1589565036611bab5955da Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Tue, 15 Nov 2022 18:10:12 +0000
Subject: [PATCH 018/235] [thresholding] pass C_BITS from top module to
 thresholding.sv core

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 finn-rtllib/thresholding/hdl/thresholding.sv            | 2 +-
 finn-rtllib/thresholding/hdl/thresholding_axi.sv        | 2 +-
 finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv
index 04116e995c..70f94f1c22 100644
--- a/finn-rtllib/thresholding/hdl/thresholding.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding.sv
@@ -50,7 +50,7 @@ module thresholding #(
 
 	int BIAS,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
 
-	localparam int unsigned  C_BITS = C < 2? 1 : $clog2(C),
+	int unsigned  C_BITS,
 	int unsigned O_BITS
 )(
 	// Global Control
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
index a7eec445e0..fac69b33fc 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
@@ -191,7 +191,7 @@ module thresholding_axi #(
 	end
 
 	// Core Thresholding Module
-	thresholding #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS)) core (
+	thresholding #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS), .C_BITS(C_BITS)) core (
 		.clk, .rst,
 		.twe, .twa, .twd,
 		.en,
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
index 5c43a70445..588f9e4852 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
@@ -38,7 +38,7 @@ module thresholding_axi_wrapper #(
 	parameter  C,	// Channels
 	int BIAS = 0,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
 
-	localparam  C_BITS = $clog2(C),
+	parameter  C_BITS = C < 2 ? 1 : $clog2(C),
 	parameter  O_BITS = BIAS > 0?
 		/* unsigned */ $clog2(2**N-BIAS) :
 		/* signed */ 1+$clog2(BIAS >= 2**(N-1)? BIAS : 2**N-BIAS)

From 8849c026b780c152dd51c0e007c5f72bdca4808c Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 16 Nov 2022 09:31:20 +0000
Subject: [PATCH 019/235] [thresholding] create & fill in RTL template values
 using FINN

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 finn-rtllib/thresholding/hdl/thresholding.sv  | 16 +--
 .../thresholding/hdl/thresholding_axi.sv      |  6 +-
 .../hdl/thresholding_axi_wrapper.v            | 14 +--
 .../thresholding_binary_search.py             | 99 +++++++++++++++++++
 4 files changed, 117 insertions(+), 18 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv
index 70f94f1c22..25d6ff3112 100644
--- a/finn-rtllib/thresholding/hdl/thresholding.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding.sv
@@ -43,7 +43,7 @@
  *  threshold configuration relies on a channel address prefix. Inputs are
  *  accompanied by a channel selector.
  *****************************************************************************/
-module thresholding #(
+module $MODULE_NAME$ #(
 	int unsigned  N,  // output precision
 	int unsigned  M,  // input/threshold precision
 	int unsigned  C,  // number of channels
@@ -68,7 +68,7 @@ module thresholding #(
 	// Input Stream
 	input	logic  ivld,
 	input	logic        [C_BITS-1:0]  icnl,	// Ignored for C == 1
-	input	logic signed [M     -1:0]  idat,
+	input	logic $SIGN$ [M     -1:0]  idat,
 
 	// Output Stream
 	output	logic  ovld,
@@ -80,7 +80,7 @@ module thresholding #(
 	typedef struct packed {
 		logic                      vld;	// Valid data identification
 		logic        [C_BITS-1:0]  cnl;	// Channel
-		logic signed [M     -1:0]  val;	// Original input value
+		logic $SIGN$ [M     -1:0]  val;	// Original input value
 		logic        [0:N-1]       res;	// Assembling result with valid prefix [0:stage] after stage #stage
 	} pipe_t;
 	uwire pipe_t  pipe[0:N];
@@ -91,13 +91,13 @@ module thresholding #(
 	for(genvar  stage = 0; stage < N; stage++) begin : genStages
 
 		// Threshold Memory
-		uwire signed [M-1:0]  thresh;
+		uwire $SIGN$ [M-1:0]  thresh;
 		if(1) begin : blkUpdate
 
 			// Write control: local select from global address
 			uwire  we = twe && tws[stage];
 			if((C == 1) && (stage == 0)) begin
-				logic signed [M-1:0]  Thresh = 'x;
+				logic $SIGN$ [M-1:0]  Thresh = 'x;
 				always_ff @(posedge clk) begin
 					if(rst)      Thresh <= 'x;
 					else if(we)  Thresh <= twd;
@@ -105,7 +105,7 @@ module thresholding #(
 				assign  thresh = Thresh;
 			end
 			else begin
-				logic signed [M-1:0]  Threshs[C * 2**stage];
+				logic $SIGN$ [M-1:0]  Threshs[C * 2**stage];
 				uwire [$clog2(C)+stage-1:0]  wa = twa[$left(twa):N-stage];
 				uwire [$clog2(C)+stage-1:0]  ra;
 				if(C > 1)  assign  ra[stage+:C_BITS] = pipe[stage].cnl;
@@ -117,7 +117,7 @@ module thresholding #(
 				end
 
 				// Read
-				logic signed [M-1:0]  RdReg;
+				logic $SIGN$ [M-1:0]  RdReg;
 				always_ff @(posedge clk) begin
 					if(en)  RdReg <= Threshs[ra];
 				end
@@ -153,4 +153,4 @@ module thresholding #(
 	assign	ocnl = pipe[N].cnl;
 	assign	odat = pipe[N].res + BIAS;
 
-endmodule : thresholding
+endmodule : $MODULE_NAME$
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
index fac69b33fc..97cdfd3e12 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
@@ -32,7 +32,7 @@
  * @author	Thomas B. Preußer <tpreusse@amd.com>
  *****************************************************************************/
 
-module thresholding_axi #(
+module $MODULE_NAME_AXI$ #(
 	int unsigned  N,	// output precision
 	int unsigned  M,	// input/threshold precision
 	int unsigned  C,	// Channels
@@ -191,7 +191,7 @@ module thresholding_axi #(
 	end
 
 	// Core Thresholding Module
-	thresholding #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS), .C_BITS(C_BITS)) core (
+	$MODULE_NAME$ #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS), .C_BITS(C_BITS)) core (
 		.clk, .rst,
 		.twe, .twa, .twd,
 		.en,
@@ -199,4 +199,4 @@ module thresholding_axi #(
 		.ovld, .ocnl(), .odat
 	);
 
-endmodule : thresholding_axi
+endmodule : $MODULE_NAME_AXI$
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
index 588f9e4852..e3f8596bc8 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
@@ -32,11 +32,11 @@
  * @author	Thomas B. Preußer <tpreusse@amd.com>
  *****************************************************************************/
 
-module thresholding_axi_wrapper #(
-	parameter  N,	// output precision
-	parameter  M,	// input/threshold precision
-	parameter  C,	// Channels
-	int BIAS = 0,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
+module $MODULE_NAME_AXI_WRAPPER$ #(
+	parameter  N = $N$,	// output precision
+	parameter  M = $M$,	// input/threshold precision
+	parameter  C = $C$,	// Channels
+	int BIAS = $BIAS$,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
 
 	parameter  C_BITS = C < 2 ? 1 : $clog2(C),
 	parameter  O_BITS = BIAS > 0?
@@ -83,7 +83,7 @@ module thresholding_axi_wrapper #(
 	output	[((O_BITS+7)/8)*8-1:0]  m_axis_tdata
 );
 
-	thresholding_axi #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS)) inst (
+	$MODULE_NAME_AXI$ #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS)) inst (
 		//- Global Control ------------------
 		.ap_clk(ap_clk),
 		.ap_rst_n(ap_rst_n),
@@ -124,4 +124,4 @@ module thresholding_axi_wrapper #(
 		.m_axis_tdata(m_axis_tdata)
 	);
 
-endmodule : thresholding_axi_wrapper
+endmodule : $MODULE_NAME_AXI_WRAPPER$
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index ee74f28485..d546d52843 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -26,6 +26,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import os
 from qonnx.core.datatype import DataType
 
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
@@ -85,6 +86,7 @@ def get_nodeattr_types(self):
             # weight data from the weight FIFOs.
             "runtime_writeable_weights": ("i", False, 0, {0, 1}),
             "gen_top_module": ("s", False, ""),
+            "activation_bias": ("i", False, 0),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -174,6 +176,103 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
         """
         return
 
+    # Get the integer from the DataType and string-ify it
+    # This assumes that the data is in the form "INTx" or similar
+    def conv_datatype_to_str(self, data_type):
+        # Handle the case that an int is passed to the function
+        if isinstance(data_type, int):
+            return str(data_type)
+        return str(DataType[data_type].bitwidth())
+
+    def prepare_codegen_rtl_values(self):
+        """All dictionary values produced in this function are to replace
+        their key value(s) in the RTL template files"""
+        code_gen_dict = {}
+
+        # Identify the module names
+        code_gen_dict["$MODULE_NAME$"] = [self.get_verilog_top_module_name()]
+        code_gen_dict["$MODULE_NAME_AXI$"] = [self.get_verilog_top_module_name() + "_axi"]
+        code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name() + "_axi_wrapper"]
+        # Set the top module name - AXI wrapper
+        code_gen_dict["$TOP_MODULE$"] = code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"]
+
+        # Identify the module variables
+        output_data_type = self.get_nodeattr("outputDataType") # output precision
+        input_data_type = self.get_nodeattr("inputDataType") # input/threshold precision
+        num_channels = self.get_nodeattr("NumChannels") # number of channels
+        bias = self.get_nodeattr("activation_bias") # activation bias value
+
+        code_gen_dict["$N$"] = [self.conv_datatype_to_str(output_data_type)] # output precision
+        code_gen_dict["$M$"] = [self.conv_datatype_to_str(input_data_type)] # input/threshold precision
+        code_gen_dict["$C$"] = [self.conv_datatype_to_str(num_channels)] # number of channels
+        code_gen_dict["$BIAS$"] = [self.conv_datatype_to_str(bias)] # activation bias value
+
+        # Is the input datatype signed or unsigned? The thresholding core needs to know this
+        if self.get_input_datatype().min() < 0:
+            code_gen_dict["$SIGN$"] = ["signed"]
+        else:
+            code_gen_dict["$SIGN$"] = ["unsigned"]
+
+        return code_gen_dict
+
+    def get_rtl_file_list(self):
+        return ["thresholding.sv",
+                "thresholding_axi.sv",
+                "thresholding_axi_wrapper.v"]
+
+    def get_rtl_file_paths(self):
+        rtl_root_dir = os.environ["FINN_ROOT"] + "/finn-rtllib/thresholding/hdl/"
+        rtl_file_list = self.get_rtl_file_list()
+        rtl_file_paths = [rtl_root_dir + file for file in rtl_file_list]
+        return rtl_file_paths
+
+    def get_rtl_template_data(self, path):
+        with open(path, "r") as f:
+            template = f.read()
+        return template
+
+    def fill_in_rtl_template_data(self, replace_dict, template_data):
+        template_data_cp = template_data
+        for key in replace_dict:
+            replacement_line = "\n".join(replace_dict[key])
+            template_data_cp = template_data_cp.replace(key, replacement_line)
+        return template_data_cp
+
+    def dump_rtl_data(self, dest_dir, filename, data):
+        with open(os.path.join(dest_dir, filename), "w") as f:
+            f.write(data)
+        return
+
+    def generate_hdl(self):
+        # Generate a dictionary of values to put in RTL template
+        code_gen_dict = self.prepare_codegen_rtl_values()
+
+        # Retrieve the destination directory for the final RTL files
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+
+        for rtl_file_path in self.get_rtl_file_paths():
+            # read in original RTL template file
+            template_data = self.get_rtl_template_data(rtl_file_path)
+            # apply code generation to templates
+            data = self.fill_in_rtl_template_data(code_gen_dict, template_data)
+            # dump filled-in template to destination directory for compilation
+            file_only_path = rtl_file_path.split('/')[-1]
+            self.dump_rtl_data(code_gen_dir, file_only_path, data)
+
+        # Before we return - set the 'gen_top_module' attribute for use later by PyVerilator and IPI generation
+        self.set_nodeattr("gen_top_module", code_gen_dict["$TOP_MODULE$"][0])
+        return
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        self.generate_hdl()
+
+        # set ipgen_path and ip_path so that HLS-Synth transformation
+        # and stitch_ip transformation do not complain
+        # i.e. during the HLSSynthIP() transformation
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        self.set_nodeattr("ipgen_path", code_gen_dir)
+        self.set_nodeattr("ip_path", code_gen_dir)
+
     def generate_params(self, model, path):
         return
 

From 84704edd5aa7e53351819238f96d4c63dfb45d07 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 16 Nov 2022 09:45:29 +0000
Subject: [PATCH 020/235] [thresholding] add method get_weightstream_width()

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../custom_op/fpgadataflow/thresholding_binary_search.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index d546d52843..54fa2def1e 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -131,7 +131,14 @@ def get_outstream_width(self):
         return 0
 
     def get_weightstream_width(self):
-        return 0
+        # Only 'decoupled' mode is supported
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode != "decoupled": raise Exception("Unrecognized memory mode for this node: {}".format(mem_mode))
+        pe = self.get_nodeattr("PE")
+        wp = self.get_weight_datatype().bitwidth()
+        n_thres_steps = self.get_nodeattr("numSteps")
+        w_width = pe * wp * n_thres_steps
+        return w_width
 
     def get_folded_input_shape(self):
         fold = self.calc_tmem()

From 9aa7ff3f8c1a0584afd8684e9280d77aada43105 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 16 Nov 2022 09:48:56 +0000
Subject: [PATCH 021/235] [thresholding] add method get_in/output_width()

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../custom_op/fpgadataflow/thresholding_binary_search.py    | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 54fa2def1e..a1b75b3de1 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -125,10 +125,12 @@ def minimize_accumulator_width(self, model):
         return None
 
     def get_instream_width(self):
-        return 0
+        i_bits = self.get_input_datatype().bitwidth()
+        return i_bits * self.get_nodeattr("PE")
 
     def get_outstream_width(self):
-        return 0
+        o_bits = self.get_output_datatype().bitwidth()
+        return o_bits * self.get_nodeattr("PE")
 
     def get_weightstream_width(self):
         # Only 'decoupled' mode is supported

From 608b5da9222e2ede4792c487dc4d77fb5ef02e16 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 16 Nov 2022 09:51:10 +0000
Subject: [PATCH 022/235] [thresholding] add method body for
 code_generation_ipi()

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../thresholding_binary_search.py             | 27 ++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index a1b75b3de1..4ca651be76 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -289,7 +289,32 @@ def execute_node(self, context, graph):
         return
 
     def code_generation_ipi(self):
-        return []
+        """Constructs and returns the TCL commands for node instantiation as an RTL block."""
+        cmd = []
+        rtl_file_list = self.get_rtl_file_list()
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+
+        for rtl_file in rtl_file_list:
+            cmd.append("add_files -norecurse %s"
+            % (
+                os.path.join(
+                    code_gen_dir, rtl_file
+                )
+            ))
+
+        # Create an RTL block, not an IP core (-type ip)
+        cmd.append("create_bd_cell -type module -reference %s %s"
+            % (self.get_nodeattr("gen_top_module"), self.onnx_node.name))
+
+        # ERROR: [BD 41-237] Bus Interface property FREQ_HZ does not match between
+        # /Thresholding_Binary_Search_0/s_axis(100000000) and /StreamingFIFO_0/out_V(200000000.000000)
+        cmd.append("set_property -dict [list CONFIG.FREQ_HZ {200000000}] [get_bd_intf_pins Thresholding_Binary_Search_0/s_axis]")
+
+        # ERROR: [BD 41-237] Bus Interface property FREQ_HZ does not match between
+        # /StreamingFIFO_1/in0_V(200000000.000000) and /Thresholding_Binary_Search_0/m_axis(100000000)
+        cmd.append("set_property -dict [list CONFIG.FREQ_HZ {200000000}] [get_bd_intf_pins Thresholding_Binary_Search_0/m_axis]")
+
+        return cmd
 
     def global_includes(self):
         pass

From ca6e7e745c4ad810ac824ee3b6ccd55bdb6f724d Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 16 Nov 2022 09:56:01 +0000
Subject: [PATCH 023/235] [thresholding] add method
 get_verilog_top_module_intf_names()

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../thresholding_binary_search.py             | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 4ca651be76..5dac98ad66 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -316,6 +316,28 @@ def code_generation_ipi(self):
 
         return cmd
 
+    def get_verilog_top_module_intf_names(self):
+        """Return a dict of names of input and output interfaces.
+        The keys reflect the protocols each interface implements:
+        'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'.
+        Values are lists of tuples (axis, aximm) or names (axilite):
+        'axis' tuples correspond to the list of node inputs in order,
+        each tuple is (interface_name, interface_width_bits).
+        axilite always assumed to be 32 bits and is not tuple (name only).
+        Each block must have at most one aximm and one axilite."""
+
+        intf_names = super().get_verilog_top_module_intf_names()
+        # Only 'decoupled' mode is supported - check before adding axilite interface
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode != "decoupled": raise Exception("Unrecognized memory mode for this node: {}".format(mem_mode))
+        intf_names["axilite"] = ["s_axilite"]
+        intf_names["s_axis"] = [["s_axis"]]
+        intf_names["m_axis"] = [["m_axis"]]
+
+        self.set_nodeattr("runtime_writeable_weights", 1)
+
+        return intf_names
+
     def global_includes(self):
         pass
 

From 7266ee91af50a149d1d8310401e2a4134cdac18c Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 16 Nov 2022 10:41:14 +0000
Subject: [PATCH 024/235] [thresholding] retrieve axilite write sequence for
 runtime weight programming

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../thresholding_binary_search.py             | 53 +++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 5dac98ad66..07b675f0f3 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -338,6 +338,59 @@ def get_verilog_top_module_intf_names(self):
 
         return intf_names
 
+    def find_next_power_of_2(self, n):
+        # Negative values will loop infinitely below - return 0
+        if n <= 0:
+            return 0
+        # If '1' is requested, output will be '0' in the loop below, so avoid this earlier.
+        elif n == 1:
+            return 2 # i.e. 2**1
+
+        # decrement 'n' (to handle cases when `n` itself is a power of 2)
+        n = n - 1
+
+        # loop until only one bit is left
+        while n & n - 1:
+            # unset rightmost bit
+            n = n & n - 1
+        return n << 1
+
+    def twos_comp(self, val, bitwidth):
+        return (val + (1 << bitwidth)) % (1 << bitwidth)
+
+    def prep_axilite_val(self, val):
+        return self.twos_comp(int(val), self.get_weight_datatype().bitwidth())
+
+    def get_dynamic_config(self, model, address_stride=1):
+        ## TODO - not sure this description is correct
+        """Returns a configuration dictionary containing axilite write commands
+        in order to program the thresholds into the RTL core during runtime.
+        The default address stride for the weights is 1 byte."""
+
+        thresholds = model.get_initializer(self.onnx_node.input[1])
+        num_channels, num_weights_per_channel = thresholds.shape
+
+        weight_addr_boundary = self.find_next_power_of_2(num_weights_per_channel)
+        # Make sure that the next power of 2 (output) is greater than or equal to the input
+        assert weight_addr_boundary >= num_weights_per_channel
+
+        config = {}
+        channel_cntr = 0
+        for channel in thresholds:
+            channel_start_addr = (channel_cntr * weight_addr_boundary * address_stride)
+            weight_cntr = 0
+            addr = 0
+            for weight in channel:
+                key_name = "{}_{}{}_{}{}".format("axilite", "ch", str(channel_cntr), "w", str(weight_cntr))
+                config[key_name] = (channel_start_addr + addr, self.prep_axilite_val(weight))
+
+                weight_cntr += 1
+                addr += address_stride
+
+            channel_cntr += 1
+
+        return config
+
     def global_includes(self):
         pass
 

From f88bdbfeb4ade334740d29fa81f6a83174635ad2 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 16 Nov 2022 11:06:39 +0000
Subject: [PATCH 025/235] [thresholding] add methods for creating weight files
 for each simulation type

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../thresholding_binary_search.py             | 130 +++++++++++++++++-
 1 file changed, 128 insertions(+), 2 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 07b675f0f3..6ed07287ab 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -27,9 +27,17 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import os
+import numpy as np
 from qonnx.core.datatype import DataType
-
+from qonnx.util.basic import (
+    interleave_matrix_outer_dim_from_partitions,
+    roundup_to_integer_multiple,
+)
+from finn.util.data_packing import (
+    pack_innermost_dim_as_hex_string,
+)
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+import warnings
 
 """@package thresholding_binary_search
 - ONNX i/o tensor shape assumptions for Thresholding:
@@ -172,6 +180,63 @@ def get_exp_cycles(self):
     def get_template_param_values(self):
         return dict()
 
+    def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
+        """Convert the original numpy threshold matrix orig_thres_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure MH % PE == 0
+        * for unsigned inputs, ensure thresholds are positive
+        * interleave rows between PEs
+        * reshape into (PE, TMEM, n_thres_steps) and return
+        """
+        mh = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        tmem = mh // pe
+        assert mh % pe == 0, "Requirement NumChannels divisable by PE is violated."
+        assert (
+            orig_thres_matrix.ndim == 2
+        ), """Threshold matrix dimension is
+        not as expected (2)."""
+        n_thres_steps = orig_thres_matrix.shape[1]
+        assert n_thres_steps == self.get_nodeattr(
+            "numSteps"
+        ), "Mismatch in threshold steps"
+        if not self.get_input_datatype().signed():
+            # ensure all thresholds are nonnegative
+            assert (orig_thres_matrix >= 0).all()
+        # ensure all thresholds are integer
+        assert np.equal(
+            np.mod(orig_thres_matrix, 1), 0
+        ).all(), "Need int threshold tensor"
+        ret = orig_thres_matrix
+        # workaround for vivado_hls threshold bug
+        if ret[0][0] == 0 and n_thres_steps == 1:
+            ret = np.copy(ret)
+            ret[0][0] = 1
+            warnings.warn(
+                "Setting 0-valued first threshold to 1 to avoid vivado_hls bug"
+            )
+        # ensure channels = mh , duplicating if necessary
+        if ret.shape[0] == 1:
+            ret = np.tile(ret, (mh, 1))
+        assert (
+            ret.shape[0] == mh
+        ), "Channels of threshold matrix are not as expected (mh)"
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        assert (
+            ret.shape[0] == pe
+        ), """First dimension after distribution of the
+        rows between PEs is not as expected (pe)"""
+        assert (
+            ret.shape[1] == tmem
+        ), """Second dimension after distribution of the
+        rows between PEs is not as expected (tmem)"""
+        assert (
+            ret.shape[2] == n_thres_steps
+        ), """Third dimension after distribution of the
+        rows between PEs is not as expected (n_thres_steps)"""
+        return ret.reshape(1, pe, tmem, n_thres_steps)
+
     def make_weight_file(self, weights, weight_file_mode, weight_file_name):
         """Produce a file containing given weights (thresholds) in appropriate
         format for this layer. This file can be used for either synthesis or
@@ -183,7 +248,68 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
           decoupled_runtime}
         * weight_file_name : filename for the weight file to be generated
         """
-        return
+        # There are 'decoupled_*' flavors, just make sure that the flavors are decoupled related
+        if "decoupled" not in weight_file_mode: raise Exception("Unrecognized memory mode for this node: {}".format(weight_file_mode))
+
+        threshold_tensor = self.get_hls_compatible_threshold_tensor(weights)
+        tdt = self.get_weight_datatype()
+        assert np.vectorize(tdt.allowed)(
+            threshold_tensor
+        ).all(), "Thresholds can't be expressed with type %s" % str(tdt)
+
+        # streaming thresholds need to be organized differently
+        # (1, pe, tmem, n_thres_steps) -> (1, tmem, pe, n_thres_steps)
+        decoupled_thres = np.transpose(threshold_tensor, (0, 2, 1, 3))
+        # (1, tmem, pe, n_thres_steps) -> (1, tmem, pe * n_thres_steps)
+        pe = self.get_nodeattr("PE")
+        n_thres_steps = self.get_nodeattr("numSteps")
+        decoupled_thres_pe_flipped = np.flip(decoupled_thres, axis=-2)
+        decoupled_thres = decoupled_thres.reshape(1, -1, pe * n_thres_steps)
+        decoupled_thres = decoupled_thres.copy()
+        decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.reshape(
+            1, -1, pe * n_thres_steps
+        )
+        decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.copy()
+
+        if weight_file_mode == "decoupled_npy":
+            # save weight stream into npy for cppsim
+            np.save(weight_file_name, decoupled_thres)
+        elif weight_file_mode == "decoupled_verilog_dat":
+            # convert weight values into hexstring
+            weight_width = self.get_weightstream_width()
+            # pad to nearest 4 bits to get hex strings
+            weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
+            weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix=""
+            )
+            weight_stream = weight_tensor_pe_flipped.flatten()
+            weight_stream = weight_stream.copy()
+            with open(weight_file_name, "w") as f:
+                for val in weight_stream:
+                    f.write(val + "\n")
+        elif weight_file_mode == "decoupled_runtime":
+            # memstream axi-lite interface will map each mem line to
+            # one or multiple 32-bit words
+            weight_width = self.get_weightstream_width()
+            words_per_memwidth = 2 ** ceil(log2(weight_width / 32))
+            if words_per_memwidth < 1:
+                words_per_memwidth = 1
+            weight_width_padded = words_per_memwidth * 32
+            # first, pack and ensure padding to 32 bits
+            weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix=""
+            )
+            weight_stream = weight_tensor_pe_flipped.flatten()
+            weight_stream = weight_stream.copy()
+            with open(weight_file_name, "w") as f:
+                for val in weight_stream:
+                    # split into groups of 8 hex digits (= 32 bits)
+                    words_32b = textwrap.wrap(val, 8)
+                    words_32b.reverse()
+                    for word_32b in words_32b:
+                        f.write(word_32b + "\n")
+        else:
+            raise Exception("Decoupled weight export not yet implemented")
 
     # Get the integer from the DataType and string-ify it
     # This assumes that the data is in the form "INTx" or similar

From 560771a1b87a6f25dd2274232be55d86b350f74b Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 16 Nov 2022 11:08:53 +0000
Subject: [PATCH 026/235] [thresholding] add method generate_params()

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../thresholding_binary_search.py             | 38 +++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 6ed07287ab..ff9f5f4875 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -408,7 +408,45 @@ def code_generation_ipgen(self, model, fpgapart, clk):
         self.set_nodeattr("ipgen_path", code_gen_dir)
         self.set_nodeattr("ip_path", code_gen_dir)
 
+        # Generate params for RTLSim
+        self.generate_params(model, code_gen_dir)
+
     def generate_params(self, model, path):
+        # Only 'decoupled' mode is supported
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode != "decoupled": raise Exception("Unrecognized memory mode for this node: {}".format(mem_mode))
+
+        code_gen_dir = path
+        weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir)
+        thresholds = model.get_initializer(self.onnx_node.input[1])
+        self.make_weight_file(thresholds, "decoupled_npy", weight_filename_sim)
+
+        # Verilog.dat thresholds:
+        # also save weights as Verilog .dat file
+        # note that we provide two different .dat files, one for sim
+        # and one for synthesis. this is because URAM-based weights always
+        # need zero weights for synthesis, otherwise they get inferred
+        # as BRAM
+        weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir)
+        weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir)
+        # sim weights are always the true weights
+        self.make_weight_file(
+            thresholds, "decoupled_verilog_dat", weight_filename_rtl_sim
+        )
+
+        # Synthesis thresholds:
+        ram_style = self.get_nodeattr("ram_style")
+        if ram_style == "ultra":
+            # UltraRAM must have no memory initializer, or only zeroes
+            # otherwise BRAM will be inferred instead of URAM
+            # as a workaround we provide a zero-weight init here
+            synth_thresholds = np.zeros_like(thresholds, dtype=np.float32)
+        else:
+            synth_thresholds = thresholds
+        self.make_weight_file(
+            synth_thresholds, "decoupled_verilog_dat", weight_filename_rtl_synth
+        )
+
         return
 
     def execute_node(self, context, graph):

From e763bf80773be4e362f9f9171a01bb4b9eb4dc8a Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 16 Nov 2022 11:11:49 +0000
Subject: [PATCH 027/235] [thresholding] add method for preparing a Pyverilator
 object for RTL simulation

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../thresholding_binary_search.py             | 33 ++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index ff9f5f4875..611a75992e 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -28,6 +28,7 @@
 
 import os
 import numpy as np
+import warnings
 from qonnx.core.datatype import DataType
 from qonnx.util.basic import (
     interleave_matrix_outer_dim_from_partitions,
@@ -37,7 +38,12 @@
     pack_innermost_dim_as_hex_string,
 )
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
-import warnings
+from finn.util.basic import make_build_dir, get_rtlsim_trace_depth
+
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
 
 """@package thresholding_binary_search
 - ONNX i/o tensor shape assumptions for Thresholding:
@@ -449,6 +455,31 @@ def generate_params(self, model, path):
 
         return
 
+    def prepare_rtlsim(self):
+        """Creates a Verilator emulation library for the RTL code generated
+        for this node, sets the rtlsim_so attribute to its path and returns
+        a PyVerilator wrapper around it."""
+
+        if PyVerilator is None:
+            raise ImportError("Installation of PyVerilator is required.")
+
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        verilog_paths = [code_gen_dir]
+        verilog_files = self.get_rtl_file_list()
+
+        # build the Verilator emulation library
+        sim = PyVerilator.build(
+            verilog_files,
+            build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
+            verilog_path=verilog_paths,
+            trace_depth=get_rtlsim_trace_depth(),
+            top_module_name=self.get_nodeattr("gen_top_module"),
+        )
+
+        # save generated lib filename in attribute
+        self.set_nodeattr("rtlsim_so", sim.lib._name)
+        return sim
+
     def execute_node(self, context, graph):
         return
 

From 84e08f18a031dbfacec6a11b980c09885552efdf Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 16 Nov 2022 11:14:47 +0000
Subject: [PATCH 028/235] [thresholding] add method to run rtlsim on a
 thresholding binary search simulation object

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../thresholding_binary_search.py             | 79 +++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 611a75992e..4c7c67af72 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -35,6 +35,8 @@
     roundup_to_integer_multiple,
 )
 from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    rtlsim_output_to_npy,
     pack_innermost_dim_as_hex_string,
 )
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
@@ -481,6 +483,83 @@ def prepare_rtlsim(self):
         return sim
 
     def execute_node(self, context, graph):
+        # Perform input checks
+        if self.get_nodeattr("exec_mode") != "rtlsim": raise Exception("Invalid exec_mode value: {}; exec_mode must be set to 'rtlsim'".format(self.get_nodeattr("exec_mode")))
+        if self.get_nodeattr("mem_mode") != "decoupled": raise Exception("Invalid mem_mode value: {}; mem_mode must be set to 'decoupled'".format(self.get_nodeattr("mem_mode")))
+
+        node = self.onnx_node
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+
+        # save the data input of the node as a npy file (in_ind is input index)
+        in_ind = 0
+        for inputs in node.input:
+            # it is assumed that the first input of the node is the data input;
+            # further inputs (e.g. the thresholds at index 1) are not exported
+            # here, and an input index above 2 is rejected below
+            if in_ind == 0:
+                assert (
+                    str(context[inputs].dtype) == "float32"
+                ), """Input datatype is
+                not float32 as expected."""
+                expected_inp_shape = self.get_folded_input_shape()
+                reshaped_input = context[inputs].reshape(expected_inp_shape)
+
+                if self.get_input_datatype() == DataType["BIPOLAR"]:
+                    # store bipolar activations as binary
+                    reshaped_input = (reshaped_input + 1) / 2
+                    export_idt = DataType["BINARY"]
+                else:
+                    export_idt = self.get_input_datatype()
+
+                # make copy before saving the array
+                reshaped_input = reshaped_input.copy()
+                np.save(
+                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                    reshaped_input,
+                )
+            elif in_ind > 2:
+                raise Exception("Unexpected input found for Thresholding_Batch")
+            in_ind += 1
+
+        # Create a PyVerilator wrapper of the RTLSim .so
+        sim = self.get_rtlsim()
+        nbits = self.get_instream_width()
+        inp = npy_to_rtlsim_input(
+            "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+        )
+
+        super().reset_rtlsim(sim)
+        super().toggle_clk(sim)
+
+        wnbits = self.get_weightstream_width()
+        export_wdt = self.get_weight_datatype()
+        wei = npy_to_rtlsim_input(
+            "{}/thresholds.npy".format(code_gen_dir), export_wdt, wnbits
+        )
+        num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+        io_dict = {
+            "inputs": {"in0": inp, "weights": wei * num_w_reps},
+            "outputs": {"out": []},
+        }
+        self.rtlsim_multi_io(sim, io_dict)
+        output = io_dict["outputs"]["out"]
+
+        # Manage output data
+        odt = self.get_output_datatype()
+        target_bits = odt.bitwidth()
+        packed_bits = self.get_outstream_width()
+        out_npy_path = "{}/output.npy".format(code_gen_dir)
+        out_shape = self.get_folded_output_shape()
+
+        rtlsim_output_to_npy(
+            output, out_npy_path, odt, out_shape, packed_bits, target_bits
+        )
+
+        # load and reshape output
+        output = np.load(out_npy_path)
+        oshape = self.get_normal_output_shape()
+        output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+        context[node.output[0]] = output
         return
 
     def code_generation_ipi(self):

From b0be07adb8e2bb0ab5005169ff0f878efc5c7c80 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 16 Nov 2022 11:16:33 +0000
Subject: [PATCH 029/235] [thresholding] add stubbed method for
 ipgen_singlenode_code()

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../custom_op/fpgadataflow/thresholding_binary_search.py   | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 4c7c67af72..19140a0090 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -665,6 +665,13 @@ def get_dynamic_config(self, model, address_stride=1):
 
         return config
 
+    def ipgen_singlenode_code(self):
+        """Normally: Builds the bash script for IP generation."""
+        """This is needed for the HLSSynthIP() transformation.
+        This is an IP, not a HLS node, so therefore provide an empty hook
+        to prevent any HLS synthesis."""
+        pass
+
     def global_includes(self):
         pass
 

From 30d22f88a40864257a97f7e9e9ff84f25c1bc32e Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 16 Nov 2022 13:51:10 +0000
Subject: [PATCH 030/235] [thresholding] update class name to a more consistent
 naming convention

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 src/finn/custom_op/fpgadataflow/__init__.py                   | 4 ++--
 src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 65fbd6e20c..dc9a5a349a 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -59,7 +59,7 @@
 from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch
 from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch
 from finn.custom_op.fpgadataflow.thresholding_binary_search import (
-    Thresholding_Bin_Search,
+    Thresholding_Binary_Search,
 )
 from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker
 from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour_Batch
@@ -82,7 +82,7 @@
 custom_op["Pool_Batch"] = Pool_Batch
 custom_op["FMPadding_Batch"] = FMPadding_Batch
 custom_op["Thresholding_Batch"] = Thresholding_Batch
-custom_op["Thresholding_Binary_search"] = Thresholding_Bin_Search
+custom_op["Thresholding_Binary_Search"] = Thresholding_Binary_Search
 custom_op["AddStreams_Batch"] = AddStreams_Batch
 custom_op["LabelSelect_Batch"] = LabelSelect_Batch
 custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 19140a0090..9bf36283da 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -59,7 +59,7 @@
 """
 
 
-class Thresholding_Bin_Search(HLSCustomOp):
+class Thresholding_Binary_Search(HLSCustomOp):
     """Class that corresponds to finn-rtllib 'thresholding' function."""
 
     def __init__(self, onnx_node):

From 3594edddf51f8a13053a6ad99e179d081e15d8d4 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 17 Nov 2022 09:54:46 +0000
Subject: [PATCH 031/235] [thresholding] add fpgadataflow pytests for
 thresholding binary search node

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 ...fpgadataflow_thresholding_binary_search.py | 417 ++++++++++++++++++
 1 file changed, 417 insertions(+)
 create mode 100755 tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py

diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
new file mode 100755
index 0000000000..0a02503300
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
@@ -0,0 +1,417 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+import numpy as np
+from onnx import TensorProto, helper
+from pyverilator.util.axi_utils import axilite_write, reset_rtlsim
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.general.multithreshold import multithreshold
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.util.basic import gen_finn_dt_tensor
+
+from finn.core.rtlsim_exec import rtlsim_exec
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+
+test_fpga_part = "xczu3eg-sbva484-1-e"
+target_clk_ns = 5
+
+# Helper functions
+def sort_thresholds_increasing(thresholds):
+    return np.sort(thresholds, axis=1)
+
+def generate_random_threshold_values(input_data_type, num_input_channels, num_steps):
+    return np.random.randint(input_data_type.min(), input_data_type.max() + 1, (num_input_channels, num_steps)).astype(np.float32)
+
+def generate_pe_value(fold, num_input_channels):
+    if fold == -1:
+        fold = num_input_channels
+    pe = num_input_channels // fold
+    assert num_input_channels % pe == 0
+    return pe
+
+# n = batch, c = channel, h = height, w = width of feature map
+# Standard = NCHW; FINN = NHWC
+# Convert from NCHW to NHWC
+def convert_np_array_to_finn_data_layout(data):
+    return np.transpose(data, (0, 2, 3, 1))
+
+# n = batch, c = channel, h = height, w = width of feature map
+# Standard = NCHW; FINN = NHWC
+# Convert from NHWC to NCHW
+def convert_np_array_to_standard_data_layout(data):
+    return np.transpose(data, (0, 3, 1, 2))
+
+def make_single_thresholding_binary_search_modelwrapper(
+    thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs
+):
+    NumChannels = thresholds.shape[0]
+
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, num_input_vecs + [NumChannels]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, num_input_vecs + [NumChannels]
+    )
+
+    node_inp_list = ["inp", "thresh"]
+
+    Thresholding_node = helper.make_node(
+        "Thresholding_Binary_Search",
+        node_inp_list,
+        ["outp"],
+        domain="finn.custom_op.fpgadataflow",
+        backend="fpgadataflow",
+        NumChannels=NumChannels,
+        PE=pe,
+        numSteps=thresholds.shape[1],
+        inputDataType=input_data_type.name,
+        weightDataType=input_data_type.name,
+        outputDataType=output_data_type.name,
+        activation_bias=activation_bias,
+        mem_mode=mem_mode,
+        numInputVectors=num_input_vecs,
+    )
+    graph = helper.make_graph(
+        nodes=[Thresholding_node],
+        name="thresholding_graph",
+        inputs=[inp],
+        outputs=[outp],
+    )
+
+    model = helper.make_model(graph, producer_name="thresholding-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", input_data_type)
+    model.set_tensor_datatype("outp", output_data_type)
+
+    model.set_tensor_datatype("thresh", input_data_type)
+    model.set_initializer("thresh", thresholds)
+    return model
+
+# Test brief: a particular method for this class was causing a bug - find_next_power_of_2()
+# Weights in the thresholding core are programmed on a per-channel basis and are byte-addressable.
+# When a channel is programmed, the next channel can start programming at the next power-of-2 byte boundary.
+# This test is to show that the function that calculates that boundary is working correctly.
+#
+# A Thresholding_Binary_Search layer was created and a SW generated dataset with a threshold channel
+# depth of 1 weight (1 layer of N channels in the thresholding core). However, find_next_power_of_2()
+# was returning a next-power-of-2 address boundary at address '0', instead of '2'. This unit test
+# is to prove that this bug no longer occurs. It was originally seen when the input datatype
+# was 'DataType["BIPOLAR"]'.
+@pytest.mark.tbs_unit
+@pytest.mark.tbs_all
+def test_fpgadataflow_thresholding_binary_search_unit():
+    activation = DataType["BIPOLAR"]
+    input_data_type = DataType["INT16"]
+    fold = -1
+    num_input_channels = 16
+    mem_mode = "decoupled"
+
+    # Handle inputs to the test
+    pe = generate_pe_value(fold, num_input_channels)
+    num_steps = activation.get_num_possible_values() - 1
+
+    # Other non-input parameters
+    num_input_vecs = [1, 2, 2]
+    output_data_type = activation
+    if output_data_type == DataType["BIPOLAR"]:
+        activation_bias = 0
+    else:
+        activation_bias = output_data_type.min()
+
+    # Generate random thresholds and sort in ascending order
+    thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps)
+
+    # Generate model from input parameters to the test
+    model = make_single_thresholding_binary_search_modelwrapper(
+        thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs
+    )
+
+    # Retrieve the class to get the method-under-test
+    tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0]
+    tbs_inst = getCustomOp(tbs_node)
+
+    test_vector = [
+        {"input": -2, "expected_result": 0},
+        {"input": -1, "expected_result": 0},
+        {"input": 0, "expected_result": 0},
+        {"input": 1, "expected_result": 2},
+        {"input": 2, "expected_result": 2},
+        {"input": 3, "expected_result": 4},
+        {"input": 4, "expected_result": 4},
+        {"input": 7, "expected_result": 8},
+        {"input": 8, "expected_result": 8},
+        {"input": 11, "expected_result": 16},
+        {"input": 15, "expected_result": 16},
+        {"input": 16, "expected_result": 16},
+        {"input": 18, "expected_result": 32},
+        {"input": 27, "expected_result": 32},
+        {"input": 31, "expected_result": 32},
+        {"input": 32, "expected_result": 32},
+        {"input": 42, "expected_result": 64},
+        {"input": 65, "expected_result": 128},
+    ]
+
+    for test_dict in test_vector:
+        output = tbs_inst.find_next_power_of_2(test_dict["input"])
+        assert output >= test_dict["input"]
+        assert output == test_dict["expected_result"]
+
+    return
+
+# Test brief: Prove that cppsim is not supported for this class
+@pytest.mark.tbs_cppsim
+@pytest.mark.tbs_all
+def test_fpgadataflow_thresholding_binary_search_cppsim():
+    input_data_type = DataType["UINT16"]
+    act = DataType["BIPOLAR"]
+    fold = -1
+    num_input_channels = 16
+    mem_mode = "decoupled" # 'const' is unsupported - see test_fpgadataflow_thresholding_binary_search_const_mem_mode
+
+    pe = generate_pe_value(fold, num_input_channels)
+    num_steps = act.get_num_possible_values() - 1
+
+    # Generate random, non-decreasing thresholds
+    thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps)
+
+    # make the vivado_hls threshold bug appear (incorrect rtlsim result when first
+    # threshold of first channel is zero, while using BIPOLAR output)
+    if act == DataType["BIPOLAR"]:
+        thresholds[0][0] = 0
+    thresholds = sort_thresholds_increasing(thresholds)
+
+    # Other non-input parameters
+    num_input_vecs = [1, 2, 2]
+    output_data_type = act
+    if output_data_type == DataType["BIPOLAR"]:
+        activation_bias = 0
+    else:
+        activation_bias = output_data_type.min()
+
+    # Generate model from input parameters to the test
+    model = make_single_thresholding_binary_search_modelwrapper(
+        thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs
+    )
+
+    # Cppsim is not supported for this class, catch the specific exception thrown by cppsim
+    # Exception raised in cppsim: Custom op_type Thresholding_Binary_Search is currently not supported.
+    try:
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+        model = model.transform(SetExecMode("cppsim"))
+    except Exception as e:
+        if str(e) != "Custom op_type Thresholding_Binary_Search is currently not supported.":
+            raise
+
+# Test brief: Prove that memory mode 'const' is not supported for this layer type
+@pytest.mark.tbs_const
+@pytest.mark.tbs_all
+def test_fpgadataflow_thresholding_binary_search_const_mem_mode():
+    input_data_type = DataType["INT16"]
+    activation = DataType["INT4"]
+    fold = -1
+    num_input_channels = 16
+    mem_mode = "const"
+
+    pe = generate_pe_value(fold, num_input_channels)
+    num_input_vecs = [1, 2, 2]
+    output_data_type = activation
+    activation_bias = output_data_type.min()
+
+    # Generate random thresholds and sort in ascending order
+    num_steps = activation.get_num_possible_values() - 1
+    thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps)
+
+    # Generate model from input parameters to the test
+    model = make_single_thresholding_binary_search_modelwrapper(
+        thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs
+    )
+
+    # Prove that 'const' memory mode is not supported for this class
+    # 'const' memory mode is not supported for this class, catch the specific exception thrown by FINN
+    # Exception: ('Unrecognized memory mode for this node:', 'const')
+    try:
+        model = model.transform(InsertFIFO(True))
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
+    except Exception as e:
+        if str(e) != "Unrecognized memory mode for this node: {}".format(mem_mode):
+            raise
+        # Caught the expected exception, leave the test early
+        return
+
+# Test brief: Test that PrepareRTLSim() runs successfully. This function is not
+# tested in test_fpgadataflow_thresholding_binary_search()
+@pytest.mark.tbs_prep_rtlsim
+@pytest.mark.tbs_all
+def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim():
+    input_data_type = DataType["INT16"]
+    act = DataType["INT4"]
+    fold = -1
+    num_input_channels = 16
+    mem_mode = "decoupled"
+
+    # Handle inputs to the test
+    pe = generate_pe_value(fold, num_input_channels)
+    num_steps = act.get_num_possible_values() - 1
+
+    # Generate random, non-decreasing thresholds
+    thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps)
+    # make the vivado_hls threshold bug appear (incorrect rtlsim result when first
+    # threshold of first channel is zero, while using BIPOLAR output)
+    if act == DataType["BIPOLAR"]:
+        thresholds[0][0] = 0
+    thresholds = sort_thresholds_increasing(thresholds)
+
+    # Other non-input parameters
+    num_input_vecs = [1, 2, 2]
+    output_data_type = act
+    if output_data_type == DataType["BIPOLAR"]:
+        activation_bias = 0
+    else:
+        activation_bias = output_data_type.min()
+
+    # Generate model from input parameters to the test
+    model = make_single_thresholding_binary_search_modelwrapper(
+        thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs
+    )
+
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+    return
+
+# Test brief: Create a Thresholding binary search layer using various parameters
+# and test against a SW generated & simulated dataset
+# N.B. - fold factor of '-1' is supported only (no PE/SIMD support)
+@pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]])
+@pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]])
+@pytest.mark.parametrize("fold", [-1]) # 1, 2, etc. will fail
+@pytest.mark.parametrize("num_input_channels", [16])
+# no need to test 'const' mode, it's already done in test_fpgadataflow_thresholding_binary_search_const_mem_mode()
+@pytest.mark.parametrize("mem_mode", ["decoupled"])
+@pytest.mark.tbs_soak
+@pytest.mark.tbs_all
+def test_fpgadataflow_thresholding_binary_search(activation, input_data_type, fold, num_input_channels, mem_mode):
+    # Handle inputs to the test
+    pe = generate_pe_value(fold, num_input_channels)
+    num_steps = activation.get_num_possible_values() - 1
+
+    # Other non-input parameters
+    num_input_vecs = [1, 2, 2]
+    output_data_type = activation
+    if output_data_type == DataType["BIPOLAR"]:
+        activation_bias = 0
+    else:
+        activation_bias = output_data_type.min()
+
+    # generate random input data
+    tensor_shape = tuple(num_input_vecs + [num_input_channels])
+    x = gen_finn_dt_tensor(input_data_type, tensor_shape)
+
+    # Generate random thresholds and sort in ascending order
+    thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps)
+
+    # make the vivado_hls threshold bug appear (incorrect rtlsim result when first
+    # threshold of first channel is zero, while using BIPOLAR output)
+    if activation == DataType["BIPOLAR"]:
+        thresholds[0][0] = 0
+
+    # provide non-decreasing/ascending thresholds
+    thresholds = sort_thresholds_increasing(thresholds)
+
+    x_nhwc = convert_np_array_to_standard_data_layout(x)
+    y = multithreshold(x_nhwc, thresholds)
+
+    # convert back to NHWC for comparison to hw outputs
+    y = convert_np_array_to_finn_data_layout(y)
+    if activation == DataType["BIPOLAR"]:
+        # binary to bipolar
+        y = 2 * y - 1
+    else:
+        # signed offset
+        y += activation.min()
+
+    # Generate model from input parameters to the test
+    model = make_single_thresholding_binary_search_modelwrapper(
+        thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs
+    )
+
+    model = model.transform(InsertFIFO(True))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
+
+    # Retrieve the axilite programming sequence for the weights - for decoupled mode only
+    tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0]
+    tbs_inst = getCustomOp(tbs_node)
+    config = tbs_inst.get_dynamic_config(model)
+
+    # Reshape generated data (not from model)
+    oshape = model.get_tensor_shape("outp")
+    y_expected = y.reshape(oshape)
+
+    # Helper function that delivers the hook to program the thresholds via AXI-Lite
+    def config_hook(config):
+        if config is None:
+            return None
+
+        def write_thresh_config(sim):
+            # axi_name = "s_axilite_0_" # works
+            axi_name = getCustomOp(model.get_nodes_by_op_type("Thresholding_Binary_Search")[0]).get_verilog_top_module_intf_names()['axilite'][0]
+            axi_name += "_0_"
+
+            # 1. Write config registers to the Threshold memory, dict defines (addr, value) tuples
+            for config_entry in config.values():
+                addr = config_entry[0]
+                val = config_entry[1]
+                axilite_write(sim, addr, val, basename=axi_name)
+
+            reset_rtlsim(sim)
+        return write_thresh_config
+
+    input_dict = {"inp": x}
+    rtlsim_exec(model, input_dict, pre_hook=config_hook(config))
+    y_produced = input_dict["outp"]
+    assert (y_produced == y_expected).all()

From 0bee70d5e4bc5fd163b8cf8a84931ac709aaac35 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 17 Nov 2022 10:08:38 +0000
Subject: [PATCH 032/235] [thresholding] add linter fixes

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../thresholding_binary_search.py             | 121 ++++++++++++------
 ...fpgadataflow_thresholding_binary_search.py | 103 ++++++++++++---
 2 files changed, 168 insertions(+), 56 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 9bf36283da..b785abcaa8 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -26,21 +26,22 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
 import numpy as np
+import os
 import warnings
 from qonnx.core.datatype import DataType
 from qonnx.util.basic import (
     interleave_matrix_outer_dim_from_partitions,
     roundup_to_integer_multiple,
 )
+
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
-    rtlsim_output_to_npy,
     pack_innermost_dim_as_hex_string,
+    rtlsim_output_to_npy,
 )
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
-from finn.util.basic import make_build_dir, get_rtlsim_trace_depth
 
 try:
     from pyverilator import PyVerilator
@@ -151,7 +152,10 @@ def get_outstream_width(self):
     def get_weightstream_width(self):
         # Only 'decoupled' mode is supported
         mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode != "decoupled": raise Exception("Unrecognized memory mode for this node: {}".format(mem_mode))
+        if mem_mode != "decoupled":
+            raise Exception(
+                "Unrecognized memory mode for this node: {}".format(mem_mode)
+            )
         pe = self.get_nodeattr("PE")
         wp = self.get_weight_datatype().bitwidth()
         n_thres_steps = self.get_nodeattr("numSteps")
@@ -257,7 +261,10 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
         * weight_file_name : filename for the weight file to be generated
         """
         # There are 'decoupled_*' flavors, just make sure that the flavors are decoupled related
-        if "decoupled" not in weight_file_mode: raise Exception("Unrecognized memory mode for this node: {}".format(weight_file_mode))
+        if "decoupled" not in weight_file_mode:
+            raise Exception(
+                "Unrecognized memory mode for this node: {}".format(weight_file_mode)
+            )
 
         threshold_tensor = self.get_hls_compatible_threshold_tensor(weights)
         tdt = self.get_weight_datatype()
@@ -334,21 +341,35 @@ def prepare_codegen_rtl_values(self):
 
         # Identify the module names
         code_gen_dict["$MODULE_NAME$"] = [self.get_verilog_top_module_name()]
-        code_gen_dict["$MODULE_NAME_AXI$"] = [self.get_verilog_top_module_name() + "_axi"]
-        code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name() + "_axi_wrapper"]
+        code_gen_dict["$MODULE_NAME_AXI$"] = [
+            self.get_verilog_top_module_name() + "_axi"
+        ]
+        code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [
+            self.get_verilog_top_module_name() + "_axi_wrapper"
+        ]
         # Set the top module name - AXI wrapper
         code_gen_dict["$TOP_MODULE$"] = code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"]
 
         # Identify the module variables
-        output_data_type = self.get_nodeattr("outputDataType") # output precision
-        input_data_type = self.get_nodeattr("inputDataType") # input/threshold precision
-        num_channels = self.get_nodeattr("NumChannels") # number of channels
-        bias = self.get_nodeattr("activation_bias") # activation bias value
-
-        code_gen_dict["$N$"] = [self.conv_datatype_to_str(output_data_type)] # output precision
-        code_gen_dict["$M$"] = [self.conv_datatype_to_str(input_data_type)] # input/threshold precision
-        code_gen_dict["$C$"] = [self.conv_datatype_to_str(num_channels)] # number of channels
-        code_gen_dict["$BIAS$"] = [self.conv_datatype_to_str(bias)] # activation bias value
+        output_data_type = self.get_nodeattr("outputDataType")  # output precision
+        input_data_type = self.get_nodeattr(
+            "inputDataType"
+        )  # input/threshold precision
+        num_channels = self.get_nodeattr("NumChannels")  # number of channels
+        bias = self.get_nodeattr("activation_bias")  # activation bias value
+
+        code_gen_dict["$N$"] = [
+            self.conv_datatype_to_str(output_data_type)
+        ]  # output precision
+        code_gen_dict["$M$"] = [
+            self.conv_datatype_to_str(input_data_type)
+        ]  # input/threshold precision
+        code_gen_dict["$C$"] = [
+            self.conv_datatype_to_str(num_channels)
+        ]  # number of channels
+        code_gen_dict["$BIAS$"] = [
+            self.conv_datatype_to_str(bias)
+        ]  # activation bias value
 
         # Is the input datatype signed or unsigned? The thresholding core needs to know this
         if self.get_input_datatype().min() < 0:
@@ -359,9 +380,7 @@ def prepare_codegen_rtl_values(self):
         return code_gen_dict
 
     def get_rtl_file_list(self):
-        return ["thresholding.sv",
-                "thresholding_axi.sv",
-                "thresholding_axi_wrapper.v"]
+        return ["thresholding.sv", "thresholding_axi.sv", "thresholding_axi_wrapper.v"]
 
     def get_rtl_file_paths(self):
         rtl_root_dir = os.environ["FINN_ROOT"] + "/finn-rtllib/thresholding/hdl/"
@@ -399,7 +418,7 @@ def generate_hdl(self):
             # apply code generation to templates
             data = self.fill_in_rtl_template_data(code_gen_dict, template_data)
             # dump filled-in template to destination directory for compilation
-            file_only_path = rtl_file_path.split('/')[-1]
+            file_only_path = rtl_file_path.split("/")[-1]
             self.dump_rtl_data(code_gen_dir, file_only_path, data)
 
         # Before we return - set the 'gen_top_module' attribute for use later by PyVerilator and IPI generation
@@ -422,7 +441,10 @@ def code_generation_ipgen(self, model, fpgapart, clk):
     def generate_params(self, model, path):
         # Only 'decoupled' mode is supported
         mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode != "decoupled": raise Exception("Unrecognized memory mode for this node: {}".format(mem_mode))
+        if mem_mode != "decoupled":
+            raise Exception(
+                "Unrecognized memory mode for this node: {}".format(mem_mode)
+            )
 
         code_gen_dir = path
         weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir)
@@ -484,8 +506,18 @@ def prepare_rtlsim(self):
 
     def execute_node(self, context, graph):
         # Perform input checks
-        if self.get_nodeattr("exec_mode") != "rtlsim": raise Exception("Invalid exec_mode value: {}; exec_mode must be set to 'rtlsim'".format(self.get_nodeattr("exec_mode")))
-        if self.get_nodeattr("mem_mode") != "decoupled": raise Exception("Invalid mem_mode value: {}; mem_mode must be set to 'decoupled'".format(self.get_nodeattr("mem_mode")))
+        if self.get_nodeattr("exec_mode") != "rtlsim":
+            raise Exception(
+                "Invalid exec_mode value: {}; exec_mode must be set to 'rtlsim'".format(
+                    self.get_nodeattr("exec_mode")
+                )
+            )
+        if self.get_nodeattr("mem_mode") != "decoupled":
+            raise Exception(
+                "Invalid mem_mode value: {}; mem_mode must be set to 'decoupled'".format(
+                    self.get_nodeattr("mem_mode")
+                )
+            )
 
         node = self.onnx_node
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
@@ -569,24 +601,27 @@ def code_generation_ipi(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
 
         for rtl_file in rtl_file_list:
-            cmd.append("add_files -norecurse %s"
-            % (
-                os.path.join(
-                    code_gen_dir, rtl_file
-                )
-            ))
+            cmd.append(
+                "add_files -norecurse %s" % (os.path.join(code_gen_dir, rtl_file))
+            )
 
         # Create an RTL block, not an IP core (-type ip)
-        cmd.append("create_bd_cell -type module -reference %s %s"
-            % (self.get_nodeattr("gen_top_module"), self.onnx_node.name))
+        cmd.append(
+            "create_bd_cell -type module -reference %s %s"
+            % (self.get_nodeattr("gen_top_module"), self.onnx_node.name)
+        )
 
         # ERROR: [BD 41-237] Bus Interface property FREQ_HZ does not match between
         # /Thresholding_Binary_Search_0/s_axis(100000000 and /StreamingFIFO_0/out_V(200000000.000000)
-        cmd.append("set_property -dict [list CONFIG.FREQ_HZ {200000000}] [get_bd_intf_pins Thresholding_Binary_Search_0/s_axis]")
+        cmd.append(
+            "set_property -dict [list CONFIG.FREQ_HZ {200000000}] [get_bd_intf_pins Thresholding_Binary_Search_0/s_axis]"
+        )
 
         # ERROR: [BD 41-237] Bus Interface property FREQ_HZ does not match between
         # /StreamingFIFO_1/in0_V(200000000.000000) and /Thresholding_Binary_Search_0/m_axis(100000000)
-        cmd.append("set_property -dict [list CONFIG.FREQ_HZ {200000000}] [get_bd_intf_pins Thresholding_Binary_Search_0/m_axis]")
+        cmd.append(
+            "set_property -dict [list CONFIG.FREQ_HZ {200000000}] [get_bd_intf_pins Thresholding_Binary_Search_0/m_axis]"
+        )
 
         return cmd
 
@@ -603,7 +638,10 @@ def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
         # Only 'decoupled' mode is supported - check before adding axilite interface
         mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode != "decoupled": raise Exception("Unrecognized memory mode for this node: {}".format(mem_mode))
+        if mem_mode != "decoupled":
+            raise Exception(
+                "Unrecognized memory mode for this node: {}".format(mem_mode)
+            )
         intf_names["axilite"] = ["s_axilite"]
         intf_names["s_axis"] = [["s_axis"]]
         intf_names["m_axis"] = [["m_axis"]]
@@ -618,7 +656,7 @@ def find_next_power_of_2(self, n):
             return 0
         # If '1' is requested, output will be '0' in the loop below, so avoid this earlier.
         elif n == 1:
-            return 2 # i.e. 2**1
+            return 2  # i.e. 2**1
 
         # decrement 'n' (to handle cases when `n` itself is a power of 2)
         n = n - 1
@@ -651,12 +689,17 @@ def get_dynamic_config(self, model, address_stride=1):
         config = {}
         channel_cntr = 0
         for channel in thresholds:
-            channel_start_addr = (channel_cntr * weight_addr_boundary * address_stride)
+            channel_start_addr = channel_cntr * weight_addr_boundary * address_stride
             weight_cntr = 0
             addr = 0
             for weight in channel:
-                key_name = "{}_{}{}_{}{}".format("axilite", "ch", str(channel_cntr), "w", str(weight_cntr))
-                config[key_name] = (channel_start_addr + addr, self.prep_axilite_val(weight))
+                key_name = "{}_{}{}_{}{}".format(
+                    "axilite", "ch", str(channel_cntr), "w", str(weight_cntr)
+                )
+                config[key_name] = (
+                    channel_start_addr + addr,
+                    self.prep_axilite_val(weight),
+                )
 
                 weight_cntr += 1
                 addr += address_stride
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
index 0a02503300..579b6fe83c 100755
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
+
 import numpy as np
 from onnx import TensorProto, helper
 from pyverilator.util.axi_utils import axilite_write, reset_rtlsim
@@ -54,8 +55,14 @@
 def sort_thresholds_increasing(thresholds):
     return np.sort(thresholds, axis=1)
 
+
 def generate_random_threshold_values(input_data_type, num_input_channels, num_steps):
-    return np.random.randint(input_data_type.min(), input_data_type.max() + 1, (num_input_channels, num_steps)).astype(np.float32)
+    return np.random.randint(
+        input_data_type.min(),
+        input_data_type.max() + 1,
+        (num_input_channels, num_steps),
+    ).astype(np.float32)
+
 
 def generate_pe_value(fold, num_input_channels):
     if fold == -1:
@@ -64,20 +71,29 @@ def generate_pe_value(fold, num_input_channels):
     assert num_input_channels % pe == 0
     return pe
 
+
 # n = batch, c = channel, h = height, w = width of feature map
 # Standard = NCHW; FINN = NHWC
 # Convert from NCHW to NHWC
 def convert_np_array_to_finn_data_layout(data):
     return np.transpose(data, (0, 2, 3, 1))
 
+
 # n = batch, c = channel, h = height, w = width of feature map
 # Standard = NCHW; FINN = NHWC
 # Convert from NHWC to NCHW
 def convert_np_array_to_standard_data_layout(data):
     return np.transpose(data, (0, 3, 1, 2))
 
+
 def make_single_thresholding_binary_search_modelwrapper(
-    thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs
+    thresholds,
+    pe,
+    input_data_type,
+    output_data_type,
+    activation_bias,
+    mem_mode,
+    num_input_vecs,
 ):
     NumChannels = thresholds.shape[0]
 
@@ -123,6 +139,7 @@ def make_single_thresholding_binary_search_modelwrapper(
     model.set_initializer("thresh", thresholds)
     return model
 
+
 # Test brief: a particular method for this class was causing a bug - find_next_power_of_2()
 # Weights in the thresholding core are programmed on a per-channel basis and are byte-addressable.
 # When a channel is programmed, the next channel can start programming at the next power-of-2 byte boundary.
@@ -155,11 +172,19 @@ def test_fpgadataflow_thresholding_binary_search_unit():
         activation_bias = output_data_type.min()
 
     # Generate random thresholds and sort in ascending order
-    thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps)
+    thresholds = generate_random_threshold_values(
+        input_data_type, num_input_channels, num_steps
+    )
 
     # Generate model from input parameters to the test
     model = make_single_thresholding_binary_search_modelwrapper(
-        thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs
+        thresholds,
+        pe,
+        input_data_type,
+        output_data_type,
+        activation_bias,
+        mem_mode,
+        num_input_vecs,
     )
 
     # Retrieve the class to get the method-under-test
@@ -194,6 +219,7 @@ def test_fpgadataflow_thresholding_binary_search_unit():
 
     return
 
+
 # Test brief: Prove that cppsim is not supported for this class
 @pytest.mark.tbs_cppsim
 @pytest.mark.tbs_all
@@ -202,13 +228,15 @@ def test_fpgadataflow_thresholding_binary_search_cppsim():
     act = DataType["BIPOLAR"]
     fold = -1
     num_input_channels = 16
-    mem_mode = "decoupled" # 'const' is unsupported - see test_fpgadataflow_thresholding_binary_search_const_mem_mode
+    mem_mode = "decoupled"  # 'const' is unsupported - see test_fpgadataflow_thresholding_binary_search_const_mem_mode
 
     pe = generate_pe_value(fold, num_input_channels)
     num_steps = act.get_num_possible_values() - 1
 
     # Generate random, non-decreasing thresholds
-    thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps)
+    thresholds = generate_random_threshold_values(
+        input_data_type, num_input_channels, num_steps
+    )
 
     # make the vivado_hls threshold bug appear (incorrect rtlsim result when first
     # threshold of first channel is zero, while using BIPOLAR output)
@@ -226,7 +254,13 @@ def test_fpgadataflow_thresholding_binary_search_cppsim():
 
     # Generate model from input parameters to the test
     model = make_single_thresholding_binary_search_modelwrapper(
-        thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs
+        thresholds,
+        pe,
+        input_data_type,
+        output_data_type,
+        activation_bias,
+        mem_mode,
+        num_input_vecs,
     )
 
     # Cppsim is not supported for this class, catch the specific exception thrown by cppsim
@@ -236,9 +270,13 @@ def test_fpgadataflow_thresholding_binary_search_cppsim():
         model = model.transform(CompileCppSim())
         model = model.transform(SetExecMode("cppsim"))
     except Exception as e:
-        if str(e) != "Custom op_type Thresholding_Binary_Search is currently not supported.":
+        if (
+            str(e)
+            != "Custom op_type Thresholding_Binary_Search is currently not supported."
+        ):
             raise
 
+
 # Test brief: Prove that memory mode 'const' is not supported for this layer type
 @pytest.mark.tbs_const
 @pytest.mark.tbs_all
@@ -256,11 +294,19 @@ def test_fpgadataflow_thresholding_binary_search_const_mem_mode():
 
     # Generate random thresholds and sort in ascending order
     num_steps = activation.get_num_possible_values() - 1
-    thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps)
+    thresholds = generate_random_threshold_values(
+        input_data_type, num_input_channels, num_steps
+    )
 
     # Generate model from input parameters to the test
     model = make_single_thresholding_binary_search_modelwrapper(
-        thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs
+        thresholds,
+        pe,
+        input_data_type,
+        output_data_type,
+        activation_bias,
+        mem_mode,
+        num_input_vecs,
     )
 
     # Prove that 'const' memory mode is not supported for this class
@@ -278,6 +324,7 @@ def test_fpgadataflow_thresholding_binary_search_const_mem_mode():
         # Caught the expected exception, leave the test early
         return
 
+
 # Test brief: Test that PrepareRTLSim() runs successfully. This function is not
 # tested in test_fpgadataflow_thresholding_binary_search()
 @pytest.mark.tbs_prep_rtlsim
@@ -294,7 +341,9 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim():
     num_steps = act.get_num_possible_values() - 1
 
     # Generate random, non-decreasing thresholds
-    thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps)
+    thresholds = generate_random_threshold_values(
+        input_data_type, num_input_channels, num_steps
+    )
     # make the vivado_hls threshold bug appear (incorrect rtlsim result when first
     # threshold of first channel is zero, while using BIPOLAR output)
     if act == DataType["BIPOLAR"]:
@@ -311,7 +360,13 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim():
 
     # Generate model from input parameters to the test
     model = make_single_thresholding_binary_search_modelwrapper(
-        thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs
+        thresholds,
+        pe,
+        input_data_type,
+        output_data_type,
+        activation_bias,
+        mem_mode,
+        num_input_vecs,
     )
 
     model = model.transform(SetExecMode("rtlsim"))
@@ -321,18 +376,21 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim():
     model = model.transform(PrepareRTLSim())
     return
 
+
 # Test brief: Create a Thresholding binary search layer using various parameters
 # and test against a SW generated & simulated dataset
 # N.B. - fold factor of '-1' is supported only (no PE/SIMD support)
 @pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]])
 @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]])
-@pytest.mark.parametrize("fold", [-1]) # 1, 2, etc. will fail
+@pytest.mark.parametrize("fold", [-1])  # 1, 2, etc. will fail
 @pytest.mark.parametrize("num_input_channels", [16])
 # no need to test 'const' mode, it's already done in test_fpgadataflow_thresholding_binary_search_const_mem_mode()
 @pytest.mark.parametrize("mem_mode", ["decoupled"])
 @pytest.mark.tbs_soak
 @pytest.mark.tbs_all
-def test_fpgadataflow_thresholding_binary_search(activation, input_data_type, fold, num_input_channels, mem_mode):
+def test_fpgadataflow_thresholding_binary_search(
+    activation, input_data_type, fold, num_input_channels, mem_mode
+):
     # Handle inputs to the test
     pe = generate_pe_value(fold, num_input_channels)
     num_steps = activation.get_num_possible_values() - 1
@@ -350,7 +408,9 @@ def test_fpgadataflow_thresholding_binary_search(activation, input_data_type, fo
     x = gen_finn_dt_tensor(input_data_type, tensor_shape)
 
     # Generate random thresholds and sort in ascending order
-    thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps)
+    thresholds = generate_random_threshold_values(
+        input_data_type, num_input_channels, num_steps
+    )
 
     # make the vivado_hls threshold bug appear (incorrect rtlsim result when first
     # threshold of first channel is zero, while using BIPOLAR output)
@@ -374,7 +434,13 @@ def test_fpgadataflow_thresholding_binary_search(activation, input_data_type, fo
 
     # Generate model from input parameters to the test
     model = make_single_thresholding_binary_search_modelwrapper(
-        thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs
+        thresholds,
+        pe,
+        input_data_type,
+        output_data_type,
+        activation_bias,
+        mem_mode,
+        num_input_vecs,
     )
 
     model = model.transform(InsertFIFO(True))
@@ -399,7 +465,9 @@ def config_hook(config):
 
         def write_thresh_config(sim):
             # axi_name = "s_axilite_0_" # works
-            axi_name = getCustomOp(model.get_nodes_by_op_type("Thresholding_Binary_Search")[0]).get_verilog_top_module_intf_names()['axilite'][0]
+            axi_name = getCustomOp(
+                model.get_nodes_by_op_type("Thresholding_Binary_Search")[0]
+            ).get_verilog_top_module_intf_names()["axilite"][0]
             axi_name += "_0_"
 
             # 1. Write config registers to the Threshold memory, dict defines (addr, value) tuples
@@ -409,6 +477,7 @@ def write_thresh_config(sim):
                 axilite_write(sim, addr, val, basename=axi_name)
 
             reset_rtlsim(sim)
+
         return write_thresh_config
 
     input_dict = {"inp": x}

From 0689c6a6a03cbc2e9b3982af971144ac186a2c76 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 17 Nov 2022 10:30:50 +0000
Subject: [PATCH 033/235] [thresholding] add flake8 fixes

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../thresholding_binary_search.py             | 37 +++++++++-------
 ...fpgadataflow_thresholding_binary_search.py | 42 +++++++++++--------
 2 files changed, 48 insertions(+), 31 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index b785abcaa8..003dbb2fd9 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -28,7 +28,9 @@
 
 import numpy as np
 import os
+import textwrap
 import warnings
+from math import ceil, log2
 from qonnx.core.datatype import DataType
 from qonnx.util.basic import (
     interleave_matrix_outer_dim_from_partitions,
@@ -260,7 +262,8 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
           decoupled_runtime}
         * weight_file_name : filename for the weight file to be generated
         """
-        # There are 'decoupled_*' flavors, just make sure that the flavors are decoupled related
+        # There are 'decoupled_*' flavors, just make sure that the flavors
+        # are decoupled related
         if "decoupled" not in weight_file_mode:
             raise Exception(
                 "Unrecognized memory mode for this node: {}".format(weight_file_mode)
@@ -371,7 +374,8 @@ def prepare_codegen_rtl_values(self):
             self.conv_datatype_to_str(bias)
         ]  # activation bias value
 
-        # Is the input datatype signed or unsigned? The thresholding core needs to know this
+        # Is the input datatype signed or unsigned?
+        # The thresholding core needs to know this when comparing weights to inputs
         if self.get_input_datatype().min() < 0:
             code_gen_dict["$SIGN$"] = ["signed"]
         else:
@@ -421,7 +425,8 @@ def generate_hdl(self):
             file_only_path = rtl_file_path.split("/")[-1]
             self.dump_rtl_data(code_gen_dir, file_only_path, data)
 
-        # Before we return - set the 'gen_top_module' attribute for use later by PyVerilator and IPI generation
+        # Before we return - set the 'gen_top_module' attribute for use later
+        # by PyVerilator and IPI generation
         self.set_nodeattr("gen_top_module", code_gen_dict["$TOP_MODULE$"][0])
         return
 
@@ -508,14 +513,14 @@ def execute_node(self, context, graph):
         # Perform input checks
         if self.get_nodeattr("exec_mode") != "rtlsim":
             raise Exception(
-                "Invalid exec_mode value: {}; exec_mode must be set to 'rtlsim'".format(
-                    self.get_nodeattr("exec_mode")
+                "Invalid exec_mode value: {}; exec_mode must be set to '{}'".format(
+                    self.get_nodeattr("exec_mode"), "rtlsim"
                 )
             )
         if self.get_nodeattr("mem_mode") != "decoupled":
             raise Exception(
-                "Invalid mem_mode value: {}; mem_mode must be set to 'decoupled'".format(
-                    self.get_nodeattr("mem_mode")
+                "Invalid mem_mode value: {}; mem_mode must be set to '{}'".format(
+                    self.get_nodeattr("mem_mode"), "decoupled"
                 )
             )
 
@@ -595,7 +600,8 @@ def execute_node(self, context, graph):
         return
 
     def code_generation_ipi(self):
-        """Constructs and returns the TCL commands for node instantiation as an RTL block."""
+        """Constructs and returns the TCL commands for node instantiation as an RTL
+        block."""
         cmd = []
         rtl_file_list = self.get_rtl_file_list()
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
@@ -612,15 +618,19 @@ def code_generation_ipi(self):
         )
 
         # ERROR: [BD 41-237] Bus Interface property FREQ_HZ does not match between
-        # /Thresholding_Binary_Search_0/s_axis(100000000 and /StreamingFIFO_0/out_V(200000000.000000)
+        # /Thresholding_Binary_Search_0/s_axis(100000000 and
+        # /StreamingFIFO_0/out_V(200000000.000000)
         cmd.append(
-            "set_property -dict [list CONFIG.FREQ_HZ {200000000}] [get_bd_intf_pins Thresholding_Binary_Search_0/s_axis]"
+            "set_property -dict [list CONFIG.FREQ_HZ {200000000}] [%s %s]"
+            % ("get_bd_intf_pins", "Thresholding_Binary_Search_0/s_axis")
         )
 
         # ERROR: [BD 41-237] Bus Interface property FREQ_HZ does not match between
-        # /StreamingFIFO_1/in0_V(200000000.000000) and /Thresholding_Binary_Search_0/m_axis(100000000)
+        # /StreamingFIFO_1/in0_V(200000000.000000) and
+        # /Thresholding_Binary_Search_0/m_axis(100000000)
         cmd.append(
-            "set_property -dict [list CONFIG.FREQ_HZ {200000000}] [get_bd_intf_pins Thresholding_Binary_Search_0/m_axis]"
+            "set_property -dict [list CONFIG.FREQ_HZ {200000000}] [%s %s]"
+            % ("get_bd_intf_pins", "Thresholding_Binary_Search_0/m_axis")
         )
 
         return cmd
@@ -654,7 +664,7 @@ def find_next_power_of_2(self, n):
         # Negative values will loop infinitely below - return 0
         if n <= 0:
             return 0
-        # If '1' is requested, output will be '0' in the loop below, so avoid this earlier.
+        # If '1' is requested, output will be '0' in the loop below, avoid this now.
         elif n == 1:
             return 2  # i.e. 2**1
 
@@ -674,7 +684,6 @@ def prep_axilite_val(self, val):
         return self.twos_comp(int(val), self.get_weight_datatype().bitwidth())
 
     def get_dynamic_config(self, model, address_stride=1):
-        ## TODO - not sure this description is correct
         """Returns a configuration dictionary containing axilite write commands
         in order to program the thresholds into the RTL core during runtime.
         The default address stride for the weights is 1 byte."""
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
index 579b6fe83c..81a089844d 100755
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
@@ -51,6 +51,7 @@
 test_fpga_part = "xczu3eg-sbva484-1-e"
 target_clk_ns = 5
 
+
 # Helper functions
 def sort_thresholds_increasing(thresholds):
     return np.sort(thresholds, axis=1)
@@ -140,16 +141,18 @@ def make_single_thresholding_binary_search_modelwrapper(
     return model
 
 
-# Test brief: a particular method for this class was causing a bug - find_next_power_of_2()
-# Weights in the thresholding core are programmed on a per-channel basis and are byte-addressable.
-# When a channel is programmed, the next channel can start programming at the next power-of-2 byte boundary.
-# This test is to show that the function that calculates that boundary is working correctly.
+# Test brief: a particular method for this class was causing a bug:
+# find_next_power_of_2()
+# Weights in the thresholding core are programmed on a per-channel basis and are
+# byte-addressable. When a channel is programmed, the next channel can start
+# programming at the next power-of-2 byte boundary. This test is to show that the
+# function that calculates that boundary is working correctly.
 #
-# A Thresholding_Binary_Search layer was created and a SW generated dataset with a threshold channel
-# depth of 1 weight (1 layer of N channels in the thresholding core). However, find_next_power_of_2()
-# was returning a next-power-of-2 address boundary at address '0', instead of '2'. This unit test
-# is to prove that this bug no longer occurs. It was originally seen when the input datatype
-# was 'DataType["BIPOLAR"]'.
+# A Thresholding_Binary_Search layer was created and a SW generated dataset with a
+# threshold channel depth of 1 weight (1 layer of N channels in the thresholding core).
+# However, find_next_power_of_2() was returning a next-power-of-2 address boundary at
+# address '0', instead of '2'. This unit test is to prove that this bug no longer
+# occurs. It was originally seen when the input datatype was 'DataType["BIPOLAR"]'.
 @pytest.mark.tbs_unit
 @pytest.mark.tbs_all
 def test_fpgadataflow_thresholding_binary_search_unit():
@@ -228,7 +231,9 @@ def test_fpgadataflow_thresholding_binary_search_cppsim():
     act = DataType["BIPOLAR"]
     fold = -1
     num_input_channels = 16
-    mem_mode = "decoupled"  # 'const' is unsupported - see test_fpgadataflow_thresholding_binary_search_const_mem_mode
+    # 'const' is unsupported see test:
+    # test_fpgadataflow_thresholding_binary_search_const_mem_mode()
+    mem_mode = "decoupled"
 
     pe = generate_pe_value(fold, num_input_channels)
     num_steps = act.get_num_possible_values() - 1
@@ -263,8 +268,9 @@ def test_fpgadataflow_thresholding_binary_search_cppsim():
         num_input_vecs,
     )
 
-    # Cppsim is not supported for this class, catch the specific exception thrown by cppsim
-    # Exception raised in cppsim: Custom op_type Thresholding_Binary_Search is currently not supported.
+    # Cppsim is not supported for this class, catch the specific exception thrown by
+    # cppsim. Exception raised in cppsim: Custom op_type Thresholding_Binary_Search is
+    # currently not supported.
     try:
         model = model.transform(PrepareCppSim())
         model = model.transform(CompileCppSim())
@@ -310,8 +316,8 @@ def test_fpgadataflow_thresholding_binary_search_const_mem_mode():
     )
 
     # Prove that 'const' memory mode is not supported for this class
-    # 'const' memory mode is not supported for this class, catch the specific exception thrown by FINN
-    # Exception: ('Unrecognized memory mode for this node:', 'const')
+    # 'const' memory mode is not supported for this class, catch the specific exception
+    # thrown by FINN. Exception: ('Unrecognized memory mode for this node:', 'const')
     try:
         model = model.transform(InsertFIFO(True))
         model = model.transform(GiveUniqueNodeNames())
@@ -384,7 +390,8 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim():
 @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]])
 @pytest.mark.parametrize("fold", [-1])  # 1, 2, etc. will fail
 @pytest.mark.parametrize("num_input_channels", [16])
-# no need to test 'const' mode, it's already done in test_fpgadataflow_thresholding_binary_search_const_mem_mode()
+# no need to test 'const' mode, it's already done in:
+# test_fpgadataflow_thresholding_binary_search_const_mem_mode()
 @pytest.mark.parametrize("mem_mode", ["decoupled"])
 @pytest.mark.tbs_soak
 @pytest.mark.tbs_all
@@ -449,7 +456,7 @@ def test_fpgadataflow_thresholding_binary_search(
     model = model.transform(HLSSynthIP())
     model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
 
-    # Retrieve the axilite programming sequence for the weights - for decoupled mode only
+    # Retrieve the axilite programming sequence for weights - for decoupled mode only
     tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0]
     tbs_inst = getCustomOp(tbs_node)
     config = tbs_inst.get_dynamic_config(model)
@@ -470,7 +477,8 @@ def write_thresh_config(sim):
             ).get_verilog_top_module_intf_names()["axilite"][0]
             axi_name += "_0_"
 
-            # 1. Write config registers to the Threshold memory, dict defines (addr, value) tuples
+            # Write config registers to the Threshold memory.
+            # The dictionary defines (addr, value) tuples.
             for config_entry in config.values():
                 addr = config_entry[0]
                 val = config_entry[1]

From e9a4a7bb9dbdcc6dd2a7dd900f62851891793017 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 17 Nov 2022 12:01:52 +0000
Subject: [PATCH 034/235] [thresholding] change the pytest markers to omit
 tests from quicktest

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 ...fpgadataflow_thresholding_binary_search.py | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
index 81a089844d..e2189c4c79 100755
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
@@ -153,8 +153,8 @@ def make_single_thresholding_binary_search_modelwrapper(
 # However, find_next_power_of_2() was returning a next-power-of-2 address boundary at
 # address '0', instead of '2'. This unit test is to prove that this bug no longer
 # occurs. It was originally seen when the input datatype was 'DataType["BIPOLAR"]'.
-@pytest.mark.tbs_unit
-@pytest.mark.tbs_all
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
 def test_fpgadataflow_thresholding_binary_search_unit():
     activation = DataType["BIPOLAR"]
     input_data_type = DataType["INT16"]
@@ -224,8 +224,8 @@ def test_fpgadataflow_thresholding_binary_search_unit():
 
 
 # Test brief: Prove that cppsim is not supported for this class
-@pytest.mark.tbs_cppsim
-@pytest.mark.tbs_all
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
 def test_fpgadataflow_thresholding_binary_search_cppsim():
     input_data_type = DataType["UINT16"]
     act = DataType["BIPOLAR"]
@@ -284,8 +284,8 @@ def test_fpgadataflow_thresholding_binary_search_cppsim():
 
 
 # Test brief: Prove that memory mode 'const' is not supported for this layer type
-@pytest.mark.tbs_const
-@pytest.mark.tbs_all
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
 def test_fpgadataflow_thresholding_binary_search_const_mem_mode():
     input_data_type = DataType["INT16"]
     activation = DataType["INT4"]
@@ -333,8 +333,8 @@ def test_fpgadataflow_thresholding_binary_search_const_mem_mode():
 
 # Test brief: Test that PrepareRTLSim() runs successfully. This function is not
 # tested in test_fpgadataflow_thresholding_binary_search()
-@pytest.mark.tbs_prep_rtlsim
-@pytest.mark.tbs_all
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
 def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim():
     input_data_type = DataType["INT16"]
     act = DataType["INT4"]
@@ -393,8 +393,9 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim():
 # no need to test 'const' mode, it's already done in:
 # test_fpgadataflow_thresholding_binary_search_const_mem_mode()
 @pytest.mark.parametrize("mem_mode", ["decoupled"])
-@pytest.mark.tbs_soak
-@pytest.mark.tbs_all
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+@pytest.mark.slow
 def test_fpgadataflow_thresholding_binary_search(
     activation, input_data_type, fold, num_input_channels, mem_mode
 ):

From 41c0b4b0799674cd468b9aabfe47a5992891e873 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Fri, 25 Nov 2022 14:57:39 +0000
Subject: [PATCH 035/235] [thresholding] update copyright banners of files I
 have added/changed

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 src/finn/custom_op/fpgadataflow/__init__.py                     | 2 +-
 src/finn/custom_op/fpgadataflow/thresholding_binary_search.py   | 2 +-
 .../test_fpgadataflow_thresholding_binary_search.py             | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index dc9a5a349a..0e17726d48 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2022, Advanced Micro Devices, Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 003dbb2fd9..7df755ae1b 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2022, Advanced Micro Devices, Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
index e2189c4c79..1e3521a610 100755
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2022, Advanced Micro Devices, Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without

From 71ef39b38d70365f4812cfd6f0d46a1d0198b269 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Thu, 1 Dec 2022 13:12:33 +0000
Subject: [PATCH 036/235] Translate byte to parameter word addressing in AXI
 adapter.

---
 finn-rtllib/thresholding/hdl/thresholding_axi.sv | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
index 97cdfd3e12..c766e60b9e 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
@@ -30,6 +30,12 @@
  *
  * @brief	All-AXI interface adapter for thresholding module.
  * @author	Thomas B. Preußer <tpreusse@amd.com>
+ *
+ * @description
+ *	This AXI adapter fits the core thresholding functionality:
+ *	- with AXI stream data interfaces with flow control
+ *	- with implicit round-robin channel rotation as used by FINN, and
+ *	- with aligned byte address to parameter word address translation.
  *****************************************************************************/
 
 module $MODULE_NAME_AXI$ #(
@@ -49,7 +55,7 @@ module $MODULE_NAME_AXI$ #(
 	// Writing
 	input	logic                    s_axilite_AWVALID,
 	output	logic                    s_axilite_AWREADY,
-	input	logic [$clog2(C)+N-1:0]  s_axilite_AWADDR,
+	input	logic [$clog2(C)+N+1:0]  s_axilite_AWADDR,	// lowest 2 bits (byte selectors) are ignored
 
 	input	logic         s_axilite_WVALID,
 	output	logic         s_axilite_WREADY,
@@ -109,7 +115,7 @@ module $MODULE_NAME_AXI$ #(
 			else begin
 				if(!WABusy) begin
 					WABusy <= s_axilite_AWVALID;
-					Addr   <= s_axilite_AWADDR[$clog2(C)+N-1:0];
+					Addr   <= s_axilite_AWADDR[$clog2(C)+N+1:2];
 				end
 				if(!WDBusy) begin
 					WDBusy <= s_axilite_WVALID;

From d44a66c949177163099e36ce4e57c9ac992ee70b Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Mon, 19 Dec 2022 15:05:08 +0000
Subject: [PATCH 037/235] [thresholding] remove unused attribute

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 7df755ae1b..2ebe6f0a39 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -103,7 +103,6 @@ def get_nodeattr_types(self):
             # always "flush" the accelerator by first passing a dummy input
             # vector through the accelerator. This will get rid of any old
             # weight data from the weight FIFOs.
-            "runtime_writeable_weights": ("i", False, 0, {0, 1}),
             "gen_top_module": ("s", False, ""),
             "activation_bias": ("i", False, 0),
         }
@@ -656,8 +655,6 @@ def get_verilog_top_module_intf_names(self):
         intf_names["s_axis"] = [["s_axis"]]
         intf_names["m_axis"] = [["m_axis"]]
 
-        self.set_nodeattr("runtime_writeable_weights", 1)
-
         return intf_names
 
     def find_next_power_of_2(self, n):

From f79b9ec3e19d83d6469e6e563422fbba70f7a87a Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Mon, 19 Dec 2022 15:53:20 +0000
Subject: [PATCH 038/235] [thresholding] remove unnecessary HLS bug prevention
 check

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 ...test_fpgadataflow_thresholding_binary_search.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
index 1e3521a610..ab98189ea5 100755
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
@@ -242,11 +242,6 @@ def test_fpgadataflow_thresholding_binary_search_cppsim():
     thresholds = generate_random_threshold_values(
         input_data_type, num_input_channels, num_steps
     )
-
-    # make the vivado_hls threshold bug appear (incorrect rtlsim result when first
-    # threshold of first channel is zero, while using BIPOLAR output)
-    if act == DataType["BIPOLAR"]:
-        thresholds[0][0] = 0
     thresholds = sort_thresholds_increasing(thresholds)
 
     # Other non-input parameters
@@ -350,10 +345,6 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim():
     thresholds = generate_random_threshold_values(
         input_data_type, num_input_channels, num_steps
     )
-    # make the vivado_hls threshold bug appear (incorrect rtlsim result when first
-    # threshold of first channel is zero, while using BIPOLAR output)
-    if act == DataType["BIPOLAR"]:
-        thresholds[0][0] = 0
     thresholds = sort_thresholds_increasing(thresholds)
 
     # Other non-input parameters
@@ -420,11 +411,6 @@ def test_fpgadataflow_thresholding_binary_search(
         input_data_type, num_input_channels, num_steps
     )
 
-    # make the vivado_hls threshold bug appear (incorrect rtlsim result when first
-    # threshold of first channel is zero, while using BIPOLAR output)
-    if activation == DataType["BIPOLAR"]:
-        thresholds[0][0] = 0
-
     # provide non-decreasing/ascending thresholds
     thresholds = sort_thresholds_increasing(thresholds)
 

From 7b82de2c78e14f9dc2017e7c5e9378865011e9da Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Mon, 19 Dec 2022 16:40:57 +0000
Subject: [PATCH 039/235] [thresholding] align methods with hlscustom class by
 adding in additional input parameter

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../fpgadataflow/thresholding_binary_search.py   | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 2ebe6f0a39..d69c7e47b7 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -129,10 +129,10 @@ def bram_estimation(self):
     def lut_estimation(self):
         return 0
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         return DataType[self.get_nodeattr("outputDataType")]
 
     def get_weight_datatype(self):
@@ -142,11 +142,11 @@ def get_weight_datatype(self):
     def minimize_accumulator_width(self, model):
         return None
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
         return i_bits * self.get_nodeattr("PE")
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         o_bits = self.get_output_datatype().bitwidth()
         return o_bits * self.get_nodeattr("PE")
 
@@ -163,24 +163,24 @@ def get_weightstream_width(self):
         w_width = pe * wp * n_thres_steps
         return w_width
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         fold = self.calc_tmem()
         pe = self.get_nodeattr("PE")
         vecs = list(self.get_nodeattr("numInputVectors"))
         folded_input_shape = tuple(vecs + [fold, pe])
         return folded_input_shape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         # same shape as input
         return self.get_folded_input_shape()
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         num_channels = self.get_nodeattr("NumChannels")
         vecs = list(self.get_nodeattr("numInputVectors"))
         normal_input_shape = tuple(vecs + [num_channels])
         return normal_input_shape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         # same shape as input
         return self.get_normal_input_shape()
 

From e2816d3e1c8ce75ad9f0b1aafbef25af8b305a6c Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Mon, 19 Dec 2022 16:50:26 +0000
Subject: [PATCH 040/235] [thresholding] replace hardcoded tcl commands with
 node attributes

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../thresholding_binary_search.py             | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index d69c7e47b7..fe976c7dbe 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -105,6 +105,7 @@ def get_nodeattr_types(self):
             # weight data from the weight FIFOs.
             "gen_top_module": ("s", False, ""),
             "activation_bias": ("i", False, 0),
+            "clkFreq": ("i", False, 200000000),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -604,6 +605,10 @@ def code_generation_ipi(self):
         cmd = []
         rtl_file_list = self.get_rtl_file_list()
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        node_name = self.onnx_node.name
+        dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0]
+        din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
+        clock_freq = self.get_nodeattr("clkFreq")
 
         for rtl_file in rtl_file_list:
             cmd.append(
@@ -616,20 +621,14 @@ def code_generation_ipi(self):
             % (self.get_nodeattr("gen_top_module"), self.onnx_node.name)
         )
 
-        # ERROR: [BD 41-237] Bus Interface property FREQ_HZ does not match between
-        # /Thresholding_Binary_Search_0/s_axis(100000000 and
-        # /StreamingFIFO_0/out_V(200000000.000000)
         cmd.append(
-            "set_property -dict [list CONFIG.FREQ_HZ {200000000}] [%s %s]"
-            % ("get_bd_intf_pins", "Thresholding_Binary_Search_0/s_axis")
+            "set_property -dict [list CONFIG.FREQ_HZ {%d}] [%s %s/%s]"
+            % (clock_freq, "get_bd_intf_pins", node_name, din_name)
         )
 
-        # ERROR: [BD 41-237] Bus Interface property FREQ_HZ does not match between
-        # /StreamingFIFO_1/in0_V(200000000.000000) and
-        # /Thresholding_Binary_Search_0/m_axis(100000000)
         cmd.append(
-            "set_property -dict [list CONFIG.FREQ_HZ {200000000}] [%s %s]"
-            % ("get_bd_intf_pins", "Thresholding_Binary_Search_0/m_axis")
+            "set_property -dict [list CONFIG.FREQ_HZ {%d}] [%s %s/%s]"
+            % (clock_freq, "get_bd_intf_pins", node_name, dout_name)
         )
 
         return cmd

From bda05ae16e62627d414c80452caa012dee7aa0d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Tue, 20 Dec 2022 09:24:04 +0000
Subject: [PATCH 041/235] Fix BIAS parameter specification.

---
 finn-rtllib/thresholding/hdl/thresholding.sv     | 2 +-
 finn-rtllib/thresholding/hdl/thresholding_axi.sv | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv
index 25d6ff3112..b26747d1ff 100644
--- a/finn-rtllib/thresholding/hdl/thresholding.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding.sv
@@ -48,7 +48,7 @@ module $MODULE_NAME$ #(
 	int unsigned  M,  // input/threshold precision
 	int unsigned  C,  // number of channels
 
-	int BIAS,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
+	int BIAS,  // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS)
 
 	int unsigned  C_BITS,
 	int unsigned O_BITS
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
index c766e60b9e..5cd7746b82 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
@@ -43,7 +43,7 @@ module $MODULE_NAME_AXI$ #(
 	int unsigned  M,	// input/threshold precision
 	int unsigned  C,	// Channels
 
-	int BIAS,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
+	int BIAS,  // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS)
 
 	int unsigned O_BITS
 )(

From 7388e7613ef38b6caa1fafb1129973cefef8716a Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Tue, 20 Dec 2022 14:08:07 +0000
Subject: [PATCH 042/235] [thresholding] remove unused ram_style attribute

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../fpgadataflow/thresholding_binary_search.py        | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index fe976c7dbe..9cbe049be3 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -76,8 +76,6 @@ def get_nodeattr_types(self):
             "NumChannels": ("i", True, 0),
             # number of steps in thresholding function. Used only in decoupled mode
             "numSteps": ("i", True, 1),
-            # string defining memory type
-            "ram_style": ("s", False, "distributed", {"distributed", "block"}),
             # FINN DataTypes for inputs, outputs
             "inputDataType": ("s", True, ""),
             "weightDataType": ("s", True, ""),
@@ -470,14 +468,7 @@ def generate_params(self, model, path):
         )
 
         # Synthesis thresholds:
-        ram_style = self.get_nodeattr("ram_style")
-        if ram_style == "ultra":
-            # UltraRAM must have no memory initializer, or only zeroes
-            # otherwise BRAM will be inferred instead of URAM
-            # as a workaround we provide a zero-weight init here
-            synth_thresholds = np.zeros_like(thresholds, dtype=np.float32)
-        else:
-            synth_thresholds = thresholds
+        synth_thresholds = thresholds
         self.make_weight_file(
             synth_thresholds, "decoupled_verilog_dat", weight_filename_rtl_synth
         )

From be1503a0c78fd4c4d903b1ffbf61964659725bb6 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Tue, 3 Jan 2023 15:37:42 +0000
Subject: [PATCH 043/235] First changes to custom_op for RTL-based MVAU

---
 .../matrixvectoractivation_rtl.py             | 1036 +++++++++++++++++
 1 file changed, 1036 insertions(+)
 create mode 100644 src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
new file mode 100644
index 0000000000..c8a0aa675b
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -0,0 +1,1036 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import os
+import textwrap
+import warnings
+from qonnx.core.datatype import DataType
+from qonnx.util.basic import (
+    calculate_matvec_accumulator_range,
+    interleave_matrix_outer_dim_from_partitions,
+    roundup_to_integer_multiple,
+)
+
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    numpy_to_hls_code,
+    pack_innermost_dim_as_hex_string,
+    rtlsim_output_to_npy,
+)
+
+from . import templates
+
+# ONNX i/o tensor shape assumptions for MatrixVectorActivation:
+# input 0 is the input tensor, shape (..., i_size) = (..., MW)
+# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH)
+# (optional) input 2 is the thresholds tensor, shape (o_size, n_thres)
+# output 0 is the output tensor, shape (..., o_size) = (..., MH)
+# the ... here can be any shape (representing groups of vectors)
+
+
+class MatrixVectorActivation_rtl(HLSCustomOp):
+    """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch
+    function."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+        self.decoupled_wrapper = templates.decoupled_wrapper
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "PE": ("i", True, 0),
+            "SIMD": ("i", True, 0),
+            "MW": ("i", True, 0),
+            "MH": ("i", True, 0),
+            "resType": ("s", False, "lut", {"auto", "lut", "dsp"}),
+            "ActVal": ("i", False, 0),
+            # FINN DataTypes for inputs, weights, outputs
+            "inputDataType": ("s", True, ""),
+            "weightDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+            # FINN DataType for accumulator -- auto-computed and updated
+            "accDataType": ("s", False, "INT32"),
+            # use xnor-popcount for binary weights/inputs, thus treating them
+            # as bipolar
+            "binaryXnorMode": ("i", False, 0, {0, 1}),
+            # no-activation mode (produce accumulators)
+            "noActivation": ("i", False, 0, {0, 1}),
+            # number of input vectors, examples:
+            # [1] is a single vector (like a FC layer with batch=1)
+            # [4] is four vectors (like a FC layer with batch=4)
+            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+            "numInputVectors": ("ints", False, [1]),
+            # memory mode for the FC weights
+            # const -- embedded weights, default, long compile/synth times
+            # decoupled -- streaming weights with weight streamer packaged inside IP
+            # external -- streaming weights with external streamer
+            "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}),
+            # FPGA resource type for memories in decoupled mode
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1
+            # see also https://www.xilinx.com/support/answers/38070.html
+            "ram_style": (
+                "s",
+                False,
+                "auto",
+                {"auto", "block", "distributed", "ultra"},
+            ),
+            # FPGA resource type for threshold memories (if noActivation is False)
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            "ram_style_thresholds": (
+                "s",
+                False,
+                "auto",
+                {"auto", "block", "distributed"},
+            ),
+            # (mem_mode = decoupled only) whether weights will be writable through
+            # an AXI-lite interface during runtime
+            # 1 for enabled, 0 for disabled.
+            # see finn-rtllib/memstream/doc/README for more about the memory
+            # address map used for writable weights
+            # IMPORTANT: After using AXI lite to either read or write the weights,
+            # always "flush" the accelerator by first passing a dummy input
+            # vector through the accelerator. This will get rid of any old
+            # weight data from the weight FIFOs.
+            "runtime_writeable_weights": ("i", False, 0, {0, 1}),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def calc_wmem(self):
+        """Calculates and returns WMEM."""
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        assert mh % pe == 0, "Requirement MH divisable by PE is violated."
+        assert mw % simd == 0, "Requirement MW divisable by SIMD is violated."
+        wmem = mw * mh // (pe * simd)
+        return wmem
+
+    def calc_tmem(self):
+        """Calculates and returns TMEM."""
+        assert self.get_nodeattr("noActivation")==1, "RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer"
+        return 0
+
+    def make_shape_compatible_op(self, model):
+        oshape = self.get_normal_output_shape()
+        return super().make_const_shape_op(oshape)
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType", idt.name)
+        # set output datatype from property
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(node.output[0], odt)
+
+    def verify_node(self):
+        info_messages = []
+        # verify that "backend" is set to "fpgadataflow"
+        backend_value = self.get_nodeattr("backend")
+        if backend_value == "fpgadataflow":
+            info_messages.append("Attribute backend is set correctly")
+        else:
+            info_messages.append('Attribute backend should be set to "fpgadataflow"')
+
+        # verify that all necessary attributes exist
+        # TODO collect automatically from get_nodeattr_types
+        try:
+            self.get_nodeattr("code_gen_dir_cppsim")
+            self.get_nodeattr("executable_path")
+            self.get_nodeattr("resType")
+            self.get_nodeattr("MW")
+            self.get_nodeattr("MH")
+            self.get_nodeattr("SIMD")
+            self.get_nodeattr("PE")
+            self.get_nodeattr("inputDataType")
+            self.get_nodeattr("weightDataType")
+            self.get_nodeattr("outputDataType")
+            info_messages.append("All necessary attributes exist")
+        except Exception:
+            info_messages.append(
+                """The required MatrixVectorActivation attributes do not exist."""
+            )
+
+        # verify the number of inputs depending on noActivation value
+        # check noActivation value to determine the number of inputs
+        no_act = self.get_nodeattr("noActivation")
+
+        if no_act == 1:
+            if len(self.onnx_node.input) == 2:
+                info_messages.append("The number of inputs is correct")
+            else:
+                info_messages.append(
+                    """RTL-based MatrixVectorActivation needs in no
+                            activation mode 2 inputs (data input and weights)"""
+                )
+        elif no_act == 0:
+            info_messages.append("RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer")
+        else:
+            info_messages.append(
+                """noActivation attribute contains {} should
+                be 1 for RTL-based MatrixVectorActivation""".format(
+                    no_act
+                )
+            )
+
+        mem_mode = self.get_nodeattr("mem_mode")
+
+        if mem_mode != "decoupled":
+            info_messages.append("RTL-based MVAU supports only decoupled weights currently")
+
+        return info_messages
+
+    def uram_estimation(self):
+        """Estimate the number of URAM blocks needed for the weight memory.
+
+        Returns 0 when mem_mode is "decoupled" with a ram_style other than
+        "ultra", when mem_mode is "const" and calc_wmem() is at most 128, or
+        when mem_mode is "external". Otherwise the weight memory is tiled with
+        blocks of 72 bits x 4096 entries.
+        """
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        # memory depth (entries) and memory width (bits per entry)
+        depth = (mw * mh) / (simd * pe)
+        width = simd * weight_bits * pe
+        mem_mode = self.get_nodeattr("mem_mode")
+        ram_style = self.get_nodeattr("ram_style")
+        if mem_mode == "decoupled" and ram_style != "ultra":
+            return 0
+        if mem_mode == "const" and self.calc_wmem() <= 128:
+            return 0
+        if mem_mode == "external":
+            return 0
+        return math.ceil(width / 72) * math.ceil(depth / 4096)
+
+    def bram_estimation(self):
+        """Estimate the number of BRAM blocks used for weight storage, based on:
+        - FINN-R: An End-to-End Deep-Learning Framework for Fast
+        Exploration of Quantized Neural Networks
+        - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
+        Y. Umuroglu, M. Leeser and K. Vissers
+        - 12. Sep 2018
+
+        Returns 0 when mem_mode is "decoupled" with ram_style "distributed" or
+        "ultra", when mem_mode is "const" and calc_wmem() is at most 128, or
+        when mem_mode is "external".
+        """
+        # TODO add in/out FIFO contributions
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        depth = (mw * mh) / (simd * pe)
+        width = simd * weight_bits * pe
+        mem_mode = self.get_nodeattr("mem_mode")
+        ram_style = self.get_nodeattr("ram_style")
+        not_in_bram = (
+            (mem_mode == "decoupled" and ram_style in ["distributed", "ultra"])
+            or (mem_mode == "const" and self.calc_wmem() <= 128)
+            or (mem_mode == "external")
+        )
+        if not_in_bram:
+            return 0
+        # assuming SDP mode RAMB18s (see UG573 Table 1-10)
+        # assuming decoupled (RTL) memory, which is more efficient than const (HLS)
+        # select the (entries, bits) aspect ratio used for a single block
+        if width == 1:
+            block_depth, block_width = 16384, 1
+        elif width == 2:
+            block_depth, block_width = 8192, 2
+        elif width <= 4:
+            block_depth, block_width = 4096, 4
+        elif width <= 9:
+            block_depth, block_width = 2048, 9
+        elif width <= 18 or depth > 512:
+            block_depth, block_width = 1024, 18
+        else:
+            block_depth, block_width = 512, 36
+        return math.ceil(depth / block_depth) * math.ceil(width / block_width)
+
+    def bram_efficiency_estimation(self):
+        """Function for BRAM efficiency estimation: number of weight bits
+        divided by the capacity of the estimated BRAMs (36 x 512 bits each).
+        Returns 1 when bram_estimation() reports 0 blocks."""
+        bram_count = self.bram_estimation()
+        if bram_count == 0:
+            return 1
+        weight_bits = self.get_weight_datatype().bitwidth()
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        stored_bits = weight_bits * mw * mh
+        return stored_bits / (bram_count * 36 * 512)
+
+    def uram_efficiency_estimation(self):
+        """Function for URAM efficiency estimation: number of weight bits
+        divided by the capacity of the estimated URAMs (72 x 4096 bits each).
+        Returns 1 when uram_estimation() reports 0 blocks."""
+        uram_count = self.uram_estimation()
+        if uram_count == 0:
+            return 1
+        weight_bits = self.get_weight_datatype().bitwidth()
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        stored_bits = weight_bits * mw * mh
+        return stored_bits / (uram_count * 72 * 4096)
+
+#TODO: FIX
+    def lut_estimation(self):
+        """Estimate the LUT usage of this node, based on:
+        - FINN-R: An End-to-End Deep-Learning Framework for Fast
+        Exploration of Quantized Neural Networks
+        - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
+        Y. Umuroglu, M. Leeser and K. Vissers
+        - 12. Sep 2018
+
+        The estimate is a constant offset plus a scaled per-PE cost
+        (multipliers, adder tree, accumulator, thresholds, comparators) plus a
+        weight-memory term, truncated to int.
+        """
+        # TODO add in/out FIFO contributions
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        mw = self.get_nodeattr("MW")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        act_bits = self.get_input_datatype().bitwidth()
+        prod_bits = weight_bits + act_bits
+        # parameters from experiments in paper mentioned above
+        offset_luts = 300
+        pe_scale = 1.1
+        mem_mode = self.get_nodeattr("mem_mode")
+        ram_style = self.get_nodeattr("ram_style")
+        # the weight-memory term only applies to distributed decoupled memory
+        # and to const memory with a depth of at most 128
+        if (mem_mode == "decoupled" and ram_style == "distributed") or (
+            mem_mode == "const" and self.calc_wmem() <= 128
+        ):
+            mem_luts = (pe * simd * weight_bits) * math.ceil(self.calc_wmem() / 64)
+        else:
+            mem_luts = 0
+
+        # multiplication: no LUT cost is counted when resType is "dsp"
+        if self.get_nodeattr("resType") == "dsp":
+            mult_luts = 0
+        else:
+            mult_luts = simd * (2 * math.ceil(prod_bits / 6) - 1) * prod_bits
+        # adder tree
+        addertree_luts = prod_bits * (2 * simd - 1)
+        # accumulator
+        acc_bits = prod_bits + np.ceil(math.log(mw, 2))
+        acc_luts = acc_bits
+        # thresholds and threshold comparators, only when noActivation is 0
+        thr_luts = 0
+        comp_luts = 0
+        if self.get_nodeattr("noActivation") == 0:
+            out_bits = self.get_output_datatype().bitwidth()
+            n_thresholds = 2**out_bits - 1
+            thr_luts = n_thresholds * acc_bits * math.ceil(self.calc_tmem() / 64)
+            comp_luts = n_thresholds * acc_bits
+
+        per_pe_luts = mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts
+        return int(offset_luts + pe_scale * (pe * per_pe_luts) + mem_luts)
+
+#TODO: FIX
+    def dsp_estimation(self):
+        """Estimate the number of DSP blocks: 0 unless resType is "dsp", in
+        which case PE * SIMD * ceil((weight bits + input bits) / 48)."""
+        pe = self.get_nodeattr("PE")
+        res_type = self.get_nodeattr("resType")
+        simd = self.get_nodeattr("SIMD")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        act_bits = self.get_input_datatype().bitwidth()
+        if res_type != "dsp":
+            return int(0)
+        # TODO: more accurate modelling
+        return int(pe * simd * np.ceil((weight_bits + act_bits) / 48))
+
+#TODO: FIX
+    def get_exp_cycles(self):
+        """Return the expected cycle count, computed as
+        (MH / PE) * (MW / SIMD) * prod(numInputVectors) and truncated to int."""
+        # since mmv != 1 is not supported yet, we set mmv for now to 1
+        mmv = 1
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        n_vectors = np.prod(self.get_nodeattr("numInputVectors"))
+        neuron_fold = self.get_nodeattr("MH") / pe
+        synapse_fold = self.get_nodeattr("MW") / simd
+        return int(neuron_fold * synapse_fold * n_vectors / mmv)
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of the input with index ind: the data input
+        for ind 0, the weights for ind 1. Any other index raises an Exception."""
+        # when performing FIFO insertion on an FC layer with ext weights, the ind
+        # parameter can be > 0 (referring to the weights) so handle that here
+        if ind not in (0, 1):
+            raise Exception("Undefined input ind for this layer type")
+        attr_name = "inputDataType" if ind == 0 else "weightDataType"
+        return DataType[self.get_nodeattr(attr_name)]
+
+    def get_weight_datatype(self):
+        """Returns the FINN DataType named by the weightDataType attribute."""
+        dtype_name = self.get_nodeattr("weightDataType")
+        return DataType[dtype_name]
+
+    def get_output_datatype(self, ind=0):
+        """Returns the FINN DataType named by the outputDataType attribute
+        (the ind argument is not used)."""
+        dtype_name = self.get_nodeattr("outputDataType")
+        return DataType[dtype_name]
+
+    def get_instream_width(self, ind=0):
+        """Returns the input stream width in bits: input bit-width times SIMD.
+        Asserts that the input bit-width does not exceed 9."""
+        act_bits = self.get_input_datatype().bitwidth()
+        assert act_bits<=9, "RTL-based MVAU only supports activations with bit-width up to 9-bits"
+        return act_bits * self.get_nodeattr("SIMD")
+
+    def get_outstream_width(self, ind=0):
+        """Returns the output stream width in bits: output bit-width times PE."""
+        return self.get_output_datatype().bitwidth() * self.get_nodeattr("PE")
+
+    def get_weightstream_width(self):
+        """Returns the weight stream width (PE * SIMD * weight bit-width) for
+        mem_mode "decoupled" or "external", and 0 for any other mem_mode.
+        Asserts that the weight bit-width does not exceed 8."""
+        if self.get_nodeattr("mem_mode") not in ("decoupled", "external"):
+            return 0
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        assert weight_bits<=8, "RTL-based MVAU only supports weights with bit-width up to 8-bits"
+        return pe * simd * weight_bits
+
+    def get_weightstream_width_padded(self):
+        """Returns the weight stream width rounded up to a multiple of 8, as
+        required by the AXI Stream spec. Used in decoupled mode."""
+        return roundup_to_integer_multiple(self.get_weightstream_width(), 8)
+
+    def get_ap_int_max_w(self):
+        """Returns the largest of: the base class value, the weight stream
+        width, and the width of a single PE's weight entry (SIMD * weight
+        bit-width)."""
+        # base class impl (max of inp/out stream widths)
+        io_width = super().get_ap_int_max_w()
+        # decoupled mode weight stream
+        wstream_width = self.get_weightstream_width()
+        # single PE weight entry
+        pe_entry_width = self.get_nodeattr("SIMD") * self.get_weight_datatype().bitwidth()
+        return max(wstream_width, io_width, pe_entry_width)
+
+    def get_folded_input_shape(self, ind=0):
+        """Returns the folded shape of input ind as a tuple.
+
+        ind 0 (data input) gives numInputVectors + (MW // SIMD, SIMD).
+        ind 1 (weights) is only defined for mem_mode "external" and gives
+        numInputVectors + ((MW // SIMD) * (MH // PE), SIMD * PE).
+        Every other request raises an Exception.
+        """
+        simd = self.get_nodeattr("SIMD")
+        pe = self.get_nodeattr("PE")
+        synapse_fold = self.get_nodeattr("MW") // simd
+        neuron_fold = self.get_nodeattr("MH") // pe
+        batch_dims = list(self.get_nodeattr("numInputVectors"))
+
+        if ind == 0:
+            # shape of input 0 (data)
+            return tuple(batch_dims + [synapse_fold, simd])
+        if ind == 1 and self.get_nodeattr("mem_mode") == "external":
+            # shape of input 1 (weights)
+            return tuple(batch_dims + [synapse_fold * neuron_fold, simd * pe])
+        raise Exception("Undefined input shape for requested input")
+
+    def get_folded_output_shape(self, ind=0):
+        """Returns the folded output shape: numInputVectors + (MH // PE, PE)."""
+        pe = self.get_nodeattr("PE")
+        neuron_fold = self.get_nodeattr("MH") // pe
+        batch_dims = list(self.get_nodeattr("numInputVectors"))
+        return tuple(batch_dims + [neuron_fold, pe])
+
+    def get_normal_input_shape(self, ind=0):
+        """Returns the unfolded input shape: numInputVectors + (MW,)."""
+        batch_dims = list(self.get_nodeattr("numInputVectors"))
+        return tuple(batch_dims + [self.get_nodeattr("MW")])
+
+    def get_normal_output_shape(self, ind=0):
+        """Returns the unfolded output shape: numInputVectors + (MH,)."""
+        batch_dims = list(self.get_nodeattr("numInputVectors"))
+        return tuple(batch_dims + [self.get_nodeattr("MH")])
+
+    def get_number_output_values(self):
+        """Returns the product of all folded output dimensions except the
+        innermost one."""
+        outer_dims = self.get_folded_output_shape()[:-1]
+        return np.prod(outer_dims)
+
+    def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
+        """Convert the original numpy weight matrix orig_weight_matrix into
+        the layout used for generating the weight files:
+        * ensure the matrix has shape (MW, MH), MW % SIMD == 0 and MH % PE == 0
+        * transpose to (MH, MW)
+        * interleave rows between PEs
+        * reshape into (1, PE, WMEM, SIMD)
+        * reverse the SIMD dimension and return
+
+        Raises AssertionError if the shape or divisibility requirements are
+        violated.
+        """
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        wmem = self.calc_wmem()
+        assert orig_weight_matrix.shape == (
+            mw,
+            mh,
+        ), """Weights matrix doesn't
+        have expected shape (mw, mh)"""
+        # the message used to name MH here although the check is on MW
+        assert mw % simd == 0, "Requirement MW divisable by SIMD is violated."
+        assert mh % pe == 0, "Requirement MH divisable by PE is violated."
+        # start by transposing the original weight matrix, since ONNX and
+        # finn-hlslib use different assumptions
+        # ONNX uses (in_features, out_features) and matmul(x, W)
+        # finn-hlslib uses (out_features, in_features) and matmul(W, x)
+        ret = orig_weight_matrix.T
+        # interleave rows between PEs and reshape
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        # create SIMD as innermost dimension and add a dummy outer dim
+        ret = ret.reshape(1, pe, wmem, simd)
+        # reverse the SIMD dimension
+        ret = np.flip(ret, axis=-1)
+        return ret
+
+    def minimize_accumulator_width(self, model):
+        """Choose the smallest accumulator DataType covering the accumulator
+        range computed from the weight initializer and the input datatype,
+        widen it to a multiple of 8 bits, store it in both accDataType and
+        outputDataType, and return it."""
+        weights = model.get_initializer(self.onnx_node.input[1])
+        # calculate minimum and maximum values of accumulator
+        (acc_min, acc_max) = calculate_matvec_accumulator_range(
+            weights, self.get_input_datatype()
+        )
+        # pick the value whose smallest datatype covers the whole range
+        if acc_min >= 0:
+            bound = acc_max
+        elif abs(acc_min) > acc_max:
+            bound = acc_min
+        else:
+            bound = -acc_max - 1
+        adt = DataType.get_smallest_possible(bound)
+        # ensure a datatype divisible by 8-bits in case this is the last node
+        padded_bits = roundup_to_integer_multiple(adt.bitwidth(), 8)
+        adt = DataType[adt.name.replace(str(adt.bitwidth()), str(padded_bits))]
+        self.set_nodeattr("accDataType", adt.name)
+        # for no-activation nodes, output dt = acc dt
+        self.set_nodeattr("outputDataType", adt.name)
+        return DataType[self.get_nodeattr("accDataType")]
+
+    def make_weight_file(self, weights, weight_file_mode, weight_file_name):
+        """Produce a file containing given weights in appropriate format for this
+        layer. This file can be used for either synthesis or run-time reconfig
+        of weights.
+
+        Arguments:
+        * weights : numpy array with weights to be put into the file
+        * weight_file_mode : one of {decoupled_npy, decoupled_verilog_dat,
+          decoupled_runtime}
+        * weight_file_name : filename for the weight file to be generated
+
+        Raises an Exception for any other weight_file_mode.
+        """
+        supported_modes = ("decoupled_npy", "decoupled_verilog_dat", "decoupled_runtime")
+        if weight_file_mode not in supported_modes:
+            raise Exception("Unknown/unsupported weight_file_mode")
+        # convert weights into the (1, PE, WMEM, SIMD) layout
+        weight_tensor = self.get_hls_compatible_weight_tensor(weights)
+        export_wdt = self.get_weight_datatype()
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        # create a weight stream for various flavors of decoupled mode:
+        # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
+        weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
+
+        if weight_file_mode == "decoupled_npy":
+            # reverse SIMD flip for saving weights in .npy; generate_params
+            # requests this mode and execute_node reads the resulting file
+            weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1)
+            weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape(
+                1, -1, pe * simd
+            )
+            np.save(weight_file_name, weight_tensor_simd_flipped.copy())
+            return
+
+        # PE flip for saving weights in .dat
+        weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2)
+        weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(1, -1, pe * simd)
+        weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy()
+        weight_width = self.get_weightstream_width()
+        if weight_file_mode == "decoupled_verilog_dat":
+            # pad to nearest 4 bits to get hex strings
+            weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
+            # convert weight values into hexstring
+            weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+            )
+            weight_stream = weight_tensor_pe_flipped.flatten()
+            # one memory line per text line
+            with open(weight_file_name, "w") as f:
+                for val in weight_stream:
+                    f.write(val + "\n")
+        else:
+            # decoupled_runtime: memstream axi-lite interface will map each
+            # mem line to one or multiple 32-bit words
+            words_per_memwidth = 2 ** math.ceil(math.log2(weight_width / 32))
+            if words_per_memwidth < 1:
+                words_per_memwidth = 1
+            weight_width_padded = words_per_memwidth * 32
+            # first, pack and ensure padding to 32 bits
+            weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+            )
+            weight_stream = weight_tensor_pe_flipped.flatten()
+            with open(weight_file_name, "w") as f:
+                for val in weight_stream:
+                    # split into groups of 8 hex digits (= 32 bits)
+                    words_32b = textwrap.wrap(val, 8)
+                    # written in reverse order, i.e. last group first
+                    words_32b.reverse()
+                    for word_32b in words_32b:
+                        f.write(word_32b + "\n")
+
+    def generate_params(self, model, path):
+        """Request the weight files of this node in the directory path.
+
+        Only mem_mode "decoupled" is handled. Three files are requested from
+        make_weight_file: weights.npy, memblock_sim_0.dat (always the true
+        weights) and memblock_synth_0.dat (all-zero weights when ram_style is
+        "ultra", the true weights otherwise). Any other mem_mode raises an
+        Exception.
+        """
+        mem_mode = self.get_nodeattr("mem_mode")
+        # weights, if not external
+        weights = model.get_initializer(self.onnx_node.input[1])
+        if mem_mode != "decoupled":
+            raise Exception(
+                """Please set mem_mode to "decoupled",
+                currently no other parameter value is supported!"""
+            )
+        npy_file = "{}/weights.npy".format(path)
+        synth_dat_file = "{}/memblock_synth_0.dat".format(path)
+        sim_dat_file = "{}/memblock_sim_0.dat".format(path)
+        # save decoupled weights as .npy
+        self.make_weight_file(weights, "decoupled_npy", npy_file)
+        # also save weights as Verilog .dat files. two different files are
+        # provided, one for simulation and one for synthesis, because
+        # URAM-based weights always need zero weights for synthesis,
+        # otherwise they get inferred as BRAM
+        # sim weights are always the true weights
+        self.make_weight_file(weights, "decoupled_verilog_dat", sim_dat_file)
+        if self.get_nodeattr("ram_style") == "ultra":
+            # UltraRAM must have no memory initializer, or only zeroes;
+            # as a workaround a zero-weight init is provided here
+            synth_weights = np.zeros_like(weights, dtype=np.float32)
+        else:
+            synth_weights = weights
+        self.make_weight_file(synth_weights, "decoupled_verilog_dat", synth_dat_file)
+
+    def execute_node(self, context, graph):
+        """Execute this node with rtlsim and store the result in context.
+
+        The first node input is read from context, folded and saved as
+        input_0.npy in code_gen_dir_ipgen; the simulation output is written to
+        output.npy there, reloaded, reshaped to the normal output shape and
+        stored under the node's first output name.
+
+        Raises an Exception when exec_mode is "cppsim" (not possible for this
+        node), when exec_mode is any other value than "rtlsim", or when the
+        node has more than three inputs.
+        """
+        mode = self.get_nodeattr("exec_mode")
+        mem_mode = self.get_nodeattr("mem_mode")
+        node = self.onnx_node
+
+        # TODO ensure codegen dir exists
+        if mode == "cppsim":
+            raise Exception(
+                "cppsim not possible for RTL MVAU, please set exec_mode to rtlsim"
+            )
+        if mode != "rtlsim":
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+        # from here on mode is known to be "rtlsim"
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        export_idt = self.get_input_datatype()
+
+        # create a npy file for the data input of the node (in_ind is input index)
+        for in_ind, inp_name in enumerate(node.input):
+            # it is assumed that the first input of the node is the data input
+            # the second input are the weights
+            # the third input are the thresholds
+            if in_ind == 0:
+                assert (
+                    str(context[inp_name].dtype) == "float32"
+                ), """Input datatype is
+                not float32 as expected."""
+                expected_inp_shape = self.get_folded_input_shape()
+                reshaped_input = context[inp_name].reshape(expected_inp_shape)
+                # make copy before saving the array
+                reshaped_input = reshaped_input.copy()
+                np.save(
+                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                    reshaped_input,
+                )
+            elif in_ind > 2:
+                raise Exception("Unexpected input found for MatrixVectorActivation")
+
+        sim = self.get_rtlsim()
+        nbits = self.get_instream_width()
+        inp = npy_to_rtlsim_input(
+            "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+        )
+        super().reset_rtlsim(sim)
+        super().toggle_clk(sim)
+        if mem_mode == "external" or mem_mode == "decoupled":
+            wnbits = self.get_weightstream_width()
+            export_wdt = self.get_weight_datatype()
+            wei = npy_to_rtlsim_input(
+                "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits
+            )
+            # the weight stream is repeated once per input vector
+            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+            io_dict = {
+                "inputs": {"in0": inp, "weights": wei * num_w_reps},
+                "outputs": {"out": []},
+            }
+            self.rtlsim_multi_io(sim, io_dict)
+            output = io_dict["outputs"]["out"]
+        else:
+            output = self.rtlsim(sim, inp)
+        odt = self.get_output_datatype()
+        target_bits = odt.bitwidth()
+        packed_bits = self.get_outstream_width()
+        out_npy_path = "{}/output.npy".format(code_gen_dir)
+        out_shape = self.get_folded_output_shape()
+        rtlsim_output_to_npy(
+            output, out_npy_path, odt, out_shape, packed_bits, target_bits
+        )
+        # load and reshape output
+        output = np.load(out_npy_path)
+        oshape = self.get_normal_output_shape()
+        output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+        context[node.output[0]] = output
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        """Normally: Generates C++ code and tcl script for IP generation.
+        Here: delegates to generate_hdl(), which produces the
+        (System-)Verilog code; model, fpgapart and clk are not used."""
+        self.generate_hdl()
+        return None
+
+    def ipgen_singlenode_code(self):
+        """Normally: Builds the bash script for IP generation.
+        Here: no-op that returns None."""
+        return None
+
+    def code_generation_cppsim(self, model):
+        """Normally: Generates C++ code for simulation (cppsim).
+        Here: no-op that returns None."""
+        return None
+
+    def compile_singlenode_code(self):
+        """No-op stub: performs no work and returns None."""
+        return None
+
+    def global_includes(self):
+        """No-op stub: performs no work and returns None."""
+        return None
+
+    def defines(self, var):
+        """No-op stub: ignores var, performs no work and returns None."""
+        return None
+
+    def read_npy_data(self):
+        """No-op stub: performs no work and returns None."""
+        return None
+
+    def strm_decl(self):
+        """No-op stub: performs no work and returns None."""
+        return None
+
+    def docompute(self):
+        """No-op stub: performs no work and returns None."""
+        return None
+
+    def dataoutstrm(self):
+        """No-op stub: performs no work and returns None."""
+        return None
+
+    def save_as_npy(self):
+        """No-op stub: performs no work and returns None."""
+        return None
+
+    def blackboxfunction(self):
+        """No-op stub: performs no work and returns None."""
+        return None
+
+    def pragmas(self):
+        """No-op stub: performs no work and returns None."""
+        return None
+
+    def code_generation_ipi(self):
+        """Return the list of Tcl command strings used to place this node in a
+        block design.
+
+        For mem_mode "decoupled" a hierarchy cell named after the node is
+        created, containing the node's own IP (ip_vlnv attribute) and a
+        memstream IP that feeds the weight stream; clock, reset and the
+        in/out streams are wired to pins of the hierarchy, and an axilite pin
+        is added when runtime_writeable_weights is 1. For "const" and
+        "external" the base class implementation is returned. Any other
+        mem_mode raises an Exception; ram_style "ultra" without
+        runtime_writeable_weights=1 fails an assertion.
+        """
+        cmd = []
+        # add streamer if needed
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "decoupled":
+            runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
+            if self.get_nodeattr("ram_style") == "ultra":
+                assert (
+                    runtime_writable == 1
+                ), "Layer with URAM weights must have runtime_writeable_weights=1"
+            node_name = self.onnx_node.name
+            sname = self.hls_sname()
+            # create a hierarchy for this layer, with the same port names
+            clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
+            rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
+            cmd.append("create_bd_cell -type hier %s" % node_name)
+            cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
+            cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
+            cmd.append(
+                "create_bd_intf_pin -mode Master "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
+                % (node_name, dout_name)
+            )
+            cmd.append(
+                "create_bd_intf_pin -mode Slave "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
+            )
+            # instantiate the node's own IP (identified by the ip_vlnv attribute)
+            # inside the hierarchy, under the same name as the hierarchy
+            cmd.append(
+                "create_bd_cell -type ip -vlnv %s /%s/%s"
+                % (self.get_nodeattr("ip_vlnv"), node_name, node_name)
+            )
+            # instantiate a streamer and connect it to the node's IP
+            strm_vlnv = "xilinx.com:user:memstream:1.0"
+            strm_inst = node_name + "_wstrm"
+            cmd.append(
+                "create_bd_cell -type ip -vlnv %s /%s/%s"
+                % (strm_vlnv, node_name, strm_inst)
+            )
+            # configure the streamer: a single stream whose depth is calc_wmem()
+            # and whose width is the padded weight stream width; MEM_INIT is
+            # set to the code_gen_dir_ipgen directory
+            cmd.append(
+                "set_property -dict [list "
+                "CONFIG.NSTREAMS {1} "
+                "CONFIG.MEM_DEPTH {%d} "
+                "CONFIG.MEM_WIDTH {%d} "
+                "CONFIG.MEM_INIT {%s} "
+                "CONFIG.RAM_STYLE {%s} "
+                "CONFIG.STRM0_DEPTH {%d} "
+                "CONFIG.STRM0_WIDTH {%d} "
+                "CONFIG.STRM0_OFFSET {0} "
+                "] [get_bd_cells /%s/%s]"
+                % (
+                    self.calc_wmem(),
+                    self.get_weightstream_width_padded(),
+                    self.get_nodeattr("code_gen_dir_ipgen") + "/",
+                    self.get_nodeattr("ram_style"),
+                    self.calc_wmem(),
+                    self.get_weightstream_width_padded(),
+                    node_name,
+                    strm_inst,
+                )
+            )
+            # streamer output -> weights input of the node's IP
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] "
+                "[get_bd_intf_pins %s/%s/weights_%s]"
+                % (node_name, strm_inst, node_name, node_name, sname)
+            )
+            # hierarchy reset and clock -> streamer
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]"
+                % (node_name, rst_name, node_name, strm_inst)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]"
+                % (node_name, clk_name, node_name, strm_inst)
+            )
+            # hierarchy reset and clock -> node's IP
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+                % (node_name, rst_name, node_name, node_name, rst_name)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+                % (node_name, clk_name, node_name, node_name, clk_name)
+            )
+            # hierarchy input and output streams <-> node's IP
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                "[get_bd_intf_pins %s/%s/%s]"
+                % (node_name, din_name, node_name, node_name, din_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                "[get_bd_intf_pins %s/%s/%s]"
+                % (node_name, dout_name, node_name, node_name, dout_name)
+            )
+            if runtime_writable:
+                # expose axi lite interface for writeable weights
+                axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0]
+                cmd.append(
+                    "create_bd_intf_pin -mode Slave "
+                    "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s"
+                    % (node_name, axilite_name)
+                )
+                # the axilite pin is wired to the streamer, not to the node's IP
+                cmd.append(
+                    "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                    "[get_bd_intf_pins %s/%s/%s]"
+                    % (node_name, axilite_name, node_name, strm_inst, axilite_name)
+                )
+                # TODO calculate and pass in segment size here
+                cmd.append("assign_bd_address")
+            cmd.append("save_bd_design")
+        elif mem_mode == "const" or mem_mode == "external":
+            # base class impl sufficient for const/external modes
+            return super().code_generation_ipi()
+        else:
+            raise Exception("Unrecognized mem_mode for MatrixVectorActivation")
+        return cmd
+
+    def get_verilog_top_module_intf_names(self):
+        """Extend the base class interface dictionary: in external mode the
+        weight stream is appended to "s_axis"; in decoupled mode "axilite" is
+        set to ["s_axilite"] when runtime_writeable_weights is 1."""
+        names = super().get_verilog_top_module_intf_names()
+        mem_mode = self.get_nodeattr("mem_mode")
+        sname = self.hls_sname()
+        if mem_mode == "external":
+            weight_port = ("weights_" + sname, self.get_weightstream_width_padded())
+            names["s_axis"].append(weight_port)
+        if mem_mode == "decoupled":
+            # only expose axilite interface if attribute is set
+            if self.get_nodeattr("runtime_writeable_weights") == 1:
+                names["axilite"] = ["s_axilite"]
+        return names
+
+    def get_op_and_param_counts(self):
+        """Return a dict with the MAC operation count and the weight parameter
+        count of this node, plus the threshold parameter count when
+        noActivation is 0. The keys encode the bit-widths involved."""
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        act_bits = self.get_input_datatype().bitwidth()
+        n_vectors = int(np.prod(self.get_nodeattr("numInputVectors")))
+        # canonicalize op type: lowest bitwidth operand first s.t.
+        # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types
+        narrow, wide = min(act_bits, weight_bits), max(act_bits, weight_bits)
+        counts = {
+            "op_mac_%dbx%db" % (narrow, wide): mw * mh * n_vectors,
+            "param_weight_%db" % (weight_bits): mw * mh,
+        }
+        if self.get_nodeattr("noActivation") == 0:
+            # threshold width follows the accumulator datatype
+            thres_bits = DataType[self.get_nodeattr("accDataType")].bitwidth()
+            counts["param_threshold_%db" % (thres_bits)] = mh
+        return counts
+
+    def derive_characteristic_fxns(self, period):
+        """Build the all-zero rtlsim stimulus dict (plus a weight stream for the
+        decoupled/external mem_mode) and hand it to the base implementation."""
+        # one zero word per entry of the folded input shape, innermost dim excluded
+        num_in_words = np.prod(self.get_folded_input_shape()[:-1])
+        stimuli = {"in0": [0 for _ in range(num_in_words)]}
+        rtlsim_dict = {"inputs": stimuli, "outputs": {"out": []}}
+
+        if self.get_nodeattr("mem_mode") in ("decoupled", "external"):
+            # weight stimulus length: calc_wmem() words for each input vector
+            wmem_depth = self.calc_wmem()
+            repetitions = np.prod(self.get_nodeattr("numInputVectors"))
+            stimuli["weights"] = [0 for _ in range(repetitions * wmem_depth)]
+
+        # the base class consumes the dict via override_rtlsim_dict
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=rtlsim_dict)
+
+    def generate_hdl(self):
+        """Instantiate the RTL template for this node inside code_gen_dir_ipgen.
+
+        Fetches the template path and substitution dictionary from
+        prepare_codegen_default(), replaces each dictionary key found in the
+        template by its value and writes the result to "<gen_top_module>_impl.sv".
+        Finally ipgen_path and ip_path are set to the code generation directory.
+
+        Raises:
+            Exception: if the ram_style attribute is anything other than "auto".
+        """
+        # TODO: add distinction between (PE=MH or PE=1) and where MH dimension is folded
+        template_path, code_gen_dict = self.prepare_codegen_default()
+
+        # add general parameters to dictionary
+        code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()]
+        # save top module name so we can refer to it after this node has been renamed
+        # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
+        self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
+        # TODO: currently only ram_style=auto is supported
+        if self.get_nodeattr("ram_style") != "auto":
+            raise Exception("Unrecognized ram_style for MatrixVectorActivation")
+
+        # apply code generation to the template
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        with open(template_path, "r") as f:
+            template = f.read()
+        for key, value in code_gen_dict.items():
+            # accept a list of lines as well as a bare scalar (e.g. an int
+            # parameter) and turn it into one string separated by '\n'
+            lines = value if isinstance(value, (list, tuple)) else [value]
+            template = template.replace(key, "\n".join(str(line) for line in lines))
+        # write the instantiated template into the code generation directory
+        impl_name = self.get_nodeattr("gen_top_module") + "_impl.sv"
+        with open(os.path.join(code_gen_dir, impl_name), "w") as f:
+            f.write(template)
+        # TODO: emit "<gen_top_module>_wrapper.v" once a wrapper template is
+        # available; none is loaded in this method, so there is nothing to
+        # substitute into or write yet
+
+        # set ipgen_path and ip_path so that HLS-Synth transformation
+        # and stitch_ip transformation do not complain
+        self.set_nodeattr("ipgen_path", code_gen_dir)
+        self.set_nodeattr("ip_path", code_gen_dir)
+
+    def prepare_codegen_default(self):
+        """Return the template path and the placeholder dictionary for the default
+        MVAU template; every value is a list of strings, as generate_hdl() expects."""
+        # TODO: Differentiate between PE folding and fully unrolled along MH dimension
+        template_path = (
+            os.environ["FINN_ROOT"] + "/finn-rtllib/mvau/dsp58_mvau_template.vhdl"
+        )
+        code_gen_dict = {}
+        for attr in ("PE", "SIMD", "MW", "MH"):
+            # "\n".join() in generate_hdl() needs lists of str, not bare ints
+            code_gen_dict["$%s$" % attr] = [str(self.get_nodeattr(attr))]
+        in_bits = self.get_input_datatype(0).bitwidth()
+        code_gen_dict["$ACTIVATION_WIDTH$"] = [str(in_bits)]
+        code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())]
+        code_gen_dict["$ACCU_WIDTH_BA$"] = [str(self.get_output_datatype().bitwidth())]
+        return template_path, code_gen_dict
+

From e965396e4ddf4848fc9a17b04fa4908a0924568e Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 5 Jan 2023 10:40:52 +0000
Subject: [PATCH 044/235] [thresholding] skip test for unsupported cppsim
 configuration and merge tests

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 ...fpgadataflow_thresholding_binary_search.py | 65 +++----------------
 1 file changed, 9 insertions(+), 56 deletions(-)

diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
index ab98189ea5..947109794e 100755
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
@@ -96,6 +96,7 @@ def make_single_thresholding_binary_search_modelwrapper(
     mem_mode,
     num_input_vecs,
 ):
+
     NumChannels = thresholds.shape[0]
 
     inp = helper.make_tensor_value_info(
@@ -223,61 +224,6 @@ def test_fpgadataflow_thresholding_binary_search_unit():
     return
 
 
-# Test brief: Prove that cppsim is not supported for this class
-@pytest.mark.fpgadataflow
-@pytest.mark.vivado
-def test_fpgadataflow_thresholding_binary_search_cppsim():
-    input_data_type = DataType["UINT16"]
-    act = DataType["BIPOLAR"]
-    fold = -1
-    num_input_channels = 16
-    # 'const' is unsupported see test:
-    # test_fpgadataflow_thresholding_binary_search_const_mem_mode()
-    mem_mode = "decoupled"
-
-    pe = generate_pe_value(fold, num_input_channels)
-    num_steps = act.get_num_possible_values() - 1
-
-    # Generate random, non-decreasing thresholds
-    thresholds = generate_random_threshold_values(
-        input_data_type, num_input_channels, num_steps
-    )
-    thresholds = sort_thresholds_increasing(thresholds)
-
-    # Other non-input parameters
-    num_input_vecs = [1, 2, 2]
-    output_data_type = act
-    if output_data_type == DataType["BIPOLAR"]:
-        activation_bias = 0
-    else:
-        activation_bias = output_data_type.min()
-
-    # Generate model from input parameters to the test
-    model = make_single_thresholding_binary_search_modelwrapper(
-        thresholds,
-        pe,
-        input_data_type,
-        output_data_type,
-        activation_bias,
-        mem_mode,
-        num_input_vecs,
-    )
-
-    # Cppsim is not supported for this class, catch the specific exception thrown by
-    # cppsim. Exception raised in cppsim: Custom op_type Thresholding_Binary_Search is
-    # currently not supported.
-    try:
-        model = model.transform(PrepareCppSim())
-        model = model.transform(CompileCppSim())
-        model = model.transform(SetExecMode("cppsim"))
-    except Exception as e:
-        if (
-            str(e)
-            != "Custom op_type Thresholding_Binary_Search is currently not supported."
-        ):
-            raise
-
-
 # Test brief: Prove that memory mode 'const' is not supported for this layer type
 @pytest.mark.fpgadataflow
 @pytest.mark.vivado
@@ -384,16 +330,23 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim():
 # no need to test 'const' mode, it's already done in:
 # test_fpgadataflow_thresholding_binary_search_const_mem_mode()
 @pytest.mark.parametrize("mem_mode", ["decoupled"])
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.fpgadataflow
 @pytest.mark.vivado
 @pytest.mark.slow
 def test_fpgadataflow_thresholding_binary_search(
-    activation, input_data_type, fold, num_input_channels, mem_mode
+    activation, input_data_type, fold, num_input_channels, mem_mode, exec_mode
 ):
     # Handle inputs to the test
     pe = generate_pe_value(fold, num_input_channels)
     num_steps = activation.get_num_possible_values() - 1
 
+    # Cppsim is not supported for this node (as it is an RTL node)
+    if exec_mode == "cppsim":
+        pytest.skip("cppsim not supported for RTL Thresholding Binary Search node")
+    elif exec_mode != "rtlsim":
+        raise Exception("Unknown exec_mode: {}".format(exec_mode))
+
     # Other non-input parameters
     num_input_vecs = [1, 2, 2]
     output_data_type = activation

From 2b8a674573e3415e54665ff05a2db75d5c20f30f Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 5 Jan 2023 11:07:40 +0000
Subject: [PATCH 045/235] [thresholding] moving find_next_power_of_2() to the
 util suite

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 src/finn/util/basic.py                        | 17 ++++
 ...fpgadataflow_thresholding_binary_search.py | 82 -------------------
 tests/util/test_basic.py                      | 62 ++++++++++++++
 3 files changed, 79 insertions(+), 82 deletions(-)
 create mode 100755 tests/util/test_basic.py

diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index 4aba87216c..9a66cf90eb 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -216,3 +216,20 @@ def is_exe(fpath):
                 return exe_file
 
     return None
+
+def find_next_power_of_2(n):
+    """Return the smallest power of two that is >= n.
+
+    Special cases: n <= 0 yields 0, and n == 1 yields 2 (i.e. 2**1)."""
+    if n <= 0:
+        # the bit-clearing loop below would never terminate for negatives
+        return 0
+    if n == 1:
+        # the loop below would produce 0 for this input
+        return 2
+    # start from n - 1 so that an exact power of two maps onto itself
+    remaining = n - 1
+    # clear the lowest set bit until only the most significant one survives
+    while remaining & (remaining - 1):
+        remaining &= remaining - 1
+    return remaining << 1
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
index 947109794e..29fc2828b6 100755
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
@@ -142,88 +142,6 @@ def make_single_thresholding_binary_search_modelwrapper(
     return model
 
 
-# Test brief: a particular method for this class was causing a bug:
-# find_next_power_of_2()
-# Weights in the thresholding core are programmed on a per-channel basis and are
-# byte-addressable. When a channel is programmed, the next channel can start
-# programming at the next power-of-2 byte boundary. This test is to show that the
-# function that calculates that boundary is working correctly.
-#
-# A Thresholding_Binary_Search layer was created and a SW generated dataset with a
-# threshold channel depth of 1 weight (1 layer of N channels in the thresholding core).
-# However, find_next_power_of_2() was returning a next-power-of-2 address boundary at
-# address '0', instead of '2'. This unit test is to prove that this bug no longer
-# occurs. It was originally seen when the input datatype was 'DataType["BIPOLAR"]'.
-@pytest.mark.fpgadataflow
-@pytest.mark.vivado
-def test_fpgadataflow_thresholding_binary_search_unit():
-    activation = DataType["BIPOLAR"]
-    input_data_type = DataType["INT16"]
-    fold = -1
-    num_input_channels = 16
-    mem_mode = "decoupled"
-
-    # Handle inputs to the test
-    pe = generate_pe_value(fold, num_input_channels)
-    num_steps = activation.get_num_possible_values() - 1
-
-    # Other non-input parameters
-    num_input_vecs = [1, 2, 2]
-    output_data_type = activation
-    if output_data_type == DataType["BIPOLAR"]:
-        activation_bias = 0
-    else:
-        activation_bias = output_data_type.min()
-
-    # Generate random thresholds and sort in ascending order
-    thresholds = generate_random_threshold_values(
-        input_data_type, num_input_channels, num_steps
-    )
-
-    # Generate model from input parameters to the test
-    model = make_single_thresholding_binary_search_modelwrapper(
-        thresholds,
-        pe,
-        input_data_type,
-        output_data_type,
-        activation_bias,
-        mem_mode,
-        num_input_vecs,
-    )
-
-    # Retrieve the class to get the method-under-test
-    tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0]
-    tbs_inst = getCustomOp(tbs_node)
-
-    test_vector = [
-        {"input": -2, "expected_result": 0},
-        {"input": -1, "expected_result": 0},
-        {"input": 0, "expected_result": 0},
-        {"input": 1, "expected_result": 2},
-        {"input": 2, "expected_result": 2},
-        {"input": 3, "expected_result": 4},
-        {"input": 4, "expected_result": 4},
-        {"input": 7, "expected_result": 8},
-        {"input": 8, "expected_result": 8},
-        {"input": 11, "expected_result": 16},
-        {"input": 15, "expected_result": 16},
-        {"input": 16, "expected_result": 16},
-        {"input": 18, "expected_result": 32},
-        {"input": 27, "expected_result": 32},
-        {"input": 31, "expected_result": 32},
-        {"input": 32, "expected_result": 32},
-        {"input": 42, "expected_result": 64},
-        {"input": 65, "expected_result": 128},
-    ]
-
-    for test_dict in test_vector:
-        output = tbs_inst.find_next_power_of_2(test_dict["input"])
-        assert output >= test_dict["input"]
-        assert output == test_dict["expected_result"]
-
-    return
-
-
 # Test brief: Prove that memory mode 'const' is not supported for this layer type
 @pytest.mark.fpgadataflow
 @pytest.mark.vivado
diff --git a/tests/util/test_basic.py b/tests/util/test_basic.py
new file mode 100755
index 0000000000..d2586f4f19
--- /dev/null
+++ b/tests/util/test_basic.py
@@ -0,0 +1,62 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import finn.util.basic as basic
+
+
+@pytest.mark.util
+def test_next_power_of_2():
+    """find_next_power_of_2() rounds up to a power of two; 0 for n <= 0, 2 for 1."""
+    # (input, expected result) pairs
+    cases = [
+        (-2, 0),
+        (-1, 0),
+        (0, 0),
+        (1, 2),
+        (2, 2),
+        (3, 4),
+        (4, 4),
+        (7, 8),
+        (8, 8),
+        (11, 16),
+        (15, 16),
+        (16, 16),
+        (18, 32),
+        (27, 32),
+        (31, 32),
+        (32, 32),
+        (42, 64),
+        (65, 128),
+    ]
+
+    for value, expected in cases:
+        result = basic.find_next_power_of_2(value)
+        assert result >= value
+        assert result == expected

From 45bb19f2821bde10cf7303a193869160fd46c72e Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 5 Jan 2023 11:22:08 +0000
Subject: [PATCH 046/235] [thresholding] remove find_next_power_of_2() from
 thresholding binary search CustomOp class

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../thresholding_binary_search.py             | 21 ++-----------------
 1 file changed, 2 insertions(+), 19 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 9cbe049be3..c681bb2631 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -38,7 +38,7 @@
 )
 
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
-from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir, find_next_power_of_2
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
     pack_innermost_dim_as_hex_string,
@@ -647,23 +647,6 @@ def get_verilog_top_module_intf_names(self):
 
         return intf_names
 
-    def find_next_power_of_2(self, n):
-        # Negative values will loop infinitely below - return 0
-        if n <= 0:
-            return 0
-        # If '1' is requested, output will be '0' in the loop below, avoid this now.
-        elif n == 1:
-            return 2  # i.e. 2**1
-
-        # decrement 'n' (to handle cases when `n` itself is a power of 2)
-        n = n - 1
-
-        # loop until only one bit is left
-        while n & n - 1:
-            # unset rightmost bit
-            n = n & n - 1
-        return n << 1
-
     def twos_comp(self, val, bitwidth):
         return (val + (1 << bitwidth)) % (1 << bitwidth)
 
@@ -678,7 +661,7 @@ def get_dynamic_config(self, model, address_stride=1):
         thresholds = model.get_initializer(self.onnx_node.input[1])
         num_channels, num_weights_per_channel = thresholds.shape
 
-        weight_addr_boundary = self.find_next_power_of_2(num_weights_per_channel)
+        weight_addr_boundary = find_next_power_of_2(num_weights_per_channel)
         # Make sure that the next power of 2 (output) is greater than the input
         assert weight_addr_boundary >= num_weights_per_channel
 

From ca0042225c006d4545e26b0e0f1221ecd4ab68c3 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 5 Jan 2023 12:58:01 +0000
Subject: [PATCH 047/235] [thresholding] replace math functions with existing
 functions

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../custom_op/fpgadataflow/thresholding_binary_search.py  | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index c681bb2631..9113e4f9d9 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -647,12 +647,6 @@ def get_verilog_top_module_intf_names(self):
 
         return intf_names
 
-    def twos_comp(self, val, bitwidth):
-        return (val + (1 << bitwidth)) % (1 << bitwidth)
-
-    def prep_axilite_val(self, val):
-        return self.twos_comp(int(val), self.get_weight_datatype().bitwidth())
-
     def get_dynamic_config(self, model, address_stride=1):
         """Returns a configuration dictionary containing axilite write commands
         in order to program the thresholds into the RTL core during runtime.
@@ -677,7 +671,7 @@ def get_dynamic_config(self, model, address_stride=1):
                 )
                 config[key_name] = (
                     channel_start_addr + addr,
-                    self.prep_axilite_val(weight),
+                    int(str(pack_innermost_dim_as_hex_string([weight], self.get_weight_datatype(), self.get_weight_datatype().bitwidth())), 0),
                 )
 
                 weight_cntr += 1

From 7f3455fc0d1dafedaf8cdfca8144dea41747a624 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 5 Jan 2023 13:16:11 +0000
Subject: [PATCH 048/235] [thresholding] remove concept of mem_mode for RTL
 thresholding binary search node

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../thresholding_binary_search.py             | 38 ------------
 ...fpgadataflow_thresholding_binary_search.py | 58 +------------------
 2 files changed, 1 insertion(+), 95 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 9113e4f9d9..954850562e 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -88,19 +88,6 @@ def get_nodeattr_types(self):
             # [4] is four vectors (like a FC layer with batch=4)
             # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
             "numInputVectors": ("ints", False, [1]),
-            # memory mode for the thresholds
-            # const -- embedded thresholds, default
-            # decoupled -- streaming thresholds with streamer packaged inside IP
-            "mem_mode": ("s", False, "const", {"const", "decoupled"}),
-            # (mem_mode = decoupled only) whether weights (thresholds) will be
-            # writable through an AXI-lite interface during runtime
-            # 1 for enabled, 0 for disabled.
-            # see finn-rtllib/memstream/doc/README for more about the memory
-            # address map used for writable weights
-            # IMPORTANT: After using AXI lite to either read or write the weights,
-            # always "flush" the accelerator by first passing a dummy input
-            # vector through the accelerator. This will get rid of any old
-            # weight data from the weight FIFOs.
             "gen_top_module": ("s", False, ""),
             "activation_bias": ("i", False, 0),
             "clkFreq": ("i", False, 200000000),
@@ -150,12 +137,6 @@ def get_outstream_width(self, ind=0):
         return o_bits * self.get_nodeattr("PE")
 
     def get_weightstream_width(self):
-        # Only 'decoupled' mode is supported
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode != "decoupled":
-            raise Exception(
-                "Unrecognized memory mode for this node: {}".format(mem_mode)
-            )
         pe = self.get_nodeattr("PE")
         wp = self.get_weight_datatype().bitwidth()
         n_thres_steps = self.get_nodeattr("numSteps")
@@ -442,13 +423,6 @@ def code_generation_ipgen(self, model, fpgapart, clk):
         self.generate_params(model, code_gen_dir)
 
     def generate_params(self, model, path):
-        # Only 'decoupled' mode is supported
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode != "decoupled":
-            raise Exception(
-                "Unrecognized memory mode for this node: {}".format(mem_mode)
-            )
-
         code_gen_dir = path
         weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir)
         thresholds = model.get_initializer(self.onnx_node.input[1])
@@ -508,12 +482,6 @@ def execute_node(self, context, graph):
                     self.get_nodeattr("exec_mode"), "rtlsim"
                 )
             )
-        if self.get_nodeattr("mem_mode") != "decoupled":
-            raise Exception(
-                "Invalid mem_mode value: {}; mem_mode must be set to '{}'".format(
-                    self.get_nodeattr("mem_mode"), "decoupled"
-                )
-            )
 
         node = self.onnx_node
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
@@ -635,12 +603,6 @@ def get_verilog_top_module_intf_names(self):
         Each block must have at most one aximm and one axilite."""
 
         intf_names = super().get_verilog_top_module_intf_names()
-        # Only 'decoupled' mode is supported - check before adding axilite interface
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode != "decoupled":
-            raise Exception(
-                "Unrecognized memory mode for this node: {}".format(mem_mode)
-            )
         intf_names["axilite"] = ["s_axilite"]
         intf_names["s_axis"] = [["s_axis"]]
         intf_names["m_axis"] = [["m_axis"]]
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
index 29fc2828b6..7ef5da8f23 100755
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
@@ -93,7 +93,6 @@ def make_single_thresholding_binary_search_modelwrapper(
     input_data_type,
     output_data_type,
     activation_bias,
-    mem_mode,
     num_input_vecs,
 ):
 
@@ -121,7 +120,6 @@ def make_single_thresholding_binary_search_modelwrapper(
         weightDataType=input_data_type.name,
         outputDataType=output_data_type.name,
         activation_bias=activation_bias,
-        mem_mode=mem_mode,
         numInputVectors=num_input_vecs,
     )
     graph = helper.make_graph(
@@ -142,54 +140,6 @@ def make_single_thresholding_binary_search_modelwrapper(
     return model
 
 
-# Test brief: Prove that memory mode 'const' is not supported for this layer type
-@pytest.mark.fpgadataflow
-@pytest.mark.vivado
-def test_fpgadataflow_thresholding_binary_search_const_mem_mode():
-    input_data_type = DataType["INT16"]
-    activation = DataType["INT4"]
-    fold = -1
-    num_input_channels = 16
-    mem_mode = "const"
-
-    pe = generate_pe_value(fold, num_input_channels)
-    num_input_vecs = [1, 2, 2]
-    output_data_type = activation
-    activation_bias = output_data_type.min()
-
-    # Generate random thresholds and sort in ascending order
-    num_steps = activation.get_num_possible_values() - 1
-    thresholds = generate_random_threshold_values(
-        input_data_type, num_input_channels, num_steps
-    )
-
-    # Generate model from input parameters to the test
-    model = make_single_thresholding_binary_search_modelwrapper(
-        thresholds,
-        pe,
-        input_data_type,
-        output_data_type,
-        activation_bias,
-        mem_mode,
-        num_input_vecs,
-    )
-
-    # Prove that 'const' memory mode is not supported for this class
-    # 'const' memory mode is not supported for this class, catch the specific exception
-    # thrown by FINN. Exception: ('Unrecognized memory mode for this node:', 'const')
-    try:
-        model = model.transform(InsertFIFO(True))
-        model = model.transform(GiveUniqueNodeNames())
-        model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
-        model = model.transform(HLSSynthIP())
-        model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
-    except Exception as e:
-        if str(e) != "Unrecognized memory mode for this node: {}".format(mem_mode):
-            raise
-        # Caught the expected exception, leave the test early
-        return
-
-
 # Test brief: Test that PrepareRTLSim() runs successfully. This function is not
 # tested in test_fpgadataflow_thresholding_binary_search()
 @pytest.mark.fpgadataflow
@@ -199,7 +149,6 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim():
     act = DataType["INT4"]
     fold = -1
     num_input_channels = 16
-    mem_mode = "decoupled"
 
     # Handle inputs to the test
     pe = generate_pe_value(fold, num_input_channels)
@@ -226,7 +175,6 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim():
         input_data_type,
         output_data_type,
         activation_bias,
-        mem_mode,
         num_input_vecs,
     )
 
@@ -245,15 +193,12 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim():
 @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]])
 @pytest.mark.parametrize("fold", [-1])  # 1, 2, etc. will fail
 @pytest.mark.parametrize("num_input_channels", [16])
-# no need to test 'const' mode, it's already done in:
-# test_fpgadataflow_thresholding_binary_search_const_mem_mode()
-@pytest.mark.parametrize("mem_mode", ["decoupled"])
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.fpgadataflow
 @pytest.mark.vivado
 @pytest.mark.slow
 def test_fpgadataflow_thresholding_binary_search(
-    activation, input_data_type, fold, num_input_channels, mem_mode, exec_mode
+    activation, input_data_type, fold, num_input_channels, exec_mode
 ):
     # Handle inputs to the test
     pe = generate_pe_value(fold, num_input_channels)
@@ -304,7 +249,6 @@ def test_fpgadataflow_thresholding_binary_search(
         input_data_type,
         output_data_type,
         activation_bias,
-        mem_mode,
         num_input_vecs,
     )
 

From 4bc69f1a374821b16b80826946223a0a36cae787 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 5 Jan 2023 16:20:02 +0000
Subject: [PATCH 049/235] [thresholding] add methods needed for convertingToHls
 transformation

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../thresholding_binary_search.py             | 40 +++++++++++++++++--
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 954850562e..c342d235d9 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -101,10 +101,23 @@ def calc_tmem(self):
         return num_channels // pe
 
     def make_shape_compatible_op(self, model):
-        return []
+        """Return a constant-shape op built from this node's normal output shape."""
+        return super().make_const_shape_op(self.get_normal_output_shape())
 
     def infer_node_datatype(self, model):
-        return
+        """Adopt the datatype of the input tensor (warning if it differs from the
+        current input datatype) and annotate the output tensor's datatype."""
+        onnx_node = self.onnx_node
+        tensor_dt = model.get_tensor_datatype(onnx_node.input[0])
+        current_dt = self.get_input_datatype()
+        if tensor_dt != current_dt:
+            warnings.warn(
+                "inputDataType changing for %s: %s -> %s "
+                % (onnx_node.name, str(current_dt.name), str(tensor_dt.name))
+            )
+        self.set_nodeattr("inputDataType", tensor_dt.name)
+        # the output tensor is annotated with this node's own output datatype
+        model.set_tensor_datatype(onnx_node.output[0], self.get_output_datatype())
 
     def verify_node(self):
         return []
@@ -126,7 +139,28 @@ def get_weight_datatype(self):
         return DataType[self.get_nodeattr("weightDataType")]
 
     def minimize_accumulator_width(self, model):
-        return None
+        """Minimize threshold width ('accumulator width' here due to convention).
+
+        Picks the smallest DataType covering both the input range and all the
+        thresholds, stores it as weightDataType and returns it."""
+        thresholds = model.get_initializer(self.onnx_node.input[1])
+        hls_thresholds = self.get_hls_compatible_threshold_tensor(thresholds)
+        idt = self.get_input_datatype()
+        # range that has to be representable: inputs and thresholds combined
+        lower = min(idt.min(), thresholds.min())
+        upper = max(idt.max(), thresholds.max())
+        if lower < 0:
+            # two's complement: a type reaching down to -upper - 1 reaches up to upper
+            bound = lower if abs(lower) > upper else -upper - 1
+        else:
+            bound = upper
+        # smallest datatype able to hold that bound
+        tdt = DataType.get_smallest_possible(bound)
+        assert np.vectorize(tdt.allowed)(hls_thresholds).all(), (
+            "Thresholds can't be expressed with type %s" % str(tdt)
+        )
+        self.set_nodeattr("weightDataType", tdt.name)
+        return DataType[self.get_nodeattr("weightDataType")]
 
     def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()

From 3b6a1980b8ac28f5a809125d1e06eeb5ab2ba3b5 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 5 Jan 2023 16:32:23 +0000
Subject: [PATCH 050/235] [thresholding] add convertingToHls transformation for
 thresholding binary search RTL node

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../fpgadataflow/convert_to_hls_layers.py     | 93 +++++++++++++++----
 1 file changed, 73 insertions(+), 20 deletions(-)

diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index 525af7ea92..17f839c5c5 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -1062,9 +1062,10 @@ def apply(self, model):
 class InferThresholdingLayer(Transformation):
     """Convert any MultiThreshold into a standalone thresholding HLS layer."""
 
-    def __init__(self, mem_mode="const"):
+    def __init__(self, mem_mode="const", use_rtl_variant=False):
         super().__init__()
         self.mem_mode = mem_mode
+        self.use_rtl_variant = use_rtl_variant
 
     def apply(self, model):
         graph = model.graph
@@ -1118,26 +1119,78 @@ def apply(self, model):
                 )
                 actval = int(actval)
                 assert (not odt.signed()) or (actval < 0), (
-                    node.name + ": Signed output requres actval < 0"
-                )
-                # create and insert new Thresholding_Batch node
-                new_node = helper.make_node(
-                    "Thresholding_Batch",
-                    [thl_input, thl_threshold],
-                    [thl_output],
-                    domain="finn.custom_op.fpgadataflow",
-                    backend="fpgadataflow",
-                    NumChannels=ifc,
-                    PE=pe,
-                    numSteps=thl_thres_shape[1],
-                    inputDataType=idt.name,
-                    weightDataType=idt.name,  # will be set by MinimizeAccumulatorWidth
-                    outputDataType=odt.name,
-                    numInputVectors=list(thl_in_shape[:-1]),
-                    ActVal=actval,
-                    mem_mode=self.mem_mode,
-                    name="Thresholding_Batch_" + node.name,
+                    node.name + ": Signed output requires actval < 0"
                 )
+
+                # Ensure that RTL variant is not inserted for unsupported configuration
+                is_rtl_variant_compatible = True
+
+                # Perform checks for RTL variant if chosen
+                if self.use_rtl_variant:
+                    # RTL variant is only available for mem_mode == "decoupled"
+                    if self.mem_mode != "decoupled":
+                        warnings.warn(
+                            """%s : RTL Thresholding requires 'decoupled' memory mode.
+                            Falling back to HLS implementation."""
+                            % node.name
+                        )
+                        is_rtl_variant_compatible = False
+
+                    # Check PE/SIMD value
+                    if pe != 1:
+                        warnings.warn(
+                            """%s : RTL Thresholding does not support paralellisation.
+                            Only a PE value of 1 is supported.
+                            Falling back to HLS implementation."""
+                            % node.name
+                        )
+                        is_rtl_variant_compatible = False
+
+                if self.use_rtl_variant and is_rtl_variant_compatible:
+                    new_node = helper.make_node(
+                        "Thresholding_Binary_Search",
+                        [thl_input, thl_threshold],
+                        [thl_output],
+                        domain="finn.custom_op.fpgadataflow",
+                        backend="fpgadataflow",
+                        NumChannels=ifc,
+                        PE=pe,
+                        numSteps=thl_thres_shape[1],
+                        inputDataType=idt.name,
+                        weightDataType=idt.name,  # will be set by MinimizeAccumulatorWidth
+                        outputDataType=odt.name,
+                        numInputVectors=list(thl_in_shape[:-1]),
+                        activation_bias=actval,
+                        mem_mode=self.mem_mode,
+                        name="Thresholding_Binary_Search_" + node.name,
+                    )
+                else:
+                    if self.use_rtl_variant:
+                        warnings.warn(
+                        """%s : RTL Thresholding requested for unsupported
+                            configuration. Falling back to HLS implementation."""
+                        % node.name
+                    )
+
+                    # create and insert new Thresholding_Batch node
+                    new_node = helper.make_node(
+                        "Thresholding_Batch",
+                        [thl_input, thl_threshold],
+                        [thl_output],
+                        domain="finn.custom_op.fpgadataflow",
+                        backend="fpgadataflow",
+                        NumChannels=ifc,
+                        PE=pe,
+                        numSteps=thl_thres_shape[1],
+                        inputDataType=idt.name,
+                        weightDataType=idt.name,  # will be set by MinimizeAccumulatorWidth
+                        outputDataType=odt.name,
+                        numInputVectors=list(thl_in_shape[:-1]),
+                        ActVal=actval,
+                        mem_mode=self.mem_mode,
+                        name="Thresholding_Batch_" + node.name,
+                    )
+
                 graph.node.insert(insert_point, new_node)
                 # remove old node
                 graph.node.remove(node)

From b3800cd7e258cecb0466cb9238eeb37ff738d660 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 5 Jan 2023 16:34:09 +0000
Subject: [PATCH 051/235] [thresholding] add test for convertingToHls
 transformation for thresholding binary search node

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../test_convert_to_hls_thresholding.py       | 322 ++++++++++++++++++
 1 file changed, 322 insertions(+)
 create mode 100755 tests/fpgadataflow/test_convert_to_hls_thresholding.py

diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
new file mode 100755
index 0000000000..30932638b6
--- /dev/null
+++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
@@ -0,0 +1,322 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+from onnx import TensorProto, helper
+from pyverilator.util.axi_utils import axilite_write, reset_rtlsim
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.general.multithreshold import multithreshold
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import gen_finn_dt_tensor
+
+from finn.core.rtlsim_exec import rtlsim_exec
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+
+test_fpga_part = "xczu3eg-sbva484-1-e"
+target_clk_ns = 5
+
+
+# Helper functions
+def sort_thresholds_increasing(thresholds):
+    return np.sort(thresholds, axis=1)
+
+
+def generate_random_threshold_values(input_data_type, num_input_channels, num_steps):
+    return np.random.randint(
+        input_data_type.min(),
+        input_data_type.max() + 1,
+        (num_input_channels, num_steps),
+    ).astype(np.float32)
+
+
+def generate_pe_value(fold, num_input_channels):
+    if fold == -1:
+        fold = num_input_channels
+    pe = num_input_channels // fold
+    assert num_input_channels % pe == 0
+    return pe
+
+
+# n = batch, c = channel, h = height, w = width of feature map
+# Standard = NCHW; FINN = NHWC
+# Convert from NCHW to NHWC
+def convert_np_array_to_finn_data_layout(data):
+    return np.transpose(data, (0, 2, 3, 1))
+
+
+# n = batch, c = channel, h = height, w = width of feature map
+# Standard = NCHW; FINN = NHWC
+# Convert from NHWC to NCHW
+def convert_np_array_to_standard_data_layout(data):
+    return np.transpose(data, (0, 3, 1, 2))
+
+
+def make_single_thresholding_binary_search_modelwrapper(
+    thresholds,
+    pe,
+    input_data_type,
+    output_data_type,
+    activation_bias,
+    num_input_vecs,
+):
+    NumChannels = thresholds.shape[0]
+
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, num_input_vecs + [NumChannels]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, num_input_vecs + [NumChannels]
+    )
+
+    node_inp_list = ["inp", "thresh"]
+
+    Thresholding_node = helper.make_node(
+        "Thresholding_Binary_Search",
+        node_inp_list,
+        ["outp"],
+        domain="finn.custom_op.fpgadataflow",
+        backend="fpgadataflow",
+        NumChannels=NumChannels,
+        PE=pe,
+        numSteps=thresholds.shape[1],
+        inputDataType=input_data_type.name,
+        weightDataType=input_data_type.name,
+        outputDataType=output_data_type.name,
+        numInputVectors=num_input_vecs,
+        activation_bias=activation_bias,
+    )
+    graph = helper.make_graph(
+        nodes=[Thresholding_node],
+        name="thresholding_graph",
+        inputs=[inp],
+        outputs=[outp],
+    )
+
+    model = helper.make_model(graph, producer_name="thresholding-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", input_data_type)
+    model.set_tensor_datatype("outp", output_data_type)
+
+    model.set_tensor_datatype("thresh", input_data_type)
+    model.set_initializer("thresh", thresholds)
+    return model
+
+
+def make_single_multithresholding_modelwrapper(
+    thresholds,
+    pe,
+    input_data_type,
+    output_data_type,
+    activation_bias,
+    num_input_vecs,
+):
+    NumChannels = thresholds.shape[0]
+
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, num_input_vecs + [NumChannels]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, num_input_vecs + [NumChannels]
+    )
+
+    node_inp_list = ["inp", "thresh"]
+
+    Multithresholding_node = helper.make_node(
+        "MultiThreshold",
+        node_inp_list,
+        ["outp"],
+        domain="qonnx.custom_op.general",
+        out_dtype=output_data_type.name,
+        out_bias=float(activation_bias),
+        out_scale=1.0,
+    )
+
+    graph = helper.make_graph(
+        nodes=[Multithresholding_node],
+        name="multithresholding_graph",
+        inputs=[inp],
+        outputs=[outp],
+    )
+
+    model = helper.make_model(graph, producer_name="multithresholding-model")
+    model = ModelWrapper(model)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(GiveUniqueNodeNames())
+
+    model.set_tensor_datatype("inp", input_data_type)
+    model.set_tensor_datatype("outp", output_data_type)
+
+    model.set_tensor_datatype("thresh", input_data_type)
+    model.set_initializer("thresh", thresholds)
+    return model
+
+
+@pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]])
+@pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]])
+@pytest.mark.parametrize("fold", [-1])
+@pytest.mark.parametrize("num_input_channels", [16])
+@pytest.mark.parametrize("mem_mode", ["decoupled", "const"])
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+def test_convert_to_hls_tbs_rtl_variant(activation, input_data_type, fold, num_input_channels, mem_mode):
+    # Handle inputs to the test
+    pe = generate_pe_value(fold, num_input_channels)
+    num_steps = activation.get_num_possible_values() - 1
+
+    # Cppsim is not supported for this node (as it is an RTL node)
+    if mem_mode == "const":
+        pytest.skip("const memory mode not supported for RTL Thresholding Binary Search node")
+    elif mem_mode != "decoupled":
+        raise Exception("Unknown mem_mode: {}".format(mem_mode))
+
+    if activation == DataType["BIPOLAR"]:
+        pytest.skip("Only negative activations are supported for RTL Thresholding Binary Search node")
+
+    # Other non-input parameters
+    num_input_vecs = [1, 2, 2]
+    output_data_type = activation
+    if output_data_type == DataType["BIPOLAR"]:
+        activation_bias = 0
+    else:
+        activation_bias = output_data_type.min()
+
+    # generate random input data
+    tensor_shape = tuple(num_input_vecs + [num_input_channels])
+    x = gen_finn_dt_tensor(input_data_type, tensor_shape)
+
+    # Generate random thresholds and sort in ascending order
+    thresholds = generate_random_threshold_values(
+        input_data_type, num_input_channels, num_steps
+    )
+
+    # provide non-decreasing/ascending thresholds
+    thresholds = sort_thresholds_increasing(thresholds)
+
+    x_nhwc = convert_np_array_to_standard_data_layout(x)
+    y = multithreshold(x_nhwc, thresholds)
+
+    # convert back to NHWC for comparison to hw outputs
+    y = convert_np_array_to_finn_data_layout(y)
+    if activation == DataType["BIPOLAR"]:
+        # binary to bipolar
+        y = 2 * y - 1
+    else:
+        # signed offset
+        y += activation.min()
+
+    # Generate model from input parameters to the test
+    model = make_single_thresholding_binary_search_modelwrapper(
+        thresholds,
+        pe,
+        input_data_type,
+        output_data_type,
+        activation_bias,
+        num_input_vecs,
+    )
+
+    model = model.transform(InsertFIFO(True))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
+
+    # Retrieve the axilite programming sequence for weights - for decoupled mode only
+    tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0]
+    tbs_inst = getCustomOp(tbs_node)
+    config = tbs_inst.get_dynamic_config(model)
+
+    # Reshape generated data (not from model)
+    oshape = model.get_tensor_shape("outp")
+    y_expected = y.reshape(oshape)
+
+    # Helper function that delivers the hook to program the thresholds via AXI-Lite
+    def config_hook(config):
+        if config is None:
+            return None
+
+        def write_thresh_config(sim):
+            # axi_name = "s_axilite_0_" # works
+            axi_name = getCustomOp(
+                model.get_nodes_by_op_type("Thresholding_Binary_Search")[0]
+            ).get_verilog_top_module_intf_names()["axilite"][0]
+            axi_name += "_0_"
+
+            # Write config registers to the Threshold memory.
+            # The dictionary defines (addr, value) tuples.
+            for config_entry in config.values():
+                addr = config_entry[0]
+                val = config_entry[1]
+                axilite_write(sim, addr, val, basename=axi_name)
+
+            reset_rtlsim(sim)
+
+        return write_thresh_config
+
+    input_dict = {"inp": x}
+    rtlsim_exec(model, input_dict, pre_hook=config_hook(config))
+    y_produced = input_dict["outp"]
+    assert (y_produced == y_expected).all()
+
+    #### Make a Multithreshold graph and convert to thresholding binary search node
+    new_model = make_single_multithresholding_modelwrapper(
+        thresholds,
+        pe,
+        input_data_type,
+        output_data_type,
+        activation_bias,
+        num_input_vecs,
+    )
+
+    # Recreate the model using the ConvertToHLS transform
+    new_model = new_model.transform(to_hls.InferThresholdingLayer(mem_mode=mem_mode, use_rtl_variant=True))
+    new_model = new_model.transform(InsertFIFO(True))
+    new_model = new_model.transform(GiveUniqueNodeNames())
+    new_model = new_model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+    new_model = new_model.transform(HLSSynthIP())
+    new_model = new_model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
+
+    input_dict = {"inp": x}
+    rtlsim_exec(new_model, input_dict, pre_hook=config_hook(config))
+    y_produced_new = input_dict["outp"]
+    assert (y_produced_new == y_expected).all()

From 11464d87c4857dd2227935c198adbb6115250fe3 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 5 Jan 2023 16:35:32 +0000
Subject: [PATCH 052/235] [thresholding] skip tests with unsupported folding
 factor input

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 tests/fpgadataflow/test_convert_to_hls_thresholding.py      | 6 +++++-
 .../test_fpgadataflow_thresholding_binary_search.py         | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
index 30932638b6..3b56f40d9c 100755
--- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py
+++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
@@ -194,7 +194,7 @@ def make_single_multithresholding_modelwrapper(
 
 @pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]])
 @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]])
-@pytest.mark.parametrize("fold", [-1])
+@pytest.mark.parametrize("fold", [-1, 1, 2])
 @pytest.mark.parametrize("num_input_channels", [16])
 @pytest.mark.parametrize("mem_mode", ["decoupled", "const"])
 @pytest.mark.fpgadataflow
@@ -213,6 +213,10 @@ def test_convert_to_hls_tbs_rtl_variant(activation, input_data_type, fold, num_i
     if activation == DataType["BIPOLAR"]:
         pytest.skip("Only negative activations are supported for RTL Thresholding Binary Search node")
 
+    # Paralellisation not supported for thresholding binary search rtl node
+    if pe != 1:
+        pytest.skip("Paralellisation of IP not supported for RTL Thresholding Binary Search node")
+
     # Other non-input parameters
     num_input_vecs = [1, 2, 2]
     output_data_type = activation
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
index 7ef5da8f23..0be91a2569 100755
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
@@ -191,7 +191,7 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim():
 # N.B. - fold factor of '-1' is supported only (no PE/SIMD support)
 @pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]])
 @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]])
-@pytest.mark.parametrize("fold", [-1])  # 1, 2, etc. will fail
+@pytest.mark.parametrize("fold", [-1, 1, 2])
 @pytest.mark.parametrize("num_input_channels", [16])
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.fpgadataflow
@@ -204,6 +204,10 @@ def test_fpgadataflow_thresholding_binary_search(
     pe = generate_pe_value(fold, num_input_channels)
     num_steps = activation.get_num_possible_values() - 1
 
+    # Paralellisation not supported for thresholding binary search rtl node
+    if pe != 1:
+        pytest.skip("Paralellisation of IP not supported for RTL Thresholding Binary Search node")
+
     # Cppsim is not supported for this node (as it is an RTL node)
     if exec_mode == "cppsim":
         pytest.skip("cppsim not supported for RTL Thresholding Binary Search node")

From e71b1c0e1487befd8ec04ac6ebcc0caf8d63b4a3 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 5 Jan 2023 16:45:16 +0000
Subject: [PATCH 053/235] [thresholding] add comments for attributes

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index c342d235d9..711e3a8270 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -88,8 +88,12 @@ def get_nodeattr_types(self):
             # [4] is four vectors (like a FC layer with batch=4)
             # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
             "numInputVectors": ("ints", False, [1]),
+            # name of the top module in verilog template. Used by PyVerilator
+            # and IPI generation
             "gen_top_module": ("s", False, ""),
+            # bias to be applied to outputs of the node
             "activation_bias": ("i", False, 0),
+            # used for IPI step
             "clkFreq": ("i", False, 200000000),
         }
         my_attrs.update(super().get_nodeattr_types())

From 3be1140fe68058c55fc1e3685609b6964ce7e993 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 5 Jan 2023 17:01:05 +0000
Subject: [PATCH 054/235] [thresholding] replace min() with signed() function

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 711e3a8270..2073e95b41 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -393,7 +393,7 @@ def prepare_codegen_rtl_values(self):
 
         # Is the input datatype signed or unsigned?
         # The thresholding core needs to know this when comparing weights to inputs
-        if self.get_input_datatype().min() < 0:
+        if self.get_input_datatype().signed():
             code_gen_dict["$SIGN$"] = ["signed"]
         else:
             code_gen_dict["$SIGN$"] = ["unsigned"]

From e05effc20cd2e357f5bba38d2e320144b313c9f5 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 5 Jan 2023 17:40:42 +0000
Subject: [PATCH 055/235] [thresholding] fix formatting from pre-commit

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../thresholding_binary_search.py             | 13 +++++++++--
 .../fpgadataflow/convert_to_hls_layers.py     |  6 ++---
 src/finn/util/basic.py                        |  1 +
 .../test_convert_to_hls_thresholding.py       | 22 ++++++++++++++-----
 ...fpgadataflow_thresholding_binary_search.py |  4 +++-
 5 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 2073e95b41..d5d5c48cce 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -38,7 +38,7 @@
 )
 
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
-from finn.util.basic import get_rtlsim_trace_depth, make_build_dir, find_next_power_of_2
+from finn.util.basic import find_next_power_of_2, get_rtlsim_trace_depth, make_build_dir
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
     pack_innermost_dim_as_hex_string,
@@ -671,7 +671,16 @@ def get_dynamic_config(self, model, address_stride=1):
                 )
                 config[key_name] = (
                     channel_start_addr + addr,
-                    int(str(pack_innermost_dim_as_hex_string([weight], self.get_weight_datatype(), self.get_weight_datatype().bitwidth())), 0),
+                    int(
+                        str(
+                            pack_innermost_dim_as_hex_string(
+                                [weight],
+                                self.get_weight_datatype(),
+                                self.get_weight_datatype().bitwidth(),
+                            )
+                        ),
+                        0,
+                    ),
                 )
 
                 weight_cntr += 1
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index 17f839c5c5..a0461bda82 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -1167,10 +1167,10 @@ def apply(self, model):
                 else:
                     if self.use_rtl_variant:
                         warnings.warn(
-                        """%s : RTL Thresholding requested for unsupported
+                            """%s : RTL Thresholding requested for unsupported
                             configuration. Falling back to HLS implementation."""
-                        % node.name
-                    )
+                            % node.name
+                        )
 
                     # create and insert new Thresholding_Batch node
                     new_node = helper.make_node(
diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index 9a66cf90eb..8782bd7f8c 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -217,6 +217,7 @@ def is_exe(fpath):
 
     return None
 
+
 def find_next_power_of_2(n):
     # Negative values will loop infinitely below - return 0
     if n <= 0:
diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
index 3b56f40d9c..d0502a9b74 100755
--- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py
+++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
@@ -40,8 +40,8 @@
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.util.basic import gen_finn_dt_tensor
 
-from finn.core.rtlsim_exec import rtlsim_exec
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.core.rtlsim_exec import rtlsim_exec
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
@@ -199,23 +199,31 @@ def make_single_multithresholding_modelwrapper(
 @pytest.mark.parametrize("mem_mode", ["decoupled", "const"])
 @pytest.mark.fpgadataflow
 @pytest.mark.vivado
-def test_convert_to_hls_tbs_rtl_variant(activation, input_data_type, fold, num_input_channels, mem_mode):
+def test_convert_to_hls_tbs_rtl_variant(
+    activation, input_data_type, fold, num_input_channels, mem_mode
+):
     # Handle inputs to the test
     pe = generate_pe_value(fold, num_input_channels)
     num_steps = activation.get_num_possible_values() - 1
 
     # Cppsim is not supported for this node (as it is an RTL node)
     if mem_mode == "const":
-        pytest.skip("const memory mode not supported for RTL Thresholding Binary Search node")
+        pytest.skip(
+            "const memory mode not supported for RTL Thresholding Binary Search node"
+        )
     elif mem_mode != "decoupled":
         raise Exception("Unknown mem_mode: {}".format(mem_mode))
 
     if activation == DataType["BIPOLAR"]:
-        pytest.skip("Only negative activations are supported for RTL Thresholding Binary Search node")
+        pytest.skip(
+            "Only negative activations are supported for RTL Thresholding Binary Search node"
+        )
 
     # Paralellisation not supported for thresholding binary search rtl node
     if pe != 1:
-        pytest.skip("Paralellisation of IP not supported for RTL Thresholding Binary Search node")
+        pytest.skip(
+            "Paralellisation of IP not supported for RTL Thresholding Binary Search node"
+        )
 
     # Other non-input parameters
     num_input_vecs = [1, 2, 2]
@@ -313,7 +321,9 @@ def write_thresh_config(sim):
     )
 
     # Recreate the model using the ConvertToHLS transform
-    new_model = new_model.transform(to_hls.InferThresholdingLayer(mem_mode=mem_mode, use_rtl_variant=True))
+    new_model = new_model.transform(
+        to_hls.InferThresholdingLayer(mem_mode=mem_mode, use_rtl_variant=True)
+    )
     new_model = new_model.transform(InsertFIFO(True))
     new_model = new_model.transform(GiveUniqueNodeNames())
     new_model = new_model.transform(PrepareIP(test_fpga_part, target_clk_ns))
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
index 0be91a2569..f1a03a3a89 100755
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
@@ -206,7 +206,9 @@ def test_fpgadataflow_thresholding_binary_search(
 
     # Paralellisation not supported for thresholding binary search rtl node
     if pe != 1:
-        pytest.skip("Paralellisation of IP not supported for RTL Thresholding Binary Search node")
+        pytest.skip(
+            "Paralellisation of IP not supported for RTL Thresholding Binary Search node"
+        )
 
     # Cppsim is not supported for this node (as it is an RTL node)
     if exec_mode == "cppsim":

From 48c33042bbc7b17f98510a8299504e4d36c3a2e8 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 5 Jan 2023 17:47:16 +0000
Subject: [PATCH 056/235] [thresholding] fix more flake8 formatting

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../fpgadataflow/convert_to_hls_layers.py            |  4 ++--
 .../fpgadataflow/test_convert_to_hls_thresholding.py | 12 +++++-------
 .../test_fpgadataflow_thresholding_binary_search.py  |  5 ++---
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index a0461bda82..f6dd466fab 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -1157,7 +1157,7 @@ def apply(self, model):
                         PE=pe,
                         numSteps=thl_thres_shape[1],
                         inputDataType=idt.name,
-                        weightDataType=idt.name,  # will be set by MinimizeAccumulatorWidth
+                        weightDataType=idt.name,
                         outputDataType=odt.name,
                         numInputVectors=list(thl_in_shape[:-1]),
                         activation_bias=actval,
@@ -1183,7 +1183,7 @@ def apply(self, model):
                         PE=pe,
                         numSteps=thl_thres_shape[1],
                         inputDataType=idt.name,
-                        weightDataType=idt.name,  # will be set by MinimizeAccumulatorWidth
+                        weightDataType=idt.name,
                         outputDataType=odt.name,
                         numInputVectors=list(thl_in_shape[:-1]),
                         ActVal=actval,
diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
index d0502a9b74..2785d91617 100755
--- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py
+++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
@@ -42,14 +42,10 @@
 
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 from finn.core.rtlsim_exec import rtlsim_exec
-from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 
 test_fpga_part = "xczu3eg-sbva484-1-e"
 target_clk_ns = 5
@@ -209,14 +205,16 @@ def test_convert_to_hls_tbs_rtl_variant(
     # Cppsim is not supported for this node (as it is an RTL node)
     if mem_mode == "const":
         pytest.skip(
-            "const memory mode not supported for RTL Thresholding Binary Search node"
+            "const memory mode not supported for " \
+            "RTL Thresholding Binary Search node"
         )
     elif mem_mode != "decoupled":
         raise Exception("Unknown mem_mode: {}".format(mem_mode))
 
     if activation == DataType["BIPOLAR"]:
         pytest.skip(
-            "Only negative activations are supported for RTL Thresholding Binary Search node"
+            "Only negative activations are supported for " \
+            "RTL Thresholding Binary Search node"
         )
 
     # Paralellisation not supported for thresholding binary search rtl node
@@ -310,7 +308,7 @@ def write_thresh_config(sim):
     y_produced = input_dict["outp"]
     assert (y_produced == y_expected).all()
 
-    #### Make a Multithreshold graph and convert to thresholding binary search node
+    # Make a Multithreshold graph and convert to thresholding binary search node
     new_model = make_single_multithresholding_modelwrapper(
         thresholds,
         pe,
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
index f1a03a3a89..a4eab1e181 100755
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
@@ -39,11 +39,9 @@
 from qonnx.util.basic import gen_finn_dt_tensor
 
 from finn.core.rtlsim_exec import rtlsim_exec
-from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
-from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
@@ -207,7 +205,8 @@ def test_fpgadataflow_thresholding_binary_search(
     # Paralellisation not supported for thresholding binary search rtl node
     if pe != 1:
         pytest.skip(
-            "Paralellisation of IP not supported for RTL Thresholding Binary Search node"
+            "Paralellisation of IP not supported for " \
+            "RTL Thresholding Binary Search node"
         )
 
     # Cppsim is not supported for this node (as it is an RTL node)

From 1e8a36ca3712100caeed506976a92c7e2ee4b4c4 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 5 Jan 2023 17:55:20 +0000
Subject: [PATCH 057/235] [thresholding] remove backslashes for flake8

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 tests/fpgadataflow/test_convert_to_hls_thresholding.py        | 4 ++--
 .../test_fpgadataflow_thresholding_binary_search.py           | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
index 2785d91617..217ee39d74 100755
--- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py
+++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
@@ -205,7 +205,7 @@ def test_convert_to_hls_tbs_rtl_variant(
     # Cppsim is not supported for this node (as it is an RTL node)
     if mem_mode == "const":
         pytest.skip(
-            "const memory mode not supported for " \
+            "const memory mode not supported for "
             "RTL Thresholding Binary Search node"
         )
     elif mem_mode != "decoupled":
@@ -213,7 +213,7 @@ def test_convert_to_hls_tbs_rtl_variant(
 
     if activation == DataType["BIPOLAR"]:
         pytest.skip(
-            "Only negative activations are supported for " \
+            "Only negative activations are supported for "
             "RTL Thresholding Binary Search node"
         )
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
index a4eab1e181..049d65835f 100755
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
@@ -205,7 +205,7 @@ def test_fpgadataflow_thresholding_binary_search(
     # Paralellisation not supported for thresholding binary search rtl node
     if pe != 1:
         pytest.skip(
-            "Paralellisation of IP not supported for " \
+            "Paralellisation of IP not supported for "
             "RTL Thresholding Binary Search node"
         )
 

From 08f1b5f49e0d5180fa739056209bc5f0a8589c7e Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 5 Jan 2023 18:00:57 +0000
Subject: [PATCH 058/235] [thresholding] more flake8 fixes

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 tests/fpgadataflow/test_convert_to_hls_thresholding.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
index 217ee39d74..45705dc833 100755
--- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py
+++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
@@ -205,8 +205,7 @@ def test_convert_to_hls_tbs_rtl_variant(
     # Cppsim is not supported for this node (as it is an RTL node)
     if mem_mode == "const":
         pytest.skip(
-            "const memory mode not supported for "
-            "RTL Thresholding Binary Search node"
+            "const memory mode not supported for this node"
         )
     elif mem_mode != "decoupled":
         raise Exception("Unknown mem_mode: {}".format(mem_mode))

From 481d773257e41ad04f2bb5e1b614decfac4312ab Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 5 Jan 2023 18:02:55 +0000
Subject: [PATCH 059/235] [thresholding] undo flake8 fixes

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 tests/fpgadataflow/test_convert_to_hls_thresholding.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
index 45705dc833..cee06ebec9 100755
--- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py
+++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
@@ -204,9 +204,7 @@ def test_convert_to_hls_tbs_rtl_variant(
 
     # Cppsim is not supported for this node (as it is an RTL node)
     if mem_mode == "const":
-        pytest.skip(
-            "const memory mode not supported for this node"
-        )
+        pytest.skip("const memory mode not supported for this node")
     elif mem_mode != "decoupled":
         raise Exception("Unknown mem_mode: {}".format(mem_mode))
 

From a51bef4e3ea906b056eb7fb3fbb114a2ae12b6aa Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 5 Jan 2023 18:04:59 +0000
Subject: [PATCH 060/235] [thresholding] another flake8 fix

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 tests/fpgadataflow/test_convert_to_hls_thresholding.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
index cee06ebec9..07821983e1 100755
--- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py
+++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
@@ -217,7 +217,7 @@ def test_convert_to_hls_tbs_rtl_variant(
     # Paralellisation not supported for thresholding binary search rtl node
     if pe != 1:
         pytest.skip(
-            "Paralellisation of IP not supported for RTL Thresholding Binary Search node"
+            "Paralellisation not supported for RTL Thresholding Binary Search node"
         )
 
     # Other non-input parameters

From 2c313ad01465f66a9e6f367cf6552f64b6a1dab3 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Fri, 6 Jan 2023 11:11:25 +0000
Subject: [PATCH 061/235] [thresholding] remove cppsim test file generation

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../custom_op/fpgadataflow/thresholding_binary_search.py   | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index d5d5c48cce..1a5faad72a 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -239,13 +239,6 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
             np.mod(orig_thres_matrix, 1), 0
         ).all(), "Need int threshold tensor"
         ret = orig_thres_matrix
-        # workaround for vivado_hls threshold bug
-        if ret[0][0] == 0 and n_thres_steps == 1:
-            ret = np.copy(ret)
-            ret[0][0] = 1
-            warnings.warn(
-                "Setting 0-valued first threshold to 1 to avoid vivado_hls bug"
-            )
         # ensure channels = mh , duplicating if necessary
         if ret.shape[0] == 1:
             ret = np.tile(ret, (mh, 1))

From 49bdd28e4edc3d47ccb57161e073fcde2a2cb216 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Fri, 6 Jan 2023 11:14:22 +0000
Subject: [PATCH 062/235] [thresholding] remove unnecessary data generation
 functions for simulators

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../thresholding_binary_search.py             | 107 ------------------
 1 file changed, 107 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 1a5faad72a..7b37b2029a 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -261,84 +261,6 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
         rows between PEs is not as expected (n_thres_steps)"""
         return ret.reshape(1, pe, tmem, n_thres_steps)
 
-    def make_weight_file(self, weights, weight_file_mode, weight_file_name):
-        """Produce a file containing given weights (thresholds) in appropriate
-        format for this layer. This file can be used for either synthesis or
-        run-time reconfig of weights.
-
-        Arguments:
-        * weights : numpy array with weights to be put into the file
-        * weight_file_mode : one of {hls_header, decoupled_verilog_dat,
-          decoupled_runtime}
-        * weight_file_name : filename for the weight file to be generated
-        """
-        # There are 'decoupled_*' flavors, just make sure that the flavors
-        # are decoupled related
-        if "decoupled" not in weight_file_mode:
-            raise Exception(
-                "Unrecognized memory mode for this node: {}".format(weight_file_mode)
-            )
-
-        threshold_tensor = self.get_hls_compatible_threshold_tensor(weights)
-        tdt = self.get_weight_datatype()
-        assert np.vectorize(tdt.allowed)(
-            threshold_tensor
-        ).all(), "Thresholds can't be expressed with type %s" % str(tdt)
-
-        # streaming thresholds need to be organized differently
-        # (1, pe, tmem, n_thres_steps) -> (1, tmem, pe, n_thres_steps)
-        decoupled_thres = np.transpose(threshold_tensor, (0, 2, 1, 3))
-        # (1, tmem, pe, n_thres_steps) -(1, tmem, pe * n_thres_steps)
-        pe = self.get_nodeattr("PE")
-        n_thres_steps = self.get_nodeattr("numSteps")
-        decoupled_thres_pe_flipped = np.flip(decoupled_thres, axis=-2)
-        decoupled_thres = decoupled_thres.reshape(1, -1, pe * n_thres_steps)
-        decoupled_thres = decoupled_thres.copy()
-        decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.reshape(
-            1, -1, pe * n_thres_steps
-        )
-        decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.copy()
-
-        if weight_file_mode == "decoupled_npy":
-            # save weight stream into npy for cppsim
-            np.save(weight_file_name, decoupled_thres)
-        elif weight_file_mode == "decoupled_verilog_dat":
-            # convert weight values into hexstring
-            weight_width = self.get_weightstream_width()
-            # pad to nearest 4 bits to get hex strings
-            weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
-            weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
-                decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix=""
-            )
-            weight_stream = weight_tensor_pe_flipped.flatten()
-            weight_stream = weight_stream.copy()
-            with open(weight_file_name, "w") as f:
-                for val in weight_stream:
-                    f.write(val + "\n")
-        elif weight_file_mode == "decoupled_runtime":
-            # memstream axi-lite interface will map each mem line to
-            # one or multiple 32-bit words
-            weight_width = self.get_weightstream_width()
-            words_per_memwidth = 2 ** ceil(log2(weight_width / 32))
-            if words_per_memwidth < 1:
-                words_per_memwidth = 1
-            weight_width_padded = words_per_memwidth * 32
-            # first, pack and ensure padding to 32 bits
-            weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
-                decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix=""
-            )
-            weight_stream = weight_tensor_pe_flipped.flatten()
-            weight_stream = weight_stream.copy()
-            with open(weight_file_name, "w") as f:
-                for val in weight_stream:
-                    # split into groups of 8 hex digits (= 32 bits)
-                    words_32b = textwrap.wrap(val, 8)
-                    words_32b.reverse()
-                    for word_32b in words_32b:
-                        f.write(word_32b + "\n")
-        else:
-            raise Exception("Decoupled weight export not yet implemented")
-
     # Get the integer from the DataType and string-ify it
     # This assumes that the data is in the form "INTx" or similar
     def conv_datatype_to_str(self, data_type):
@@ -449,35 +371,6 @@ def code_generation_ipgen(self, model, fpgapart, clk):
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
         self.set_nodeattr("ipgen_path", code_gen_dir)
         self.set_nodeattr("ip_path", code_gen_dir)
-
-        # Generate params for RTLSim
-        self.generate_params(model, code_gen_dir)
-
-    def generate_params(self, model, path):
-        code_gen_dir = path
-        weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir)
-        thresholds = model.get_initializer(self.onnx_node.input[1])
-        self.make_weight_file(thresholds, "decoupled_npy", weight_filename_sim)
-
-        # Verilog.dat thresholds:
-        # also save weights as Verilog .dat file
-        # note that we provide two different .dat files, one for synth
-        # and one for synthesis. this is because URAM-based weights always
-        # need zero weights for synthesis, otherwise they get inferred
-        # as BRAM
-        weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir)
-        weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir)
-        # sim weights are always the true weights
-        self.make_weight_file(
-            thresholds, "decoupled_verilog_dat", weight_filename_rtl_sim
-        )
-
-        # Synthesis thresholds:
-        synth_thresholds = thresholds
-        self.make_weight_file(
-            synth_thresholds, "decoupled_verilog_dat", weight_filename_rtl_synth
-        )
-
         return
 
     def prepare_rtlsim(self):

From e663030e98dc6c1f194ccec1d8e5d65b9599c19c Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Fri, 6 Jan 2023 11:27:46 +0000
Subject: [PATCH 063/235] [thresholding] remove potentially problematic helper
 function

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../thresholding_binary_search.py             | 20 ++++++-------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 7b37b2029a..b14eaa1669 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -261,14 +261,6 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
         rows between PEs is not as expected (n_thres_steps)"""
         return ret.reshape(1, pe, tmem, n_thres_steps)
 
-    # Get the integer from the DataType and string-ify it
-    # This assumes that the data is in the form "INTx" or similar
-    def conv_datatype_to_str(self, data_type):
-        # Handle the case that an int is passed to the function
-        if isinstance(data_type, int):
-            return str(data_type)
-        return str(DataType[data_type].bitwidth())
-
     def prepare_codegen_rtl_values(self):
         """All dictionary values produced in this function are to replace
         their key value(s) in the RTL template files"""
@@ -294,16 +286,16 @@ def prepare_codegen_rtl_values(self):
         bias = self.get_nodeattr("activation_bias")  # activation bias value
 
         code_gen_dict["$N$"] = [
-            self.conv_datatype_to_str(output_data_type)
-        ]  # output precision
+            str(DataType[output_data_type].bitwidth())
+        ]  # output precision - convert bitwidth to string
         code_gen_dict["$M$"] = [
-            self.conv_datatype_to_str(input_data_type)
-        ]  # input/threshold precision
+            str(DataType[input_data_type].bitwidth())
+        ]  # input/threshold precision - convert bitwidth to string
         code_gen_dict["$C$"] = [
-            self.conv_datatype_to_str(num_channels)
+            str(num_channels)
         ]  # number of channels
         code_gen_dict["$BIAS$"] = [
-            self.conv_datatype_to_str(bias)
+            str(bias)
         ]  # activation bias value
 
         # Is the input datatype signed or unsigned?

From 42dbf23938fdd1a302e88706302980c718a66d05 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Fri, 6 Jan 2023 11:35:36 +0000
Subject: [PATCH 064/235] [thresholding] implement flake8 formatting

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../custom_op/fpgadataflow/thresholding_binary_search.py  | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index b14eaa1669..6dc9130792 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -291,12 +291,8 @@ def prepare_codegen_rtl_values(self):
         code_gen_dict["$M$"] = [
             str(DataType[input_data_type].bitwidth())
         ]  # input/threshold precision - convert bitwidth to string
-        code_gen_dict["$C$"] = [
-            str(num_channels)
-        ]  # number of channels
-        code_gen_dict["$BIAS$"] = [
-            str(bias)
-        ]  # activation bias value
+        code_gen_dict["$C$"] = [str(num_channels)]  # number of channels
+        code_gen_dict["$BIAS$"] = [str(bias)]  # activation bias value
 
         # Is the input datatype signed or unsigned?
         # The thresholding core needs to know this when comparing weights to inputs

From 933d7476d3336a6aec9c4dea852acb25ebdf4b46 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Fri, 6 Jan 2023 11:38:11 +0000
Subject: [PATCH 065/235] [thresholding] remove unused imports

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 6dc9130792..9e1dd454f1 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -28,9 +28,7 @@
 
 import numpy as np
 import os
-import textwrap
 import warnings
-from math import ceil, log2
 from qonnx.core.datatype import DataType
 from qonnx.util.basic import (
     interleave_matrix_outer_dim_from_partitions,

From 5c6dcd9b89a7b35328676855c5c5ac13e06da90f Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Fri, 6 Jan 2023 11:40:13 +0000
Subject: [PATCH 066/235] [thresholding] remove last unused import

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 9e1dd454f1..a2e0f404b2 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -32,7 +32,6 @@
 from qonnx.core.datatype import DataType
 from qonnx.util.basic import (
     interleave_matrix_outer_dim_from_partitions,
-    roundup_to_integer_multiple,
 )
 
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp

From 51acd119eb8864ff302d0f040fcb0307c2778ccf Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Fri, 6 Jan 2023 11:42:12 +0000
Subject: [PATCH 067/235] [thresholding] reformat existing import

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index a2e0f404b2..595a643acc 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -30,9 +30,7 @@
 import os
 import warnings
 from qonnx.core.datatype import DataType
-from qonnx.util.basic import (
-    interleave_matrix_outer_dim_from_partitions,
-)
+from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions
 
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.basic import find_next_power_of_2, get_rtlsim_trace_depth, make_build_dir

From b886a5ae08e608808795bc584da0650eb8ff260f Mon Sep 17 00:00:00 2001
From: auphelia <jakobapk@web.de>
Date: Wed, 18 Jan 2023 11:25:51 +0000
Subject: [PATCH 068/235] [Docs] Add bin search thresholding to docs generation

---
 docs/finn/source_code/finn.custom_op.fpgadataflow.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
index fdcf44c6d9..3627855cfb 100644
--- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
+++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
@@ -203,6 +203,14 @@ finn.custom\_op.fpgadataflow.thresholding\_batch
    :undoc-members:
    :show-inheritance:
 
+finn.custom\_op.fpgadataflow.thresholding\_binary\_search
+-----------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.thresholding_binary_search
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 
 finn.custom\_op.fpgadataflow.tlastmarker
 -----------------------------------------------

From 2c3de2ab7ad12c89ee4af52e611532ff4255e258 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Mon, 23 Jan 2023 07:28:39 +0000
Subject: [PATCH 069/235] Corrected address width in Verilog wrapper for
 thresholding.

---
 finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
index e3f8596bc8..5068cb549c 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
@@ -51,7 +51,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	// Writing
 	input	                s_axilite_AWVALID,
 	output	                s_axilite_AWREADY,
-	input	[C_BITS+N-1:0]  s_axilite_AWADDR,
+	input	[C_BITS+N+1:0]  s_axilite_AWADDR,
 
 	input	        s_axilite_WVALID,
 	output	        s_axilite_WREADY,

From 7c9f5d8805b288a299cd1970d797af0d24327577 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Mon, 23 Jan 2023 11:57:16 +0000
Subject: [PATCH 070/235] [thresholding] remove bug affecting input width in
 top level wrapper

The C_BITS parameter is calculating the correct width needed for the top level wrapper for the thresholding binary search IP.
However, the parameter is not 'synthesizing' correctly and does not update the width for the affected s_axilite_AWADDR signal.
This results in the MSBs of the input signal being truncated. These missing bits affected addressing when writing weights into the core.
Weights were written to the incorrect addresses in the core causing incorrect thresholding to occur.

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
index 5068cb549c..768e7b6a5b 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
@@ -38,7 +38,6 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter  C = $C$,	// Channels
 	int BIAS = $BIAS$,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
 
-	parameter  C_BITS = C < 2 ? 1 : $clog2(C),
 	parameter  O_BITS = BIAS > 0?
 		/* unsigned */ $clog2(2**N-BIAS) :
 		/* signed */ 1+$clog2(BIAS >= 2**(N-1)? BIAS : 2**N-BIAS)
@@ -49,9 +48,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 
 	//- AXI Lite ------------------------
 	// Writing
-	input	                s_axilite_AWVALID,
-	output	                s_axilite_AWREADY,
-	input	[C_BITS+N+1:0]  s_axilite_AWADDR,
+	input	                   s_axilite_AWVALID,
+	output	                   s_axilite_AWREADY,
+	input	[$clog2(C)+N+1:0]  s_axilite_AWADDR,
 
 	input	        s_axilite_WVALID,
 	output	        s_axilite_WREADY,

From 3a0d59dd6717daedb043ea83d6873e6c663b0d06 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Mon, 23 Jan 2023 11:59:17 +0000
Subject: [PATCH 071/235] [thresholding] adjust thresholding binary search
 tests to use word addressing for programming thresholds with axilite

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 tests/fpgadataflow/test_convert_to_hls_thresholding.py          | 2 +-
 .../test_fpgadataflow_thresholding_binary_search.py             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
index 07821983e1..9486513402 100755
--- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py
+++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
@@ -271,7 +271,7 @@ def test_convert_to_hls_tbs_rtl_variant(
     # Retrieve the axilite programming sequence for weights - for decoupled mode only
     tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0]
     tbs_inst = getCustomOp(tbs_node)
-    config = tbs_inst.get_dynamic_config(model)
+    config = tbs_inst.get_dynamic_config(model, 4)
 
     # Reshape generated data (not from model)
     oshape = model.get_tensor_shape("outp")
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
index 049d65835f..2a34971f0d 100755
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
@@ -266,7 +266,7 @@ def test_fpgadataflow_thresholding_binary_search(
     # Retrieve the axilite programming sequence for weights - for decoupled mode only
     tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0]
     tbs_inst = getCustomOp(tbs_node)
-    config = tbs_inst.get_dynamic_config(model)
+    config = tbs_inst.get_dynamic_config(model, 4)
 
     # Reshape generated data (not from model)
     oshape = model.get_tensor_shape("outp")

From 757e3a1398948878e866f4fe5fe1747206a1c7d9 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Mon, 23 Jan 2023 12:05:40 +0000
Subject: [PATCH 072/235] [thresholding] adjust typo in exception

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 595a643acc..f2f9e133b2 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -421,7 +421,7 @@ def execute_node(self, context, graph):
                     reshaped_input,
                 )
             elif in_ind > 2:
-                raise Exception("Unexpected input found for Thresholding_Batch")
+                raise Exception("Unexpected input found for Thresholding_Binary_Search")
             in_ind += 1
 
         # Create a PyVerilator wrapper of the RTLSim .so

From 479575b224559680c559c7af5fd4f09582529919 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Mon, 23 Jan 2023 12:07:22 +0000
Subject: [PATCH 073/235] [thresholding] undo copyright header change - only
 needed for new files

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 src/finn/custom_op/fpgadataflow/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 0e17726d48..dc9a5a349a 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2022, Advanced Micro Devices, Inc.
+# Copyright (c) 2020, Xilinx
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without

From 0d99b6c8ed358b2feea41cc8af242d40b30c8d97 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Mon, 23 Jan 2023 12:54:19 +0000
Subject: [PATCH 074/235] [thresholding] add docstring for migrated
 find_next_power_of_2() function

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 src/finn/util/basic.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index 8782bd7f8c..ee185aa94f 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -219,6 +219,7 @@ def is_exe(fpath):
 
 
 def find_next_power_of_2(n):
+    """For any integer 'n', find the next greatest power of 2"""
     # Negative values will loop infinitely below - return 0
     if n <= 0:
         return 0

From 5a77a326558de1ecd59e61aae38575b73ac54b1b Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Mon, 23 Jan 2023 12:55:40 +0000
Subject: [PATCH 075/235] [thresholding] add docstrings for methods not in base
 class

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../fpgadataflow/thresholding_binary_search.py       | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index f2f9e133b2..7dfcd91d58 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -95,6 +95,7 @@ def get_nodeattr_types(self):
         return my_attrs
 
     def calc_tmem(self):
+        """Calculates and returns TMEM."""
         num_channels = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         return num_channels // pe
@@ -104,6 +105,8 @@ def make_shape_compatible_op(self, model):
         return super().make_const_shape_op(oshape)
 
     def infer_node_datatype(self, model):
+        """Used for FINN DataType inference: set the output tensors' datatypes
+        accordingly for this node"""
         node = self.onnx_node
         idt = model.get_tensor_datatype(node.input[0])
         if idt != self.get_input_datatype():
@@ -119,6 +122,8 @@ def infer_node_datatype(self, model):
         model.set_tensor_datatype(node.output[0], odt)
 
     def verify_node(self):
+        """Required by the FINN analysis module. Checks if custom ops in graph
+        are correctly built, with all attributes and inputs."""
         return []
 
     def bram_estimation(self):
@@ -170,6 +175,7 @@ def get_outstream_width(self, ind=0):
         return o_bits * self.get_nodeattr("PE")
 
     def get_weightstream_width(self):
+        """Returns weight stream width"""
         pe = self.get_nodeattr("PE")
         wp = self.get_weight_datatype().bitwidth()
         n_thres_steps = self.get_nodeattr("numSteps")
@@ -299,20 +305,24 @@ def prepare_codegen_rtl_values(self):
         return code_gen_dict
 
     def get_rtl_file_list(self):
+        """Thresholding binary search RTL file list"""
         return ["thresholding.sv", "thresholding_axi.sv", "thresholding_axi_wrapper.v"]
 
     def get_rtl_file_paths(self):
+        """Get full path of all RTL files"""
         rtl_root_dir = os.environ["FINN_ROOT"] + "/finn-rtllib/thresholding/hdl/"
         rtl_file_list = self.get_rtl_file_list()
         rtl_file_paths = [rtl_root_dir + file for file in rtl_file_list]
         return rtl_file_paths
 
     def get_rtl_template_data(self, path):
+        """Return RTL file contents as a template"""
         with open(path, "r") as f:
             template = f.read()
         return template
 
     def fill_in_rtl_template_data(self, replace_dict, template_data):
+        """Use attribute values to fill in RTL template placeholders"""
         template_data_cp = template_data
         for key in replace_dict:
             replacement_line = "\n".join(replace_dict[key])
@@ -320,11 +330,13 @@ def fill_in_rtl_template_data(self, replace_dict, template_data):
         return template_data_cp
 
     def dump_rtl_data(self, dest_dir, filename, data):
+        """Dump filled-in-template RTL files for future synthesis step"""
         with open(os.path.join(dest_dir, filename), "w") as f:
             f.write(data)
         return
 
     def generate_hdl(self):
+        """Prepare HDL files from templates for synthesis"""
         # Generate a dictionary of values to put in RTL template
         code_gen_dict = self.prepare_codegen_rtl_values()
 

From eeed0702125de77c293a4a702f213a1035829179 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Mon, 23 Jan 2023 12:56:22 +0000
Subject: [PATCH 076/235] [thresholding] remove unused method

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 7dfcd91d58..94182b4ea0 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -209,9 +209,6 @@ def get_number_output_values(self):
     def get_exp_cycles(self):
         return 0
 
-    def get_template_param_values(self):
-        return dict()
-
     def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
         """Convert the original numpy weight matrix orig_weight_matrix into
         a form suitable for passing to the hlslib call:

From c2708686e22c9eaff18a5314c2f470fbbcb819f0 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Fri, 27 Jan 2023 14:55:16 +0000
Subject: [PATCH 077/235] [thresholding] remove 'return' at end of function -
 not needed

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 tests/util/test_basic.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/util/test_basic.py b/tests/util/test_basic.py
index d2586f4f19..97a8c50261 100755
--- a/tests/util/test_basic.py
+++ b/tests/util/test_basic.py
@@ -58,5 +58,3 @@ def test_next_power_of_2():
         output = basic.find_next_power_of_2(test_dict["input"])
         assert output >= test_dict["input"]
         assert output == test_dict["expected_result"]
-
-    return

From af22177e50ae808072d87a9d0c5260ccb6c3a67f Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Fri, 27 Jan 2023 14:59:48 +0000
Subject: [PATCH 078/235] [thresholding] remove cppsim exec_mode from test -
 not exercised

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../test_fpgadataflow_thresholding_binary_search.py      | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
index 2a34971f0d..e57c4942c8 100755
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
@@ -191,12 +191,11 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim():
 @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]])
 @pytest.mark.parametrize("fold", [-1, 1, 2])
 @pytest.mark.parametrize("num_input_channels", [16])
-@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.fpgadataflow
 @pytest.mark.vivado
 @pytest.mark.slow
 def test_fpgadataflow_thresholding_binary_search(
-    activation, input_data_type, fold, num_input_channels, exec_mode
+    activation, input_data_type, fold, num_input_channels
 ):
     # Handle inputs to the test
     pe = generate_pe_value(fold, num_input_channels)
@@ -209,12 +208,6 @@ def test_fpgadataflow_thresholding_binary_search(
             "RTL Thresholding Binary Search node"
         )
 
-    # Cppsim is not supported for this node (as it is an RTL node)
-    if exec_mode == "cppsim":
-        pytest.skip("cppsim not supported for RTL Thresholding Binary Search node")
-    elif exec_mode != "rtlsim":
-        raise Exception("Unknown exec_mode: {}".format(exec_mode))
-
     # Other non-input parameters
     num_input_vecs = [1, 2, 2]
     output_data_type = activation

From fab120b8218b2bacf8a94a23c7d250d0c5df12b6 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Fri, 27 Jan 2023 15:02:38 +0000
Subject: [PATCH 079/235] [thresholding] remove unused attributes

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 94182b4ea0..43ae8e8233 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -75,9 +75,6 @@ def get_nodeattr_types(self):
             "inputDataType": ("s", True, ""),
             "weightDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
-            # input and output FIFO depths
-            "inFIFODepth": ("i", False, 0),
-            "outFIFODepth": ("i", False, 0),
             # number of input vectors, examples:
             # [1] is a single vector (like a FC layer with batch=1)
             # [4] is four vectors (like a FC layer with batch=4)

From 5d6c964443e0c41865a18e862830e0c27a307dd1 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Fri, 27 Jan 2023 15:47:41 +0000
Subject: [PATCH 080/235] [thresholding] adjust i/o port names on thresholding
 RTL wrapper

Originally s_axis and m_axis port names on the thresholding RTL wrapper could synthesise, but did not adhere to the FINN i/o signal naming convention.
The FINN compiler would not recognise the IP being synthesised and would rely on the IP having the correct IP/signal wiring steps in place.
The FINN compiler did not recognise s_axis/m_axis signal naming and therefore did not automatically set the clock frequency of the IP to match the rest of the network.
This required a Tcl command to set the clock frequency of the IP, as well as a user-configurable attribute to set the clock frequency for ease-of-use.

It turns out that this actually reduces user ease-of-use. Having the compiler take care of the clock signalling is preferred. To do this, the s_axis/m_axis
signals are renamed to in0_V/out_V, as the compiler expects, and this extra 'user configurability' can therefore be removed.

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../hdl/thresholding_axi_wrapper.v            | 26 ++++++++++---------
 .../thresholding_binary_search.py             | 19 --------------
 2 files changed, 14 insertions(+), 31 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
index 768e7b6a5b..c16bf264dd 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
@@ -43,7 +43,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 		/* signed */ 1+$clog2(BIAS >= 2**(N-1)? BIAS : 2**N-BIAS)
 )(
 	//- Global Control ------------------
+		(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *)
 	input	ap_clk,
+		(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *)
 	input	ap_rst_n,
 
 	//- AXI Lite ------------------------
@@ -72,14 +74,14 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	output	[ 1:0]  s_axilite_RRESP,
 
 	//- AXI Stream - Input --------------
-	output	s_axis_tready,
-	input	s_axis_tvalid,
-	input	[((M+7)/8)*8-1:0]  s_axis_tdata,
+	output	in0_V_TREADY,
+	input	in0_V_TVALID,
+	input	[((M+7)/8)*8-1:0]  in0_V_TDATA,
 
 	//- AXI Stream - Output -------------
-	input	m_axis_tready,
-	output	m_axis_tvalid,
-	output	[((O_BITS+7)/8)*8-1:0]  m_axis_tdata
+	input	out_V_TREADY,
+	output	out_V_TVALID,
+	output	[((O_BITS+7)/8)*8-1:0]  out_V_TDATA
 );
 
 	$MODULE_NAME_AXI$ #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS)) inst (
@@ -113,14 +115,14 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 		.s_axilite_RRESP(s_axilite_RRESP),
 
 		//- AXI Stream - Input --------------
-		.s_axis_tready(s_axis_tready),
-		.s_axis_tvalid(s_axis_tvalid),
-		.s_axis_tdata(s_axis_tdata),
+		.s_axis_tready(in0_V_TREADY),
+		.s_axis_tvalid(in0_V_TVALID),
+		.s_axis_tdata(in0_V_TDATA),
 
 		//- AXI Stream - Output -------------
-		.m_axis_tready(m_axis_tready),
-		.m_axis_tvalid(m_axis_tvalid),
-		.m_axis_tdata(m_axis_tdata)
+		.m_axis_tready(out_V_TREADY),
+		.m_axis_tvalid(out_V_TVALID),
+		.m_axis_tdata(out_V_TDATA)
 	);
 
 endmodule : $MODULE_NAME_AXI_WRAPPER$
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 43ae8e8233..97c9dd82c6 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -85,8 +85,6 @@ def get_nodeattr_types(self):
             "gen_top_module": ("s", False, ""),
             # bias to be applied to outputs of the node
             "activation_bias": ("i", False, 0),
-            # used for IPI step
-            "clkFreq": ("i", False, 200000000),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -477,10 +475,6 @@ def code_generation_ipi(self):
         cmd = []
         rtl_file_list = self.get_rtl_file_list()
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        node_name = self.onnx_node.name
-        dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0]
-        din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
-        clock_freq = self.get_nodeattr("clkFreq")
 
         for rtl_file in rtl_file_list:
             cmd.append(
@@ -493,16 +487,6 @@ def code_generation_ipi(self):
             % (self.get_nodeattr("gen_top_module"), self.onnx_node.name)
         )
 
-        cmd.append(
-            "set_property -dict [list CONFIG.FREQ_HZ {%d}] [%s %s/%s]"
-            % (clock_freq, "get_bd_intf_pins", node_name, din_name)
-        )
-
-        cmd.append(
-            "set_property -dict [list CONFIG.FREQ_HZ {%d}] [%s %s/%s]"
-            % (clock_freq, "get_bd_intf_pins", node_name, dout_name)
-        )
-
         return cmd
 
     def get_verilog_top_module_intf_names(self):
@@ -517,9 +501,6 @@ def get_verilog_top_module_intf_names(self):
 
         intf_names = super().get_verilog_top_module_intf_names()
         intf_names["axilite"] = ["s_axilite"]
-        intf_names["s_axis"] = [["s_axis"]]
-        intf_names["m_axis"] = [["m_axis"]]
-
         return intf_names
 
     def get_dynamic_config(self, model, address_stride=1):

From bdfa6cb97096680247b6648edf20d4c519dcad16 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Tue, 31 Jan 2023 11:41:24 +0000
Subject: [PATCH 081/235] [thresholding] remove duplicated test helper function

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../test_convert_to_hls_thresholding.py       | 54 +------------------
 1 file changed, 2 insertions(+), 52 deletions(-)

diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
index 9486513402..84521b395c 100755
--- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py
+++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
@@ -46,6 +46,8 @@
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from test_fpgadataflow_thresholding_binary_search import make_single_thresholding_binary_search_modelwrapper
+
 
 test_fpga_part = "xczu3eg-sbva484-1-e"
 target_clk_ns = 5
@@ -86,58 +88,6 @@ def convert_np_array_to_standard_data_layout(data):
     return np.transpose(data, (0, 3, 1, 2))
 
 
-def make_single_thresholding_binary_search_modelwrapper(
-    thresholds,
-    pe,
-    input_data_type,
-    output_data_type,
-    activation_bias,
-    num_input_vecs,
-):
-    NumChannels = thresholds.shape[0]
-
-    inp = helper.make_tensor_value_info(
-        "inp", TensorProto.FLOAT, num_input_vecs + [NumChannels]
-    )
-    outp = helper.make_tensor_value_info(
-        "outp", TensorProto.FLOAT, num_input_vecs + [NumChannels]
-    )
-
-    node_inp_list = ["inp", "thresh"]
-
-    Thresholding_node = helper.make_node(
-        "Thresholding_Binary_Search",
-        node_inp_list,
-        ["outp"],
-        domain="finn.custom_op.fpgadataflow",
-        backend="fpgadataflow",
-        NumChannels=NumChannels,
-        PE=pe,
-        numSteps=thresholds.shape[1],
-        inputDataType=input_data_type.name,
-        weightDataType=input_data_type.name,
-        outputDataType=output_data_type.name,
-        numInputVectors=num_input_vecs,
-        activation_bias=activation_bias,
-    )
-    graph = helper.make_graph(
-        nodes=[Thresholding_node],
-        name="thresholding_graph",
-        inputs=[inp],
-        outputs=[outp],
-    )
-
-    model = helper.make_model(graph, producer_name="thresholding-model")
-    model = ModelWrapper(model)
-
-    model.set_tensor_datatype("inp", input_data_type)
-    model.set_tensor_datatype("outp", output_data_type)
-
-    model.set_tensor_datatype("thresh", input_data_type)
-    model.set_initializer("thresh", thresholds)
-    return model
-
-
 def make_single_multithresholding_modelwrapper(
     thresholds,
     pe,

From 6809351c5210c87a199e8b4167fa54b2dd9a48c8 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Tue, 31 Jan 2023 12:24:41 +0000
Subject: [PATCH 082/235] [thresholding] assert on finding unsupported memory
 mode for thresholding binary search HLS conversion function

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../fpgadataflow/convert_to_hls_layers.py            | 12 ++++--------
 .../fpgadataflow/test_convert_to_hls_thresholding.py | 11 ++---------
 2 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index f6dd466fab..1a331b059f 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -1127,14 +1127,10 @@ def apply(self, model):
 
                 # Perform checks for RTL variant if chosen
                 if self.use_rtl_variant:
-                    # Check memory mode
-                    if self.mem_mode != "decoupled":
-                        warnings.warn(
-                            """%s : RTL Thresholding does not support 'decoupled' memory mode.
-                            Falling back to HLS implementation."""
-                            % node.name
-                        )
-                        is_rtl_variant_compatible = False
+                    assert self.mem_mode == "decoupled", (
+                        """%s : RTL Thresholding only supports 'decoupled' memory mode."""
+                        % node.name
+                    )
 
                     # Check PE/SIMD value
                     if pe != 1:
diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
index 84521b395c..d07ffd2cbf 100755
--- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py
+++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
@@ -142,22 +142,15 @@ def make_single_multithresholding_modelwrapper(
 @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]])
 @pytest.mark.parametrize("fold", [-1, 1, 2])
 @pytest.mark.parametrize("num_input_channels", [16])
-@pytest.mark.parametrize("mem_mode", ["decoupled", "const"])
 @pytest.mark.fpgadataflow
 @pytest.mark.vivado
 def test_convert_to_hls_tbs_rtl_variant(
-    activation, input_data_type, fold, num_input_channels, mem_mode
+    activation, input_data_type, fold, num_input_channels,
 ):
     # Handle inputs to the test
     pe = generate_pe_value(fold, num_input_channels)
     num_steps = activation.get_num_possible_values() - 1
 
-    # Cppsim is not supported for this node (as it is an RTL node)
-    if mem_mode == "const":
-        pytest.skip("const memory mode not supported for this node")
-    elif mem_mode != "decoupled":
-        raise Exception("Unknown mem_mode: {}".format(mem_mode))
-
     if activation == DataType["BIPOLAR"]:
         pytest.skip(
             "Only negative activations are supported for "
@@ -267,7 +260,7 @@ def write_thresh_config(sim):
 
     # Recreate the model using the ConvertToHLS transform
     new_model = new_model.transform(
-        to_hls.InferThresholdingLayer(mem_mode=mem_mode, use_rtl_variant=True)
+        to_hls.InferThresholdingLayer(mem_mode="decoupled", use_rtl_variant=True)
     )
     new_model = new_model.transform(InsertFIFO(True))
     new_model = new_model.transform(GiveUniqueNodeNames())

From 4515cf7c6d4e55f8dfca62b52b504e2666a6b497 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Tue, 31 Jan 2023 12:29:42 +0000
Subject: [PATCH 083/235] [thresholding] precommit fix

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 tests/fpgadataflow/test_convert_to_hls_thresholding.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
index d07ffd2cbf..75c4ef599c 100755
--- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py
+++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
@@ -46,7 +46,9 @@
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from test_fpgadataflow_thresholding_binary_search import make_single_thresholding_binary_search_modelwrapper
+from test_fpgadataflow_thresholding_binary_search import (
+    make_single_thresholding_binary_search_modelwrapper,
+)
 
 
 test_fpga_part = "xczu3eg-sbva484-1-e"
@@ -145,7 +147,10 @@ def make_single_multithresholding_modelwrapper(
 @pytest.mark.fpgadataflow
 @pytest.mark.vivado
 def test_convert_to_hls_tbs_rtl_variant(
-    activation, input_data_type, fold, num_input_channels,
+    activation,
+    input_data_type,
+    fold,
+    num_input_channels,
 ):
     # Handle inputs to the test
     pe = generate_pe_value(fold, num_input_channels)

From b51498ef84edcd9362f4f83270f9ae39f5d7980f Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Tue, 31 Jan 2023 12:39:39 +0000
Subject: [PATCH 084/235] [thresholding] precommit fix 2

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 tests/fpgadataflow/test_convert_to_hls_thresholding.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
index 75c4ef599c..09067564eb 100755
--- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py
+++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
@@ -39,6 +39,9 @@
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.util.basic import gen_finn_dt_tensor
+from test_fpgadataflow_thresholding_binary_search import (
+    make_single_thresholding_binary_search_modelwrapper,
+)
 
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
 from finn.core.rtlsim_exec import rtlsim_exec
@@ -46,10 +49,6 @@
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from test_fpgadataflow_thresholding_binary_search import (
-    make_single_thresholding_binary_search_modelwrapper,
-)
-
 
 test_fpga_part = "xczu3eg-sbva484-1-e"
 target_clk_ns = 5

From ff3b2014d5de4bf2a98c321d14bce15a9862bf74 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Tue, 31 Jan 2023 12:48:23 +0000
Subject: [PATCH 085/235] [thresholding] precommit fix 3

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 src/finn/transformation/fpgadataflow/convert_to_hls_layers.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index 1a331b059f..1bc5fee664 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -1128,7 +1128,8 @@ def apply(self, model):
                 # Perform checks for RTL variant if chosen
                 if self.use_rtl_variant:
                     assert self.mem_mode == "decoupled", (
-                        """%s : RTL Thresholding only supports 'decoupled' memory mode."""
+                        """%s : RTL Thresholding only supports 'decoupled' memory
+                        mode."""
                         % node.name
                     )
 

From fc7e00db46414b88f3e1c3d3dc9dff4cf6bc84ff Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 23 Mar 2023 14:59:46 +0000
Subject: [PATCH 086/235] [thresholding] adjust templates so that .sv files are
 modular and can be used as standalone IP

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 finn-rtllib/thresholding/hdl/thresholding.sv                | 4 ++--
 finn-rtllib/thresholding/hdl/thresholding_axi.sv            | 6 +++---
 finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v     | 2 +-
 .../custom_op/fpgadataflow/thresholding_binary_search.py    | 6 +-----
 4 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv
index b26747d1ff..c7d5c86f6d 100644
--- a/finn-rtllib/thresholding/hdl/thresholding.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding.sv
@@ -43,7 +43,7 @@
  *  threshold configuration relies on a channel address prefix. Inputs are
  *  accompanied by a channel selector.
  *****************************************************************************/
-module $MODULE_NAME$ #(
+module thresholding #(
 	int unsigned  N,  // output precision
 	int unsigned  M,  // input/threshold precision
 	int unsigned  C,  // number of channels
@@ -153,4 +153,4 @@ module $MODULE_NAME$ #(
 	assign	ocnl = pipe[N].cnl;
 	assign	odat = pipe[N].res + BIAS;
 
-endmodule : $MODULE_NAME$
+endmodule : thresholding
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
index 5cd7746b82..79383c7996 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
@@ -38,7 +38,7 @@
  *	- performs aligned byte address to parameter word address translation.
  *****************************************************************************/
 
-module $MODULE_NAME_AXI$ #(
+module thresholding_axi #(
 	int unsigned  N,	// output precision
 	int unsigned  M,	// input/threshold precision
 	int unsigned  C,	// Channels
@@ -197,7 +197,7 @@ module $MODULE_NAME_AXI$ #(
 	end
 
 	// Core Thresholding Module
-	$MODULE_NAME$ #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS), .C_BITS(C_BITS)) core (
+	thresholding #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS), .C_BITS(C_BITS)) core (
 		.clk, .rst,
 		.twe, .twa, .twd,
 		.en,
@@ -205,4 +205,4 @@ module $MODULE_NAME_AXI$ #(
 		.ovld, .ocnl(), .odat
 	);
 
-endmodule : $MODULE_NAME_AXI$
+endmodule : thresholding_axi
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
index c16bf264dd..e46d0046ee 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
@@ -84,7 +84,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	output	[((O_BITS+7)/8)*8-1:0]  out_V_TDATA
 );
 
-	$MODULE_NAME_AXI$ #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS)) inst (
+	thresholding_axi #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS)) inst (
 		//- Global Control ------------------
 		.ap_clk(ap_clk),
 		.ap_rst_n(ap_rst_n),
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 97c9dd82c6..9b02248185 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -259,11 +259,7 @@ def prepare_codegen_rtl_values(self):
         their key value(s) in the RTL template files"""
         code_gen_dict = {}
 
-        # Identify the module names
-        code_gen_dict["$MODULE_NAME$"] = [self.get_verilog_top_module_name()]
-        code_gen_dict["$MODULE_NAME_AXI$"] = [
-            self.get_verilog_top_module_name() + "_axi"
-        ]
+        # Identify the module name
         code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [
             self.get_verilog_top_module_name() + "_axi_wrapper"
         ]

From f530aba05b05a59c5cd05b749666f89b82706cba Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 23 Mar 2023 15:50:43 +0000
Subject: [PATCH 087/235] [thresholding]: remove SIGN template in thresholding
 RTL and create parameter instead for more modular RTL

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 finn-rtllib/thresholding/hdl/thresholding.sv   | 18 +++++++++++-------
 .../thresholding/hdl/thresholding_axi.sv       |  3 ++-
 .../hdl/thresholding_axi_wrapper.v             |  3 ++-
 .../fpgadataflow/thresholding_binary_search.py |  4 ++--
 4 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv
index c7d5c86f6d..deff4fe0f8 100644
--- a/finn-rtllib/thresholding/hdl/thresholding.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding.sv
@@ -48,6 +48,7 @@ module thresholding #(
 	int unsigned  M,  // input/threshold precision
 	int unsigned  C,  // number of channels
 
+	bit SIGNED,	// signed inputs
 	int BIAS,  // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS)
 
 	int unsigned  C_BITS,
@@ -68,7 +69,7 @@ module thresholding #(
 	// Input Stream
 	input	logic  ivld,
 	input	logic        [C_BITS-1:0]  icnl,	// Ignored for C == 1
-	input	logic $SIGN$ [M     -1:0]  idat,
+	input	logic [M     -1:0]  idat,
 
 	// Output Stream
 	output	logic  ovld,
@@ -80,7 +81,7 @@ module thresholding #(
 	typedef struct packed {
 		logic                      vld;	// Valid data identification
 		logic        [C_BITS-1:0]  cnl;	// Channel
-		logic $SIGN$ [M     -1:0]  val;	// Original input value
+		logic [M     -1:0]  val;	// Original input value
 		logic        [0:N-1]       res;	// Assembling result with valid prefix [0:stage] after stage #stage
 	} pipe_t;
 	uwire pipe_t  pipe[0:N];
@@ -91,13 +92,13 @@ module thresholding #(
 	for(genvar  stage = 0; stage < N; stage++) begin : genStages
 
 		// Threshold Memory
-		uwire $SIGN$ [M-1:0]  thresh;
+		uwire [M-1:0]  thresh;
 		if(1) begin : blkUpdate
 
 			// Write control: local select from global address
 			uwire  we = twe && tws[stage];
 			if((C == 1) && (stage == 0)) begin
-				logic $SIGN$ [M-1:0]  Thresh = 'x;
+				logic [M-1:0]  Thresh = 'x;
 				always_ff @(posedge clk) begin
 					if(rst)      Thresh <= 'x;
 					else if(we)  Thresh <= twd;
@@ -105,7 +106,7 @@ module thresholding #(
 				assign  thresh = Thresh;
 			end
 			else begin
-				logic $SIGN$ [M-1:0]  Threshs[C * 2**stage];
+				logic [M-1:0]  Threshs[C * 2**stage];
 				uwire [$clog2(C)+stage-1:0]  wa = twa[$left(twa):N-stage];
 				uwire [$clog2(C)+stage-1:0]  ra;
 				if(C > 1)  assign  ra[stage+:C_BITS] = pipe[stage].cnl;
@@ -117,7 +118,7 @@ module thresholding #(
 				end
 
 				// Read
-				logic $SIGN$ [M-1:0]  RdReg;
+				logic [M-1:0]  RdReg;
 				always_ff @(posedge clk) begin
 					if(en)  RdReg <= Threshs[ra];
 				end
@@ -135,9 +136,12 @@ module thresholding #(
 
 		// Assemble pipeline data
 		logic [0:N-1]  res;
+		uwire  cmp =
+			SIGNED?      $signed(thresh) <=   $signed(State.val) :
+			/* else */ $unsigned(thresh) <= $unsigned(State.val);
 		always_comb begin
 			res        = State.res;
-			res[stage] = thresh <= State.val;	// Patch in next result bit
+			res[stage] = cmp;	// Patch in next result bit
 		end
 		assign	pipe[stage+1] = '{
 			vld: State.vld,
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
index 79383c7996..6099a64746 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
@@ -43,6 +43,7 @@ module thresholding_axi #(
 	int unsigned  M,	// input/threshold precision
 	int unsigned  C,	// Channels
 
+	bit SIGNED,	// signed inputs
 	int BIAS,  // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS)
 
 	int unsigned O_BITS
@@ -197,7 +198,7 @@ module thresholding_axi #(
 	end
 
 	// Core Thresholding Module
-	thresholding #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS), .C_BITS(C_BITS)) core (
+	thresholding #(.N(N), .M(M), .C(C), .SIGNED(SIGNED), .BIAS(BIAS), .O_BITS(O_BITS), .C_BITS(C_BITS)) core (
 		.clk, .rst,
 		.twe, .twa, .twd,
 		.en,
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
index e46d0046ee..caf850b5bc 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
@@ -36,6 +36,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter  N = $N$,	// output precision
 	parameter  M = $M$,	// input/threshold precision
 	parameter  C = $C$,	// Channels
+	parameter  SIGNED = $SIGNED$,	// signed inputs
 	int BIAS = $BIAS$,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
 
 	parameter  O_BITS = BIAS > 0?
@@ -84,7 +85,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	output	[((O_BITS+7)/8)*8-1:0]  out_V_TDATA
 );
 
-	thresholding_axi #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS)) inst (
+	thresholding_axi #(.N(N), .M(M), .C(C), .SIGNED(SIGNED), .BIAS(BIAS), .O_BITS(O_BITS)) inst (
 		//- Global Control ------------------
 		.ap_clk(ap_clk),
 		.ap_rst_n(ap_rst_n),
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 9b02248185..af9e1173fb 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -286,9 +286,9 @@ def prepare_codegen_rtl_values(self):
         # Is the input datatype signed or unsigned?
         # The thresholding core needs to know this when comparing weights to inputs
         if self.get_input_datatype().signed():
-            code_gen_dict["$SIGN$"] = ["signed"]
+            code_gen_dict["$SIGNED$"] = [str(1)]
         else:
-            code_gen_dict["$SIGN$"] = ["unsigned"]
+            code_gen_dict["$SIGNED$"] = [str(0)]
 
         return code_gen_dict
 

From 3cd600cce8e1ff98161c55dce232d703173fa569 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 23 Mar 2023 16:20:49 +0000
Subject: [PATCH 088/235] [thresholding]: decouple thresholding core from axi
 wrapper by removing input parameters

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 finn-rtllib/thresholding/hdl/thresholding.sv            | 6 ++++--
 finn-rtllib/thresholding/hdl/thresholding_axi.sv        | 8 +++++---
 finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v | 2 +-
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv
index deff4fe0f8..52d0b41b33 100644
--- a/finn-rtllib/thresholding/hdl/thresholding.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding.sv
@@ -51,8 +51,10 @@ module thresholding #(
 	bit SIGNED,	// signed inputs
 	int BIAS,  // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS)
 
-	int unsigned  C_BITS,
-	int unsigned O_BITS
+	localparam int unsigned  C_BITS = C < 2? 1 : $clog2(C),
+	localparam int unsigned  O_BITS = BIAS >= 0?
+		/* unsigned */ $clog2(2**N+BIAS) :
+		/* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS)
 )(
 	// Global Control
 	input	logic  clk,
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
index 6099a64746..4bb3add13b 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
@@ -46,7 +46,10 @@ module thresholding_axi #(
 	bit SIGNED,	// signed inputs
 	int BIAS,  // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS)
 
-	int unsigned O_BITS
+	localparam int unsigned  C_BITS = C < 2? 1 : $clog2(C),
+	localparam int unsigned  O_BITS = BIAS >= 0?
+		/* unsigned */ $clog2(2**N+BIAS) :
+		/* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS)
 )(
 	//- Global Control ------------------
 	input	logic  ap_clk,
@@ -173,7 +176,6 @@ module thresholding_axi #(
 
 	end : blkOutputDecouple
 
-	localparam int unsigned  C_BITS = C < 2? 1 : $clog2(C);
 	uwire  ivld = s_axis_tvalid;
 	uwire [C_BITS-1:0]  icnl;
 	uwire [M     -1:0]  idat = s_axis_tdata[M-1:0];
@@ -198,7 +200,7 @@ module thresholding_axi #(
 	end
 
 	// Core Thresholding Module
-	thresholding #(.N(N), .M(M), .C(C), .SIGNED(SIGNED), .BIAS(BIAS), .O_BITS(O_BITS), .C_BITS(C_BITS)) core (
+	thresholding #(.N(N), .M(M), .C(C), .SIGNED(SIGNED), .BIAS(BIAS)) core (
 		.clk, .rst,
 		.twe, .twa, .twd,
 		.en,
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
index caf850b5bc..da013b667a 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
@@ -85,7 +85,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	output	[((O_BITS+7)/8)*8-1:0]  out_V_TDATA
 );
 
-	thresholding_axi #(.N(N), .M(M), .C(C), .SIGNED(SIGNED), .BIAS(BIAS), .O_BITS(O_BITS)) inst (
+	thresholding_axi #(.N(N), .M(M), .C(C), .SIGNED(SIGNED), .BIAS(BIAS)) inst (
 		//- Global Control ------------------
 		.ap_clk(ap_clk),
 		.ap_rst_n(ap_rst_n),

From 54afa637d2b7beac8beca99979e2d727385b90f3 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Tue, 28 Mar 2023 17:17:13 +0100
Subject: [PATCH 089/235] [thresholding]: patch in PE value to the thresholding
 AXI module and wrapper

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../thresholding/hdl/thresholding_axi.sv      | 117 ++++++++++++------
 .../hdl/thresholding_axi_wrapper.v            |   7 +-
 .../thresholding_binary_search.py             |   2 +
 3 files changed, 82 insertions(+), 44 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
index 4bb3add13b..506e31b215 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
@@ -42,11 +42,14 @@ module thresholding_axi #(
 	int unsigned  N,	// output precision
 	int unsigned  M,	// input/threshold precision
 	int unsigned  C,	// Channels
+	int unsigned  PE,	// Processing Parallelism, requires C = M*PE
 
 	bit SIGNED,	// signed inputs
 	int BIAS,  // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS)
 
-	localparam int unsigned  C_BITS = C < 2? 1 : $clog2(C),
+    localparam int unsigned  CF = 1 + (C-1)/PE,	// Channel Fold
+	localparam int unsigned  ADDR_BITS = $clog2(CF) + $clog2(PE) + N + 2,
+	localparam int unsigned  C_BITS = C/PE < 2? 1 : $clog2(C/PE),
 	localparam int unsigned  O_BITS = BIAS >= 0?
 		/* unsigned */ $clog2(2**N+BIAS) :
 		/* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS)
@@ -57,9 +60,9 @@ module thresholding_axi #(
 
 	//- AXI Lite ------------------------
 	// Writing
-	input	logic                    s_axilite_AWVALID,
-	output	logic                    s_axilite_AWREADY,
-	input	logic [$clog2(C)+N+1:0]  s_axilite_AWADDR,	// lowest 2 bits (byte selectors) are ignored
+	input	logic                  s_axilite_AWVALID,
+	output	logic                  s_axilite_AWREADY,
+	input	logic [ADDR_BITS-1:0]  s_axilite_AWADDR,	// lowest 2 bits (byte selectors) are ignored
 
 	input	logic         s_axilite_WVALID,
 	output	logic         s_axilite_WREADY,
@@ -83,33 +86,53 @@ module thresholding_axi #(
 	//- AXI Stream - Input --------------
 	output	logic  s_axis_tready,
 	input	logic  s_axis_tvalid,
-	input	logic [((M+7)/8)*8-1:0]  s_axis_tdata,
+	input	logic [((PE*M+7)/8)*8-1:0]  s_axis_tdata,
 
 	//- AXI Stream - Output -------------
 	input	logic  m_axis_tready,
 	output	logic  m_axis_tvalid,
-	output	logic [((O_BITS+7)/8)*8-1:0]  m_axis_tdata
+	output	logic [((PE*O_BITS+7)/8)*8-1:0]  m_axis_tdata
 );
+	//- Parameter Constraints Checking --------------------------------------
+	initial begin
+		if(C%PE != 0) begin
+			$error("%m: Channel count C=%0d is not a multiple of PE=%0d.", C, PE);
+			$finish;
+		end
+	end
+
 	//- Global Control ------------------------------------------------------
 	uwire  clk = ap_clk;
 	uwire  rst = !ap_rst_n;
 
 	//- AXI Lite: Threshold Configuration -----------------------------------
-	uwire  twe;
-	uwire [$clog2(C)+N-1:0]  twa;
-	uwire [          M-1:0]  twd;
+	uwire  twe[PE];
+	uwire [$clog2(CF)+N-1:0]  twa;
+	uwire [           M-1:0]  twd;
 	if(1) begin : blkAxiLite
 		logic  WABusy = 0;
 		logic  WDBusy = 0;
-		logic [$clog2(C)+N-1:0]  Addr = 'x;
-		logic [          M-1:0]  Data = 'x;
+		logic  Sel[PE] = '{ default: 'x };
+		logic [$clog2(CF)+N-1:0]  Addr = 'x;
+		logic [           M-1:0]  Data = 'x;
 
-		assign	twe = WABusy && WDBusy;
+		for(genvar  pe = 0; pe < PE; pe++) begin
+			assign	twe[pe] = WABusy && WDBusy && Sel[pe];
+		end
 		assign	twa = Addr;
 		assign	twd = Data;
 
-		uwire  clr_wr = rst || (twe && s_axilite_BREADY);
-		always_ff @(posedge clk) begin : blockName
+		if(PE == 1)  always_comb  Sel[0] = 1;
+		else begin
+			always_ff @(posedge clk) begin
+				if(!WABusy) begin
+					foreach(Sel[pe])  Sel[pe] <= s_axilite_AWADDR[N+2+:$clog2(PE)] == pe;
+				end
+			end
+		end
+
+		uwire  clr_wr = rst || (WABusy && WDBusy && s_axilite_BREADY);
+		always_ff @(posedge clk) begin
 			if(clr_wr) begin
 				WABusy <= 0;
 				Addr <= 'x;
@@ -119,7 +142,8 @@ module thresholding_axi #(
 			else begin
 				if(!WABusy) begin
 					WABusy <= s_axilite_AWVALID;
-					Addr   <= s_axilite_AWADDR[$clog2(C)+N+1:2];
+					Addr[0+:N] <= s_axilite_AWADDR[2+:N];
+					if(C > 1)  Addr[N+:$clog2(CF)] <= s_axilite_AWADDR[2+N+$clog2(PE)+:$clog2(CF)];
 				end
 				if(!WDBusy) begin
 					WDBusy <= s_axilite_WVALID;
@@ -148,39 +172,48 @@ module thresholding_axi #(
 
 	//- IO-Sandwich with two-stage output buffer for containing a local enable
 	uwire  en;
-	uwire [O_BITS-1:0]  odat;
-	uwire  ovld;
+	uwire [PE-1:0][O_BITS-1:0]  odat;
+	uwire  ovld[PE];
 	if(1) begin : blkOutputDecouple
 		typedef struct {
-			logic          vld;
-			logic [O_BITS-1:0]  dat;
+			logic  vld;
+			logic [PE-1:0][O_BITS-1:0]  dat;
 		} buf_t;
-		buf_t  Buf[2] = '{ default: '{ vld: 0, dat: 'x } };
+		buf_t  A = '{ vld: 0, dat: 'x };
+		buf_t  B = '{ vld: 0, dat: 'x };
 		always_ff @(posedge clk) begin
-			if(rst)  Buf <= '{ default: '{ vld: 0, dat: 'x } };
+			if(rst) begin
+				A <= '{ vld: 0, dat: 'x };
+				B <= '{ vld: 0, dat: 'x };
+			end
 			else begin
-				if(!Buf[1].vld || m_axis_tready) begin
-					Buf[1] <= '{
-						vld: Buf[0].vld || ovld,
-						dat: Buf[0].vld? Buf[0].dat : odat
+				if(!B.vld || m_axis_tready) begin
+					B <= '{
+						vld: A.vld || ovld[0],
+						dat: A.vld? A.dat : odat
 					};
 				end
-				Buf[0].vld <= Buf[1].vld && !m_axis_tready && (Buf[0].vld || ovld);
-				if(!Buf[0].vld)  Buf[0].dat <= odat;
+				A.vld <= B.vld && !m_axis_tready && (A.vld || ovld[0]);
+				if(!A.vld)  A.dat <= odat;
 			end
 		end
-		assign	en = !Buf[0].vld;
+		assign	en = !A.vld;
 
-		assign	m_axis_tvalid = Buf[1].vld;
-		assign	m_axis_tdata  = Buf[1].dat;
+		assign	m_axis_tvalid = B.vld;
+		assign	m_axis_tdata  = B.dat;
 
 	end : blkOutputDecouple
 
+	// localparam int unsigned  C_BITS = C/PE < 2? 1 : $clog2(C/PE);
 	uwire  ivld = s_axis_tvalid;
 	uwire [C_BITS-1:0]  icnl;
-	uwire [M     -1:0]  idat = s_axis_tdata[M-1:0];
+	uwire [M     -1:0]  idat[PE];
+	for(genvar  pe = 0; pe < PE; pe++) begin
+		assign	idat[pe] = s_axis_tdata[pe*M+:M];
+	end
+
 	assign	s_axis_tready = en;
-	if(C == 1)  assign  icnl = 'x;
+	if(C == PE)  assign  icnl = 'x;
 	else begin
 		logic [C_BITS-1:0]  Chnl = 0;
 		logic               Last = 0;
@@ -193,19 +226,21 @@ module thresholding_axi #(
 			end
 			else if(inc) begin
 				Chnl <= Chnl + 1;
-				Last <= (~Chnl & (C-2)) == 0;
+				Last <= (~Chnl & (C/PE-2)) == 0;
 			end
 		end
 		assign	icnl = Chnl;
 	end
 
-	// Core Thresholding Module
-	thresholding #(.N(N), .M(M), .C(C), .SIGNED(SIGNED), .BIAS(BIAS)) core (
-		.clk, .rst,
-		.twe, .twa, .twd,
-		.en,
-		.ivld, .icnl, .idat,
-		.ovld, .ocnl(), .odat
-	);
+	// Core Thresholding Modules
+	for(genvar  pe = 0; pe < PE; pe++) begin : genCores
+		thresholding #(.N(N), .M(M), .C(C/PE), .SIGNED(SIGNED), .BIAS(BIAS)) core (
+			.clk, .rst,
+			.twe(twe[pe]), .twa, .twd,
+			.en,
+			.ivld, .icnl, .idat(idat[pe]),
+			.ovld(ovld[pe]), .ocnl(), .odat(odat[pe])
+		);
+	end : genCores
 
 endmodule : thresholding_axi
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
index da013b667a..c27480f388 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
@@ -38,6 +38,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter  C = $C$,	// Channels
 	parameter  SIGNED = $SIGNED$,	// signed inputs
 	int BIAS = $BIAS$,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
+	parameter  PE = $PE$,
 
 	parameter  O_BITS = BIAS > 0?
 		/* unsigned */ $clog2(2**N-BIAS) :
@@ -77,15 +78,15 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	//- AXI Stream - Input --------------
 	output	in0_V_TREADY,
 	input	in0_V_TVALID,
-	input	[((M+7)/8)*8-1:0]  in0_V_TDATA,
+	input	[((PE*M+7)/8)*8-1:0]  in0_V_TDATA,
 
 	//- AXI Stream - Output -------------
 	input	out_V_TREADY,
 	output	out_V_TVALID,
-	output	[((O_BITS+7)/8)*8-1:0]  out_V_TDATA
+	output	[((PE*O_BITS+7)/8)*8-1:0]  out_V_TDATA
 );
 
-	thresholding_axi #(.N(N), .M(M), .C(C), .SIGNED(SIGNED), .BIAS(BIAS)) inst (
+	thresholding_axi #(.N(N), .M(M), .C(C), .PE(PE), .SIGNED(SIGNED), .BIAS(BIAS)) inst (
 		//- Global Control ------------------
 		.ap_clk(ap_clk),
 		.ap_rst_n(ap_rst_n),
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index af9e1173fb..e2453fcaad 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -273,6 +273,7 @@ def prepare_codegen_rtl_values(self):
         )  # input/threshold precision
         num_channels = self.get_nodeattr("NumChannels")  # number of channels
         bias = self.get_nodeattr("activation_bias")  # activation bias value
+        pe = self.get_nodeattr("PE")
 
         code_gen_dict["$N$"] = [
             str(DataType[output_data_type].bitwidth())
@@ -282,6 +283,7 @@ def prepare_codegen_rtl_values(self):
         ]  # input/threshold precision - convert bitwidth to string
         code_gen_dict["$C$"] = [str(num_channels)]  # number of channels
         code_gen_dict["$BIAS$"] = [str(bias)]  # activation bias value
+        code_gen_dict["$PE$"] = [str(pe)] # requires C = M*PE
 
         # Is the input datatype signed or unsigned?
         # The thresholding core needs to know this when comparing weights to inputs

From 29f9e1ce637d5df00cb1dae2ada3438070da0852 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Thu, 30 Mar 2023 15:43:36 +0100
Subject: [PATCH 090/235] [thresholding]: remove reset that erases the 0th
 stage threshold value

There is a corner case when the thresholding core is configured with a single channel (C == 1): the stage 0 threshold parameter is then held in a register that was overwritten on reset.
For each other stage in this case (and for all stages in all other cases) the threshold parameters are retained across a reset. When a reset happened after programming the threshold parameters, all would still be intact except for the 0th stage threshold value.

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 finn-rtllib/thresholding/hdl/thresholding.sv | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv
index 52d0b41b33..0ce95ed3f9 100644
--- a/finn-rtllib/thresholding/hdl/thresholding.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding.sv
@@ -102,8 +102,7 @@ module thresholding #(
 			if((C == 1) && (stage == 0)) begin
 				logic [M-1:0]  Thresh = 'x;
 				always_ff @(posedge clk) begin
-					if(rst)      Thresh <= 'x;
-					else if(we)  Thresh <= twd;
+					if(we)  Thresh <= twd;
 				end
 				assign  thresh = Thresh;
 			end

From 2c4c8e224f8921848713f6d121532ff345c84fd0 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Fri, 31 Mar 2023 10:43:00 +0100
Subject: [PATCH 091/235] [thresholding]: enable PE testing of RTL thresholding
 binary search node

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 .../fpgadataflow/convert_to_hls_layers.py             | 10 ----------
 .../fpgadataflow/test_convert_to_hls_thresholding.py  | 10 ++--------
 .../test_fpgadataflow_thresholding_binary_search.py   | 11 ++---------
 3 files changed, 4 insertions(+), 27 deletions(-)

diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index dedcc30a38..4c06a28b75 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -1137,16 +1137,6 @@ def apply(self, model):
                         % node.name
                     )
 
-                    # Check PE/SIMD value
-                    if pe != 1:
-                        warnings.warn(
-                            """%s : RTL Thresholding does not support paralellisation.
-                            Only a PE value of 1 is supported.
-                            Falling back to HLS implementation."""
-                            % node.name
-                        )
-                        is_rtl_variant_compatible = False
-
                 if self.use_rtl_variant and is_rtl_variant_compatible:
                     new_node = helper.make_node(
                         "Thresholding_Binary_Search",
diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
index 09067564eb..895c82d4ca 100755
--- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py
+++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
@@ -138,10 +138,10 @@ def make_single_multithresholding_modelwrapper(
     model.set_initializer("thresh", thresholds)
     return model
 
-
+# N.B. Fold values where C % PE != 0 fail
 @pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]])
 @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]])
-@pytest.mark.parametrize("fold", [-1, 1, 2])
+@pytest.mark.parametrize("fold", [-1, 1, 2, 4, 6])
 @pytest.mark.parametrize("num_input_channels", [16])
 @pytest.mark.fpgadataflow
 @pytest.mark.vivado
@@ -161,12 +161,6 @@ def test_convert_to_hls_tbs_rtl_variant(
             "RTL Thresholding Binary Search node"
         )
 
-    # Paralellisation not supported for thresholding binary search rtl node
-    if pe != 1:
-        pytest.skip(
-            "Paralellisation not supported for RTL Thresholding Binary Search node"
-        )
-
     # Other non-input parameters
     num_input_vecs = [1, 2, 2]
     output_data_type = activation
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
index e57c4942c8..24b60f5ea5 100755
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py
@@ -186,10 +186,10 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim():
 
 # Test brief: Create a Thresholding binary search layer using various parameters
 # and test against a SW generated & simulated dataset
-# N.B. - fold factor of '-1' is supported only (no PE/SIMD support)
+# N.B. Fold values where C % PE != 0 fail
 @pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]])
 @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]])
-@pytest.mark.parametrize("fold", [-1, 1, 2])
+@pytest.mark.parametrize("fold", [-1, 1, 2, 4, 6])
 @pytest.mark.parametrize("num_input_channels", [16])
 @pytest.mark.fpgadataflow
 @pytest.mark.vivado
@@ -201,13 +201,6 @@ def test_fpgadataflow_thresholding_binary_search(
     pe = generate_pe_value(fold, num_input_channels)
     num_steps = activation.get_num_possible_values() - 1
 
-    # Paralellisation not supported for thresholding binary search rtl node
-    if pe != 1:
-        pytest.skip(
-            "Paralellisation of IP not supported for "
-            "RTL Thresholding Binary Search node"
-        )
-
     # Other non-input parameters
     num_input_vecs = [1, 2, 2]
     output_data_type = activation

From 5d07a435c2994f0238fb41ec21381d75ea049796 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Fri, 31 Mar 2023 10:45:34 +0100
Subject: [PATCH 092/235] [thresholding]: add comment about why bipolar
 activations are skipped for thresholding binary search node

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 tests/fpgadataflow/test_convert_to_hls_thresholding.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
index 895c82d4ca..f2d76c8416 100755
--- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py
+++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
@@ -155,6 +155,9 @@ def test_convert_to_hls_tbs_rtl_variant(
     pe = generate_pe_value(fold, num_input_channels)
     num_steps = activation.get_num_possible_values() - 1
 
+    # See convert_to_hls_layers::InferThresholdingLayer:
+    # assert (not odt.signed()) or (actval < 0)
+    # This implies that it expects a negative activation, BIPOLAR does not provide that
     if activation == DataType["BIPOLAR"]:
         pytest.skip(
             "Only negative activations are supported for "

From fcf579ce01075bbeb997580fbafc8cd9d64ed50c Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Fri, 31 Mar 2023 11:32:42 +0100
Subject: [PATCH 093/235] fix precommit issues

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 2 +-
 tests/fpgadataflow/test_convert_to_hls_thresholding.py        | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index e2453fcaad..694d25bfaa 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -283,7 +283,7 @@ def prepare_codegen_rtl_values(self):
         ]  # input/threshold precision - convert bitwidth to string
         code_gen_dict["$C$"] = [str(num_channels)]  # number of channels
         code_gen_dict["$BIAS$"] = [str(bias)]  # activation bias value
-        code_gen_dict["$PE$"] = [str(pe)] # requires C = M*PE
+        code_gen_dict["$PE$"] = [str(pe)]  # requires C = M*PE
 
         # Is the input datatype signed or unsigned?
         # The thresholding core needs to know this when comparing weights to inputs
diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
index f2d76c8416..9c233bdd06 100755
--- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py
+++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py
@@ -138,6 +138,7 @@ def make_single_multithresholding_modelwrapper(
     model.set_initializer("thresh", thresholds)
     return model
 
+
 # N.B. Fold values where C % PE != 0 fail
 @pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]])
 @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]])

From 6c9d1f50177de5bb1c91eacc061d0aa8adb9cf56 Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 5 Apr 2023 16:28:55 +0100
Subject: [PATCH 094/235] [thresholding] only adjust MSB thresholding
 addressing bits when channel fold factor is present

In the case where channel folding is not present (i.e. CF is 1, so $clog2(CF) is 0), we saw incorrect threshold address programming.
Without this commit and when no channel folding is present, this if statement is always stepped through and
was damaging LSBs of the Addr signal, causing incorrect threshold address programming for a PE core.

Although the logic in the if statement looks correct (i.e. programming 0 bits (clog2(CF) => 0)) and should not
harm the Addr signal, it's best to avoid stepping through a case that does not exist (i.e., there is no channel
folding and each channel has its own PE; therefore no extra bits needed to program multiple channel thresholds
into a single PE core).

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 finn-rtllib/thresholding/hdl/thresholding_axi.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
index 506e31b215..d2a7420a99 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
@@ -143,7 +143,7 @@ module thresholding_axi #(
 				if(!WABusy) begin
 					WABusy <= s_axilite_AWVALID;
 					Addr[0+:N] <= s_axilite_AWADDR[2+:N];
-					if(C > 1)  Addr[N+:$clog2(CF)] <= s_axilite_AWADDR[2+N+$clog2(PE)+:$clog2(CF)];
+					if(CF > 1)  Addr[N+:$clog2(CF)] <= s_axilite_AWADDR[2+N+$clog2(PE)+:$clog2(CF)];
 				end
 				if(!WDBusy) begin
 					WDBusy <= s_axilite_WVALID;

From b247ffbc258ec628a51c14822ec4343283ef5a2e Mon Sep 17 00:00:00 2001
From: Fionn O'Donohoe <fionno@xilinx.com>
Date: Wed, 5 Apr 2023 19:44:36 +0100
Subject: [PATCH 095/235] [thresholding] update binary search to match qonnx
 0.2.0

commit 65822357a7dba4f917c852d5f08bdebc7dd22e9d on dev moved all custom_ops
to be compatible with qonnx 0.2.0

Signed-off-by: Fionn O'Donohoe <fionno@xilinx.com>
---
 src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index 694d25bfaa..d02b778823 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -60,8 +60,8 @@
 class Thresholding_Binary_Search(HLSCustomOp):
     """Class that corresponds to finn-rtllib 'thresholding' function."""
 
-    def __init__(self, onnx_node):
-        super().__init__(onnx_node)
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
 
     def get_nodeattr_types(self):
         my_attrs = {

From afab9cd6543b4fe1f612c329074d30d59706ac08 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 6 Apr 2023 12:34:01 +0100
Subject: [PATCH 096/235] [rtl custom op]: initial implementation of mvu_8sx9

---
 finn-rtllib/mvu/mvu_8sx9.sv | 284 ++++++++++++++++++++++++++++++++++++
 1 file changed, 284 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_8sx9.sv

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
new file mode 100644
index 0000000000..c992990d9f
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -0,0 +1,284 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Matrix Vector Unit (MVU) core compute kernel utilizing DSP58.
+ *****************************************************************************/
+
+module mvu_8sx9 #(
+    int unsigned PE,                // Number of parallel DSP chains; one 58-bit lane of `p` per chain
+    int unsigned SIMD,              // Activation/weight elements consumed per cycle; three are packed into each DSP
+    int unsigned ACTIVATION_WIDTH,  // Bits per activation; each one is extended to 9 bits for the DSP A port
+    int unsigned WEIGHT_WIDTH,      // Bits per weight; each one is sign-extended to 8 bits for the DSP B port
+    bit SIGNED_ACTIVATIONS = 0,     // 1: activations are sign-extended, 0: zero-extended
+    int unsigned SEGMENTLEN = 0 // Default to 0 (which implies a single segment)
+  )
+  (
+    input   logic clk,
+    input   logic rst,   // Synchronous, active-high reset of the fabric registers and the DSP registers
+    input   logic en,    // Clock enable shared by every register of the pipeline
+    input   logic last,  // Accompanies the final input vector of a dot product; its delayed copies produce `vld` and steer the W mux of the last DSP
+    input   logic zero,  // Forwarded (delayed per segment) to INMODE[1] of the DSPs; presumably blanks the operand on idle cycles - TODO confirm against the DSP58 INMODE table
+    input   logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a,          // Activation vector
+    input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w,      // Weight tile, one SIMD-wide row per PE
+    output  logic vld,   // Oldest entry of the `last` shift register
+    output  logic [PE-1:0][57:0] p // P output of the last DSP of each chain
+  );
+
+//-------------------- Declare global signals --------------------\\
+localparam int unsigned CHAINLEN = (SIMD+2)/3; // DSPs per chain: three SIMD lanes per DSP, rounded up
+localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
+uwire [26:0] a_in_i [CHAINLEN];     // Per chain position: three activations, each padded to 9 bits
+uwire [23:0] b_in_i [PE][CHAINLEN]; // Per PE and chain position: three weights, each padded to 8 bits
+uwire [57:0] pcout [PE][CHAINLEN];  // P cascade outputs linking neighbouring DSPs of a chain
+
+//-------------------- Shift register for opmode select signal --------------------\\
+localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
+logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric)
+
+always_ff @(posedge clk) begin
+  if(rst)     L <= '{default: 0};
+  else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last }; // `last` enters at the highest index and moves one step towards index 0 per enabled cycle
+end  
+assign vld = L[0]; // L[1] (one cycle earlier) feeds the OPMODE of the last DSP in each chain
+
+//-------------------- Shift register for ZERO flag --------------------\\
+logic Z [0:(MAX_PIPELINE_STAGES > 1 ? MAX_PIPELINE_STAGES-2 : 0)] = '{default:0}; // MAX_PIPELINE_STAGES-1 stages (INMODE is buffered once more inside the DSP); the bound is clamped so that a single-stage chain never evaluates the unsigned expression 1-2
+
+if (MAX_PIPELINE_STAGES > 1) begin : genZreg
+  always_ff @(posedge clk) begin
+    if (rst)      Z <= '{default: 0};
+    else if(en) begin
+        Z[0] <= zero; // newest flag enters at index 0
+        for (int unsigned k = 1; k+1 < MAX_PIPELINE_STAGES; k++)  Z[k] <= Z[k-1]; // older flags advance by one; indices stay within 0..MAX_PIPELINE_STAGES-2
+    end    
+  end
+end;
+
+//-------------------- Buffer for input activations --------------------\\
+localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; // Extension bits needed to fill a 9-bit operand slot
+typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t;        // The three activations that share one DSP
+
+for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD // NOTE(review): a[3*i+2] lies beyond SIMD-1 in the last group when SIMD is not a multiple of 3 - confirm that callers only use multiples of 3
+  localparam int TOTAL_PREGS = i/SEGLEN;                             // Segment index of this chain position
+  localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0; // Delay stages built in fabric; the first extra stage is covered by the DSP-internal registers
+  
+  if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
+    a_buffer_t A [0:EXTERNAL_PREGS-1];
+    always_ff @(posedge clk) begin
+      if (rst)     A <= '{default: 0};
+      else if(en) begin
+        A[EXTERNAL_PREGS-1] <= a[3*i +: 3]; // new triple enters at the highest index
+        if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; // and moves towards index 0, which feeds the DSP
+      end
+    end
+    assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]}
+                             : { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ;
+  end : genExternalPregAct
+  else begin : genInpDSPAct // no fabric delay needed: the input is wired straight to the DSP operand
+    assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]}
+                             : { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ;
+  end : genInpDSPAct
+
+end : genActSIMD
+
+//-------------------- Buffer for weights --------------------\\
+localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH; // Sign-extension bits needed to fill an 8-bit operand slot
+typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t;           // The three weights that share one DSP
+
+for (genvar j=0; j<PE; j++) begin : genWeightPE
+  for (genvar i=0; i<CHAINLEN; i++) begin : genWeightSIMD // NOTE(review): w[j][3*i+2] lies beyond SIMD-1 in the last group when SIMD is not a multiple of 3 - confirm that callers only use multiples of 3
+    localparam int TOTAL_PREGS = i/SEGLEN;                             // Segment index of this chain position
+    localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0; // Delay stages built in fabric; the first extra stage is covered by the DSP-internal registers
+    
+    if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
+      b_buffer_t B [0:EXTERNAL_PREGS-1]; // one delay line per generate instance; the instance already belongs to PE row j, so no PE dimension is required
+      always_ff @(posedge clk) begin
+        if (rst)    B <= '{default: 0};
+        else if (en) begin
+          B[EXTERNAL_PREGS-1] <= w[j][3*i +: 3]; // new triple enters at the highest index
+          if (EXTERNAL_PREGS > 1) B[0:EXTERNAL_PREGS-2] <= B[1:EXTERNAL_PREGS-1]; // and moves towards index 0, which feeds the DSP
+        end
+      end
+      assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[0][2][WEIGHT_WIDTH-1]}}, B[0][2], {PAD_BITS_WEIGHT{B[0][1][WEIGHT_WIDTH-1]}}, B[0][1], {PAD_BITS_WEIGHT{B[0][0][WEIGHT_WIDTH-1]}}, B[0][0] };
+    end : genExternalPregWeight
+    else begin : genInpDSPWeight // no fabric delay needed: the input is wired straight to the DSP operand
+      assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] };
+    end : genInpDSPWeight
+  end : genWeightSIMD
+
+end : genWeightPE
+
+//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\
+for (genvar j=0; j<PE; j++) begin : genDSPPE
+  for (genvar i=0; i<CHAINLEN; i++) begin : genDSPChain
+    localparam int TOTAL_PREGS = i/SEGLEN;                   // Segment index of this chain position
+    localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1;   // Depth of the DSP-internal A/B registers: 2 from the second segment onwards, 1 in the first segment
+    localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1; // P register only at the end of a segment and at the end of the chain
+    localparam bit FIRST = i == 0;
+    localparam bit LAST = i == CHAINLEN-1;
+    uwire [57:0] pp;
+    
+    if (LAST) begin : genPOUT
+      assign p[j] = pp; // only the last DSP of a chain drives the module output
+    end      
+    
+    DSP58 #(
+      // Feature Control Attributes: Data Path Selection
+      .AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
+      .A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+      .BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
+      .B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+      .DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
+                                          // legacy mode.
+      .PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
+      .RND(58'h000000000000000),          // Rounding Constant
+      .USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+      .USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
+      .USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
+      .XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
+      // Pattern Detector Attributes: Pattern Detection Configuration
+      .AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+      .AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
+      .MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
+      .PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
+      .SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+      .SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
+      .USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
+      // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+      .IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
+      .IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
+      .IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
+      .IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
+      .IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
+      .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 
+                            FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN 
+                            2'b01, // Y : M
+                            2'b01  // X: M
+        }), // Optional inversion for OPMODE; the static mux selection is encoded here while the OPMODE port carries only the dynamic L[1] bit
+      .IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
+      .IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
+      .IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
+      .IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
+      .IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for RSTCTRL
+      .IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
+      .IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
+      .IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
+      .IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
+      .IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
+      // Register Control Attributes: Pipeline Register Configuration
+      .ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+      .ADREG(0),                          // Pipeline stages for pre-adder (0-1)
+      .ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
+      .AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
+      .BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+      .BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
+      .CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
+      .CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
+      .CREG(0),                           // Pipeline stages for C (0-1)
+      .DREG(0),                           // Pipeline stages for D (0-1)
+      .INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
+      .MREG(1),                           // Multiplier pipeline stages (0-1)
+      .OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
+      .PREG(PREG),                        // Number of pipeline stages for P (0-1)
+      .RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
+    )
+    DSP58_inst (
+      // Cascade outputs: Cascade Ports
+      .ACOUT(),                           // 34-bit output: A port cascade
+      .BCOUT(),                           // 24-bit output: B cascade
+      .CARRYCASCOUT(),                    // 1-bit output: Cascade carry
+      .MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
+      .PCOUT(pcout[j][i]),                // 58-bit output: Cascade output
+      // Control outputs: Control Inputs/Status Bits
+      .OVERFLOW(),                        // 1-bit output: Overflow in add/acc
+      .PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
+      .PATTERNDETECT(),                   // 1-bit output: Pattern detect
+      .UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
+      // Data outputs: Data Ports
+      .CARRYOUT(),                        // 4-bit output: Carry
+      .P(pp),                             // 58-bit output: Primary data
+      .XOROUT(),                          // 8-bit output: XOR data
+      // Cascade inputs: Cascade Ports
+      .ACIN('x),                          // 34-bit input: A cascade data
+      .BCIN('x),                          // 24-bit input: B cascade
+      .CARRYCASCIN('x),                   // 1-bit input: Cascade carry
+      .MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
+      .PCIN(FIRST ? 'x : pcout[j][i-1]),  // 58-bit input: P cascade
+      // Control inputs: Control Inputs/Status Bits
+      .ALUMODE(4'h0),                     // 4-bit input: ALU control
+      .CARRYINSEL('0),                    // 3-bit input: Carry select
+      .CLK(clk),                          // 1-bit input: Clock
+      .INMODE({
+              INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
+              2'b00,
+              TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
+              INTERNAL_PREGS==2 ? 1'b0 : 1'b1        
+      }),                                 // 5-bit input: INMODE control; bit 1 carries the zero flag delayed to match this segment
+      .NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
+      .OPMODE({
+              LAST ? {1'b0, L[1]} : 2'b00,
+              7'b000_0000
+      }), // 9-bit input: Operation mode
+      // Data inputs: Data Ports
+      .A({ 7'bx, a_in_i[i] }),            // 34-bit input: A data
+      .B(b_in_i[j][i]),                   // 24-bit input: B data
+      .C('x),                             // 58-bit input: C data
+      .CARRYIN('0),                       // 1-bit input: Carry-in
+      .D('x),                             // 27-bit input: D data
+      // Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+      .ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
+      .CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
+      .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
+      .CEAD('0),                          // 1-bit input: Clock enable for ADREG
+      .CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
+      .CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
+      .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
+      .CEC('0),                           // 1-bit input: Clock enable for CREG
+      .CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
+      .CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+      .CED('0),                           // 1-bit input: Clock enable for DREG
+      .CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
+      .CEM(en),                           // 1-bit input: Clock enable for MREG
+      .CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
+      .RSTA(rst),                         // 1-bit input: Reset for AREG
+      .RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
+      .RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
+      .RSTB(rst),                         // 1-bit input: Reset for BREG
+      .RSTC('0),                          // 1-bit input: Reset for CREG
+      .RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+      .RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
+      .RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
+      .RSTM(rst),                         // 1-bit input: Reset for MREG
+      .RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
+    );
+  end : genDSPChain  
+end : genDSPPE
+    
+endmodule

From a94fc3bb0759ecd4b9af212d1629236894a1b520 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 6 Apr 2023 12:34:22 +0100
Subject: [PATCH 097/235] [rtl custom op]: testbench for mvu_8sx9

---
 finn-rtllib/mvu/mvu_8sx9_tb.sv | 165 +++++++++++++++++++++++++++++++++
 1 file changed, 165 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_8sx9_tb.sv

diff --git a/finn-rtllib/mvu/mvu_8sx9_tb.sv b/finn-rtllib/mvu/mvu_8sx9_tb.sv
new file mode 100644
index 0000000000..ea3ecbbd70
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx9_tb.sv
@@ -0,0 +1,165 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for MVU core compute kernel.
+ *****************************************************************************/
+
+module mvu_8sx9_tb();
+
+  //-------------------- Simulation parameters --------------------\\
+  // Matrix & parallelism config
+  localparam int unsigned MH = 256;       // Matrix height (number of output values)
+  localparam int unsigned PE = 16;        // Output values computed in parallel
+  localparam int unsigned MW = 600;       // Matrix width (length of each dot product)
+  localparam int unsigned SIMD = 60;      // Dot-product elements consumed per cycle
+  localparam int unsigned SEGMENTLEN = 4; // Passed through to the DUT parameter of the same name
+  // Bit-width config  
+  localparam int unsigned ACTIVATION_WIDTH = 8;
+  localparam int unsigned WEIGHT_WIDTH = 4;
+  localparam bit SIGNED_ACTIVATIONS = 1;
+  // Simulation constants
+  localparam int unsigned NF = MH/PE;     // Number of output groups (neuron fold)
+  localparam int unsigned SF = MW/SIMD;   // Input beats per dot product (synapse fold)
+  localparam int unsigned NUM_OF_DSP = SIMD/3; // Not referenced anywhere in this testbench
+  
+  typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; // One input beat of activations
+  typedef activation_t activation_vector_t[SF];                // All beats of one input vector
+
+  function activation_vector_t init_ACTIVATIONS; // Returns a randomized activation vector; aborts the simulation if randomization fails
+    automatic activation_vector_t res;
+    if (!std::randomize(res))  $fatal(1, "Randomization of the activations failed.");
+    return res;
+  endfunction : init_ACTIVATIONS
+
+  typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; // One weight beat: PE rows of SIMD weights
+  typedef weight_t weight_matrix_t[NF][SF];                    // Complete weight matrix, folded by NF and SF
+  
+  function weight_matrix_t init_WEIGHTS; // Returns a randomized weight matrix; aborts the simulation if randomization fails
+    automatic weight_matrix_t res;
+    if (!std::randomize(res))  $fatal(1, "Randomization of the weights failed.");
+    return res;
+  endfunction : init_WEIGHTS
+  
+  typedef logic signed [PE-1:0][57:0] output_t;
+  typedef output_t output_vector_t [NF];
+
+  function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); // Golden model: plain matrix-vector product, folded exactly like the DUT I/O
+    automatic output_vector_t res = '{default: 0};
+    for (int j = 0; j<MH; j++) begin // output row
+      for (int i = 0; i<MW; i++) begin // dot-product element
+        res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed({SIGNED_ACTIVATIONS && a[i/SIMD][i%SIMD][ACTIVATION_WIDTH-1], a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]); // activation extended by one bit: its sign bit when SIGNED_ACTIVATIONS, else 0; weights are always signed
+      end
+    end  
+    return res;
+  endfunction : check_output
+  
+  logic clk = 0;
+  always #5ns clk = !clk; // free-running clock, toggles every 5 ns
+  
+  logic rst;
+  initial begin
+    rst = 1; // reset is asserted from time zero
+    repeat(16) @(posedge clk);
+    rst <= 0; // released after sixteen rising clock edges
+  end
+   
+  logic last;
+  logic zero;
+  logic vld;
+  activation_t a;
+  weight_t w;
+  output_t p;
+  // Reference signals
+  activation_vector_t ACTIVATIONS; //   [SF-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+  weight_matrix_t WEIGHTS; //           [NF-1:0][SF-1:0][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+  output_vector_t GOLDEN_OUTPUT; //     [NF-1:0][PE-1:0][57:0]
+  // Counter for number of outputs (NF dimension) that are produced
+  int NF_CNT = 0;
+  
+  logic en = 0; // Randomly stalled clock enable; declared ahead of the stimulus process that waits on it
+  always_ff @(posedge clk) begin
+    en <= ($urandom()%7 > 1) && !rst; // held low during reset, otherwise high whenever the random draw modulo 7 exceeds 1
+  end
+
+  initial begin
+    ACTIVATIONS = init_ACTIVATIONS();
+    WEIGHTS = init_WEIGHTS();
+    GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+    last = 0;
+    zero = 0;
+    a = 'x;
+    w = 'x;
+    
+    @(posedge clk iff !rst);
+
+    for (int j=0; j<NF; j++) begin
+      for (int i=0; i<SF; i++) begin
+        last <= (i==SF-1) ? 1 : 0; // flag the final beat of every dot product
+        a <= ACTIVATIONS[i];
+        w <= WEIGHTS[j][i];
+        @(posedge clk iff en); // advance only on cycles in which the DUT is enabled
+      end
+    end
+
+    last <= 0;
+    zero <= 1;  
+
+    // Continue until all NF outputs are produced & compared
+    @(posedge clk && (NF_CNT==NF));
+
+    $finish;
+  end
+
+  // Compare computed output against golden output when vld flag is raised by DUT
+  always_ff @(posedge clk iff (vld && en)) begin
+    foreach(p[i]) begin
+      assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+      else begin 
+        $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+        $stop;
+      end  
+    end
+    NF_CNT += 1;
+  end
+
+  // Instantiate DUT with the testbench configuration
+  mvu_8sx9 #(
+      .PE(PE),
+      .SIMD(SIMD),
+      .WEIGHT_WIDTH(WEIGHT_WIDTH),
+      .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+      .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+      .SEGMENTLEN(SEGMENTLEN)
+    )
+    dut (
+      .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p // implicit .name connections: each port binds to the testbench signal of the same name
+    );
+  
+endmodule

From 98f9accb40bed3445215e15d30398e09948e0b9f Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 6 Apr 2023 12:35:30 +0100
Subject: [PATCH 098/235] [rtl custom op]: initial implementation of flow
 control component for mvu_8sx9

---
 finn-rtllib/mvu/mvu_8sx9_axi.sv | 179 ++++++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi.sv

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv
new file mode 100644
index 0000000000..8765c50a26
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx9_axi.sv
@@ -0,0 +1,179 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Matrix Vector Unit (MVU) AXI-Stream interface wrapper.
+ *****************************************************************************/
+
+module mvu_8sx9_axi #(
+    int unsigned MW,                // Matrix width; must be a multiple of SIMD
+    int unsigned MH,                // Matrix height; must be a multiple of PE
+    int unsigned PE,                // Output lanes computed in parallel
+    int unsigned SIMD,              // Input elements consumed per beat
+    int unsigned ACTIVATION_WIDTH,  // Bits per activation (at most 9, at most 8 when unsigned)
+    int unsigned WEIGHT_WIDTH,      // Bits per weight (at most 8)
+    int unsigned ACCU_WIDTH,        // Bits kept from each 58-bit core result
+    bit SIGNED_ACTIVATIONS = 0,     // Forwarded to the core
+    int unsigned SEGMENTLEN = 0,    // Forwarded to the core; 0 selects the full chain length
+		parameter RAM_STYLE = "auto", // Forwarded to the activation replay buffer
+
+    localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, // Weight stream width rounded up to whole bytes
+    localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, // Input stream width rounded up to whole bytes
+		localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,              // Payload bits of one weight beat
+		localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,              // Payload bits of one input beat
+    localparam int unsigned SF = MW/SIMD, // Input beats per dot product
+		localparam int unsigned NF = MH/PE,   // Times each input vector is replayed
+    localparam int unsigned OUTPUT_LANES = PE,
+    localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 // Output stream width rounded up to whole bytes
+)
+(
+	// Global Control
+	input		logic  ap_clk,
+	input		logic  ap_rst_n, // Active-low reset
+
+	// Weight Stream
+	input		logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input		logic  s_axis_weights_tvalid,
+	output	logic  s_axis_weights_tready,
+
+	// Input Stream
+	input		logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input		logic  s_axis_input_tvalid,
+	output	logic  s_axis_input_tready,
+
+	// Output Stream
+	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
+	output	logic  m_axis_output_tvalid,
+	input		logic  m_axis_output_tready
+);
+
+//-------------------- Parameter sanity checks --------------------\\
+	initial begin : blkParamChecks // every violation is reported and then ends the simulation; SEGMENTLEN == 0 only warns
+		if ((MW % SIMD) > 0) begin
+			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
+			$finish;
+		end
+		if ((MH % PE) > 0) begin
+			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
+			$finish;
+		end
+		if (9 < ACTIVATION_WIDTH) begin
+			$error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
+			$finish;
+		end
+		if (8 < WEIGHT_WIDTH) begin
+			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
+			$finish;
+		end
+		if (!SIGNED_ACTIVATIONS && (ACTIVATION_WIDTH == 9)) begin // the ninth bit is only available to signed activations
+			$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
+			$finish;
+		end
+		if (SEGMENTLEN == 0) begin
+			$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
+		end
+		if ((SIMD+2)/3 < SEGMENTLEN) begin // (SIMD+2)/3 is the DSP chain length of the core
+			$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+			$finish;
+		end
+	end : blkParamChecks
+
+	uwire clk = ap_clk;
+	uwire rst = !ap_rst_n; // internal logic uses an active-high reset
+
+	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t; // input beat without the byte-alignment padding
+
+	uwire mvauin_t amvau; // replayed activation beat
+	uwire alast;          // replay buffer `olast` output
+	uwire afin;           // replay buffer `ofin` output; not used in this module
+	uwire avld;
+	uwire ardy;
+
+	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay (
+		.clk, .rst,
+		.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
+		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
+	);
+
+	//-------------------- Input control --------------------\\
+	uwire en; // core clock enable, driven by the output register slice
+	uwire istb = avld && s_axis_weights_tvalid;   // both operands are available
+	assign ardy = en && s_axis_weights_tvalid;    // an activation is consumed only together with a weight beat
+	assign s_axis_weights_tready = en && avld;    // and a weight beat only together with an activation
+
+	//-------------------- Core MVU --------------------\\
+	uwire ovld;
+	uwire [PE-1:0][57:0] odat;
+	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; // weight beat without the byte-alignment padding
+	mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+	.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core (
+		.clk, .rst, .en,
+		.last(alast && istb), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), // `last` is qualified by istb: the core shifts it on every enabled cycle, so it must stay low while either operand is missing
+		.vld(ovld), .p(odat)
+	);
+
+	//-------------------- Output register slice --------------------\\
+	struct {
+		logic vld;
+		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+	} A = '{ vld: 0, default: 'x}; // first stage: captures core results
+
+	assign en = !A.vld || !ovld; // the core is stalled only while A is occupied and a further result is waiting
+
+	uwire  b_load;
+	always_ff @(posedge clk) begin
+		if(rst)  A <= '{ vld: 0, default: 'x };
+		else if(!A.vld || b_load) begin // A may be (re)loaded when it is empty or its content moves on to the second stage
+			A.vld <= ovld && en;
+			for(int unsigned  i = 0; i < PE; i++) begin
+				// CR-1148862:
+				// A.dat[i] <= odat[i];
+				automatic logic [57:0]  v = odat[i]; // presumably a tool workaround for the direct assignment above - TODO confirm
+				A.dat[i] <= v[ACCU_WIDTH-1:0]; // keep only the low ACCU_WIDTH bits of the 58-bit result
+			end
+		end
+	end
+	
+	struct {
+		logic vld;
+		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+	} B = '{ vld: 0, default: 'x}; // second stage: drives the output stream
+
+	assign	b_load = !B.vld || m_axis_output_tready; // B may be loaded when it is empty or its content is being accepted
+	always_ff @(posedge clk) begin
+		if(rst)		B <= '{ vld: 0, default: 'x }; // vld must reset to 0: an X here would put X on m_axis_output_tvalid and on b_load
+		else begin
+			if(b_load)	 B <= '{ vld: A.vld, dat: A.dat};
+		end	
+	end
+
+	assign	m_axis_output_tvalid = B.vld;
+	assign	m_axis_output_tdata  = B.dat;
+
+endmodule
\ No newline at end of file

From 96925a929877ce084466438128678250b09784a9 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 6 Apr 2023 12:36:00 +0100
Subject: [PATCH 099/235] [rtl custom op]: implementation of replay buffer for
 mvu

---
 finn-rtllib/mvu/replay_buffer.sv | 109 +++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 finn-rtllib/mvu/replay_buffer.sv

diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
new file mode 100644
index 0000000000..685ac03137
--- /dev/null
+++ b/finn-rtllib/mvu/replay_buffer.sv
@@ -0,0 +1,109 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Replay buffer for counted sequences on an AXI-lite stream.
+ * @author	Thomas B. Preußer <thomas.preusser@amd.com>
+ *****************************************************************************/
+
+module replay_buffer #(
+	int unsigned  LEN,	// Sequence length
+	int unsigned  REP,	// Sequence replay count
+	int unsigned  W,	// Data width
+	parameter RAM_STYLE = "auto" 	// ram style for buffer {block, distributed, ultra, auto}
+)(
+	input	logic  clk,
+	input	logic  rst,	// synchronous; restarts the sequence/replay counting
+
+	input	logic [W-1:0]  idat,
+	input	logic  ivld,
+	output	logic  irdy,	// only asserted while the first repetition is passed through
+
+	output	logic [W-1:0]  odat,
+	output	logic  olast,	// current word completes a repetition
+	output	logic  ofin,	// current word completes the final repetition
+	output	logic  ovld,
+	input	logic  ordy
+);
+	// Count = { repetition index, position within sequence }; the position field is $clog2(LEN) bits wide.
+	typedef logic [$clog2(REP)+$clog2(LEN)-1:0]  count_t;
+	count_t  Count = 0;
+	uwire  done_len;	// position field has all bits of LEN-1 set
+	if(LEN == 1)  assign  done_len = 1;	// position field is empty: never elaborate Count[-1:0]
+	else          assign  done_len = ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0;
+	uwire  done_rep;
+	uwire  done_all = done_len && done_rep;
+	// Count advances on every output transfer; completing a repetition carries into the repetition index.
+	uwire  shift;
+	uwire  clr = rst || (done_all && shift);
+	always_ff @(posedge clk) begin
+		if(clr)         Count <= 0;
+		else if(shift)  Count <= Count + ((REP > 1) && done_len? 2**$clog2(LEN)-LEN+1 : 1);
+	end
+	// Data path: the first repetition forwards idat, all later ones replay rdat from the buffer.
+	typedef logic [W-1:0]  data_t;
+	uwire data_t  rdat;
+	uwire  first_rep;
+	if(REP == 1) begin
+		assign	done_rep  = 1;
+		assign	first_rep = 1;
+		assign	rdat = 'x;
+	end
+	else begin
+		assign	done_rep = ((REP-1) & ~Count[$left(Count):$clog2(LEN)]) == 0;
+		logic  FirstRep = 1;	// set until the first repetition has completed
+		always_ff @(posedge clk) begin
+			if(clr)         FirstRep <= 1;
+			else if(shift)  FirstRep <= FirstRep && !done_len;
+		end
+		assign	first_rep = FirstRep;
+		// genShift recirculates odat through the buffer; genTrivial captures the single word once.
+		(* RAM_STYLE = RAM_STYLE *)
+		data_t  Buf[LEN];
+		if(LEN == 1) begin : genTrivial
+			always_ff @(posedge clk) begin
+				if(shift && FirstRep)  Buf[0] <= idat;
+			end
+		end : genTrivial
+		else begin : genShift
+			always_ff @(posedge clk) begin
+				if(shift)  Buf <= { odat, Buf[0:LEN-2] };
+			end
+		end : genShift
+
+		assign	rdat = Buf[LEN-1];
+	end
+	// Handshake: a word is transferred (shift) whenever ovld && ordy; replayed words are always valid.
+	assign  irdy  = ordy && first_rep;
+	assign	odat  = first_rep? idat : rdat;
+	assign	olast = done_len;
+	assign	ofin  = done_all;
+	assign	ovld  = first_rep? ivld : 1;
+	assign	shift = ovld && ordy;
+endmodule : replay_buffer
\ No newline at end of file

From a3d11567468899bbcf33c83b509c26f908a807a3 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 6 Apr 2023 12:37:16 +0100
Subject: [PATCH 100/235] [rtl custom op]: testbench for mvu_8sx9_axi
 (including axi_wrapper & compute kernel)

---
 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv | 208 +++++++++++++++++++++++++++++
 1 file changed, 208 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
new file mode 100644
index 0000000000..ea97e0708c
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
@@ -0,0 +1,208 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for the MVU AXI-Stream interface wrapper (mvu_8sx9_axi).
+ *****************************************************************************/
+
+module mvu_8sx9_axi_tb();
+
+  //-------------------- Simulation parameters --------------------\\
+  // Matrix & parallelism config
+  localparam int unsigned MW = 600;
+  localparam int unsigned MH = 256;
+  localparam int unsigned SIMD = 60;
+  localparam int unsigned PE = 16;
+  localparam int unsigned SEGMENTLEN = 4;
+  // Bit-width config
+  localparam int unsigned ACTIVATION_WIDTH = 8;
+  localparam int unsigned WEIGHT_WIDTH = 4;
+  localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
+  localparam bit SIGNED_ACTIVATIONS = 1;
+  // Simulation constants
+  localparam int unsigned NF = MH/PE;   // output words expected per run
+  localparam int unsigned SF = MW/SIMD; // activation words fed per run
+  localparam int unsigned NUM_OF_DSP = SIMD/3;
+  localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
+  localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8;
+  localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
+  localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
+  localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
+
+  // Generate clk and reset signal
+  logic clk = 0;
+  always #5ns clk = !clk;
+
+  logic ap_rst_n = 0;
+  initial begin
+    repeat(16) @(posedge clk);
+    ap_rst_n <= 1;
+  end
+
+  uwire ap_clk = clk;
+
+  // Generate activations
+  typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+  typedef activation_t activation_vector_t[SF];
+
+  function activation_vector_t init_ACTIVATIONS;
+    automatic activation_vector_t res;
+    void'(std::randomize(res)); // status deliberately discarded
+    return res;
+  endfunction : init_ACTIVATIONS
+
+  activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
+
+  struct {
+    activation_t dat;
+    logic vld;
+    logic rdy;
+  } activations;
+  // Activation driver: presents each of the SF words once, with randomly gapped valid.
+  initial begin
+    activations.vld = 0;
+    activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
+    @(posedge clk iff ap_rst_n);
+
+    for (int i=0; i<SF; i++) begin
+      activations.dat <= ACTIVATIONS[i];
+      do begin
+        activations.vld <= $urandom()%7 > 1; // non-blocking like dat, so the DUT never samples a mid-edge update
+        @(posedge clk);
+      end while (!(activations.vld === 1 && activations.rdy === 1));
+    end
+
+    activations.vld <= 0;
+    activations.dat <= 'x;
+  end
+
+  // Generate weights
+  typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+  typedef weight_t weight_matrix_t[NF][SF];
+
+  function weight_matrix_t init_WEIGHTS;
+    automatic weight_matrix_t res;
+    void'(std::randomize(res)); // status deliberately discarded
+    return res;
+  endfunction : init_WEIGHTS
+
+  weight_matrix_t WEIGHTS = init_WEIGHTS();
+
+  struct {
+    weight_t dat;
+    logic vld;
+    logic rdy;
+  } weights;
+  // Weight driver: streams all NF*SF weight tiles with valid held high.
+  initial begin
+    weights.vld = 0;
+    weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
+    @(posedge clk iff ap_rst_n);
+
+    weights.vld <= 1;
+    for (int i=0; i<NF; i++) begin
+      for (int j=0; j<SF; j++) begin
+        weights.dat <= WEIGHTS[i][j];
+        @(posedge clk iff weights.rdy);
+      end
+    end
+
+    weights.vld <= 0;
+    weights.dat <= 'x;
+  end
+
+  // Function to compute golden output (weights are signed; activations are signed iff SIGNED_ACTIVATIONS)
+  // a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+  // w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+  typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
+  typedef output_t output_vector_t [NF];
+
+  struct {
+    output_t dat;
+    logic vld;
+    logic rdy;
+  } outputs;
+
+  function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+    automatic output_vector_t res = '{default: 0};
+    for (int j = 0; j<MH; j++) begin
+      for (int i = 0; i<MW; i++) begin
+        res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + (SIGNED_ACTIVATIONS ? $signed(a[i/SIMD][i%SIMD]) : $signed({1'b0, a[i/SIMD][i%SIMD]})) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+      end
+    end
+    return res;
+  endfunction : check_output
+
+  output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+
+  int unsigned NF_CNT = 0;
+  initial begin
+    outputs.rdy = 0;
+    while (NF_CNT < NF) begin
+      // Loop until both rdy & vld are asserted
+      do begin
+        outputs.rdy <= $urandom()%7 >= 1;
+        @(posedge clk iff ap_rst_n);
+      end while (!(outputs.rdy === 1 && outputs.vld === 1));
+
+      // Compare produced outputs against golden outputs
+      foreach(outputs.dat[i]) begin
+        assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+        else begin
+          $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+          $stop;
+        end
+      end
+
+      NF_CNT += 1;
+    end
+
+    $finish;
+  end
+
+  // Instantiate DUT
+  mvu_8sx9_axi #(
+      .MW(MW),
+      .MH(MH),
+      .PE(PE),
+      .SIMD(SIMD),
+      .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+      .WEIGHT_WIDTH(WEIGHT_WIDTH),
+      .ACCU_WIDTH(ACCU_WIDTH),
+      .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+      .SEGMENTLEN(SEGMENTLEN)
+    )
+    dut (
+      .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
+      .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
+      .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
+      .m_axis_output_tready(outputs.rdy)
+    );
+
+endmodule

From 2aea664b2260a4ea759909d0a3168b5f62b114a2 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 6 Apr 2023 12:37:55 +0100
Subject: [PATCH 101/235] [rtl custom op]: initial implementation of verilog
 wrapper for mvu_8sx9_axi

---
 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 90 ++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
new file mode 100644
index 0000000000..ff3779d211
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
@@ -0,0 +1,90 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Verilog AXI-Stream wrapper for MVU.
+ *****************************************************************************/
+
+module $MODULE_NAME_AXI_WRAPPER$ #(
+	parameter 	MW = $MW$,
+	parameter 	MH = $MH$,
+	parameter 	PE = $PE$,
+	parameter 	SIMD = $SIMD$,
+	parameter 	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
+	parameter 	WEIGHT_WIDTH = $WEIGHT_WIDTH$,
+	parameter 	ACCU_WIDTH = $ACCU_WIDTH$,
+	parameter 	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
+	parameter 	SEGMENTLEN = $SEGMENTLEN$,
+	parameter 	RAM_STYLE = $IBUF_RAM_STYLE$,
+
+	// Safely deducible parameters (stream widths rounded up to whole bytes)
+	parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+	parameter OUTPUT_LANES = PE,
+	parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+)(
+	// Global Control
+	input	ap_clk,
+	input	ap_rst_n,
+
+	// Weight Stream
+	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	s_axis_weights_tvalid,
+	output	s_axis_weights_tready,
+
+	// Input Stream
+	input	[INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	s_axis_input_tvalid,
+	output	s_axis_input_tready,
+
+	// Output Stream
+	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
+	output	m_axis_output_tvalid,
+	input	m_axis_output_tready
+);
+// Plain-Verilog shell: forwards every parameter and stream port unchanged to the mvu_8sx9_axi core.
+mvu_8sx9_axi #(
+	.MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+	.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+	.SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE)
+) inst (
+	.ap_clk(ap_clk),
+	.ap_rst_n(ap_rst_n),
+	.s_axis_weights_tdata(s_axis_weights_tdata),
+	.s_axis_weights_tvalid(s_axis_weights_tvalid),
+	.s_axis_weights_tready(s_axis_weights_tready),
+	.s_axis_input_tdata(s_axis_input_tdata),
+	.s_axis_input_tvalid(s_axis_input_tvalid),
+	.s_axis_input_tready(s_axis_input_tready),
+	.m_axis_output_tdata(m_axis_output_tdata),
+	.m_axis_output_tvalid(m_axis_output_tvalid),
+	.m_axis_output_tready(m_axis_output_tready)
+);
+
+endmodule // $MODULE_NAME_AXI_WRAPPER$
\ No newline at end of file

From 8b57849bb47c3119b177e78dcbaa48954f69b811 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Tue, 11 Apr 2023 15:50:24 +0100
Subject: [PATCH 102/235] [rtl mvu]: fix tab indentation

---
 finn-rtllib/mvu/mvu_8sx9.sv            | 424 ++++++++++++-------------
 finn-rtllib/mvu/mvu_8sx9_axi.sv        |  32 +-
 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv     | 342 ++++++++++----------
 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v |  26 +-
 finn-rtllib/mvu/mvu_8sx9_tb.sv         | 258 +++++++--------
 5 files changed, 541 insertions(+), 541 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index c992990d9f..d082d4fb2e 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -52,233 +52,233 @@ module mvu_8sx9 #(
   );
 
 //-------------------- Declare global signals --------------------\\
-localparam int unsigned CHAINLEN = (SIMD+2)/3;
-localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
-uwire [26:0] a_in_i [CHAINLEN];
-uwire [23:0] b_in_i [PE][CHAINLEN];
-uwire [57:0] pcout [PE][CHAINLEN];
+	localparam int unsigned CHAINLEN = (SIMD+2)/3;
+	localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
+	uwire [26:0] a_in_i [CHAINLEN];
+	uwire [23:0] b_in_i [PE][CHAINLEN];
+	uwire [57:0] pcout [PE][CHAINLEN];
 
 //-------------------- Shift register for opmode select signal --------------------\\
-localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
-logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric)
+	localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
+	logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric)
 
-always_ff @(posedge clk) begin
-  if(rst)     L <= '{default: 0};
-  else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last };
-end  
-assign vld = L[0];
+	always_ff @(posedge clk) begin
+		if(rst)     L <= '{default: 0};
+		else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last };
+	end  
+	assign vld = L[0];
 
 //-------------------- Shift register for ZERO flag --------------------\\
-logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric)
+	logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric)
 
-if (MAX_PIPELINE_STAGES > 1) begin : genZreg
-  always_ff @(posedge clk) begin
-    if (rst)      Z <= '{default: 0};
-    else if(en) begin
-        Z[0] <= zero;
-        if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2];
-    end    
-  end
-end;
+	if (MAX_PIPELINE_STAGES > 1) begin : genZreg
+		always_ff @(posedge clk) begin
+			if (rst)      Z <= '{default: 0};
+			else if(en) begin
+				Z[0] <= zero;
+				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2];
+			end    
+		end
+	end;
 
 //-------------------- Buffer for input activations --------------------\\
-localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
-typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t;
+	localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
+	typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t;
 
-for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
-  localparam int TOTAL_PREGS = i/SEGLEN;
-  localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
-  
-  if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
-    a_buffer_t A [0:EXTERNAL_PREGS-1];
-    always_ff @(posedge clk) begin
-      if (rst)     A <= '{default: 0};
-      else if(en) begin
-        A[EXTERNAL_PREGS-1] <= a[3*i +: 3];
-        if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
-      end
-    end
-    assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]}
-                             : { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ;
-  end : genExternalPregAct
-  else begin : genInpDSPAct
-    assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]}
-                             : { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ;
-  end : genInpDSPAct
+	for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
+		localparam int TOTAL_PREGS = i/SEGLEN;
+		localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
 
-end : genActSIMD
+		if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
+			a_buffer_t A [0:EXTERNAL_PREGS-1];
+			always_ff @(posedge clk) begin
+				if (rst)     A <= '{default: 0};
+				else if(en) begin
+					A[EXTERNAL_PREGS-1] <= a[3*i +: 3];
+					if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
+				end
+			end
+			assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]}
+									: { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ;
+		end : genExternalPregAct
+		else begin : genInpDSPAct
+			assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]}
+									: { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ;
+		end : genInpDSPAct
+
+	end : genActSIMD
 
 //-------------------- Buffer for weights --------------------\\
-localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;
-typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t;
+	localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;
+	typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t;
 
-for (genvar j=0; j<PE; j++) begin : genWeightPE
-  for (genvar i=0; i<CHAINLEN; i++) begin : genWeightSIMD
-    localparam int TOTAL_PREGS = i/SEGLEN;
-    localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
-    
-    if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
-      b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1];
-      always_ff @(posedge clk) begin
-        if (rst)    B <= '{default: 0};
-        else if (en) begin
-          B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3];
-          if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1];
-        end
-      end
-      assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] };
-    end : genExternalPregWeight
-    else begin : genInpDSPWeight
-      assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] };
-    end : genInpDSPWeight
-  end : genWeightSIMD
+	for (genvar j=0; j<PE; j++) begin : genWeightPE
+		for (genvar i=0; i<CHAINLEN; i++) begin : genWeightSIMD
+			localparam int TOTAL_PREGS = i/SEGLEN;
+			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
 
-end : genWeightPE
+			if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
+				b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1];
+				always_ff @(posedge clk) begin
+					if (rst)    B <= '{default: 0};
+					else if (en) begin
+						B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3];
+						if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1];
+					end
+				end
+				assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] };
+			end : genExternalPregWeight
+			else begin : genInpDSPWeight
+				assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] };
+			end : genInpDSPWeight
+		end : genWeightSIMD
+
+	end : genWeightPE
 
 //-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\
-for (genvar j=0; j<PE; j++) begin : genDSPPE
-  for (genvar i=0; i<CHAINLEN; i++) begin : genDSPChain
-    localparam int TOTAL_PREGS = i/SEGLEN;
-    localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1; // 1 : 0
-    localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1;
-    localparam bit FIRST = i == 0;
-    localparam bit LAST = i == CHAINLEN-1;
-    uwire [57:0] pp;
-    
-    if (LAST) begin : genPOUT
-      assign p[j] = pp;
-    end      
-    
-    DSP58 #(
-      // Feature Control Attributes: Data Path Selection
-      .AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
-      .A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
-      .BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
-      .B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
-      .DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
-                                          // legacy mode.
-      .PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
-      .RND(58'h000000000000000),          // Rounding Constant
-      .USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
-      .USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
-      .USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
-      .XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
-      // Pattern Detector Attributes: Pattern Detection Configuration
-      .AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
-      .AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
-      .MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
-      .PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
-      .SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
-      .SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
-      .USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
-      // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
-      .IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
-      .IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
-      .IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
-      .IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
-      .IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
-      .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 
-                            FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN 
-                            2'b01, // Y : M
-                            2'b01  // X: M
-        }), // Optional inversion for OPMODE
-      .IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
-      .IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
-      .IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
-      .IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
-      .IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for STCONJUGATE_A
-      .IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
-      .IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
-      .IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
-      .IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
-      .IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
-      // Register Control Attributes: Pipeline Register Configuration
-      .ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
-      .ADREG(0),                          // Pipeline stages for pre-adder (0-1)
-      .ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
-      .AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
-      .BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
-      .BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
-      .CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
-      .CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
-      .CREG(0),                           // Pipeline stages for C (0-1)
-      .DREG(0),                           // Pipeline stages for D (0-1)
-      .INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
-      .MREG(1),                           // Multiplier pipeline stages (0-1)
-      .OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
-      .PREG(PREG),                        // Number of pipeline stages for P (0-1)
-      .RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
-    )
-    DSP58_inst (
-      // Cascade outputs: Cascade Ports
-      .ACOUT(),                           // 34-bit output: A port cascade
-      .BCOUT(),                           // 24-bit output: B cascade
-      .CARRYCASCOUT(),                    // 1-bit output: Cascade carry
-      .MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
-      .PCOUT(pcout[j][i]),                // 58-bit output: Cascade output
-      // Control outputs: Control Inputs/Status Bits
-      .OVERFLOW(),                        // 1-bit output: Overflow in add/acc
-      .PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
-      .PATTERNDETECT(),                   // 1-bit output: Pattern detect
-      .UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
-      // Data outputs: Data Ports
-      .CARRYOUT(),                        // 4-bit output: Carry
-      .P(pp),                             // 58-bit output: Primary data
-      .XOROUT(),                          // 8-bit output: XOR data
-      // Cascade inputs: Cascade Ports
-      .ACIN('x),                          // 34-bit input: A cascade data
-      .BCIN('x),                          // 24-bit input: B cascade
-      .CARRYCASCIN('x),                   // 1-bit input: Cascade carry
-      .MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
-      .PCIN(FIRST ? 'x : pcout[j][i-1]),  // 58-bit input: P cascade
-      // Control inputs: Control Inputs/Status Bits
-      .ALUMODE(4'h0),                     // 4-bit input: ALU control
-      .CARRYINSEL('0),                    // 3-bit input: Carry select
-      .CLK(clk),                          // 1-bit input: Clock
-      .INMODE({
-              INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
-              2'b00,
-              TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
-              INTERNAL_PREGS==2 ? 1'b0 : 1'b1        
-      }),                                 // 5-bit input: INMODE control
-      .NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
-      .OPMODE({
-              LAST ? {1'b0, L[1]} : 2'b00,
-              7'b000_0000
-      }), // 9-bit input: Operation mode
-      // Data inputs: Data Ports
-      .A({ 7'bx, a_in_i[i] }),            // 34-bit input: A data
-      .B(b_in_i[j][i]),                   // 24-bit input: B data
-      .C('x),                             // 58-bit input: C data
-      .CARRYIN('0),                       // 1-bit input: Carry-in
-      .D('x),                             // 27-bit input: D data
-      // Reset/Clock Enable inputs: Reset/Clock Enable Inputs
-      .ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
-      .CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
-      .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
-      .CEAD('0),                          // 1-bit input: Clock enable for ADREG
-      .CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
-      .CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
-      .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
-      .CEC('0),                           // 1-bit input: Clock enable for CREG
-      .CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
-      .CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
-      .CED('0),                           // 1-bit input: Clock enable for DREG
-      .CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
-      .CEM(en),                           // 1-bit input: Clock enable for MREG
-      .CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
-      .RSTA(rst),                         // 1-bit input: Reset for AREG
-      .RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
-      .RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
-      .RSTB(rst),                         // 1-bit input: Reset for BREG
-      .RSTC('0),                          // 1-bit input: Reset for CREG
-      .RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
-      .RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
-      .RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
-      .RSTM(rst),                         // 1-bit input: Reset for MREG
-      .RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
-    );
-  end : genDSPChain  
-end : genDSPPE
+	for (genvar j=0; j<PE; j++) begin : genDSPPE
+		for (genvar i=0; i<CHAINLEN; i++) begin : genDSPChain
+			localparam int TOTAL_PREGS = i/SEGLEN; // Integer quotient: grows by one for every SEGLEN chain positions
+			localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1; // NOTE(review): original remark "1 : 0" looks like a leftover alternative setting -- confirm
+			localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1; // P register at every SEGLEN-th chain position and at the end of the chain
+			localparam bit FIRST = i == 0;
+			localparam bit LAST = i == CHAINLEN-1;
+			uwire [57:0] pp;
+
+			if (LAST) begin : genPOUT // Only the last DSP of chain j drives p[j]
+				assign p[j] = pp;
+			end      
+
+			DSP58 #(
+				// Feature Control Attributes: Data Path Selection
+				.AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
+				.A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+				.BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
+				.B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+				.DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
+													// legacy mode.
+				.PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
+				.RND(58'h000000000000000),          // Rounding Constant
+				.USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+				.USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
+				.USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
+				.XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
+				// Pattern Detector Attributes: Pattern Detection Configuration
+				.AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+				.AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
+				.MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
+				.PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
+				.SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+				.SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
+				.USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
+				// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+				.IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
+				.IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
+				.IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
+				.IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
+				.IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
+				.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 
+									FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN 
+									2'b01, // Y : M
+									2'b01  // X: M
+				}), // Optional inversion for OPMODE
+				.IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
+				.IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
+				.IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
+				.IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
+				.IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for RSTCTRL
+				.IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
+				.IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
+				.IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
+				.IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
+				.IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
+				// Register Control Attributes: Pipeline Register Configuration
+				.ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+				.ADREG(0),                          // Pipeline stages for pre-adder (0-1)
+				.ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
+				.AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
+				.BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+				.BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
+				.CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
+				.CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
+				.CREG(0),                           // Pipeline stages for C (0-1)
+				.DREG(0),                           // Pipeline stages for D (0-1)
+				.INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
+				.MREG(1),                           // Multiplier pipeline stages (0-1)
+				.OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
+				.PREG(PREG),                        // Number of pipeline stages for P (0-1)
+				.RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
+			)
+			DSP58_inst (
+				// Cascade outputs: Cascade Ports
+				.ACOUT(),                           // 34-bit output: A port cascade
+				.BCOUT(),                           // 24-bit output: B cascade
+				.CARRYCASCOUT(),                    // 1-bit output: Cascade carry
+				.MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
+				.PCOUT(pcout[j][i]),                // 58-bit output: Cascade output
+				// Control outputs: Control Inputs/Status Bits
+				.OVERFLOW(),                        // 1-bit output: Overflow in add/acc
+				.PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
+				.PATTERNDETECT(),                   // 1-bit output: Pattern detect
+				.UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
+				// Data outputs: Data Ports
+				.CARRYOUT(),                        // 4-bit output: Carry
+				.P(pp),                             // 58-bit output: Primary data
+				.XOROUT(),                          // 8-bit output: XOR data
+				// Cascade inputs: Cascade Ports
+				.ACIN('x),                          // 34-bit input: A cascade data
+				.BCIN('x),                          // 24-bit input: B cascade
+				.CARRYCASCIN('x),                   // 1-bit input: Cascade carry
+				.MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
+				.PCIN(FIRST ? 'x : pcout[j][i-1]),  // 58-bit input: P cascade
+				// Control inputs: Control Inputs/Status Bits
+				.ALUMODE(4'h0),                     // 4-bit input: ALU control
+				.CARRYINSEL('0),                    // 3-bit input: Carry select
+				.CLK(clk),                          // 1-bit input: Clock
+				.INMODE({
+						INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
+						2'b00,
+						TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
+						INTERNAL_PREGS==2 ? 1'b0 : 1'b1        
+				}),                                 // 5-bit input: INMODE control
+				.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
+				.OPMODE({
+						LAST ? {1'b0, L[1]} : 2'b00,
+						7'b000_0000
+				}), // 9-bit input: Operation mode
+				// Data inputs: Data Ports
+				.A({ 7'bx, a_in_i[i] }),            // 34-bit input: A data
+				.B(b_in_i[j][i]),                   // 24-bit input: B data
+				.C('x),                             // 58-bit input: C data
+				.CARRYIN('0),                       // 1-bit input: Carry-in
+				.D('x),                             // 27-bit input: D data
+				// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+				.ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
+				.CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
+				.CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
+				.CEAD('0),                          // 1-bit input: Clock enable for ADREG
+				.CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
+				.CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
+				.CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
+				.CEC('0),                           // 1-bit input: Clock enable for CREG
+				.CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
+				.CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+				.CED('0),                           // 1-bit input: Clock enable for DREG
+				.CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
+				.CEM(en),                           // 1-bit input: Clock enable for MREG
+				.CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
+				.RSTA(rst),                         // 1-bit input: Reset for AREG
+				.RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
+				.RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
+				.RSTB(rst),                         // 1-bit input: Reset for BREG
+				.RSTC('0),                          // 1-bit input: Reset for CREG
+				.RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+				.RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
+				.RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
+				.RSTM(rst),                         // 1-bit input: Reset for MREG
+				.RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
+			);
+		end : genDSPChain  
+	end : genDSPPE
     
 endmodule
diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv
index 8765c50a26..6c7eaeaeca 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi.sv
+++ b/finn-rtllib/mvu/mvu_8sx9_axi.sv
@@ -41,36 +41,36 @@ module mvu_8sx9_axi #(
     int unsigned ACCU_WIDTH,
     bit SIGNED_ACTIVATIONS = 0,
     int unsigned SEGMENTLEN = 0,
-		parameter RAM_STYLE = "auto",
+	parameter RAM_STYLE = "auto",
 
     localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
     localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
-		localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
-		localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
+	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
+	localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
     localparam int unsigned SF = MW/SIMD,
-		localparam int unsigned NF = MH/PE,
+	localparam int unsigned NF = MH/PE,
     localparam int unsigned OUTPUT_LANES = PE,
     localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )
 (
 	// Global Control
-	input		logic  ap_clk,
-	input		logic  ap_rst_n,
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,
 
 	// Weight Stream
-	input		logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input		logic  s_axis_weights_tvalid,
+	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	logic  s_axis_weights_tvalid,
 	output	logic  s_axis_weights_tready,
 
 	// Input Stream
-	input		logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input		logic  s_axis_input_tvalid,
+	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	logic  s_axis_input_tvalid,
 	output	logic  s_axis_input_tready,
 
 	// Output Stream
 	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
 	output	logic  m_axis_output_tvalid,
-	input		logic  m_axis_output_tready
+	input	logic  m_axis_output_tready
 );
 
 //-------------------- Parameter sanity checks --------------------\\
@@ -121,13 +121,13 @@ module mvu_8sx9_axi #(
 		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
 	);
 
-	//-------------------- Input control --------------------\\
+//-------------------- Input control --------------------\\
 	uwire en;
 	uwire istb = avld && s_axis_weights_tvalid;
 	assign ardy = en && s_axis_weights_tvalid;
 	assign s_axis_weights_tready = en && avld;
 
-	//-------------------- Core MVU --------------------\\
+//-------------------- Core MVU --------------------\\
 	uwire ovld;
 	uwire [PE-1:0][57:0] odat;
 	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
@@ -138,7 +138,7 @@ module mvu_8sx9_axi #(
 		.vld(ovld), .p(odat)
 	);
 
-	//-------------------- Output register slice --------------------\\
+//-------------------- Output register slice --------------------\\
 	struct {
 		logic vld;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
@@ -148,7 +148,7 @@ module mvu_8sx9_axi #(
 
 	uwire  b_load;
 	always_ff @(posedge clk) begin
-		if(rst)  A <= '{ vld: 0, default: 'x };
+		if(rst)		A <= '{ vld: 0, default: 'x };
 		else if(!A.vld || b_load) begin
 			A.vld <= ovld && en;
 			for(int unsigned  i = 0; i < PE; i++) begin
@@ -169,7 +169,7 @@ module mvu_8sx9_axi #(
 	always_ff @(posedge clk) begin
 		if(rst)		B <= '{ default: 'x };
 		else begin
-			if(b_load)	 B <= '{ vld: A.vld, dat: A.dat};
+			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
 		end	
 	end
 
diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
index ea97e0708c..70ffa096ef 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
@@ -33,176 +33,176 @@
 
 module mvu_8sx9_axi_tb();
 
-  //-------------------- Simulation parameters --------------------\\
-  // Matrix & parallelism config
-  localparam int unsigned MW = 600;
-  localparam int unsigned MH = 256;
-  localparam int unsigned SIMD = 60;
-  localparam int unsigned PE = 16;
-  localparam int unsigned SEGMENTLEN = 4;
-  // Bit-width config  
-  localparam int unsigned ACTIVATION_WIDTH = 8;
-  localparam int unsigned WEIGHT_WIDTH = 4;
-  localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
-  localparam bit SIGNED_ACTIVATIONS = 1;
-  // Simulation constants  
-  localparam int unsigned NF = MH/PE;
-  localparam int unsigned SF = MW/SIMD;
-  localparam int unsigned NUM_OF_DSP = SIMD/3;
-  localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
-  localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8;
-  localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
-  localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
-  localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
-
-  // Generate clk and reset signal   
-  logic clk = 0;
-  always #5ns clk = !clk;
-  
-  logic ap_rst_n = 0;
-  initial begin
-    repeat(16) @(posedge clk);
-    ap_rst_n <= 1;
-  end
-
-  uwire ap_clk = clk;
-
-  // Generate activations  
-  typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-  typedef activation_t activation_vector_t[SF];
-    
-  function activation_vector_t init_ACTIVATIONS;
-    automatic activation_vector_t res;
-    std::randomize(res);
-    return res;
-  endfunction : init_ACTIVATIONS
-
-  activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
-
-  struct {
-    activation_t dat;
-    logic vld;
-    logic rdy;
-  } activations;
-
-  initial begin
-    activations.vld = 0;
-    activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
-    @(posedge clk iff ap_rst_n);
-
-    for (int i=0; i<SF; i++) begin
-      activations.dat <= ACTIVATIONS[i];
-      do begin 
-        activations.vld = $urandom()%7 > 1;
-        @(posedge clk);
-      end while (!(activations.vld === 1 && activations.rdy === 1));
-    end
-    
-    activations.vld <= 0;
-    activations.dat <= 'x;
-  end
-   
-  // Generate weights   
-  typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-  typedef weight_t weight_matrix_t[NF][SF]; 
-  
-  function weight_matrix_t init_WEIGHTS;
-    automatic weight_matrix_t res;
-    std::randomize(res);
-    return res;
-  endfunction : init_WEIGHTS;
-
-  weight_matrix_t WEIGHTS = init_WEIGHTS();
-
-  struct {
-    weight_t dat;
-    logic vld;
-    logic rdy;
-  } weights;
-
-  initial begin
-    weights.vld = 0;
-    weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
-    @(posedge clk iff ap_rst_n);
-
-    weights.vld <= 1;
-    for (int i=0; i<NF; i++) begin
-      for (int j=0; j<SF; j++) begin
-        weights.dat <= WEIGHTS[i][j];
-        @(posedge clk iff weights.rdy);
-      end
-    end
-
-    weights.vld <= 0;
-    weights.dat <= 'x;
-  end
-  
-  // Function to compute golden output  
-  // a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
-  // w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
-  typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
-  typedef output_t output_vector_t [NF];
-
-  struct {
-    output_t dat;
-    logic vld;
-    logic rdy;
-  } outputs;
-
-  function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
-    automatic output_vector_t res = '{default: 0};
-    for (int j = 0; j<MH; j++) begin
-      for (int i = 0; i<MW; i++) begin
-        res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-      end
-    end  
-    return res;
-  endfunction : check_output;
-
-  output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
-
-  int unsigned NF_CNT = 0;
-  initial begin
-    outputs.rdy = 0;
-    while (NF_CNT < NF) begin
-      // Loop until both rdy & vld are asserted
-      do begin
-        outputs.rdy <= $urandom()%7 >= 1;
-        @(posedge clk iff ap_rst_n);
-      end while (!(outputs.rdy === 1 && outputs.vld === 1));
-
-      // Compare produced outputs against golden outputs
-      foreach(outputs.dat[i]) begin
-        assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-        else begin 
-          $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-          $stop;
-        end  
-      end
-      
-      NF_CNT += 1;
-    end
-    
-    $finish;  
-  end
-
-  // Instantiate DUT
-  mvu_8sx9_axi #(
-      .MW(MW),
-      .MH(MH),
-      .PE(PE),
-      .SIMD(SIMD),
-      .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-      .WEIGHT_WIDTH(WEIGHT_WIDTH),
-      .ACCU_WIDTH(ACCU_WIDTH),
-      .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-      .SEGMENTLEN(SEGMENTLEN)
-    )
-    dut (
-      .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
-      .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
-      .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
-      .m_axis_output_tready(outputs.rdy)
-    );
+//-------------------- Simulation parameters --------------------\\
+	// Matrix & parallelism config
+	localparam int unsigned MW = 600;
+	localparam int unsigned MH = 256;
+	localparam int unsigned SIMD = 60;
+	localparam int unsigned PE = 16;
+	localparam int unsigned SEGMENTLEN = 4;
+	// Bit-width config  
+	localparam int unsigned ACTIVATION_WIDTH = 8;
+	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
+	localparam bit SIGNED_ACTIVATIONS = 1;
+	// Simulation constants  
+	localparam int unsigned NF = MH/PE;
+	localparam int unsigned SF = MW/SIMD;
+	localparam int unsigned NUM_OF_DSP = SIMD/3;
+	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
+	localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8;
+	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
+	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
+
+	// Generate clk (10ns period) and reset signal (ap_rst_n is released after 16 rising clock edges)
+	logic clk = 0;
+	always #5ns clk = !clk;
+
+	logic ap_rst_n = 0;
+	initial begin
+		repeat(16) @(posedge clk);
+		ap_rst_n <= 1;
+	end
+
+	uwire ap_clk = clk;
+
+	// Generate random activations and drive them into the DUT with a randomly throttled valid
+	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+	typedef activation_t activation_vector_t[SF];
+
+	function activation_vector_t init_ACTIVATIONS;
+		automatic activation_vector_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_ACTIVATIONS
+
+	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
+
+	struct {
+		activation_t dat;
+		logic vld;
+		logic rdy;
+	} activations;
+
+	initial begin
+		activations.vld = 0;
+		activations.dat = '1; // Drive all-ones instead of 'X while idle: ('X AND 0) results in 'X in simulation, which would be propagated through the DSP chain
+		@(posedge clk iff ap_rst_n);
+
+		for (int i=0; i<SF; i++) begin
+			activations.dat <= ACTIVATIONS[i];
+			do begin 
+				activations.vld = $urandom()%7 > 1; // valid for 5 of the 7 possible remainders
+				@(posedge clk);
+			end while (!(activations.vld === 1 && activations.rdy === 1));
+		end
+
+		activations.vld <= 0;
+		activations.dat <= 'x;
+	end
+
+	// Generate random weights; valid stays asserted for the whole NF x SF sequence
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+	typedef weight_t weight_matrix_t[NF][SF]; 
+
+	function weight_matrix_t init_WEIGHTS;
+		automatic weight_matrix_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_WEIGHTS;
+
+	weight_matrix_t WEIGHTS = init_WEIGHTS();
+
+	struct {
+		weight_t dat;
+		logic vld;
+		logic rdy;
+	} weights;
+
+	initial begin
+		weights.vld = 0;
+		weights.dat = '1; // Drive all-ones instead of 'X while idle: ('X AND 0) results in 'X in simulation, which would be propagated through the DSP chain
+		@(posedge clk iff ap_rst_n);
+
+		weights.vld <= 1;
+		for (int i=0; i<NF; i++) begin
+			for (int j=0; j<SF; j++) begin
+				weights.dat <= WEIGHTS[i][j];
+				@(posedge clk iff weights.rdy); // move on only at a clock edge where rdy is high (vld is held at 1)
+			end
+		end
+
+		weights.vld <= 0;
+		weights.dat <= 'x;
+	end
+
+	// Golden model: for each of the MH rows, accumulate the signed products of all MW activations and weights
+	// a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
+	typedef output_t output_vector_t [NF];
+
+	struct {
+		output_t dat;
+		logic vld;
+		logic rdy;
+	} outputs;
+
+	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+		automatic output_vector_t res = '{default: 0};
+		for (int j = 0; j<MH; j++) begin
+			for (int i = 0; i<MW; i++) begin
+				res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+			end
+		end  
+		return res;
+	endfunction : check_output;
+
+	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+
+	int unsigned NF_CNT = 0;
+	initial begin
+		outputs.rdy = 0;
+		while (NF_CNT < NF) begin
+			// Loop until both rdy & vld are asserted
+			do begin
+				outputs.rdy <= $urandom()%7 >= 1; // ready for 6 of the 7 possible remainders
+				@(posedge clk iff ap_rst_n);
+			end while (!(outputs.rdy === 1 && outputs.vld === 1));
+
+			// Compare every PE lane of the produced output word against the golden output; stop at the first mismatch
+			foreach(outputs.dat[i]) begin
+				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+				else begin 
+					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+					$stop;
+				end  
+			end
+			
+			NF_CNT += 1;
+		end
+
+		$finish;  
+	end
+
+	// Instantiate DUT; the stream data is zero-padded up to the byte-aligned port widths
+	mvu_8sx9_axi #(
+		.MW(MW),
+		.MH(MH),
+		.PE(PE),
+		.SIMD(SIMD),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.SEGMENTLEN(SEGMENTLEN)
+	)
+	dut (
+		.ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
+		.s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
+		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
+		.m_axis_output_tready(outputs.rdy)
+	);
   
 endmodule
diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
index ff3779d211..2456eb3a47 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
@@ -33,7 +33,7 @@
 
 module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	MW = $MW$,
-	parameter		MH = $MH$,
+	parameter	MH = $MH$,
 	parameter 	PE = $PE$,
 	parameter 	SIMD = $SIMD$,
 	parameter 	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
@@ -44,29 +44,29 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	RAM_STYLE = $IBUF_RAM_STYLE$,
 
 	// Safely deducible parameters
-	parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
-	parameter OUTPUT_LANES = PE,
-	parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	parameter 	INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+	parameter 	OUTPUT_LANES = PE,
+	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )(
-  // Global Control
-	input		logic  ap_clk,
-	input		logic  ap_rst_n,
+  	// Global Control
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,
 
 	// Weight Stream
-	input		logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input		logic  s_axis_weights_tvalid,
+	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	logic  s_axis_weights_tvalid,
 	output	logic  s_axis_weights_tready,
 
 	// Input Stream
-	input		logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input		logic  s_axis_input_tvalid,
+	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	logic  s_axis_input_tvalid,
 	output	logic  s_axis_input_tready,
 
 	// Output Stream
 	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
 	output	logic  m_axis_output_tvalid,
-	input		logic  m_axis_output_tready
+	input	logic  m_axis_output_tready
 );
 
 mvu_8sx9_axi #(
diff --git a/finn-rtllib/mvu/mvu_8sx9_tb.sv b/finn-rtllib/mvu/mvu_8sx9_tb.sv
index ea3ecbbd70..adf6a8f9c2 100644
--- a/finn-rtllib/mvu/mvu_8sx9_tb.sv
+++ b/finn-rtllib/mvu/mvu_8sx9_tb.sv
@@ -33,133 +33,133 @@
 
 module mvu_8sx9_tb();
 
-  //-------------------- Simulation parameters --------------------\\
-  // Matrix & parallelism config
-  localparam int unsigned MH = 256;
-  localparam int unsigned PE = 16;
-  localparam int unsigned MW = 600;
-  localparam int unsigned SIMD = 60;
-  localparam int unsigned SEGMENTLEN = 4;
-  // Bit-width config  
-  localparam int unsigned ACTIVATION_WIDTH = 8;
-  localparam int unsigned WEIGHT_WIDTH = 4;
-  localparam bit SIGNED_ACTIVATIONS = 1;
-  // Simulation constants
-  localparam int unsigned NF = MH/PE;
-  localparam int unsigned SF = MW/SIMD;
-  localparam int unsigned NUM_OF_DSP = SIMD/3;
-  
-  typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-  typedef activation_t activation_vector_t[SF];
-
-  function activation_vector_t init_ACTIVATIONS;
-    automatic activation_vector_t res;
-    std::randomize(res);
-    return res;
-  endfunction : init_ACTIVATIONS
-
-  typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-  typedef weight_t weight_matrix_t[NF][SF];
-  
-  function weight_matrix_t init_WEIGHTS;
-    automatic weight_matrix_t res;
-    std::randomize(res);
-    return res;
-  endfunction : init_WEIGHTS;
-  
-  typedef logic signed [PE-1:0][57:0] output_t;
-  typedef output_t output_vector_t [NF];
-
-  function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
-    automatic output_vector_t res = '{default: 0};
-    for (int j = 0; j<MH; j++) begin
-      for (int i = 0; i<MW; i++) begin
-        res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-      end
-    end  
-    return res;
-  endfunction : check_output;
-  
-  logic clk = 0;
-  always #5ns clk = !clk;
-  
-  logic rst;
-  initial begin
-    rst = 1;
-    repeat(16) @(posedge clk);
-    rst <= 0;
-  end
-   
-  logic last;
-  logic zero;
-  logic vld;
-  activation_t a;
-  weight_t w;
-  output_t p;
-  // Reference signals
-  activation_vector_t ACTIVATIONS; //   [SF-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0]
-  weight_matrix_t WEIGHTS; //           [NF-1:0][SF-1:0][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
-  output_vector_t GOLDEN_OUTPUT; //     [NF-1:0][PE-1:0][57:0]
-  // Counter for number of outputs (NF dimension) that are produced
-  int NF_CNT = 0;
-  
-  initial begin
-    ACTIVATIONS = init_ACTIVATIONS();
-    WEIGHTS = init_WEIGHTS();
-    GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
-    last = 0;
-    zero = 0;
-    a = 'x;
-    w = 'x;
-    
-    @(posedge clk iff !rst);
-
-    for (int j=0; j<NF; j++) begin
-      for (int i=0; i<SF; i++) begin
-        last <= (i==SF-1) ? 1 : 0;
-        a <= ACTIVATIONS[i];
-        w <= WEIGHTS[j][i];
-        @(posedge clk iff en);
-      end
-    end
-
-    last <= 0;
-    zero <= 1;  
-
-    // Continue until all NF outputs are produced & compared
-    @(posedge clk && (NF_CNT==NF));
-
-    $finish;
-  end
-
-  logic en = 0;
-  always_ff @(posedge clk) begin
-    en <= ($urandom()%7 > 1) && !rst;
-  end
-
-  // Compare computed output against golden output when vld flag is raised by DUT
-  always_ff @(posedge clk iff (vld && en)) begin
-    foreach(p[i]) begin
-      assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-      else begin 
-        $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-        $stop;
-      end  
-    end
-    NF_CNT += 1;
-  end
-
-  // Instantiate DUT
-  mvu_8sx9 #(
-      .PE(PE),
-      .SIMD(SIMD),
-      .WEIGHT_WIDTH(WEIGHT_WIDTH),
-      .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-      .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-      .SEGMENTLEN(SEGMENTLEN)
-    )
-    dut (
-      .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p
-    );
-  
+//-------------------- Simulation parameters --------------------\\
+	// Matrix & parallelism config
+	localparam int unsigned MH = 256;
+	localparam int unsigned PE = 16;
+	localparam int unsigned MW = 600;
+	localparam int unsigned SIMD = 60;
+	localparam int unsigned SEGMENTLEN = 4;
+	// Bit-width config  
+	localparam int unsigned ACTIVATION_WIDTH = 8;
+	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam bit SIGNED_ACTIVATIONS = 1;
+	// Simulation constants
+	localparam int unsigned NF = MH/PE;
+	localparam int unsigned SF = MW/SIMD;
+	localparam int unsigned NUM_OF_DSP = SIMD/3;
+
+	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+	typedef activation_t activation_vector_t[SF];
+
+	function activation_vector_t init_ACTIVATIONS;
+		automatic activation_vector_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_ACTIVATIONS
+
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+	typedef weight_t weight_matrix_t[NF][SF];
+
+	function weight_matrix_t init_WEIGHTS;
+		automatic weight_matrix_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_WEIGHTS;
+
+	typedef logic signed [PE-1:0][57:0] output_t;
+	typedef output_t output_vector_t [NF];
+
+	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+		automatic output_vector_t res = '{default: 0};
+		for (int j = 0; j<MH; j++) begin
+			for (int i = 0; i<MW; i++) begin
+				res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+			end
+		end  
+		return res;
+	endfunction : check_output;
+
+	logic clk = 0;
+	always #5ns clk = !clk;
+
+	logic rst;
+	initial begin
+		rst = 1;
+		repeat(16) @(posedge clk);
+		rst <= 0;
+	end
+
+	logic last;
+	logic zero;
+	logic vld;
+	activation_t a;
+	weight_t w;
+	output_t p;
+	// Reference signals
+	activation_vector_t ACTIVATIONS; //   [SF-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	weight_matrix_t WEIGHTS; //           [NF-1:0][SF-1:0][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+	output_vector_t GOLDEN_OUTPUT; //     [NF-1:0][PE-1:0][57:0]
+	// Counter for number of outputs (NF dimension) that are produced
+	int NF_CNT = 0;
+
+	initial begin
+		ACTIVATIONS = init_ACTIVATIONS();
+		WEIGHTS = init_WEIGHTS();
+		GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+		last = 0;
+		zero = 0;
+		a = 'x;
+		w = 'x;
+
+		@(posedge clk iff !rst);
+
+		for (int j=0; j<NF; j++) begin
+			for (int i=0; i<SF; i++) begin
+				last <= (i==SF-1) ? 1 : 0;
+				a <= ACTIVATIONS[i];
+				w <= WEIGHTS[j][i];
+				@(posedge clk iff en);
+			end
+		end
+
+		last <= 0;
+		zero <= 1;  
+
+		// Continue until all NF outputs are produced & compared
+		@(posedge clk && (NF_CNT==NF));
+
+		$finish;
+	end
+
+	logic en = 0;
+	always_ff @(posedge clk) begin
+		en <= ($urandom()%7 > 1) && !rst;
+	end
+
+	// Compare computed output against golden output when vld flag is raised by DUT
+	always_ff @(posedge clk iff (vld && en)) begin
+		foreach(p[i]) begin
+			assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+			else begin 
+				$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+				$stop;
+			end  
+		end
+		NF_CNT += 1;
+	end
+
+	// Instantiate DUT
+	mvu_8sx9 #(
+		.PE(PE),
+		.SIMD(SIMD),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.SEGMENTLEN(SEGMENTLEN)
+	)
+	dut (
+		.clk, .rst, .en, .last, .zero, .a, .w, .vld, .p
+	);
+
 endmodule

From 5e61f42afd991233153ee8b7fe0fb6e9e8ac562d Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 12 Apr 2023 08:54:45 +0100
Subject: [PATCH 103/235] [rtl custom op]: fix to indentation

---
 finn-rtllib/mvu/mvu_8sx9_axi.sv | 54 ++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv
index 6c7eaeaeca..5f215927d8 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi.sv
+++ b/finn-rtllib/mvu/mvu_8sx9_axi.sv
@@ -32,25 +32,25 @@
  *****************************************************************************/
 
 module mvu_8sx9_axi #(
-    int unsigned MW,
-    int unsigned MH,
-    int unsigned PE,
-    int unsigned SIMD,
-    int unsigned ACTIVATION_WIDTH,
-    int unsigned WEIGHT_WIDTH,
-    int unsigned ACCU_WIDTH,
-    bit SIGNED_ACTIVATIONS = 0,
-    int unsigned SEGMENTLEN = 0,
+	int unsigned MW,
+	int unsigned MH,
+	int unsigned PE,
+	int unsigned SIMD,
+	int unsigned ACTIVATION_WIDTH,
+	int unsigned WEIGHT_WIDTH,
+	int unsigned ACCU_WIDTH,
+	bit SIGNED_ACTIVATIONS = 0,
+	int unsigned SEGMENTLEN = 0,
 	parameter RAM_STYLE = "auto",
 
-    localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-    localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
 	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
 	localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
-    localparam int unsigned SF = MW/SIMD,
+	localparam int unsigned SF = MW/SIMD,
 	localparam int unsigned NF = MH/PE,
-    localparam int unsigned OUTPUT_LANES = PE,
-    localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+	localparam int unsigned OUTPUT_LANES = PE,
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )
 (
 	// Global Control
@@ -76,31 +76,31 @@ module mvu_8sx9_axi #(
 //-------------------- Parameter sanity checks --------------------\\
 	initial begin
 		if (MW % SIMD != 0) begin
-		$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
-		$finish;
+			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
+			$finish;
 		end
 		if (MH % PE != 0) begin
-		$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
-		$finish;
+			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
+			$finish;
 		end
 		if (ACTIVATION_WIDTH > 9) begin
-		$error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
-		$finish;
+			$error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
+			$finish;
 		end
 		if (WEIGHT_WIDTH > 8) begin
-		$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
-		$finish;
+			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
+			$finish;
 		end
 		if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin
-		$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
-		$finish;
+			$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
+			$finish;
 		end
 		if (SEGMENTLEN == 0) begin
-		$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
+			$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
 		end
 		if (SEGMENTLEN > (SIMD+2)/3) begin
-		$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
-		$finish;
+			$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+			$finish;
 		end
 	end
 

From cbee193d746763044a870bdf1af248bbe8d31156 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 12 Apr 2023 14:33:13 +0100
Subject: [PATCH 104/235] [rtl custom-op]: minor changes for compiler
 integration

---
 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
index 2456eb3a47..502a72d3f2 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
@@ -41,7 +41,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	ACCU_WIDTH = $ACCU_WIDTH$,
 	parameter 	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
 	parameter 	SEGMENTLEN = $SEGMENTLEN$,
-	parameter 	RAM_STYLE = $IBUF_RAM_STYLE$,
+	parameter 	RAM_STYLE = "$IBUF_RAM_STYLE$",
 
 	// Safely deducible parameters
 	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
@@ -85,6 +85,6 @@ mvu_8sx9_axi #(
 	.m_axis_output_tdata(m_axis_output_tdata),
 	.m_axis_output_tvalid(m_axis_output_tvalid),
 	.m_axis_output_tready(m_axis_output_tready)
-)
+);
 
-endmodule : mvau_8sx9_axi_wrapper
\ No newline at end of file
+endmodule : $MODULE_NAME_AXI_WRAPPER$
\ No newline at end of file

From ba5e77bde008fff2a445d6ef469072dd67f67f42 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 12 Apr 2023 23:26:05 +0100
Subject: [PATCH 105/235] [rtl custom op]: moved testbenches to separate
 directory

---
 finn-rtllib/mvu/tb/mvu_8sx9_tb.sv | 165 +++++++++++++++++++++++
 finn-rtllib/mvu/tb/mvu_axi_tb.sv  | 213 ++++++++++++++++++++++++++++++
 2 files changed, 378 insertions(+)
 create mode 100644 finn-rtllib/mvu/tb/mvu_8sx9_tb.sv
 create mode 100644 finn-rtllib/mvu/tb/mvu_axi_tb.sv

diff --git a/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv
new file mode 100644
index 0000000000..c8bfe5370a
--- /dev/null
+++ b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv
@@ -0,0 +1,165 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for MVU core compute kernel.
+ *****************************************************************************/
+
+module mvu_8sx9_tb();
+
+//-------------------- Simulation parameters --------------------\\
+	// Matrix & parallelism config
+	localparam int unsigned MH = 256;
+	localparam int unsigned PE = 16;
+	localparam int unsigned MW = 600;
+	localparam int unsigned SIMD = 60;
+	localparam int unsigned SEGMENTLEN = 4;
+	// Bit-width config  
+	localparam int unsigned ACTIVATION_WIDTH = 8;
+	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam bit SIGNED_ACTIVATIONS = 1;
+	// Simulation constants
+	localparam int unsigned NF = MH/PE;
+	localparam int unsigned SF = MW/SIMD;
+	localparam int unsigned NUM_OF_DSP = SIMD/3;
+
+	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+	typedef activation_t activation_vector_t[SF];
+
+	function activation_vector_t init_ACTIVATIONS;
+		automatic activation_vector_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_ACTIVATIONS
+
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+	typedef weight_t weight_matrix_t[NF][SF];
+
+	function weight_matrix_t init_WEIGHTS;
+		automatic weight_matrix_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_WEIGHTS;
+
+	typedef logic signed [PE-1:0][57:0] output_t;
+	typedef output_t output_vector_t [NF];
+
+	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); // golden reference: signed multiply-accumulate of w and a
+		automatic output_vector_t res = '{default: 0};
+		for (int j = 0; j<MH; j++) begin // j: output row, accumulated into res[j/PE][j%PE]
+			for (int i = 0; i<MW; i++) begin // i: input column, read from a[i/SIMD][i%SIMD]
+				res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+			end
+		end  
+		return res;
+	endfunction : check_output;
+
+	logic clk = 0;
+	always #5ns clk = !clk;
+
+	logic rst;
+	initial begin
+		rst = 1;
+		repeat(16) @(posedge clk);
+		rst <= 0;
+	end
+
+	logic last;
+	logic zero;
+	logic vld;
+	activation_t a;
+	weight_t w;
+	output_t p;
+	// Reference signals
+	activation_vector_t ACTIVATIONS; //   [SF-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	weight_matrix_t WEIGHTS; //           [NF-1:0][SF-1:0][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+	output_vector_t GOLDEN_OUTPUT; //     [NF-1:0][PE-1:0][57:0]
+	// Counter for number of outputs (NF dimension) that are produced
+	int NF_CNT = 0;
+
+	initial begin // Stimulus: compute the golden result, then feed NF*SF input words to the DUT
+		ACTIVATIONS = init_ACTIVATIONS();
+		WEIGHTS = init_WEIGHTS();
+		GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+		last = 0;
+		zero = 0;
+		a = 'x;
+		w = 'x;
+
+		@(posedge clk iff !rst); // wait for the first clock edge with rst low
+
+		for (int j=0; j<NF; j++) begin
+			for (int i=0; i<SF; i++) begin
+				last <= (i==SF-1) ? 1 : 0; // flag the final word of each SF-long sequence
+				a <= ACTIVATIONS[i]; // the same activation words are replayed for every j
+				w <= WEIGHTS[j][i];
+				@(posedge clk iff en); // advance only on clock edges where en is high
+			end
+		end
+
+		last <= 0;
+		zero <= 1;  
+
+		// Continue until all NF outputs are produced & compared
+		@(posedge clk && (NF_CNT==NF)); // NOTE(review): edge of the whole expression (clk && NF_CNT==NF), not a clock edge qualified by it; `iff` may have been intended - confirm
+
+		$finish;
+	end
+
+	logic en = 0;
+	always_ff @(posedge clk) begin
+		en <= ($urandom()%7 > 1) && !rst; // random enable, forced low while rst is high
+	end
+
+	// Compare computed output against golden output when vld flag is raised by DUT
+	always_ff @(posedge clk iff (vld && en)) begin
+		foreach(p[i]) begin
+			assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+			else begin 
+				$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+				$stop; // halt at the first mismatching lane
+			end  
+		end
+		NF_CNT += 1; // one output word (all lanes of p) has been checked
+	end
+
+	// Instantiate DUT
+	mvu_8sx9 #(
+		.PE(PE),
+		.SIMD(SIMD),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.SEGMENTLEN(SEGMENTLEN)
+	)
+	dut (
+		.clk, .rst, .en, .last, .zero, .a, .w, .vld, .p
+	);
+
+endmodule : mvu_8sx9_tb
diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
new file mode 100644
index 0000000000..08a349da84
--- /dev/null
+++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
@@ -0,0 +1,213 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for MVU AXI-Stream interface wrapper.
+ *****************************************************************************/
+
+module mvu_axi_tb();
+
+//-------------------- Simulation parameters --------------------\\
+	// Matrix & parallelism config
+	localparam int unsigned MW = 90;
+	localparam int unsigned MH = 16;
+	localparam int unsigned SIMD = 9;
+	localparam int unsigned PE = 4;
+	localparam int unsigned SEGMENTLEN = 1;
+	localparam string MVU_IMPL_STYLE = "mvu_8sx9";
+	// Bit-width config  
+	localparam int unsigned ACTIVATION_WIDTH = 8;
+	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
+	localparam bit SIGNED_ACTIVATIONS = 1;
+	// Simulation constants  
+	localparam int unsigned NF = MH/PE;
+	localparam int unsigned SF = MW/SIMD;
+	localparam int unsigned NUM_OF_DSP = SIMD/3;
+	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
+	localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8;
+	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
+	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
+
+	// Generate clk and reset signal   
+	logic clk = 0;
+	always #5ns clk = !clk;
+
+	logic ap_rst_n = 0;
+	initial begin
+		repeat(16) @(posedge clk);
+		ap_rst_n <= 1;
+	end
+
+	uwire ap_clk = clk;
+
+	// Generate activations  
+	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+	typedef activation_t activation_vector_t[SF];
+
+	function activation_vector_t init_ACTIVATIONS;
+		automatic activation_vector_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_ACTIVATIONS
+
+	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
+
+	struct {
+		activation_t dat;
+		logic vld;
+		logic rdy;
+	} activations;
+
+	initial begin // Drive the activation stream: SF words, each offered with randomly deasserted valid
+		activations.vld = 0;
+		activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
+		@(posedge clk iff ap_rst_n); // keep the idle values until a clock edge with reset released
+
+		for (int i=0; i<SF; i++) begin
+			activations.dat <= ACTIVATIONS[i];
+			do begin 
+				activations.vld <= $urandom()%7 > 1; // nonblocking like dat and outputs.rdy: a blocking write here races with logic sampling vld on the same clock edge
+				@(posedge clk);
+			end while (!(activations.vld === 1 && activations.rdy === 1)); // repeat until a clock edge with both vld and rdy high
+		end
+
+		activations.vld <= 0;
+		activations.dat <= 'x;
+	end
+
+	// Generate weights   
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+	typedef weight_t weight_matrix_t[NF][SF]; 
+
+	function weight_matrix_t init_WEIGHTS;
+		automatic weight_matrix_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_WEIGHTS;
+
+	weight_matrix_t WEIGHTS = init_WEIGHTS();
+
+	struct {
+		weight_t dat;
+		logic vld;
+		logic rdy;
+	} weights;
+
+	initial begin // Drive the weight stream: NF*SF words with valid held high throughout
+		weights.vld = 0;
+		weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
+		@(posedge clk iff ap_rst_n); // keep the idle values until a clock edge with reset released
+
+		weights.vld <= 1;
+		for (int i=0; i<NF; i++) begin
+			for (int j=0; j<SF; j++) begin
+				weights.dat <= WEIGHTS[i][j];
+				@(posedge clk iff weights.rdy); // move to the next word only after a clock edge at which rdy is high
+			end
+		end
+
+		weights.vld <= 0;
+		weights.dat <= 'x;
+	end
+
+	// Function to compute golden output  
+	// a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
+	typedef output_t output_vector_t [NF];
+
+	struct {
+		output_t dat;
+		logic vld;
+		logic rdy;
+	} outputs;
+
+	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); // golden reference: multiply-accumulate of w and a
+		automatic output_vector_t res = '{default: 0};
+		for (int j = 0; j<MH; j++) begin // j: output row, accumulated into res[j/PE][j%PE]
+			for (int i = 0; i<MW; i++) begin // i: input column, read from a[i/SIMD][i%SIMD]
+				if (SIGNED_ACTIVATIONS==1) 
+					res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+				else
+					res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]); // prepended 0 bit keeps the activation non-negative under $signed()
+			end
+		end  
+		return res;
+	endfunction : check_output;
+
+	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+
+	int unsigned NF_CNT = 0; // number of output words checked so far
+	initial begin // Accept NF output words with randomly deasserted ready and compare each against GOLDEN_OUTPUT
+		outputs.rdy = 0;
+		while (NF_CNT < NF) begin
+			// Loop until both rdy & vld are asserted
+			do begin
+				outputs.rdy <= $urandom()%7 >= 1; // ready is low whenever $urandom()%7 yields 0
+				@(posedge clk iff ap_rst_n);
+			end while (!(outputs.rdy === 1 && outputs.vld === 1));
+
+			// Compare produced outputs against golden outputs
+			foreach(outputs.dat[i]) begin
+				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+				else begin 
+					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+					$stop; // halt at the first mismatching lane
+				end  
+			end
+			
+			NF_CNT += 1;
+		end
+
+		$finish; // reached only after NF words passed the comparison loop
+	end
+
+	// Instantiate DUT
+	mvu_axi #(
+		.MW(MW),
+		.MH(MH),
+		.PE(PE),
+		.SIMD(SIMD),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.SEGMENTLEN(SEGMENTLEN),
+		.MVU_IMPL_STYLE(MVU_IMPL_STYLE)
+	)
+	dut (
+		.ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
+		.s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
+		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
+		.m_axis_output_tready(outputs.rdy)
+	);
+  
+endmodule : mvu_axi_tb

From 69310b4e6d2ee4bf2e60b236582656fd7f364a6d Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 12 Apr 2023 23:27:50 +0100
Subject: [PATCH 106/235] [rtl custom op]: fixed output width to ACCU_WIDTH

---
 finn-rtllib/mvu/mvu_8sx9.sv | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index d082d4fb2e..5af27ab0ce 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -36,19 +36,25 @@ module mvu_8sx9 #(
     int unsigned SIMD,
     int unsigned ACTIVATION_WIDTH,
     int unsigned WEIGHT_WIDTH,
+	int unsigned ACCU_WIDTH,
     bit SIGNED_ACTIVATIONS = 0,
     int unsigned SEGMENTLEN = 0 // Default to 0 (which implies a single segment)
   )
   (
-    input   logic clk,
+    // Global Control
+	input   logic clk,
     input   logic rst,
     input   logic en,
+
+	// Input
     input   logic last,
-    input   logic zero,
-    input   logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a,
-    input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w,
-    output  logic vld,
-    output  logic [PE-1:0][57:0] p 
+    input   logic zero, // ignore current inputs and force this partial product to zero
+    input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights
+	input   logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // activations
+    
+	// Output
+	output  logic vld,
+    output  logic [PE-1:0][ACCU_WIDTH-1:0] p
   );
 
 //-------------------- Declare global signals --------------------\\
@@ -146,7 +152,7 @@ module mvu_8sx9 #(
 			uwire [57:0] pp;
 
 			if (LAST) begin : genPOUT
-				assign p[j] = pp;
+				assign p[j] = pp[ACCU_WIDTH-1:0];
 			end      
 
 			DSP58 #(
@@ -281,4 +287,4 @@ module mvu_8sx9 #(
 		end : genDSPChain  
 	end : genDSPPE
     
-endmodule
+endmodule : mvu_8sx9

From cfcff0040c85a76d7c5a16b2bf1b6b966b62e87d Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 12 Apr 2023 23:29:06 +0100
Subject: [PATCH 107/235] [rtl custom op]: renamed file and added generic to
 switch between compute kernels

---
 finn-rtllib/mvu/mvu_axi.sv | 194 +++++++++++++++++++++++++++++++++++++
 1 file changed, 194 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_axi.sv

diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
new file mode 100644
index 0000000000..5d8700738f
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_axi.sv
@@ -0,0 +1,194 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Matrix Vector Unit (MVU) AXI-Stream interface wrapper.
+ *****************************************************************************/
+
+module mvu_axi #(
+	int unsigned MW,	// matrix width; must be a multiple of SIMD
+	int unsigned MH,	// matrix height; must be a multiple of PE
+	int unsigned PE,
+	int unsigned SIMD,
+	int unsigned ACTIVATION_WIDTH,
+	int unsigned WEIGHT_WIDTH,
+	int unsigned ACCU_WIDTH,	// bits per output lane
+	bit SIGNED_ACTIVATIONS = 0,
+	int unsigned SEGMENTLEN = 0,	// 0 defaults to the chain length
+	parameter RAM_STYLE = "auto",	// forwarded to the activation replay buffer
+	parameter MVU_IMPL_STYLE,	// compute core selection: "mvu_8sx9" or "mvu_4sx4u"
+	// Stream widths; the *_BA variants are rounded up to a whole number of bytes
+	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
+	localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
+	localparam int unsigned SF = MW/SIMD,	// passed to replay_buffer as LEN
+	localparam int unsigned NF = MH/PE,	// passed to replay_buffer as REP
+	localparam int unsigned OUTPUT_LANES = PE,
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+)
+(
+	// Global Control
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,
+
+	// Weight Stream
+	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	logic  s_axis_weights_tvalid,
+	output	logic  s_axis_weights_tready,
+
+	// Input Stream
+	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	logic  s_axis_input_tvalid,
+	output	logic  s_axis_input_tready,
+
+	// Output Stream
+	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
+	output	logic  m_axis_output_tvalid,
+	input	logic  m_axis_output_tready
+);
+
+//-------------------- Parameter sanity checks --------------------\\
+	initial begin
+		if (MW % SIMD != 0) begin
+			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
+			$finish;
+		end
+		if (MH % PE != 0) begin
+			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
+			$finish;
+		end
+		if (ACTIVATION_WIDTH > 9) begin
+			$error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
+			$finish;
+		end
+		if (WEIGHT_WIDTH > 8) begin
+			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
+			$finish;
+		end
+		if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin
+			$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
+			$finish;
+		end
+		if (SEGMENTLEN == 0) begin
+			$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
+		end
+		if (SEGMENTLEN > (SIMD+2)/3) begin
+			$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+			$finish;
+		end
+	end
+
+	uwire clk = ap_clk;
+	uwire rst = !ap_rst_n;	// internal reset is active-high
+
+	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;
+
+	uwire mvauin_t amvau;
+	uwire alast;
+	uwire afin;
+	uwire avld;
+	uwire ardy;
+
+	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay (
+		.clk, .rst,
+		.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
+		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
+	);
+
+//-------------------- Input control --------------------\\
+	uwire en;
+	uwire istb = avld && s_axis_weights_tvalid;	// both operands are available
+	assign ardy = en && s_axis_weights_tvalid;	// activations and weights are
+	assign s_axis_weights_tready = en && avld;	// only ever consumed together
+
+//-------------------- Core MVU --------------------\\
+	uwire ovld;
+	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
+	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
+
+	if (MVU_IMPL_STYLE == "mvu_8sx9") begin : genMVU8sx9
+		mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core (
+			.clk, .rst, .en,
+			.last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.vld(ovld), .p(odat)
+		);
+	end
+	else if (MVU_IMPL_STYLE == "mvu_4sx4u") begin : genMVU4sx4u
+		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(0)) core (
+			.clk, .rst, .en,
+			.last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.vld(ovld), .p(odat)
+		);
+	end
+	else initial begin	// no core was instantiated: fail loudly instead of silently
+		$error("Unrecognized MVU_IMPL_STYLE!");
+		$finish;
+	end
+
+//-------------------- Output register slice --------------------\\
+	struct {
+		logic vld;
+		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+	} A = '{ vld: 0, default: 'x};
+	// Stall the core only when it offers a result while stage A is still occupied
+	assign en = !A.vld || !ovld;
+
+	uwire  b_load;
+	always_ff @(posedge clk) begin
+		if(rst)		A <= '{ vld: 0, default: 'x };
+		else if(!A.vld || b_load) begin
+			A.vld <= ovld && en;
+			for(int unsigned  i = 0; i < PE; i++) begin
+				// CR-1148862:
+				// A.dat[i] <= odat[i];
+				automatic logic [ACCU_WIDTH-1:0]  v = odat[i];
+				A.dat[i] <= v[ACCU_WIDTH-1:0];
+			end
+		end
+	end
+	// Stage B: registered output that drives m_axis_output_*
+	struct {
+		logic vld;
+		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+	} B = '{ vld: 0, default: 'x};
+
+	assign	b_load = !B.vld || m_axis_output_tready;
+	always_ff @(posedge clk) begin
+		if(rst)		B <= '{ vld: 0, default: 'x };	// tvalid must leave reset deasserted, not X
+		else begin
+			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
+		end
+	end
+
+	assign	m_axis_output_tvalid = B.vld;
+	assign	m_axis_output_tdata  = B.dat;
+
+endmodule : mvu_axi
\ No newline at end of file

From 72b519691369b9ebc31983a6723485860837e37b Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 12 Apr 2023 23:29:45 +0100
Subject: [PATCH 108/235] [rtl custom op]: renamed file and added generic to
 switch between compute kernels

---
 finn-rtllib/mvu/mvu_axi_wrapper.v | 90 +++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_axi_wrapper.v

diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v
new file mode 100644
index 0000000000..323d2711e4
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_axi_wrapper.v
@@ -0,0 +1,90 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Verilog AXI-Stream wrapper for MVU.
+ *****************************************************************************/
+
+module $MODULE_NAME_AXI_WRAPPER$ #(
+	parameter 	MW = $MW$,
+	parameter	MH = $MH$,
+	parameter 	PE = $PE$,
+	parameter 	SIMD = $SIMD$,
+	parameter 	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
+	parameter 	WEIGHT_WIDTH = $WEIGHT_WIDTH$,
+	parameter 	ACCU_WIDTH = $ACCU_WIDTH$,
+	parameter 	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
+	parameter 	SEGMENTLEN = $SEGMENTLEN$,
+	parameter 	RAM_STYLE = "$IBUF_RAM_STYLE$",
+	parameter 	MVU_IMPL_STYLE = "mvu_8sx9",	// compute core of mvu_axi: "mvu_8sx9" or "mvu_4sx4u"
+	// Safely deducible parameters
+	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	parameter 	INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+	parameter 	OUTPUT_LANES = PE,
+	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+)(
+  	// Global Control
+	input	ap_clk,
+	input	ap_rst_n,
+
+	// Weight Stream
+	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	s_axis_weights_tvalid,
+	output	s_axis_weights_tready,
+
+	// Input Stream
+	input	[INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	s_axis_input_tvalid,
+	output	s_axis_input_tready,
+
+	// Output Stream
+	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
+	output	m_axis_output_tvalid,
+	input	m_axis_output_tready
+);
+// Plain-Verilog shell: forwards every parameter and port unchanged to mvu_axi
+mvu_axi #(
+	.MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+	.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+	.SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE), .MVU_IMPL_STYLE(MVU_IMPL_STYLE)
+	) inst (
+	.ap_clk(ap_clk),
+	.ap_rst_n(ap_rst_n),
+	.s_axis_weights_tdata(s_axis_weights_tdata),
+	.s_axis_weights_tvalid(s_axis_weights_tvalid),
+	.s_axis_weights_tready(s_axis_weights_tready),
+	.s_axis_input_tdata(s_axis_input_tdata),
+	.s_axis_input_tvalid(s_axis_input_tvalid),
+	.s_axis_input_tready(s_axis_input_tready),
+	.m_axis_output_tdata(m_axis_output_tdata),
+	.m_axis_output_tvalid(m_axis_output_tvalid),
+	.m_axis_output_tready(m_axis_output_tready)
+);
+
+endmodule // $MODULE_NAME_AXI_WRAPPER$
\ No newline at end of file

From 7be5ce412e5747f17fe0062769cd2cc476b5bfa4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Mon, 17 Apr 2023 07:53:44 +0100
Subject: [PATCH 109/235] Defaulting BIAS and SIGNED parameters. Renaming M to
 K avoiding naming collision with uniform option.

---
 finn-rtllib/thresholding/hdl/thresholding.sv  | 28 +++++++++----------
 .../thresholding/hdl/thresholding_axi.sv      | 27 +++++++++---------
 .../hdl/thresholding_axi_wrapper.v            | 18 ++++++------
 3 files changed, 36 insertions(+), 37 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv
index 0ce95ed3f9..d16a9219d7 100644
--- a/finn-rtllib/thresholding/hdl/thresholding.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding.sv
@@ -45,11 +45,11 @@
  *****************************************************************************/
 module thresholding #(
 	int unsigned  N,  // output precision
-	int unsigned  M,  // input/threshold precision
+	int unsigned  K,  // input/threshold precision
 	int unsigned  C,  // number of channels
 
-	bit SIGNED,	// signed inputs
-	int BIAS,  // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS)
+	bit  SIGNED = 1,  // signed inputs
+	int  BIAS   = 0,  // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS]
 
 	localparam int unsigned  C_BITS = C < 2? 1 : $clog2(C),
 	localparam int unsigned  O_BITS = BIAS >= 0?
@@ -63,15 +63,15 @@ module thresholding #(
 	// Threshold Configuration
 	input	logic  twe,
 	input	logic [$clog2(C)+N-1:0]  twa,
-	input	logic [          M-1:0]  twd,
+	input	logic [          K-1:0]  twd,
 
 	// Clock Enable for Stream Processing
 	input	logic  en,
 
 	// Input Stream
 	input	logic  ivld,
-	input	logic        [C_BITS-1:0]  icnl,	// Ignored for C == 1
-	input	logic [M     -1:0]  idat,
+	input	logic [C_BITS-1:0]  icnl,	// Ignored for C == 1
+	input	logic [K     -1:0]  idat,
 
 	// Output Stream
 	output	logic  ovld,
@@ -81,10 +81,10 @@ module thresholding #(
 
 	// Pipeline Links & Feed
 	typedef struct packed {
-		logic                      vld;	// Valid data identification
-		logic        [C_BITS-1:0]  cnl;	// Channel
-		logic [M     -1:0]  val;	// Original input value
-		logic        [0:N-1]       res;	// Assembling result with valid prefix [0:stage] after stage #stage
+		logic               vld;	// Valid data identification
+		logic [C_BITS-1:0]  cnl;	// Channel
+		logic [K     -1:0]  val;	// Original input value
+		logic [0:N-1]       res;	// Assembling result with valid prefix [0:stage] after stage #stage
 	} pipe_t;
 	uwire pipe_t  pipe[0:N];
 	assign	pipe[0] = pipe_t'{ vld: ivld, cnl: icnl, val: idat, res: {N{1'bx}} };	// Feed original input
@@ -94,20 +94,20 @@ module thresholding #(
 	for(genvar  stage = 0; stage < N; stage++) begin : genStages
 
 		// Threshold Memory
-		uwire [M-1:0]  thresh;
+		uwire [K-1:0]  thresh;
 		if(1) begin : blkUpdate
 
 			// Write control: local select from global address
 			uwire  we = twe && tws[stage];
 			if((C == 1) && (stage == 0)) begin
-				logic [M-1:0]  Thresh = 'x;
+				logic [K-1:0]  Thresh = 'x;
 				always_ff @(posedge clk) begin
 					if(we)  Thresh <= twd;
 				end
 				assign  thresh = Thresh;
 			end
 			else begin
-				logic [M-1:0]  Threshs[C * 2**stage];
+				logic [K-1:0]  Threshs[C * 2**stage];
 				uwire [$clog2(C)+stage-1:0]  wa = twa[$left(twa):N-stage];
 				uwire [$clog2(C)+stage-1:0]  ra;
 				if(C > 1)  assign  ra[stage+:C_BITS] = pipe[stage].cnl;
@@ -119,7 +119,7 @@ module thresholding #(
 				end
 
 				// Read
-				logic [M-1:0]  RdReg;
+				logic [K-1:0]  RdReg;
 				always_ff @(posedge clk) begin
 					if(en)  RdReg <= Threshs[ra];
 				end
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
index d2a7420a99..2f0393a3e7 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
@@ -40,16 +40,15 @@
 
 module thresholding_axi #(
 	int unsigned  N,	// output precision
-	int unsigned  M,	// input/threshold precision
+	int unsigned  K,	// input/threshold precision
 	int unsigned  C,	// Channels
-	int unsigned  PE,	// Processing Parallelism, requires C = M*PE
+	int unsigned  PE,	// Processing Parallelism, requires C = k*PE
 
-	bit SIGNED,	// signed inputs
-	int BIAS,  // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS)
+	bit  SIGNED = 1,	// signed inputs
+	int  BIAS   = 0,	// offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS]
 
-    localparam int unsigned  CF = 1 + (C-1)/PE,	// Channel Fold
+	localparam int unsigned  CF = 1 + (C-1)/PE,	// Channel Fold
 	localparam int unsigned  ADDR_BITS = $clog2(CF) + $clog2(PE) + N + 2,
-	localparam int unsigned  C_BITS = C/PE < 2? 1 : $clog2(C/PE),
 	localparam int unsigned  O_BITS = BIAS >= 0?
 		/* unsigned */ $clog2(2**N+BIAS) :
 		/* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS)
@@ -86,7 +85,7 @@ module thresholding_axi #(
 	//- AXI Stream - Input --------------
 	output	logic  s_axis_tready,
 	input	logic  s_axis_tvalid,
-	input	logic [((PE*M+7)/8)*8-1:0]  s_axis_tdata,
+	input	logic [((PE*K+7)/8)*8-1:0]  s_axis_tdata,
 
 	//- AXI Stream - Output -------------
 	input	logic  m_axis_tready,
@@ -108,13 +107,13 @@ module thresholding_axi #(
 	//- AXI Lite: Threshold Configuration -----------------------------------
 	uwire  twe[PE];
 	uwire [$clog2(CF)+N-1:0]  twa;
-	uwire [           M-1:0]  twd;
+	uwire [           K-1:0]  twd;
 	if(1) begin : blkAxiLite
 		logic  WABusy = 0;
 		logic  WDBusy = 0;
 		logic  Sel[PE] = '{ default: 'x };
 		logic [$clog2(CF)+N-1:0]  Addr = 'x;
-		logic [           M-1:0]  Data = 'x;
+		logic [           K-1:0]  Data = 'x;
 
 		for(genvar  pe = 0; pe < PE; pe++) begin
 			assign	twe[pe] = WABusy && WDBusy && Sel[pe];
@@ -147,7 +146,7 @@ module thresholding_axi #(
 				end
 				if(!WDBusy) begin
 					WDBusy <= s_axilite_WVALID;
-					Data   <= s_axilite_WDATA[M-1:0];
+					Data   <= s_axilite_WDATA[K-1:0];
 				end
 			end
 		end
@@ -204,12 +203,12 @@ module thresholding_axi #(
 
 	end : blkOutputDecouple
 
-	// localparam int unsigned  C_BITS = C/PE < 2? 1 : $clog2(C/PE);
+	localparam int unsigned  C_BITS = C/PE < 2? 1 : $clog2(C/PE);
 	uwire  ivld = s_axis_tvalid;
 	uwire [C_BITS-1:0]  icnl;
-	uwire [M     -1:0]  idat[PE];
+	uwire [K     -1:0]  idat[PE];
 	for(genvar  pe = 0; pe < PE; pe++) begin
-		assign	idat[pe] = s_axis_tdata[pe*M+:M];
+		assign	idat[pe] = s_axis_tdata[pe*K+:K];
 	end
 
 	assign	s_axis_tready = en;
@@ -234,7 +233,7 @@ module thresholding_axi #(
 
 	// Core Thresholding Modules
 	for(genvar  pe = 0; pe < PE; pe++) begin : genCores
-		thresholding #(.N(N), .M(M), .C(C/PE), .SIGNED(SIGNED), .BIAS(BIAS)) core (
+		thresholding #(.N(N), .K(K), .C(C/PE), .SIGNED(SIGNED), .BIAS(BIAS)) core (
 			.clk, .rst,
 			.twe(twe[pe]), .twa, .twd,
 			.en,
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
index c27480f388..2657b39d98 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
@@ -34,20 +34,20 @@
 
 module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter  N = $N$,	// output precision
-	parameter  M = $M$,	// input/threshold precision
+	parameter  K = $M$,	// input/threshold precision
 	parameter  C = $C$,	// Channels
-	parameter  SIGNED = $SIGNED$,	// signed inputs
-	int BIAS = $BIAS$,  // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS)
 	parameter  PE = $PE$,
+	parameter  SIGNED = $SIGNED$,	// signed inputs
+	parameter  BIAS = $BIAS$,		// offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS]
 
-	parameter  O_BITS = BIAS > 0?
-		/* unsigned */ $clog2(2**N-BIAS) :
-		/* signed */ 1+$clog2(BIAS >= 2**(N-1)? BIAS : 2**N-BIAS)
+	parameter  O_BITS = BIAS >= 0?	// same expression as O_BITS in thresholding_axi, so port widths agree for BIAS == 0
+		/* unsigned */ $clog2(2**N+BIAS) :
+		/* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS)
 )(
 	//- Global Control ------------------
-		(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *)
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *)
 	input	ap_clk,
-		(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *)
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *)
 	input	ap_rst_n,
 
 	//- AXI Lite ------------------------
@@ -78,7 +78,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	//- AXI Stream - Input --------------
 	output	in0_V_TREADY,
 	input	in0_V_TVALID,
-	input	[((PE*M+7)/8)*8-1:0]  in0_V_TDATA,
+	input	[((PE*K+7)/8)*8-1:0]  in0_V_TDATA,
 
 	//- AXI Stream - Output -------------
 	input	out_V_TREADY,
@@ -86,7 +86,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	output	[((PE*O_BITS+7)/8)*8-1:0]  out_V_TDATA
 );
 
-	thresholding_axi #(.N(N), .M(M), .C(C), .PE(PE), .SIGNED(SIGNED), .BIAS(BIAS)) inst (
+	thresholding_axi #(.N(N), .K(K), .C(C), .PE(PE), .SIGNED(SIGNED), .BIAS(BIAS)) inst (
 		//- Global Control ------------------
 		.ap_clk(ap_clk),
 		.ap_rst_n(ap_rst_n),

From c068bb65c6a4b877876c5b1278e7b2663b81d8e1 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:15:16 +0100
Subject: [PATCH 110/235] [rtl mvu]: added behavioral model DSP58

---
 finn-rtllib/mvu/mvu_8sx9.sv | 343 ++++++++++++++++++++++--------------
 1 file changed, 212 insertions(+), 131 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index 5af27ab0ce..2d1da26efb 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -38,7 +38,8 @@ module mvu_8sx9 #(
     int unsigned WEIGHT_WIDTH,
 	int unsigned ACCU_WIDTH,
     bit SIGNED_ACTIVATIONS = 0,
-    int unsigned SEGMENTLEN = 0 // Default to 0 (which implies a single segment)
+    int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
+	bit FORCE_BEHAVIORAL = 0
   )
   (
     // Global Control
@@ -70,7 +71,10 @@ module mvu_8sx9 #(
 
 	always_ff @(posedge clk) begin
 		if(rst)     L <= '{default: 0};
-		else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last };
+		else if(en) begin
+			L[1+MAX_PIPELINE_STAGES] <= last;
+			L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES];
+		end
 	end  
 	assign vld = L[0];
 
@@ -155,135 +159,212 @@ module mvu_8sx9 #(
 				assign p[j] = pp[ACCU_WIDTH-1:0];
 			end      
 
-			DSP58 #(
-				// Feature Control Attributes: Data Path Selection
-				.AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
-				.A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
-				.BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
-				.B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
-				.DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
-													// legacy mode.
-				.PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
-				.RND(58'h000000000000000),          // Rounding Constant
-				.USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
-				.USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
-				.USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
-				.XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
-				// Pattern Detector Attributes: Pattern Detection Configuration
-				.AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
-				.AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
-				.MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
-				.PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
-				.SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
-				.SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
-				.USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
-				// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
-				.IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
-				.IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
-				.IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
-				.IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
-				.IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
-				.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 
-									FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN 
-									2'b01, // Y : M
-									2'b01  // X: M
-				}), // Optional inversion for OPMODE
-				.IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
-				.IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
-				.IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
-				.IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
-				.IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for STCONJUGATE_A
-				.IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
-				.IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
-				.IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
-				.IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
-				.IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
-				// Register Control Attributes: Pipeline Register Configuration
-				.ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
-				.ADREG(0),                          // Pipeline stages for pre-adder (0-1)
-				.ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
-				.AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
-				.BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
-				.BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
-				.CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
-				.CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
-				.CREG(0),                           // Pipeline stages for C (0-1)
-				.DREG(0),                           // Pipeline stages for D (0-1)
-				.INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
-				.MREG(1),                           // Multiplier pipeline stages (0-1)
-				.OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
-				.PREG(PREG),                        // Number of pipeline stages for P (0-1)
-				.RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
-			)
-			DSP58_inst (
-				// Cascade outputs: Cascade Ports
-				.ACOUT(),                           // 34-bit output: A port cascade
-				.BCOUT(),                           // 24-bit output: B cascade
-				.CARRYCASCOUT(),                    // 1-bit output: Cascade carry
-				.MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
-				.PCOUT(pcout[j][i]),                // 58-bit output: Cascade output
-				// Control outputs: Control Inputs/Status Bits
-				.OVERFLOW(),                        // 1-bit output: Overflow in add/acc
-				.PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
-				.PATTERNDETECT(),                   // 1-bit output: Pattern detect
-				.UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
-				// Data outputs: Data Ports
-				.CARRYOUT(),                        // 4-bit output: Carry
-				.P(pp),                             // 58-bit output: Primary data
-				.XOROUT(),                          // 8-bit output: XOR data
-				// Cascade inputs: Cascade Ports
-				.ACIN('x),                          // 34-bit input: A cascade data
-				.BCIN('x),                          // 24-bit input: B cascade
-				.CARRYCASCIN('x),                   // 1-bit input: Cascade carry
-				.MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
-				.PCIN(FIRST ? 'x : pcout[j][i-1]),  // 58-bit input: P cascade
-				// Control inputs: Control Inputs/Status Bits
-				.ALUMODE(4'h0),                     // 4-bit input: ALU control
-				.CARRYINSEL('0),                    // 3-bit input: Carry select
-				.CLK(clk),                          // 1-bit input: Clock
-				.INMODE({
-						INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
-						2'b00,
-						TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
-						INTERNAL_PREGS==2 ? 1'b0 : 1'b1        
-				}),                                 // 5-bit input: INMODE control
-				.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
-				.OPMODE({
-						LAST ? {1'b0, L[1]} : 2'b00,
-						7'b000_0000
-				}), // 9-bit input: Operation mode
-				// Data inputs: Data Ports
-				.A({ 7'bx, a_in_i[i] }),            // 34-bit input: A data
-				.B(b_in_i[j][i]),                   // 24-bit input: B data
-				.C('x),                             // 58-bit input: C data
-				.CARRYIN('0),                       // 1-bit input: Carry-in
-				.D('x),                             // 27-bit input: D data
-				// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
-				.ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
-				.CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
-				.CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
-				.CEAD('0),                          // 1-bit input: Clock enable for ADREG
-				.CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
-				.CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
-				.CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
-				.CEC('0),                           // 1-bit input: Clock enable for CREG
-				.CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
-				.CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
-				.CED('0),                           // 1-bit input: Clock enable for DREG
-				.CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
-				.CEM(en),                           // 1-bit input: Clock enable for MREG
-				.CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
-				.RSTA(rst),                         // 1-bit input: Reset for AREG
-				.RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
-				.RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
-				.RSTB(rst),                         // 1-bit input: Reset for BREG
-				.RSTC('0),                          // 1-bit input: Reset for CREG
-				.RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
-				.RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
-				.RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
-				.RSTM(rst),                         // 1-bit input: Reset for MREG
-				.RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
-			);
+			// Note: Since the product B * AD is computed,
+			//       rst can be only applied to AD and zero only to B
+			//       with the same effect as zeroing both.
+			if (FORCE_BEHAVIORAL) begin : genBehav
+				// Stage #1: Input A/B
+				logic signed [33:0] Areg [INTERNAL_PREGS];
+				always_ff @(posedge clk) begin
+					if (rst)	Areg <= '{ default : 0};
+					else if (en) begin
+						Areg[0] <= { 7'bx, a_in_i[i] };
+						if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0];
+					end
+				end
+				logic signed [23:0] Breg [INTERNAL_PREGS];
+				always_ff @(posedge clk) begin
+					if (rst)	Breg <= '{ default : 0};
+					else if (en) begin
+						Breg[0] <= b_in_i[j][i];
+						if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0];
+					end
+				end
+
+				// Stage #2: Multiply-Accumulate
+				logic signed [57:0] Mreg;
+				logic InmodeZero = 0;
+				always_ff @(posedge clk) begin
+					if (rst)		InmodeZero <= 0;
+					else if (en)	InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero );
+				end
+				always_ff @(posedge clk) begin
+					if (rst)	Mreg <= 0;
+					else if (en) begin
+						automatic logic signed [57:0] m = 0;
+						for (int k = 0; k < 3; k++) begin
+							m = m + (InmodeZero ? 0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8]));
+						end
+						Mreg <= m;
+					end
+				end
+
+				// Stage #3: Accumulate
+				logic signed [57:0] Preg;
+				logic Opmode = 0;
+				if (FIRST && !LAST) begin : genFirst
+					if (PREG) begin : genPregBehav
+						always_ff @(posedge clk) begin
+							if (rst)		Preg <= 0;
+							else if (en)	Preg <= Mreg;
+						end
+					end
+					else	assign Preg = Mreg;
+				end
+				else if (LAST) begin : genLast	// chain tail: accumulating P register
+					always_ff @(posedge clk) begin
+						if (rst)		Opmode <= 0;
+						else if (en)	Opmode <= L[1];	// registered copy of L[1]; when set, the old Preg is dropped from the sum
+					end
+					always_ff @(posedge clk) begin
+						if (rst) 		Preg <= 0;
+						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg + (FIRST ? 0 : pcout[j][i-1]);	// FIRST && LAST: no upstream cascade, pcout[j][-1] does not exist
+					end
+				end
+				else begin : genMid
+					if (PREG) begin : genPregBehav
+						always_ff @(posedge clk) begin
+							if (rst)		Preg <= 0;
+							else if (en)	Preg <= Mreg + pcout[j][i-1];
+						end
+					end
+					else	assign Preg = Mreg + pcout[j][i-1];
+				end
+				assign pp = Preg;
+				assign pcout[j][i] = pp;
+			end : genBehav
+
+			else begin: genDSP
+				DSP58 #(
+					// Feature Control Attributes: Data Path Selection
+					.AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
+					.A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+					.BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
+					.B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+					.DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
+														// legacy mode.
+					.PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
+					.RND(58'h000000000000000),          // Rounding Constant
+					.USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+					.USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
+					.USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
+					.XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
+					// Pattern Detector Attributes: Pattern Detection Configuration
+					.AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+					.AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
+					.MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
+					.PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
+					.SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+					.SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
+					.USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
+					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+					.IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
+					.IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
+					.IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
+					.IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
+					.IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
+					.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 
+										FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN 
+										2'b01, // Y : M
+										2'b01  // X: M
+					}), // Optional inversion for OPMODE
+					.IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
+					.IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
+					.IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
+					.IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
+					.IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for RSTCTRL
+					.IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
+					.IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
+					.IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
+					.IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
+					.IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
+					// Register Control Attributes: Pipeline Register Configuration
+					.ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+					.ADREG(0),                          // Pipeline stages for pre-adder (0-1)
+					.ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
+					.AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
+					.BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+					.BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
+					.CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
+					.CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
+					.CREG(0),                           // Pipeline stages for C (0-1)
+					.DREG(0),                           // Pipeline stages for D (0-1)
+					.INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
+					.MREG(1),                           // Multiplier pipeline stages (0-1)
+					.OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
+					.PREG(PREG),                        // Number of pipeline stages for P (0-1)
+					.RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
+				)
+				DSP58_inst (
+					// Cascade outputs: Cascade Ports
+					.ACOUT(),                           // 34-bit output: A port cascade
+					.BCOUT(),                           // 24-bit output: B cascade
+					.CARRYCASCOUT(),                    // 1-bit output: Cascade carry
+					.MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
+					.PCOUT(pcout[j][i]),                // 58-bit output: Cascade output
+					// Control outputs: Control Inputs/Status Bits
+					.OVERFLOW(),                        // 1-bit output: Overflow in add/acc
+					.PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
+					.PATTERNDETECT(),                   // 1-bit output: Pattern detect
+					.UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
+					// Data outputs: Data Ports
+					.CARRYOUT(),                        // 4-bit output: Carry
+					.P(pp),                             // 58-bit output: Primary data
+					.XOROUT(),                          // 8-bit output: XOR data
+					// Cascade inputs: Cascade Ports
+					.ACIN('x),                          // 34-bit input: A cascade data
+					.BCIN('x),                          // 24-bit input: B cascade
+					.CARRYCASCIN('x),                   // 1-bit input: Cascade carry
+					.MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
+					.PCIN(FIRST ? 'x : pcout[j][i-1]),  // 58-bit input: P cascade
+					// Control inputs: Control Inputs/Status Bits
+					.ALUMODE(4'h0),                     // 4-bit input: ALU control
+					.CARRYINSEL('0),                    // 3-bit input: Carry select
+					.CLK(clk),                          // 1-bit input: Clock
+					.INMODE({
+							INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
+							2'b00,
+							TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
+							INTERNAL_PREGS==2 ? 1'b0 : 1'b1        
+					}),                                 // 5-bit input: INMODE control
+					.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
+					.OPMODE({
+							LAST ? {1'b0, L[1]} : 2'b00,
+							7'b000_0000
+					}), // 9-bit input: Operation mode
+					// Data inputs: Data Ports
+					.A({ 7'bx, a_in_i[i] }),            // 34-bit input: A data
+					.B(b_in_i[j][i]),                   // 24-bit input: B data
+					.C('x),                             // 58-bit input: C data
+					.CARRYIN('0),                       // 1-bit input: Carry-in
+					.D('x),                             // 27-bit input: D data
+					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+					.ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
+					.CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
+					.CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
+					.CEAD('0),                          // 1-bit input: Clock enable for ADREG
+					.CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
+					.CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
+					.CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
+					.CEC('0),                           // 1-bit input: Clock enable for CREG
+					.CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
+					.CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+					.CED('0),                           // 1-bit input: Clock enable for DREG
+					.CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
+					.CEM(en),                           // 1-bit input: Clock enable for MREG
+					.CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
+					.RSTA(rst),                         // 1-bit input: Reset for AREG
+					.RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
+					.RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
+					.RSTB(rst),                         // 1-bit input: Reset for BREG
+					.RSTC('0),                          // 1-bit input: Reset for CREG
+					.RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+					.RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
+					.RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
+					.RSTM(rst),                         // 1-bit input: Reset for MREG
+					.RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
+				);
+			end : genDSP
 		end : genDSPChain  
 	end : genDSPPE
     

From 18f94e7ab03a3034083680faa91a80359858589e Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:18:58 +0100
Subject: [PATCH 111/235] [rtl mvu]: extended flow control wrapper with
 additional compute core and other minor changes

---
 finn-rtllib/mvu/mvu_axi.sv        | 51 +++++++++++++++++++------------
 finn-rtllib/mvu/mvu_axi_wrapper.v | 48 ++++++++++++++---------------
 2 files changed, 54 insertions(+), 45 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
index 5d8700738f..e4a919ba88 100644
--- a/finn-rtllib/mvu/mvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_axi.sv
@@ -41,8 +41,8 @@ module mvu_axi #(
 	int unsigned ACCU_WIDTH,
 	bit SIGNED_ACTIVATIONS = 0,
 	int unsigned SEGMENTLEN = 0,
-	parameter RAM_STYLE = "auto",
-	parameter MVU_IMPL_STYLE,
+	bit FORCE_BEHAVIORAL = 0,
+	string MVU_IMPL_STYLE,
 
 	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
 	localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
@@ -96,12 +96,14 @@ module mvu_axi #(
 			$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
 			$finish;
 		end
-		if (SEGMENTLEN == 0) begin
-			$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
-		end
-		if (SEGMENTLEN > (SIMD+2)/3) begin
-			$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
-			$finish;
+		if (MVU_IMPL_STYLE == "mvu_8sx9_dsp58") begin	// SEGMENTLEN is only passed to the mvu_8sx9 core; string must match the generate selector
+			if (SEGMENTLEN == 0) begin
+				$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
+			end
+			if (SEGMENTLEN > (SIMD+2)/3) begin
+				$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+				$finish;
+			end
 		end
 	end
 
@@ -116,7 +118,7 @@ module mvu_axi #(
 	uwire avld;
 	uwire ardy;
 
-	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay (
+	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay (
 		.clk, .rst,
 		.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
 		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
@@ -133,28 +135,37 @@ module mvu_axi #(
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
 	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
 	
-	if (MVU_IMPL_STYLE == "mvu_8sx9") begin : genMVU8sx9
+	if (MVU_IMPL_STYLE == "mvu_8sx9_dsp58") begin : genMVU8sx9
 		mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core (
+		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
 			.vld(ovld), .p(odat)
 		);
 	end
 	else if (MVU_IMPL_STYLE == "mvu_4sx4u") begin : genMVU4sx4u
-		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(0)) core (
+		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
 			.vld(ovld), .p(odat)
 		);
 	end
-	//else begin
-	//	$error("Unrecognized MVU_IMPL_STYLE!");
-	//	$finish;
-	//end
+	else if (MVU_IMPL_STYLE == "mvu_8sx8u_dsp48") begin : genMVU8sx8u
+		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		 .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+			.clk, .rst, .en,
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.vld(ovld), .p(odat)
+		);
+	end
+	else initial begin
+		$error("Unrecognized MVU_IMPL_STYLE!");
+		$finish;
+	end
 
 //-------------------- Output register slice --------------------\\
-	struct {
+	struct packed {
 		logic vld;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
 	} A = '{ vld: 0, default: 'x};
@@ -175,7 +186,7 @@ module mvu_axi #(
 		end
 	end
 	
-	struct {
+	struct packed {
 		logic vld;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
 	} B = '{ vld: 0, default: 'x};
diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v
index 323d2711e4..b79ba6bbd1 100644
--- a/finn-rtllib/mvu/mvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_axi_wrapper.v
@@ -41,7 +41,8 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	ACCU_WIDTH = $ACCU_WIDTH$,
 	parameter 	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
 	parameter 	SEGMENTLEN = $SEGMENTLEN$,
-	parameter 	RAM_STYLE = "$IBUF_RAM_STYLE$",
+	parameter	MVU_IMPL_STYLE = "$MVU_IMPL_STYLE$",
+	parameter	FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$,
 
 	// Safely deducible parameters
 	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
@@ -50,41 +51,38 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )(
   	// Global Control
-	input	logic  ap_clk,
-	input	logic  ap_rst_n,
-
+	input	ap_clk,
+	input	ap_rst_n,
 	// Weight Stream
-	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input	logic  s_axis_weights_tvalid,
-	output	logic  s_axis_weights_tready,
-
+	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  weights_V_TDATA,
+	input   weights_V_TVALID,
+	output  weights_V_TREADY,
 	// Input Stream
-	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input	logic  s_axis_input_tvalid,
-	output	logic  s_axis_input_tready,
-
+	input	[INPUT_STREAM_WIDTH_BA-1:0]  in0_V_TDATA,
+	input	in0_V_TVALID,
+	output	in0_V_TREADY,
 	// Output Stream
-	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
-	output	logic  m_axis_output_tvalid,
-	input	logic  m_axis_output_tready
+	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  out_V_TDATA,
+	output	out_V_TVALID,
+	input	out_V_TREADY
 );
 
 mvu_axi #(
 	.MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
 	.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-	.SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE)
+	.SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), .MVU_IMPL_STYLE(MVU_IMPL_STYLE)
 	) inst (
 	.ap_clk(ap_clk),
 	.ap_rst_n(ap_rst_n),
-	.s_axis_weights_tdata(s_axis_weights_tdata),
-	.s_axis_weights_tvalid(s_axis_weights_tvalid),
-	.s_axis_weights_tready(s_axis_weights_tready),
-	.s_axis_input_tdata(s_axis_input_tdata),
-	.s_axis_input_tvalid(s_axis_input_tvalid),
-	.s_axis_input_tready(s_axis_input_tready),
-	.m_axis_output_tdata(m_axis_output_tdata),
-	.m_axis_output_tvalid(m_axis_output_tvalid),
-	.m_axis_output_tready(m_axis_output_tready)
+	.s_axis_weights_tdata(weights_V_TDATA),
+	.s_axis_weights_tvalid(weights_V_TVALID),
+	.s_axis_weights_tready(weights_V_TREADY),
+	.s_axis_input_tdata(in0_V_TDATA),
+	.s_axis_input_tvalid(in0_V_TVALID),
+	.s_axis_input_tready(in0_V_TREADY),
+	.m_axis_output_tdata(out_V_TDATA),
+	.m_axis_output_tvalid(out_V_TVALID),
+	.m_axis_output_tready(out_V_TREADY)
 );
 
 endmodule : $MODULE_NAME_AXI_WRAPPER$
\ No newline at end of file

From 6d4a0a764e0e6ded16d7034e0d69f5408c76ca75 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:22:51 +0100
Subject: [PATCH 112/235] [rtl mvu]: fix to done_len flag when SIMD dimension
 fully unrolled and PyVerilator-related syntax change

---
 finn-rtllib/mvu/replay_buffer.sv | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
index 685ac03137..89bbbdb88f 100644
--- a/finn-rtllib/mvu/replay_buffer.sv
+++ b/finn-rtllib/mvu/replay_buffer.sv
@@ -35,8 +35,7 @@
 module replay_buffer #(
 	int unsigned  LEN,	// Sequence length
 	int unsigned  REP,	// Sequence replay count
-	int unsigned  W,	// Data width
-	parameter RAM_STYLE = "auto" 	// ram style for buffer {block, distributed, ultra, auto}
+	int unsigned  W 	// Data width
 )(
 	input	logic  clk,
 	input	logic  rst,
@@ -54,7 +53,7 @@ module replay_buffer #(
 
 	typedef logic [$clog2(REP)+$clog2(LEN)-1:0]  count_t;
 	count_t  Count = 0;
-	uwire  done_len = ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0;
+	uwire  done_len = LEN == 1 ? 1 : ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0;
 	uwire  done_rep;
 	uwire  done_all = done_len && done_rep;
 
@@ -83,7 +82,6 @@ module replay_buffer #(
 		end
 		assign	first_rep = FirstRep;
 
-		(* RAM_STYLE = RAM_STYLE *)
 		data_t  Buf[LEN];
 		if(LEN == 1) begin : genTrivial
 			always_ff @(posedge clk) begin
@@ -92,7 +90,10 @@ module replay_buffer #(
 		end : genTrivial
 		else begin : genShift
 			always_ff @(posedge clk) begin
-				if(shift)  Buf <= { odat, Buf[0:LEN-2] };
+				if(shift) begin
+					Buf[0] <= odat;
+					Buf[1:LEN-1] <= Buf[0:LEN-2];
+				end
 			end
 		end : genShift
 

From 90c547d54756aed2aa101862fb6f55c05149173c Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:23:22 +0100
Subject: [PATCH 113/235] [rtl mvu tb]: updated testbench

---
 finn-rtllib/mvu/tb/mvu_axi_tb.sv | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
index 08a349da84..ef5fa7d682 100644
--- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv
+++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
@@ -35,17 +35,18 @@ module mvu_axi_tb();
 
 //-------------------- Simulation parameters --------------------\\
 	// Matrix & parallelism config
-	localparam int unsigned MW = 90;
-	localparam int unsigned MH = 16;
-	localparam int unsigned SIMD = 9;
-	localparam int unsigned PE = 4;
-	localparam int unsigned SEGMENTLEN = 1;
-	localparam string MVU_IMPL_STYLE = "mvu_8sx9";
+	localparam int unsigned MW = 50;
+	localparam int unsigned MH = 8;
+	localparam int unsigned SIMD = 10;
+	localparam int unsigned PE = 2;
+	localparam int unsigned SEGMENTLEN = 2;
+	localparam string MVU_IMPL_STYLE = "mvu_8sx8u_dsp48";
+	localparam bit FORCE_BEHAVIORAL = 1;
 	// Bit-width config  
 	localparam int unsigned ACTIVATION_WIDTH = 8;
-	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam int unsigned WEIGHT_WIDTH = 8;
 	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
-	localparam bit SIGNED_ACTIVATIONS = 1;
+	localparam bit SIGNED_ACTIVATIONS = 0;
 	// Simulation constants  
 	localparam int unsigned NF = MH/PE;
 	localparam int unsigned SF = MW/SIMD;
@@ -94,7 +95,7 @@ module mvu_axi_tb();
 		for (int i=0; i<SF; i++) begin
 			activations.dat <= ACTIVATIONS[i];
 			do begin 
-				activations.vld = $urandom()%7 > 1;
+				activations.vld = $urandom()%7 >= 1;
 				@(posedge clk);
 			end while (!(activations.vld === 1 && activations.rdy === 1));
 		end
@@ -201,6 +202,7 @@ module mvu_axi_tb();
 		.ACCU_WIDTH(ACCU_WIDTH),
 		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
 		.SEGMENTLEN(SEGMENTLEN),
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL),
 		.MVU_IMPL_STYLE(MVU_IMPL_STYLE)
 	)
 	dut (

From 0c37f1f7bed1143833649accceb59bd6821bed3c Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:25:10 +0100
Subject: [PATCH 114/235] [builder]: added specialize_to_rtl step and changed
 standalone threshold layers to be by default true

---
 src/finn/builder/build_dataflow_config.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index 4c3e4ff899..24940489df 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -121,6 +121,7 @@ class VerificationStepType(str, Enum):
     "step_apply_folding_config",
     "step_minimize_bit_width",
     "step_generate_estimate_reports",
+    "step_specialize_to_rtl",
     "step_hls_codegen",
     "step_hls_ipgen",
     "step_set_fifo_depths",
@@ -233,7 +234,7 @@ class DataflowBuildConfig:
     #: activations in FINN) will be implemented as stand-alone HLS layers,
     #: instead of being part of MatrixVectorActivation layer. This gives larger
     #: flexibility, and makes it possible to have runtime-writable thresholds.
-    standalone_thresholds: Optional[bool] = False
+    standalone_thresholds: Optional[bool] = True
 
     #: (Optional) Whether optimizations that minimize the bit width of the
     #: weights and accumulator will be applied. Because this optimization relies

From 5ccb016a640dbed6818a9f1f3ef46136ce949c0d Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:26:03 +0100
Subject: [PATCH 115/235] [builder]: added specialize_to_rtl step

---
 src/finn/builder/build_dataflow_steps.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index e43a29d632..3e4d047a51 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -123,6 +123,7 @@
 )
 from finn.util.pyverilator import verilator_fifosim
 from finn.util.test import execute_parent
+import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
 
 
 def verify_step(
@@ -483,6 +484,16 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig
     return model
 
 
+def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Replace HLS layers by an equivalent specialized RTL implementation where possible."""
+    # Transformations run in order; each one receives the model returned by the previous.
+    rtl_passes = (to_rtl.InferRTLMatrixVectorActivation(),)
+    lowered = model
+    for rtl_pass in rtl_passes:
+        lowered = lowered.transform(rtl_pass)
+    return lowered
+    
+
 def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig):
     """Tighten the weight and accumulator bit widths for each layer."""
     if cfg.minimize_bit_width:
@@ -855,6 +866,7 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig):
     "step_apply_folding_config": step_apply_folding_config,
     "step_minimize_bit_width": step_minimize_bit_width,
     "step_generate_estimate_reports": step_generate_estimate_reports,
+    "step_specialize_to_rtl": step_specialize_to_rtl,
     "step_hls_codegen": step_hls_codegen,
     "step_hls_ipgen": step_hls_ipgen,
     "step_set_fifo_depths": step_set_fifo_depths,

From f099f4bbfd01b628a89c6099f637a4a85a8158ca Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:26:44 +0100
Subject: [PATCH 116/235] [custom op]: added custom op
 MatrixVectorActivation_rtl

---
 src/finn/custom_op/fpgadataflow/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 56d4230a3a..19c0ddd999 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -49,6 +49,7 @@
 from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
 from finn.custom_op.fpgadataflow.lookup import Lookup
 from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation
+from finn.custom_op.fpgadataflow.matrixvectoractivation_rtl import MatrixVectorActivation_rtl
 from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch
 from finn.custom_op.fpgadataflow.streamingdataflowpartition import (
     StreamingDataflowPartition,
@@ -70,6 +71,7 @@
 custom_op["DownSampler"] = DownSampler
 custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch
 custom_op["MatrixVectorActivation"] = MatrixVectorActivation
+custom_op["MatrixVectorActivation_rtl"] = MatrixVectorActivation_rtl
 custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator
 custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D
 custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl

From 9a3b0fdc54f8c7c1b541c8cfdaaf6e96315da092 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:28:34 +0100
Subject: [PATCH 117/235] [custom op]: added additional attribute to enable
 conversion to RTL (custom-op)

---
 src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
index aa987384dd..e54abb0c3f 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
@@ -70,7 +70,7 @@ def get_nodeattr_types(self):
             "SIMD": ("i", True, 0),
             "MW": ("i", True, 0),
             "MH": ("i", True, 0),
-            "resType": ("s", False, "lut", {"auto", "lut", "dsp"}),
+            "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}),
             "ActVal": ("i", False, 0),
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
@@ -125,6 +125,8 @@ def get_nodeattr_types(self):
             # vector through the accelerator. This will get rid of any old
             # weight data from the weight FIFOs.
             "runtime_writeable_weights": ("i", False, 0, {0, 1}),
+            # Flag to specify whether RTL-based or HLS-based implementation is preferred
+            "impl": ("s", False, "rtl", {"hls", "rtl"})
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs

From 38aa930baa1296a7099f9df22e3d0d000c8d5a05 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:30:15 +0100
Subject: [PATCH 118/235] [custom op]: modified ip-stitching and code
 generation

---
 .../matrixvectoractivation_rtl.py             | 231 ++++++++++--------
 1 file changed, 127 insertions(+), 104 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index c8a0aa675b..6b1c2f3be7 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import math
+from shutil import copy
 import numpy as np
 import os
 import textwrap
@@ -45,6 +46,12 @@
     pack_innermost_dim_as_hex_string,
     rtlsim_output_to_npy,
 )
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
+
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
 
 from . import templates
 
@@ -60,8 +67,8 @@ class MatrixVectorActivation_rtl(HLSCustomOp):
     """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch
     function."""
 
-    def __init__(self, onnx_node):
-        super().__init__(onnx_node)
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
         self.decoupled_wrapper = templates.decoupled_wrapper
 
     def get_nodeattr_types(self):
@@ -78,11 +85,6 @@ def get_nodeattr_types(self):
             "outputDataType": ("s", True, ""),
             # FINN DataType for accumulator -- auto-computed and updated
             "accDataType": ("s", False, "INT32"),
-            # use xnor-popcount for binary weights/inputs, thus treating them
-            # as bipolar
-            "binaryXnorMode": ("i", False, 0, {0, 1}),
-            # no-activation mode (produce accumulators)
-            "noActivation": ("i", False, 0, {0, 1}),
             # number of input vectors, examples:
             # [1] is a single vector (like a FC layer with batch=1)
             # [4] is four vectors (like a FC layer with batch=4)
@@ -105,16 +107,6 @@ def get_nodeattr_types(self):
                 "auto",
                 {"auto", "block", "distributed", "ultra"},
             ),
-            # FPGA resource type for threshold memories (if noActivation is False)
-            # auto -- let Vivado decide
-            # block -- use BRAM
-            # distributed -- use LUTRAM
-            "ram_style_thresholds": (
-                "s",
-                False,
-                "auto",
-                {"auto", "block", "distributed"},
-            ),
             # (mem_mode = decoupled only) whether weights will be writable through
             # an AXI-lite interface during runtime
             # 1 for enabled, 0 for disabled.
@@ -125,6 +117,8 @@ def get_nodeattr_types(self):
             # vector through the accelerator. This will get rid of any old
             # weight data from the weight FIFOs.
             "runtime_writeable_weights": ("i", False, 0, {0, 1}),
+            # attribute to save top module name - not user configurable
+            "gen_top_module": ("s", False, ""),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -142,7 +136,6 @@ def calc_wmem(self):
 
     def calc_tmem(self):
         """Calculates and returns TMEM."""
-        assert self.get_nodeattr("noActivation")==1, "RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer"
         return 0
 
     def make_shape_compatible_op(self, model):
@@ -192,27 +185,9 @@ def verify_node(self):
                 """The required MatrixVectorActivation attributes do not exist."""
             )
 
-        # verify the number of inputs depending on noActivation value
-        # check noActivation value to determine the number of inputs
-        no_act = self.get_nodeattr("noActivation")
-
-        if no_act == 1:
-            if len(self.onnx_node.input) == 2:
-                info_messages.append("The number of inputs is correct")
-            else:
-                info_messages.append(
-                    """RTL-based MatrixVectorActivation needs in no
-                            activation mode 2 inputs (data input and weights)"""
-                )
-        elif no_act == 0:
-            info_messages.append("RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer")
-        else:
-            info_messages.append(
-                """noActivation attribute contains {} should
-                be 1 for RTL-based MatrixVectorActivation""".format(
-                    no_act
-                )
-            )
+        num_of_inputs = len(self.onnx_node.input)
+        if num_of_inputs!=2:
+            info_messages.append("RTL-based MatrixVectorActivation expects two inputs (weights and activation), but got {} inputs.".format(len(self.onnx_node.input)))
 
         mem_mode = self.get_nodeattr("mem_mode")
 
@@ -221,6 +196,7 @@ def verify_node(self):
 
         return info_messages
 
+# TODO: Add in replay_buffer estimation
     def uram_estimation(self):
         P = self.get_nodeattr("PE")
         Q = self.get_nodeattr("SIMD")
@@ -242,6 +218,7 @@ def uram_estimation(self):
         depth_multiplier = math.ceil(omega / 4096)
         return width_multiplier * depth_multiplier
 
+# TODO: Add in replay_buffer estimation
     def bram_estimation(self):
         """Calculates resource estimation for BRAM based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -268,7 +245,7 @@ def bram_estimation(self):
         ):
             return 0
         # assuming SDP mode RAMB18s (see UG573 Table 1-10)
-        # assuming decoupled (RTL) memory, which is more efficient than const (HLS)
+        # assuming decoupled (RTL) memory
         if mem_width == 1:
             return math.ceil(omega / 16384)
         elif mem_width == 2:
@@ -282,6 +259,7 @@ def bram_estimation(self):
         else:
             return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36))
 
+# TODO: Add in replay_buffer estimation
     def bram_efficiency_estimation(self):
         wdt = self.get_weight_datatype()
         W = wdt.bitwidth()
@@ -294,6 +272,7 @@ def bram_efficiency_estimation(self):
         bram16_est_capacity = bram16_est * 36 * 512
         return wbits / bram16_est_capacity
 
+# TODO: Add in replay_buffer estimation
     def uram_efficiency_estimation(self):
         """Function for URAM efficiency estimation: actual parameter storage
         needed divided by the allocated URAM storage (from estimation)"""
@@ -308,7 +287,7 @@ def uram_efficiency_estimation(self):
         uram_est_capacity = uram_est * 72 * 4096
         return wbits / uram_est_capacity
 
-#TODO: FIX
+#TODO: FIX: worst case estimates since segmentlen is not known at this point?
     def lut_estimation(self):
         """Calculates resource estimations for LUTs based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -348,23 +327,14 @@ def lut_estimation(self):
         # accumulator
         acc_bits = W + A + np.ceil(math.log(MW, 2))
         acc_luts = acc_bits
-        # thresholds and threshold comparators
-        thr_luts = 0
-        comp_luts = 0
-        noact = self.get_nodeattr("noActivation")
-        if noact == 0:
-            odt = self.get_output_datatype()
-            B = odt.bitwidth()
-            thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64)
-            comp_luts = (2**B - 1) * acc_bits
 
         return int(
             c0
-            + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts))
+            + c1 * (P * (mult_luts + addertree_luts + acc_luts))
             + c2
         )
 
-#TODO: FIX
+#TODO: FIX: worst case estimates since segmentlen is not known at this point?
     def dsp_estimation(self):
         # multiplication
         P = self.get_nodeattr("PE")
@@ -380,7 +350,7 @@ def dsp_estimation(self):
             mult_dsp = 0
         return int(mult_dsp)
 
-#TODO: FIX
+#TODO: FIX: worst case estimates since segmentlen is not known at this point
     def get_exp_cycles(self):
         pe = self.get_nodeattr("PE")
         simd = self.get_nodeattr("SIMD")
@@ -389,6 +359,7 @@ def get_exp_cycles(self):
         mw = self.get_nodeattr("MW")
         # since mmv != 1 is not supported yet, we set mmv for now to 1
         mmv = 1
+        # Actual exp_cycles is probably slightly larger: say 3 cycles (DSP A/B, M, P registers) plus additional pipeline buffer cycles. Most probably <10.
         exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv
         return int(exp_cycles)
 
@@ -413,7 +384,7 @@ def get_output_datatype(self, ind=0):
 
     def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
-        assert i_bits<=9, "RTL-based MVAU only supports activations with bit-width up to 9-bits"
+        assert (i_bits<=9), "RTL-based MVAU only supports activations with bit-width up to 9-bits"
         in_width = i_bits * self.get_nodeattr("SIMD")
         return in_width
 
@@ -431,8 +402,8 @@ def get_weightstream_width(self):
             pe = self.get_nodeattr("PE")
             simd = self.get_nodeattr("SIMD")
             wp = self.get_weight_datatype().bitwidth()
+            assert (wp <= 8), "RTL-based MVAU only supports weights with bit-width up to 8-bits"
             w_width = pe * simd * wp
-            assert wp<=8, "RTL-based MVAU only supports weights with bit-width up to 8-bits"
             return w_width
         else:
             return 0
@@ -544,10 +515,8 @@ def minimize_accumulator_width(self, model):
                 adt = DataType.get_smallest_possible(-acc_max - 1)
         else:
             adt = DataType.get_smallest_possible(acc_max)
-        # ensure a datatype divisible by 8-bits in case this is the last node
-        bw = roundup_to_integer_multiple(adt.bitwidth(), 8)
-        new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw))
-        adt = DataType[new_adt_name]
+        # Note: we are interested in simply the width of the output dot product.
+        # Padding the actual output stream to a multiple of 8-bits is done in the RTL component
         self.set_nodeattr("accDataType", adt.name)
         # for no-activation nodes, output dt = acc dt
         self.set_nodeattr("outputDataType", adt.name)
@@ -588,7 +557,10 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
                 1, -1, pe * simd
             )
             weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy()
-            if weight_file_mode == "decoupled_verilog_dat":
+            if weight_file_mode == "decoupled_npy":
+                # save weight stream into npy for cppsim
+                np.save(weight_file_name, weight_tensor_simd_flipped)
+            elif weight_file_mode == "decoupled_verilog_dat":
                 # convert weight values into hexstring
                 weight_width = self.get_weightstream_width()
                 # pad to nearest 4 bits to get hex strings
@@ -638,7 +610,7 @@ def generate_params(self, model, path):
             weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
             # save decoupled weights for cppsim
             self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
-            # also save weights as Verilog .dat file
+            # Also save weights as Verilog .dat file
             # note that we provide two different .dat files, one for synth
             # and one for synthesis. this is because URAM-based weights always
             # need zero weights for synthesis, otherwise they get inferred
@@ -693,7 +665,6 @@ def execute_node(self, context, graph):
         for inputs in node.input:
             # it is assumed that the first input of the node is the data input
             # the second input are the weights
-            # the third input are the thresholds
             if in_ind == 0:
                 assert (
                     str(context[inputs].dtype) == "float32"
@@ -709,7 +680,7 @@ def execute_node(self, context, graph):
                     reshaped_input,
                 )
             elif in_ind > 2:
-                raise Exception("Unexpected input found for MatrixVectorActivation")
+                raise Exception("Unexpected input found for MatrixVectorActivation_rtl")
             in_ind += 1
 
         if mode == "rtlsim":
@@ -759,7 +730,7 @@ def execute_node(self, context, graph):
     def code_generation_ipgen(self, model, fpgapart, clk):
         """Normally: Generates C++ code and tcl script for IP generation.
         Here: Generates (System-)Verilog code for IP generation."""
-        self.generate_hdl()
+        self.generate_hdl(model, fpgapart, clk)
 
     def ipgen_singlenode_code(self):
         """Normally: Builds the bash script for IP generation."""
@@ -828,11 +799,21 @@ def code_generation_ipi(self):
                 "create_bd_intf_pin -mode Slave "
                 "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
             )
-            # instantiate the hls ip
-            cmd.append(
-                "create_bd_cell -type ip -vlnv %s /%s/%s"
-                % (self.get_nodeattr("ip_vlnv"), node_name, node_name)
-            )
+            # instantiate the RTL block
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+            rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+            sourcefiles = [
+                os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"),
+                rtllib_dir + "mvu_axi.sv",
+                rtllib_dir + "replay_buffer.sv",
+                rtllib_dir + "mvu_4sx4u.sv",
+                rtllib_dir + "mvu_8sx9.sv",
+                rtllib_dir + "mvu_8sx8u_dsp48.sv"
+            ]
+            for f in sourcefiles:
+                cmd.append("add_files -norecurse %s" % (f))
+            cmd.append("create_bd_cell -type hier -reference %s /%s/%s" % (self.get_nodeattr("gen_top_module"), self.onnx_node.name, self.onnx_node.name))
+
             # instantiate a streamer and connect it to the HLS IP
             strm_vlnv = "xilinx.com:user:memstream:1.0"
             strm_inst = node_name + "_wstrm"
@@ -947,12 +928,6 @@ def get_op_and_param_counts(self):
         weight_param_type = "param_weight_%db" % (weight_bits)
         weight_count = in_features * out_features
         ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
-        if self.get_nodeattr("noActivation") == 0:
-            tdt = DataType[self.get_nodeattr("accDataType")]
-            thres_bits = tdt.bitwidth()
-            thres_param_type = "param_threshold_%db" % (thres_bits)
-            thres_count = out_features
-            ret_dict[thres_param_type] = thres_count
         return ret_dict
 
     def derive_characteristic_fxns(self, period):
@@ -972,65 +947,113 @@ def derive_characteristic_fxns(self, period):
             ]
         super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
 
-    def generate_hdl(self):
-#TODO: add distinction between (PE=MH or PE=1) and where MH dimension is folded
-        template_path, code_gen_dict = self.prepare_codegen_default()
+# TODO: characterize max_clk and implement this function in look-up style
+    def _resolve_segment_len(self, clk):
+        # Intended to insert pipeline registers in the DSP chain to meet the target clock; placeholder for now: `clk` is not consulted
+        segmentlen = 0  # constant result until the TODO above is implemented
+        return segmentlen
+
+    def _resolve_impl_style(self, fpgapart):
+        # Choose the name of the RTL compute core from the activation/weight bit-widths and the prefix of the target part string
+        act_width = self.get_input_datatype(0).bitwidth()
+        weight_width = self.get_input_datatype(1).bitwidth()
+        is_versal = fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpgapart[0:5] == "xqrvc"
+        if (act_width == 4 and weight_width == 4):  # NOTE(review): only widths are compared, operand signedness is not checked here - confirm
+            return "mvu_4sx4u"  # taken only when both operands are exactly 4 bits wide
+        else:
+            if (is_versal):
+                return "mvu_8sx9_dsp58"  # part prefix matched the list above
+            else:
+                return "mvu_8sx8u_dsp48"  # any other part
+
+    def generate_hdl(self, model, fpgapart, clk):
+        # Generate params as part of IP preparation
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        self.generate_params(model, code_gen_dir)
 
+        template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk)
         # add general parameters to dictionary
-        code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()]
+        code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()]
         # save top module name so we can refer to it after this node has been renamed
         # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
         self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
-#TODO: currently only ram_style=auto is supported
+
         ram_style = self.get_nodeattr("ram_style")
-        if ram_style == "auto":
-            continue
-        else:
-            raise Exception("Unrecognized ram_style for MatrixVectorActivation")
+        assert (ram_style=="auto"), "Unrecognized ram_style for MatrixVectorActivation_rtl"
 
-        # apply code generation to templates
-        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        # apply code generation to template
         with open(template_path, "r") as f:
-            template = f.read()
+            template_wrapper = f.read()
         for key in code_gen_dict:
             # transform list into long string separated by '\n'
             code_gen_line = "\n".join(code_gen_dict[key])
-            template = template.replace(key, code_gen_line)
             template_wrapper = template_wrapper.replace(key, code_gen_line)
         with open(
             os.path.join(
-                code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv"
+                code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
             ),
             "w",
         ) as f:
-            f.write(template)
+            f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(0)))
         with open(
             os.path.join(
-                code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+                code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"
             ),
             "w",
         ) as f:
-            f.write(template_wrapper)
+            f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(1)))
 
         # set ipgen_path and ip_path so that HLS-Synth transformation
         # and stich_ip transformation do not complain
         self.set_nodeattr("ipgen_path", code_gen_dir)
-        self.set_nodeattr("ip_path", code_gen_dir)    
+        self.set_nodeattr("ip_path", code_gen_dir)
 
-    def prepare_codegen_default(self):
-        # TODO: Differentiate between PE folding and fully unrolled along MH dimension
+    def prepare_codegen_default(self, fpgapart, clk):
         template_path = (
-            os.environ["FINN_ROOT"] + "/finn-rtllib/mvau/dsp58_mvau_template.vhdl"
+            os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v"
         )
+        
         code_gen_dict = {}
-
-        code_gen_dict["$PE$"] = self.get_nodeattr("PE")
-        code_gen_dict["$SIMD$"] = self.get_nodeattr("SIMD")
-        code_gen_dict["$MW$"] = self.get_nodeattr("MW")
-        code_gen_dict["$MH$"] = self.get_nodeattr("MH")
-        code_gen_dict["$ACTIVATION_WIDTH$"] = self.get_input_datatype(0).bitwidth()
-        code_gen_dict["$WEIGHT_WIDTH$"] = self.get_input_datatype(1).bitwidth()
-        code_gen_dict["$ACCU_WIDTH_BA$"] = self.get_output_datatype().bitwidth()
+        code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))]
+        code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))]
+        code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))]
+        code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))]
+        code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())]
+        code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())]
+        code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())]
+        code_gen_dict["$SIGNED_ACTIVATIONS$"] = [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
+        code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))]
+        code_gen_dict["$MVU_IMPL_STYLE$"] = [self._resolve_impl_style(fpgapart)]
 
         return template_path, code_gen_dict
 
+    def prepare_rtlsim(self):
+        """Creates a Verilator emulation library for the RTL code generated
+        for this node, sets the rtlsim_so attribute to its path and returns
+        a PyVerilator wrapper around it."""
+
+        if PyVerilator is None:
+            raise ImportError("Installation of PyVerilator is required.")
+
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        # Path to (System-)Verilog files used by top-module & path to top-module
+        verilog_paths = [
+            code_gen_dir,
+            os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"
+        ]
+        verilog_files = [
+            self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"
+        ]
+
+        # build the Verilator emu library; file name and top module both come from the saved gen_top_module so they still agree if the node is renamed after code generation
+        sim = PyVerilator.build(
+            verilog_files,
+            build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
+            verilog_path=verilog_paths,
+            trace_depth=get_rtlsim_trace_depth(),
+            top_module_name=self.get_nodeattr("gen_top_module")
+        )
+        # save generated lib filename in attribute
+        self.set_nodeattr("rtlsim_so", sim.lib._name)
+
+        return sim
\ No newline at end of file

From 4e44934c3001174e52c62caf5d320104a308e611 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:31:35 +0100
Subject: [PATCH 119/235] [tests]: initial version of unit test for RTL custom
 op and specialize_to_rtl transformation for MVU

---
 .../test_fpgadataflow_mvau_rtl.py             | 172 ++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py

diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
new file mode 100644
index 0000000000..20a249bd08
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
@@ -0,0 +1,172 @@
+# Copyright (C) 2022, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+import os
+
+import numpy as np
+from onnx import TensorProto, helper
+from qonnx.util.basic import (
+    qonnx_make_model,
+    gen_finn_dt_tensor
+)
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.core.datatype import DataType
+from qonnx.transformation.general import GiveUniqueNodeNames
+import finn.core.onnx_exec as oxe
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from qonnx.transformation.general import ApplyConfig
+import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
+#import qonnx.core.data_layout as DataLayout
+
+build_dir = os.environ["FINN_BUILD_DIR"]
+
+def make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt):
+    (ofm_h, ofm_w) = ofm_shape  # the output tensor is declared below as (1, ofm_h, ofm_w, mh)
+    ofm = helper.make_tensor_value_info(
+        "ofm",
+        TensorProto.FLOAT,
+        (1, ofm_h, ofm_w, mh)
+    )
+    # One MatMul node consuming the tensors named "ifm" and "weights" and producing "ofm"
+    matmul_node = helper.make_node(
+        "MatMul",
+        ["ifm", "weights"],
+        ["ofm"]
+    )
+    graph = helper.make_graph(
+        nodes=[matmul_node],
+        name="matmul_graph",
+        inputs=[ifm],
+        outputs=[ofm]
+    )
+    # Wrap the graph in a ModelWrapper; note that the `weights` value-info argument is never referenced in this function
+    model = qonnx_make_model(graph, producer_name="fclayer-model")
+    model = ModelWrapper(model)
+    # Annotate the tensor datatypes and store W as the initializer of "weights"
+    model.set_tensor_datatype("ifm", idt)
+    model.set_tensor_datatype("weights", wdt)
+    model.set_tensor_datatype("ofm", DataType["INT32"]) # At this step, the MatMul layer does not optimize the bit-width of the output datatype
+    model.set_initializer("weights", W)
+
+    # model.set_tensor_layout("ifm", DataLayout.NHWC)
+
+    return model
+
+def prepare_inputs(input_tensor):
+    return {"ifm": input_tensor}  # execution input dict, keyed by the graph input tensor name used in this test ("ifm")
+
+@pytest.mark.parametrize("mh", [16])
+@pytest.mark.parametrize("mw", [90])
+#@pytest.mark.parametrize("pe", [1, 2, 4, 8, 16])
+@pytest.mark.parametrize("pe", [16])
+#@pytest.mark.parametrize("simd", [1, 30, 90])
+@pytest.mark.parametrize("simd", [90])
+@pytest.mark.parametrize("idt", [DataType["INT8"]])
+@pytest.mark.parametrize("wdt", [DataType["UINT4"]])
+#@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"])
+@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"])
+@pytest.mark.parametrize("segmentlen", [1])
+@pytest.mark.fpgadataflow
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen):
+    # Declare the ifm (as produced by SWG) and weight tensors; note: `segmentlen` is parametrized but not referenced below
+    ofm_shape = (5, 5)
+    ofm_h, ofm_w = ofm_shape
+    ifm = helper.make_tensor_value_info(
+        "ifm",
+        TensorProto.FLOAT,
+        [1, ofm_h, ofm_w, mw]
+    )
+    weights = helper.make_tensor_value_info(
+        "weights",
+        TensorProto.FLOAT,
+        [mw, mh]
+    )
+    W = gen_finn_dt_tensor(wdt, (mw, mh))
+    model = make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt)
+    model = model.transform(GiveUniqueNodeNames())
+
+    model.save(build_dir+"/matmul.onnx")
+
+    # Create MatMul & obtain golden reference output
+    A = gen_finn_dt_tensor(model.get_tensor_datatype("ifm"), model.get_tensor_shape("ifm"))
+    input_dict = prepare_inputs(A)
+
+    # Execute ONNX model
+    output_matmul = oxe.execute_onnx(model, input_dict)
+
+    # Create MVAU (HLS)
+    model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled"))
+    model = model.transform(GiveUniqueNodeNames())
+
+    # Apply folding (i.e. specify to use DSPs)
+    folding_config = {
+        "Defaults": {},
+        "MatrixVectorActivation_0": {
+            "PE" : pe,
+            "SIMD" : simd,
+            "mem_mode" : "decoupled",
+            "ram_style" : "auto",
+            "resType" : "dsp",
+            "impl" : "rtl"
+        }
+    }
+    model = model.transform(ApplyConfig(folding_config))
+    model.save(build_dir+"/mvau_hls.onnx")
+
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(PrepareIP(part, 5))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+    output_mvau_hls = oxe.execute_onnx(model, input_dict)["ofm"]
+
+    # Apply convert-to-rtl step
+    model = model.transform(to_rtl.InferRTLMatrixVectorActivation())
+    model = model.transform(GiveUniqueNodeNames())
+    model.save(build_dir+"/mvau_rtl.onnx")
+
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(PrepareIP(part, 5))  # same parametrized part as the HLS run above
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+    output_mvau_rtl = oxe.execute_onnx(model, input_dict)["ofm"]
+
+    model.save(build_dir+"/mvau_rtl_sim.onnx")
+
+    assert (output_mvau_hls == output_mvau_rtl).all()
+    assert (output_mvau_hls.size > 0)
+
+
+# python setup.py test --addopts "-k test_fpgadataflow_mvau_rtl"
+# python setup.py test --addopts "-k test_fpgadataflow_fclayer_rtlsim"
\ No newline at end of file

From cc361d9fd4ea082e04d7a1a6bc3932406b0a4f14 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:32:52 +0100
Subject: [PATCH 120/235] [rtl mvu]: specialized compute core for 4-bit weights
 and activations for DSP48/DSP58

---
 finn-rtllib/mvu/mvu_4sx4u.sv | 359 +++++++++++++++++++++++++++++++++++
 1 file changed, 359 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_4sx4u.sv

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
new file mode 100644
index 0000000000..5993154355
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -0,0 +1,359 @@
+module mvu_4sx4u #(
+	int unsigned  PE,
+	int unsigned  SIMD,
+	int unsigned  ACCU_WIDTH,
+	bit FORCE_BEHAVIORAL = 0
+)(
+	// Global Control
+	input	logic  clk,
+	input	logic  rst,
+	input	logic  en,
+
+	// Input
+	input	logic  last,
+	input	logic  zero,	// ignore current inputs and force this partial product to zero
+	input	logic signed [PE-1:0][SIMD-1:0][3:0]  w,	// signed weights
+	input	logic                [SIMD-1:0][3:0]  a,	// unsigned activations
+
+	// Output
+	output	logic  vld,
+	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
+);
+
+	typedef int unsigned  leave_load_t[2*SIMD-1];
+	function leave_load_t init_leave_loads();
+		automatic leave_load_t  res;
+		for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;
+		for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];
+		return  res;
+	endfunction : init_leave_loads
+
+	// Pipeline for last indicator flag
+	logic [1:5] L = '0;
+	always_ff @(posedge clk) begin
+		if(rst)      L <= '0;
+		else if(en)  L <= { last, L[1:4] };
+	end
+	assign	vld = L[5];
+
+	// Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
+	localparam int unsigned  D[4:0] = '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; // Lane offsets
+
+	localparam int unsigned  PIPE_COUNT = (PE+3)/4;
+	for(genvar  c = 0; c < PIPE_COUNT; c++) begin : genPipes
+
+		localparam int unsigned  PE_BEG = 4*c;
+		localparam int unsigned  PE_END = PE < 4*(c+1)? PE : 4*(c+1);
+
+		uwire        [57:0]  p3[SIMD];
+		uwire signed [ 1:0]  h3[SIMD][3];
+		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
+
+			// Input Lane Assembly
+			uwire [23:0]  bb = a[s];
+			logic [33:0]  aa;
+			logic [26:0]  dd;
+			logic [ 1:0]  xx[3:1];
+			if(1) begin : blkVectorize
+				uwire [3:0]  ww[PE_END - PE_BEG];
+				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
+					assign	ww[pe] = w[PE_BEG + pe][s];
+					if(pe) begin
+//						assign  xx[pe] = zero? 0 : ww[pe] * a[s];
+						LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
+							.O6(xx[pe][1]),
+							.O5(xx[pe][0]),
+							.I5(1'b1),
+							.I4(zero),
+							.I3(ww[pe][1]),
+							.I2(a[s][1]),
+							.I1(ww[pe][0]),
+							.I0(a[s][0])
+						);
+					end
+				end
+				always_comb begin
+					dd = '0;
+					aa = '0;
+					for(int unsigned  pe = 0; pe < PE_END - PE_BEG; pe++) begin
+						dd[D[pe]+:3] = ww[pe];
+						aa[D[pe]+ 3] = ww[pe][3];
+					end
+				end
+			end : blkVectorize
+
+			uwire [57:0]  pp;
+
+			// Note: Since the product B * AD is computed,
+			//       rst can be only applied to AD and zero only to B
+			//       with the same effect as zeroing both.
+			if (FORCE_BEHAVIORAL) begin : genBehav
+				// Stage #1: Input Refine
+				logic signed [23:0]  B1  = 0;
+				always_ff @(posedge clk) begin
+					if(zero)     B1  <= 0;
+					else if(en)  B1  <= bb;
+				end
+
+				logic signed [26:0]  AD1 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      AD1 <= 0;
+					else if(en)  AD1 <= dd - aa;
+				end
+
+				// Stage #2: Multiply
+				logic signed [50:0]  M2 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      M2 <= 0;
+					else if(en)  M2 <=
+// synthesis translate off
+						(B1 === '0) || (AD1 === '0)? 0 :
+// synthesis translate on
+						B1 * AD1;
+				end
+
+				// Stage #3: Accumulate
+				logic signed [57:0]  P3 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      P3 <= 0;
+					else if(en)  P3 <= M2 + (L[3]? 0 : P3);
+				end
+
+				assign	pp = P3;
+			end : genBehav
+			else begin : genDSP
+				DSP48E2 #(
+					// Feature Control Attributes: Data Path Selection
+					.AMULTSEL("AD"),	// Selects A input to multiplier (A, AD)
+					.A_INPUT("DIRECT"),	// Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+					.BMULTSEL("B"),		// Selects B input to multiplier (AD, B)
+					.B_INPUT("DIRECT"),	// Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+					.PREADDINSEL("A"),                 // Selects input to pre-adder (A, B)
+					.RND('0),                          // Rounding Constant
+					.USE_MULT("MULTIPLY"),             // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+					.USE_SIMD("ONE48"),                // SIMD selection (FOUR12, ONE48, TWO24)
+					.USE_WIDEXOR("FALSE"),             // Use the Wide XOR function (FALSE, TRUE)
+					.XORSIMD("XOR24_48_96"),       // Mode of operation for the Wide XOR (XOR12, XOR24_48_96)
+
+					// Pattern Detector Attributes: Pattern Detection Configuration
+					.AUTORESET_PATDET("NO_RESET"),     // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+					.AUTORESET_PRIORITY("RESET"),      // Priority of AUTORESET vs. CEP (CEP, RESET).
+					.MASK('1),                         // 58-bit mask value for pattern detect (1=ignore)
+					.PATTERN('0),                      // 58-bit pattern match for pattern detect
+					.SEL_MASK("MASK"),                 // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+					.SEL_PATTERN("PATTERN"),           // Select pattern value (C, PATTERN)
+					.USE_PATTERN_DETECT("NO_PATDET"),  // Enable pattern detect (NO_PATDET, PATDET)
+
+					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+					.IS_ALUMODE_INVERTED('0),				// Optional inversion for ALUMODE
+					.IS_CARRYIN_INVERTED('0),				// Optional inversion for CARRYIN
+					.IS_CLK_INVERTED('0),					// Optional inversion for CLK
+					.IS_INMODE_INVERTED('0),				// Optional inversion for INMODE
+					.IS_OPMODE_INVERTED(9'b00_010_01_01),	// Optional inversion for OPMODE
+					.IS_RSTALLCARRYIN_INVERTED('0),			// Optional inversion for RSTALLCARRYIN
+					.IS_RSTALUMODE_INVERTED('0),			// Optional inversion for RSTALUMODE
+					.IS_RSTA_INVERTED('0),					// Optional inversion for RSTA
+					.IS_RSTB_INVERTED('0),					// Optional inversion for RSTB
+					.IS_RSTCTRL_INVERTED('0),				// Optional inversion for STCONJUGATE_A
+					.IS_RSTC_INVERTED('0),					// Optional inversion for RSTC
+					.IS_RSTD_INVERTED('0),					// Optional inversion for RSTD
+					.IS_RSTINMODE_INVERTED('0),				// Optional inversion for RSTINMODE
+					.IS_RSTM_INVERTED('0),					// Optional inversion for RSTM
+					.IS_RSTP_INVERTED('0),					// Optional inversion for RSTP
+
+					// Register Control Attributes: Pipeline Register Configuration
+					.ACASCREG(0),                      // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+					.ADREG(1),                         // Pipeline stages for pre-adder (0-1)
+					.ALUMODEREG(0),                    // Pipeline stages for ALUMODE (0-1)
+					.AREG(0),                          // Pipeline stages for A (0-2)
+					.BCASCREG(1),                      // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+					.BREG(1),                          // Pipeline stages for B (0-2)
+					.CARRYINREG(0),                    // Pipeline stages for CARRYIN (0-1)
+					.CARRYINSELREG(0),                 // Pipeline stages for CARRYINSEL (0-1)
+					.CREG(0),                          // Pipeline stages for C (0-1)
+					.DREG(0),                          // Pipeline stages for D (0-1)
+					.INMODEREG(0),                     // Pipeline stages for INMODE (0-1)
+					.MREG(1),                          // Multiplier pipeline stages (0-1)
+					.OPMODEREG(1),                     // Pipeline stages for OPMODE (0-1)
+					.PREG(1)                          // Number of pipeline stages for P (0-1)
+				) dsp (
+					// Cascade outputs: Cascade Ports
+					.ACOUT(),			// 34-bit output: A port cascade
+					.BCOUT(),			// 24-bit output: B cascade
+					.CARRYCASCOUT(),	// 1-bit output: Cascade carry
+					.MULTSIGNOUT(),		// 1-bit output: Multiplier sign cascade
+					.PCOUT(),			// 58-bit output: Cascade output
+
+					// Control outputs: Control Inputs/Status Bits
+					.OVERFLOW(),		// 1-bit output: Overflow in add/acc
+					.PATTERNBDETECT(),	// 1-bit output: Pattern bar detect
+					.PATTERNDETECT(),	// 1-bit output: Pattern detect
+					.UNDERFLOW(),		// 1-bit output: Underflow in add/acc
+
+					// Data outputs: Data Ports
+					.CARRYOUT(),		// 4-bit output: Carry
+					.P(pp),				// 58-bit output: Primary data
+					.XOROUT(),			// 8-bit output: XOR data
+
+					// Cascade inputs: Cascade Ports
+					.ACIN('x),			// 34-bit input: A cascade data
+					.BCIN('x),			// 24-bit input: B cascade
+					.CARRYCASCIN('x),	// 1-bit input: Cascade carry
+					.MULTSIGNIN('x),	// 1-bit input: Multiplier sign cascade
+					.PCIN('x),			// 58-bit input: P cascade
+
+					// Control inputs: Control Inputs/Status Bits
+					.CLK(clk),					// 1-bit input: Clock
+					.ALUMODE(4'h0),				// 4-bit input: ALU control
+					.CARRYINSEL('0),			// 3-bit input: Carry select
+					.INMODE(5'b01100),			// 5-bit input: INMODE control
+					.OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }),	// 9-bit input: Operation mode
+
+					// Data inputs: Data Ports
+					.A(aa),						// 34-bit input: A data
+					.B(bb),						// 24-bit input: B data
+					.C('x),						// 58-bit input: C data
+					.CARRYIN('0),				// 1-bit input: Carry-in
+					.D(dd),						// 27-bit input: D data
+
+					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+					.CEA1('0),			// 1-bit input: Clock enable for 1st stage AREG
+					.CEA2('0),			// 1-bit input: Clock enable for 2nd stage AREG
+					.CEAD(en),			// 1-bit input: Clock enable for ADREG
+					.CEALUMODE('0),		// 1-bit input: Clock enable for ALUMODE
+					.CEB1('0),			// 1-bit input: Clock enable for 1st stage BREG
+					.CEB2(en),			// 1-bit input: Clock enable for 2nd stage BREG
+					.CEC('0),			// 1-bit input: Clock enable for CREG
+					.CECARRYIN('0),		// 1-bit input: Clock enable for CARRYINREG
+					.CECTRL(en),		// 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+					.CED('0),			// 1-bit input: Clock enable for DREG
+					.CEINMODE('0),		// 1-bit input: Clock enable for INMODEREG
+					.CEM(en),			// 1-bit input: Clock enable for MREG
+					.CEP(en),			// 1-bit input: Clock enable for PREG
+					.RSTA('0),			// 1-bit input: Reset for AREG
+					.RSTB(				// 1-bit input: Reset for BREG
+// synthesis translate_off
+						rst ||
+// synthesis translate_on
+						zero
+					),
+					.RSTC('0),			// 1-bit input: Reset for CREG
+					.RSTD(				// 1-bit input: Reset for DREG and ADREG
+// synthesis translate_off
+						zero ||
+// synthesis translate_on
+						rst
+					),
+					.RSTALLCARRYIN('0),	// 1-bit input: Reset for CARRYINREG
+					.RSTALUMODE('0),	// 1-bit input: Reset for ALUMODEREG
+					.RSTCTRL('0),		// 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+					.RSTINMODE('0),		// 1-bit input: Reset for INMODE register
+					.RSTM(rst),			// 1-bit input: Reset for MREG
+					.RSTP(rst)			// 1-bit input: Reset for PREG
+				);
+			end : genDSP
+
+			// External Canary Pipeline
+			logic [1:0]  X1[3:1] = '{ default: 0 };
+			logic [1:0]  X2[3:1] = '{ default: 0 };
+			logic [1:0]  X3[3:1] = '{ default: 0 };
+			always_ff @(posedge clk) begin
+				if(rst) begin
+					X1 <= '{ default: 0 };
+					X2 <= '{ default: 0 };
+					X3 <= '{ default: 0 };
+				end
+				else if(en) begin
+					X1 <= xx;
+					X2 <= X1;
+					foreach(X3[i]) begin
+						X3[i] <= X2[i] + (L[3]? 2'h0 : pp[D[i]+:2]);
+					end
+				end
+			end
+
+			// Derive actual cross-lane overflows
+			for(genvar  i = 0; i < 3; i++) begin
+				assign	h3[s][i] = pp[D[i+1]+:2] - X3[i+1];
+			end
+			assign	p3[s] = pp;
+
+		end : genSIMD
+
+		// Stage #4: Cross-SIMD Reduction
+
+		// Count leaves reachable from each node
+		localparam leave_load_t  LEAVE_LOAD = init_leave_loads();
+
+		uwire signed [ACCU_WIDTH  -1:0]  up4;
+		uwire signed [ACCU_WIDTH  -8:0]  hi4[3];
+		uwire        [$clog2(SIMD)+7:0]  lo4[3];
+		for(genvar  i = 0; i < 4; i++) begin
+			localparam int unsigned  LO_WIDTH = D[i+1] - D[i];
+			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - LO_WIDTH;
+
+			// Conclusive high part accumulation
+			if(i < 3) begin : genHi
+				// Adder Tree across all SIMD high contributions, each from [-1:1]
+				uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
+				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s][i];
+				for(genvar  n = 0; n < SIMD-1; n++) begin
+					// Sum truncated to actual maximum bit width at this node
+					uwire signed [$clog2(1+LEAVE_LOAD[n]):0]  s = tree[2*n+1] + tree[2*n+2];
+					assign  tree[n] = s;
+				end
+
+				// High Sideband Accumulation
+				logic signed [HI_WIDTH-1:0]  Hi4 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      Hi4 <= 0;
+					else if(en)  Hi4 <= (L[4]? 0 : Hi4) + tree[0];
+				end
+				assign	hi4[i] = Hi4;
+			end : genHi
+
+			// Conclusive low part accumulation
+			if(1) begin : blkLo
+				// Adder Tree across all SIMD low contributions
+				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
+				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
+				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
+				for(genvar  n = 0; n < SIMD-1; n++) begin
+					// Sum truncated to actual maximum bit width at this node
+					localparam int unsigned  NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1));
+					uwire [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
+					assign  tree[n] = s;
+				end
+
+				logic [ROOT_WIDTH-1:0]  Lo4 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      Lo4 <= 0;
+					else if(en)  Lo4 <= tree[0];
+				end
+
+				if(i == 3)  assign  up4 = Lo4;
+				else  assign  lo4[i] = Lo4;
+			end : blkLo
+
+		end
+
+		// Stage #5: Resolve lane totals
+		logic signed [3:0][ACCU_WIDTH-1:0]  Res5 = '{ default: 0 };
+		always_ff @(posedge clk) begin
+			if(rst)  Res5 <= '{ default: 0 };
+			else if(en) begin
+				Res5[3] <= up4 - hi4[2];
+				Res5[2] <= $signed({ hi4[2], {(D[3] - D[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1];
+				Res5[1] <= $signed({ hi4[1], {(D[2] - D[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0];
+				Res5[0] <= $signed({ hi4[0], {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] });
+			end
+		end
+
+		// Output
+		for(genvar  pe = PE_BEG; pe < PE_END; pe++) begin
+			assign	p[pe] = Res5[pe - PE_BEG];
+		end
+
+	end : genPipes
+
+endmodule : mvu_4sx4u
\ No newline at end of file

From 8eefb535c3da6482f95465df05b8d3e1c610be21 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:33:31 +0100
Subject: [PATCH 121/235] [rtl mvu]: specialized compute core for > 4-bit
 weights and activations for DSP48

---
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 358 +++++++++++++++++++++++++++++
 1 file changed, 358 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv

diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
new file mode 100644
index 0000000000..e06a92c8fa
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -0,0 +1,358 @@
+module mvu_8sx8u_dsp48 #(
+	int unsigned  PE,
+	int unsigned  SIMD,
+	int unsigned  ACCU_WIDTH,
+	int unsigned  ACTIVATION_WIDTH,
+	int unsigned  WEIGHT_WIDTH,
+	bit FORCE_BEHAVIORAL = 0,
+
+	localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH
+)(
+	// Global Control
+	input	logic  clk,
+	input	logic  rst,
+	input	logic  en,
+
+	// Input
+	input	logic  last,
+	input	logic  zero,	// ignore current inputs and force this partial product to zero
+	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]  w,	// signed weights
+	input	logic                [SIMD-1:0][ACTIVATION_WIDTH-1:0]  a,	// unsigned activations
+
+	// Output
+	output	logic  vld,
+	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
+);
+
+	typedef int unsigned  leave_load_t[2*SIMD-1];
+	function leave_load_t init_leave_loads();	// number of leaves reachable from each node of the heap-indexed tree
+		automatic leave_load_t  load;
+		for(int  n = 2*(SIMD-1); n >= 0; n--)	// bottom-up: indices SIMD-1 .. 2*SIMD-2 are leaves, children of n are 2n+1 and 2n+2
+			load[n] = (n >= int'(SIMD)-1)? 1 : load[2*n+1] + load[2*n+2];
+		return  load;
+	endfunction : init_leave_loads
+
+	// Pipeline for last indicator flag
+	logic [1:5] L = '0;
+	always_ff @(posedge clk) begin
+		if(rst)      L <= '0;
+		else if(en)  L <= { last, L[1:4] };
+	end
+	assign	vld = L[5];
+
+	// Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
+    localparam int unsigned  D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets
+
+	localparam int unsigned  PIPE_COUNT = (PE+1)/2;
+	for(genvar  c = 0; c < PIPE_COUNT; c++) begin : genPipes
+
+		localparam int unsigned  PE_BEG = 2*c;
+		localparam int unsigned  PE_END = PE < 2*(c+1)? PE : 2*(c+1);
+
+		uwire        [57:0]  p3[SIMD];
+		uwire signed [ 1:0]  h3[SIMD];
+		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
+
+			// Input Lane Assembly
+			uwire [23:0]  bb = a[s];
+			logic [33:0]  aa;
+			logic [26:0]  dd;
+			logic [ 1:0]  xx;
+			if(1) begin : blkVectorize
+				uwire [WEIGHT_WIDTH-1:0]  ww[PE_END - PE_BEG];	// weights of the lanes populated in this pipe
+				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
+					assign	ww[pe] = w[PE_BEG + pe][s];
+					if(pe) begin	// canary for the upper lane: xx = zero? 0 : two LSBs of ww[pe] * a[s]
+						LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
+							.O6(xx[1]),
+							.O5(xx[0]),
+							.I5(1'b1),
+							.I4(zero),
+							.I3(ww[pe][1]),
+							.I2(a[s][1]),
+							.I1(ww[pe][0]),
+							.I0(a[s][0])
+						);
+					end
+				end
+				if(PE_END - PE_BEG < 2)  assign  xx = '0;	// single-lane pipe: no LUT is generated, so tie the canary off rather than leave it undriven (X in simulation)
+				always_comb begin
+					dd = '0;
+					aa = '0;
+					for(int unsigned  pe = 0; pe < PE_END - PE_BEG; pe++) begin
+						dd[D[pe] +: WEIGHT_WIDTH-1] = ww[pe];	// low WEIGHT_WIDTH-1 weight bits at the lane offset (upper bit dropped by the part-select width)
+						aa[D[pe] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];	// weight sign bit at its own bit position
+					end
+				end
+			end : blkVectorize
+
+			uwire [57:0]  pp;
+
+			// Note: Since the product B * AD is computed,
+			//       rst can be only applied to AD and zero only to B
+			//       with the same effect as zeroing both.
+			if (FORCE_BEHAVIORAL) begin : genBehav
+				// Stage #1: Input Refine
+				logic signed [23:0]  B1  = 0;	// registered activation operand
+				always_ff @(posedge clk) begin
+					if(zero)     B1  <= 0;	// zero clears B1 regardless of en
+					else if(en)  B1  <= bb;
+				end
+
+				logic signed [26:0]  AD1 = 0;	// registered pre-adder result dd - aa
+				always_ff @(posedge clk) begin
+					if(rst)      AD1 <= 0;
+					else if(en)  AD1 <= dd - aa;
+				end
+
+				// Stage #2: Multiply
+				logic signed [50:0]  M2 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      M2 <= 0;
+					else if(en)  M2 <=
+// synthesis translate_off
+						(B1 === '0) || (AD1 === '0)? 0 :	// simulation only: an all-zero operand yields a zero product even if the other operand holds X
+// synthesis translate_on
+						B1 * AD1;
+				end
+
+				// Stage #3: Accumulate
+				logic signed [57:0]  P3 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      P3 <= 0;
+					else if(en)  P3 <= M2 + (L[3]? 0 : P3);	// L[3] set: start a new accumulation from M2 alone
+				end
+
+				assign	pp = P3;
+			end : genBehav
+			else begin : genDSP
+				DSP48E2 #(
+					// Feature Control Attributes: Data Path Selection
+					.AMULTSEL("AD"),	// Selects A input to multiplier (A, AD)
+					.A_INPUT("DIRECT"),	// Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+					.BMULTSEL("B"),		// Selects B input to multiplier (AD, B)
+					.B_INPUT("DIRECT"),	// Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+					.PREADDINSEL("A"),                 // Selects input to pre-adder (A, B)
+					.RND('0),                          // Rounding Constant
+					.USE_MULT("MULTIPLY"),             // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+					.USE_SIMD("ONE48"),                // SIMD selection (FOUR12, ONE48, TWO24)
+					.USE_WIDEXOR("FALSE"),             // Use the Wide XOR function (FALSE, TRUE)
+					.XORSIMD("XOR24_48_96"),       // Mode of operation for the Wide XOR (XOR12, XOR24_48_96)
+
+					// Pattern Detector Attributes: Pattern Detection Configuration
+					.AUTORESET_PATDET("NO_RESET"),     // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+					.AUTORESET_PRIORITY("RESET"),      // Priority of AUTORESET vs. CEP (CEP, RESET).
+					.MASK('1),                         // 48-bit mask value for pattern detect (1=ignore)
+					.PATTERN('0),                      // 48-bit pattern match for pattern detect
+					.SEL_MASK("MASK"),                 // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+					.SEL_PATTERN("PATTERN"),           // Select pattern value (C, PATTERN)
+					.USE_PATTERN_DETECT("NO_PATDET"),  // Enable pattern detect (NO_PATDET, PATDET)
+
+					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+					.IS_ALUMODE_INVERTED('0),				// Optional inversion for ALUMODE
+					.IS_CARRYIN_INVERTED('0),				// Optional inversion for CARRYIN
+					.IS_CLK_INVERTED('0),					// Optional inversion for CLK
+					.IS_INMODE_INVERTED('0),				// Optional inversion for INMODE
+					.IS_OPMODE_INVERTED(9'b00_010_01_01),	// Optional inversion for OPMODE
+					.IS_RSTALLCARRYIN_INVERTED('0),			// Optional inversion for RSTALLCARRYIN
+					.IS_RSTALUMODE_INVERTED('0),			// Optional inversion for RSTALUMODE
+					.IS_RSTA_INVERTED('0),					// Optional inversion for RSTA
+					.IS_RSTB_INVERTED('0),					// Optional inversion for RSTB
+					.IS_RSTCTRL_INVERTED('0),				// Optional inversion for RSTCTRL
+					.IS_RSTC_INVERTED('0),					// Optional inversion for RSTC
+					.IS_RSTD_INVERTED('0),					// Optional inversion for RSTD
+					.IS_RSTINMODE_INVERTED('0),				// Optional inversion for RSTINMODE
+					.IS_RSTM_INVERTED('0),					// Optional inversion for RSTM
+					.IS_RSTP_INVERTED('0),					// Optional inversion for RSTP
+
+					// Register Control Attributes: Pipeline Register Configuration
+					.ACASCREG(0),                      // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+					.ADREG(1),                         // Pipeline stages for pre-adder (0-1)
+					.ALUMODEREG(0),                    // Pipeline stages for ALUMODE (0-1)
+					.AREG(0),                          // Pipeline stages for A (0-2)
+					.BCASCREG(1),                      // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+					.BREG(1),                          // Pipeline stages for B (0-2)
+					.CARRYINREG(0),                    // Pipeline stages for CARRYIN (0-1)
+					.CARRYINSELREG(0),                 // Pipeline stages for CARRYINSEL (0-1)
+					.CREG(0),                          // Pipeline stages for C (0-1)
+					.DREG(0),                          // Pipeline stages for D (0-1)
+					.INMODEREG(0),                     // Pipeline stages for INMODE (0-1)
+					.MREG(1),                          // Multiplier pipeline stages (0-1)
+					.OPMODEREG(1),                     // Pipeline stages for OPMODE (0-1)
+					.PREG(1)                          // Number of pipeline stages for P (0-1)
+				) dsp (
+					// Cascade outputs: Cascade Ports
+					.ACOUT(),			// 30-bit output: A port cascade
+					.BCOUT(),			// 18-bit output: B cascade
+					.CARRYCASCOUT(),	// 1-bit output: Cascade carry
+					.MULTSIGNOUT(),		// 1-bit output: Multiplier sign cascade
+					.PCOUT(),			// 48-bit output: Cascade output
+
+					// Control outputs: Control Inputs/Status Bits
+					.OVERFLOW(),		// 1-bit output: Overflow in add/acc
+					.PATTERNBDETECT(),	// 1-bit output: Pattern bar detect
+					.PATTERNDETECT(),	// 1-bit output: Pattern detect
+					.UNDERFLOW(),		// 1-bit output: Underflow in add/acc
+
+					// Data outputs: Data Ports
+					.CARRYOUT(),		// 4-bit output: Carry
+					.P(pp),				// 48-bit output: Primary data; NOTE(review): pp is declared 58 bits wide - confirm upper bits are unused
+					.XOROUT(),			// 8-bit output: XOR data
+
+					// Cascade inputs: Cascade Ports
+					.ACIN('x),			// 30-bit input: A cascade data
+					.BCIN('x),			// 18-bit input: B cascade
+					.CARRYCASCIN('x),	// 1-bit input: Cascade carry
+					.MULTSIGNIN('x),	// 1-bit input: Multiplier sign cascade
+					.PCIN('x),			// 48-bit input: P cascade
+
+					// Control inputs: Control Inputs/Status Bits
+					.CLK(clk),					// 1-bit input: Clock
+					.ALUMODE(4'h0),				// 4-bit input: ALU control
+					.CARRYINSEL('0),			// 3-bit input: Carry select
+					.INMODE(5'b01100),			// 5-bit input: INMODE control
+					.OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }),	// 9-bit input: Operation mode; only bit 5 varies (L[2]), all other bits come from IS_OPMODE_INVERTED
+
+					// Data inputs: Data Ports
+					.A(aa),						// 30-bit input: A data; NOTE(review): aa is declared 34 bits wide - confirm
+					.B(bb),						// 18-bit input: B data; NOTE(review): bb is declared 24 bits wide - confirm
+					.C('x),						// 48-bit input: C data
+					.CARRYIN('0),				// 1-bit input: Carry-in
+					.D(dd),						// 27-bit input: D data
+
+					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+					.CEA1('0),			// 1-bit input: Clock enable for 1st stage AREG
+					.CEA2('0),			// 1-bit input: Clock enable for 2nd stage AREG
+					.CEAD(en),			// 1-bit input: Clock enable for ADREG
+					.CEALUMODE('0),		// 1-bit input: Clock enable for ALUMODE
+					.CEB1('0),			// 1-bit input: Clock enable for 1st stage BREG
+					.CEB2(en),			// 1-bit input: Clock enable for 2nd stage BREG
+					.CEC('0),			// 1-bit input: Clock enable for CREG
+					.CECARRYIN('0),		// 1-bit input: Clock enable for CARRYINREG
+					.CECTRL(en),		// 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+					.CED('0),			// 1-bit input: Clock enable for DREG
+					.CEINMODE('0),		// 1-bit input: Clock enable for INMODEREG
+					.CEM(en),			// 1-bit input: Clock enable for MREG
+					.CEP(en),			// 1-bit input: Clock enable for PREG
+					.RSTA('0),			// 1-bit input: Reset for AREG
+					.RSTB(				// 1-bit input: Reset for BREG
+// synthesis translate_off
+						rst ||
+// synthesis translate_on
+						zero
+					),
+					.RSTC('0),			// 1-bit input: Reset for CREG
+					.RSTD(				// 1-bit input: Reset for DREG and ADREG
+// synthesis translate_off
+						zero ||
+// synthesis translate_on
+						rst
+					),
+					.RSTALLCARRYIN('0),	// 1-bit input: Reset for CARRYINREG
+					.RSTALUMODE('0),	// 1-bit input: Reset for ALUMODEREG
+					.RSTCTRL('0),		// 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+					.RSTINMODE('0),		// 1-bit input: Reset for INMODE register
+					.RSTM(rst),			// 1-bit input: Reset for MREG
+					.RSTP(rst)			// 1-bit input: Reset for PREG
+				);
+			end : genDSP
+
+			// External Canary Pipeline
+			logic [1:0]  X1 = '{ default: 0 };
+			logic [1:0]  X2 = '{ default: 0 };
+			logic [1:0]  X3 = '{ default: 0 };
+			always_ff @(posedge clk) begin
+				if(rst) begin
+					X1 <= '{ default: 0 };
+					X2 <= '{ default: 0 };
+					X3 <= '{ default: 0 };
+				end
+				else if(en) begin
+					X1 <= xx;
+					X2 <= X1;
+					X3 <= X2 + (L[3]? 2'h0 : pp[D[1]+:2]);
+				end
+			end
+
+			// Derive actual cross-lane overflows
+			assign  h3[s] = pp[D[1]+:2] - X3;
+
+			assign	p3[s] = pp;
+
+		end : genSIMD
+
+		// Stage #4: Cross-SIMD Reduction
+
+		// Count leaves reachable from each node
+		localparam leave_load_t  LEAVE_LOAD = init_leave_loads();
+
+		uwire signed [ACCU_WIDTH  -1:0]  up4;
+		uwire signed [ACCU_WIDTH  -SINGLE_PROD_WIDTH:0]  hi4;
+		uwire        [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0]  lo4;
+		for(genvar  i = 0; i < 2; i++) begin
+			localparam int unsigned  LO_WIDTH = D[i+1] - D[i];
+			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - LO_WIDTH;
+
+			// Conclusive high part accumulation
+			if(i == 0) begin : genHi
+				// Adder Tree across all SIMD high contributions, each from [-1:1]
+				uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
+				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s];
+				for(genvar  n = 0; n < SIMD-1; n++) begin
+					// Sum truncated to actual maximum bit width at this node
+					uwire signed [$clog2(1+LEAVE_LOAD[n]):0]  s = tree[2*n+1] + tree[2*n+2];
+					assign  tree[n] = s;
+				end
+
+				// High Sideband Accumulation
+				logic signed [HI_WIDTH-1:0]  Hi4 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      Hi4 <= 0;
+					else if(en)  Hi4 <= (L[4]? 0 : Hi4) + tree[0];
+				end
+				assign	hi4 = Hi4;
+			end : genHi
+
+			// Conclusive low part accumulation
+			if(1) begin : blkLo
+				// Adder Tree across all SIMD low contributions
+				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
+				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
+				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
+				for(genvar  n = 0; n < SIMD-1; n++) begin
+					// Sum truncated to actual maximum bit width at this node
+					localparam int unsigned  NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1));
+					uwire [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
+					assign  tree[n] = s;
+				end
+
+				logic [ROOT_WIDTH-1:0]  Lo4 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      Lo4 <= 0;
+					else if(en)  Lo4 <= tree[0];
+				end
+
+				if(i == 1)  assign  up4 = Lo4;
+				else  assign  lo4 = Lo4;
+			end : blkLo
+
+		end
+
+		// Stage #5: Resolve lane totals
+		logic signed [1:0][ACCU_WIDTH-1:0]  Res5 = '{ default: 0 };
+		always_ff @(posedge clk) begin
+			if(rst)  Res5 <= '{ default: 0 };
+			else if(en) begin
+				Res5[1] <= up4 - hi4;
+				Res5[0] <= $signed({ hi4, {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4 });
+			end
+		end
+
+		// Output
+		for(genvar  pe = PE_BEG; pe < PE_END; pe++) begin
+			assign	p[pe] = Res5[pe - PE_BEG];
+		end
+
+	end : genPipes
+
+endmodule : mvu_8sx8u_dsp48
\ No newline at end of file

From e7109e75161774280b24e5884f6c9b9c17a07f7b Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:34:23 +0100
Subject: [PATCH 122/235] [fpgadataflow transform]: initial
 specialize_to_rtl_layers-transform for MVU

---
 .../fpgadataflow/specialize_to_rtl_layers.py  | 105 ++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py

diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
new file mode 100644
index 0000000000..7d677ec216
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2023, AMD
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from qonnx.transformation.base import Transformation
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.core.datatype import DataType
+from onnx import helper
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth
+
+class InferRTLMatrixVectorActivation(Transformation):
+    """Convert (HLS-based) MatrixVectorActivation layers to specialized RTL layers if supported."""
+
+    def __init__(self):
+        super().__init__()
+
+    def _is_rtl_variant_compatible(self, n):
+        no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1
+        act_width_in_range = (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8) or (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0)
+        weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8
+        folding_supported = (getCustomOp(n).get_nodeattr("MH") % getCustomOp(n).get_nodeattr("PE") == 0) and (getCustomOp(n).get_nodeattr("MW") % getCustomOp(n).get_nodeattr("SIMD") == 0)
+
+        if (no_activation and act_width_in_range and weight_width_in_range and folding_supported):
+            return True
+        else:
+            return False
+
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "MatrixVectorActivation":
+                preferred_in_rtl = getCustomOp(n).get_nodeattr("impl") == "rtl" and getCustomOp(n).get_nodeattr("resType") == "dsp"
+                supported_in_rtl = self._is_rtl_variant_compatible(n)
+                if (preferred_in_rtl and supported_in_rtl):
+                    mvau_input = n.input[0]
+                    mvau_weight = n.input[1]
+                    mvau_output = n.output[0]
+                    inputDataType = getCustomOp(n).get_nodeattr("inputDataType")
+                    weightDataType = getCustomOp(n).get_nodeattr("weightDataType")
+                    outputDataType = getCustomOp(n).get_nodeattr("outputDataType")
+                    numInputVectors = getCustomOp(n).get_nodeattr("numInputVectors")
+                    mw = getCustomOp(n).get_nodeattr("MW")
+                    mh = getCustomOp(n).get_nodeattr("MH")
+                    simd = getCustomOp(n).get_nodeattr("SIMD")
+                    pe = getCustomOp(n).get_nodeattr("PE")
+                    mem_mode = getCustomOp(n).get_nodeattr("mem_mode")
+
+                    new_node = helper.make_node(
+                        "MatrixVectorActivation_rtl",
+                        [mvau_input, mvau_weight],
+                        [mvau_output],
+                        domain="finn.custom_op.fpgadataflow",
+                        backend="fpgadataflow",
+                        MW=mw,
+                        MH=mh,
+                        SIMD=simd,
+                        PE=pe,
+                        inputDataType=inputDataType,
+                        weightDataType=weightDataType,
+                        outputDataType=outputDataType,
+                        numInputVectors=numInputVectors,
+                        mem_mode=mem_mode,
+                        name=n.name + "_rtl",
+                    )
+                    graph.node.insert(node_ind, new_node)
+                    # remove old node
+                    graph.node.remove(n)
+                    graph_modified=True
+        
+        if graph_modified:
+            model = model.transform(MinimizeAccumulatorWidth())
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        
+        return (model, graph_modified)
\ No newline at end of file

From 5a868d19e5955abdb894bf1e8b93d2d1f6f8410d Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu <maltanar@gmail.com>
Date: Tue, 9 May 2023 09:41:15 +0200
Subject: [PATCH 123/235] [rtl mvu] fixes for latest memstream + linting

---
 .../matrixvectoractivation_rtl.py             | 136 ++++++++++--------
 1 file changed, 77 insertions(+), 59 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index 6b1c2f3be7..8fd261d395 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -27,7 +27,6 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import math
-from shutil import copy
 import numpy as np
 import os
 import textwrap
@@ -40,20 +39,18 @@
 )
 
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
-    numpy_to_hls_code,
     pack_innermost_dim_as_hex_string,
     rtlsim_output_to_npy,
 )
-from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
 
 try:
     from pyverilator import PyVerilator
 except ModuleNotFoundError:
     PyVerilator = None
 
-from . import templates
 
 # ONNX i/o tensor shape assumptions for MatrixVectorActivation:
 # input 0 is the input tensor, shape (.., i_size) = (..., MW)
@@ -69,7 +66,6 @@ class MatrixVectorActivation_rtl(HLSCustomOp):
 
     def __init__(self, onnx_node, **kwargs):
         super().__init__(onnx_node, **kwargs)
-        self.decoupled_wrapper = templates.decoupled_wrapper
 
     def get_nodeattr_types(self):
         my_attrs = {
@@ -186,17 +182,24 @@ def verify_node(self):
             )
 
         num_of_inputs = len(self.onnx_node.input)
-        if num_of_inputs!=2:
-            info_messages.append("RTL-based MatrixVectorActivation expects two inputs (weights and activation), but got {} inputs.".format(len(self.onnx_node.input)))
+        if num_of_inputs != 2:
+            info_messages.append(
+                "RTL-based MatrixVectorActivation expects two inputs "
+                "(weights and activation), but got {} inputs.".format(
+                    len(self.onnx_node.input)
+                )
+            )
 
         mem_mode = self.get_nodeattr("mem_mode")
 
         if mem_mode != "decoupled":
-            info_messages.append("RTL-based MVAU supports only decoupled weights currently")
+            info_messages.append(
+                "RTL-based MVAU supports only decoupled weights currently"
+            )
 
         return info_messages
 
-# TODO: Add in replay_buffer estimation
+    # TODO: Add in replay_buffer estimation
     def uram_estimation(self):
         P = self.get_nodeattr("PE")
         Q = self.get_nodeattr("SIMD")
@@ -218,7 +221,7 @@ def uram_estimation(self):
         depth_multiplier = math.ceil(omega / 4096)
         return width_multiplier * depth_multiplier
 
-# TODO: Add in replay_buffer estimation
+    # TODO: Add in replay_buffer estimation
     def bram_estimation(self):
         """Calculates resource estimation for BRAM based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -259,7 +262,7 @@ def bram_estimation(self):
         else:
             return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36))
 
-# TODO: Add in replay_buffer estimation
+    # TODO: Add in replay_buffer estimation
     def bram_efficiency_estimation(self):
         wdt = self.get_weight_datatype()
         W = wdt.bitwidth()
@@ -272,7 +275,7 @@ def bram_efficiency_estimation(self):
         bram16_est_capacity = bram16_est * 36 * 512
         return wbits / bram16_est_capacity
 
-# TODO: Add in replay_buffer estimation
+    # TODO: Add in replay_buffer estimation
     def uram_efficiency_estimation(self):
         """Function for URAM efficiency estimation: actual parameter storage
         needed divided by the allocated URAM storage (from estimation)"""
@@ -287,7 +290,7 @@ def uram_efficiency_estimation(self):
         uram_est_capacity = uram_est * 72 * 4096
         return wbits / uram_est_capacity
 
-#TODO: FIX: worst case estimates since segmentlen is not known at this point?
+    # TODO: FIX: worst case estimates since segmentlen is not known at this point?
     def lut_estimation(self):
         """Calculates resource estimations for LUTs based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -328,13 +331,9 @@ def lut_estimation(self):
         acc_bits = W + A + np.ceil(math.log(MW, 2))
         acc_luts = acc_bits
 
-        return int(
-            c0
-            + c1 * (P * (mult_luts + addertree_luts + acc_luts))
-            + c2
-        )
+        return int(c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2)
 
-#TODO: FIX: worst case estimates since segmentlen is not known at this point?
+    # TODO: FIX: worst case estimates since segmentlen is not known at this point?
     def dsp_estimation(self):
         # multiplication
         P = self.get_nodeattr("PE")
@@ -350,7 +349,7 @@ def dsp_estimation(self):
             mult_dsp = 0
         return int(mult_dsp)
 
-#TODO: FIX: worst case estimates since segmentlen is not known at this point
+    # TODO: FIX: worst case estimates since segmentlen is not known at this point
     def get_exp_cycles(self):
         pe = self.get_nodeattr("PE")
         simd = self.get_nodeattr("SIMD")
@@ -359,7 +358,9 @@ def get_exp_cycles(self):
         mw = self.get_nodeattr("MW")
         # since mmv != 1 is not supported yet, we set mmv for now to 1
         mmv = 1
-        # Actual exp_cycles is probably slightly larger (say 3 cycles (DSP A/B, M, P - reg) + additional pipeline buffer cycles. Most probably <10)
+        # Actual exp_cycles is probably slightly larger (say 3 cycles
+        # (DSP A/B, M, P - reg) + additional pipeline buffer cycles.
+        # Most probably <10)
         exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv
         return int(exp_cycles)
 
@@ -384,7 +385,9 @@ def get_output_datatype(self, ind=0):
 
     def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
-        assert (i_bits<=9), "RTL-based MVAU only supports activations with bit-width up to 9-bits"
+        assert (
+            i_bits <= 9
+        ), "RTL-based MVAU only supports activations with bit-width up to 9-bits"
         in_width = i_bits * self.get_nodeattr("SIMD")
         return in_width
 
@@ -402,7 +405,9 @@ def get_weightstream_width(self):
             pe = self.get_nodeattr("PE")
             simd = self.get_nodeattr("SIMD")
             wp = self.get_weight_datatype().bitwidth()
-            assert (wp <= 8), "RTL-based MVAU only supports weights with bit-width up to 8-bits"
+            assert (
+                wp <= 8
+            ), "RTL-based MVAU only supports weights with bit-width up to 8-bits"
             w_width = pe * simd * wp
             return w_width
         else:
@@ -516,7 +521,8 @@ def minimize_accumulator_width(self, model):
         else:
             adt = DataType.get_smallest_possible(acc_max)
         # Note: we are interested in simply the width of the output dot product.
-        # Padding the actual output stream to a multiple of 8-bits is done in the RTL component
+        # Padding the actual output stream to a multiple of 8-bits is done in
+        # the RTL component
         self.set_nodeattr("accDataType", adt.name)
         # for no-activation nodes, output dt = acc dt
         self.set_nodeattr("outputDataType", adt.name)
@@ -615,9 +621,7 @@ def generate_params(self, model, path):
             # and one for synthesis. this is because URAM-based weights always
             # need zero weights for synthesis, otherwise they get inferred
             # as BRAM
-            weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(
-                code_gen_dir
-            )
+            weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir)
             weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir)
             # sim weights are always the true weights
             self.make_weight_file(
@@ -734,11 +738,11 @@ def code_generation_ipgen(self, model, fpgapart, clk):
 
     def ipgen_singlenode_code(self):
         """Normally: Builds the bash script for IP generation."""
-        pass   
+        pass
 
     def code_generation_cppsim(self, model):
         """Normally: Generates C++ code for simulation (cppsim)."""
-        pass     
+        pass
 
     def compile_singlenode_code(self):
         pass
@@ -803,19 +807,28 @@ def code_generation_ipi(self):
             code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
             rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
             sourcefiles = [
-                os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"),
+                os.path.join(
+                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+                ),
                 rtllib_dir + "mvu_axi.sv",
                 rtllib_dir + "replay_buffer.sv",
                 rtllib_dir + "mvu_4sx4u.sv",
                 rtllib_dir + "mvu_8sx9.sv",
-                rtllib_dir + "mvu_8sx8u_dsp48.sv"
+                rtllib_dir + "mvu_8sx8u_dsp48.sv",
             ]
             for f in sourcefiles:
                 cmd.append("add_files -norecurse %s" % (f))
-            cmd.append("create_bd_cell -type hier -reference %s /%s/%s" % (self.get_nodeattr("gen_top_module"), self.onnx_node.name, self.onnx_node.name))
+            cmd.append(
+                "create_bd_cell -type hier -reference %s /%s/%s"
+                % (
+                    self.get_nodeattr("gen_top_module"),
+                    self.onnx_node.name,
+                    self.onnx_node.name,
+                )
+            )
 
             # instantiate a streamer and connect it to the HLS IP
-            strm_vlnv = "xilinx.com:user:memstream:1.0"
+            strm_vlnv = "amd.com:FINN:memstream:1.0"
             strm_inst = node_name + "_wstrm"
             cmd.append(
                 "create_bd_cell -type ip -vlnv %s /%s/%s"
@@ -849,11 +862,11 @@ def code_generation_ipi(self):
                 % (node_name, strm_inst, node_name, node_name, sname)
             )
             cmd.append(
-                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]"
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]"
                 % (node_name, rst_name, node_name, strm_inst)
             )
             cmd.append(
-                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]"
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]"
                 % (node_name, clk_name, node_name, strm_inst)
             )
             cmd.append(
@@ -947,21 +960,25 @@ def derive_characteristic_fxns(self, period):
             ]
         super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
 
-# TODO: characterize max_clk and implement this function in look-up style
+    # TODO: characterize max_clk and implement this function in look-up style
     def _resolve_segment_len(self, clk):
         # Insert pipeline registers in the DSP chain to meet target clock frequency
         segmentlen = 0
         return segmentlen
 
     def _resolve_impl_style(self, fpgapart):
-        # Based on target device and activation/weight-width, choose the supported RTL module
+        # Based on target device and activation/weight-width, choose the
+        # supported RTL module
         act_width = self.get_input_datatype(0).bitwidth()
         weight_width = self.get_input_datatype(1).bitwidth()
-        is_versal = fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpgapart[0:5] == "xqrvc"
-        if (act_width == 4 and weight_width == 4):
+        is_versal = (
+            fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
+            or fpgapart[0:5] == "xqrvc"
+        )
+        if act_width == 4 and weight_width == 4:
             return "mvu_4sx4u"
         else:
-            if (is_versal):
+            if is_versal:
                 return "mvu_8sx9_dsp58"
             else:
                 return "mvu_8sx8u_dsp48"
@@ -973,13 +990,17 @@ def generate_hdl(self, model, fpgapart, clk):
 
         template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk)
         # add general parameters to dictionary
-        code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()]
+        code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [
+            self.get_verilog_top_module_name()
+        ]
         # save top module name so we can refer to it after this node has been renamed
         # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
         self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
 
         ram_style = self.get_nodeattr("ram_style")
-        assert (ram_style=="auto"), "Unrecognized ram_style for MatrixVectorActivation_rtl"
+        assert (
+            ram_style == "auto"
+        ), "Unrecognized ram_style for MatrixVectorActivation_rtl"
 
         # apply code generation to template
         with open(template_path, "r") as f:
@@ -1009,19 +1030,21 @@ def generate_hdl(self, model, fpgapart, clk):
         self.set_nodeattr("ip_path", code_gen_dir)
 
     def prepare_codegen_default(self, fpgapart, clk):
-        template_path = (
-            os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v"
-        )
-        
+        template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v"
+
         code_gen_dict = {}
         code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))]
         code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))]
         code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))]
         code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))]
-        code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())]
+        code_gen_dict["$ACTIVATION_WIDTH$"] = [
+            str(self.get_input_datatype(0).bitwidth())
+        ]
         code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())]
         code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())]
-        code_gen_dict["$SIGNED_ACTIVATIONS$"] = [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
+        code_gen_dict["$SIGNED_ACTIVATIONS$"] = (
+            [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
+        )
         code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))]
         code_gen_dict["$MVU_IMPL_STYLE$"] = [self._resolve_impl_style(fpgapart)]
 
@@ -1035,15 +1058,10 @@ def prepare_rtlsim(self):
         if PyVerilator is None:
             raise ImportError("Installation of PyVerilator is required.")
 
-        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")        
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
         # Path to (System-)Verilog files used by top-module & path to top-module
-        verilog_paths = [
-            code_gen_dir,
-            os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"
-        ]
-        verilog_files = [
-            self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"
-        ]
+        verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"]
+        verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"]
 
         # build the Verilator emu library
         sim = PyVerilator.build(
@@ -1051,9 +1069,9 @@ def prepare_rtlsim(self):
             build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
             verilog_path=verilog_paths,
             trace_depth=get_rtlsim_trace_depth(),
-            top_module_name=self.get_verilog_top_module_name()
+            top_module_name=self.get_verilog_top_module_name(),
         )
         # save generated lib filename in attribute
         self.set_nodeattr("rtlsim_so", sim.lib._name)
-        
-        return sim
\ No newline at end of file
+
+        return sim

From 4a9cfa1c7a17497578faad3f76c25b80c116ba58 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 11 May 2023 10:56:07 +0100
Subject: [PATCH 124/235] [rtl custom_op]: add support for external weights

---
 .../matrixvectoractivation_rtl.py             | 67 ++++++++++---------
 1 file changed, 37 insertions(+), 30 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index 8fd261d395..162b5e2e16 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -192,9 +192,9 @@ def verify_node(self):
 
         mem_mode = self.get_nodeattr("mem_mode")
 
-        if mem_mode != "decoupled":
+        if mem_mode not in ["decoupled", "external"]:
             info_messages.append(
-                "RTL-based MVAU supports only decoupled weights currently"
+                "RTL-based MVAU supports only decoupled or external weights."
             )
 
         return info_messages
@@ -612,35 +612,20 @@ def generate_params(self, model, path):
         code_gen_dir = path
         # weights, if not external
         weights = model.get_initializer(self.onnx_node.input[1])
-        if mem_mode == "decoupled":
+        if mem_mode in ["decoupled", "external"]:
             weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
             # save decoupled weights for cppsim
             self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
-            # Also save weights as Verilog .dat file
-            # note that we provide two different .dat files, one for synth
-            # and one for synthesis. this is because URAM-based weights always
-            # need zero weights for synthesis, otherwise they get inferred
-            # as BRAM
-            weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir)
-            weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir)
-            # sim weights are always the true weights
-            self.make_weight_file(
-                weights, "decoupled_verilog_dat", weight_filename_rtl_sim
-            )
-            ram_style = self.get_nodeattr("ram_style")
-            if ram_style == "ultra":
-                # UltraRAM must have no memory initializer, or only zeroes
-                # otherwise BRAM will be inferred instead of URAM
-                # as a workaround we provide a zero-weight init here
-                synth_weights = np.zeros_like(weights, dtype=np.float32)
-            else:
-                synth_weights = weights
-            self.make_weight_file(
-                synth_weights, "decoupled_verilog_dat", weight_filename_rtl_synth
-            )
+            if mem_mode == "decoupled":
+                # also save weights as Verilog .dat file
+                # This file will be ignored when synthesizing UltraScale memory.
+                weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir)
+                self.make_weight_file(
+                    weights, "decoupled_verilog_dat", weight_filename_rtl
+                )
         else:
             raise Exception(
-                """Please set mem_mode to "decoupled",
+                """Please set mem_mode to "const", "decoupled", or "external",
                 currently no other parameter value is supported!"""
             )
 
@@ -695,7 +680,7 @@ def execute_node(self, context, graph):
             )
             super().reset_rtlsim(sim)
             super().toggle_clk(sim)
-            if mem_mode == "external" or mem_mode == "decoupled":
+            if mem_mode in ["external", "decoupled"]:
                 wnbits = self.get_weightstream_width()
                 export_wdt = self.get_weight_datatype()
                 wei = npy_to_rtlsim_input(
@@ -903,9 +888,31 @@ def code_generation_ipi(self):
                 # TODO calculate and pass in segment size here
                 cmd.append("assign_bd_address")
             cmd.append("save_bd_design")
-        elif mem_mode == "const" or mem_mode == "external":
-            # base class impl sufficient for const/external modes
-            return super().code_generation_ipi()
+        elif mem_mode == "external":
+            # instantiate the RTL block
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+            rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+            sourcefiles = [
+                os.path.join(
+                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+                ),
+                rtllib_dir + "mvu_axi.sv",
+                rtllib_dir + "replay_buffer.sv",
+                rtllib_dir + "mvu_4sx4u.sv",
+                rtllib_dir + "mvu_8sx9.sv",
+                rtllib_dir + "mvu_8sx8u_dsp48.sv",
+            ]
+            for f in sourcefiles:
+                cmd.append("add_files -norecurse %s" % (f))
+            cmd.append(
+                "create_bd_cell -type module -reference %s %s"
+                % (
+                    self.get_nodeattr("gen_top_module"),
+                    self.onnx_node.name,
+                )
+            )
+            cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/in0_V]" % (self.onnx_node.name))
+            cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/out_V]" % (self.onnx_node.name))
         else:
             raise Exception("Unrecognized mem_mode for MatrixVectorActivation")
         return cmd

From 8a9ac1af4d6c62e7c9557ab41992b84cf2c37ae1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Thu, 11 May 2023 11:04:28 +0100
Subject: [PATCH 125/235] Specify clock and reset associations of bus
 interfaces.

---
 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 4 +++-
 finn-rtllib/mvu/mvu_axi_wrapper.v      | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
index 502a72d3f2..fb3c62a15a 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
@@ -49,8 +49,10 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	OUTPUT_LANES = PE,
 	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )(
-  	// Global Control
+	// Global Control
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *)
 	input	logic  ap_clk,
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *)
 	input	logic  ap_rst_n,
 
 	// Weight Stream
diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v
index b79ba6bbd1..d8acaefcc7 100644
--- a/finn-rtllib/mvu/mvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_axi_wrapper.v
@@ -50,8 +50,10 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	OUTPUT_LANES = PE,
 	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )(
-  	// Global Control
+	// Global Control
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *)
 	input	ap_clk,
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *)
 	input	ap_rst_n,
 	// Weight Stream
 	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  weights_V_TDATA,

From d9b90793bd54a5e112531c737fa7c60a51b21d34 Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu <maltanar@gmail.com>
Date: Mon, 15 May 2023 10:16:48 +0200
Subject: [PATCH 126/235] [rtlmvu] More fixes for memstream and param gen

---
 .../fpgadataflow/matrixvectoractivation_rtl.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index 162b5e2e16..1791327e78 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -612,7 +612,11 @@ def generate_params(self, model, path):
         code_gen_dir = path
         # weights, if not external
         weights = model.get_initializer(self.onnx_node.input[1])
+<<<<<<< HEAD
         if mem_mode in ["decoupled", "external"]:
+=======
+        if mem_mode == "decoupled" or mem_mode == "external":
+>>>>>>> 72fe4c5b ([rtlmvu] More fixes for memstream and param gen)
             weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
             # save decoupled weights for cppsim
             self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
@@ -821,22 +825,16 @@ def code_generation_ipi(self):
             )
             cmd.append(
                 "set_property -dict [list "
-                "CONFIG.NSTREAMS {1} "
-                "CONFIG.MEM_DEPTH {%d} "
-                "CONFIG.MEM_WIDTH {%d} "
-                "CONFIG.MEM_INIT {%s} "
+                "CONFIG.DEPTH {%d} "
+                "CONFIG.WIDTH {%d} "
+                "CONFIG.INIT_FILE {%s} "
                 "CONFIG.RAM_STYLE {%s} "
-                "CONFIG.STRM0_DEPTH {%d} "
-                "CONFIG.STRM0_WIDTH {%d} "
-                "CONFIG.STRM0_OFFSET {0} "
                 "] [get_bd_cells /%s/%s]"
                 % (
                     self.calc_wmem(),
                     self.get_weightstream_width_padded(),
-                    self.get_nodeattr("code_gen_dir_ipgen") + "/",
+                    self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat",
                     self.get_nodeattr("ram_style"),
-                    self.calc_wmem(),
-                    self.get_weightstream_width_padded(),
                     node_name,
                     strm_inst,
                 )

From a5f2a83897e33acb4b3e2231d9bfa534e56bb6b2 Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu <maltanar@gmail.com>
Date: Thu, 11 May 2023 23:49:10 +0200
Subject: [PATCH 127/235] [Build] apply config to only FIFO nodes in
 step_set_fifo_depths

---
 src/finn/builder/build_dataflow_steps.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 65ab2b0b93..d4af757491 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -53,6 +53,7 @@
 from shutil import copy
 
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
 import finn.transformation.streamline.absorb as absorb
 from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -123,7 +124,6 @@
 )
 from finn.util.pyverilator import verilator_fifosim
 from finn.util.test import execute_parent
-import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
 
 
 def verify_step(
@@ -486,14 +486,13 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig
 
 
 def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig):
-    """Convert layers implemented in HLS to an equivalent specialized RTL implementation if possible."""
-    specialize_to_rtl_transforms = [
-        to_rtl.InferRTLMatrixVectorActivation()
-    ]
+    """Convert layers implemented in HLS to an equivalent specialized RTL
+    implementation if possible."""
+    specialize_to_rtl_transforms = [to_rtl.InferRTLMatrixVectorActivation()]
     for trn in specialize_to_rtl_transforms:
         model = model.transform(trn)
     return model
-    
+
 
 def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig):
     """Tighten the weight and accumulator bit widths for each layer."""
@@ -594,7 +593,12 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
         model = model.transform(GiveUniqueNodeNames())
         model = model.transform(GiveReadableTensorNames())
         if cfg.folding_config_file is not None:
-            model = model.transform(ApplyConfig(cfg.folding_config_file))
+            model = model.transform(
+                ApplyConfig(
+                    cfg.folding_config_file,
+                    node_filter=lambda x: x.op_type == "StreamingFIFO",
+                )
+            )
 
     # extract the final configuration and save it as json
     hw_attrs = [

From 08cbdc59a95ed6281c3234c5e8b0b9d7327a2988 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 24 May 2023 07:58:41 +0100
Subject: [PATCH 128/235] Revised control interface attributes.

---
 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 29 +++++++++++++-------------
 finn-rtllib/mvu/mvu_axi_wrapper.v      |  8 ++++---
 2 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
index fb3c62a15a..e15f77fbae 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
@@ -50,25 +50,26 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )(
 	// Global Control
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *)
-	input	logic  ap_clk,
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *)
-	input	logic  ap_rst_n,
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output, ASSOCIATED_RESET ap_rst_n" *)
+	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+	input	ap_clk,
+	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+	input	ap_rst_n,
 
 	// Weight Stream
-	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input	logic  s_axis_weights_tvalid,
-	output	logic  s_axis_weights_tready,
+	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	s_axis_weights_tvalid,
+	output	s_axis_weights_tready,
 
 	// Input Stream
-	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input	logic  s_axis_input_tvalid,
-	output	logic  s_axis_input_tready,
+	input	[INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	s_axis_input_tvalid,
+	output	s_axis_input_tready,
 
 	// Output Stream
-	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
-	output	logic  m_axis_output_tvalid,
-	input	logic  m_axis_output_tready
+	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
+	output	m_axis_output_tvalid,
+	input	m_axis_output_tready
 );
 
 mvu_8sx9_axi #(
@@ -89,4 +90,4 @@ mvu_8sx9_axi #(
 	.m_axis_output_tready(m_axis_output_tready)
 );
 
-endmodule : $MODULE_NAME_AXI_WRAPPER$
\ No newline at end of file
+endmodule : $MODULE_NAME_AXI_WRAPPER$
diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v
index d8acaefcc7..239c5bbacd 100644
--- a/finn-rtllib/mvu/mvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_axi_wrapper.v
@@ -51,10 +51,12 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )(
 	// Global Control
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *)
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
+	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
 	input	ap_clk,
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *)
+	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
 	input	ap_rst_n,
+
 	// Weight Stream
 	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  weights_V_TDATA,
 	input   weights_V_TVALID,
@@ -87,4 +89,4 @@ mvu_axi #(
 	.m_axis_output_tready(out_V_TREADY)
 );
 
-endmodule : $MODULE_NAME_AXI_WRAPPER$
\ No newline at end of file
+endmodule : $MODULE_NAME_AXI_WRAPPER$

From d058cc2a5c1ed71a2c2ea12034cfa921818381ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 24 May 2023 09:16:50 +0100
Subject: [PATCH 129/235] Mask device primitives from Verilator in favor of
 using behavioral code.

---
 finn-rtllib/mvu/mvu_4sx4u.sv       | 38 ++++++++++++++++++++----------
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 38 ++++++++++++++++++++----------
 finn-rtllib/mvu/mvu_8sx9.sv        | 29 ++++++++++++++---------
 3 files changed, 68 insertions(+), 37 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 5993154355..21594e46ac 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -19,6 +19,12 @@ module mvu_4sx4u #(
 	output	logic  vld,
 	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
 );
+	// Verilator always to use behavioral code
+	localparam bit  BEHAVIORAL =
+`ifdef VERILATOR
+		1 ||
+`endif
+		FORCE_BEHAVIORAL;
 
 	typedef int unsigned  leave_load_t[2*SIMD-1];
 	function leave_load_t init_leave_loads();
@@ -59,17 +65,21 @@ module mvu_4sx4u #(
 				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
 					assign	ww[pe] = w[PE_BEG + pe][s];
 					if(pe) begin
-//						assign  xx[pe] = zero? 0 : ww[pe] * a[s];
-						LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
-							.O6(xx[pe][1]),
-							.O5(xx[pe][0]),
-							.I5(1'b1),
-							.I4(zero),
-							.I3(ww[pe][1]),
-							.I2(a[s][1]),
-							.I1(ww[pe][0]),
-							.I0(a[s][0])
-						);
+						if(BEHAVIORAL)  assign  xx[pe] = zero? 0 : ww[pe] * a[s];
+`ifndef VERILATOR
+						else begin
+							LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
+								.O6(xx[pe][1]),
+								.O5(xx[pe][0]),
+								.I5(1'b1),
+								.I4(zero),
+								.I3(ww[pe][1]),
+								.I2(a[s][1]),
+								.I1(ww[pe][0]),
+								.I0(a[s][0])
+							);
+						end
+`endif
 					end
 				end
 				always_comb begin
@@ -87,7 +97,7 @@ module mvu_4sx4u #(
 			// Note: Since the product B * AD is computed,
 			//       rst can be only applied to AD and zero only to B
 			//       with the same effect as zeroing both.
-			if (FORCE_BEHAVIORAL) begin : genBehav
+			if (BEHAVIORAL) begin : genBehav
 				// Stage #1: Input Refine
 				logic signed [23:0]  B1  = 0;
 				always_ff @(posedge clk) begin
@@ -121,6 +131,7 @@ module mvu_4sx4u #(
 
 				assign	pp = P3;
 			end : genBehav
+`ifndef VERILATOR
 			else begin : genDSP
 				DSP48E2 #(
 					// Feature Control Attributes: Data Path Selection
@@ -252,6 +263,7 @@ module mvu_4sx4u #(
 					.RSTP(rst)			// 1-bit input: Reset for PREG
 				);
 			end : genDSP
+`endif
 
 			// External Canary Pipeline
 			logic [1:0]  X1[3:1] = '{ default: 0 };
@@ -356,4 +368,4 @@ module mvu_4sx4u #(
 
 	end : genPipes
 
-endmodule : mvu_4sx4u
\ No newline at end of file
+endmodule : mvu_4sx4u
diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index e06a92c8fa..09db360b77 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -23,6 +23,12 @@ module mvu_8sx8u_dsp48 #(
 	output	logic  vld,
 	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
 );
+	// Verilator always to use behavioral code
+	localparam bit  BEHAVIORAL =
+`ifdef VERILATOR
+		1 ||
+`endif
+		FORCE_BEHAVIORAL;
 
 	typedef int unsigned  leave_load_t[2*SIMD-1];
 	function leave_load_t init_leave_loads();
@@ -63,17 +69,21 @@ module mvu_8sx8u_dsp48 #(
 				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
 					assign	ww[pe] = w[PE_BEG + pe][s];
 					if(pe) begin
-//						assign  xx[pe] = zero? 0 : ww[pe] * a[s];
-						LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
-							.O6(xx[1]),
-							.O5(xx[0]),
-							.I5(1'b1),
-							.I4(zero),
-							.I3(ww[pe][1]),
-							.I2(a[s][1]),
-							.I1(ww[pe][0]),
-							.I0(a[s][0])
-						);
+						if(BEHAVIORAL)  assign  xx[pe] = zero? 0 : ww[pe] * a[s];
+`ifndef VERILATOR
+						else begin
+							LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
+								.O6(xx[1]),
+								.O5(xx[0]),
+								.I5(1'b1),
+								.I4(zero),
+								.I3(ww[pe][1]),
+								.I2(a[s][1]),
+								.I1(ww[pe][0]),
+								.I0(a[s][0])
+							);
+						end
+`endif
 					end
 				end
 				always_comb begin
@@ -91,7 +101,7 @@ module mvu_8sx8u_dsp48 #(
 			// Note: Since the product B * AD is computed,
 			//       rst can be only applied to AD and zero only to B
 			//       with the same effect as zeroing both.
-			if (FORCE_BEHAVIORAL) begin : genBehav
+			if(BEHAVIORAL) begin : genBehav
 				// Stage #1: Input Refine
 				logic signed [23:0]  B1  = 0;
 				always_ff @(posedge clk) begin
@@ -125,6 +135,7 @@ module mvu_8sx8u_dsp48 #(
 
 				assign	pp = P3;
 			end : genBehav
+`ifndef VERILATOR
 			else begin : genDSP
 				DSP48E2 #(
 					// Feature Control Attributes: Data Path Selection
@@ -256,6 +267,7 @@ module mvu_8sx8u_dsp48 #(
 					.RSTP(rst)			// 1-bit input: Reset for PREG
 				);
 			end : genDSP
+`endif
 
 			// External Canary Pipeline
 			logic [1:0]  X1 = '{ default: 0 };
@@ -355,4 +367,4 @@ module mvu_8sx8u_dsp48 #(
 
 	end : genPipes
 
-endmodule : mvu_8sx8u_dsp48
\ No newline at end of file
+endmodule : mvu_8sx8u_dsp48
diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index 2d1da26efb..f8e2ab3985 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -52,11 +52,17 @@ module mvu_8sx9 #(
     input   logic zero, // ignore current inputs and force this partial product to zero
     input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights
 	input   logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // activations
-    
+
 	// Ouput
 	output  logic vld,
     output  logic [PE-1:0][ACCU_WIDTH-1:0] p
   );
+	// Verilator always to use behavioral code
+	localparam bit  BEHAVIORAL =
+`ifdef VERILATOR
+		1 ||
+`endif
+		FORCE_BEHAVIORAL;
 
 //-------------------- Declare global signals --------------------\\
 	localparam int unsigned CHAINLEN = (SIMD+2)/3;
@@ -75,7 +81,7 @@ module mvu_8sx9 #(
 			L[1+MAX_PIPELINE_STAGES] <= last;
 			L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES];
 		end
-	end  
+	end
 	assign vld = L[0];
 
 //-------------------- Shift register for ZERO flag --------------------\\
@@ -87,7 +93,7 @@ module mvu_8sx9 #(
 			else if(en) begin
 				Z[0] <= zero;
 				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2];
-			end    
+			end
 		end
 	end;
 
@@ -157,12 +163,12 @@ module mvu_8sx9 #(
 
 			if (LAST) begin : genPOUT
 				assign p[j] = pp[ACCU_WIDTH-1:0];
-			end      
+			end
 
 			// Note: Since the product B * AD is computed,
 			//       rst can be only applied to AD and zero only to B
 			//       with the same effect as zeroing both.
-			if (FORCE_BEHAVIORAL) begin : genBehav
+			if(BEHAVIORAL) begin : genBehav
 				// Stage #1: Input A/B
 				logic signed [33:0] Areg [INTERNAL_PREGS];
 				always_ff @(posedge clk) begin
@@ -233,7 +239,7 @@ module mvu_8sx9 #(
 				assign pp = Preg;
 				assign pcout[j][i] = pp;
 			end : genBehav
-
+`ifndef VERILATOR
 			else begin: genDSP
 				DSP58 #(
 					// Feature Control Attributes: Data Path Selection
@@ -263,8 +269,8 @@ module mvu_8sx9 #(
 					.IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
 					.IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
 					.IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
-					.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 
-										FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN 
+					.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0
+										FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN
 										2'b01, // Y : M
 										2'b01  // X: M
 					}), // Optional inversion for OPMODE
@@ -325,7 +331,7 @@ module mvu_8sx9 #(
 							INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
 							2'b00,
 							TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
-							INTERNAL_PREGS==2 ? 1'b0 : 1'b1        
+							INTERNAL_PREGS==2 ? 1'b0 : 1'b1
 					}),                                 // 5-bit input: INMODE control
 					.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
 					.OPMODE({
@@ -365,7 +371,8 @@ module mvu_8sx9 #(
 					.RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
 				);
 			end : genDSP
-		end : genDSPChain  
+`endif
+		end : genDSPChain
 	end : genDSPPE
-    
+
 endmodule : mvu_8sx9

From a66f38f2d06901fd27cf874701572268ea4793d6 Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu <maltanar@gmail.com>
Date: Thu, 11 May 2023 23:48:36 +0200
Subject: [PATCH 130/235] [Deps] update qonnx

---
 fetch-repos.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fetch-repos.sh b/fetch-repos.sh
index e039ca9144..f1cf8754f2 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -27,7 +27,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-QONNX_COMMIT="20a34289cf2297d2b2bbbe75d6ac152ece86e3b4"
+QONNX_COMMIT="bc36fd56bf1e4abfcf98cd76a001cad13d57baac"
 FINN_EXP_COMMIT="0aa7e1c44b20cf085b6fe42cff360f0a832afd2c"
 BREVITAS_COMMIT="c65f9c13dc124971f14739349531bbcda5c2a4aa"
 PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f"

From 8f9bd04b3311e56da4684a58d4de868d61f342ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 24 May 2023 12:44:53 +0100
Subject: [PATCH 131/235] Adding folding hints. Impl selection by case
 statement.

---
 finn-rtllib/mvu/mvu_axi.sv | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
index e4a919ba88..a181f54ac5 100644
--- a/finn-rtllib/mvu/mvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_axi.sv
@@ -29,6 +29,14 @@
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * @brief	Matrix Vector Unit (MVU) AXI-lite interface wrapper.
+ * @details
+ *  Folding hints:
+ *	 - 4-bit MVU:          PE scaling should aim at a full multiple of 4.
+ *	 - 8-bit MVU - DSP48:  PE scaling should aim at a full multiple of 2.
+ *	 - 8-bit MVU - DSP58:  SIMD scaling should aim at a full multiple of 3.
+ *	 - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to
+ *	   impact critical paths more than PE scaling. PE scaling implies a
+ *	   bigger fanout on the input activations.
  *****************************************************************************/
 
 module mvu_axi #(
@@ -134,8 +142,9 @@ module mvu_axi #(
 	uwire ovld;
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
 	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
-	
-	if (MVU_IMPL_STYLE == "mvu_8sx9_dsp58") begin : genMVU8sx9
+
+	case(MVU_IMPL_STYLE)
+	"mvu_8sx9_dsp58":
 		mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
@@ -143,26 +152,27 @@ module mvu_axi #(
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
 			.vld(ovld), .p(odat)
 		);
-	end
-	else if (MVU_IMPL_STYLE == "mvu_4sx4u") begin : genMVU4sx4u
+
+	"mvu_4sx4u":
 		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
 			.vld(ovld), .p(odat)
 		);
-	end
-	else if (MVU_IMPL_STYLE == "mvu_8sx8u_dsp48") begin : genMVU8sx8u
+
+	"mvu_8sx8u_dsp48":
 		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		 .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
 			.vld(ovld), .p(odat)
 		);
-	end
-	else initial begin
-		$error("Unrecognized MVU_IMPL_STYLE!");
+
+	default: initial begin
+		$error("Unrecognized MVU_IMPL_STYLE '%s'", MVU_IMPL_STYLE);
 		$finish;
 	end
+	endcase
 
 //-------------------- Output register slice --------------------\\
 	struct packed {
@@ -185,7 +195,7 @@ module mvu_axi #(
 			end
 		end
 	end
-	
+
 	struct packed {
 		logic vld;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
@@ -196,10 +206,10 @@ module mvu_axi #(
 		if(rst)		B <= '{ default: 'x };
 		else begin
 			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
-		end	
+		end
 	end
 
 	assign	m_axis_output_tvalid = B.vld;
 	assign	m_axis_output_tdata  = B.dat;
 
-endmodule : mvu_axi
\ No newline at end of file
+endmodule : mvu_axi

From 9de5ed6f7b459f37bb127f0cd105e6f927d25611 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 24 May 2023 13:52:40 +0100
Subject: [PATCH 132/235] Fixed behavioral sideband prediction.

---
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 09db360b77..bd1f813af6 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -69,7 +69,7 @@ module mvu_8sx8u_dsp48 #(
 				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
 					assign	ww[pe] = w[PE_BEG + pe][s];
 					if(pe) begin
-						if(BEHAVIORAL)  assign  xx[pe] = zero? 0 : ww[pe] * a[s];
+						if(BEHAVIORAL)  assign  xx = zero? 0 : ww[pe] * a[s];
 `ifndef VERILATOR
 						else begin
 							LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (

From 239759a6a4b8cb008aa9b80d52d15f53f77e5965 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 24 May 2023 15:49:19 +0100
Subject: [PATCH 133/235] [rtl mvu]: extension to allow selecting PE values
 that are not multiples of 4

---
 finn-rtllib/mvu/mvu_4sx4u.sv | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 21594e46ac..111d651cf5 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -50,6 +50,7 @@ module mvu_4sx4u #(
 
 		localparam int unsigned  PE_BEG = 4*c;
 		localparam int unsigned  PE_END = PE < 4*(c+1)? PE : 4*(c+1);
+		localparam int unsigned  PE_REM = 4*(c+1) - PE_END;
 
 		uwire        [57:0]  p3[SIMD];
 		uwire signed [ 1:0]  h3[SIMD][3];
@@ -65,12 +66,12 @@ module mvu_4sx4u #(
 				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
 					assign	ww[pe] = w[PE_BEG + pe][s];
 					if(pe) begin
-						if(BEHAVIORAL)  assign  xx[pe] = zero? 0 : ww[pe] * a[s];
+						if(BEHAVIORAL)  assign  xx[pe + PE_REM] = zero? 0 : ww[pe] * a[s];
 `ifndef VERILATOR
 						else begin
 							LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
-								.O6(xx[pe][1]),
-								.O5(xx[pe][0]),
+								.O6(xx[pe + PE_REM][1]),
+								.O5(xx[pe + PE_REM][0]),
 								.I5(1'b1),
 								.I4(zero),
 								.I3(ww[pe][1]),
@@ -86,8 +87,8 @@ module mvu_4sx4u #(
 					dd = '0;
 					aa = '0;
 					for(int unsigned  pe = 0; pe < PE_END - PE_BEG; pe++) begin
-						dd[D[pe]+:3] = ww[pe];
-						aa[D[pe]+ 3] = ww[pe][3];
+						dd[D[pe + PE_REM]+:3] = ww[pe];
+						aa[D[pe + PE_REM]+ 3] = ww[pe][3];
 					end
 				end
 			end : blkVectorize
@@ -305,7 +306,7 @@ module mvu_4sx4u #(
 			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - LO_WIDTH;
 
 			// Conclusive high part accumulation
-			if(i < 3) begin : genHi
+			if(i >= PE_REM && i < 3) begin : genHi
 				// Adder Tree across all SIMD high contributions, each from [-1:1]
 				uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
 				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s][i];
@@ -323,9 +324,12 @@ module mvu_4sx4u #(
 				end
 				assign	hi4[i] = Hi4;
 			end : genHi
+			else if(i < 3) begin : genHiZero	// hi4 is only indexed for i < 3 (cf. genHi); i == 3 feeds up4
+				assign hi4[i] = '0;
+			end : genHiZero
 
 			// Conclusive low part accumulation
-			if(1) begin : blkLo
+			if(i >= PE_REM) begin : blkLo
 				// Adder Tree across all SIMD low contributions
 				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
 				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
@@ -346,6 +350,9 @@ module mvu_4sx4u #(
 				if(i == 3)  assign  up4 = Lo4;
 				else  assign  lo4[i] = Lo4;
 			end : blkLo
+			else begin : blkLoZero
+				assign lo4[i] = '0;
+			end : blkLoZero
 
 		end
 
@@ -363,7 +370,7 @@ module mvu_4sx4u #(
 
 		// Output
 		for(genvar  pe = PE_BEG; pe < PE_END; pe++) begin
-			assign	p[pe] = Res5[pe - PE_BEG];
+			assign	p[pe] = Res5[pe - PE_BEG + PE_REM];
 		end
 
 	end : genPipes

From 8d3247ccf7657aeb534147a5dd9511fa397d4eb2 Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu <maltanar@gmail.com>
Date: Wed, 24 May 2023 15:56:07 +0200
Subject: [PATCH 134/235] [rtlmvu] Avoid unintentional verilator metacomments

---
 finn-rtllib/mvu/mvu_4sx4u.sv       | 2 +-
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +-
 finn-rtllib/mvu/mvu_8sx9.sv        | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 21594e46ac..9f101e8c29 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -19,7 +19,7 @@ module mvu_4sx4u #(
 	output	logic  vld,
 	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
 );
-	// Verilator always to use behavioral code
+	// for verilator always use behavioral code
 	localparam bit  BEHAVIORAL =
 `ifdef VERILATOR
 		1 ||
diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index bd1f813af6..6b54e91b6a 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -23,7 +23,7 @@ module mvu_8sx8u_dsp48 #(
 	output	logic  vld,
 	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
 );
-	// Verilator always to use behavioral code
+	// for verilator always use behavioral code
 	localparam bit  BEHAVIORAL =
 `ifdef VERILATOR
 		1 ||
diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index f8e2ab3985..a601066cfd 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -57,7 +57,7 @@ module mvu_8sx9 #(
 	output  logic vld,
     output  logic [PE-1:0][ACCU_WIDTH-1:0] p
   );
-	// Verilator always to use behavioral code
+	// for verilator always use behavioral code
 	localparam bit  BEHAVIORAL =
 `ifdef VERILATOR
 		1 ||

From c8663505dcd2c2eeb3ddad05d361f82be32040eb Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 24 May 2023 17:14:23 +0100
Subject: [PATCH 135/235] [rtl mvu]: extension to allow selecting PE values
 that are not multiples of 2

---
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 57 +++++++++++++++++-------------
 1 file changed, 32 insertions(+), 25 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 6b54e91b6a..5cc3fa4c49 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -54,6 +54,7 @@ module mvu_8sx8u_dsp48 #(
 
 		localparam int unsigned  PE_BEG = 2*c;
 		localparam int unsigned  PE_END = PE < 2*(c+1)? PE : 2*(c+1);
+		localparam int unsigned  PE_RES = 2*(c+1) - PE_END;
 
 		uwire        [57:0]  p3[SIMD];
 		uwire signed [ 1:0]  h3[SIMD];
@@ -90,8 +91,8 @@ module mvu_8sx8u_dsp48 #(
 					dd = '0;
 					aa = '0;
 					for(int unsigned  pe = 0; pe < PE_END - PE_BEG; pe++) begin
-						dd[D[pe] +: WEIGHT_WIDTH-1] = ww[pe];
-						aa[D[pe] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];
+						dd[D[pe + PE_RES] +: WEIGHT_WIDTH-1] = ww[pe];
+						aa[D[pe + PE_RES] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];
 					end
 				end
 			end : blkVectorize
@@ -301,32 +302,35 @@ module mvu_8sx8u_dsp48 #(
 		uwire signed [ACCU_WIDTH  -1:0]  up4;
 		uwire signed [ACCU_WIDTH  -SINGLE_PROD_WIDTH:0]  hi4;
 		uwire        [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0]  lo4;
-		for(genvar  i = 0; i < 2; i++) begin
-			localparam int unsigned  LO_WIDTH = D[i+1] - D[i];
-			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - LO_WIDTH;
 
-			// Conclusive high part accumulation
-			if(i == 0) begin : genHi
-				// Adder Tree across all SIMD high contributions, each from [-1:1]
-				uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
-				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s];
-				for(genvar  n = 0; n < SIMD-1; n++) begin
-					// Sum truncated to actual maximum bit width at this node
-					uwire signed [$clog2(1+LEAVE_LOAD[n]):0]  s = tree[2*n+1] + tree[2*n+2];
-					assign  tree[n] = s;
-				end
+		// Conclusive high part accumulation
+		if(PE_RES == 0) begin : genHi
+			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - D[1];
+			// Adder Tree across all SIMD high contributions, each from [-1:1]
+			uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
+			for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s];
+			for(genvar  n = 0; n < SIMD-1; n++) begin
+				// Sum truncated to actual maximum bit width at this node
+				uwire signed [$clog2(1+LEAVE_LOAD[n]):0]  s = tree[2*n+1] + tree[2*n+2];
+				assign  tree[n] = s;
+			end
 
-				// High Sideband Accumulation
-				logic signed [HI_WIDTH-1:0]  Hi4 = 0;
-				always_ff @(posedge clk) begin
-					if(rst)      Hi4 <= 0;
-					else if(en)  Hi4 <= (L[4]? 0 : Hi4) + tree[0];
-				end
-				assign	hi4 = Hi4;
-			end : genHi
+			// High Sideband Accumulation
+			logic signed [HI_WIDTH-1:0]  Hi4 = 0;
+			always_ff @(posedge clk) begin
+				if(rst)      Hi4 <= 0;
+				else if(en)  Hi4 <= (L[4]? 0 : Hi4) + tree[0];
+			end
+			assign	hi4 = Hi4;
+		end : genHi
+		else begin : genHiZero
+			assign hi4 = '0;
+		end : genHiZero
 
+		for(genvar  i = 0; i < 2; i++) begin
+			localparam int unsigned  LO_WIDTH = D[i+1] - D[i];
 			// Conclusive low part accumulation
-			if(1) begin : blkLo
+			if(i >= PE_RES) begin : blkLo
 				// Adder Tree across all SIMD low contributions
 				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
 				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
@@ -347,6 +351,9 @@ module mvu_8sx8u_dsp48 #(
 				if(i == 1)  assign  up4 = Lo4;
 				else  assign  lo4 = Lo4;
 			end : blkLo
+			else begin : blkLoZero
+				assign lo4 = '0;
+			end : blkLoZero
 
 		end
 
@@ -362,7 +369,7 @@ module mvu_8sx8u_dsp48 #(
 
 		// Output
 		for(genvar  pe = PE_BEG; pe < PE_END; pe++) begin
-			assign	p[pe] = Res5[pe - PE_BEG];
+			assign	p[pe] = Res5[pe - PE_BEG + PE_RES];
 		end
 
 	end : genPipes

From fd1e038c643c05199b38320f8815f430e538d936 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 24 May 2023 17:21:56 +0100
Subject: [PATCH 136/235] [rtl mvu axi]: updated comments on folding hints

---
 finn-rtllib/mvu/mvu_axi.sv | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
index a181f54ac5..cef55949ed 100644
--- a/finn-rtllib/mvu/mvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_axi.sv
@@ -31,12 +31,13 @@
  * @brief	Matrix Vector Unit (MVU) AXI-lite interface wrapper.
  * @details
  *  Folding hints:
- *	 - 4-bit MVU:          PE scaling should aim at a full multiple of 4.
- *	 - 8-bit MVU - DSP48:  PE scaling should aim at a full multiple of 2.
- *	 - 8-bit MVU - DSP58:  SIMD scaling should aim at a full multiple of 3.
+ *	 - 4-bit MVU:          PE scaling should divide MH.
+ *	 - 8-bit MVU - DSP48:  PE scaling should divide MH.
+ *	 - 8-bit MVU - DSP58:  SIMD scaling should aim at a full multiple of 3 and divide MW.
  *	 - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to
  *	   impact critical paths more than PE scaling. PE scaling implies a
  *	   bigger fanout on the input activations.
+ *	 - Full unfolding along MH (PE=MH) results in no replay buffer being instantiated.
  *****************************************************************************/
 
 module mvu_axi #(

From f60d4c6fa105bd29689b93aafd880ec92c32358c Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 2 Jun 2023 11:48:26 +0100
Subject: [PATCH 137/235] [rtl custom op]: minor fixes to codegen

---
 .../fpgadataflow/matrixvectoractivation_rtl.py     | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index 1791327e78..9f8130806b 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -612,11 +612,7 @@ def generate_params(self, model, path):
         code_gen_dir = path
         # weights, if not external
         weights = model.get_initializer(self.onnx_node.input[1])
-<<<<<<< HEAD
-        if mem_mode in ["decoupled", "external"]:
-=======
         if mem_mode == "decoupled" or mem_mode == "external":
->>>>>>> 72fe4c5b ([rtlmvu] More fixes for memstream and param gen)
             weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
             # save decoupled weights for cppsim
             self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
@@ -909,8 +905,6 @@ def code_generation_ipi(self):
                     self.onnx_node.name,
                 )
             )
-            cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/in0_V]" % (self.onnx_node.name))
-            cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/out_V]" % (self.onnx_node.name))
         else:
             raise Exception("Unrecognized mem_mode for MatrixVectorActivation")
         return cmd
@@ -968,8 +962,7 @@ def derive_characteristic_fxns(self, period):
     # TODO: characterize max_clk and implement this function in look-up style
     def _resolve_segment_len(self, clk):
         # Insert pipeline registers in the DSP chain to meet target clock frequency
-        segmentlen = 0
-        return segmentlen
+        return 4 # default to 4 for now
 
     def _resolve_impl_style(self, fpgapart):
         # Based on target device and activation/weight-width, choose the
@@ -1002,11 +995,6 @@ def generate_hdl(self, model, fpgapart, clk):
         # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
         self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
 
-        ram_style = self.get_nodeattr("ram_style")
-        assert (
-            ram_style == "auto"
-        ), "Unrecognized ram_style for MatrixVectorActivation_rtl"
-
         # apply code generation to template
         with open(template_path, "r") as f:
             template_wrapper = f.read()

From a1ad304a42bf89b36d6507cf9f749a7a1a7d130a Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 2 Jun 2023 11:48:58 +0100
Subject: [PATCH 138/235] [specialize-to-rtl]: add ram_style and
 rt_writeable_weights support

---
 .../transformation/fpgadataflow/specialize_to_rtl_layers.py   | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
index 7d677ec216..23b6e59abe 100644
--- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
+++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
@@ -74,6 +74,8 @@ def apply(self, model):
                     simd = getCustomOp(n).get_nodeattr("SIMD")
                     pe = getCustomOp(n).get_nodeattr("PE")
                     mem_mode = getCustomOp(n).get_nodeattr("mem_mode")
+                    ram_style = getCustomOp(n).get_nodeattr("ram_style")
+                    runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights")
 
                     new_node = helper.make_node(
                         "MatrixVectorActivation_rtl",
@@ -91,6 +93,8 @@ def apply(self, model):
                         numInputVectors=numInputVectors,
                         mem_mode=mem_mode,
                         name=n.name + "_rtl",
+                        ram_style=ram_style,
+                        runtime_writeable_weights=runtime_writeable_weights
                     )
                     graph.node.insert(node_ind, new_node)
                     # remove old node

From 2cbb68fe016ff7ea292ffa071741b352222d1a4c Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 2 Jun 2023 11:50:05 +0100
Subject: [PATCH 139/235] [rtllib]: change string type to parameter type due to
 Vivado error

---
 finn-rtllib/mvu/mvu_axi.sv | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
index cef55949ed..46167af95b 100644
--- a/finn-rtllib/mvu/mvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_axi.sv
@@ -51,7 +51,7 @@ module mvu_axi #(
 	bit SIGNED_ACTIVATIONS = 0,
 	int unsigned SEGMENTLEN = 0,
 	bit FORCE_BEHAVIORAL = 0,
-	string MVU_IMPL_STYLE,
+	parameter MVU_IMPL_STYLE, // string type causes error in Vivado
 
 	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
 	localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
@@ -163,12 +163,11 @@ module mvu_axi #(
 
 	"mvu_8sx8u_dsp48":
 		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		 .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
 			.vld(ovld), .p(odat)
 		);
-
 	default: initial begin
 		$error("Unrecognized MVU_IMPL_STYLE '%s'", MVU_IMPL_STYLE);
 		$finish;

From 92eb0edba2d059b8b170ed7e6d8ac7a224c9208c Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 2 Jun 2023 11:51:40 +0100
Subject: [PATCH 140/235] [rtllib]: renamed variable for consistency

---
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 5cc3fa4c49..3cd9cef560 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -54,7 +54,7 @@ module mvu_8sx8u_dsp48 #(
 
 		localparam int unsigned  PE_BEG = 2*c;
 		localparam int unsigned  PE_END = PE < 2*(c+1)? PE : 2*(c+1);
-		localparam int unsigned  PE_RES = 2*(c+1) - PE_END;
+		localparam int unsigned  PE_REM = 2*(c+1) - PE_END;
 
 		uwire        [57:0]  p3[SIMD];
 		uwire signed [ 1:0]  h3[SIMD];
@@ -91,8 +91,8 @@ module mvu_8sx8u_dsp48 #(
 					dd = '0;
 					aa = '0;
 					for(int unsigned  pe = 0; pe < PE_END - PE_BEG; pe++) begin
-						dd[D[pe + PE_RES] +: WEIGHT_WIDTH-1] = ww[pe];
-						aa[D[pe + PE_RES] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];
+						dd[D[pe + PE_REM] +: WEIGHT_WIDTH-1] = ww[pe];
+						aa[D[pe + PE_REM] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];
 					end
 				end
 			end : blkVectorize
@@ -304,7 +304,7 @@ module mvu_8sx8u_dsp48 #(
 		uwire        [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0]  lo4;
 
 		// Conclusive high part accumulation
-		if(PE_RES == 0) begin : genHi
+		if(PE_REM == 0) begin : genHi
 			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - D[1];
 			// Adder Tree across all SIMD high contributions, each from [-1:1]
 			uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
@@ -330,7 +330,7 @@ module mvu_8sx8u_dsp48 #(
 		for(genvar  i = 0; i < 2; i++) begin
 			localparam int unsigned  LO_WIDTH = D[i+1] - D[i];
 			// Conclusive low part accumulation
-			if(i >= PE_RES) begin : blkLo
+			if(i >= PE_REM) begin : blkLo
 				// Adder Tree across all SIMD low contributions
 				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
 				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
@@ -369,7 +369,7 @@ module mvu_8sx8u_dsp48 #(
 
 		// Output
 		for(genvar  pe = PE_BEG; pe < PE_END; pe++) begin
-			assign	p[pe] = Res5[pe - PE_BEG + PE_RES];
+			assign	p[pe] = Res5[pe - PE_BEG + PE_REM];
 		end
 
 	end : genPipes

From 471a221b975e549e462e7ff9488c65ad182fe278 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Fri, 2 Jun 2023 12:39:14 +0100
Subject: [PATCH 141/235] Fix improper blocking assignment & linting.

---
 finn-rtllib/mvu/tb/mvu_axi_tb.sv | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
index ef5fa7d682..b89b58f55b 100644
--- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv
+++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
@@ -42,12 +42,12 @@ module mvu_axi_tb();
 	localparam int unsigned SEGMENTLEN = 2;
 	localparam string MVU_IMPL_STYLE = "mvu_8sx8u_dsp48";
 	localparam bit FORCE_BEHAVIORAL = 1;
-	// Bit-width config  
+	// Bit-width config
 	localparam int unsigned ACTIVATION_WIDTH = 8;
 	localparam int unsigned WEIGHT_WIDTH = 8;
 	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
 	localparam bit SIGNED_ACTIVATIONS = 0;
-	// Simulation constants  
+	// Simulation constants
 	localparam int unsigned NF = MH/PE;
 	localparam int unsigned SF = MW/SIMD;
 	localparam int unsigned NUM_OF_DSP = SIMD/3;
@@ -57,7 +57,7 @@ module mvu_axi_tb();
 	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
 	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
 
-	// Generate clk and reset signal   
+	// Generate clk and reset signal
 	logic clk = 0;
 	always #5ns clk = !clk;
 
@@ -69,7 +69,7 @@ module mvu_axi_tb();
 
 	uwire ap_clk = clk;
 
-	// Generate activations  
+	// Generate activations
 	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
 	typedef activation_t activation_vector_t[SF];
 
@@ -94,8 +94,8 @@ module mvu_axi_tb();
 
 		for (int i=0; i<SF; i++) begin
 			activations.dat <= ACTIVATIONS[i];
-			do begin 
-				activations.vld = $urandom()%7 >= 1;
+			do begin
+				activations.vld <= $urandom()%7 >= 1;
 				@(posedge clk);
 			end while (!(activations.vld === 1 && activations.rdy === 1));
 		end
@@ -104,9 +104,9 @@ module mvu_axi_tb();
 		activations.dat <= 'x;
 	end
 
-	// Generate weights   
+	// Generate weights
 	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-	typedef weight_t weight_matrix_t[NF][SF]; 
+	typedef weight_t weight_matrix_t[NF][SF];
 
 	function weight_matrix_t init_WEIGHTS;
 		automatic weight_matrix_t res;
@@ -139,7 +139,7 @@ module mvu_axi_tb();
 		weights.dat <= 'x;
 	end
 
-	// Function to compute golden output  
+	// Function to compute golden output
 	// a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
 	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
 	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
@@ -155,12 +155,12 @@ module mvu_axi_tb();
 		automatic output_vector_t res = '{default: 0};
 		for (int j = 0; j<MH; j++) begin
 			for (int i = 0; i<MW; i++) begin
-				if (SIGNED_ACTIVATIONS==1) 
+				if (SIGNED_ACTIVATIONS)
 					res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
 				else
 					res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
 			end
-		end  
+		end
 		return res;
 	endfunction : check_output;
 
@@ -179,16 +179,16 @@ module mvu_axi_tb();
 			// Compare produced outputs against golden outputs
 			foreach(outputs.dat[i]) begin
 				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-				else begin 
+				else begin
 					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
 					$stop;
-				end  
+				end
 			end
-			
+
 			NF_CNT += 1;
 		end
 
-		$finish;  
+		$finish;
 	end
 
 	// Instantiate DUT
@@ -211,5 +211,5 @@ module mvu_axi_tb();
 		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
 		.m_axis_output_tready(outputs.rdy)
 	);
-  
+
 endmodule : mvu_axi_tb

From 5c5dc09c98d4e1a07a7e4cae17ca358b197a57c8 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 2 Jun 2023 13:35:04 +0100
Subject: [PATCH 142/235] [test rtl mvu]: modified/extended test cases

---
 tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
index 20a249bd08..3db7a718f5 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
@@ -86,13 +86,12 @@ def prepare_inputs(input_tensor):
     return {"inp": input_tensor}
 
 @pytest.mark.parametrize("mh", [16])
-@pytest.mark.parametrize("mw", [90])
-#@pytest.mark.parametrize("pe", [1, 2, 4, 8, 16])
-@pytest.mark.parametrize("pe", [16])
+@pytest.mark.parametrize("mw", [32])
+@pytest.mark.parametrize("pe", [1, 4, 16])
 #@pytest.mark.parametrize("simd", [1, 30, 90])
-@pytest.mark.parametrize("simd", [90])
-@pytest.mark.parametrize("idt", [DataType["INT8"]])
-@pytest.mark.parametrize("wdt", [DataType["UINT4"]])
+@pytest.mark.parametrize("simd", [1, 4, 32])
+@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
+@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]])
 #@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"])
 @pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"])
 @pytest.mark.parametrize("segmentlen", [1])
@@ -166,7 +165,3 @@ def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen):
 
     assert (output_mvau_hls == output_mvau_rtl).all()
     assert (output_mvau_hls.size > 0)
-
-
-# python setup.py test --addopts "-k test_fpgadataflow_mvau_rtl"
-# python setup.py test --addopts "-k test_fpgadataflow_fclayer_rtlsim"
\ No newline at end of file

From b4eb9b69a8a6920fdb3141752395e672f78479e3 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 30 Jun 2023 15:36:17 +0100
Subject: [PATCH 143/235] [rtl mvu]: updated DSP58 >4-bit variant to lift
 SIMD%3==0 restriction

---
 finn-rtllib/mvu/mvu_8sx9.sv | 103 +++++++++++++++++++++++-------------
 1 file changed, 65 insertions(+), 38 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index a601066cfd..439fbc44f9 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -92,77 +92,95 @@ module mvu_8sx9 #(
 			if (rst)      Z <= '{default: 0};
 			else if(en) begin
 				Z[0] <= zero;
-				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2];
+				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3];
 			end
 		end
 	end;
 
 //-------------------- Buffer for input activations --------------------\\
 	localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
-	typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t;
 
 	for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
 		localparam int TOTAL_PREGS = i/SEGLEN;
 		localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
+		localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3;
 
 		if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
-			a_buffer_t A [0:EXTERNAL_PREGS-1];
+			logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0};
 			always_ff @(posedge clk) begin
 				if (rst)     A <= '{default: 0};
 				else if(en) begin
-					A[EXTERNAL_PREGS-1] <= a[3*i +: 3];
+					A[EXTERNAL_PREGS-1] <= a[3*i +: LANES_OCCUPIED];
 					if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
 				end
 			end
-			assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]}
-									: { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ;
+			for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+				assign a_in_i[i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
+											: PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
+			end : genAin
+			for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+				assign a_in_i[i][9*j +: 9] = 9'b0;
+			end : genAinZero
 		end : genExternalPregAct
 		else begin : genInpDSPAct
-			assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]}
-									: { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ;
+			for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+				assign a_in_i[i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{a[3*i+j][ACTIVATION_WIDTH-1]}}, a[3*i+j] }
+											: PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[3*i+j] } ;
+			end : genAin
+			for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+				assign a_in_i[i][9*j +: 9] = 9'b0;
+			end : genAinZero
 		end : genInpDSPAct
 
 	end : genActSIMD
 
 //-------------------- Buffer for weights --------------------\\
 	localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;
-	typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t;
 
-	for (genvar j=0; j<PE; j++) begin : genWeightPE
-		for (genvar i=0; i<CHAINLEN; i++) begin : genWeightSIMD
-			localparam int TOTAL_PREGS = i/SEGLEN;
+	for (genvar i=0; i<PE; i++) begin : genWeightPE
+		for (genvar j=0; j<CHAINLEN; j++) begin : genWeightSIMD
+			localparam int TOTAL_PREGS = j/SEGLEN;
 			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
+			localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3;
 
 			if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
-				b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1];
+				logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0};
 				always_ff @(posedge clk) begin
 					if (rst)    B <= '{default: 0};
 					else if (en) begin
-						B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3];
-						if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1];
+						B[i][EXTERNAL_PREGS-1] <= w[i][3*j +: LANES_OCCUPIED];
+						if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1];
 					end
 				end
-				assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] };
+				for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin
+					assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] };
+				end : genBin
+				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
+					assign b_in_i[i][j][8*k +: 8] = 8'b0;
+				end : genBinZero
 			end : genExternalPregWeight
 			else begin : genInpDSPWeight
-				assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] };
+				for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin
+					assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
+				end : genBin
+				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
+					assign b_in_i[i][j][8*k +: 8] = 8'b0;
+				end : genBinZero
 			end : genInpDSPWeight
 		end : genWeightSIMD
-
 	end : genWeightPE
 
 //-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\
-	for (genvar j=0; j<PE; j++) begin : genDSPPE
-		for (genvar i=0; i<CHAINLEN; i++) begin : genDSPChain
-			localparam int TOTAL_PREGS = i/SEGLEN;
+	for (genvar i=0; i<PE; i++) begin : genDSPPE
+		for (genvar j=0; j<CHAINLEN; j++) begin : genDSPChain
+			localparam int TOTAL_PREGS = j/SEGLEN;
 			localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1; // 1 : 0
-			localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1;
-			localparam bit FIRST = i == 0;
-			localparam bit LAST = i == CHAINLEN-1;
-			uwire [57:0] pp;
+			localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1;
+			localparam bit FIRST = j == 0;
+			localparam bit LAST = j == CHAINLEN-1;
 
 			if (LAST) begin : genPOUT
-				assign p[j] = pp[ACCU_WIDTH-1:0];
+				assign p[i] = pcout[i][j][ACCU_WIDTH-1:0];
 			end
 
 			// Note: Since the product B * AD is computed,
@@ -174,7 +192,7 @@ module mvu_8sx9 #(
 				always_ff @(posedge clk) begin
 					if (rst)	Areg <= '{ default : 0};
 					else if (en) begin
-						Areg[0] <= { 7'bx, a_in_i[i] };
+						Areg[0] <= { 7'bx, a_in_i[j] };
 						if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0];
 					end
 				end
@@ -182,7 +200,7 @@ module mvu_8sx9 #(
 				always_ff @(posedge clk) begin
 					if (rst)	Breg <= '{ default : 0};
 					else if (en) begin
-						Breg[0] <= b_in_i[j][i];
+						Breg[0] <= b_in_i[i][j];
 						if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0];
 					end
 				end
@@ -217,27 +235,36 @@ module mvu_8sx9 #(
 					end
 					else	assign Preg = Mreg;
 				end
-				else if (LAST) begin : genLast
+				else if (FIRST && LAST) begin : genSingle
+					always_ff @(posedge clk) begin
+						if (rst)		Opmode <= 0;
+						else if (en)	Opmode <= L[1];
+					end
+					always_ff @(posedge clk) begin
+						if (rst) 		Preg <= 0;
+						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg;
+					end
+				end
+				else if (!FIRST && LAST) begin : genLast
 					always_ff @(posedge clk) begin
 						if (rst)		Opmode <= 0;
 						else if (en)	Opmode <= L[1];
 					end
 					always_ff @(posedge clk) begin
 						if (rst) 		Preg <= 0;
-						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[j][i-1];
+						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1];
 					end
 				end
 				else begin : genMid
 					if (PREG) begin : genPregBehav
 						always_ff @(posedge clk) begin
 							if (rst)		Preg <= 0;
-							else if (en)	Preg <= Mreg + pcout[j][i-1];
+							else if (en)	Preg <= Mreg + pcout[i][j-1];
 						end
 					end
-					else	assign Preg = Mreg + pcout[j][i-1];
+					else	assign Preg = Mreg + pcout[i][j-1];
 				end
-				assign pp = Preg;
-				assign pcout[j][i] = pp;
+				assign pcout[i][j] = Preg;
 			end : genBehav
 `ifndef VERILATOR
 			else begin: genDSP
@@ -307,7 +334,7 @@ module mvu_8sx9 #(
 					.BCOUT(),                           // 24-bit output: B cascade
 					.CARRYCASCOUT(),                    // 1-bit output: Cascade carry
 					.MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
-					.PCOUT(pcout[j][i]),                // 58-bit output: Cascade output
+					.PCOUT(pcout[i][j]),                // 58-bit output: Cascade output
 					// Control outputs: Control Inputs/Status Bits
 					.OVERFLOW(),                        // 1-bit output: Overflow in add/acc
 					.PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
@@ -322,7 +349,7 @@ module mvu_8sx9 #(
 					.BCIN('x),                          // 24-bit input: B cascade
 					.CARRYCASCIN('x),                   // 1-bit input: Cascade carry
 					.MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
-					.PCIN(FIRST ? 'x : pcout[j][i-1]),  // 58-bit input: P cascade
+					.PCIN(FIRST ? 'x : pcout[i][j-1]),  // 58-bit input: P cascade
 					// Control inputs: Control Inputs/Status Bits
 					.ALUMODE(4'h0),                     // 4-bit input: ALU control
 					.CARRYINSEL('0),                    // 3-bit input: Carry select
@@ -339,8 +366,8 @@ module mvu_8sx9 #(
 							7'b000_0000
 					}), // 9-bit input: Operation mode
 					// Data inputs: Data Ports
-					.A({ 7'bx, a_in_i[i] }),            // 34-bit input: A data
-					.B(b_in_i[j][i]),                   // 24-bit input: B data
+					.A({ 7'bx, a_in_i[j] }),            // 34-bit input: A data
+					.B(b_in_i[i][j]),                   // 24-bit input: B data
 					.C('x),                             // 58-bit input: C data
 					.CARRYIN('0),                       // 1-bit input: Carry-in
 					.D('x),                             // 27-bit input: D data

From ad63673cda849ecf0df993bc83d00e676998ab03 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 30 Jun 2023 15:45:26 +0100
Subject: [PATCH 144/235] [rtl mvu]: bug fix for SIMD=1 init_leave_loads

---
 finn-rtllib/mvu/mvu_4sx4u.sv       | 2 +-
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 4674576d23..ac95b5f8a9 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -296,7 +296,7 @@ module mvu_4sx4u #(
 		// Stage #4: Cross-SIMD Reduction
 
 		// Count leaves reachable from each node
-		localparam leave_load_t  LEAVE_LOAD = init_leave_loads();
+		localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree: fill with constant 1 rather than calling init_leave_loads, which otherwise ends up in an infinite loop. NOTE(review): mvu_8sx8u_dsp48 fills with 0 in the same situation - confirm which value is intended.
 
 		uwire signed [ACCU_WIDTH  -1:0]  up4;
 		uwire signed [ACCU_WIDTH  -8:0]  hi4[3];
diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 3cd9cef560..416c12c1cc 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -297,7 +297,7 @@ module mvu_8sx8u_dsp48 #(
 		// Stage #4: Cross-SIMD Reduction
 
 		// Count leaves reachable from each node
-		localparam leave_load_t  LEAVE_LOAD = init_leave_loads();
+		localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop
 
 		uwire signed [ACCU_WIDTH  -1:0]  up4;
 		uwire signed [ACCU_WIDTH  -SINGLE_PROD_WIDTH:0]  hi4;

From 79e8a5ef208f7bcdeafa231a5a3dff74177008c9 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 13 Jul 2023 18:34:05 +0100
Subject: [PATCH 145/235] [mvu rtl]: restrict index i to be less than 3 (within
 bounds of hi4)

---
 finn-rtllib/mvu/mvu_4sx4u.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index ac95b5f8a9..88985312c9 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -324,7 +324,7 @@ module mvu_4sx4u #(
 				end
 				assign	hi4[i] = Hi4;
 			end : genHi
-			else begin : genHiZero
+			else if (i < 3) begin : genHiZero
 				assign hi4[i] = '0;
 			end : genHiZero
 

From e3493c30529949a77a3f384fd75c030c551cd2cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Fri, 2 Jun 2023 12:47:53 +0100
Subject: [PATCH 146/235] Rewrite replay_buffer for input elasticity.

---
 finn-rtllib/mvu/replay_buffer.sv       | 153 ++++++++++++++++++-------
 finn-rtllib/mvu/tb/replay_buffer_tb.sv | 130 +++++++++++++++++++++
 2 files changed, 242 insertions(+), 41 deletions(-)
 create mode 100644 finn-rtllib/mvu/tb/replay_buffer_tb.sv

diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
index 89bbbdb88f..3dfe72d6c6 100644
--- a/finn-rtllib/mvu/replay_buffer.sv
+++ b/finn-rtllib/mvu/replay_buffer.sv
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -51,60 +51,131 @@ module replay_buffer #(
 	input	logic  ordy
 );
 
-	typedef logic [$clog2(REP)+$clog2(LEN)-1:0]  count_t;
-	count_t  Count = 0;
-	uwire  done_len = LEN == 1 ? 1 : ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0;
-	uwire  done_rep;
-	uwire  done_all = done_len && done_rep;
+	if(LEN == 0)  initial begin
+		$error("%m: Illegal zero sequence LEN.");
+		$finish;
+	end
+	if(REP == 0) initial begin
+		$error("%m: Illegal zero REP count.");
+		$finish;
+	end
 
+	// Track position in Sequence
+	uwire  last_item;
 	uwire  shift;
-	uwire  clr = rst || (done_all && shift);
-	always_ff @(posedge clk) begin
-		if(clr)         Count <= 0;
-		else if(shift)  Count <= Count + ((REP > 1) && done_len? 2**$clog2(LEN)-LEN+1 : 1);
+	if(LEN == 1)  assign  last_item = 1;
+	else begin
+		typedef logic [$clog2(LEN)-1:0]  count_t;
+		count_t  Count = 0;
+		logic    Last  = 0;
+		always_ff @(posedge clk) begin
+			if(rst) begin
+				Count <= 0;
+				Last  <= 0;
+			end
+			else if(shift) begin
+				Count <= Count + (Last? 2**$clog2(LEN)-LEN+1 : 1);
+				Last  <= (((LEN-2) & ~Count) == 0) && ((LEN&1) || !Last);
+			end
+		end
+		assign	last_item = Last;
 	end
 
-	typedef logic [W-1:0]  data_t;
-	uwire data_t  rdat;
-	uwire  first_rep;
 	if(REP == 1) begin
-		assign	done_rep  = 1;
-		assign	first_rep = 1;
-		assign	rdat = 'x;
+		assign	shift = ivld && ordy;
+
+		assign	irdy  = ordy;
+		assign	odat  = idat;
+		assign	olast = last_item;
+		assign	ofin  = last_item;
+		assign	ovld  = ivld;
 	end
 	else begin
-		assign	done_rep = ((REP-1) & ~Count[$left(Count):$clog2(LEN)]) == 0;
 
-		logic  FirstRep = 1;
+		// Track Repetitions
+		uwire  last_rep;
+		if(1) begin : blkRep
+			typedef logic [$clog2(REP)-1:0]  rep_t;
+			rep_t  RepCnt = 0;
+			logic  RepLst = 0;
+			always_ff @(posedge clk) begin
+				if(rst) begin
+					RepCnt <= 0;
+					RepLst <= 0;
+				end
+				else if(last_item && shift) begin
+					RepCnt <= RepCnt + (RepLst? 2**$clog2(REP)-REP+1 : 1);
+					RepLst <= (((REP-2) & ~RepCnt) == 0) && ((REP&1) || !RepLst);
+				end
+			end
+			assign	last_rep = RepLst;
+		end : blkRep
+
+		localparam int unsigned  AWIDTH = $clog2(LEN);
+		typedef logic [AWIDTH  :0]  ptr_t;	// pointers with additional generational MSB
+		typedef logic [W     -1:0]  data_t;
+
+		// Output Registers
+		data_t  ODat;
+		logic   OVld =  0;
+		logic   OLst = 'x;
+		logic   OFin = 'x;
+		assign	odat  = ODat;
+		assign	olast = OLst;
+		assign	ofin  = OFin;
+		assign	ovld  = OVld;
+
+		// Buffer Memory Management
+		data_t  Mem[2**AWIDTH];
+		ptr_t  WP = 0;	// Write Pointer
+		ptr_t  RP = 0;	// Read Pointer
+		ptr_t  FP = 0;	// Free Pointer
+
+		// Operational Guards
+		//	Occupancy:    WP-FP
+		//	  WP-FP < 2**AWIDTH -> writing allowed
+		//		- increments WP
+		//	Availability: WP-RP
+		//	  WP-RP > 0         -> reading allowed
+		//		- increments RP, last in sequence rewinds to FP for non-final repetition
+		//		- increments FP in last repetition
+		assign	irdy = !((WP-FP) >> AWIDTH);
+
+		uwire  wr = irdy && ivld;
+		uwire  rd = !OVld || ordy;
 		always_ff @(posedge clk) begin
-			if(clr)         FirstRep <= 1;
-			else if(shift)  FirstRep <= FirstRep && !done_len;
+			if(wr)  Mem[WP[AWIDTH-1:0]] <= idat;
+			if(rd)  ODat <= Mem[RP[AWIDTH-1:0]];
 		end
-		assign	first_rep = FirstRep;
 
-		data_t  Buf[LEN];
-		if(LEN == 1) begin : genTrivial
-			always_ff @(posedge clk) begin
-				if(shift && FirstRep)  Buf[0] <= idat;
+		uwire  vld = (RP != WP);
+		assign	shift = rd && vld;
+		always_ff @(posedge clk) begin
+			if(rst) begin
+				WP <= 0;
+				RP <= 0;
+				FP <= 0;
+
+				OVld <=  0;
+				OLst <= 'x;
+				OFin <= 'x;
 			end
-		end : genTrivial
-		else begin : genShift
-			always_ff @(posedge clk) begin
-				if(shift) begin
-					Buf[0] <= odat;
-					Buf[1:LEN-1] <= Buf[0:LEN-2];
+			else begin
+				if(wr)  WP <= WP + 1;
+				if(rd) begin
+					if(vld) begin
+						automatic logic  rewind = last_item && !last_rep;
+						RP <= RP + (rewind? 2**(AWIDTH+1)-LEN+1 : 1);
+						FP <= FP + last_rep;
+					end
+
+					OVld <= vld;
+					OLst <= last_item;
+					OFin <= last_rep && last_item;
 				end
 			end
-		end : genShift
+		end
 
-		assign	rdat = Buf[LEN-1];
 	end
 
-	assign  irdy  = ordy && first_rep;
-	assign	odat  = first_rep? idat : rdat;
-	assign	olast = done_len;
-	assign	ofin  = done_all;
-	assign	ovld  = first_rep? ivld : 1;
-	assign	shift = ovld && ordy;
-
-endmodule : replay_buffer
\ No newline at end of file
+endmodule : replay_buffer
diff --git a/finn-rtllib/mvu/tb/replay_buffer_tb.sv b/finn-rtllib/mvu/tb/replay_buffer_tb.sv
new file mode 100644
index 0000000000..5581354e0e
--- /dev/null
+++ b/finn-rtllib/mvu/tb/replay_buffer_tb.sv
@@ -0,0 +1,130 @@
+/******************************************************************************
+ * Copyright (C) 2023, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for replay_buffer module.
+ * @author	Thomas B. Preußer <thomas.preusser@amd.com>
+ *****************************************************************************/
+
+module replay_buffer_tb;
+
+	// Global Control
+	logic  clk = 0;
+	always #5ns clk = !clk;
+	uwire  rst = 0;
+
+	// DUT Geometries
+	localparam int unsigned  DIMS[3] = '{ 7, 8, 10 };
+	localparam int unsigned  W = 8;
+	typedef logic [W-1:0]  data_t;
+
+	bit [$size(DIMS)*$size(DIMS)-1:0]  done = 0;	// one completion flag per (r, l) DUT instance, indexed by $size(DIMS)*r + l
+	always_comb begin
+		if(&done) begin	// finish only once every instance has completed its output check
+			$display("Test completed.");
+			$finish;
+		end
+	end
+
+	// Parallel DUT Instantiations
+	for(genvar  r = 0; r < $size(DIMS); r++) begin
+		for(genvar  l = 0; l < $size(DIMS); l++) begin
+			localparam int unsigned  REP = DIMS[r];
+			localparam int unsigned  LEN = DIMS[l];
+
+			data_t  idat;
+			logic  ivld;
+			uwire  irdy;
+
+			uwire data_t  odat;
+			uwire  olast;
+			uwire  ofin;
+			uwire  ovld;
+			logic  ordy;
+
+			replay_buffer #(.LEN(LEN), .REP(REP), .W(W)) dut (
+				.clk, .rst,
+				.idat, .ivld, .irdy,
+				.odat, .olast, .ofin, .ovld, .ordy
+			);
+
+			// Input Feed: 0, 1, ..., 10*LEN-1
+			initial begin
+				idat = 'x;
+				ivld =  0;
+				@(posedge clk iff !rst);
+
+				for(int unsigned  i = 0; i < 10*LEN; i++) begin
+					idat <= i;
+					ivld <= 1;
+					@(posedge clk iff irdy);
+					idat <= 'x;
+					ivld <=  0;
+					while($urandom()%(REP-1) != 0) @(posedge clk);
+				end
+			end
+
+			// Output Check
+			initial begin
+				automatic int unsigned  base = 0;
+
+				ordy = 0;
+				@(posedge clk iff !rst);
+
+				for(int unsigned  k = 0; k < 10; k++) begin
+					for(int unsigned  j = 0; j < REP; j++) begin
+						for(int unsigned  i = 0; i < LEN; i++) begin
+							ordy <= 1;
+							@(posedge clk iff ovld);
+							assert(odat == base+i) else begin
+								$error("#%0d.%0d: Data mismatch: %0d instead of %0d.", r, l, odat, base+i);
+								$stop;
+							end
+							assert(olast == (i == LEN-1)) else begin
+								$error("#%0d.%0d: Last mismatch.", r, l);
+								$stop;
+							end
+							assert(ofin == ((i == LEN-1) && (j == REP-1))) else begin
+								$error("#%0d.%0d: Fin mismatch.", r, l);
+								$stop;
+							end
+
+							ordy <= 0;
+							while($urandom()%13 == 0) @(posedge clk);
+						end
+					end
+					base += LEN;
+				end
+
+				done[$size(DIMS)*r + l] <= 1;
+			end
+		end
+	end
+
+endmodule : replay_buffer_tb

From 2efba6854267873c84d58f6d8fe6b64f649eaa99 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Tue, 5 Sep 2023 13:53:01 +0100
Subject: [PATCH 147/235] [to-rtl]: Infer unique node names after
 transformation is applied

---
 .../transformation/fpgadataflow/specialize_to_rtl_layers.py     | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
index 23b6e59abe..47ed5ce863 100644
--- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
+++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
@@ -32,6 +32,7 @@
 from onnx import helper
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.general import GiveUniqueNodeNames
 from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth
 
 class InferRTLMatrixVectorActivation(Transformation):
@@ -105,5 +106,6 @@ def apply(self, model):
             model = model.transform(MinimizeAccumulatorWidth())
             model = model.transform(InferShapes())
             model = model.transform(InferDataTypes())
+            model = model.transform(GiveUniqueNodeNames())
         
         return (model, graph_modified)
\ No newline at end of file

From 114ea1bfed2dd2f14196f98aea97d6cac9d1d57e Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 18 Sep 2023 14:56:07 +0100
Subject: [PATCH 148/235] [mvu rtl]: add synthesis directive to handle 'X in
 simulation

---
 finn-rtllib/mvu/mvu_8sx9.sv | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index 439fbc44f9..34aa856b1b 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -110,13 +110,17 @@ module mvu_8sx9 #(
 			always_ff @(posedge clk) begin
 				if (rst)     A <= '{default: 0};
 				else if(en) begin
-					A[EXTERNAL_PREGS-1] <= a[3*i +: LANES_OCCUPIED];
+					A[EXTERNAL_PREGS-1] <= 
+// synthesis translate_off
+						zero ? '1 : 
+// synthesis translate_on						
+						a[3*i +: LANES_OCCUPIED];
 					if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
 				end
 			end
 			for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-				assign a_in_i[i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
-											: PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
+			assign a_in_i[i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
+												: PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
 			end : genAin
 			for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
 				assign a_in_i[i][9*j +: 9] = 9'b0;
@@ -124,8 +128,12 @@ module mvu_8sx9 #(
 		end : genExternalPregAct
 		else begin : genInpDSPAct
 			for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-				assign a_in_i[i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{a[3*i+j][ACTIVATION_WIDTH-1]}}, a[3*i+j] }
-											: PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[3*i+j] } ;
+				assign a_in_i[i][9*j +: 9] = 
+// synthesis translate_off
+					zero ? '1 : 				
+// synthesis translate_on
+					SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{a[3*i+j][ACTIVATION_WIDTH-1]}}, a[3*i+j] }
+												: PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[3*i+j] } ;
 			end : genAin
 			for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
 				assign a_in_i[i][9*j +: 9] = 9'b0;
@@ -148,7 +156,11 @@ module mvu_8sx9 #(
 				always_ff @(posedge clk) begin
 					if (rst)    B <= '{default: 0};
 					else if (en) begin
-						B[i][EXTERNAL_PREGS-1] <= w[i][3*j +: LANES_OCCUPIED];
+						B[i][EXTERNAL_PREGS-1] <= 
+// synthesis translate_off
+							zero ? '1 : 						
+// synthesis translate_on							
+							w[i][3*j +: LANES_OCCUPIED];
 						if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1];
 					end
 				end
@@ -161,7 +173,11 @@ module mvu_8sx9 #(
 			end : genExternalPregWeight
 			else begin : genInpDSPWeight
 				for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin
-					assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
+					assign b_in_i[i][j][8*k +: 8] = 
+// synthesis translate_off					
+						zero ? '1 : 
+// synthesis translate_on					
+						PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
 				end : genBin
 				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
 					assign b_in_i[i][j][8*k +: 8] = 8'b0;
@@ -178,9 +194,10 @@ module mvu_8sx9 #(
 			localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1;
 			localparam bit FIRST = j == 0;
 			localparam bit LAST = j == CHAINLEN-1;
+			uwire [57:0] pp;
 
 			if (LAST) begin : genPOUT
-				assign p[i] = pcout[i][j][ACCU_WIDTH-1:0];
+				assign p[i] = pp[ACCU_WIDTH-1:0];
 			end
 
 			// Note: Since the product B * AD is computed,
@@ -264,6 +281,7 @@ module mvu_8sx9 #(
 					end
 					else	assign Preg = Mreg + pcout[i][j-1];
 				end
+				assign pp = Preg;
 				assign pcout[i][j] = Preg;
 			end : genBehav
 `ifndef VERILATOR

From 79fafdb25a8707f740a0a7e21aa4f55ef7101882 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 18 Sep 2023 15:06:36 +0100
Subject: [PATCH 149/235] [replay buffer rtl]: minor fix to when LEN=1 (=
 AWIDTH=0)

---
 finn-rtllib/mvu/replay_buffer.sv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
index 3dfe72d6c6..942f1823ca 100644
--- a/finn-rtllib/mvu/replay_buffer.sv
+++ b/finn-rtllib/mvu/replay_buffer.sv
@@ -144,8 +144,8 @@ module replay_buffer #(
 		uwire  wr = irdy && ivld;
 		uwire  rd = !OVld || ordy;
 		always_ff @(posedge clk) begin
-			if(wr)  Mem[WP[AWIDTH-1:0]] <= idat;
-			if(rd)  ODat <= Mem[RP[AWIDTH-1:0]];
+			if(wr)  Mem[WP[AWIDTH:0]] <= idat;
+			if(rd)  ODat <= Mem[RP[AWIDTH:0]];
 		end
 
 		uwire  vld = (RP != WP);

From 619d9db0d5872d1afd72b1d1df841e1f87a9f33a Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 18 Sep 2023 15:09:45 +0100
Subject: [PATCH 150/235] [mvu lut]: LUT-based MVU compute core

---
 finn-rtllib/mvu/mvu_lut.sv | 102 +++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_lut.sv

diff --git a/finn-rtllib/mvu/mvu_lut.sv b/finn-rtllib/mvu/mvu_lut.sv
new file mode 100644
index 0000000000..b100a589e8
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_lut.sv
@@ -0,0 +1,102 @@
+module mvu_lut #(	// LUT-based MVU core: per PE, SIMD parallel multipliers -> adder tree -> accumulator
+	int unsigned  PE,	// number of parallel output lanes
+	int unsigned  SIMD,	// number of products summed per lane and cycle
+	int unsigned  ACCU_WIDTH,	// width of each accumulator / output lane
+    int unsigned  ACTIVATION_WIDTH,
+    int unsigned  WEIGHT_WIDTH,
+    bit  SIGNED_ACTIVATIONS,	// 1: activations are signed; 0: unsigned (zero-extended before multiplying)
+    bit  M_REG = 1,	// 1: register the products ahead of the adder tree
+
+    localparam unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH
+)(
+	// Global Control
+	input	logic  clk,
+	input	logic  rst,
+	input	logic  en,
+
+	// Input
+	input	logic  last,
+	input	logic  zero,	// ignore current inputs and force this partial product to zero
+	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]      w,	// signed weights
+	input	logic                [SIMD-1:0][ACTIVATION_WIDTH-1:0]  a,	// (un)signed activations
+
+	// Output
+	output	logic  vld,
+	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
+);
+    // Leaf count below each node of the heap-indexed adder tree (children of node n are 2n+1 and 2n+2)
+    typedef int unsigned  leave_load_t[2*SIMD-1];
+    function leave_load_t init_leave_loads();
+        automatic leave_load_t  res;
+        for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;	// the SIMD leaves
+        for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];	// inner nodes, bottom-up
+        return res;
+    endfunction : init_leave_loads
+
+    // Pipeline for last indicator flag
+    uwire last_i;
+    generate if (M_REG) begin
+        logic [0:1] L = '0;	// two stages: product register + accumulator
+        always_ff @(posedge clk) begin
+            if(rst)       L <= '0;
+            else if (en)  L <= {last, L[0]};
+        end
+        assign  last_i = L[1];
+    end
+    else begin
+        logic L = '0;	// one stage: accumulator only
+        always_ff @(posedge clk) begin
+            if(rst)       L <= '0;
+            else if (en)  L <= last;
+        end
+        assign  last_i = L;
+    end
+    endgenerate
+    assign  vld = last_i;	// single driver: must not be replicated per PE as vld is a variable
+    // For each PE generate
+    for (genvar  i = 0; i < PE; i++)  begin : genPE
+        // Stage #1: SIMD multipliers in parallel
+        uwire [MULT_WIDTH-1 : 0] m1 [SIMD];
+        for (genvar j = 0; j < SIMD; j++) begin : genSIMD
+            if (M_REG) begin : genMreg
+                logic [MULT_WIDTH-1 : 0] M [SIMD];
+                always_ff @(posedge clk) begin
+                    if(rst)         M[j] <= '{ default : 0 };	// non-blocking: M is read by other processes
+                    else if (en)    M[j] <= zero ? 0 :
+                                            SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) :
+                                                                 $signed({1'b0, a[j]}) * $signed(w[i][j]);
+                    // (SIGNED_ACTIVATIONS ? $signed(a[j]) : a[j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication
+                end
+                assign  m1[j] = M[j];
+            end : genMreg
+            else begin : genNoMreg
+                assign m1[j] = zero ? 0 :
+                               SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) :
+                                                    $signed({1'b0, a[j]}) * $signed(w[i][j]);
+            end : genNoMreg
+        end : genSIMD
+
+        // Stage #2: Adder tree to reduce SIMD products
+        localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 };
+        localparam int unsigned  ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1));
+        uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
+        for(genvar s = 0; s < SIMD; s++)  assign  tree[SIMD-1+s] = $signed(m1[s]);
+        for(genvar n = 0; n < SIMD-1; n++) begin
+            // Sum truncated to actual maximum bit width at this node
+            localparam int unsigned  NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1));
+            uwire signed [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
+            assign tree[n] = s;
+        end
+
+        // Stage #3: Buffer output
+        logic [ACCU_WIDTH-1:0] P2 [PE];
+        always_ff @(posedge clk) begin
+            if(rst)         P2[i] <= '{ default : 0};
+            else if (en)    P2[i] <= (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]);	// restart the sum after a completed one
+        end
+
+        // vld is driven once at module level from last_i, which is common to all PEs
+        assign  p[i] = P2[i];
+    end : genPE
+
+endmodule : mvu_lut

From 090f2ac4adf4b0523b23b27fce05f7422269d72a Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Tue, 19 Sep 2023 12:23:55 +0100
Subject: [PATCH 151/235] [custom op]: add preferred_backend attribute

---
 src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
index 73d39ce642..4f24d71ccc 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
@@ -123,7 +123,7 @@ def get_nodeattr_types(self):
             # weight data from the weight FIFOs.
             "runtime_writeable_weights": ("i", False, 0, {0, 1}),
             # Flag to specify whether RTL-based or HLS-based implementation is preferred
-            "impl": ("s", False, "rtl", {"hls", "rtl"})
+            "preferred_backend": ("s", False, "rtl", {"hls", "rtl"})
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs

From ac5e82d9944f5b7475eb13546affd1bc03d57f4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Thu, 21 Sep 2023 13:03:27 +0100
Subject: [PATCH 152/235] Ensure a minimum of two buffer slots even for
 length-1 sequences.

---
 finn-rtllib/mvu/replay_buffer.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
index 942f1823ca..d4342f705c 100644
--- a/finn-rtllib/mvu/replay_buffer.sv
+++ b/finn-rtllib/mvu/replay_buffer.sv
@@ -111,7 +111,7 @@ module replay_buffer #(
 			assign	last_rep = RepLst;
 		end : blkRep
 
-		localparam int unsigned  AWIDTH = $clog2(LEN);
+		localparam int unsigned  AWIDTH = LEN < 2? 1 : $clog2(LEN);
 		typedef logic [AWIDTH  :0]  ptr_t;	// pointers with additional generational MSB
 		typedef logic [W     -1:0]  data_t;
 

From 85156935163fc803d453db5ce2c1c5163808bc9f Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 15:07:12 +0100
Subject: [PATCH 153/235] [rtl mvu wrapper]: support for vvu layer and rename

---
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 92 +++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v

diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
new file mode 100644
index 0000000000..6dbf82cb7b
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -0,0 +1,92 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Verilog AXI-lite wrapper for MVU.
+ *****************************************************************************/
+
+module $MODULE_NAME_AXI_WRAPPER$ #(
+	parameter	IS_MVU = "$IS_MVU$",	// quoted template value, i.e. a string such as "1" rather than the integer 1
+	parameter	COMPUTE_CORE = "$COMPUTE_CORE$",
+	parameter	MW = $MW$,
+	parameter	MH = $MH$,
+	parameter	PE = $PE$,
+	parameter	SIMD = $SIMD$,
+	parameter	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
+	parameter	WEIGHT_WIDTH = $WEIGHT_WIDTH$,
+	parameter	ACCU_WIDTH = $ACCU_WIDTH$,
+	parameter	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
+	parameter	SEGMENTLEN = $SEGMENTLEN$,
+	parameter	FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$,
+
+	// Safely deducible parameters (stream widths rounded up to whole bytes)
+	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	parameter 	INPUT_STREAM_WIDTH_BA = (((IS_MVU == 1 || IS_MVU == "1") ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,	// accept numeric 1 and string "1"
+	parameter 	OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+)(
+	// Global Control
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
+	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+	input	ap_clk,
+	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+	input	ap_rst_n,
+
+	// Weight Stream
+	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  weights_V_TDATA,
+	input   weights_V_TVALID,
+	output  weights_V_TREADY,
+	// Input Stream
+	input	[INPUT_STREAM_WIDTH_BA-1:0]  in0_V_TDATA,
+	input	in0_V_TVALID,
+	output	in0_V_TREADY,
+	// Output Stream
+	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  out_V_TDATA,
+	output	out_V_TVALID,
+	input	out_V_TREADY
+);
+
+mvu_vvu_axi #(	// IS_MVU is normalized to a 1-bit flag instead of relying on truncation of a string value
+	.IS_MVU(IS_MVU == 1 || IS_MVU == "1"), .COMPUTE_CORE(COMPUTE_CORE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD),
+	.ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
+	.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
+	) inst (
+	.ap_clk(ap_clk),
+	.ap_rst_n(ap_rst_n),
+	.s_axis_weights_tdata(weights_V_TDATA),
+	.s_axis_weights_tvalid(weights_V_TVALID),
+	.s_axis_weights_tready(weights_V_TREADY),
+	.s_axis_input_tdata(in0_V_TDATA),
+	.s_axis_input_tvalid(in0_V_TVALID),
+	.s_axis_input_tready(in0_V_TREADY),
+	.m_axis_output_tdata(out_V_TDATA),
+	.m_axis_output_tvalid(out_V_TVALID),
+	.m_axis_output_tready(out_V_TREADY)
+);
+
+endmodule // $MODULE_NAME_AXI_WRAPPER$

From cf28d780041fec1effdf743e62390eebc5c81f98 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:24:18 +0100
Subject: [PATCH 154/235] [mvu vvu tb]: modified testbench to also support
 testing VVU on DSP58

---
 finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv | 222 +++++++++++++++++++++++++++
 1 file changed, 222 insertions(+)
 create mode 100644 finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv

diff --git a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
new file mode 100644
index 0000000000..82c2e8e7b0
--- /dev/null
+++ b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
@@ -0,0 +1,222 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for MVU AXI-lite interface wrapper.
+ *****************************************************************************/
+
+module mvu_vvu_axi_tb();
+// Feeds random weights and activations into mvu_vvu_axi and checks every output word against a behavioral golden model.
+//-------------------- Simulation parameters --------------------\\
+	// Matrix & parallelism config
+	localparam bit IS_MVU = 1;
+	localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58";
+	localparam int unsigned MW = 1500;
+	localparam int unsigned MH = 256;
+	localparam int unsigned SIMD = 60;
+	localparam int unsigned PE = 16;
+	localparam int unsigned SEGMENTLEN = 2;
+	localparam bit FORCE_BEHAVIORAL = 1;
+	localparam bit M_REG_LUT = 1;
+	// Bit-width config
+	localparam int unsigned ACTIVATION_WIDTH = 4;
+	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam int unsigned ACCU_WIDTH = 21; // >= ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW), which is 19 for this config
+	localparam bit SIGNED_ACTIVATIONS = 0;
+	// Simulation constants
+	localparam int unsigned NF = MH/PE;
+	localparam int unsigned SF = MW/SIMD;
+	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
+	localparam int unsigned ACTIVATION_WIDTH_BA = ((IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH+7)/8*8;
+	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
+	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - (IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH;
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
+
+	// Generate clk and reset signal
+	logic clk = 0;
+	always #5ns clk = !clk;
+
+	logic ap_rst_n = 0;
+	initial begin
+		repeat(16) @(posedge clk);	// hold reset for 16 clock cycles
+		ap_rst_n <= 1;
+	end
+
+	uwire ap_clk = clk;
+
+	// Generate activations
+	typedef logic [(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+	typedef activation_t activation_vector_t[(IS_MVU ? 1 : NF)*SF];
+
+	function activation_vector_t init_ACTIVATIONS;
+		automatic activation_vector_t res;
+		void'(std::randomize(res));	// success flag deliberately discarded
+		return res;
+	endfunction : init_ACTIVATIONS
+
+	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
+
+	struct {
+		activation_t dat;
+		logic vld;
+		logic rdy;
+	} activations;
+
+	initial begin
+		activations.vld = 0;
+		activations.dat = 'X;
+		@(posedge clk iff ap_rst_n);
+
+		for (int j=0; j<(IS_MVU ? 1 : NF); j++) begin
+			for (int i=0; i<SF; i++) begin
+				activations.dat <= ACTIVATIONS[SF*j+i];
+				do begin	// hold the word until it has been accepted (vld && rdy)
+					activations.vld <= $urandom()%7 >= 0;	// NOTE(review): always true for an unsigned value, so no stalls are injected; use e.g. "!= 0" if random stalls are intended
+					@(posedge clk);
+				end while (!(activations.vld === 1 && activations.rdy === 1));
+			end
+		end
+
+		activations.vld <= 0;
+		activations.dat <= 'x;
+	end
+
+	// Generate weights
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+	typedef weight_t weight_matrix_t[NF][SF];
+
+	function weight_matrix_t init_WEIGHTS;
+		automatic weight_matrix_t res;
+		void'(std::randomize(res));	// success flag deliberately discarded
+		return res;
+	endfunction : init_WEIGHTS
+
+	weight_matrix_t WEIGHTS = init_WEIGHTS();
+
+	struct {
+		weight_t dat;
+		logic vld;
+		logic rdy;
+	} weights;
+
+	initial begin
+		weights.vld = 0;
+		weights.dat = 'X;
+		@(posedge clk iff ap_rst_n);
+
+		weights.vld <= 1;	// weights are offered continuously; only rdy paces them
+		for (int i=0; i<NF; i++) begin
+			for (int j=0; j<SF; j++) begin
+				weights.dat <= WEIGHTS[i][j];
+				@(posedge clk iff weights.rdy);
+			end
+		end
+
+		weights.vld <= 0;
+		weights.dat <= 'x;
+	end
+
+	// Function to compute golden output
+	// a: [(IS_MVU?1:NF)*SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
+	typedef output_t output_vector_t [NF];
+
+	struct {
+		output_t dat;
+		logic vld;
+		logic rdy;
+	} outputs;
+
+	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+		automatic output_vector_t res = '{default: 0};
+		for (int j = 0; j<MH; j++) begin
+			for (int i = 0; i<MW; i++) begin
+				if (SIGNED_ACTIVATIONS)
+					res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) :
+											   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed(a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]) : $signed(a[j/PE*SF+i/SIMD][i%SIMD]) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+				else	// unsigned activations: zero-extend by one bit ahead of the signed multiply
+					res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) :
+											   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]}) : $signed({1'b0, a[j/PE*SF+i/SIMD][i%SIMD]}) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+			end
+		end
+		return res;
+	endfunction : check_output
+
+	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+
+	int unsigned NF_CNT = 0;	// number of output words checked so far
+	initial begin
+		outputs.rdy = 0;
+		while (NF_CNT < NF) begin
+			// Loop until both rdy & vld are asserted
+			do begin
+				outputs.rdy <= $urandom()%7 >= 0;	// NOTE(review): always true for an unsigned value, so no backpressure is applied; use e.g. "!= 0" if intended
+				@(posedge clk iff ap_rst_n);
+			end while (!(outputs.rdy === 1 && outputs.vld === 1));
+
+			// Compare produced outputs against golden outputs
+			foreach(outputs.dat[i]) begin
+				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+				else begin
+					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+					$stop;
+				end
+			end
+
+			NF_CNT += 1;
+		end
+
+		$finish;
+	end
+
+	// Instantiate DUT
+	mvu_vvu_axi #(
+		.IS_MVU(IS_MVU),
+		.COMPUTE_CORE(COMPUTE_CORE),
+		.MW(MW),
+		.MH(MH),
+		.PE(PE),
+		.SIMD(SIMD),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.SEGMENTLEN(SEGMENTLEN),
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL),
+		.M_REG_LUT(M_REG_LUT)
+	)
+	dut (
+		.ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
+		.s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
+		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
+		.m_axis_output_tready(outputs.rdy)
+	);
+
+endmodule : mvu_vvu_axi_tb

From 2617c391e1d2c9b19fb881acb6012fc56df35eae Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:25:22 +0100
Subject: [PATCH 155/235] [axi wrapper]: minor modification to comment
 description

---
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
index 6dbf82cb7b..788e49a71b 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -28,7 +28,7 @@
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * @brief	Verilog AXI-lite wrapper for MVU.
+ * @brief	Verilog AXI-lite wrapper for MVU & VVU.
  *****************************************************************************/
 
 module $MODULE_NAME_AXI_WRAPPER$ #(

From 8ca5fe73c003aec3e7998d83e233102c012dd531 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:34:12 +0100
Subject: [PATCH 156/235] [mvu axi]: add support for VVU on DSP58

---
 finn-rtllib/mvu/mvu_axi.sv | 105 ++++++++++++++++++++++++-------------
 1 file changed, 69 insertions(+), 36 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
index 46167af95b..07ad32e6c8 100644
--- a/finn-rtllib/mvu/mvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_axi.sv
@@ -28,19 +28,25 @@
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * @brief	Matrix Vector Unit (MVU) AXI-lite interface wrapper.
+ * @brief	Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper.
  * @details
+ *	 The following compute cores are supported:
+ *   - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, 
+ *     (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP,
+ *     [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP,
+ *     'unconstrained' LUT-based MVU and VVU.
  *  Folding hints:
- *	 - 4-bit MVU:          PE scaling should divide MH.
- *	 - 8-bit MVU - DSP48:  PE scaling should divide MH.
- *	 - 8-bit MVU - DSP58:  SIMD scaling should aim at a full multiple of 3 and divide MW.
+ *	 - PE scaling should divide MH.
+ *   - SIMD scaling should divide MW.
  *	 - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to
  *	   impact critical paths more than PE scaling. PE scaling implies a
  *	   bigger fanout on the input activations.
  *	 - Full unfolding along MH (PE=MH) results in no replay buffer instantiated
  *****************************************************************************/
 
-module mvu_axi #(
+module mvu_vvu_axi #(
+	bit IS_MVU, // string type causes error in Vivado
+	parameter COMPUTE_CORE,
 	int unsigned MW,
 	int unsigned MH,
 	int unsigned PE,
@@ -51,16 +57,16 @@ module mvu_axi #(
 	bit SIGNED_ACTIVATIONS = 0,
 	int unsigned SEGMENTLEN = 0,
 	bit FORCE_BEHAVIORAL = 0,
-	parameter MVU_IMPL_STYLE, // string type causes error in Vivado
+	bit M_REG_LUT = 1,
 
+	// Safely deducible parameters
 	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
 	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
-	localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
+	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
 	localparam int unsigned SF = MW/SIMD,
 	localparam int unsigned NF = MH/PE,
-	localparam int unsigned OUTPUT_LANES = PE,
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
 )
 (
 	// Global Control
@@ -93,27 +99,31 @@ module mvu_axi #(
 			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
 			$finish;
 		end
-		if (ACTIVATION_WIDTH > 9) begin
-			$error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
-			$finish;
-		end
 		if (WEIGHT_WIDTH > 8) begin
 			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
 			$finish;
 		end
-		if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin
-			$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
-			$finish;
+		if (ACTIVATION_WIDTH > 8) begin
+			if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin
+				$error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH);
+				$finish;
+			end
 		end
-		if (MVU_IMPL_STYLE == "mvu_8sx9") begin
+		if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin
 			if (SEGMENTLEN == 0) begin
-				$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
+				$warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
 			end
 			if (SEGMENTLEN > (SIMD+2)/3) begin
 				$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
 				$finish;
 			end
 		end
+		if (!IS_MVU) begin
+			if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin
+				$error("VVU only supported on DSP58 or LUT-based implementation");
+				$finish;
+			end
+		end
 	end
 
 	uwire clk = ap_clk;
@@ -127,10 +137,10 @@ module mvu_axi #(
 	uwire avld;
 	uwire ardy;
 
-	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay (
-		.clk, .rst,
-		.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
-		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
+	replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay (
+	.clk, .rst,
+	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
+	.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
 	);
 
 //-------------------- Input control --------------------\\
@@ -139,37 +149,60 @@ module mvu_axi #(
 	assign ardy = en && s_axis_weights_tvalid;
 	assign s_axis_weights_tready = en && avld;
 
-//-------------------- Core MVU --------------------\\
+//-------------------- Core MVU/VVU --------------------\\
 	uwire ovld;
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
 	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
-
-	case(MVU_IMPL_STYLE)
-	"mvu_8sx9_dsp58":
-		mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+	uwire mvauin_t amvau_i;
+
+	if (IS_MVU) begin : genMVUInput
+		assign  amvau_i = amvau;
+	end : genMVUInput
+	else begin : genVVUInput
+		// The input stream will have the channels interleaved for VVU when PE>1
+		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
+		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
+		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
+		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i)
+		localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH;
+		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
+			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
+									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
+									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
+		end : genRewire
+	end : genVVUInput
+
+	case(COMPUTE_CORE)
+	"mvu_vvu_8sx9_dsp58":
+		mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
-
 	"mvu_4sx4u":
 		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
-
 	"mvu_8sx8u_dsp48":
 		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
+			.vld(ovld), .p(odat)
+		);
+	"mvu_vvu_lut":
+		mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
+			.clk, .rst, .en,
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
 	default: initial begin
-		$error("Unrecognized MVU_IMPL_STYLE '%s'", MVU_IMPL_STYLE);
+		$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
 		$finish;
 	end
 	endcase
@@ -203,7 +236,7 @@ module mvu_axi #(
 
 	assign	b_load = !B.vld || m_axis_output_tready;
 	always_ff @(posedge clk) begin
-		if(rst)		B <= '{ default: 'x };
+		if(rst)		B <= '{ vld: 0, default: 'x };
 		else begin
 			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
 		end
@@ -212,4 +245,4 @@ module mvu_axi #(
 	assign	m_axis_output_tvalid = B.vld;
 	assign	m_axis_output_tdata  = B.dat;
 
-endmodule : mvu_axi
+endmodule : mvu_vvu_axi

From 32d6338c626b26d2e48cdb21cde438d557cc9bcd Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:34:36 +0100
Subject: [PATCH 157/235] [mvu vvu axi]: renamed file for consistency purposes

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 248 +++++++++++++++++++++++++++++++++
 1 file changed, 248 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_vvu_axi.sv

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
new file mode 100644
index 0000000000..07ad32e6c8
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -0,0 +1,248 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-Stream interface wrapper.
+ * @details
+ *	 The following compute cores are supported:
+ *   - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, 
+ *     (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP,
+ *     [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP,
+ *     'unconstrained' LUT-based MVU and VVU.
+ *  Folding hints:
+ *	 - PE scaling should divide MH.
+ *   - SIMD scaling should divide MW.
+ *	 - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to
+ *	   impact critical paths more than PE scaling. PE scaling implies a
+ *	   bigger fanout on the input activations.
+ *	 - Full unfolding along MH (PE=MH) results in no replay buffer instantiated
+ *****************************************************************************/
+
+module mvu_vvu_axi #(
+	bit IS_MVU, // string type causes error in Vivado
+	parameter COMPUTE_CORE,
+	int unsigned MW,
+	int unsigned MH,
+	int unsigned PE,
+	int unsigned SIMD,
+	int unsigned ACTIVATION_WIDTH,
+	int unsigned WEIGHT_WIDTH,
+	int unsigned ACCU_WIDTH,
+	bit SIGNED_ACTIVATIONS = 0,
+	int unsigned SEGMENTLEN = 0,
+	bit FORCE_BEHAVIORAL = 0,
+	bit M_REG_LUT = 1,
+
+	// Safely deducible parameters
+	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
+	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
+	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
+	localparam int unsigned SF = MW/SIMD,
+	localparam int unsigned NF = MH/PE,
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+)
+(
+	// Global Control
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,
+
+	// Weight Stream
+	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	logic  s_axis_weights_tvalid,
+	output	logic  s_axis_weights_tready,
+
+	// Input Stream
+	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	logic  s_axis_input_tvalid,
+	output	logic  s_axis_input_tready,
+
+	// Output Stream
+	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
+	output	logic  m_axis_output_tvalid,
+	input	logic  m_axis_output_tready
+);
+
+//-------------------- Parameter sanity checks --------------------\\
+	initial begin	// One-shot parameter validation: each violation is reported via $error and ends the run via $finish.
+		if (MW % SIMD != 0) begin	// SF = MW/SIMD must be exact
+			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
+			$finish;
+		end
+		if (MH % PE != 0) begin	// NF = MH/PE must be exact
+			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
+			$finish;
+		end
+		if (WEIGHT_WIDTH > 8) begin
+			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
+			$finish;
+		end
+		if (ACTIVATION_WIDTH > 8) begin	// beyond 8 bits, only signed 9-bit activations on the DSP58 core are accepted
+			if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin
+				$error("Activation width of %0d-bits exceeds maximum of 8-bits (9-bits only for signed numbers on DSP58)", ACTIVATION_WIDTH);
+				$finish;
+			end
+		end
+		if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin	// SEGMENTLEN is only checked for the DSP58 core; (SIMD+2)/3 is its chain length
+			if (SEGMENTLEN == 0) begin
+				$warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+			end
+			if (SEGMENTLEN > (SIMD+2)/3) begin
+				$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+				$finish;
+			end
+		end
+		if (!IS_MVU) begin	// VVU mode is restricted to the two cores named below
+			if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin
+				$error("VVU only supported on DSP58 or LUT-based implementation");
+				$finish;
+			end
+		end
+	end
+
+	uwire clk = ap_clk;
+	uwire rst = !ap_rst_n;
+
+	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;
+
+	uwire mvauin_t amvau;
+	uwire alast;
+	uwire afin;
+	uwire avld;
+	uwire ardy;
+
+	replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay (
+	.clk, .rst,
+	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
+	.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
+	);
+
+//-------------------- Input control --------------------\\
+	uwire en;
+	uwire istb = avld && s_axis_weights_tvalid;
+	assign ardy = en && s_axis_weights_tvalid;
+	assign s_axis_weights_tready = en && avld;
+
+//-------------------- Core MVU/VVU --------------------\\
+	uwire ovld;
+	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
+	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
+	uwire mvauin_t amvau_i;
+
+	if (IS_MVU) begin : genMVUInput
+		assign  amvau_i = amvau;	// MVU: activations are forwarded unchanged
+	end : genMVUInput
+	else begin : genVVUInput
+		// The input stream will have the channels interleaved for VVU when PE>1
+		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
+		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
+		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
+		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i)
+		localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH;
+		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
+			// Destination element i = pe*SIMD + simd reads source element simd*PE + pe (identity when PE == 1).
+			localparam int SRC_ELEMENT = i/SIMD + (i*PE % num_of_elements);
+			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = amvau[SRC_ELEMENT*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
+		end : genRewire
+	end : genVVUInput
+
+	case(COMPUTE_CORE)
+	"mvu_vvu_8sx9_dsp58":
+		mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+			.clk, .rst, .en,
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
+			.vld(ovld), .p(odat)
+		);
+	"mvu_4sx4u":
+		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+			.clk, .rst, .en,
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
+			.vld(ovld), .p(odat)
+		);
+	"mvu_8sx8u_dsp48":
+		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+			.clk, .rst, .en,
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
+			.vld(ovld), .p(odat)
+		);
+	"mvu_vvu_lut":
+		mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
+			.clk, .rst, .en,
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
+			.vld(ovld), .p(odat)
+		);
+	default: initial begin
+		$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
+		$finish;
+	end
+	endcase
+
+//-------------------- Output register slice --------------------\\
+	struct packed {	// Stage A: first output register, loaded from the compute core (ovld/odat)
+		logic vld;
+		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+	} A = '{ vld: 0, default: 'x};
+
+	assign en = !A.vld || !ovld;	// core is enabled unless A is occupied while the core presents a valid result
+
+	uwire  b_load;
+	always_ff @(posedge clk) begin
+		if(rst)		A <= '{ vld: 0, default: 'x };
+		else if(!A.vld || b_load) begin	// A is refreshed when it is empty or stage B takes its content this cycle
+			A.vld <= ovld && en;
+			for(int unsigned  i = 0; i < PE; i++) begin
+				// CR-1148862:
+				// A.dat[i] <= odat[i];
+				automatic logic [ACCU_WIDTH-1:0]  v = odat[i];
+				A.dat[i] <= v[ACCU_WIDTH-1:0];
+			end
+		end
+	end
+
+	struct packed {	// Stage B: second output register, drives the m_axis_output_* ports
+		logic vld;
+		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+	} B = '{ vld: 0, default: 'x};
+
+	assign	b_load = !B.vld || m_axis_output_tready;	// B accepts new data when empty or when the consumer is ready
+	always_ff @(posedge clk) begin
+		if(rst)		B <= '{ vld: 0, default: 'x };
+		else begin
+			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
+		end
+	end
+
+	assign	m_axis_output_tvalid = B.vld;
+	assign	m_axis_output_tdata  = B.dat;	// port is OUTPUT_STREAM_WIDTH_BA bits wide (PE*ACCU_WIDTH rounded up to a byte multiple)
+
+endmodule : mvu_vvu_axi

From 031406d73fa36a02638a94affd6a0bef36956c3c Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:39:22 +0100
Subject: [PATCH 158/235] [mvu 8sx9]: added support for VVU on DSP58, resolved
 PyVerilator-caused error and added synthesis directive to handle 'X in input
 data

---
 finn-rtllib/mvu/mvu_8sx9.sv | 100 +++++++++++++++++++-----------------
 1 file changed, 52 insertions(+), 48 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index 34aa856b1b..52a93739d6 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -31,7 +31,8 @@
  * @brief	Matrix Vector Unit (MVU) core compute kernel utilizing DSP58.
  *****************************************************************************/
 
-module mvu_8sx9 #(
+module mvu_vvu_8sx9 #(
+	parameter IS_MVU,
     int unsigned PE,
     int unsigned SIMD,
     int unsigned ACTIVATION_WIDTH,
@@ -39,7 +40,9 @@ module mvu_8sx9 #(
 	int unsigned ACCU_WIDTH,
     bit SIGNED_ACTIVATIONS = 0,
     int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
-	bit FORCE_BEHAVIORAL = 0
+	bit FORCE_BEHAVIORAL = 0,
+
+	int unsigned  ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
   )
   (
     // Global Control
@@ -51,7 +54,7 @@ module mvu_8sx9 #(
     input   logic last,
     input   logic zero, // ignore current inputs and force this partial product to zero
     input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights
-	input   logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // activations
+	input   logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations
 
 	// Ouput
 	output  logic vld,
@@ -67,9 +70,10 @@ module mvu_8sx9 #(
 //-------------------- Declare global signals --------------------\\
 	localparam int unsigned CHAINLEN = (SIMD+2)/3;
 	localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
-	uwire [26:0] a_in_i [CHAINLEN];
+	localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE;
+	uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN];
 	uwire [23:0] b_in_i [PE][CHAINLEN];
-	uwire [57:0] pcout [PE][CHAINLEN];
+	uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator
 
 //-------------------- Shift register for opmode select signal --------------------\\
 	localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
@@ -99,48 +103,48 @@ module mvu_8sx9 #(
 
 //-------------------- Buffer for input activations --------------------\\
 	localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
+	for (genvar k=0; k<PE_ACTIVATION; k++) begin : genActPE
+		for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
+			localparam int TOTAL_PREGS = i/SEGLEN;
+			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
+			localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3;
 
-	for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
-		localparam int TOTAL_PREGS = i/SEGLEN;
-		localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
-		localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3;
-
-		if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
-			logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0};
-			always_ff @(posedge clk) begin
-				if (rst)     A <= '{default: 0};
-				else if(en) begin
-					A[EXTERNAL_PREGS-1] <= 
-// synthesis translate_off
-						zero ? '1 : 
-// synthesis translate_on						
-						a[3*i +: LANES_OCCUPIED];
-					if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
+			if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
+				logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0};
+				always_ff @(posedge clk) begin
+					if (rst)     A <= '{default: 0};
+					else if(en) begin
+						A[EXTERNAL_PREGS-1] <= 
+	// synthesis translate_off
+							zero ? '1 : 
+	// synthesis translate_on						
+							a[SIMD*k + 3*i +: LANES_OCCUPIED];
+						if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
+					end
 				end
-			end
-			for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-			assign a_in_i[i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
-												: PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
-			end : genAin
-			for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
-				assign a_in_i[i][9*j +: 9] = 9'b0;
-			end : genAinZero
-		end : genExternalPregAct
-		else begin : genInpDSPAct
-			for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-				assign a_in_i[i][9*j +: 9] = 
-// synthesis translate_off
-					zero ? '1 : 				
-// synthesis translate_on
-					SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{a[3*i+j][ACTIVATION_WIDTH-1]}}, a[3*i+j] }
-												: PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[3*i+j] } ;
-			end : genAin
-			for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
-				assign a_in_i[i][9*j +: 9] = 9'b0;
-			end : genAinZero
-		end : genInpDSPAct
-
-	end : genActSIMD
+				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+				assign a_in_i[CHAINLEN*k+i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
+													  : PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
+				end : genAin
+				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0;
+				end : genAinZero
+			end : genExternalPregAct
+			else begin : genInpDSPAct
+				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 
+	// synthesis translate_off
+						zero ? '1 : 				
+	// synthesis translate_on
+						SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{a[SIMD*k+3*i+j][ACTIVATION_WIDTH-1]}}, a[SIMD*k+3*i+j] }
+													: PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[SIMD*k+3*i+j] } ;
+				end : genAin
+				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0;
+				end : genAinZero
+			end : genInpDSPAct
+		end : genActSIMD
+	end : genActPE
 
 //-------------------- Buffer for weights --------------------\\
 	localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;
@@ -209,7 +213,7 @@ module mvu_8sx9 #(
 				always_ff @(posedge clk) begin
 					if (rst)	Areg <= '{ default : 0};
 					else if (en) begin
-						Areg[0] <= { 7'bx, a_in_i[j] };
+						Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] };
 						if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0];
 					end
 				end
@@ -384,7 +388,7 @@ module mvu_8sx9 #(
 							7'b000_0000
 					}), // 9-bit input: Operation mode
 					// Data inputs: Data Ports
-					.A({ 7'bx, a_in_i[j] }),            // 34-bit input: A data
+					.A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }),            // 34-bit input: A data
 					.B(b_in_i[i][j]),                   // 24-bit input: B data
 					.C('x),                             // 58-bit input: C data
 					.CARRYIN('0),                       // 1-bit input: Carry-in
@@ -420,4 +424,4 @@ module mvu_8sx9 #(
 		end : genDSPChain
 	end : genDSPPE
 
-endmodule : mvu_8sx9
+endmodule : mvu_vvu_8sx9

From e2c1f1589c374a2fd7d0eb17621568621ea88bda Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:39:52 +0100
Subject: [PATCH 159/235] [mvu vvu 8sx9]: renamed compute core for consistency

---
 finn-rtllib/mvu/mvu_vvu_8sx9.sv | 427 ++++++++++++++++++++++++++++++++
 1 file changed, 427 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_vvu_8sx9.sv

diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9.sv b/finn-rtllib/mvu/mvu_vvu_8sx9.sv
new file mode 100644
index 0000000000..52a93739d6
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_vvu_8sx9.sv
@@ -0,0 +1,427 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Matrix Vector Unit (MVU) core compute kernel utilizing DSP58.
+ *****************************************************************************/
+
+module mvu_vvu_8sx9 #(
+	parameter IS_MVU,
+    int unsigned PE,
+    int unsigned SIMD,
+    int unsigned ACTIVATION_WIDTH,
+    int unsigned WEIGHT_WIDTH,
+	int unsigned ACCU_WIDTH,
+    bit SIGNED_ACTIVATIONS = 0,
+    int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
+	bit FORCE_BEHAVIORAL = 0,
+
+	int unsigned  ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
+  )
+  (
+    // Global Control
+	input   logic clk,
+    input   logic rst,
+    input   logic en,
+
+	// Input
+    input   logic last,
+    input   logic zero, // ignore current inputs and force this partial product to zero
+    input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights
+	input   logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations
+
+	// Ouput
+	output  logic vld,
+    output  logic [PE-1:0][ACCU_WIDTH-1:0] p
+  );
+	// for verilator always use behavioral code
+	localparam bit  BEHAVIORAL =
+`ifdef VERILATOR
+		1 ||
+`endif
+		FORCE_BEHAVIORAL;
+
+//-------------------- Declare global signals --------------------\\
+	localparam int unsigned CHAINLEN = (SIMD+2)/3;
+	localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
+	localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE;
+	uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN];
+	uwire [23:0] b_in_i [PE][CHAINLEN];
+	uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator
+
+//-------------------- Shift register for opmode select signal --------------------\\
+	localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
+	logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric)
+
+	always_ff @(posedge clk) begin
+		if(rst)     L <= '{default: 0};
+		else if(en) begin
+			L[1+MAX_PIPELINE_STAGES] <= last;
+			L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES];
+		end
+	end
+	assign vld = L[0];
+
+//-------------------- Shift register for ZERO flag --------------------\\
+	logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric)
+
+	if (MAX_PIPELINE_STAGES > 1) begin : genZreg
+		always_ff @(posedge clk) begin
+			if (rst)      Z <= '{default: 0};
+			else if(en) begin
+				Z[0] <= zero;
+				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3];
+			end
+		end
+	end;
+
+//-------------------- Buffer for input activations --------------------\\
+	localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
+	for (genvar k=0; k<PE_ACTIVATION; k++) begin : genActPE
+		for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
+			localparam int TOTAL_PREGS = i/SEGLEN;
+			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
+			localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3;
+
+			if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
+				logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0};
+				always_ff @(posedge clk) begin
+					if (rst)     A <= '{default: 0};
+					else if(en) begin
+						A[EXTERNAL_PREGS-1] <= 
+	// synthesis translate_off
+							zero ? '1 : 
+	// synthesis translate_on						
+							a[SIMD*k + 3*i +: LANES_OCCUPIED];
+						if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
+					end
+				end
+				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+				assign a_in_i[CHAINLEN*k+i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
+													  : PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
+				end : genAin
+				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0;
+				end : genAinZero
+			end : genExternalPregAct
+			else begin : genInpDSPAct
+				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 
+	// synthesis translate_off
+						zero ? '1 : 				
+	// synthesis translate_on
+						SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{a[SIMD*k+3*i+j][ACTIVATION_WIDTH-1]}}, a[SIMD*k+3*i+j] }
+													: PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[SIMD*k+3*i+j] } ;
+				end : genAin
+				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0;
+				end : genAinZero
+			end : genInpDSPAct
+		end : genActSIMD
+	end : genActPE
+
+//-------------------- Buffer for weights --------------------\\
+	localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;
+
+	for (genvar i=0; i<PE; i++) begin : genWeightPE
+		for (genvar j=0; j<CHAINLEN; j++) begin : genWeightSIMD
+			localparam int TOTAL_PREGS = j/SEGLEN;
+			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
+			localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3;
+
+			if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
+				logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0};
+				always_ff @(posedge clk) begin
+					if (rst)    B <= '{default: 0};
+					else if (en) begin
+						B[i][EXTERNAL_PREGS-1] <= 
+// synthesis translate_off
+							zero ? '1 : 						
+// synthesis translate_on							
+							w[i][3*j +: LANES_OCCUPIED];
+						if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1];
+					end
+				end
+				for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin
+					assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] };
+				end : genBin
+				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
+					assign b_in_i[i][j][8*k +: 8] = 8'b0;
+				end : genBinZero
+			end : genExternalPregWeight
+			else begin : genInpDSPWeight
+				for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin
+					assign b_in_i[i][j][8*k +: 8] = 
+// synthesis translate_off					
+						zero ? '1 : 
+// synthesis translate_on					
+						PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
+				end : genBin
+				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
+					assign b_in_i[i][j][8*k +: 8] = 8'b0;
+				end : genBinZero
+			end : genInpDSPWeight
+		end : genWeightSIMD
+	end : genWeightPE
+
+//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\
+	for (genvar i=0; i<PE; i++) begin : genDSPPE
+		for (genvar j=0; j<CHAINLEN; j++) begin : genDSPChain
+			localparam int TOTAL_PREGS = j/SEGLEN;
+			localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1; // 1 : 0
+			localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1;
+			localparam bit FIRST = j == 0;
+			localparam bit LAST = j == CHAINLEN-1;
+			uwire [57:0] pp;
+
+			if (LAST) begin : genPOUT
+				assign p[i] = pp[ACCU_WIDTH-1:0];
+			end
+
+			// Note: Since the product B * AD is computed,
+			//       rst can be only applied to AD and zero only to B
+			//       with the same effect as zeroing both.
+			if(BEHAVIORAL) begin : genBehav
+				// Stage #1: Input A/B
+				logic signed [33:0] Areg [INTERNAL_PREGS];
+				always_ff @(posedge clk) begin
+					if (rst)	Areg <= '{ default : 0};
+					else if (en) begin
+						Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] };
+						if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0];
+					end
+				end
+				logic signed [23:0] Breg [INTERNAL_PREGS];
+				always_ff @(posedge clk) begin
+					if (rst)	Breg <= '{ default : 0};
+					else if (en) begin
+						Breg[0] <= b_in_i[i][j];
+						if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0];
+					end
+				end
+
+				// Stage #2: Multiply-Accumulate
+				logic signed [57:0] Mreg;
+				logic InmodeZero = 0;
+				always_ff @(posedge clk) begin
+					if (rst)		InmodeZero <= 0;
+					else if (en)	InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero );
+				end
+				always_ff @(posedge clk) begin
+					if (rst)	Mreg <= 0;
+					else if (en) begin
+						automatic logic signed [57:0] m = 0;
+						for (int k = 0; k < 3; k++) begin
+							m = m + (InmodeZero ? 0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8]));
+						end
+						Mreg <= m;
+					end
+				end
+
+				// Stage #3: Accumulate
+				logic signed [57:0] Preg;
+				logic Opmode = 0;
+				if (FIRST && !LAST) begin : genFirst
+					if (PREG) begin : genPregBehav
+						always_ff @(posedge clk) begin
+							if (rst)		Preg <= 0;
+							else if (en)	Preg <= Mreg;
+						end
+					end
+					else	assign Preg = Mreg;
+				end
+				else if (FIRST && LAST) begin : genSingle
+					always_ff @(posedge clk) begin
+						if (rst)		Opmode <= 0;
+						else if (en)	Opmode <= L[1];
+					end
+					always_ff @(posedge clk) begin
+						if (rst) 		Preg <= 0;
+						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg;
+					end
+				end
+				else if (!FIRST && LAST) begin : genLast
+					always_ff @(posedge clk) begin
+						if (rst)		Opmode <= 0;
+						else if (en)	Opmode <= L[1];
+					end
+					always_ff @(posedge clk) begin
+						if (rst) 		Preg <= 0;
+						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1];
+					end
+				end
+				else begin : genMid
+					if (PREG) begin : genPregBehav
+						always_ff @(posedge clk) begin
+							if (rst)		Preg <= 0;
+							else if (en)	Preg <= Mreg + pcout[i][j-1];
+						end
+					end
+					else	assign Preg = Mreg + pcout[i][j-1];
+				end
+				assign pp = Preg;
+				assign pcout[i][j] = Preg;
+			end : genBehav
+`ifndef VERILATOR
+			else begin: genDSP
+				DSP58 #(
+					// Feature Control Attributes: Data Path Selection
+					.AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
+					.A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+					.BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
+					.B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+					.DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
+														// legacy mode.
+					.PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
+					.RND(58'h000000000000000),          // Rounding Constant
+					.USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+					.USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
+					.USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
+					.XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
+					// Pattern Detector Attributes: Pattern Detection Configuration
+					.AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+					.AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
+					.MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
+					.PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
+					.SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+					.SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
+					.USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
+					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+					.IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
+					.IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
+					.IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
+					.IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
+					.IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
+					.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0
+										FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN
+										2'b01, // Y : M
+										2'b01  // X: M
+					}), // Optional inversion for OPMODE
+					.IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
+					.IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
+					.IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
+					.IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
+					.IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for STCONJUGATE_A
+					.IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
+					.IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
+					.IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
+					.IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
+					.IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
+					// Register Control Attributes: Pipeline Register Configuration
+					.ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+					.ADREG(0),                          // Pipeline stages for pre-adder (0-1)
+					.ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
+					.AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
+					.BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+					.BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
+					.CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
+					.CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
+					.CREG(0),                           // Pipeline stages for C (0-1)
+					.DREG(0),                           // Pipeline stages for D (0-1)
+					.INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
+					.MREG(1),                           // Multiplier pipeline stages (0-1)
+					.OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
+					.PREG(PREG),                        // Number of pipeline stages for P (0-1)
+					.RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
+				)
+				DSP58_inst (
+					// Cascade outputs: Cascade Ports
+					.ACOUT(),                           // 34-bit output: A port cascade
+					.BCOUT(),                           // 24-bit output: B cascade
+					.CARRYCASCOUT(),                    // 1-bit output: Cascade carry
+					.MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
+					.PCOUT(pcout[i][j]),                // 58-bit output: Cascade output
+					// Control outputs: Control Inputs/Status Bits
+					.OVERFLOW(),                        // 1-bit output: Overflow in add/acc
+					.PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
+					.PATTERNDETECT(),                   // 1-bit output: Pattern detect
+					.UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
+					// Data outputs: Data Ports
+					.CARRYOUT(),                        // 4-bit output: Carry
+					.P(pp),                             // 58-bit output: Primary data
+					.XOROUT(),                          // 8-bit output: XOR data
+					// Cascade inputs: Cascade Ports
+					.ACIN('x),                          // 34-bit input: A cascade data
+					.BCIN('x),                          // 24-bit input: B cascade
+					.CARRYCASCIN('x),                   // 1-bit input: Cascade carry
+					.MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
+					.PCIN(FIRST ? 'x : pcout[i][j-1]),  // 58-bit input: P cascade
+					// Control inputs: Control Inputs/Status Bits
+					.ALUMODE(4'h0),                     // 4-bit input: ALU control
+					.CARRYINSEL('0),                    // 3-bit input: Carry select
+					.CLK(clk),                          // 1-bit input: Clock
+					.INMODE({
+							INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
+							2'b00,
+							TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
+							INTERNAL_PREGS==2 ? 1'b0 : 1'b1
+					}),                                 // 5-bit input: INMODE control
+					.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
+					.OPMODE({
+							LAST ? {1'b0, L[1]} : 2'b00,
+							7'b000_0000
+					}), // 9-bit input: Operation mode
+					// Data inputs: Data Ports
+					.A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }),            // 34-bit input: A data
+					.B(b_in_i[i][j]),                   // 24-bit input: B data
+					.C('x),                             // 58-bit input: C data
+					.CARRYIN('0),                       // 1-bit input: Carry-in
+					.D('x),                             // 27-bit input: D data
+					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+					.ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
+					.CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
+					.CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
+					.CEAD('0),                          // 1-bit input: Clock enable for ADREG
+					.CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
+					.CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
+					.CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
+					.CEC('0),                           // 1-bit input: Clock enable for CREG
+					.CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
+					.CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+					.CED('0),                           // 1-bit input: Clock enable for DREG
+					.CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
+					.CEM(en),                           // 1-bit input: Clock enable for MREG
+					.CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
+					.RSTA(rst),                         // 1-bit input: Reset for AREG
+					.RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
+					.RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
+					.RSTB(rst),                         // 1-bit input: Reset for BREG
+					.RSTC('0),                          // 1-bit input: Reset for CREG
+					.RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+					.RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
+					.RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
+					.RSTM(rst),                         // 1-bit input: Reset for MREG
+					.RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
+				);
+			end : genDSP
+`endif
+		end : genDSPChain
+	end : genDSPPE
+
+endmodule : mvu_vvu_8sx9

From adb58694be36bd0fa2e8558f760d1642f14a2a38 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:58:20 +0100
Subject: [PATCH 160/235] [axi wrapper]: changed parameter to localparam

---
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
index 788e49a71b..270fe7351f 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -46,9 +46,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter	FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$,
 
 	// Safely deducible parameters
-	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	parameter 	INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
-	parameter 	OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+	localparam	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	localparam 	INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
+	localparam 	OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
 )(
 	// Global Control
 	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)

From f54d438f78fe4ce78c84fdd7bcbc514048bd2fe0 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:59:32 +0100
Subject: [PATCH 161/235] [axi]: added support for LUT-based VVU

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 07ad32e6c8..ff677fc244 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -195,8 +195,8 @@ module mvu_vvu_axi #(
 			.vld(ovld), .p(odat)
 		);
 	"mvu_vvu_lut":
-		mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
+		mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
 			.clk, .rst, .en,
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)

From a4e2ac7146afeab4271344785f638c88cf78da73 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 17:00:07 +0100
Subject: [PATCH 162/235] [mvu vvu 8sx9]: minor change to list of generics

---
 finn-rtllib/mvu/mvu_vvu_8sx9.sv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9.sv b/finn-rtllib/mvu/mvu_vvu_8sx9.sv
index 52a93739d6..2aa9d71b6c 100644
--- a/finn-rtllib/mvu/mvu_vvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_vvu_8sx9.sv
@@ -32,7 +32,7 @@
  *****************************************************************************/
 
 module mvu_vvu_8sx9 #(
-	parameter IS_MVU,
+	bit IS_MVU,
     int unsigned PE,
     int unsigned SIMD,
     int unsigned ACTIVATION_WIDTH,
@@ -42,7 +42,7 @@ module mvu_vvu_8sx9 #(
     int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
 	bit FORCE_BEHAVIORAL = 0,
 
-	int unsigned  ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
+	localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
   )
   (
     // Global Control

From 40ad0b46c03b10b47ec4d72dd04a4ad96149fa89 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 17:00:51 +0100
Subject: [PATCH 163/235] [mvu lut]: added support for VVU

---
 finn-rtllib/mvu/mvu_lut.sv | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_lut.sv b/finn-rtllib/mvu/mvu_lut.sv
index b100a589e8..c100910d75 100644
--- a/finn-rtllib/mvu/mvu_lut.sv
+++ b/finn-rtllib/mvu/mvu_lut.sv
@@ -1,13 +1,15 @@
-module mvu_lut #(
-	int unsigned  PE,
-	int unsigned  SIMD,
+module mvu_vvu_lut #(
+    bit IS_MVU,
+    int unsigned  PE,
+    int unsigned  SIMD,
 	int unsigned  ACCU_WIDTH,
     int unsigned  ACTIVATION_WIDTH,
     int unsigned  WEIGHT_WIDTH,
     bit  SIGNED_ACTIVATIONS,
     bit  M_REG = 1,
 
-    localparam unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH
+    localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH,
+    localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
 )(
 	// Global Control
 	input	logic  clk,
@@ -17,8 +19,8 @@ module mvu_lut #(
 	// Input
 	input	logic  last,
 	input	logic  zero,	// ignore current inputs and force this partial product to zero
-	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]      w,	// signed weights
-	input	logic                [SIMD-1:0][ACTIVATION_WIDTH-1:0]  a,	// (un)signed activations
+	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]             w,	// signed weights
+	input	logic        [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0]  a,	// (un)signed activations
 
 	// Ouput
 	output	logic  vld,
@@ -63,16 +65,16 @@ module mvu_lut #(
                 always_ff @(posedge clk) begin
                     if(rst)         M[j] = '{ default : 0 };
                     else if (en)    M[j] = zero ? 0 :
-                                            SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) :
-                                                                 $signed({1'b0, a[j]}) * $signed(w[i][j]); 
-                    // (SIGNED_ACTIVATIONS ? $signed(a[j]) : a[j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication
+                                            SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
+                                                                 $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); 
+                    // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication
                 end
                 assign  m1[j] = M[j];
             end : genMreg
             else begin : genNoMreg 
                 assign m1[j] = zero ? 0 :
-                               SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) :
-                                                    $signed({1'b0, a[j]}) * $signed(w[i][j]);
+                               SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
+                                                    $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]);
             end : genNoMreg
         end : genSIMD
 
@@ -99,4 +101,4 @@ module mvu_lut #(
         assign  p[i] = P2[i];
     end : genPE
 
-endmodule : mvu_lut
+endmodule : mvu_vvu_lut

From 30fcb5b734f86d0032549a4efe29d96b13ee5451 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 17:01:10 +0100
Subject: [PATCH 164/235] [mvu vvu lut]: renamed file for consistency

---
 finn-rtllib/mvu/mvu_vvu_lut.sv | 123 +++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_vvu_lut.sv

diff --git a/finn-rtllib/mvu/mvu_vvu_lut.sv b/finn-rtllib/mvu/mvu_vvu_lut.sv
new file mode 100644
index 0000000000..c100910d75
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_vvu_lut.sv
@@ -0,0 +1,123 @@
+/******************************************************************************
+ * @brief	LUT-based multiply-accumulate core shared by the MVU and VVU.
+ *
+ * Every one of the PE processing elements multiplies SIMD signed weights
+ * with SIMD (un)signed activations, sums the products in an adder tree and
+ * accumulates the result in a per-PE output register.
+ *  - IS_MVU = 1: all PEs consume the same SIMD activations a[0 +: SIMD].
+ *  - IS_MVU = 0: PE i consumes its own slice a[SIMD*i +: SIMD].
+ * `last` marks the final input of a dot product. It is delayed along with
+ * the data (one cycle per register stage) so that `vld` is raised when `p`
+ * holds the completed sum and the accumulator restarts with the next input.
+ *****************************************************************************/
+
+module mvu_vvu_lut #(
+	bit  IS_MVU,
+	int unsigned  PE,
+	int unsigned  SIMD,
+	int unsigned  ACCU_WIDTH,
+	int unsigned  ACTIVATION_WIDTH,
+	int unsigned  WEIGHT_WIDTH,
+	bit  SIGNED_ACTIVATIONS,
+	bit  M_REG = 1,	// register the multiplier outputs
+
+	localparam int unsigned  MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH,
+	localparam int unsigned  ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
+)(
+	// Global Control
+	input	logic  clk,
+	input	logic  rst,
+	input	logic  en,
+
+	// Input
+	input	logic  last,
+	input	logic  zero,	// ignore current inputs and force this partial product to zero
+	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]             w,	// signed weights
+	input	logic        [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0]  a,	// (un)signed activations
+
+	// Output
+	output	logic  vld,
+	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
+);
+
+	// Number of leaves below each node of the adder tree (heap layout:
+	// node n has the children 2n+1 and 2n+2, the leaves are the last SIMD entries).
+	typedef int unsigned  leave_load_t[2*SIMD-1];
+	function leave_load_t init_leave_loads();
+		automatic leave_load_t  res;
+		for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;
+		for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];
+		return res;
+	endfunction : init_leave_loads
+
+	// Pipeline for last indicator flag: one stage per data register (M, P2)
+	uwire  last_i;
+	if(M_REG) begin : genLastMreg
+		logic [0:1]  L = '0;
+		always_ff @(posedge clk) begin
+			if(rst)      L <= '0;
+			else if(en)  L <= {last, L[0]};
+		end
+		assign  last_i = L[1];
+	end : genLastMreg
+	else begin : genLastNoMreg
+		logic  L = '0;
+		always_ff @(posedge clk) begin
+			if(rst)      L <= '0;
+			else if(en)  L <= last;
+		end
+		assign  last_i = L;
+	end : genLastNoMreg
+
+	// Single driver for the shared valid flag. It must not be assigned inside
+	// the per-PE loop, which would create PE continuous drivers of one variable.
+	assign  vld = last_i;
+
+	// For each PE generate
+	for(genvar  i = 0; i < PE; i++) begin : genPE
+		// Stage #1: SIMD multipliers in parallel
+		uwire [MULT_WIDTH-1:0]  m1[SIMD];
+		for(genvar  j = 0; j < SIMD; j++) begin : genSIMD
+			// All operands are signed so that the product is sign-extended correctly.
+			// (SIGNED_ACTIVATIONS ? $signed(act) : act) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication
+			uwire [ACTIVATION_WIDTH-1:0]  act = a[(IS_MVU ? 0 : SIMD*i) + j];
+			uwire [MULT_WIDTH-1:0]  prod = zero ? 0 :
+				SIGNED_ACTIVATIONS ? $signed(act) * $signed(w[i][j]) :
+				                     $signed({1'b0, act}) * $signed(w[i][j]);
+
+			if(M_REG) begin : genMreg
+				logic [MULT_WIDTH-1:0]  M;
+				always_ff @(posedge clk) begin
+					// Non-blocking so that the accumulator samples the previous product
+					if(rst)      M <= '0;
+					else if(en)  M <= prod;
+				end
+				assign  m1[j] = M;
+			end : genMreg
+			else begin : genNoMreg
+				assign  m1[j] = prod;
+			end : genNoMreg
+		end : genSIMD
+
+		// Stage #2: Adder tree to reduce SIMD products
+		localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 };
+		localparam int unsigned  ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1));
+		uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
+		for(genvar  s = 0; s < SIMD; s++)  assign  tree[SIMD-1+s] = $signed(m1[s]);
+		for(genvar  n = 0; n < SIMD-1; n++) begin : genTree
+			// Sum truncated to actual maximum bit width at this node
+			localparam int unsigned  NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1));
+			uwire signed [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
+			assign  tree[n] = s;
+		end : genTree
+
+		// Stage #3: Accumulate into the output register of this PE
+		logic [ACCU_WIDTH-1:0]  P2;
+		always_ff @(posedge clk) begin
+			if(rst)      P2 <= '0;
+			else if(en)  P2 <= (last_i ? 0 : $signed(P2)) + $signed(tree[0]);
+		end
+		assign  p[i] = P2;
+	end : genPE
+
+endmodule : mvu_vvu_lut

From cb434386fa8bf6f63964dd889c8025c3e9616a6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Thu, 21 Sep 2023 15:58:34 +0100
Subject: [PATCH 165/235] Revert to proper address truncation without
 generation bit.

---
 finn-rtllib/mvu/replay_buffer.sv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
index d4342f705c..3e2766f63d 100644
--- a/finn-rtllib/mvu/replay_buffer.sv
+++ b/finn-rtllib/mvu/replay_buffer.sv
@@ -144,8 +144,8 @@ module replay_buffer #(
 		uwire  wr = irdy && ivld;
 		uwire  rd = !OVld || ordy;
 		always_ff @(posedge clk) begin
-			if(wr)  Mem[WP[AWIDTH:0]] <= idat;
-			if(rd)  ODat <= Mem[RP[AWIDTH:0]];
+			if(wr)  Mem[WP[AWIDTH-1:0]] <= idat;
+			if(rd)  ODat <= Mem[RP[AWIDTH-1:0]];
 		end
 
 		uwire  vld = (RP != WP);

From b4b69f3fa7caae4be9357abf596aff4a66561228 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 17:04:05 +0100
Subject: [PATCH 166/235] remove deleted/renamed files

---
 finn-rtllib/mvu/mvu_8sx9.sv            | 427 -------------------------
 finn-rtllib/mvu/mvu_8sx9_axi.sv        | 179 -----------
 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv     | 208 ------------
 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v |  93 ------
 finn-rtllib/mvu/mvu_8sx9_tb.sv         | 165 ----------
 finn-rtllib/mvu/mvu_axi.sv             | 248 --------------
 finn-rtllib/mvu/mvu_axi_wrapper.v      |  92 ------
 finn-rtllib/mvu/mvu_lut.sv             | 104 ------
 finn-rtllib/mvu/tb/mvu_axi_tb.sv       | 215 -------------
 9 files changed, 1731 deletions(-)
 delete mode 100644 finn-rtllib/mvu/mvu_8sx9.sv
 delete mode 100644 finn-rtllib/mvu/mvu_8sx9_axi.sv
 delete mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
 delete mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
 delete mode 100644 finn-rtllib/mvu/mvu_8sx9_tb.sv
 delete mode 100644 finn-rtllib/mvu/mvu_axi.sv
 delete mode 100644 finn-rtllib/mvu/mvu_axi_wrapper.v
 delete mode 100644 finn-rtllib/mvu/mvu_lut.sv
 delete mode 100644 finn-rtllib/mvu/tb/mvu_axi_tb.sv

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
deleted file mode 100644
index 52a93739d6..0000000000
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ /dev/null
@@ -1,427 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Matrix Vector Unit (MVU) core compute kernel utilizing DSP58.
- *****************************************************************************/
-
-module mvu_vvu_8sx9 #(
-	parameter IS_MVU,
-    int unsigned PE,
-    int unsigned SIMD,
-    int unsigned ACTIVATION_WIDTH,
-    int unsigned WEIGHT_WIDTH,
-	int unsigned ACCU_WIDTH,
-    bit SIGNED_ACTIVATIONS = 0,
-    int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
-	bit FORCE_BEHAVIORAL = 0,
-
-	int unsigned  ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
-  )
-  (
-    // Global Control
-	input   logic clk,
-    input   logic rst,
-    input   logic en,
-
-	// Input
-    input   logic last,
-    input   logic zero, // ignore current inputs and force this partial product to zero
-    input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights
-	input   logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations
-
-	// Ouput
-	output  logic vld,
-    output  logic [PE-1:0][ACCU_WIDTH-1:0] p
-  );
-	// for verilator always use behavioral code
-	localparam bit  BEHAVIORAL =
-`ifdef VERILATOR
-		1 ||
-`endif
-		FORCE_BEHAVIORAL;
-
-//-------------------- Declare global signals --------------------\\
-	localparam int unsigned CHAINLEN = (SIMD+2)/3;
-	localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
-	localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE;
-	uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN];
-	uwire [23:0] b_in_i [PE][CHAINLEN];
-	uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator
-
-//-------------------- Shift register for opmode select signal --------------------\\
-	localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
-	logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric)
-
-	always_ff @(posedge clk) begin
-		if(rst)     L <= '{default: 0};
-		else if(en) begin
-			L[1+MAX_PIPELINE_STAGES] <= last;
-			L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES];
-		end
-	end
-	assign vld = L[0];
-
-//-------------------- Shift register for ZERO flag --------------------\\
-	logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric)
-
-	if (MAX_PIPELINE_STAGES > 1) begin : genZreg
-		always_ff @(posedge clk) begin
-			if (rst)      Z <= '{default: 0};
-			else if(en) begin
-				Z[0] <= zero;
-				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3];
-			end
-		end
-	end;
-
-//-------------------- Buffer for input activations --------------------\\
-	localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
-	for (genvar k=0; k<PE_ACTIVATION; k++) begin : genActPE
-		for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
-			localparam int TOTAL_PREGS = i/SEGLEN;
-			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
-			localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3;
-
-			if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
-				logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0};
-				always_ff @(posedge clk) begin
-					if (rst)     A <= '{default: 0};
-					else if(en) begin
-						A[EXTERNAL_PREGS-1] <= 
-	// synthesis translate_off
-							zero ? '1 : 
-	// synthesis translate_on						
-							a[SIMD*k + 3*i +: LANES_OCCUPIED];
-						if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
-					end
-				end
-				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-				assign a_in_i[CHAINLEN*k+i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
-													  : PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
-				end : genAin
-				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
-					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0;
-				end : genAinZero
-			end : genExternalPregAct
-			else begin : genInpDSPAct
-				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 
-	// synthesis translate_off
-						zero ? '1 : 				
-	// synthesis translate_on
-						SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{a[SIMD*k+3*i+j][ACTIVATION_WIDTH-1]}}, a[SIMD*k+3*i+j] }
-													: PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[SIMD*k+3*i+j] } ;
-				end : genAin
-				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
-					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0;
-				end : genAinZero
-			end : genInpDSPAct
-		end : genActSIMD
-	end : genActPE
-
-//-------------------- Buffer for weights --------------------\\
-	localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;
-
-	for (genvar i=0; i<PE; i++) begin : genWeightPE
-		for (genvar j=0; j<CHAINLEN; j++) begin : genWeightSIMD
-			localparam int TOTAL_PREGS = j/SEGLEN;
-			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
-			localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3;
-
-			if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
-				logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0};
-				always_ff @(posedge clk) begin
-					if (rst)    B <= '{default: 0};
-					else if (en) begin
-						B[i][EXTERNAL_PREGS-1] <= 
-// synthesis translate_off
-							zero ? '1 : 						
-// synthesis translate_on							
-							w[i][3*j +: LANES_OCCUPIED];
-						if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1];
-					end
-				end
-				for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin
-					assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] };
-				end : genBin
-				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
-					assign b_in_i[i][j][8*k +: 8] = 8'b0;
-				end : genBinZero
-			end : genExternalPregWeight
-			else begin : genInpDSPWeight
-				for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin
-					assign b_in_i[i][j][8*k +: 8] = 
-// synthesis translate_off					
-						zero ? '1 : 
-// synthesis translate_on					
-						PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
-				end : genBin
-				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
-					assign b_in_i[i][j][8*k +: 8] = 8'b0;
-				end : genBinZero
-			end : genInpDSPWeight
-		end : genWeightSIMD
-	end : genWeightPE
-
-//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\
-	for (genvar i=0; i<PE; i++) begin : genDSPPE
-		for (genvar j=0; j<CHAINLEN; j++) begin : genDSPChain
-			localparam int TOTAL_PREGS = j/SEGLEN;
-			localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1; // 1 : 0
-			localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1;
-			localparam bit FIRST = j == 0;
-			localparam bit LAST = j == CHAINLEN-1;
-			uwire [57:0] pp;
-
-			if (LAST) begin : genPOUT
-				assign p[i] = pp[ACCU_WIDTH-1:0];
-			end
-
-			// Note: Since the product B * AD is computed,
-			//       rst can be only applied to AD and zero only to B
-			//       with the same effect as zeroing both.
-			if(BEHAVIORAL) begin : genBehav
-				// Stage #1: Input A/B
-				logic signed [33:0] Areg [INTERNAL_PREGS];
-				always_ff @(posedge clk) begin
-					if (rst)	Areg <= '{ default : 0};
-					else if (en) begin
-						Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] };
-						if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0];
-					end
-				end
-				logic signed [23:0] Breg [INTERNAL_PREGS];
-				always_ff @(posedge clk) begin
-					if (rst)	Breg <= '{ default : 0};
-					else if (en) begin
-						Breg[0] <= b_in_i[i][j];
-						if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0];
-					end
-				end
-
-				// Stage #2: Multiply-Accumulate
-				logic signed [57:0] Mreg;
-				logic InmodeZero = 0;
-				always_ff @(posedge clk) begin
-					if (rst)		InmodeZero <= 0;
-					else if (en)	InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero );
-				end
-				always_ff @(posedge clk) begin
-					if (rst)	Mreg <= 0;
-					else if (en) begin
-						automatic logic signed [57:0] m = 0;
-						for (int k = 0; k < 3; k++) begin
-							m = m + (InmodeZero ? 0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8]));
-						end
-						Mreg <= m;
-					end
-				end
-
-				// Stage #3: Accumulate
-				logic signed [57:0] Preg;
-				logic Opmode = 0;
-				if (FIRST && !LAST) begin : genFirst
-					if (PREG) begin : genPregBehav
-						always_ff @(posedge clk) begin
-							if (rst)		Preg <= 0;
-							else if (en)	Preg <= Mreg;
-						end
-					end
-					else	assign Preg = Mreg;
-				end
-				else if (FIRST && LAST) begin : genSingle
-					always_ff @(posedge clk) begin
-						if (rst)		Opmode <= 0;
-						else if (en)	Opmode <= L[1];
-					end
-					always_ff @(posedge clk) begin
-						if (rst) 		Preg <= 0;
-						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg;
-					end
-				end
-				else if (!FIRST && LAST) begin : genLast
-					always_ff @(posedge clk) begin
-						if (rst)		Opmode <= 0;
-						else if (en)	Opmode <= L[1];
-					end
-					always_ff @(posedge clk) begin
-						if (rst) 		Preg <= 0;
-						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1];
-					end
-				end
-				else begin : genMid
-					if (PREG) begin : genPregBehav
-						always_ff @(posedge clk) begin
-							if (rst)		Preg <= 0;
-							else if (en)	Preg <= Mreg + pcout[i][j-1];
-						end
-					end
-					else	assign Preg = Mreg + pcout[i][j-1];
-				end
-				assign pp = Preg;
-				assign pcout[i][j] = Preg;
-			end : genBehav
-`ifndef VERILATOR
-			else begin: genDSP
-				DSP58 #(
-					// Feature Control Attributes: Data Path Selection
-					.AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
-					.A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
-					.BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
-					.B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
-					.DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
-														// legacy mode.
-					.PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
-					.RND(58'h000000000000000),          // Rounding Constant
-					.USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
-					.USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
-					.USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
-					.XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
-					// Pattern Detector Attributes: Pattern Detection Configuration
-					.AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
-					.AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
-					.MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
-					.PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
-					.SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
-					.SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
-					.USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
-					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
-					.IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
-					.IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
-					.IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
-					.IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
-					.IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
-					.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0
-										FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN
-										2'b01, // Y : M
-										2'b01  // X: M
-					}), // Optional inversion for OPMODE
-					.IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
-					.IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
-					.IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
-					.IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
-					.IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for STCONJUGATE_A
-					.IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
-					.IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
-					.IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
-					.IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
-					.IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
-					// Register Control Attributes: Pipeline Register Configuration
-					.ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
-					.ADREG(0),                          // Pipeline stages for pre-adder (0-1)
-					.ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
-					.AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
-					.BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
-					.BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
-					.CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
-					.CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
-					.CREG(0),                           // Pipeline stages for C (0-1)
-					.DREG(0),                           // Pipeline stages for D (0-1)
-					.INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
-					.MREG(1),                           // Multiplier pipeline stages (0-1)
-					.OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
-					.PREG(PREG),                        // Number of pipeline stages for P (0-1)
-					.RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
-				)
-				DSP58_inst (
-					// Cascade outputs: Cascade Ports
-					.ACOUT(),                           // 34-bit output: A port cascade
-					.BCOUT(),                           // 24-bit output: B cascade
-					.CARRYCASCOUT(),                    // 1-bit output: Cascade carry
-					.MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
-					.PCOUT(pcout[i][j]),                // 58-bit output: Cascade output
-					// Control outputs: Control Inputs/Status Bits
-					.OVERFLOW(),                        // 1-bit output: Overflow in add/acc
-					.PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
-					.PATTERNDETECT(),                   // 1-bit output: Pattern detect
-					.UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
-					// Data outputs: Data Ports
-					.CARRYOUT(),                        // 4-bit output: Carry
-					.P(pp),                             // 58-bit output: Primary data
-					.XOROUT(),                          // 8-bit output: XOR data
-					// Cascade inputs: Cascade Ports
-					.ACIN('x),                          // 34-bit input: A cascade data
-					.BCIN('x),                          // 24-bit input: B cascade
-					.CARRYCASCIN('x),                   // 1-bit input: Cascade carry
-					.MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
-					.PCIN(FIRST ? 'x : pcout[i][j-1]),  // 58-bit input: P cascade
-					// Control inputs: Control Inputs/Status Bits
-					.ALUMODE(4'h0),                     // 4-bit input: ALU control
-					.CARRYINSEL('0),                    // 3-bit input: Carry select
-					.CLK(clk),                          // 1-bit input: Clock
-					.INMODE({
-							INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
-							2'b00,
-							TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
-							INTERNAL_PREGS==2 ? 1'b0 : 1'b1
-					}),                                 // 5-bit input: INMODE control
-					.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
-					.OPMODE({
-							LAST ? {1'b0, L[1]} : 2'b00,
-							7'b000_0000
-					}), // 9-bit input: Operation mode
-					// Data inputs: Data Ports
-					.A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }),            // 34-bit input: A data
-					.B(b_in_i[i][j]),                   // 24-bit input: B data
-					.C('x),                             // 58-bit input: C data
-					.CARRYIN('0),                       // 1-bit input: Carry-in
-					.D('x),                             // 27-bit input: D data
-					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
-					.ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
-					.CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
-					.CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
-					.CEAD('0),                          // 1-bit input: Clock enable for ADREG
-					.CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
-					.CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
-					.CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
-					.CEC('0),                           // 1-bit input: Clock enable for CREG
-					.CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
-					.CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
-					.CED('0),                           // 1-bit input: Clock enable for DREG
-					.CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
-					.CEM(en),                           // 1-bit input: Clock enable for MREG
-					.CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
-					.RSTA(rst),                         // 1-bit input: Reset for AREG
-					.RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
-					.RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
-					.RSTB(rst),                         // 1-bit input: Reset for BREG
-					.RSTC('0),                          // 1-bit input: Reset for CREG
-					.RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
-					.RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
-					.RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
-					.RSTM(rst),                         // 1-bit input: Reset for MREG
-					.RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
-				);
-			end : genDSP
-`endif
-		end : genDSPChain
-	end : genDSPPE
-
-endmodule : mvu_vvu_8sx9
diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv
deleted file mode 100644
index 5f215927d8..0000000000
--- a/finn-rtllib/mvu/mvu_8sx9_axi.sv
+++ /dev/null
@@ -1,179 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Matrix Vector Unit (MVU) AXI-lite interface wrapper.
- *****************************************************************************/
-
-module mvu_8sx9_axi #(
-	int unsigned MW,
-	int unsigned MH,
-	int unsigned PE,
-	int unsigned SIMD,
-	int unsigned ACTIVATION_WIDTH,
-	int unsigned WEIGHT_WIDTH,
-	int unsigned ACCU_WIDTH,
-	bit SIGNED_ACTIVATIONS = 0,
-	int unsigned SEGMENTLEN = 0,
-	parameter RAM_STYLE = "auto",
-
-	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
-	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
-	localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
-	localparam int unsigned SF = MW/SIMD,
-	localparam int unsigned NF = MH/PE,
-	localparam int unsigned OUTPUT_LANES = PE,
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
-)
-(
-	// Global Control
-	input	logic  ap_clk,
-	input	logic  ap_rst_n,
-
-	// Weight Stream
-	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input	logic  s_axis_weights_tvalid,
-	output	logic  s_axis_weights_tready,
-
-	// Input Stream
-	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input	logic  s_axis_input_tvalid,
-	output	logic  s_axis_input_tready,
-
-	// Output Stream
-	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
-	output	logic  m_axis_output_tvalid,
-	input	logic  m_axis_output_tready
-);
-
-//-------------------- Parameter sanity checks --------------------\\
-	initial begin
-		if (MW % SIMD != 0) begin
-			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
-			$finish;
-		end
-		if (MH % PE != 0) begin
-			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
-			$finish;
-		end
-		if (ACTIVATION_WIDTH > 9) begin
-			$error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
-			$finish;
-		end
-		if (WEIGHT_WIDTH > 8) begin
-			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
-			$finish;
-		end
-		if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin
-			$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
-			$finish;
-		end
-		if (SEGMENTLEN == 0) begin
-			$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
-		end
-		if (SEGMENTLEN > (SIMD+2)/3) begin
-			$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
-			$finish;
-		end
-	end
-
-	uwire clk = ap_clk;
-	uwire rst = !ap_rst_n;
-
-	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;
-
-	uwire mvauin_t amvau;
-	uwire alast;
-	uwire afin;
-	uwire avld;
-	uwire ardy;
-
-	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay (
-		.clk, .rst,
-		.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
-		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
-	);
-
-//-------------------- Input control --------------------\\
-	uwire en;
-	uwire istb = avld && s_axis_weights_tvalid;
-	assign ardy = en && s_axis_weights_tvalid;
-	assign s_axis_weights_tready = en && avld;
-
-//-------------------- Core MVU --------------------\\
-	uwire ovld;
-	uwire [PE-1:0][57:0] odat;
-	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
-	mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-	.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core (
-		.clk, .rst, .en,
-		.last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
-		.vld(ovld), .p(odat)
-	);
-
-//-------------------- Output register slice --------------------\\
-	struct {
-		logic vld;
-		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} A = '{ vld: 0, default: 'x};
-
-	assign en = !A.vld || !ovld;
-
-	uwire  b_load;
-	always_ff @(posedge clk) begin
-		if(rst)		A <= '{ vld: 0, default: 'x };
-		else if(!A.vld || b_load) begin
-			A.vld <= ovld && en;
-			for(int unsigned  i = 0; i < PE; i++) begin
-				// CR-1148862:
-				// A.dat[i] <= odat[i];
-				automatic logic [57:0]  v = odat[i];
-				A.dat[i] <= v[ACCU_WIDTH-1:0];
-			end
-		end
-	end
-	
-	struct {
-		logic vld;
-		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} B = '{ vld: 0, default: 'x};
-
-	assign	b_load = !B.vld || m_axis_output_tready;
-	always_ff @(posedge clk) begin
-		if(rst)		B <= '{ default: 'x };
-		else begin
-			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
-		end	
-	end
-
-	assign	m_axis_output_tvalid = B.vld;
-	assign	m_axis_output_tdata  = B.dat;
-
-endmodule
\ No newline at end of file
diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
deleted file mode 100644
index 70ffa096ef..0000000000
--- a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
+++ /dev/null
@@ -1,208 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Testbench for MVU AXI-lite interface wrapper.
- *****************************************************************************/
-
-module mvu_8sx9_axi_tb();
-
-//-------------------- Simulation parameters --------------------\\
-	// Matrix & parallelism config
-	localparam int unsigned MW = 600;
-	localparam int unsigned MH = 256;
-	localparam int unsigned SIMD = 60;
-	localparam int unsigned PE = 16;
-	localparam int unsigned SEGMENTLEN = 4;
-	// Bit-width config  
-	localparam int unsigned ACTIVATION_WIDTH = 8;
-	localparam int unsigned WEIGHT_WIDTH = 4;
-	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
-	localparam bit SIGNED_ACTIVATIONS = 1;
-	// Simulation constants  
-	localparam int unsigned NF = MH/PE;
-	localparam int unsigned SF = MW/SIMD;
-	localparam int unsigned NUM_OF_DSP = SIMD/3;
-	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
-	localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8;
-	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
-	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
-
-	// Generate clk and reset signal   
-	logic clk = 0;
-	always #5ns clk = !clk;
-
-	logic ap_rst_n = 0;
-	initial begin
-		repeat(16) @(posedge clk);
-		ap_rst_n <= 1;
-	end
-
-	uwire ap_clk = clk;
-
-	// Generate activations  
-	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-	typedef activation_t activation_vector_t[SF];
-
-	function activation_vector_t init_ACTIVATIONS;
-		automatic activation_vector_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_ACTIVATIONS
-
-	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
-
-	struct {
-		activation_t dat;
-		logic vld;
-		logic rdy;
-	} activations;
-
-	initial begin
-		activations.vld = 0;
-		activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
-		@(posedge clk iff ap_rst_n);
-
-		for (int i=0; i<SF; i++) begin
-			activations.dat <= ACTIVATIONS[i];
-			do begin 
-				activations.vld = $urandom()%7 > 1;
-				@(posedge clk);
-			end while (!(activations.vld === 1 && activations.rdy === 1));
-		end
-
-		activations.vld <= 0;
-		activations.dat <= 'x;
-	end
-
-	// Generate weights   
-	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-	typedef weight_t weight_matrix_t[NF][SF]; 
-
-	function weight_matrix_t init_WEIGHTS;
-		automatic weight_matrix_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_WEIGHTS;
-
-	weight_matrix_t WEIGHTS = init_WEIGHTS();
-
-	struct {
-		weight_t dat;
-		logic vld;
-		logic rdy;
-	} weights;
-
-	initial begin
-		weights.vld = 0;
-		weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
-		@(posedge clk iff ap_rst_n);
-
-		weights.vld <= 1;
-		for (int i=0; i<NF; i++) begin
-			for (int j=0; j<SF; j++) begin
-				weights.dat <= WEIGHTS[i][j];
-				@(posedge clk iff weights.rdy);
-			end
-		end
-
-		weights.vld <= 0;
-		weights.dat <= 'x;
-	end
-
-	// Function to compute golden output  
-	// a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
-	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
-	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
-	typedef output_t output_vector_t [NF];
-
-	struct {
-		output_t dat;
-		logic vld;
-		logic rdy;
-	} outputs;
-
-	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
-		automatic output_vector_t res = '{default: 0};
-		for (int j = 0; j<MH; j++) begin
-			for (int i = 0; i<MW; i++) begin
-				res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-			end
-		end  
-		return res;
-	endfunction : check_output;
-
-	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
-
-	int unsigned NF_CNT = 0;
-	initial begin
-		outputs.rdy = 0;
-		while (NF_CNT < NF) begin
-			// Loop until both rdy & vld are asserted
-			do begin
-				outputs.rdy <= $urandom()%7 >= 1;
-				@(posedge clk iff ap_rst_n);
-			end while (!(outputs.rdy === 1 && outputs.vld === 1));
-
-			// Compare produced outputs against golden outputs
-			foreach(outputs.dat[i]) begin
-				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-				else begin 
-					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-					$stop;
-				end  
-			end
-			
-			NF_CNT += 1;
-		end
-
-		$finish;  
-	end
-
-	// Instantiate DUT
-	mvu_8sx9_axi #(
-		.MW(MW),
-		.MH(MH),
-		.PE(PE),
-		.SIMD(SIMD),
-		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-		.WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.ACCU_WIDTH(ACCU_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-		.SEGMENTLEN(SEGMENTLEN)
-	)
-	dut (
-		.ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
-		.s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
-		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
-		.m_axis_output_tready(outputs.rdy)
-	);
-  
-endmodule
diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
deleted file mode 100644
index e15f77fbae..0000000000
--- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
+++ /dev/null
@@ -1,93 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Verilog AXI-lite wrapper for MVU.
- *****************************************************************************/
-
-module $MODULE_NAME_AXI_WRAPPER$ #(
-	parameter 	MW = $MW$,
-	parameter	MH = $MH$,
-	parameter 	PE = $PE$,
-	parameter 	SIMD = $SIMD$,
-	parameter 	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
-	parameter 	WEIGHT_WIDTH = $WEIGHT_WIDTH$,
-	parameter 	ACCU_WIDTH = $ACCU_WIDTH$,
-	parameter 	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
-	parameter 	SEGMENTLEN = $SEGMENTLEN$,
-	parameter 	RAM_STYLE = "$IBUF_RAM_STYLE$",
-
-	// Safely deducible parameters
-	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	parameter 	INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
-	parameter 	OUTPUT_LANES = PE,
-	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
-)(
-	// Global Control
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output, ASSOCIATED_RESET ap_rst_n" *)
-	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
-	input	ap_clk,
-	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
-	input	ap_rst_n,
-
-	// Weight Stream
-	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input	s_axis_weights_tvalid,
-	output	s_axis_weights_tready,
-
-	// Input Stream
-	input	[INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input	s_axis_input_tvalid,
-	output	s_axis_input_tready,
-
-	// Output Stream
-	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
-	output	m_axis_output_tvalid,
-	input	m_axis_output_tready
-);
-
-mvu_8sx9_axi #(
-	.MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-	.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-	.SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE)
-	) inst (
-	.ap_clk(ap_clk),
-	.ap_rst_n(ap_rst_n),
-	.s_axis_weights_tdata(s_axis_weights_tdata),
-	.s_axis_weights_tvalid(s_axis_weights_tvalid),
-	.s_axis_weights_tready(s_axis_weights_tready),
-	.s_axis_input_tdata(s_axis_input_tdata),
-	.s_axis_input_tvalid(s_axis_input_tvalid),
-	.s_axis_input_tready(s_axis_input_tready),
-	.m_axis_output_tdata(m_axis_output_tdata),
-	.m_axis_output_tvalid(m_axis_output_tvalid),
-	.m_axis_output_tready(m_axis_output_tready)
-);
-
-endmodule : $MODULE_NAME_AXI_WRAPPER$
diff --git a/finn-rtllib/mvu/mvu_8sx9_tb.sv b/finn-rtllib/mvu/mvu_8sx9_tb.sv
deleted file mode 100644
index adf6a8f9c2..0000000000
--- a/finn-rtllib/mvu/mvu_8sx9_tb.sv
+++ /dev/null
@@ -1,165 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Testbench for MVU core compute kernel.
- *****************************************************************************/
-
-module mvu_8sx9_tb();
-
-//-------------------- Simulation parameters --------------------\\
-	// Matrix & parallelism config
-	localparam int unsigned MH = 256;
-	localparam int unsigned PE = 16;
-	localparam int unsigned MW = 600;
-	localparam int unsigned SIMD = 60;
-	localparam int unsigned SEGMENTLEN = 4;
-	// Bit-width config  
-	localparam int unsigned ACTIVATION_WIDTH = 8;
-	localparam int unsigned WEIGHT_WIDTH = 4;
-	localparam bit SIGNED_ACTIVATIONS = 1;
-	// Simulation constants
-	localparam int unsigned NF = MH/PE;
-	localparam int unsigned SF = MW/SIMD;
-	localparam int unsigned NUM_OF_DSP = SIMD/3;
-
-	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-	typedef activation_t activation_vector_t[SF];
-
-	function activation_vector_t init_ACTIVATIONS;
-		automatic activation_vector_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_ACTIVATIONS
-
-	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-	typedef weight_t weight_matrix_t[NF][SF];
-
-	function weight_matrix_t init_WEIGHTS;
-		automatic weight_matrix_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_WEIGHTS;
-
-	typedef logic signed [PE-1:0][57:0] output_t;
-	typedef output_t output_vector_t [NF];
-
-	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
-		automatic output_vector_t res = '{default: 0};
-		for (int j = 0; j<MH; j++) begin
-			for (int i = 0; i<MW; i++) begin
-				res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-			end
-		end  
-		return res;
-	endfunction : check_output;
-
-	logic clk = 0;
-	always #5ns clk = !clk;
-
-	logic rst;
-	initial begin
-		rst = 1;
-		repeat(16) @(posedge clk);
-		rst <= 0;
-	end
-
-	logic last;
-	logic zero;
-	logic vld;
-	activation_t a;
-	weight_t w;
-	output_t p;
-	// Reference signals
-	activation_vector_t ACTIVATIONS; //   [SF-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0]
-	weight_matrix_t WEIGHTS; //           [NF-1:0][SF-1:0][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
-	output_vector_t GOLDEN_OUTPUT; //     [NF-1:0][PE-1:0][57:0]
-	// Counter for number of outputs (NF dimension) that are produced
-	int NF_CNT = 0;
-
-	initial begin
-		ACTIVATIONS = init_ACTIVATIONS();
-		WEIGHTS = init_WEIGHTS();
-		GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
-		last = 0;
-		zero = 0;
-		a = 'x;
-		w = 'x;
-
-		@(posedge clk iff !rst);
-
-		for (int j=0; j<NF; j++) begin
-			for (int i=0; i<SF; i++) begin
-				last <= (i==SF-1) ? 1 : 0;
-				a <= ACTIVATIONS[i];
-				w <= WEIGHTS[j][i];
-				@(posedge clk iff en);
-			end
-		end
-
-		last <= 0;
-		zero <= 1;  
-
-		// Continue until all NF outputs are produced & compared
-		@(posedge clk && (NF_CNT==NF));
-
-		$finish;
-	end
-
-	logic en = 0;
-	always_ff @(posedge clk) begin
-		en <= ($urandom()%7 > 1) && !rst;
-	end
-
-	// Compare computed output against golden output when vld flag is raised by DUT
-	always_ff @(posedge clk iff (vld && en)) begin
-		foreach(p[i]) begin
-			assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-			else begin 
-				$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-				$stop;
-			end  
-		end
-		NF_CNT += 1;
-	end
-
-	// Instantiate DUT
-	mvu_8sx9 #(
-		.PE(PE),
-		.SIMD(SIMD),
-		.WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-		.SEGMENTLEN(SEGMENTLEN)
-	)
-	dut (
-		.clk, .rst, .en, .last, .zero, .a, .w, .vld, .p
-	);
-
-endmodule
diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
deleted file mode 100644
index 07ad32e6c8..0000000000
--- a/finn-rtllib/mvu/mvu_axi.sv
+++ /dev/null
@@ -1,248 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper.
- * @details
- *	 The following compute cores are supported:
- *   - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, 
- *     (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP,
- *     [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP,
- *     'unconstrained' LUT-based MVU and VVU.
- *  Folding hints:
- *	 - PE scaling should divide MH.
- *   - SIMD scaling should divide MW.
- *	 - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to
- *	   impact critical paths more than PE scaling. PE scaling implies a
- *	   bigger fanout on the input activations.
- *	 - Full unfolding along MH (PE=MH) results in no replay buffer instantiated
- *****************************************************************************/
-
-module mvu_vvu_axi #(
-	bit IS_MVU, // string type causes error in Vivado
-	parameter COMPUTE_CORE,
-	int unsigned MW,
-	int unsigned MH,
-	int unsigned PE,
-	int unsigned SIMD,
-	int unsigned ACTIVATION_WIDTH,
-	int unsigned WEIGHT_WIDTH,
-	int unsigned ACCU_WIDTH,
-	bit SIGNED_ACTIVATIONS = 0,
-	int unsigned SEGMENTLEN = 0,
-	bit FORCE_BEHAVIORAL = 0,
-	bit M_REG_LUT = 1,
-
-	// Safely deducible parameters
-	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
-	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
-	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned SF = MW/SIMD,
-	localparam int unsigned NF = MH/PE,
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
-)
-(
-	// Global Control
-	input	logic  ap_clk,
-	input	logic  ap_rst_n,
-
-	// Weight Stream
-	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input	logic  s_axis_weights_tvalid,
-	output	logic  s_axis_weights_tready,
-
-	// Input Stream
-	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input	logic  s_axis_input_tvalid,
-	output	logic  s_axis_input_tready,
-
-	// Output Stream
-	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
-	output	logic  m_axis_output_tvalid,
-	input	logic  m_axis_output_tready
-);
-
-//-------------------- Parameter sanity checks --------------------\\
-	initial begin
-		if (MW % SIMD != 0) begin
-			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
-			$finish;
-		end
-		if (MH % PE != 0) begin
-			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
-			$finish;
-		end
-		if (WEIGHT_WIDTH > 8) begin
-			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
-			$finish;
-		end
-		if (ACTIVATION_WIDTH > 8) begin
-			if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin
-				$error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH);
-				$finish;
-			end
-		end
-		if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin
-			if (SEGMENTLEN == 0) begin
-				$warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
-			end
-			if (SEGMENTLEN > (SIMD+2)/3) begin
-				$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
-				$finish;
-			end
-		end
-		if (!IS_MVU) begin
-			if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin
-				$error("VVU only supported on DSP58 or LUT-based implementation");
-				$finish;
-			end
-		end
-	end
-
-	uwire clk = ap_clk;
-	uwire rst = !ap_rst_n;
-
-	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;
-
-	uwire mvauin_t amvau;
-	uwire alast;
-	uwire afin;
-	uwire avld;
-	uwire ardy;
-
-	replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay (
-	.clk, .rst,
-	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
-	.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
-	);
-
-//-------------------- Input control --------------------\\
-	uwire en;
-	uwire istb = avld && s_axis_weights_tvalid;
-	assign ardy = en && s_axis_weights_tvalid;
-	assign s_axis_weights_tready = en && avld;
-
-//-------------------- Core MVU/VVU --------------------\\
-	uwire ovld;
-	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
-	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
-	uwire mvauin_t amvau_i;
-
-	if (IS_MVU) begin : genMVUInput
-		assign  amvau_i = amvau;
-	end : genMVUInput
-	else begin : genVVUInput
-		// The input stream will have the channels interleaved for VVU when PE>1
-		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
-		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
-		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
-		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i)
-		localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH;
-		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
-			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
-									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
-									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
-		end : genRewire
-	end : genVVUInput
-
-	case(COMPUTE_CORE)
-	"mvu_vvu_8sx9_dsp58":
-		mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
-		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
-		);
-	"mvu_4sx4u":
-		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
-		);
-	"mvu_8sx8u_dsp48":
-		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
-		);
-	"mvu_vvu_lut":
-		mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
-		);
-	default: initial begin
-		$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
-		$finish;
-	end
-	endcase
-
-//-------------------- Output register slice --------------------\\
-	struct packed {
-		logic vld;
-		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} A = '{ vld: 0, default: 'x};
-
-	assign en = !A.vld || !ovld;
-
-	uwire  b_load;
-	always_ff @(posedge clk) begin
-		if(rst)		A <= '{ vld: 0, default: 'x };
-		else if(!A.vld || b_load) begin
-			A.vld <= ovld && en;
-			for(int unsigned  i = 0; i < PE; i++) begin
-				// CR-1148862:
-				// A.dat[i] <= odat[i];
-				automatic logic [ACCU_WIDTH-1:0]  v = odat[i];
-				A.dat[i] <= v[ACCU_WIDTH-1:0];
-			end
-		end
-	end
-
-	struct packed {
-		logic vld;
-		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} B = '{ vld: 0, default: 'x};
-
-	assign	b_load = !B.vld || m_axis_output_tready;
-	always_ff @(posedge clk) begin
-		if(rst)		B <= '{ vld: 0, default: 'x };
-		else begin
-			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
-		end
-	end
-
-	assign	m_axis_output_tvalid = B.vld;
-	assign	m_axis_output_tdata  = B.dat;
-
-endmodule : mvu_vvu_axi
diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v
deleted file mode 100644
index 239c5bbacd..0000000000
--- a/finn-rtllib/mvu/mvu_axi_wrapper.v
+++ /dev/null
@@ -1,92 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Verilog AXI-lite wrapper for MVU.
- *****************************************************************************/
-
-module $MODULE_NAME_AXI_WRAPPER$ #(
-	parameter 	MW = $MW$,
-	parameter	MH = $MH$,
-	parameter 	PE = $PE$,
-	parameter 	SIMD = $SIMD$,
-	parameter 	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
-	parameter 	WEIGHT_WIDTH = $WEIGHT_WIDTH$,
-	parameter 	ACCU_WIDTH = $ACCU_WIDTH$,
-	parameter 	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
-	parameter 	SEGMENTLEN = $SEGMENTLEN$,
-	parameter	MVU_IMPL_STYLE = "$MVU_IMPL_STYLE$",
-	parameter	FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$,
-
-	// Safely deducible parameters
-	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	parameter 	INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
-	parameter 	OUTPUT_LANES = PE,
-	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
-)(
-	// Global Control
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
-	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
-	input	ap_clk,
-	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
-	input	ap_rst_n,
-
-	// Weight Stream
-	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  weights_V_TDATA,
-	input   weights_V_TVALID,
-	output  weights_V_TREADY,
-	// Input Stream
-	input	[INPUT_STREAM_WIDTH_BA-1:0]  in0_V_TDATA,
-	input	in0_V_TVALID,
-	output	in0_V_TREADY,
-	// Output Stream
-	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  out_V_TDATA,
-	output	out_V_TVALID,
-	input	out_V_TREADY
-);
-
-mvu_axi #(
-	.MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-	.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-	.SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), .MVU_IMPL_STYLE(MVU_IMPL_STYLE)
-	) inst (
-	.ap_clk(ap_clk),
-	.ap_rst_n(ap_rst_n),
-	.s_axis_weights_tdata(weights_V_TDATA),
-	.s_axis_weights_tvalid(weights_V_TVALID),
-	.s_axis_weights_tready(weights_V_TREADY),
-	.s_axis_input_tdata(in0_V_TDATA),
-	.s_axis_input_tvalid(in0_V_TVALID),
-	.s_axis_input_tready(in0_V_TREADY),
-	.m_axis_output_tdata(out_V_TDATA),
-	.m_axis_output_tvalid(out_V_TVALID),
-	.m_axis_output_tready(out_V_TREADY)
-);
-
-endmodule : $MODULE_NAME_AXI_WRAPPER$
diff --git a/finn-rtllib/mvu/mvu_lut.sv b/finn-rtllib/mvu/mvu_lut.sv
deleted file mode 100644
index c100910d75..0000000000
--- a/finn-rtllib/mvu/mvu_lut.sv
+++ /dev/null
@@ -1,104 +0,0 @@
-module mvu_vvu_lut #(
-    bit IS_MVU,
-    int unsigned  PE,
-    int unsigned  SIMD,
-	int unsigned  ACCU_WIDTH,
-    int unsigned  ACTIVATION_WIDTH,
-    int unsigned  WEIGHT_WIDTH,
-    bit  SIGNED_ACTIVATIONS,
-    bit  M_REG = 1,
-
-    localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH,
-    localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
-)(
-	// Global Control
-	input	logic  clk,
-	input	logic  rst,
-	input	logic  en,
-
-	// Input
-	input	logic  last,
-	input	logic  zero,	// ignore current inputs and force this partial product to zero
-	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]             w,	// signed weights
-	input	logic        [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0]  a,	// (un)signed activations
-
-	// Ouput
-	output	logic  vld,
-	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
-);
-
-    typedef int unsigned  leave_load_t[2*SIMD-1];
-    function leave_load_t init_leave_loads();
-        automatic leave_load_t  res;
-        for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;
-        for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];
-        return res;
-    endfunction : init_leave_loads
-
-    // Pipeline for last indicator flag
-    uwire last_i;
-    generate if (M_REG) begin
-        logic [0:1] L = '0;
-        always_ff @(posedge clk) begin
-            if(rst)       L <= '0;
-            else if (en)  L <= {last, L[0]};
-        end
-        assign  last_i = L[1];
-    end
-    else begin 
-        logic L = '0;
-        always_ff @(posedge clk) begin
-            if(rst)       L <= '0;
-            else if (en)  L <= last;
-        end
-        assign  last_i = L;
-    end
-    endgenerate
-
-    // For each PE generate
-    for (genvar  i = 0; i < PE; i++)  begin : genPE
-        // Stage #1: SIMD multipliers in parallel
-        uwire [MULT_WIDTH-1 : 0] m1 [SIMD];
-        for (genvar j = 0; j < SIMD; j++) begin : genSIMD
-            if (M_REG) begin : genMreg
-                logic [MULT_WIDTH-1 : 0] M [SIMD];
-                always_ff @(posedge clk) begin
-                    if(rst)         M[j] = '{ default : 0 };
-                    else if (en)    M[j] = zero ? 0 :
-                                            SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
-                                                                 $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); 
-                    // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication
-                end
-                assign  m1[j] = M[j];
-            end : genMreg
-            else begin : genNoMreg 
-                assign m1[j] = zero ? 0 :
-                               SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
-                                                    $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]);
-            end : genNoMreg
-        end : genSIMD
-
-        // Stage #2: Adder tree to reduce SIMD products
-        localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 };
-        localparam int unsigned  ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1));
-        uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
-        for(genvar s = 0; s < SIMD; s++)  assign  tree[SIMD-1+s] = $signed(m1[s]);
-        for(genvar n = 0; n < SIMD-1; n++) begin
-            // Sum truncated to actual maximum bit width at this node
-            localparam int unsigned  NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1));
-            uwire signed [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
-            assign tree[n] = s;
-        end
-
-        // Stage #3: Buffer output
-        logic [ACCU_WIDTH-1:0] P2 [PE];
-        always_ff @(posedge clk) begin
-            if(rst)         P2[i] = '{ default : 0};
-            else if (en)    P2[i] = (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]);
-        end
-
-        assign  vld = last_i;
-        assign  p[i] = P2[i];
-    end : genPE
-
-endmodule : mvu_vvu_lut
diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
deleted file mode 100644
index b89b58f55b..0000000000
--- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv
+++ /dev/null
@@ -1,215 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Testbench for MVU AXI-lite interface wrapper.
- *****************************************************************************/
-
-module mvu_axi_tb();
-
-//-------------------- Simulation parameters --------------------\\
-	// Matrix & parallelism config
-	localparam int unsigned MW = 50;
-	localparam int unsigned MH = 8;
-	localparam int unsigned SIMD = 10;
-	localparam int unsigned PE = 2;
-	localparam int unsigned SEGMENTLEN = 2;
-	localparam string MVU_IMPL_STYLE = "mvu_8sx8u_dsp48";
-	localparam bit FORCE_BEHAVIORAL = 1;
-	// Bit-width config
-	localparam int unsigned ACTIVATION_WIDTH = 8;
-	localparam int unsigned WEIGHT_WIDTH = 8;
-	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
-	localparam bit SIGNED_ACTIVATIONS = 0;
-	// Simulation constants
-	localparam int unsigned NF = MH/PE;
-	localparam int unsigned SF = MW/SIMD;
-	localparam int unsigned NUM_OF_DSP = SIMD/3;
-	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
-	localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8;
-	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
-	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
-
-	// Generate clk and reset signal
-	logic clk = 0;
-	always #5ns clk = !clk;
-
-	logic ap_rst_n = 0;
-	initial begin
-		repeat(16) @(posedge clk);
-		ap_rst_n <= 1;
-	end
-
-	uwire ap_clk = clk;
-
-	// Generate activations
-	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-	typedef activation_t activation_vector_t[SF];
-
-	function activation_vector_t init_ACTIVATIONS;
-		automatic activation_vector_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_ACTIVATIONS
-
-	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
-
-	struct {
-		activation_t dat;
-		logic vld;
-		logic rdy;
-	} activations;
-
-	initial begin
-		activations.vld = 0;
-		activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
-		@(posedge clk iff ap_rst_n);
-
-		for (int i=0; i<SF; i++) begin
-			activations.dat <= ACTIVATIONS[i];
-			do begin
-				activations.vld <= $urandom()%7 >= 1;
-				@(posedge clk);
-			end while (!(activations.vld === 1 && activations.rdy === 1));
-		end
-
-		activations.vld <= 0;
-		activations.dat <= 'x;
-	end
-
-	// Generate weights
-	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-	typedef weight_t weight_matrix_t[NF][SF];
-
-	function weight_matrix_t init_WEIGHTS;
-		automatic weight_matrix_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_WEIGHTS;
-
-	weight_matrix_t WEIGHTS = init_WEIGHTS();
-
-	struct {
-		weight_t dat;
-		logic vld;
-		logic rdy;
-	} weights;
-
-	initial begin
-		weights.vld = 0;
-		weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
-		@(posedge clk iff ap_rst_n);
-
-		weights.vld <= 1;
-		for (int i=0; i<NF; i++) begin
-			for (int j=0; j<SF; j++) begin
-				weights.dat <= WEIGHTS[i][j];
-				@(posedge clk iff weights.rdy);
-			end
-		end
-
-		weights.vld <= 0;
-		weights.dat <= 'x;
-	end
-
-	// Function to compute golden output
-	// a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
-	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
-	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
-	typedef output_t output_vector_t [NF];
-
-	struct {
-		output_t dat;
-		logic vld;
-		logic rdy;
-	} outputs;
-
-	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
-		automatic output_vector_t res = '{default: 0};
-		for (int j = 0; j<MH; j++) begin
-			for (int i = 0; i<MW; i++) begin
-				if (SIGNED_ACTIVATIONS)
-					res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-				else
-					res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-			end
-		end
-		return res;
-	endfunction : check_output;
-
-	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
-
-	int unsigned NF_CNT = 0;
-	initial begin
-		outputs.rdy = 0;
-		while (NF_CNT < NF) begin
-			// Loop until both rdy & vld are asserted
-			do begin
-				outputs.rdy <= $urandom()%7 >= 1;
-				@(posedge clk iff ap_rst_n);
-			end while (!(outputs.rdy === 1 && outputs.vld === 1));
-
-			// Compare produced outputs against golden outputs
-			foreach(outputs.dat[i]) begin
-				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-				else begin
-					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-					$stop;
-				end
-			end
-
-			NF_CNT += 1;
-		end
-
-		$finish;
-	end
-
-	// Instantiate DUT
-	mvu_axi #(
-		.MW(MW),
-		.MH(MH),
-		.PE(PE),
-		.SIMD(SIMD),
-		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-		.WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.ACCU_WIDTH(ACCU_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-		.SEGMENTLEN(SEGMENTLEN),
-		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL),
-		.MVU_IMPL_STYLE(MVU_IMPL_STYLE)
-	)
-	dut (
-		.ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
-		.s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
-		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
-		.m_axis_output_tready(outputs.rdy)
-	);
-
-endmodule : mvu_axi_tb

From 14c5fa902820396e3489a244dc4d705fd1ebe532 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 17:12:47 +0100
Subject: [PATCH 167/235] [mvu vvu 8sx9]: renamed for consistency

---
 finn-rtllib/mvu/{mvu_vvu_8sx9.sv => mvu_vvu_8sx9_dsp58.sv} | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename finn-rtllib/mvu/{mvu_vvu_8sx9.sv => mvu_vvu_8sx9_dsp58.sv} (99%)

diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
similarity index 99%
rename from finn-rtllib/mvu/mvu_vvu_8sx9.sv
rename to finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
index 2aa9d71b6c..6ae117e3ab 100644
--- a/finn-rtllib/mvu/mvu_vvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
@@ -31,7 +31,7 @@
  * @brief	Matrix Vector Unit (MVU) core compute kernel utilizing DSP58.
  *****************************************************************************/
 
-module mvu_vvu_8sx9 #(
+module mvu_vvu_8sx9_dsp58 #(
 	bit IS_MVU,
     int unsigned PE,
     int unsigned SIMD,
@@ -424,4 +424,4 @@ module mvu_vvu_8sx9 #(
 		end : genDSPChain
 	end : genDSPPE
 
-endmodule : mvu_vvu_8sx9
+endmodule : mvu_vvu_8sx9_dsp58

From 3a3758826512fd3d5ed0bcdd23358d5fd5b724cd Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 17:13:25 +0100
Subject: [PATCH 168/235] [mvu vvu axi]: changes for renamed module

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index ff677fc244..416480da79 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -174,7 +174,7 @@ module mvu_vvu_axi #(
 
 	case(COMPUTE_CORE)
 	"mvu_vvu_8sx9_dsp58":
-		mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,

From afe36baa134b947718db34d140c8d6500b91cb2a Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 25 Sep 2023 13:44:17 +0100
Subject: [PATCH 169/235] [mvu vvu wrapper]: convert localparam to param

---
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
index 270fe7351f..9c65dbc06e 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -46,9 +46,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter	FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$,
 
 	// Safely deducible parameters
-	localparam	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	localparam 	INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
-	localparam 	OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+	parameter	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	parameter 	INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
+	parameter 	OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
 )(
 	// Global Control
 	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)

From e4f2f9e0e4f1cb0bae2bf7e439c57356b3670620 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 25 Sep 2023 13:45:48 +0100
Subject: [PATCH 170/235] [mvau-rtl custom-op]: bugfix to instantiate
 memstreamer, modified renamed files and axi wrapper template fill-out

---
 .../matrixvectoractivation_rtl.py             | 92 ++++++++++---------
 1 file changed, 51 insertions(+), 41 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index 9f8130806b..c7fb855884 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -61,8 +61,7 @@
 
 
 class MatrixVectorActivation_rtl(HLSCustomOp):
-    """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch
-    function."""
+    """Class that corresponds to finn-rtl Matrix Vector Unit."""
 
     def __init__(self, onnx_node, **kwargs):
         super().__init__(onnx_node, **kwargs)
@@ -73,8 +72,7 @@ def get_nodeattr_types(self):
             "SIMD": ("i", True, 0),
             "MW": ("i", True, 0),
             "MH": ("i", True, 0),
-            "resType": ("s", False, "lut", {"auto", "lut", "dsp"}),
-            "ActVal": ("i", False, 0),
+            "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}),
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
             "weightDataType": ("s", True, ""),
@@ -165,7 +163,6 @@ def verify_node(self):
         # verify that all necessary attributes exist
         # TODO collect automatically from get_nodeattr_types
         try:
-            self.get_nodeattr("code_gen_dir_cppsim")
             self.get_nodeattr("executable_path")
             self.get_nodeattr("resType")
             self.get_nodeattr("MW")
@@ -199,7 +196,6 @@ def verify_node(self):
 
         return info_messages
 
-    # TODO: Add in replay_buffer estimation
     def uram_estimation(self):
         P = self.get_nodeattr("PE")
         Q = self.get_nodeattr("SIMD")
@@ -213,7 +209,6 @@ def uram_estimation(self):
         mstyle = self.get_nodeattr("ram_style")
         if (
             (mmode == "decoupled" and mstyle != "ultra")
-            or (mmode == "const" and self.calc_wmem() <= 128)
             or (mmode == "external")
         ):
             return 0
@@ -221,7 +216,6 @@ def uram_estimation(self):
         depth_multiplier = math.ceil(omega / 4096)
         return width_multiplier * depth_multiplier
 
-    # TODO: Add in replay_buffer estimation
     def bram_estimation(self):
         """Calculates resource estimation for BRAM based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -243,7 +237,6 @@ def bram_estimation(self):
         mstyle = self.get_nodeattr("ram_style")
         if (
             (mmode == "decoupled" and mstyle in ["distributed", "ultra"])
-            or (mmode == "const" and self.calc_wmem() <= 128)
             or (mmode == "external")
         ):
             return 0
@@ -262,7 +255,6 @@ def bram_estimation(self):
         else:
             return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36))
 
-    # TODO: Add in replay_buffer estimation
     def bram_efficiency_estimation(self):
         wdt = self.get_weight_datatype()
         W = wdt.bitwidth()
@@ -275,7 +267,6 @@ def bram_efficiency_estimation(self):
         bram16_est_capacity = bram16_est * 36 * 512
         return wbits / bram16_est_capacity
 
-    # TODO: Add in replay_buffer estimation
     def uram_efficiency_estimation(self):
         """Function for URAM efficiency estimation: actual parameter storage
         needed divided by the allocated URAM storage (from estimation)"""
@@ -290,7 +281,7 @@ def uram_efficiency_estimation(self):
         uram_est_capacity = uram_est * 72 * 4096
         return wbits / uram_est_capacity
 
-    # TODO: FIX: worst case estimates since segmentlen is not known at this point?
+# TODO: fix lut estimations 
     def lut_estimation(self):
         """Calculates resource estimations for LUTs based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -333,9 +324,13 @@ def lut_estimation(self):
 
         return int(c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2)
 
-    # TODO: FIX: worst case estimates since segmentlen is not known at this point?
+# TODO: fix DSP estimations --> depends on fpga_part
     def dsp_estimation(self):
         # multiplication
+        # mvu_8sx9 (DSP58): ceil(SIMD/3)
+        # mvu_4sx4u (DSP48/DSP58): ceil(PE/4)
+        # mvu_8sx8u (DSP48): ceil(PE/2)
+        # mvu_lut: 0
         P = self.get_nodeattr("PE")
         res_type = self.get_nodeattr("resType")
         Q = self.get_nodeattr("SIMD")
@@ -349,18 +344,24 @@ def dsp_estimation(self):
             mult_dsp = 0
         return int(mult_dsp)
 
-    # TODO: FIX: worst case estimates since segmentlen is not known at this point
+# TODO: fix exp_cycles estimations --> depends on fpga_part and clk
     def get_exp_cycles(self):
+        # mvu_8sx9 (DSP58):
+        # 2 (replay_buffer) + ceil(chainlen/seglen) + 2 (MREG, PREG) + 2 (output reg slice)
+        # + MW/SIMD * MH/PE
+        # mvu_4sx4u (DSP48/DSP58) / mvu_8sx8u (DSP48): 
+        # 3 (IN_REG, MREG, PREG) + 2 (replay_buffer) + 2 (output reg slice) + 1 (adder tree SIMD) + 1 (output lane)
+        # + MW/SIMD * MH/PE
+        # mvu_lut:
+        # 2 (replay_buffer) + 1 OR 2 (no MREG OR MREG) + 2 (output reg slice) 
+        # + MW/SIMD * MH/PE
         pe = self.get_nodeattr("PE")
         simd = self.get_nodeattr("SIMD")
         num_inp_vec = self.get_nodeattr("numInputVectors")
         mh = self.get_nodeattr("MH")
         mw = self.get_nodeattr("MW")
         # since mmv != 1 is not supported yet, we set mmv for now to 1
-        mmv = 1
-        # Actual exp_cycles is probably slightly larger (say 3 cycles
-        # (DSP A/B, M, P - reg) + additional pipeline buffer cycles.
-        # Most probably <10)
+        mmv = 1     
         exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv
         return int(exp_cycles)
 
@@ -711,7 +712,7 @@ def execute_node(self, context, graph):
         else:
             raise Exception(
                 """Invalid value for attribute exec_mode! Is currently set to: {}
-            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+            has to be set to "rtlsim" """.format(
                     mode
                 )
             )
@@ -795,11 +796,12 @@ def code_generation_ipi(self):
                 os.path.join(
                     code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
                 ),
-                rtllib_dir + "mvu_axi.sv",
+                rtllib_dir + "mvu_vvu_axi.sv",
                 rtllib_dir + "replay_buffer.sv",
                 rtllib_dir + "mvu_4sx4u.sv",
-                rtllib_dir + "mvu_8sx9.sv",
+                rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
                 rtllib_dir + "mvu_8sx8u_dsp48.sv",
+                rtllib_dir + "mvu_vvu_lut.sv",
             ]
             for f in sourcefiles:
                 cmd.append("add_files -norecurse %s" % (f))
@@ -813,7 +815,7 @@ def code_generation_ipi(self):
             )
 
             # instantiate a streamer and connect it to the HLS IP
-            strm_vlnv = "amd.com:FINN:memstream:1.0"
+            strm_vlnv = "amd.com:finn:memstream:1.0"
             strm_inst = node_name + "_wstrm"
             cmd.append(
                 "create_bd_cell -type ip -vlnv %s /%s/%s"
@@ -890,11 +892,12 @@ def code_generation_ipi(self):
                 os.path.join(
                     code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
                 ),
-                rtllib_dir + "mvu_axi.sv",
+                rtllib_dir + "mvu_vvu_axi.sv",
                 rtllib_dir + "replay_buffer.sv",
                 rtllib_dir + "mvu_4sx4u.sv",
-                rtllib_dir + "mvu_8sx9.sv",
+                rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
                 rtllib_dir + "mvu_8sx8u_dsp48.sv",
+                rtllib_dir + "mvu_vvu_lut.sv",
             ]
             for f in sourcefiles:
                 cmd.append("add_files -norecurse %s" % (f))
@@ -959,27 +962,32 @@ def derive_characteristic_fxns(self, period):
             ]
         super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
 
-    # TODO: characterize max_clk and implement this function in look-up style
     def _resolve_segment_len(self, clk):
-        # Insert pipeline registers in the DSP chain to meet target clock frequency
-        return 4 # default to 4 for now
+        # Insert pipeline registers in the DSP58 chain to meet target clock frequency
+        # 0.741 ns seems the worst-case delay through first DSP
+        # 0.605 ns seems to be (on average) delay for all subsequent DSPs
+        dsp_chain_len = np.floor((clk - 0.741) / 0.605)
+        return max(1, dsp_chain_len)
 
     def _resolve_impl_style(self, fpgapart):
         # Based on target device and activation/weight-width, choose the
-        # supported RTL module
-        act_width = self.get_input_datatype(0).bitwidth()
-        weight_width = self.get_input_datatype(1).bitwidth()
-        is_versal = (
-            fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
-            or fpgapart[0:5] == "xqrvc"
-        )
-        if act_width == 4 and weight_width == 4:
-            return "mvu_4sx4u"
+        # supported RTL compute core
+        if self.get_nodeattr("resType") == "lut":
+            return "mvu_vvu_lut"
         else:
-            if is_versal:
-                return "mvu_8sx9_dsp58"
+            act_width = self.get_input_datatype(0).bitwidth()
+            weight_width = self.get_input_datatype(1).bitwidth()
+            is_versal = (
+                fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
+                or fpgapart[0:5] == "xqrvc"
+            )
+            if act_width == 4 and weight_width == 4:
+                return "mvu_4sx4u"
             else:
-                return "mvu_8sx8u_dsp48"
+                if is_versal:
+                    return "mvu_vvu_8sx9_dsp58"
+                else:
+                    return "mvu_8sx8u_dsp48"
 
     def generate_hdl(self, model, fpgapart, clk):
         # Generate params as part of IP preparation
@@ -1023,9 +1031,11 @@ def generate_hdl(self, model, fpgapart, clk):
         self.set_nodeattr("ip_path", code_gen_dir)
 
     def prepare_codegen_default(self, fpgapart, clk):
-        template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v"
+        template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v"
 
         code_gen_dict = {}
+        code_gen_dict["$IS_MVU$"] = [str(1)]
+        code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)]
         code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))]
         code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))]
         code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))]
@@ -1039,7 +1049,7 @@ def prepare_codegen_default(self, fpgapart, clk):
             [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
         )
         code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))]
-        code_gen_dict["$MVU_IMPL_STYLE$"] = [self._resolve_impl_style(fpgapart)]
+        code_gen_dict["$FORCE_BEHAVIORAL$"] = [str(0)]
 
         return template_path, code_gen_dict
 

From b49b79a0a669caad9355e59e1ee877ca59b65d27 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 25 Sep 2023 13:47:50 +0100
Subject: [PATCH 171/235] [specialize to rtl]: fix to changed attribute name
 and added support for converting HLS-based VVU custom-ops to RTL-based
 custom-ops

---
 .../fpgadataflow/specialize_to_rtl_layers.py  | 82 ++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
index 47ed5ce863..5061282695 100644
--- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
+++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
@@ -26,6 +26,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import numpy as np
 from qonnx.transformation.base import Transformation
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.core.datatype import DataType
@@ -60,7 +61,7 @@ def apply(self, model):
         for n in graph.node:
             node_ind += 1
             if n.op_type == "MatrixVectorActivation":
-                preferred_in_rtl = getCustomOp(n).get_nodeattr("impl") == "rtl" and getCustomOp(n).get_nodeattr("resType") == "dsp"
+                preferred_in_rtl = getCustomOp(n).get_nodeattr("preferred_backend") == "rtl"
                 supported_in_rtl = self._is_rtl_variant_compatible(n)
                 if (preferred_in_rtl and supported_in_rtl):
                     mvau_input = n.input[0]
@@ -76,6 +77,7 @@ def apply(self, model):
                     pe = getCustomOp(n).get_nodeattr("PE")
                     mem_mode = getCustomOp(n).get_nodeattr("mem_mode")
                     ram_style = getCustomOp(n).get_nodeattr("ram_style")
+                    resType = getCustomOp(n).get_nodeattr("resType")
                     runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights")
 
                     new_node = helper.make_node(
@@ -93,6 +95,7 @@ def apply(self, model):
                         outputDataType=outputDataType,
                         numInputVectors=numInputVectors,
                         mem_mode=mem_mode,
+                        resType=resType,
                         name=n.name + "_rtl",
                         ram_style=ram_style,
                         runtime_writeable_weights=runtime_writeable_weights
@@ -108,4 +111,81 @@ def apply(self, model):
             model = model.transform(InferDataTypes())
             model = model.transform(GiveUniqueNodeNames())
         
+        return (model, graph_modified)
+
+class InferRTLVectorVectorActivation(Transformation):
+    """Convert (HLS-based) VectorVectorActivation layers to specialized RTL layers if supported."""
+
+    def __init__(self):
+        super().__init__()
+
+    def _is_rtl_variant_compatible(self, n):
+        no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1
+        act_width_in_range = (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8) or (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0)
+        weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8
+        folding_supported = (getCustomOp(n).get_nodeattr("Channels") % getCustomOp(n).get_nodeattr("PE") == 0) and (np.prod(getCustomOp(n).get_nodeattr("Kernel")) % getCustomOp(n).get_nodeattr("SIMD") == 0)
+        
+        if (no_activation and act_width_in_range and weight_width_in_range and folding_supported):
+            return True
+        else:
+            return False
+    
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "VectorVectorActivation":
+                preferred_in_rtl = getCustomOp(n).get_nodeattr("preferred_backend") == "rtl"
+                supported_in_rtl = self._is_rtl_variant_compatible(n)
+                if (preferred_in_rtl and supported_in_rtl):
+                    vvau_input = n.input[0]
+                    vvau_weight = n.input[1]
+                    vvau_output = n.output[0]
+                    inputDataType = getCustomOp(n).get_nodeattr("inputDataType")
+                    weightDataType = getCustomOp(n).get_nodeattr("weightDataType")
+                    outputDataType = getCustomOp(n).get_nodeattr("outputDataType")
+                    pe = getCustomOp(n).get_nodeattr("PE")
+                    simd = getCustomOp(n).get_nodeattr("SIMD")
+                    dim = getCustomOp(n).get_nodeattr("Dim")
+                    channels = getCustomOp(n).get_nodeattr("Channels")
+                    kernel = getCustomOp(n).get_nodeattr("Kernel")
+                    resType = getCustomOp(n).get_nodeattr("resType")
+                    mem_mode = getCustomOp(n).get_nodeattr("mem_mode")
+                    runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights")
+                    ram_style = getCustomOp(n).get_nodeattr("ram_style")
+                    resType = getCustomOp(n).get_nodeattr("resType")                    
+
+                    new_node = helper.make_node(
+                        "VectorVectorActivation_rtl",
+                        [vvau_input, vvau_weight],
+                        [vvau_output],
+                        domain="finn.custom_op.fpgadataflow",
+                        backend="fpgadataflow",
+                        name=n.name + "_rtl",
+                        PE=pe,
+                        SIMD=simd,
+                        Dim=dim,
+                        Channels=channels,
+                        Kernel=kernel,
+                        resType=resType,
+                        inputDataType=inputDataType,
+                        weightDataType=weightDataType,
+                        outputDataType=outputDataType,
+                        mem_mode=mem_mode,
+                        runtime_writeable_weights=runtime_writeable_weights,
+                        ram_style=ram_style
+                    )
+                    graph.node.insert(node_ind, new_node)
+                    # remove old node
+                    graph.node.remove(n)
+                    graph_modified=True
+        
+        if graph_modified:
+            model = model.transform(MinimizeAccumulatorWidth())
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+            model = model.transform(GiveUniqueNodeNames())
+        
         return (model, graph_modified)
\ No newline at end of file

From 9bdba031df228a2afbe99b8ea2fb576b678bba86 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Tue, 19 Sep 2023 15:27:28 +0100
Subject: [PATCH 172/235] Adding core for DSP48 backport.

---
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 416c12c1cc..07c44cf89a 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -4,7 +4,9 @@ module mvu_8sx8u_dsp48 #(
 	int unsigned  ACCU_WIDTH,
 	int unsigned  ACTIVATION_WIDTH,
 	int unsigned  WEIGHT_WIDTH,
-	bit FORCE_BEHAVIORAL = 0,
+
+	bit  SIGNED_ACTIVATIONS = 0,
+	bit  FORCE_BEHAVIORAL = 0,
 
 	localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH
 )(
@@ -16,8 +18,8 @@ module mvu_8sx8u_dsp48 #(
 	// Input
 	input	logic  last,
 	input	logic  zero,	// ignore current inputs and force this partial product to zero
-	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]  w,	// signed weights
-	input	logic                [SIMD-1:0][ACTIVATION_WIDTH-1:0]  a,	// unsigned activations
+	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH    -1:0]  w,	// signed weights
+	input	logic                [SIMD-1:0][ACTIVATION_WIDTH-1:0]  a,	// unsigned activations (overridden by SIGNED_ACTIVATIONS)
 
 	// Ouput
 	output	logic  vld,
@@ -47,7 +49,7 @@ module mvu_8sx8u_dsp48 #(
 	assign	vld = L[5];
 
 	// Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
-    localparam int unsigned  D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets
+	localparam int unsigned  D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets
 
 	localparam int unsigned  PIPE_COUNT = (PE+1)/2;
 	for(genvar  c = 0; c < PIPE_COUNT; c++) begin : genPipes
@@ -61,7 +63,7 @@ module mvu_8sx8u_dsp48 #(
 		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
 
 			// Input Lane Assembly
-			uwire [23:0]  bb = a[s];
+			uwire [23:0]  bb = { {(24-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] };
 			logic [33:0]  aa;
 			logic [26:0]  dd;
 			logic [ 1:0]  xx;

From 2cf1ef70306339b1409ed61d8e18eda243bf56ad Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 25 Sep 2023 14:48:34 +0100
Subject: [PATCH 173/235] [mvu rtl core]: added support for signed activations
 for DSP48-based MVUs

---
 finn-rtllib/mvu/mvu_4sx4u.sv   | 3 ++-
 finn-rtllib/mvu/mvu_vvu_axi.sv | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 88985312c9..706347d700 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -2,6 +2,7 @@ module mvu_4sx4u #(
 	int unsigned  PE,
 	int unsigned  SIMD,
 	int unsigned  ACCU_WIDTH,
+	bit SIGNED_ACTIVATIONS = 0,
 	bit FORCE_BEHAVIORAL = 0
 )(
 	// Global Control
@@ -57,7 +58,7 @@ module mvu_4sx4u #(
 		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
 
 			// Input Lane Assembly
-			uwire [23:0]  bb = a[s];
+			uwire [23:0]  bb = { {(20){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] };
 			logic [33:0]  aa;
 			logic [26:0]  dd;
 			logic [ 1:0]  xx[3:1];
diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 416480da79..da7e00cc55 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -182,14 +182,14 @@ module mvu_vvu_axi #(
 			.vld(ovld), .p(odat)
 		);
 	"mvu_4sx4u":
-		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
 	"mvu_8sx8u_dsp48":
 		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)

From ab8d4a8e075ac9b3ccf78d2a08907d5dcc116fdb Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 25 Sep 2023 16:17:38 +0100
Subject: [PATCH 174/235] [rtl mvu custom-op]: add upper bound to SEGMENTLEN
 equal to number of DSP58s chained together

---
 src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index c7fb855884..d0a638475a 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -966,7 +966,9 @@ def _resolve_segment_len(self, clk):
         # Insert pipeline registers in the DSP58 chain to meet target clock frequency
         # 0.741 ns seems the worst-case delay through first DSP
         # 0.605 ns seems to be (on average) delay for all subsequent DSPs
-        dsp_chain_len = np.floor((clk - 0.741) / 0.605)
+        critical_path_dsps = np.floor((clk - 0.741) / 0.605)
+        max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3)
+        dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len
         return max(1, dsp_chain_len)
 
     def _resolve_impl_style(self, fpgapart):

From 74eb42bc2266071ccbd5e6fcfadc5bdf835463d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Fri, 29 Sep 2023 15:24:28 +0100
Subject: [PATCH 175/235] Starting on pumped DSP compute.

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 260 ++++++++++++++++++++++++---------
 1 file changed, 194 insertions(+), 66 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index da7e00cc55..54a4c092d7 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -51,26 +51,27 @@ module mvu_vvu_axi #(
 	int unsigned MH,
 	int unsigned PE,
 	int unsigned SIMD,
+	int unsigned SEGMENTLEN = 0,
+
 	int unsigned ACTIVATION_WIDTH,
 	int unsigned WEIGHT_WIDTH,
 	int unsigned ACCU_WIDTH,
 	bit SIGNED_ACTIVATIONS = 0,
-	int unsigned SEGMENTLEN = 0,
+
+	bit PUMPED_COMPUTE = 0,	// requires an even SIMD, i.e. SIMD % 2 == 0
 	bit FORCE_BEHAVIORAL = 0,
 	bit M_REG_LUT = 1,
 
 	// Safely deducible parameters
-	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
-	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
-	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned SF = MW/SIMD,
-	localparam int unsigned NF = MH/PE,
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
-)
-(
+	localparam int unsigned  WEIGHT_STREAM_WIDTH    = PE * SIMD * WEIGHT_WIDTH,
+	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7)/8 * 8,
+	localparam int unsigned  INPUT_STREAM_WIDTH     = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
+	localparam int unsigned  INPUT_STREAM_WIDTH_BA  = (INPUT_STREAM_WIDTH  + 7)/8 * 8,
+	localparam int unsigned  OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+)(
 	// Global Control
 	input	logic  ap_clk,
+	input	logic  ap_clk2x,	// only used when PUMPED_COMPUTE
 	input	logic  ap_rst_n,
 
 	// Weight Stream
@@ -124,38 +125,45 @@ module mvu_vvu_axi #(
 				$finish;
 			end
 		end
+
+		//- Pumping Constraints ---------
+		if(PUMPED_COMPUTE) begin
+			if(SIMD % 2 != 0) begin
+				$error("Odd SIMD=%0d is incompatible with pumped compute.", SIMD);
+				$finish;
+			end
+		end
 	end
 
 	uwire clk = ap_clk;
 	uwire rst = !ap_rst_n;
 
-	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;
-
-	uwire mvauin_t amvau;
+	//- Replay to Accommodate Neuron Fold -----------------------------------
+	typedef logic [(IS_MVU? 1:PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0]  mvu_flatin_t;
+	uwire mvu_flatin_t amvau;
 	uwire alast;
 	uwire afin;
 	uwire avld;
 	uwire ardy;
 
-	replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay (
-	.clk, .rst,
-	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
-	.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
+	localparam int unsigned  SF = MW/SIMD;
+	localparam int unsigned  NF = MH/PE;
+	replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvu_flatin_t))) activation_replay (
+		.clk, .rst,
+		.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)),
+		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
 	);
 
-//-------------------- Input control --------------------\\
-	uwire en;
-	uwire istb = avld && s_axis_weights_tvalid;
-	assign ardy = en && s_axis_weights_tvalid;
-	assign s_axis_weights_tready = en && avld;
+	//- Unflatten inputs into structured matrices ---------------------------
+	localparam int unsigned  ACT_PE = IS_MVU? 1 : PE;
+	typedef logic [PE    -1:0][SIMD-1:0][WEIGHT_WIDTH    -1:0]  mvu_w_t;
+	typedef logic [ACT_PE-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0]  mvu_a_t;
 
-//-------------------- Core MVU/VVU --------------------\\
-	uwire ovld;
-	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
-	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
-	uwire mvauin_t amvau_i;
+	uwire  mvu_w_t  mvu_w = s_axis_weights_tdata;
 
-	if (IS_MVU) begin : genMVUInput
+	//- Conditional Activations Layout Adjustment for VVU
+	uwire mvu_a_t  amvau_i;
+	if (IS_MVU || (PE == 1)) begin : genMVUInput
 		assign  amvau_i = amvau;
 	end : genMVUInput
 	else begin : genVVUInput
@@ -163,49 +171,169 @@ module mvu_vvu_axi #(
 		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
 		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
 		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
-		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i)
+		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i)
+		for(genvar  pe = 0; pe < (IS_MVU? 1:PE); pe++) begin
+			for(genvar  simd = 0; simd < SIMD; simd++) begin
+				assign	amvau_i[pe][simd] = amvau[];
+			end
+		end
+
 		localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH;
 		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
-			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
-									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
-									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
+			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] =
+									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH];
 		end : genRewire
 	end : genVVUInput
 
-	case(COMPUTE_CORE)
-	"mvu_vvu_8sx9_dsp58":
-		mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
-		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
-		);
-	"mvu_4sx4u":
-		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
-		);
-	"mvu_8sx8u_dsp48":
-		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
-		);
-	"mvu_vvu_lut":
-		mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-		.WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
-		);
-	default: initial begin
-		$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
-		$finish;
-	end
-	endcase
+	//- Flow Control Bracket around Compute Core ----------------------------
+	uwire en;
+	uwire istb = avld && s_axis_weights_tvalid;
+	assign ardy = en && s_axis_weights_tvalid;
+	assign s_axis_weights_tready = en && avld;
+
+	//- Conditionally Pumped DSP Compute ------------------------------------
+	typedef logic [PE-1:0][ACCU_WIDTH-1:0]  dsp_p_t;
+	uwire  ovld;
+	uwire dsp_p_t  odat;
+	if(1) begin : blkDsp
+		localparam int unsigned  DSP_SIMD = SIMD/(PUMPED_COMPUTE+1);
+		typedef logic [PE    -1:0][DSP_SIMD-1:0][WEIGHT_WIDTH    -1:0]  dsp_w_t;
+		typedef logic [ACT_PE-1:0][DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0]  dsp_a_t;
+
+		uwire  dsp_clk;
+		uwire  dsp_en;
+
+		uwire  dsp_last;
+		uwire  dsp_zero;
+		uwire dsp_w_t  dsp_w;
+		uwire dsp_a_t  dsp_a;
+
+		uwire  dsp_vld;
+		uwire dsp_p_t  dsp_p;
+
+		if(!PUMPED_COMPUTE) begin : genUnpumpedCompute
+			assign	dsp_clk = clk;
+			assign	dsp_en  = en;
+
+			assign	dsp_last = alast && avld;
+			assign	dsp_zero = !istb;
+			assign	dsp_w = mvu_w;
+			assign	dsp_a = amvau_i;
+
+			assign	ovld = dsp_vld;
+			assign	odat = dsp_p;
+		end : genUnpumpedCompute
+		else begin : genPumpedCompute
+			assign	dsp_clk = clk2x;
+
+			// Identify second fast cycle before active slow clock edge
+			logic  Active = 0;
+			always_ff @(posedge clk2x)  Active <= clk;
+
+			// The input for a slow cycle is split across two fast cycles along the SIMD dimension.
+			//	- Both fast cycles are controlled by the same enable state.
+			//	- A zero cycle is duplicated across both fast cycles.
+			//	- The last flag must be restricted to the second fast cycle.
+			logic  En = 0;
+			logic  Last[1:0] = '{ default: 1'b0 };
+			logic  Zero = 1;
+			dsp_w_t  W[1:0] = '{ default: 'x };
+			dsp_a_t  A[1:0] = '{ default: 'x };
+			always_ff @(posedge clk2x) begin
+				if(rst) begin
+					En   <= 0;
+					Last <= '{ default: 1'b0 };
+					Zero <=  1;
+					W <= '{ default: 'x };
+					A <= '{ default: 'x };
+				end
+				else begin
+					if(Active) begin
+						En <= en;
+						if(en) begin
+							Last <= '{ alast && avld, 1'b0 };
+							Zero <= !istb;
+							for(int unsigned  simd = 0; simd < SIMD; simd++) begin
+								for(int unsigned  pe = 0; pe < PE; pe++) begin
+									W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= mvu_w[pe][simd];
+								end
+								for(int unsigned  pe = 0; pe < ACT_PE; pe++) begin
+									A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= amvau_i[pe][simd];
+								end
+							end
+						end
+					end
+					else if(En) begin
+						Last <= '{ 'x, Last[1] };
+						W    <= '{ 'x, W[1] };
+						A    <= '{ 'x, A[1] };
+					end
+				end
+			end
+			assign	dsp_en = En;
+
+			assign	dsp_last = Last[0];
+			assign	dsp_zero = Zero;
+			assign	dsp_w = W[0];
+			assign	dsp_a = A[0];
+
+			// Since no two consecutive last cycles will ever be asserted on the input,
+			// valid outputs will also always be spaced by, at least, one other cycle.
+			// We can always hold a captured output for two cycles to allow the slow
+			// clock to pick it up.
+			logic    Vld = 0;
+			dsp_p_t  P = 'x;
+			always_ff @(posedge clk2x) begin
+				if(rst) begin
+					Vld <= 0;
+					P   <= 'x;
+				end
+				else begin
+					if(dsp_vld)  P <= dsp_p;
+					Vld <= dsp_vld || (Vld && !Active);
+				end
+			end
+			assign	ovld = Vld;
+			assign	odat = P;
+
+		end : genPumpedCompute
+
+		case(COMPUTE_CORE)
+		"mvu_vvu_8sx9_dsp58":
+			mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+			.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
+			.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+				.clk(dsp_clk), .rst, .en(dsp_en),
+				.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+				.vld(dsp_vld), .p(dsp_p)
+			);
+		"mvu_4sx4u":
+			mvu_4sx4u #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+				.clk(dsp_clk), .rst, .en(dsp_en),
+				.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+				.vld(dsp_vld), .p(dsp_p)
+			);
+		"mvu_8sx8u_dsp48":
+			mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+			.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+				.clk(dsp_clk), .rst, .en(dsp_en),
+				.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+				.vld(dsp_vld), .p(dsp_p)
+			);
+		"mvu_vvu_lut":
+			mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+			.WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
+				.clk(dsp_clk), .rst, .en(dsp_en),
+				.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+				.vld(dsp_vld), .p(dsp_p)
+			);
+		default: initial begin
+			$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
+			$finish;
+		end
+		endcase
+
+	end : blkDsp
 
 //-------------------- Output register slice --------------------\\
 	struct packed {

From d9e2fc610a45f4a2acd3970b1606e9389e65db2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Fri, 29 Sep 2023 15:36:52 +0100
Subject: [PATCH 176/235] Flag TODO.

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 54a4c092d7..78c9892b33 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -71,7 +71,7 @@ module mvu_vvu_axi #(
 )(
 	// Global Control
 	input	logic  ap_clk,
-	input	logic  ap_clk2x,	// only used when PUMPED_COMPUTE
+	input	logic  ap_clk2x,	// synchronous, double-speed clock; only used for PUMPED_COMPUTE
 	input	logic  ap_rst_n,
 
 	// Weight Stream
@@ -174,7 +174,7 @@ module mvu_vvu_axi #(
 		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i)
 		for(genvar  pe = 0; pe < (IS_MVU? 1:PE); pe++) begin
 			for(genvar  simd = 0; simd < SIMD; simd++) begin
-				assign	amvau_i[pe][simd] = amvau[];
+				assign	amvau_i[pe][simd] = amvau[];	// TODO: Do the right thing as below here.
 			end
 		end
 

From 5a429fcbe14ca6177082fab472549407f47f97d6 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 13 Oct 2023 23:29:39 +0100
Subject: [PATCH 177/235] [mvu_vvu dsp58]: change weight input to 2D instead of
 3D array

---
 finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
index 6ae117e3ab..53cf71fd5f 100644
--- a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
+++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
@@ -42,7 +42,8 @@ module mvu_vvu_8sx9_dsp58 #(
     int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
 	bit FORCE_BEHAVIORAL = 0,
 
-	localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
+	localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD,
+	localparam int unsigned WEIGHT_ELEMENTS = PE*SIMD
   )
   (
     // Global Control
@@ -53,7 +54,7 @@ module mvu_vvu_8sx9_dsp58 #(
 	// Input
     input   logic last,
     input   logic zero, // ignore current inputs and force this partial product to zero
-    input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights
+    input   logic [WEIGHT_ELEMENTS-1:0][WEIGHT_WIDTH-1:0] w, // weights
 	input   logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations
 
 	// Ouput
@@ -164,7 +165,8 @@ module mvu_vvu_8sx9_dsp58 #(
 // synthesis translate_off
 							zero ? '1 : 						
 // synthesis translate_on							
-							w[i][3*j +: LANES_OCCUPIED];
+							//w[i][3*j +: LANES_OCCUPIED];
+							w[SIMD*i+3*j +: LANES_OCCUPIED];
 						if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1];
 					end
 				end
@@ -181,7 +183,8 @@ module mvu_vvu_8sx9_dsp58 #(
 // synthesis translate_off					
 						zero ? '1 : 
 // synthesis translate_on					
-						PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
+						//PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
+						PAD_BITS_WEIGHT == 0 ? w[SIMD*i+3*j+k] : { {PAD_BITS_WEIGHT{w[SIMD*i+3*j+k][WEIGHT_WIDTH-1]}}, w[SIMD*i+3*j+k] };
 				end : genBin
 				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
 					assign b_in_i[i][j][8*k +: 8] = 8'b0;

From a4a18bb08cef96bb52c02096d54b573b421bcd12 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 13 Oct 2023 23:30:55 +0100
Subject: [PATCH 178/235] [mvu_vvu axi]: re-wire weights appropriately for VVU
 DSP58

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index da7e00cc55..f0f75c633a 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -45,7 +45,7 @@
  *****************************************************************************/
 
 module mvu_vvu_axi #(
-	bit IS_MVU, // string type causes error in Vivado
+	bit IS_MVU,
 	parameter COMPUTE_CORE,
 	int unsigned MW,
 	int unsigned MH,
@@ -64,8 +64,8 @@ module mvu_vvu_axi #(
 	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
 	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
 	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned SF = MW/SIMD,
-	localparam int unsigned NF = MH/PE,
+	localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE),
+	localparam int unsigned NF = IS_MVU ? MH/PE : 1,
 	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
 )
 (
@@ -91,11 +91,11 @@ module mvu_vvu_axi #(
 
 //-------------------- Parameter sanity checks --------------------\\
 	initial begin
-		if (MW % SIMD != 0) begin
+		if ((MW % SIMD != 0 && IS_MVU) || (MW % (SIMD*PE) != 0 && !IS_MVU)) begin
 			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
 			$finish;
 		end
-		if (MH % PE != 0) begin
+		if (MH % PE != 0 && IS_MVU) begin
 			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
 			$finish;
 		end
@@ -137,7 +137,7 @@ module mvu_vvu_axi #(
 	uwire avld;
 	uwire ardy;
 
-	replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay (
+	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay (
 	.clk, .rst,
 	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
 	.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
@@ -154,9 +154,11 @@ module mvu_vvu_axi #(
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
 	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
 	uwire mvauin_t amvau_i;
+	uwire mvauin_weight_t wmvau_i;
 
 	if (IS_MVU) begin : genMVUInput
 		assign  amvau_i = amvau;
+		assign  wmvau_i = s_axis_weights_tdata;
 	end : genMVUInput
 	else begin : genVVUInput
 		// The input stream will have the channels interleaved for VVU when PE>1
@@ -164,11 +166,14 @@ module mvu_vvu_axi #(
 		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
 		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
 		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i)
-		localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH;
+		localparam int num_of_elements = PE*SIMD;
 		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
 			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
 									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
 									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
+			assign  wmvau_i[i*WEIGHT_WIDTH +: WEIGHT_WIDTH] = (PE > 1) ? 
+									s_axis_weights_tdata[( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD + 1) * WEIGHT_WIDTH : ( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD ) * WEIGHT_WIDTH]
+									: s_axis_weights_tdata[i*WEIGHT_WIDTH +: WEIGHT_WIDTH];
 		end : genRewire
 	end : genVVUInput
 
@@ -178,7 +183,7 @@ module mvu_vvu_axi #(
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
+			.last(alast && avld), .zero(!istb), .w(wmvau_i), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
 	"mvu_4sx4u":

From cc0737bcd00cdd6df6e3d4ff38215ac5d9eb42e6 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 13 Oct 2023 23:31:35 +0100
Subject: [PATCH 179/235] [mvu_vvu axi wrapper]: fix to IS_MVU parameter

---
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
index 9c65dbc06e..01deb23840 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -32,7 +32,7 @@
  *****************************************************************************/
 
 module $MODULE_NAME_AXI_WRAPPER$ #(
-	parameter	IS_MVU = "$IS_MVU$",
+	parameter	IS_MVU = $IS_MVU$,
 	parameter	COMPUTE_CORE = "$COMPUTE_CORE$",
 	parameter	MW = $MW$,
 	parameter	MH = $MH$,

From c0eff0b819828a5e1d1ef80815f63be0042ce742 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 13 Oct 2023 23:32:47 +0100
Subject: [PATCH 180/235] [mvu_vvu tb]: WIP -- changes to self-checker and
 shape of input data

---
 finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv | 79 +++++++++++++++++-----------
 1 file changed, 49 insertions(+), 30 deletions(-)

diff --git a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
index 82c2e8e7b0..b46fc588c9 100644
--- a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
+++ b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
@@ -35,23 +35,23 @@ module mvu_vvu_axi_tb();
 
 //-------------------- Simulation parameters --------------------\\
 	// Matrix & parallelism config
-	localparam bit IS_MVU = 1;
+	localparam bit IS_MVU = 0;
 	localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58";
-	localparam int unsigned MW = 1500;
-	localparam int unsigned MH = 256;
-	localparam int unsigned SIMD = 60;
-	localparam int unsigned PE = 16;
-	localparam int unsigned SEGMENTLEN = 2.0;
+	localparam int unsigned MW = 36;
+	localparam int unsigned MH = 1;
+	localparam int unsigned SIMD = 3;
+	localparam int unsigned PE = 4;
+	localparam int unsigned SEGMENTLEN = 1.0;
 	localparam bit FORCE_BEHAVIORAL = 1;
 	localparam bit M_REG_LUT = 1;
 	// Bit-width config
-	localparam int unsigned ACTIVATION_WIDTH = 4;
-	localparam int unsigned WEIGHT_WIDTH = 4;
-	localparam int unsigned ACCU_WIDTH = 21; // == ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW)
-	localparam bit SIGNED_ACTIVATIONS = 0;
+	localparam int unsigned ACTIVATION_WIDTH = 8;
+	localparam int unsigned WEIGHT_WIDTH = 6;
+	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
+	localparam bit SIGNED_ACTIVATIONS = 1;
 	// Simulation constants
-	localparam int unsigned NF = MH/PE;
-	localparam int unsigned SF = MW/SIMD;
+	localparam int unsigned NF = IS_MVU ? MH/PE : 1;
+	localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE);
 	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
 	localparam int unsigned ACTIVATION_WIDTH_BA = ((IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH+7)/8*8;
 	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
@@ -72,7 +72,7 @@ module mvu_vvu_axi_tb();
 
 	// Generate activations
 	typedef logic [(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-	typedef activation_t activation_vector_t[(IS_MVU ? 1 : NF)*SF];
+	typedef activation_t activation_vector_t[SF];
 
 	function activation_vector_t init_ACTIVATIONS;
 		automatic activation_vector_t res;
@@ -93,14 +93,12 @@ module mvu_vvu_axi_tb();
 		activations.dat = 'X;
 		@(posedge clk iff ap_rst_n);
 
-		for (int j=0; j<(IS_MVU ? 1 : NF); j++) begin
-			for (int i=0; i<SF; i++) begin
-				activations.dat <= ACTIVATIONS[SF*j+i];
-				do begin
-					activations.vld <= $urandom()%7 >= 0;
-					@(posedge clk);
-				end while (!(activations.vld === 1 && activations.rdy === 1));
-			end
+		for (int i=0; i<SF; i++) begin
+			activations.dat <= ACTIVATIONS[i];
+			do begin
+				activations.vld <= $urandom()%7 >= 0;
+				@(posedge clk);
+			end while (!(activations.vld === 1 && activations.rdy === 1));
 		end
 
 		activations.vld <= 0;
@@ -143,7 +141,9 @@ module mvu_vvu_axi_tb();
 	end
 
 	// Function to compute golden output
-	// a: [(IS_MVU?1:NF)*SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	// a: [SF][(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	// a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	// a: [SF][PE*SIMD-1:0][ACTIVATION_WIDTH-1:0]
 	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
 	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
 	typedef output_t output_vector_t [NF];
@@ -156,14 +156,33 @@ module mvu_vvu_axi_tb();
 
 	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
 		automatic output_vector_t res = '{default: 0};
-		for (int j = 0; j<MH; j++) begin
-			for (int i = 0; i<MW; i++) begin
-				if (SIGNED_ACTIVATIONS)
-					res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
-											   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed(a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]) : $signed(a[j/PE*SF+i/SIMD][i%SIMD]) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-				else
-					res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
-											   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]}) : $signed({1'b0, a[j/PE+SF+i/SIMD][i%SIMD]}) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+		// for (int j = 0; j<MH; j++) begin
+		// 	for (int i = 0; i<MW; i++) begin
+		// 		if (SIGNED_ACTIVATIONS)
+		// 			res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
+		// 									   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed(a[i/SIMD/PE][i % (SIMD*PE)]) : $signed(a[i/SIMD/PE][(i)%(SIMD*PE)]) ) * $signed(w[0][i/SIMD/PE][i/PE][i%SIMD]);
+		// 		else
+		// 			res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
+		// 									   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[i/SIMD/PE][i % (SIMD*PE)]}) : $signed({1'b0, a[i/SIMD/PE][i%(SIMD*PE)]}) ) * $signed(w[0][i/SIMD][0][i%SIMD]);
+		// 	end
+		// end
+		// The input stream will have the channels interleaved for VVU when PE>1
+		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
+		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
+		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
+		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i)
+		for (int i = 0; i < NF; i++) begin
+			for (int j = 0; j < SF; j++) begin
+				for (int k = 0; k < PE; k++) begin
+					for (int l = 0; l < SIMD; l++) begin
+						if (SIGNED_ACTIVATIONS)
+							res[i][k] = IS_MVU ? $signed(res[i][k]) + $signed(a[j][l]) * $signed(w[i][j][k][l]) :
+												 $signed(res[i][k]) + $signed(a[j][k + l*PE]) * $signed(w[i][j][k][l]);
+						else
+							res[i][k] = IS_MVU ? $signed(res[i][k]) + $signed({1'b0, a[j][l]}) * $signed(w[i][j][k][l]) :
+												 $signed(res[i][k]) + $signed({1'b0, a[j][k + l*PE]}) * $signed(w[i][j][k][l]);
+					end
+				end
 			end
 		end
 		return res;

From 4591bb87baf83e1d5fdb08dbd4a79b866c6076b3 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 13 Oct 2023 23:35:54 +0100
Subject: [PATCH 181/235] [vvu_hls]: add flag to specify preferred backend

---
 src/finn/custom_op/fpgadataflow/vectorvectoractivation.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
index bd5bb75f1d..7ddf234544 100644
--- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
@@ -100,6 +100,8 @@ def get_nodeattr_types(self):
             # use xnor-popcount for binary weights/inputs, thus treating them
             # as bipolar
             "binaryXnorMode": ("i", False, 0, {0, 1}),
+            # Flag to specify whether RTL-based or HLS-based implementation is preferred
+            "preferred_backend": ("s", False, "rtl", {"hls", "rtl"})
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs

From ef1cbbe75f05efe72604de05f0e56d2758feecfb Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 13 Oct 2023 23:36:26 +0100
Subject: [PATCH 182/235] [vvu rtl]: added new custom-op VVU_RTL

---
 .../vectorvectoractivation_rtl.py             | 1225 +++++++++++++++++
 1 file changed, 1225 insertions(+)
 create mode 100644 src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py

diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py
new file mode 100644
index 0000000000..72976bc9a8
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py
@@ -0,0 +1,1225 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import os
+import textwrap
+import warnings
+from qonnx.core.datatype import DataType
+from qonnx.util.basic import (
+    calculate_matvec_accumulator_range,
+    interleave_matrix_outer_dim_from_partitions,
+    roundup_to_integer_multiple,
+)
+
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    numpy_to_hls_code,
+    pack_innermost_dim_as_hex_string,
+    rtlsim_output_to_npy,
+)
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
+
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
+
+
+class VectorVectorActivation_rtl(HLSCustomOp):
+    """Class that corresponds to finn-rtl Vector Vector Unit."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "PE": ("i", True, 0),
+            "SIMD": ("i", False, 1),
+            "Dim": ("ints", True, []),  # [H, W]
+            "Channels": ("i", True, 0),
+            "Kernel": ("ints", True, []),  # [H, W]
+            "resType": ("s", False, "auto", {"auto", "lut", "dsp"}),
+            # FINN DataTypes for inputs, weights, outputs
+            "inputDataType": ("s", True, ""),
+            "weightDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+            # FINN DataType for accumulator -- auto-computed and updated
+            "accDataType": ("s", False, "INT32"),
+            # memory mode for the layer weights
+            # const -- embedded weights, default, long compile/synth times
+            # decoupled -- streaming weights with weight streamer packaged inside IP
+            # external -- streaming weights with external streamer
+            "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}),
+            # (mem_mode = decoupled only) whether weights will be writable through
+            # an AXI-lite interface during runtime
+            # 1 for enabled, 0 for disabled.
+            # see finn-rtllib/memstream/doc/README for more about the memory
+            # address map used for writable weights
+            # IMPORTANT: After using AXI lite to either read or write the weights,
+            # always "flush" the accelerator by first passing a dummy input
+            # vector through the accelerator. This will get rid of any old
+            # weight data from the weight FIFOs.
+            "runtime_writeable_weights": ("i", False, 0, {0, 1}),
+            # FPGA resource type for memories in decoupled mode
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1
+            # see also https://www.xilinx.com/support/answers/38070.html
+            "ram_style": (
+                "s",
+                False,
+                "auto",
+                {"auto", "block", "distributed", "ultra"},
+            ),
+            # attribute to save top module name - not user configurable
+            "gen_top_module": ("s", False, "")
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def minimize_accumulator_width(self, model):
+        """Minimize the accumulator bit width according to the weight values,
+        input data types, and size of dot product.
+
+        Sets both "accDataType" and "outputDataType" to the minimized datatype
+        and returns it. With runtime-writeable weights the worst case over the
+        whole weight datatype range is assumed instead of the actual weights."""
+        weights = model.get_initializer(self.onnx_node.input[1])
+        k_h, k_w = self.get_nodeattr("Kernel")
+        fm = self.get_nodeattr("Channels")
+        # put weights into the shape expected by calculate_matvec_accumulator_range
+        weights = weights.reshape(fm, k_h * k_w).transpose()
+        idt = self.get_input_datatype()
+
+        (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt)
+        # if runtime-writeable weights, then the values of the weights can
+        # change and we need to use the worst-case values from the datatypes
+        if self.get_nodeattr("runtime_writeable_weights"):
+            wdt = self.get_weight_datatype()
+            lower_worst = wdt.min() * np.ones_like(weights)
+            lower_range = calculate_matvec_accumulator_range(lower_worst, idt)
+            upper_worst = wdt.max() * np.ones_like(weights)
+            upper_range = calculate_matvec_accumulator_range(upper_worst, idt)
+            acc_min = min(min(lower_range), min(upper_range))
+            acc_max = max(max(lower_range), max(upper_range))
+
+        # if the acc_range is always greater than 0, then acc_max <= 2^P - 1
+        if acc_min >= 0:
+            acc_bit_width = np.log2(acc_max + 1)
+            acc_bit_width = math.ceil(acc_bit_width)
+            adt = DataType[f"UINT{acc_bit_width}"]
+        # if the acc_range is signed, then acc_min >= -2^{P-1} and acc_max <=
+        # 2^{P - 1} - 1, which means 2^{P - 1} >= max(-acc_min, 1 + acc_max)
+        else:
+            _acc_max = max(-acc_min, 1 + acc_max)
+            acc_bit_width = np.log2(_acc_max) + 1
+            acc_bit_width = math.ceil(acc_bit_width)
+            adt = DataType[f"INT{acc_bit_width}"]
+
+        # if this is the last node in the graph, then ensure the datatype is
+        # divisible by 8 bits
+        if model.find_direct_successors(self.onnx_node) is None:
+            bw = roundup_to_integer_multiple(adt.bitwidth(), 8)
+            new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw))
+            adt = DataType[new_adt_name]
+        # for no-activation nodes, output dt = acc dt
+        self.set_nodeattr("outputDataType", adt.name)
+        self.set_nodeattr("accDataType", adt.name)
+
+        return DataType[self.get_nodeattr("accDataType")]
+
+    def minimize_weight_bit_width(self, model):
+        """Minimize the bit width based on the values of the weights"""
+        if not self.get_nodeattr("runtime_writeable_weights"):
+            weights = model.get_initializer(self.onnx_node.input[1])
+            w_min = weights.min()
+            w_max = weights.max()
+            if w_min < 0:
+                if abs(w_min) > w_max:
+                    wdt = DataType.get_smallest_possible(w_min)
+                else:
+                    wdt = DataType.get_smallest_possible(-w_max - 1)
+            else:
+                wdt = DataType.get_smallest_possible(w_max)
+            self.set_nodeattr("weightDataType", wdt.name)
+        return DataType[self.get_nodeattr("weightDataType")]
+
+    def calc_wmem(self):
+        """Calculates and returns WMEM."""
+        ch = self.get_nodeattr("Channels")
+        k_h, k_w = self.get_nodeattr("Kernel")
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        wmem = (k_h * k_w * ch // pe) // simd
+        return wmem
+
+    def calc_tmem(self):
+        """Calculates and returns TMEM."""
+        return 0
+
+    def make_shape_compatible_op(self, model):
+        oshape = self.get_normal_output_shape()
+        return super().make_const_shape_op(oshape)
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType", idt.name)
+        # set output datatype from property
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(node.output[0], odt)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input."""
+        if ind == 0:
+            return DataType[self.get_nodeattr("inputDataType")]
+        elif ind == 1:
+            return DataType[self.get_nodeattr("weightDataType")]
+        else:
+            raise Exception("Undefined input ind for this layer type")
+
+    def get_weight_datatype(self):
+        """Returns FINN DataType of weights."""
+        return DataType[self.get_nodeattr("weightDataType")]
+
+    def get_accumulator_datatype(self):
+        """Returns FINN DataType of accumulator"""
+        return DataType[self.get_nodeattr("accDataType")]
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output."""
+        return DataType[self.get_nodeattr("outputDataType")]
+
+    def get_instream_width(self, ind=0):
+        i_bits = self.get_input_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        pe = self.get_nodeattr("PE")
+        in_width = i_bits * simd * pe
+        return in_width
+
+    def get_outstream_width(self, ind=0):
+        o_bits = self.get_output_datatype().bitwidth()
+        out_width = o_bits * self.get_nodeattr("PE")
+        return out_width
+
+    def get_folded_input_shape(self, ind=0):
+        k_h, k_w = self.get_nodeattr("Kernel")
+        dim_h, dim_w = self.get_nodeattr("Dim")
+        ch = self.get_nodeattr("Channels")
+        simd = self.get_nodeattr("SIMD")
+        pe = self.get_nodeattr("PE")
+        kernel_2 = k_h * k_w
+        assert kernel_2 % simd == 0, "Requirement kernel (k_h * k_w) divisable by SIMD is violated."
+        sf = kernel_2 // simd
+        assert ch % pe == 0, "Requirement Channels divisable by PE is violated."
+        nf = ch // pe
+
+        if ind == 0:
+            # calculate shape of input 0
+            folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, simd * pe])
+        elif ind == 1 and self.get_nodeattr("mem_mode") == "external":
+            # calculate shape of input 1 (weights)
+            folded_input_shape = tuple([1, sf * nf, pe])
+        else:
+            raise Exception("Undefined input shape for requested input")
+
+        return folded_input_shape
+
+    def get_folded_output_shape(self, ind=0):
+        ch = self.get_nodeattr("Channels")
+        pe = self.get_nodeattr("PE")
+        nf = ch // pe
+        dim_h, dim_w = self.get_nodeattr("Dim")
+        folded_output_shape = tuple([1, dim_h, dim_w, nf, pe])
+        return folded_output_shape
+
+    def get_normal_input_shape(self, ind=0):
+        dim_h, dim_w = self.get_nodeattr("Dim")
+        ch = self.get_nodeattr("Channels")
+        k_h, k_w = self.get_nodeattr("Kernel")
+        normal_input_shape = tuple([1, dim_h, dim_w, k_h * k_w * ch])
+        return normal_input_shape
+
+    def get_normal_output_shape(self, ind=0):
+        ch = self.get_nodeattr("Channels")
+        dim_h, dim_w = self.get_nodeattr("Dim")
+        normal_output_shape = tuple([1, dim_h, dim_w, ch])
+        return normal_output_shape
+
+    def get_number_output_values(self):
+        nf = np.prod(self.get_folded_output_shape()[:-1])
+        return nf
+
+# TODO: fix exp_cycles estimations --> depends on fpga_part and clk
+    def get_exp_cycles(self):
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        ch = self.get_nodeattr("Channels")
+        dim_h, dim_w = self.get_nodeattr("Dim")
+        k_h, k_w = self.get_nodeattr("Kernel")
+        # currently FINN supports for vvau a batch size of 1
+        batch_size = 1
+        # since mmv != 1 is not supported yet, we set mmv for now to 1
+        mmv = 1
+        exp_cycles = ((ch * k_h * k_w) / pe / simd) * batch_size * (dim_h * dim_w) / mmv
+        return int(exp_cycles)
+
+    def get_template_param_values(self):
+        """Returns the template parameter values according to input, output and weight
+        data types."""
+        ret = dict()
+        inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
+        out_hls_str = self.get_output_datatype().get_hls_datatype_str()
+        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+        # NOTE(review): "binaryXnorMode" is not declared by this class itself; confirm base has it
+        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+        bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+        if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode):
+            raise Exception("True binary (non-bipolar) inputs not yet supported")
+        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+        # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"]
+        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+        # reinterpret inp/wt as bipolar if bin_xnor_mode is set
+        inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+        wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
+        # fill in TSrcI and TWeightI
+        # TODO check these with Giulio
+        # TODO handle non-bipolar binary inputs
+        if inp_is_bipolar and wt_is_bipolar:
+            ret["TSrcI"] = "Recast<XnorMul>"
+            ret["TWeightI"] = "Identity"
+        elif (not inp_is_bipolar) and wt_is_bipolar:
+            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+            ret["TWeightI"] = "Recast<Binary>"
+        elif inp_is_bipolar and (not wt_is_bipolar):
+            ret["TSrcI"] = "Recast<Binary>"
+            ret["TWeightI"] = "Identity"
+        elif (not inp_is_bipolar) and (not wt_is_bipolar):
+            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+            ret["TWeightI"] = "Identity"
+
+        # fill in TDstI
+        ret["TDstI"] = "Slice<%s>" % out_hls_str
+
+        return ret
+
+    def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        ch = self.get_nodeattr("Channels")
+        k_h, k_w = self.get_nodeattr("Kernel")
+        wmem = self.calc_wmem()
+        assert orig_weight_matrix.shape == (
+            ch,
+            1,
+            k_h,
+            k_w,
+        ), """Weights matrix doesn't
+        have expected shape (channels, 1, kernel_size, kernel_size)"""
+        ret = orig_weight_matrix
+        if self.get_weight_datatype() == DataType["BIPOLAR"]:
+            # convert bipolar to binary
+            ret = (ret + 1) / 2
+        ret = ret.reshape(ch, k_h * k_w)
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        ret = ret.reshape(1, pe, wmem, simd)
+        return ret
+
+    def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
+        """Convert the original numpy threshold matrix orig_thres_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure Channels % PE == 0
+        * for bipolar weights&inputs, ensure thresholds are nonnegative integers
+        * tile a single-row matrix to all channels, interleave rows between PEs
+        * reshape into (1, PE, TMEM, n_thres_steps) and return
+        """
+        ch = self.get_nodeattr("Channels")
+        pe = self.get_nodeattr("PE")
+        tmem = self.calc_tmem()
+        assert ch % pe == 0, "Requirement Channels divisable by PE is violated."
+        assert (
+            orig_thres_matrix.ndim == 2
+        ), """Threshold matrix dimension is
+        not as expected (2)."""
+        n_thres_steps = orig_thres_matrix.shape[1]
+        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+        # reinterpret inp/wt as bipolar if bin_xnor_mode is set
+        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+        bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+        inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+        wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
+        if inp_is_bipolar and wt_is_bipolar:
+            # ensure all thresholds are nonnegative
+            assert (orig_thres_matrix >= 0).all()
+            # ensure all thresholds are integer
+            assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all()
+        ret = orig_thres_matrix
+        # ensure one threshold row per channel, duplicating a single row if necessary
+        if ret.shape[0] == 1:
+            ret = np.tile(ret, (ch, 1))
+        assert ret.shape[0] == ch, "Channels of threshold matrix are not as expected (ch)"
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        assert (
+            ret.shape[0] == pe
+        ), """First dimension after distribution of the
+        rows between PEs is not as expected (pe)"""
+        assert (
+            ret.shape[1] == tmem
+        ), """Second dimension after distribution of the
+        rows between PEs is not as expected (tmem)"""
+        assert (
+            ret.shape[2] == n_thres_steps
+        ), """Third dimension after distribution of the
+        rows between PEs is not as expected (n_thres_steps)"""
+        return ret.reshape(1, pe, tmem, n_thres_steps)
+
+    def make_weight_file(self, weights, weight_file_mode, weight_file_name):
+        """Produce a file containing given weights in appropriate format for this
+        layer. This file can be used for either synthesis or run-time reconfig
+        of weights.
+
+        Arguments:
+
+        * weights : numpy array with weights to be put into the file
+        * weight_file_mode : one of {hls_header, decoupled_npy,
+          decoupled_verilog_dat, decoupled_runtime}; anything else raises Exception
+        * weight_file_name : filename for the weight file to be generated
+
+        """
+        # convert weights into hlslib-compatible format
+        weight_tensor = self.get_hls_compatible_weight_tensor(weights)
+        export_wdt = self.get_weight_datatype()
+        # we have converted bipolar weights to binary for export,
+        # so use it as such for weight generation
+        if self.get_weight_datatype() == DataType["BIPOLAR"]:
+            export_wdt = DataType["BINARY"]
+        if weight_file_mode == "hls_header":
+            weight_hls_code = numpy_to_hls_code(weight_tensor, export_wdt, "weights", True, True)
+            # write weights into C++ header file as dictated by finn-hlslib
+            f_weights = open(weight_file_name, "w")
+            if export_wdt.bitwidth() != 1:
+                f_weights.write(
+                    "const FixedPointWeights<{},{},{},{}> weights = ".format(
+                        self.get_nodeattr("SIMD"),
+                        export_wdt.get_hls_datatype_str(),
+                        self.get_nodeattr("PE"),
+                        self.calc_wmem(),
+                    )
+                )
+            else:
+                f_weights.write(
+                    "const BinaryWeights<{},{},{}> weights = ".format(
+                        self.get_nodeattr("SIMD"),
+                        self.get_nodeattr("PE"),
+                        self.calc_wmem(),
+                    )
+                )
+            f_weights.write(weight_hls_code)
+            f_weights.close()
+        elif "decoupled" in weight_file_mode:
+            # create a weight stream for various flavors of decoupled mode:
+            # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
+            weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
+            # reverse SIMD flip for saving weights in .npy
+            weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1)
+            # PE flip for saving weights in .dat
+            weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2)
+            # reshape weight tensor (simd_flipped and pe_flipped) to desired shape
+            pe = self.get_nodeattr("PE")
+            simd = self.get_nodeattr("SIMD")
+            # simd_flipped
+            weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape(1, -1, pe * simd)
+            weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy()
+            # pe_flipped
+            weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(1, -1, pe * simd)
+            weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy()
+            if weight_file_mode == "decoupled_npy":
+                # save weight stream into npy for cppsim
+                np.save(weight_file_name, weight_tensor_simd_flipped)
+            elif weight_file_mode == "decoupled_verilog_dat":
+                # convert weight values into hexstring
+                weight_width = self.get_weightstream_width()
+                # pad to nearest 4 bits to get hex strings
+                weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+                )
+                # write one hex string per line (the file itself is not zero-padded)
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                weight_stream = weight_stream.copy()
+                with open(weight_file_name, "w") as f:
+                    for val in weight_stream:
+                        f.write(val + "\n")
+            elif weight_file_mode == "decoupled_runtime":
+                # memstream axi-lite interface will map each mem line to
+                # one or multiple 32-bit words
+                weight_width = self.get_weightstream_width()
+                words_per_memwidth = 2 ** math.ceil(math.log2(weight_width / 32))
+                if words_per_memwidth < 1:
+                    words_per_memwidth = 1
+                weight_width_padded = words_per_memwidth * 32
+                # first, pack and ensure padding to 32 bits
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+                )
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                weight_stream = weight_stream.copy()
+                with open(weight_file_name, "w") as f:
+                    for val in weight_stream:
+                        # split into groups of 8 hex digits (= 32 bits)
+                        words_32b = textwrap.wrap(val, 8)
+                        words_32b.reverse()
+                        for word_32b in words_32b:
+                            f.write(word_32b + "\n")
+            else:
+                raise Exception("Unknown weight_file_mode")
+
+        else:
+            raise Exception("Unknown weight_file_mode")
+
+    def generate_params(self, model, path):
+        mem_mode = self.get_nodeattr("mem_mode")
+        code_gen_dir = path
+        # weights, if not external
+        weights = model.get_initializer(self.onnx_node.input[1])
+        if mem_mode == "const":
+            # save hlslib-compatible weights in params.h
+            weight_filename = "{}/params.h".format(code_gen_dir)
+            self.make_weight_file(weights, "hls_header", weight_filename)
+        elif mem_mode == "decoupled" or mem_mode == "external":
+            weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
+            # save decoupled weights for cppsim
+            self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
+            if mem_mode == "decoupled":
+                # also save weights as Verilog .dat file
+                # This file will be ignored when synthesizing UltraScale memory.
+                weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir)
+                self.make_weight_file(weights, "decoupled_verilog_dat", weight_filename_rtl)
+        else:
+            raise Exception(
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
+            )
+
+        # save thresholds in thresh.h
+        if len(self.onnx_node.input) > 2:
+            thresholds = model.get_initializer(self.onnx_node.input[2])
+            if thresholds is not None:
+                threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+                # NOTE(review): "binaryXnorMode"/"ActVal" are not declared by this class; confirm
+                inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+                wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+                # reinterpret inp/wt as bipolar if bin_xnor_mode is set
+                inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+                wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+                bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+                inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+                wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
+                # get computed threshold datatype from attribute
+                tdt = DataType[self.get_nodeattr("accDataType")]
+
+                assert np.vectorize(tdt.allowed)(
+                    threshold_tensor
+                ).all(), "Thresholds in %s can't be expressed with type %s" % (
+                    self.onnx_node.name,
+                    str(tdt),
+                )
+                thresholds_hls_code = numpy_to_hls_code(
+                    threshold_tensor, tdt, "thresholds", False, True
+                )
+                # write thresholds into thresh.h
+                f_thresh = open("{}/thresh.h".format(code_gen_dir), "w")
+                tdt_hls = tdt.get_hls_datatype_str()
+                # use binary to export bipolar activations
+                export_odt = self.get_output_datatype()
+                if self.get_output_datatype() == DataType["BIPOLAR"]:
+                    export_odt = DataType["BINARY"]
+                odt_hls = export_odt.get_hls_datatype_str()
+                f_thresh.write(
+                    "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \
+                    = ".format(
+                        self.calc_tmem(),
+                        self.get_nodeattr("PE"),
+                        threshold_tensor.shape[-1],
+                        tdt_hls,
+                        odt_hls,
+                        self.get_nodeattr("ActVal"),
+                        "comp::less_equal<%s, %s>" % (tdt_hls, tdt_hls),
+                    )
+                )
+                f_thresh.write(thresholds_hls_code)
+                f_thresh.close()
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        mem_mode = self.get_nodeattr("mem_mode")
+        node = self.onnx_node
+
+        # TODO ensure codegen dir exists
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        # create a npy file fore each input of the node (in_ind is input index)
+        in_ind = 0
+        for inputs in node.input:
+            # it is assumed that the first input of the node is the data input
+            # the second input are the weights
+            # the third input are the thresholds
+            if in_ind == 0:
+                assert (
+                    str(context[inputs].dtype) == "float32"
+                ), """Input datatype is
+                not float32 as expected."""
+                expected_inp_shape = self.get_folded_input_shape()
+                reshaped_input = context[inputs].reshape(expected_inp_shape)
+                if self.get_input_datatype() == DataType["BIPOLAR"]:
+                    # store bipolar activations as binary
+                    reshaped_input = (reshaped_input + 1) / 2
+                    export_idt = DataType["BINARY"]
+                else:
+                    export_idt = self.get_input_datatype()
+                # make copy before saving the array
+                reshaped_input = reshaped_input.copy()
+                np.save(
+                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                    reshaped_input,
+                )
+            elif in_ind > 2:
+                raise Exception("Unexpected input found for VectorVectorActivation")
+            in_ind += 1
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            # reinterpret binary output as bipolar where needed
+            if self.get_output_datatype() == DataType["BIPOLAR"]:
+                out = context[node.output[0]]
+                out = 2 * out - 1
+                context[node.output[0]] = out
+            assert (
+                context[node.output[0]].shape == self.get_normal_output_shape()
+            ), "cppsim did not produce expected output shape"
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+
+            if mem_mode == "external" or mem_mode == "decoupled":
+                wnbits = self.get_weightstream_width()
+                export_wdt = self.get_weight_datatype()
+                # we have converted bipolar weights to binary for export,
+                # so use it as such for weight generation
+                if self.get_weight_datatype() == DataType["BIPOLAR"]:
+                    export_wdt = DataType["BINARY"]
+                wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits)
+                dim_h, dim_w = self.get_nodeattr("Dim")
+                num_w_reps = dim_h * dim_w
+
+                io_dict = {
+                    "inputs": {"in0": inp, "weights": wei * num_w_reps},
+                    "outputs": {"out": []},
+                }
+                self.rtlsim_multi_io(sim, io_dict)
+                output = io_dict["outputs"]["out"]
+            else:
+                output = self.rtlsim(sim, inp)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits)
+
+            # load and reshape output
+            output = np.load(out_npy_path)
+            oshape = self.get_normal_output_shape()
+            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        """Produce the (System-)Verilog sources for this node.
+
+        Other ops normally emit C++ code and a tcl script at this step; here
+        the three arguments are handed unchanged to ``generate_hdl``.
+        """
+        self.generate_hdl(model=model, fpgapart=fpgapart, clk=clk)
+
+    def ipgen_singlenode_code(self):
+        """Normally builds the bash script for IP generation; this override
+        does nothing and returns ``None``."""
+
+    def code_generation_cppsim(self, model):
+        """Normally generates C++ code for simulation (cppsim); this override
+        ignores ``model``, does nothing and returns ``None``."""
+
+    def compile_singlenode_code(self):
+        """No-op override: performs no work and returns ``None``."""
+
+    def global_includes(self):
+        """No-op override: adds nothing to any code-generation dictionary and
+        returns ``None``."""
+
+    def defines(self, var):
+        """No-op override: ``var`` is ignored, nothing is generated and
+        ``None`` is returned."""
+
+    def read_npy_data(self):
+        """No-op override: generates nothing and returns ``None``."""
+
+    def strm_decl(self):
+        """No-op override: declares no streams and returns ``None``."""
+
+    def docompute(self):
+        """No-op override: emits no compute call and returns ``None``."""
+
+    def dataoutstrm(self):
+        """No-op override: generates nothing and returns ``None``."""
+
+    def save_as_npy(self):
+        """No-op override: generates nothing and returns ``None``."""
+
+    def blackboxfunction(self):
+        """No-op override: generates nothing and returns ``None``."""
+
+    def pragmas(self):
+        """No-op override: generates no pragmas and returns ``None``."""
+
+    def get_verilog_top_module_intf_names(self):
+        """Extend the inherited interface-name dict according to ``mem_mode``.
+
+        * ``external``: appends ``("weights_<hls_sname>", padded weight stream
+          width)`` to the ``s_axis`` list.
+        * ``decoupled``: sets ``axilite`` to ``["s_axilite"]``, but only when
+          ``runtime_writeable_weights`` equals 1.
+
+        Returns the dict obtained from the parent class, modified in place.
+        """
+        names = super().get_verilog_top_module_intf_names()
+        weight_mode = self.get_nodeattr("mem_mode")
+        suffix = self.hls_sname()
+        if weight_mode == "external":
+            weight_port = ("weights_" + suffix, self.get_weightstream_width_padded())
+            names["s_axis"].append(weight_port)
+        elif weight_mode == "decoupled" and self.get_nodeattr("runtime_writeable_weights") == 1:
+            # only expose axilite interface if attribute is set
+            names["axilite"] = ["s_axilite"]
+        return names
+
+    def code_generation_ipi(self):
+        """Return the list of Tcl commands that instantiate this node in a
+        Vivado block design.
+
+        ``external`` mem_mode: registers the RTL source files and creates a
+        module-reference cell named after the ONNX node.
+
+        ``decoupled`` mem_mode: creates a hierarchy named after the node that
+        holds the RTL cell plus a ``memstream`` weight streamer, wires clock,
+        reset and the data streams through the hierarchy and, when
+        ``runtime_writeable_weights`` is 1, exposes the streamer's AXI-lite
+        port. The command list ends with ``save_bd_design``.
+
+        Raises:
+            Exception: ``mem_mode`` is neither "decoupled" nor "external".
+            AssertionError: ``ram_style`` is "ultra" while the weights are not
+                runtime-writable.
+        """
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode not in ("decoupled", "external"):
+            raise Exception("Unrecognized mem_mode for VectorVectorActivation")
+
+        def add_rtl_sources(commands):
+            # generated wrapper first, then the RTL library files it relies on
+            gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+            lib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+            hdl_paths = [
+                os.path.join(gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v")
+            ]
+            for lib_file in (
+                "mvu_vvu_axi.sv",
+                "replay_buffer.sv",
+                "mvu_4sx4u.sv",
+                "mvu_vvu_8sx9_dsp58.sv",
+                "mvu_8sx8u_dsp48.sv",
+                "mvu_vvu_lut.sv",
+            ):
+                hdl_paths.append(lib_dir + lib_file)
+            commands.extend("add_files -norecurse %s" % (path) for path in hdl_paths)
+
+        cmd = []
+        if mem_mode == "external":
+            # instantiate the RTL block
+            add_rtl_sources(cmd)
+            cmd.append(
+                "create_bd_cell -type module -reference %s %s"
+                % (self.get_nodeattr("gen_top_module"), self.onnx_node.name)
+            )
+            return cmd
+
+        # ---- decoupled: RTL block + weight streamer inside one hierarchy ----
+        runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
+        if self.get_nodeattr("ram_style") == "ultra":
+            assert (
+                runtime_writable == 1
+            ), "Layer with URAM weights must have runtime_writeable_weights=1"
+        node_name = self.onnx_node.name
+        sname = self.hls_sname()
+        intf = self.get_verilog_top_module_intf_names()
+        clk_name = intf["clk"][0]
+        rst_name = intf["rst"][0]
+        dout_name = intf["m_axis"][0][0]
+        din_name = intf["s_axis"][0][0]
+
+        # create a hierarchy for this layer, with the same port names
+        axis_pin_fmt = (
+            "create_bd_intf_pin -mode %s "
+            "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
+        )
+        cmd.extend(
+            [
+                "create_bd_cell -type hier %s" % node_name,
+                "create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name),
+                "create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name),
+                axis_pin_fmt % ("Master", node_name, dout_name),
+                axis_pin_fmt % ("Slave", node_name, din_name),
+            ]
+        )
+
+        # instantiate the RTL block
+        add_rtl_sources(cmd)
+        cmd.append(
+            "create_bd_cell -type hier -reference %s /%s/%s"
+            % (self.get_nodeattr("gen_top_module"), node_name, node_name)
+        )
+
+        # instantiate a streamer and connect it to the HLS IP
+        strm_vlnv = "amd.com:finn:memstream:1.0"
+        strm_inst = node_name + "_wstrm"
+        cmd.append("create_bd_cell -type ip -vlnv %s /%s/%s" % (strm_vlnv, node_name, strm_inst))
+        streamer_cfg = (
+            self.calc_wmem(),
+            self.get_weightstream_width_padded(),
+            self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat",
+            self.get_nodeattr("ram_style"),
+            node_name,
+            strm_inst,
+        )
+        cmd.append(
+            "set_property -dict [list "
+            "CONFIG.DEPTH {%d} "
+            "CONFIG.WIDTH {%d} "
+            "CONFIG.INIT_FILE {%s} "
+            "CONFIG.RAM_STYLE {%s} "
+            "] [get_bd_cells /%s/%s]" % streamer_cfg
+        )
+        cmd.append(
+            "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] "
+            "[get_bd_intf_pins %s/%s/weights_%s]"
+            % (node_name, strm_inst, node_name, node_name, sname)
+        )
+
+        # reset/clock first to the streamer, then to the RTL cell
+        cmd.append(
+            "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]"
+            % (node_name, rst_name, node_name, strm_inst)
+        )
+        cmd.append(
+            "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]"
+            % (node_name, clk_name, node_name, strm_inst)
+        )
+        for pin in (rst_name, clk_name):
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+                % (node_name, pin, node_name, node_name, pin)
+            )
+        # data streams: hierarchy pin <-> same-named pin of the RTL cell
+        for stream in (din_name, dout_name):
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                "[get_bd_intf_pins %s/%s/%s]" % (node_name, stream, node_name, node_name, stream)
+            )
+
+        if runtime_writable:
+            # expose axi lite interface for writeable weights
+            axilite_name = intf["axilite"][0]
+            cmd.append(
+                "create_bd_intf_pin -mode Slave "
+                "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" % (node_name, axilite_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                "[get_bd_intf_pins %s/%s/%s]"
+                % (node_name, axilite_name, node_name, strm_inst, axilite_name)
+            )
+            # TODO calculate and pass in segment size here
+            cmd.append("assign_bd_address")
+        cmd.append("save_bd_design")
+        return cmd
+
+    def uram_estimation(self):
+        """Estimate the number of UltraRAM blocks used for weight storage.
+
+        Returns 0 for ``const`` and ``external`` mem_mode, and for
+        ``decoupled`` mode with a ``ram_style`` other than "ultra". Otherwise
+        the weight memory (``SIMD * weight bits * PE`` wide, ``calc_wmem()``
+        deep) is tiled with units of 72 bits width and 4096 entries depth.
+        """
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        depth = self.calc_wmem()
+        width = simd * weight_bits * pe
+        mem_mode = self.get_nodeattr("mem_mode")
+        ram_style = self.get_nodeattr("ram_style")
+
+        uses_uram = mem_mode not in ("const", "external") and (
+            mem_mode != "decoupled" or ram_style == "ultra"
+        )
+        if not uses_uram:
+            return 0
+        return math.ceil(width / 72) * math.ceil(depth / 4096)
+
+    def bram_estimation(self):
+        """Estimate the number of BRAM blocks used for weight storage.
+
+        The weight memory is ``SIMD * weight bits * PE`` wide and
+        ``calc_wmem()`` deep. Zero is returned when the weights do not live in
+        BRAM: ``external`` mode, ``decoupled`` mode with "distributed" or
+        "ultra" ``ram_style``, or a memory of at most 128 entries with
+        ``ram_style`` "auto" or ``mem_mode`` "const".
+        """
+        # TODO add in/out FIFO contributions
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        omega = self.calc_wmem()
+        mem_width = simd * weight_bits * pe
+        mmode = self.get_nodeattr("mem_mode")
+        mstyle = self.get_nodeattr("ram_style")
+
+        # assuming memories up to 128 deep get implemented in LUTs
+        shallow = omega <= 128
+        weights_outside_bram = (
+            mmode == "external"
+            or (mmode == "decoupled" and mstyle in ["distributed", "ultra"])
+            or (shallow and (mstyle == "auto" or mmode == "const"))
+        )
+        if weights_outside_bram:
+            return 0
+
+        # assuming SDP mode RAMB18s (see UG573 Table 1-10)
+        # since this is HLS memory, not using the full width of a BRAM
+        if mem_width == 1:
+            unit_depth, unit_width = 16384, 1
+        elif mem_width == 2:
+            unit_depth, unit_width = 8192, 2
+        elif mem_width <= 4:
+            unit_depth, unit_width = 4096, 4
+        elif mem_width <= 9:
+            unit_depth, unit_width = 2048, 8
+        elif mem_width <= 18 or omega > 512:
+            unit_depth, unit_width = 1024, 16
+        else:
+            unit_depth, unit_width = 512, 32
+        return math.ceil(omega / unit_depth) * math.ceil(mem_width / unit_width)
+
+    def bram_efficiency_estimation(self):
+        """Return the ratio of weight bits to estimated BRAM capacity.
+
+        Weight bits are counted as ``weight bitwidth * PE * calc_wmem()`` and
+        each estimated BRAM contributes ``36 * 512`` bits of capacity. When
+        ``bram_estimation()`` reports zero blocks the result is 1.
+        """
+        pe = self.get_nodeattr("PE")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        depth = self.calc_wmem()
+        n_brams = self.bram_estimation()
+        if n_brams == 0:
+            return 1
+        stored_bits = weight_bits * pe * depth
+        return stored_bits / (n_brams * 36 * 512)
+
+# TODO: fix estimations
+    def lut_estimation(self):
+        """Calculates resource estimations for LUTs based on:
+        - FINN-R: An End-to-End Deep-Learning Framework for Fast
+        Exploration of Quantized Neural Networks
+        - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
+        Y. Umuroglu, M. Leeser and K. Vissers
+        - 12. Sep 2018
+
+        The estimate is ``c0 + c1 * PE * (per-PE LUTs) + c2`` where the per-PE
+        term sums multiplier, adder-tree, accumulator and (unless
+        ``noActivation`` is set) threshold storage and comparator LUTs, and
+        ``c2`` covers weights kept in LUT memory. The result is truncated to
+        an ``int``.
+        """
+        # TODO add in/out FIFO contributions
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        w_bits = self.get_weight_datatype().bitwidth()
+        in_dtype = self.get_input_datatype()
+        a_bits = in_dtype.bitwidth()
+        operand_bits = w_bits + a_bits
+
+        # parameters from experiments in paper mentioned above
+        base_luts = 300
+        scale = 1.1
+        weight_mem_luts = 0
+        mem_mode = self.get_nodeattr("mem_mode")
+        ram_style = self.get_nodeattr("ram_style")
+        lut_weight_mem = (mem_mode == "decoupled" and ram_style == "distributed") or (
+            mem_mode == "const" and self.calc_wmem() <= 128
+        )
+        if lut_weight_mem:
+            weight_mem_luts = (pe * simd * w_bits) * math.ceil(self.calc_wmem() / 64)
+
+        # multiplication (free when mapped onto DSPs)
+        if self.get_nodeattr("resType") == "dsp":
+            mult_luts = 0
+        else:
+            mult_luts = simd * (2 * math.ceil((operand_bits) / 6) - 1) * (operand_bits)
+        # adder tree
+        addertree_luts = (operand_bits) * (2 * simd - 1)
+
+        # accumulator
+        # if accDataType is not set, then it will default to INT32, which would
+        # be a large overestimate in most (if not all) cases. In this scenario,
+        # we would use the minimum accumulator as determined by the data types
+        # bound, derived in https://arxiv.org/abs/2301.13376
+        acc_dtype = self.get_accumulator_datatype()
+        k_h, k_w = self.get_nodeattr("Kernel")
+        alpha = math.log(k_h * k_w, 2) + w_bits + a_bits - 1 - int(in_dtype.signed())
+        acc_bits = min(
+            acc_dtype.bitwidth(),
+            np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1),
+        )
+        acc_luts = acc_bits
+
+        # thresholds and threshold comparators
+        # TODO - add 'ram_style_threshold' node attribute
+        thr_luts = 0
+        comp_luts = 0
+        if self.get_nodeattr("noActivation") == 0:
+            out_bits = self.get_output_datatype().bitwidth()
+            thr_luts = (2**out_bits - 1) * acc_bits * self.calc_tmem() / 64
+            comp_luts = (2**out_bits - 1) * acc_bits
+
+        per_pe_luts = mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts
+        return int(base_luts + scale * (pe * (per_pe_luts)) + weight_mem_luts)
+
+# TODO: fix estimations
+    def dsp_estimation(self):
+        """Estimate the DSP count: ``PE * ceil((weight bits + input bits) / 48)``
+        when ``resType`` is "dsp", otherwise 0."""
+        pe = self.get_nodeattr("PE")
+        res_type = self.get_nodeattr("resType")
+        operand_bits = (
+            self.get_weight_datatype().bitwidth() + self.get_input_datatype().bitwidth()
+        )
+        if res_type != "dsp":
+            return 0
+        # TODO: more accurate modelling
+        return int(pe * np.ceil(operand_bits / 48))
+
+    def get_weightstream_width(self):
+        """Returns the weight stream width in bits (``SIMD * PE * weight
+        bitwidth``) for ``decoupled`` and ``external`` mem_mode, and 0 for any
+        other mem_mode."""
+        if self.get_nodeattr("mem_mode") not in ("decoupled", "external"):
+            return 0
+        lanes = self.get_nodeattr("SIMD") * self.get_nodeattr("PE")
+        return lanes * self.get_weight_datatype().bitwidth()
+
+    def get_weightstream_width_padded(self):
+        """Returns weight stream width padded to a multiple of 8. This is required
+        by the AXI Stream spec. The unpadded value comes from
+        ``get_weightstream_width``."""
+        return roundup_to_integer_multiple(self.get_weightstream_width(), 8)
+
+    def get_op_and_param_counts(self):
+        """Return a dict of MAC-operation and parameter counts for this node.
+
+        Keys:
+          * ``op_mac_<a>bx<b>b``: ``Kernel`` area * ``Channels`` * ``Dim`` area,
+            where ``<a> <= <b>`` are the input and weight bitwidths.
+          * ``param_weight_<w>b``: ``Kernel`` area * ``Channels``.
+          * ``param_threshold_<t>b`` (only when ``noActivation`` is 0):
+            ``Channels``, with ``<t>`` the bitwidth of ``accDataType``.
+        """
+        kernel_h, kernel_w = self.get_nodeattr("Kernel")
+        channels = self.get_nodeattr("Channels")
+        dim_h, dim_w = self.get_nodeattr("Dim")
+        w_bits = self.get_weight_datatype().bitwidth()
+        a_bits = self.get_input_datatype().bitwidth()
+        repetitions = int(dim_h * dim_w)
+        # canonicalize op type: operands are ordered by bitwidth (narrower
+        # first) s.t. e.g. mac_8bx4b and mac_4bx8b don't appear as two
+        # different op types
+        narrow, wide = sorted((a_bits, w_bits))
+        counts = {
+            "op_mac_%dbx%db" % (narrow, wide): kernel_h * kernel_w * channels * repetitions,
+            "param_weight_%db" % (w_bits): kernel_h * kernel_w * channels,
+        }
+        if self.get_nodeattr("noActivation") == 0:
+            thres_bits = DataType[self.get_nodeattr("accDataType")].bitwidth()
+            counts["param_threshold_%db" % (thres_bits)] = channels
+        return counts
+
+    def derive_characteristic_fxns(self, period):
+        """Build all-zero rtlsim stimuli and delegate to the parent class.
+
+        ``in0`` receives one zero word per folded input word (the product of
+        all folded-input dimensions except the last). For ``decoupled`` and
+        ``external`` mem_mode a zero-filled ``weights`` stream is added as
+        well, holding ``calc_wmem()`` words for every repetition.
+
+        Args:
+            period: forwarded unchanged to the parent implementation.
+        """
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out": []},
+        }
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode in ["decoupled", "external"]:
+            n_weight_inps = self.calc_wmem()
+            # The weight stream is replayed dim_h * dim_w times, exactly as the
+            # rtlsim path of execute_node does. This op describes its spatial
+            # extent through "Dim"; it has no "numInputVectors" attribute.
+            num_w_reps = np.prod(self.get_nodeattr("Dim"))
+            io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)]
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
+
+    def generate_hdl(self, model, fpgapart, clk):
+        """Instantiate the Verilog wrapper template for this node.
+
+        Steps: generate the parameters into ``code_gen_dir_ipgen``, obtain the
+        template path and placeholder dictionary from
+        ``prepare_codegen_default``, substitute every placeholder, and write
+        two files next to the parameters: ``<top>_wrapper.v`` with
+        ``$FORCE_BEHAVIORAL$`` replaced by 0 and ``<top>_wrapper_sim.v`` with
+        it replaced by 1. Finally the ``gen_top_module``, ``ipgen_path`` and
+        ``ip_path`` node attributes are updated.
+        """
+        # Generate params as part of IP preparation
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        self.generate_params(model, code_gen_dir)
+
+        template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk)
+        # add general parameters to dictionary
+        code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()]
+        # save top module name so we can refer to it after this node has been renamed
+        # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
+        self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
+
+        # apply code generation to template
+        with open(template_path, "r") as template_file:
+            wrapper_text = template_file.read()
+        for placeholder, lines in code_gen_dict.items():
+            # transform list into long string separated by '\n'
+            wrapper_text = wrapper_text.replace(placeholder, "\n".join(lines))
+
+        # synthesis variant first, then the behavioral variant used for simulation
+        for suffix, force_behavioral in (("_wrapper.v", 0), ("_wrapper_sim.v", 1)):
+            out_path = os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + suffix)
+            with open(out_path, "w") as out_file:
+                out_file.write(wrapper_text.replace("$FORCE_BEHAVIORAL$", str(force_behavioral)))
+
+        # set ipgen_path and ip_path so that HLS-Synth transformation
+        # and stich_ip transformation do not complain
+        for attr_name in ("ipgen_path", "ip_path"):
+            self.set_nodeattr(attr_name, code_gen_dir)
+
+    def _resolve_segment_len(self, clk):
+        """Return the DSP58 chain segment length for target clock period ``clk``.
+
+        Pipeline registers are inserted in the DSP58 chain to meet the target
+        clock frequency:
+          * 0.741 ns seems the worst-case delay through the first DSP
+          * 0.605 ns seems to be (on average) the delay for all subsequent DSPs
+
+        The length is capped at ``ceil(SIMD / 3)`` and is never below 1.
+
+        Returns:
+            int: always a plain Python ``int``, so that ``str()`` of the result
+            is an integer literal (previously a ``numpy.float64`` such as
+            ``3.0`` was returned whenever the value was not clamped to 1).
+        """
+        critical_path_dsps = np.floor((clk - 0.741) / 0.605)
+        max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3)
+        dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len
+        return int(max(1, dsp_chain_len))
+
+    def _resolve_impl_style(self, fpgapart):
+        """Pick the RTL compute core name for this node.
+
+        ``resType`` "lut" selects ``mvu_vvu_lut``. Any other ``resType`` must
+        be "dsp" on a part whose name starts with one of the listed Versal
+        prefixes, which selects ``mvu_vvu_8sx9_dsp58``; otherwise an
+        ``AssertionError`` is raised.
+        """
+        res_type = self.get_nodeattr("resType")
+        if res_type == "lut":
+            return "mvu_vvu_lut"
+
+        versal_prefixes = ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
+        is_versal = fpgapart[0:4] in versal_prefixes or fpgapart[0:5] == "xqrvc"
+        assert (
+            res_type == "dsp" and is_versal
+        ), "DSP-based (RTL) VVU currently only supported on Versal (DSP58) devices"
+        return "mvu_vvu_8sx9_dsp58"
+                
+
+    def prepare_codegen_default(self, fpgapart, clk):
+        """Return ``(template_path, code_gen_dict)`` for the AXI wrapper.
+
+        ``template_path`` points at ``mvu_vvu_axi_wrapper.v`` below
+        ``$FINN_ROOT/finn-rtllib/mvu``. ``code_gen_dict`` maps each template
+        placeholder to a one-element list holding its value as a string;
+        ``$MW$`` is ``prod(Kernel) * Channels`` and ``$MH$`` is fixed to 1.
+        """
+        template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v"
+
+        def entry(value):
+            # every placeholder value is stored as a single-line list
+            return [str(value)]
+
+        code_gen_dict = {
+            "$IS_MVU$": entry(0),
+            "$COMPUTE_CORE$": [self._resolve_impl_style(fpgapart)],
+            "$MW$": entry(int(np.prod(self.get_nodeattr("Kernel")) * self.get_nodeattr("Channels"))),
+            "$MH$": entry(1),
+            "$PE$": entry(self.get_nodeattr("PE")),
+            "$SIMD$": entry(self.get_nodeattr("SIMD")),
+            "$ACTIVATION_WIDTH$": entry(self.get_input_datatype(0).bitwidth()),
+            "$WEIGHT_WIDTH$": entry(self.get_input_datatype(1).bitwidth()),
+            "$ACCU_WIDTH$": entry(self.get_output_datatype().bitwidth()),
+            # activations are signed iff the input datatype can go negative
+            "$SIGNED_ACTIVATIONS$": entry(1 if (self.get_input_datatype(0).min() < 0) else 0),
+            "$SEGMENTLEN$": entry(self._resolve_segment_len(clk)),
+        }
+        # $FORCE_BEHAVIORAL$ is deliberately not set here
+
+        return template_path, code_gen_dict
+
+    def prepare_rtlsim(self):
+        """Build a Verilator emulation library for this node's generated RTL.
+
+        The ``<gen_top_module>_wrapper_sim.v`` file is compiled with the code
+        generation directory and ``$FINN_ROOT/finn-rtllib/mvu`` as Verilog
+        search paths. The path of the resulting library is stored in the
+        ``rtlsim_so`` attribute and the PyVerilator wrapper is returned.
+
+        Raises:
+            ImportError: PyVerilator is not available.
+        """
+        if PyVerilator is None:
+            raise ImportError("Installation of PyVerilator is required.")
+
+        # Path to (System-)Verilog files used by top-module & path to top-module
+        search_paths = [
+            self.get_nodeattr("code_gen_dir_ipgen"),
+            os.environ["FINN_ROOT"] + "/finn-rtllib/mvu",
+        ]
+        sources = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"]
+
+        # build the Verilator emu library
+        sim = PyVerilator.build(
+            sources,
+            build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
+            verilog_path=search_paths,
+            trace_depth=get_rtlsim_trace_depth(),
+            top_module_name=self.get_verilog_top_module_name(),
+        )
+        # save generated lib filename in attribute
+        self.set_nodeattr("rtlsim_so", sim.lib._name)
+        return sim
\ No newline at end of file

From 62cec5056eac6f4a28c0a4ea87051a6c0123dd41 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 13 Oct 2023 23:37:08 +0100
Subject: [PATCH 183/235] [dwc pw]: added new custom-op SDWC operating on SWG
 with parallel window mode enabled

---
 ...datawidthconverter_parallelwindow_batch.py | 390 ++++++++++++++++++
 1 file changed, 390 insertions(+)
 create mode 100644 src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_parallelwindow_batch.py

diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_parallelwindow_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_parallelwindow_batch.py
new file mode 100644
index 0000000000..6a72f17555
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_parallelwindow_batch.py
@@ -0,0 +1,390 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import os
+import warnings
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+# does not do anything at the ONNX node-by-node level, and input-output
+# tensor shapes are the same. performs data width conversion at the rtlsim level
+
+
+class StreamingDataWidthConverter_ParallelWindow_Batch(HLSCustomOp):
+    """Class that corresponds to finn-hlslib StreamingDataWidthConverter_ParallelWindow_Batch
+    function. To be inserted between an RTL-SWG with parallel window mode enabled and a 
+    VVU."""
+
+    def get_nodeattr_types(self):
+        """Return this op's attribute specification merged with the parent's.
+
+        Entries supplied by the parent class replace same-named entries
+        declared here.
+        """
+        attr_spec = dict(
+            # shape of input/output tensors
+            shape=("ints", True, []),
+            # bit width of input and output streams
+            inWidth=("i", True, 0),
+            outWidth=("i", True, 0),
+            # FINN DataTypes for inputs/outputs
+            dataType=("s", True, ""),
+            SIMD=("i", True, 0),
+            PE=("i", True, 0),
+            Channels=("i", True, 0),
+            Kernel=("ints", True, []),
+        )
+        attr_spec.update(super().get_nodeattr_types())
+        return attr_spec
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input, looked up from the ``dataType``
+        attribute; ``ind`` is not used."""
+        dtype_name = self.get_nodeattr("dataType")
+        return DataType[dtype_name]
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output, looked up from the ``dataType``
+        attribute (same attribute as the input); ``ind`` is not used."""
+        dtype_name = self.get_nodeattr("dataType")
+        return DataType[dtype_name]
+
+    def get_normal_input_shape(self, ind=0):
+        """Return the ``shape`` attribute as the input shape; ``ind`` is not used."""
+        return self.get_nodeattr("shape")
+
+    def get_normal_output_shape(self, ind=0):
+        """Return the ``shape`` attribute as the output shape (identical to the
+        input shape); ``ind`` is not used."""
+        return self.get_nodeattr("shape")
+
+    def get_folded_input_shape(self, ind=0):
+        """Return the input shape with its last axis folded to the stream width.
+
+        With ``ielems = inWidth // element bitwidth``, the last dimension
+        ``C`` of the normal input shape is replaced by ``(C // ielems,
+        ielems)``.
+
+        Returns:
+            tuple of ints.
+
+        Raises:
+            AssertionError: ``inWidth`` is not a multiple of the element bitwidth.
+            ValueError: the folded shape does not hold the same number of
+                elements as the normal shape (raised by the NumPy reshape).
+        """
+        iwidth = self.get_nodeattr("inWidth")
+        ishape = self.get_normal_input_shape()
+        # Uninitialised placeholder: only its shape matters. np.empty avoids
+        # drawing random numbers and leaves the global NumPy RNG state alone.
+        dummy_t = np.empty(ishape)
+        ibits = self.get_input_datatype().bitwidth()
+        assert (
+            iwidth % ibits == 0
+        ), """DWC input width must be divisible by
+        input element bitwidth"""
+        ielems = int(iwidth // ibits)
+        ichannels = ishape[-1]
+        new_shape = list(ishape[:-1]) + [int(ichannels // ielems), ielems]
+        # reshape doubles as a consistency check on the element count
+        return dummy_t.reshape(new_shape).shape
+
+    def get_folded_output_shape(self, ind=0):
+        """Return the output shape with its last axis folded to the stream width.
+
+        With ``oelems = outWidth // element bitwidth``, the last dimension
+        ``C`` of the normal output shape is replaced by ``(C // oelems,
+        oelems)``.
+
+        Returns:
+            tuple of ints.
+
+        Raises:
+            AssertionError: ``outWidth`` is not a multiple of the element bitwidth.
+            ValueError: the folded shape does not hold the same number of
+                elements as the normal shape (raised by the NumPy reshape).
+        """
+        owidth = self.get_nodeattr("outWidth")
+        oshape = self.get_normal_output_shape()
+        # Uninitialised placeholder: only its shape matters. np.empty avoids
+        # drawing random numbers and leaves the global NumPy RNG state alone.
+        dummy_t = np.empty(oshape)
+        obits = self.get_output_datatype().bitwidth()
+        assert (
+            owidth % obits == 0
+        ), """DWC output width must be divisible by
+        output element bitwidth"""
+        oelems = int(owidth // obits)
+        ochannels = oshape[-1]
+        new_shape = list(oshape[:-1]) + [int(ochannels // oelems), oelems]
+        # reshape doubles as a consistency check on the element count
+        return dummy_t.reshape(new_shape).shape
+
+    def get_number_output_values(self):
+        """Return the product of all folded-output dimensions except the last."""
+        *outer_dims, _ = self.get_folded_output_shape()
+        return np.prod(tuple(outer_dims))
+
+    def get_instream_width(self, ind=0):
+        """Return the ``inWidth`` attribute; ``ind`` is not used."""
+        return self.get_nodeattr("inWidth")
+
+    def get_outstream_width(self, ind=0):
+        """Return the ``outWidth`` attribute; ``ind`` is not used."""
+        return self.get_nodeattr("outWidth")
+
+    def make_shape_compatible_op(self, model):
+        """Check the model's input tensor shape against the ``shape`` attribute
+        and return the parent's constant-shape op for the output shape.
+
+        Raises:
+            AssertionError: the shape recorded in ``model`` for the first
+                input differs from the expected input shape.
+        """
+        expected = tuple(self.get_normal_input_shape())
+        out_shape = self.get_normal_output_shape()
+        actual = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert actual == expected, "Unexpect input shape for StreamingDWC."
+        return super().make_const_shape_op(out_shape)
+
+    def infer_node_datatype(self, model):
+        """Adopt the datatype of the first input tensor for this node.
+
+        A warning is issued when that datatype differs from the one currently
+        stored in ``dataType``. The attribute is then set to the tensor's
+        datatype name and the same datatype is assigned to the output tensor.
+        """
+        node = self.onnx_node
+        incoming = model.get_tensor_datatype(node.input[0])
+        stored = self.get_input_datatype()
+        if incoming != stored:
+            warnings.warn(
+                "inputDataType changing for %s: %s -> %s "
+                % (node.name, str(stored), str(incoming))
+            )
+        self.set_nodeattr("dataType", incoming.name)
+        # data type stays the same
+        model.set_tensor_datatype(node.output[0], incoming)
+
+    def verify_node(self):
+        """Return two human-readable check results for this node: whether the
+        ``backend`` attribute equals "fpgadataflow", and whether the node has
+        exactly one input."""
+        # verify that "backend" is set to "fpgadataflow"
+        backend_ok = self.get_nodeattr("backend") == "fpgadataflow"
+        backend_msg = (
+            "Attribute backend is set correctly"
+            if backend_ok
+            else 'Attribute backend should be set to "fpgadataflow"'
+        )
+
+        # verify the number of inputs
+        single_input = len(self.onnx_node.input) == 1
+        input_msg = (
+            "The number of inputs is correct"
+            if single_input
+            else """StreamingDWC needs 1 data input"""
+        )
+
+        return [backend_msg, input_msg]
+
+    def global_includes(self):
+        """Set the ``$GLOBALS$`` code-generation entry to the single include
+        line for ``streamtools.h``."""
+        include_lines = ['#include "streamtools.h"']
+        self.code_gen_dict["$GLOBALS$"] = include_lines
+
+    def defines(self, var):
+        """Fill the ``$DEFINES$`` code-generation entry with ``#define`` lines.
+
+        ``NF`` is ``int(Channels / PE)``, ``SF`` is ``int(prod(Kernel) /
+        SIMD)``, ``NumInWords`` is the product of all folded-input dimensions
+        except the last, and ``numReps`` is fixed to 1. ``var`` is not used.
+        """
+        num_in_words = int(np.prod(self.get_folded_input_shape()[:-1]))
+        in_width = self.get_nodeattr("inWidth")
+        out_width = self.get_nodeattr("outWidth")
+        simd = self.get_nodeattr("SIMD")
+        pe = self.get_nodeattr("PE")
+        channels = self.get_nodeattr("Channels")
+        neuron_fold = int(channels / pe)
+        synapse_fold = int(np.prod(self.get_nodeattr("Kernel")) / simd)
+        act_width = DataType[self.get_nodeattr("dataType")].bitwidth()
+
+        # (format, value) pairs; note that only the last format has no
+        # trailing blank
+        macros = [
+            ("#define InWidth %d ", in_width),
+            ("#define OutWidth %d ", out_width),
+            ("#define SIMD %d ", simd),
+            ("#define PE %d ", pe),
+            ("#define Channels %d ", channels),
+            ("#define NF %d ", neuron_fold),
+            ("#define SF %d ", synapse_fold),
+            ("#define ActWidth %d ", act_width),
+            ("#define NumInWords %d ", num_in_words),
+            ("#define numReps %d", 1),
+        ]
+        self.code_gen_dict["$DEFINES$"] = [fmt % value for fmt, value in macros]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                npy_in,
+                self.hls_sname(),
+            )
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0_{} ("in0_{}");'.format(
+                self.get_instream_width(), self.hls_sname(), self.hls_sname()
+            )
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
+                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
+            )
+        )
+
+    def docompute(self):
+        # TODO continue with fxns below, they are copy-pasted
+        op = "StreamingDataWidthConverter_ParallelWindow_Batch"
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            "%s<InWidth, OutWidth, SIMD, PE, Channels, NF, SF, ActWidth, NumInWords>(in0_%s, out_%s, numReps);"
+            % (op, self.hls_sname(), self.hls_sname())
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                self.hls_sname(),
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        in_packed_bits = self.get_instream_width()
+        in_packed_hls_type = "ap_uint<%d>" % in_packed_bits
+        out_packed_bits = self.get_outstream_width()
+        out_packed_hls_type = "ap_uint<%d>" % out_packed_bits
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)"
+            % (
+                self.onnx_node.name,
+                in_packed_hls_type,
+                self.hls_sname(),
+                out_packed_hls_type,
+                self.hls_sname(),
+            )
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = [
+            "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
+        ]
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
+        )
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
+
+    def execute_node(self, context, graph):  # reads context[node.input[0]], writes context[node.output[0]]
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_shape = self.get_normal_input_shape()
+        folded_ishape = self.get_folded_input_shape()
+
+        # TODO ensure codegen dir exists; only "cppsim" and "rtlsim" are accepted, any other mode raises here
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert inp.shape == tuple(exp_shape), "Input shape does not match expected shape."
+
+        if self.get_input_datatype() == DataType["BIPOLAR"]:
+            # store bipolar activations as binary: maps {-1, +1} to {0, 1}
+            inp = (inp + 1) / 2
+            export_idt = DataType["BINARY"]
+        else:
+            export_idt = self.get_input_datatype()
+        # reshape input into folded shape
+        reshaped_input = inp.reshape(folded_ishape)
+        # make copy before saving array
+        reshaped_input = reshaped_input.copy()
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)  # only the rtlsim branch reads this back
+
+        if mode == "cppsim":  # NOTE: no simulation is executed in this branch; the input is passed through
+            output = inp
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_shape)
+            context[node.output[0]] = output
+
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = export_idt  # output is unpacked with the same (binary-mapped) datatype used for the input
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(exp_shape)
+            context[node.output[0]] = output
+        else:  # not reachable: any other mode already raised in the mode check at the top of this method
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to "rtlsim" """.format(
+                    mode
+                )
+            )
+        # binary -> bipolar if needed: maps {0, 1} back to {-1, +1}
+        if self.get_output_datatype() == DataType["BIPOLAR"]:
+            out = context[node.output[0]]
+            out = 2 * out - 1
+            context[node.output[0]] = out
+        assert context[node.output[0]].shape == tuple(
+            exp_shape
+        ), """Output
+        shape doesn't match expected shape, should be same as input shape"""
+
+    def code_generation_ipi(self):
+        return super().code_generation_ipi()
+
+    def lut_estimation(self):
+        """Calculates resource estimations for LUTs"""
+        return 0
+
+    def prepare_rtlsim(self):
+        super().prepare_rtlsim()
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        super().code_generation_ipgen(model, fpgapart, clk)
+
+    def ipgen_singlenode_code(self):
+        super().ipgen_singlenode_code()

From 511f8353d2a91c66e33761ae3c83cb1e43608988 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 13 Oct 2023 23:38:12 +0100
Subject: [PATCH 184/235] [transformation]: extended InsertDWC transformation
 to instantiate a StreamingDataWidthConverter_ParallelWindow_Batch when
 applicable

---
 .../transformation/fpgadataflow/insert_dwc.py | 55 ++++++++++++++-----
 1 file changed, 41 insertions(+), 14 deletions(-)

diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index 140d154b1a..b779241c11 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -7,7 +7,7 @@
 
 
 def _is_dwc_node(node):
-    if node.op_type == "StreamingDataWidthConverter_Batch":
+    if node.op_type in ["StreamingDataWidthConverter_Batch", "StreamingDataWidthConverter_ParallelWindow_Batch"]:
         return True
     else:
         return False
@@ -30,6 +30,12 @@ def _suitable_node(node):
         return False
 
 
+def _is_parallel_window_mode(producer, consumer):
+    # True only when a producer running with parallel_window == 1 feeds a VVAU consumer. The attribute is
+    # probed first: get_nodeattr() raises AttributeError on ops that do not declare "parallel_window".
+    is_vvau = consumer.op_type in ["VectorVectorActivation", "VectorVectorActivation_rtl"]
+    return is_vvau and "parallel_window" in producer.get_nodeattr_types() and producer.get_nodeattr("parallel_window") == 1
+
 class InsertDWC(Transformation):
     """Add data width converters between layers where necessary."""
 
@@ -98,19 +104,40 @@ def apply(self, model):
                                 dwc_shape,
                             )
                             graph.value_info.append(dwc_output_tensor)
-
-                            dwc_node = oh.make_node(
-                                "StreamingDataWidthConverter_Batch",
-                                [output_name],
-                                [dwc_output_tensor.name],
-                                domain="finn.custom_op.fpgadataflow",
-                                backend="fpgadataflow",
-                                shape=dwc_shape,
-                                inWidth=dwc_in_width,
-                                outWidth=dwc_out_width,
-                                dataType=str(dtype.name),
-                                impl_style=impl_style,
-                            )
+                            
+                            if _is_parallel_window_mode(n0, consumer):
+                                simd = n1.get_nodeattr("SIMD")
+                                pe = n1.get_nodeattr("PE")
+                                channels = n1.get_nodeattr("Channels")
+                                kernel = n1.get_nodeattr("Kernel")
+                                dwc_node = oh.make_node(
+                                    "StreamingDataWidthConverter_ParallelWindow_Batch",
+                                    [output_name],
+                                    [dwc_output_tensor.name],
+                                    domain="finn.custom_op.fpgadataflow",
+                                    backend="fpgadataflow",
+                                    shape=dwc_shape,
+                                    inWidth=dwc_in_width,
+                                    outWidth=dwc_out_width,
+                                    dataType=str(dtype.name),
+                                    SIMD=simd,
+                                    PE=pe,
+                                    Channels=channels,
+                                    Kernel=kernel,
+                                )
+                            else:    
+                                dwc_node = oh.make_node(
+                                    "StreamingDataWidthConverter_Batch",
+                                    [output_name],
+                                    [dwc_output_tensor.name],
+                                    domain="finn.custom_op.fpgadataflow",
+                                    backend="fpgadataflow",
+                                    shape=dwc_shape,
+                                    inWidth=dwc_in_width,
+                                    outWidth=dwc_out_width,
+                                    dataType=str(dtype.name),
+                                    impl_style=impl_style,
+                                )
                             # insert dwc
                             graph.node.insert(node_ind + 1, dwc_node)
 

From 4d949d665a2014af3be0476f414c9102001d2db8 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 13 Oct 2023 23:38:54 +0100
Subject: [PATCH 185/235] [custom op]: added 2 new custom-ops

---
 src/finn/custom_op/fpgadataflow/__init__.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 19c0ddd999..001e95cdc7 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -57,12 +57,14 @@
 from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import (
     StreamingDataWidthConverter_Batch,
 )
+from finn.custom_op.fpgadataflow.streamingdatawidthconverter_parallelwindow_batch import StreamingDataWidthConverter_ParallelWindow_Batch
 from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO
 from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch
 from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch
 from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker
 from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour_Batch
 from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation
+from finn.custom_op.fpgadataflow.vectorvectoractivation_rtl import VectorVectorActivation_rtl
 
 custom_op = dict()
 
@@ -77,6 +79,7 @@
 custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl
 custom_op["TLastMarker"] = TLastMarker
 custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
+custom_op["StreamingDataWidthConverter_ParallelWindow_Batch"] = StreamingDataWidthConverter_ParallelWindow_Batch
 custom_op["StreamingFIFO"] = StreamingFIFO
 custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch
 custom_op["Pool_Batch"] = Pool_Batch
@@ -86,6 +89,7 @@
 custom_op["LabelSelect_Batch"] = LabelSelect_Batch
 custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch
 custom_op["VectorVectorActivation"] = VectorVectorActivation
+custom_op["VectorVectorActivation_rtl"] = VectorVectorActivation_rtl
 custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch
 custom_op["IODMA"] = IODMA
 custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition

From 05751c447dda411fbf68b2688a5186b350961713 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 13 Oct 2023 23:40:32 +0100
Subject: [PATCH 186/235] [VVU_RTL test]: added test for RTL-based VVU, which
 includes testing the StreamingDataWidthConverter_ParallelWindow_Batch

---
 .../test_fpgadataflow_vvau_rtl.py             | 214 ++++++++++++++++++
 1 file changed, 214 insertions(+)
 create mode 100644 tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py

diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py
new file mode 100644
index 0000000000..abf6029f59
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py
@@ -0,0 +1,214 @@
+# Copyright (C) 2022, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+import os
+
+import numpy as np
+from onnx import TensorProto, helper
+from qonnx.util.basic import (
+    qonnx_make_model,
+    gen_finn_dt_tensor
+)
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.core.datatype import DataType
+from qonnx.transformation.general import GiveUniqueNodeNames
+import finn.core.onnx_exec as oxe
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from qonnx.transformation.general import ApplyConfig
+import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
+from qonnx.custom_op.general.im2col import compute_conv_output_dim
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from qonnx.transformation.general import GiveReadableTensorNames
+import pickle
+from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
+from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition
+from qonnx.custom_op.registry import getCustomOp
+from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth
+#import qonnx.core.data_layout as DataLayout
+
+build_dir = os.environ["FINN_BUILD_DIR"]
+
+def make_single_dw_conv_modelwrapper(conv_config, idt, wdt):
+    kernel_size, in_feature_dim, in_chn = conv_config
+    stride = 1
+    pad = 0
+
+    out_feature_dim = compute_conv_output_dim(in_feature_dim, kernel_size, stride, pad)
+    group = out_chn = in_chn
+    
+    conv_param_shape = [out_chn, 1, kernel_size, kernel_size]
+    input_shape = [1, in_chn, in_feature_dim, in_feature_dim]
+    output_shape = [1, out_chn, out_feature_dim, out_feature_dim]
+
+    conv_config = {}
+    conv_config["dilations"] = [1, 1]
+    conv_config["group"] = group
+    conv_config["kernel_shape"] = [kernel_size, kernel_size]
+    conv_config["pads"] = [pad, pad, pad, pad]
+    conv_config["strides"] = [stride, stride]
+
+    ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, input_shape)
+    ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, output_shape)
+    weights = [helper.make_tensor_value_info("weights", TensorProto.FLOAT, conv_param_shape)]
+
+    modelproto = qonnx_make_model(
+        helper.make_graph(
+            name="conv_test",
+            inputs=[ifm],
+            outputs=[ofm],
+            value_info=weights,
+            nodes=[helper.make_node("Conv", ["ifm", "weights"], ["ofm"], **conv_config)],
+        )
+    )
+
+    model = ModelWrapper(modelproto)
+    model.set_tensor_datatype("ifm", idt)
+    model.set_tensor_datatype("weights", wdt)
+    model.set_initializer("weights", gen_finn_dt_tensor(wdt, conv_param_shape))
+
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
+    return model
+
+def prepare_inputs(input_tensor):
+    return {"global_in": input_tensor}
+
+@pytest.mark.parametrize("kernel_size", [3])
+@pytest.mark.parametrize("in_feature_dim", [5])
+@pytest.mark.parametrize("in_chn", [4])
+@pytest.mark.parametrize("idt", [DataType["INT8"]])
+#@pytest.mark.parametrize("idt", [DataType["UINT8"]])
+@pytest.mark.parametrize("wdt", [DataType["INT6"]])
+@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"])
+@pytest.mark.parametrize("segmentlen", [1])
+@pytest.mark.parametrize("pe", [4])
+@pytest.mark.parametrize("simd", [3])
+@pytest.mark.fpgadataflow
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, part, segmentlen, pe, simd):
+    # Create depthwise-separable convolution
+    conv_config = (kernel_size, in_feature_dim, in_chn)
+    model = make_single_dw_conv_modelwrapper(conv_config, idt, wdt)
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model.save(build_dir+"/dw_conv.onnx")
+
+    # Obtain golden reference output
+    golden_in = gen_finn_dt_tensor(model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in"))
+    input_dict = prepare_inputs(golden_in)
+    golden_out = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)
+    with open(build_dir+"/onnx_dws_conv.pkl", "wb") as f:
+        pickle.dump(golden_out, f)
+
+    # Convert to HLS custom-op first
+    model = model.transform(LowerConvsToMatMul())
+    model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True))
+    model = model.transform(to_hls.InferVectorVectorActivation())
+    model = model.transform(MinimizeAccumulatorWidth())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model.save(build_dir+"/hls_vvau.onnx")
+
+    # Apply folding (i.e. specify to use DSPs)
+    folding_config = {
+        "Defaults": {},
+        "ConvolutionInputGenerator_rtl_0": {
+            "SIMD" : pe,
+            "parallel_window" : 1
+        },
+        "VectorVectorActivation_0": {
+            "PE" : pe,
+            "SIMD" : simd,
+            "mem_mode" : "decoupled",
+            "ram_style" : "auto",
+            "resType" : "dsp",
+            "preferred_backend" : "rtl"
+        }
+    }
+    model = model.transform(ApplyConfig(folding_config))
+    model.save(build_dir+"/hls_vvau_folded.onnx")
+
+    # Obtain second reference from HLS-based VVAU layer
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(PrepareIP(part, 5))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+    conv_hls_out = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)
+    with open(build_dir+"/hls_vvau_folded_output.pkl", "wb") as f:
+        pickle.dump(conv_hls_out, f)
+    #assert (golden_out["global_out"] == conv_hls_out["global_out"]).all()
+
+    # Stitched-IP RTLsim
+    model = model.transform(CreateDataflowPartition(partition_model_dir=build_dir))
+    model.save(build_dir+"/ip-stitched.onnx")
+    partition_model_path = getCustomOp(model.get_nodes_by_op_type("StreamingDataflowPartition")[0]).get_nodeattr("model")
+    partitioned_model = ModelWrapper(partition_model_path)
+    # FIFOs needed for stitched-ip RTLsim, DWC needed for VVU operating on SIMD parallelism
+    partitioned_model = partitioned_model.transform(InsertAndSetFIFODepths(part, 5))
+    partitioned_model = partitioned_model.transform(PrepareIP(part, 5))
+    partitioned_model = partitioned_model.transform(HLSSynthIP())
+    partitioned_model.save(build_dir+"/partitioned_model.onnx")
+    partitioned_model = partitioned_model.transform(CreateStitchedIP(part, 5))
+    partitioned_model.save(partition_model_path)
+    partitioned_model.set_metadata_prop("rtlsim_trace", build_dir+"/hls-vvu.vcd")
+    # set top-level prop for stitched-ip rtlsim and launch
+    partitioned_model.set_metadata_prop("exec_mode", "rtlsim")
+    # transpose input since we're now simulating HW layers (NCHW --> NHWC)
+    input_dict["global_in"] = np.transpose(input_dict["global_in"], (0,2,3,1))
+    stitched_ip_out = oxe.execute_onnx(partitioned_model, input_dict, return_full_exec_context=True)
+    with open(build_dir+"/stitched_ip_output.pkl", "wb") as f:
+        pickle.dump(stitched_ip_out, f)
+
+    # Apply convert-to-rtl step
+    partitioned_model = partitioned_model.transform(to_rtl.InferRTLVectorVectorActivation())
+    partitioned_model = partitioned_model.transform(GiveUniqueNodeNames())
+    partitioned_model = partitioned_model.transform(GiveReadableTensorNames())
+    partitioned_model = partitioned_model.transform(PrepareIP(part, 5))
+    partitioned_model = partitioned_model.transform(HLSSynthIP())
+    partitioned_model = partitioned_model.transform(CreateStitchedIP(part, 5))
+    partitioned_model.save(build_dir+"/partition_rtl_vvau.onnx")
+    partitioned_model.set_metadata_prop("rtlsim_trace", build_dir+"/rtl-vvu.vcd")
+    # Reset rtlsim_so path to re-generate Pyverilator sim object
+    partitioned_model.set_metadata_prop("rtlsim_so", "")
+    # set top-level prop for stitched-ip rtlsim and launch
+    partitioned_model.set_metadata_prop("exec_mode", "rtlsim")
+    vvu_rtl_out = oxe.execute_onnx(partitioned_model, input_dict, return_full_exec_context=True)
+    with open(build_dir+"/rtl_vvau_output.pkl", "wb") as f:
+        pickle.dump(vvu_rtl_out, f)
+    # assert (conv_hls_out["global_out"] == vvu_rtl_out["global_out"]).all(), "Mismatch"

From 6d4ee089faf8232d0bc54eaf6b8c8118ab93c6f7 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 1 Nov 2023 15:20:07 +0000
Subject: [PATCH 187/235] [mvu vvu axi]: minor bugfixes to enable VVU

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index f0f75c633a..ddedec1e8a 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -64,7 +64,7 @@ module mvu_vvu_axi #(
 	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
 	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
 	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE),
+	localparam int unsigned SF = MW/SIMD,
 	localparam int unsigned NF = IS_MVU ? MH/PE : 1,
 	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
 )
@@ -91,11 +91,11 @@ module mvu_vvu_axi #(
 
 //-------------------- Parameter sanity checks --------------------\\
 	initial begin
-		if ((MW % SIMD != 0 && IS_MVU) || (MW % (SIMD*PE) != 0 && !IS_MVU)) begin
+		if (MW % SIMD != 0) begin
 			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
 			$finish;
 		end
-		if (MH % PE != 0 && IS_MVU) begin
+		if (MH % PE != 0) begin
 			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
 			$finish;
 		end
@@ -152,13 +152,10 @@ module mvu_vvu_axi #(
 //-------------------- Core MVU/VVU --------------------\\
 	uwire ovld;
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
-	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
 	uwire mvauin_t amvau_i;
-	uwire mvauin_weight_t wmvau_i;
 
 	if (IS_MVU) begin : genMVUInput
 		assign  amvau_i = amvau;
-		assign  wmvau_i = s_axis_weights_tdata;
 	end : genMVUInput
 	else begin : genVVUInput
 		// The input stream will have the channels interleaved for VVU when PE>1
@@ -169,11 +166,8 @@ module mvu_vvu_axi #(
 		localparam int num_of_elements = PE*SIMD;
 		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
 			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
-									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
+									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH -1: (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
 									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
-			assign  wmvau_i[i*WEIGHT_WIDTH +: WEIGHT_WIDTH] = (PE > 1) ? 
-									s_axis_weights_tdata[( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD + 1) * WEIGHT_WIDTH : ( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD ) * WEIGHT_WIDTH]
-									: s_axis_weights_tdata[i*WEIGHT_WIDTH +: WEIGHT_WIDTH];
 		end : genRewire
 	end : genVVUInput
 
@@ -183,7 +177,7 @@ module mvu_vvu_axi #(
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(wmvau_i), .a(amvau_i),
+			.last(alast && avld), .zero(!istb), .w(s_axis_weights_tdata), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
 	"mvu_4sx4u":

From 39dc27ac24fcae5999536c504f2150a5f6f7be7e Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 1 Nov 2023 15:26:30 +0000
Subject: [PATCH 188/235] [mvu tb]: created separate vvu testbench and renamed
 mvu_vvu_axi tb

---
 .../tb/{mvu_vvu_axi_tb.sv => mvu_axi_tb.sv}   |  16 +-
 finn-rtllib/mvu/tb/vvu_axi_tb.sv              | 227 ++++++++++++++++++
 2 files changed, 235 insertions(+), 8 deletions(-)
 rename finn-rtllib/mvu/tb/{mvu_vvu_axi_tb.sv => mvu_axi_tb.sv} (96%)
 create mode 100644 finn-rtllib/mvu/tb/vvu_axi_tb.sv

diff --git a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
similarity index 96%
rename from finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
rename to finn-rtllib/mvu/tb/mvu_axi_tb.sv
index b46fc588c9..8614e9f811 100644
--- a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
+++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
@@ -31,24 +31,24 @@
  * @brief	Testbench for MVU AXI-lite interface wrapper.
  *****************************************************************************/
 
-module mvu_vvu_axi_tb();
+module mvu_axi_tb();
 
 //-------------------- Simulation parameters --------------------\\
 	// Matrix & parallelism config
 	localparam bit IS_MVU = 0;
 	localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58";
 	localparam int unsigned MW = 36;
-	localparam int unsigned MH = 1;
-	localparam int unsigned SIMD = 3;
+	localparam int unsigned MH = 4;
+	localparam int unsigned SIMD = 36;
 	localparam int unsigned PE = 4;
-	localparam int unsigned SEGMENTLEN = 1.0;
+	localparam int unsigned SEGMENTLEN = 2.0;
 	localparam bit FORCE_BEHAVIORAL = 1;
 	localparam bit M_REG_LUT = 1;
 	// Bit-width config
-	localparam int unsigned ACTIVATION_WIDTH = 8;
-	localparam int unsigned WEIGHT_WIDTH = 6;
+	localparam int unsigned ACTIVATION_WIDTH = 4;
+	localparam int unsigned WEIGHT_WIDTH = 4;
 	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
-	localparam bit SIGNED_ACTIVATIONS = 1;
+	localparam bit SIGNED_ACTIVATIONS = 0;
 	// Simulation constants
 	localparam int unsigned NF = IS_MVU ? MH/PE : 1;
 	localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE);
@@ -238,4 +238,4 @@ module mvu_vvu_axi_tb();
 		.m_axis_output_tready(outputs.rdy)
 	);
 
-endmodule : mvu_vvu_axi_tb
+endmodule : mvu_axi_tb
diff --git a/finn-rtllib/mvu/tb/vvu_axi_tb.sv b/finn-rtllib/mvu/tb/vvu_axi_tb.sv
new file mode 100644
index 0000000000..fbb45845e1
--- /dev/null
+++ b/finn-rtllib/mvu/tb/vvu_axi_tb.sv
@@ -0,0 +1,227 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for VVU mode (IS_MVU = 0) of the mvu_vvu_axi interface wrapper.
+ *****************************************************************************/
+
+module vvu_axi_tb();
+
+//-------------------- Simulation parameters --------------------\\
+	// Matrix & parallelism config
+	localparam bit IS_MVU = 0;
+	localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58";
+	localparam int unsigned MW = 25; // Kernel*Kernel
+	localparam int unsigned MH = 4; // Channels
+	localparam int unsigned SIMD = 25; // MW%SIMD == 0
+	localparam int unsigned PE = 2; // MH%PE == 0
+	localparam int unsigned SEGMENTLEN = 3.0;
+	localparam bit FORCE_BEHAVIORAL = 1;
+	localparam bit M_REG_LUT = 1;
+	// Bit-width config
+	localparam int unsigned ACTIVATION_WIDTH = 4;
+	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
+	localparam bit SIGNED_ACTIVATIONS = 1;
+	// Simulation constants
+	localparam int unsigned NF = MH/PE;
+	localparam int unsigned SF = MW/SIMD;
+	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
+	localparam int unsigned ACTIVATION_WIDTH_BA = (PE*SIMD*ACTIVATION_WIDTH+7)/8*8;
+	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
+	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - PE*SIMD*ACTIVATION_WIDTH;
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
+
+	// Generate clk and reset signal
+	logic clk = 0;
+	always #5ns clk = !clk;
+
+	logic ap_rst_n = 0;
+	initial begin
+		repeat(16) @(posedge clk);
+		ap_rst_n <= 1;
+	end
+
+	uwire ap_clk = clk;
+
+	// Generate activations
+	typedef logic [PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+	typedef activation_t activation_vector_t[NF*SF];
+
+	function activation_vector_t init_ACTIVATIONS;
+		automatic activation_vector_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_ACTIVATIONS
+
+	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
+
+	struct {
+		activation_t dat;
+		logic vld;
+		logic rdy;
+	} activations;
+
+	initial begin
+		activations.vld = 0;
+		activations.dat = 'X;
+		@(posedge clk iff ap_rst_n);
+
+		for (int i=0; i<SF*NF; i++) begin
+			activations.dat <= ACTIVATIONS[i];
+			do begin
+				activations.vld <= $urandom()%7 >= 0;
+				@(posedge clk);
+			end while (!(activations.vld === 1 && activations.rdy === 1));
+		end
+
+		activations.vld <= 0;
+		activations.dat <= 'x;
+	end
+
+	// Generate weights
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+	typedef weight_t weight_matrix_t[NF][SF];
+
+	function weight_matrix_t init_WEIGHTS;
+		automatic weight_matrix_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_WEIGHTS;
+
+	weight_matrix_t WEIGHTS = init_WEIGHTS();
+
+	struct {
+		weight_t dat;
+		logic vld;
+		logic rdy;
+	} weights;
+
+	initial begin
+		weights.vld = 0;
+		weights.dat = 'X;
+		@(posedge clk iff ap_rst_n);
+
+		weights.vld <= 1;
+		for (int i=0; i<NF; i++) begin
+			for (int j=0; j<SF; j++) begin
+				weights.dat <= WEIGHTS[i][j];
+				@(posedge clk iff weights.rdy);
+			end
+		end
+
+		weights.vld <= 0;
+		weights.dat <= 'x;
+	end
+
+	// Function to compute golden output
+	// a: [NF*SF][PE*SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
+	typedef output_t output_vector_t [NF];
+
+	struct {
+		output_t dat;
+		logic vld;
+		logic rdy;
+	} outputs;
+
+	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+		automatic output_vector_t res = '{default: 0};
+		// The input stream has the channels interleaved for VVU when PE > 1.
+		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..].
+		// Note that for each 'SIMD' (S) and 'PE' (P) element, the stream is ordered as:
+		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i), which we need to 'untangle' to
+		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i); element (S_l, P_k) is read from index k + l*PE.
+		for (int i = 0; i < NF; i++) begin
+			for (int j = 0; j < SF; j++) begin
+				for (int k = 0; k < PE; k++) begin
+					for (int l = 0; l < SIMD; l++) begin
+						if (SIGNED_ACTIVATIONS)
+							res[i][k] = $signed(res[i][k]) + $signed(a[i*SF+j][k + l*PE]) * $signed(w[i][j][k][l]);
+						else
+							res[i][k] = $signed(res[i][k]) + $signed({1'b0, a[i*SF+j][k + l*PE]}) * $signed(w[i][j][k][l]);
+					end
+				end
+			end
+		end
+		return res;
+	endfunction : check_output;
+
+	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+
+	int unsigned NF_CNT = 0;
+	initial begin
+		outputs.rdy = 0;
+		while (NF_CNT < NF) begin
+			// Loop until both rdy & vld are asserted
+			do begin
+				outputs.rdy <= $urandom()%7 >= 0;
+				@(posedge clk iff ap_rst_n);
+			end while (!(outputs.rdy === 1 && outputs.vld === 1));
+
+			// Compare produced outputs against golden outputs
+			foreach(outputs.dat[i]) begin
+				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+				else begin
+					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+					$stop;
+				end
+			end
+
+			NF_CNT += 1;
+		end
+
+		$finish;
+	end
+
+	// Instantiate DUT
+	mvu_vvu_axi #(
+		.IS_MVU(IS_MVU),
+		.COMPUTE_CORE(COMPUTE_CORE),
+		.MW(MW),
+		.MH(MH),
+		.PE(PE),
+		.SIMD(SIMD),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.SEGMENTLEN(SEGMENTLEN),
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL),
+		.M_REG_LUT(M_REG_LUT)
+	)
+	dut (
+		.ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
+		.s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
+		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
+		.m_axis_output_tready(outputs.rdy)
+	);
+
+endmodule : vvu_axi_tb

From 87b25f9cca342f004aa79d5c4738ba1e5e8398e7 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 1 Nov 2023 15:29:42 +0000
Subject: [PATCH 189/235] [rtl-vvu custom-op]: flipped weights per SIMD-chunk
 to match pattern of incoming input activations easier

---
 .../fpgadataflow/vectorvectoractivation_rtl.py    | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py
index 72976bc9a8..3ffef9c3a5 100644
--- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py
@@ -467,6 +467,8 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
             weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1)
             # PE flip for saving weights in .dat
             weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2)
+            # SIMD & PE flip
+            weight_tensor_pe_simd_flipped = np.flip(weight_tensor_pe_flipped, axis=-1)
             # reshape weight tensor (simd_flipped and pe_flipped) to desired shape
             pe = self.get_nodeattr("PE")
             simd = self.get_nodeattr("SIMD")
@@ -476,6 +478,9 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
             # flipped
             weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(1, -1, pe * simd)
             weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy()
+            # SIMD & PE flipped
+            weight_tensor_pe_simd_flipped = weight_tensor_pe_simd_flipped.reshape(1, -1, pe * simd)
+            weight_tensor_pe_simd_flipped = weight_tensor_pe_simd_flipped.copy()
             if weight_file_mode == "decoupled_npy":
                 # save weight stream into npy for cppsim
                 np.save(weight_file_name, weight_tensor_simd_flipped)
@@ -484,11 +489,11 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
                 weight_width = self.get_weightstream_width()
                 # pad to nearest 4 bits to get hex strings
                 weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
-                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
-                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+                weight_tensor_pe_simd_flipped = pack_innermost_dim_as_hex_string(
+                    weight_tensor_pe_simd_flipped, export_wdt, weight_width_padded, prefix=""
                 )
                 # add zeroes to pad out file to 1024 entries
-                weight_stream = weight_tensor_pe_flipped.flatten()
+                weight_stream = weight_tensor_pe_simd_flipped.flatten()
                 weight_stream = weight_stream.copy()
                 with open(weight_file_name, "w") as f:
                     for val in weight_stream:
@@ -1180,9 +1185,9 @@ def prepare_codegen_default(self, fpgapart, clk):
         code_gen_dict = {}
         code_gen_dict["$IS_MVU$"] = [str(0)]
         code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)]
-        mw = int(np.prod(self.get_nodeattr("Kernel")) * self.get_nodeattr("Channels"))
+        mw = int(np.prod(self.get_nodeattr("Kernel")))
         code_gen_dict["$MW$"] = [str(mw)]
-        code_gen_dict["$MH$"] = [str(1)]
+        code_gen_dict["$MH$"] = [str(self.get_nodeattr("Channels"))]
         code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))]
         code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))]
         code_gen_dict["$ACTIVATION_WIDTH$"] = [

From 1476927e957100cff85205e147e1b00d4b1ba198 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 1 Nov 2023 16:01:08 +0000
Subject: [PATCH 190/235] [rtl vvu test]: extended testbench

---
 tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py
index abf6029f59..29132da90e 100644
--- a/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py
+++ b/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py
@@ -116,8 +116,8 @@ def prepare_inputs(input_tensor):
 @pytest.mark.parametrize("wdt", [DataType["INT6"]])
 @pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"])
 @pytest.mark.parametrize("segmentlen", [1])
-@pytest.mark.parametrize("pe", [4])
-@pytest.mark.parametrize("simd", [3])
+@pytest.mark.parametrize("pe", [1, 2, 4])
+@pytest.mark.parametrize("simd", [1, 3, 9])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
@@ -149,7 +149,7 @@ def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, pa
     folding_config = {
         "Defaults": {},
         "ConvolutionInputGenerator_rtl_0": {
-            "SIMD" : pe,
+            "SIMD" : 4,
             "parallel_window" : 1
         },
         "VectorVectorActivation_0": {
@@ -172,7 +172,6 @@ def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, pa
     conv_hls_out = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)
     with open(build_dir+"/hls_vvau_folded_output.pkl", "wb") as f:
         pickle.dump(conv_hls_out, f)
-    #assert (golden_out["global_out"] == conv_hls_out["global_out"]).all()
 
     # Stitched-IP RTLsim
     model = model.transform(CreateDataflowPartition(partition_model_dir=build_dir))
@@ -211,4 +210,6 @@ def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, pa
     vvu_rtl_out = oxe.execute_onnx(partitioned_model, input_dict, return_full_exec_context=True)
     with open(build_dir+"/rtl_vvau_output.pkl", "wb") as f:
         pickle.dump(vvu_rtl_out, f)
-    # assert (conv_hls_out["global_out"] == vvu_rtl_out["global_out"]).all(), "Mismatch"
+    
+    assert (vvu_rtl_out["global_out"] == golden_out["global_out"]).all(), "Output of ONNX model not matching output of stitched-IP RTL model!"
+    assert (vvu_rtl_out["global_out"] == stitched_ip_out["global_out"]).all(), "Output of stitched-IP HLS model not matching output of stitched-IP RTL model!"

From 7fc173b44b1edd2548535d9e0d9a808f8652a805 Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu <yamanu@xilinx.com>
Date: Tue, 14 Nov 2023 15:19:04 +0000
Subject: [PATCH 191/235] [RTLThres] compute obits in Python and use
 placeholder in template

---
 .../hdl/thresholding_axi_wrapper.v            |  4 +-
 .../thresholding_binary_search.py             | 46 ++++++++-----------
 2 files changed, 19 insertions(+), 31 deletions(-)

diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
index 2657b39d98..893c791ccc 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v
@@ -40,9 +40,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter  SIGNED = $SIGNED$,	// signed inputs
 	parameter  BIAS = $BIAS$,		// offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS)
 
-	parameter  O_BITS = BIAS > 0?
-		/* unsigned */ $clog2(2**N+BIAS) :
-		/* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS)
+	parameter  O_BITS = $O_BITS$
 )(
 	//- Global Control ------------------
 	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *)
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
index d02b778823..5fe818f4ac 100755
--- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py
@@ -26,6 +26,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import math
 import numpy as np
 import os
 import warnings
@@ -221,23 +222,17 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
         ), """Threshold matrix dimension is
         not as expected (2)."""
         n_thres_steps = orig_thres_matrix.shape[1]
-        assert n_thres_steps == self.get_nodeattr(
-            "numSteps"
-        ), "Mismatch in threshold steps"
+        assert n_thres_steps == self.get_nodeattr("numSteps"), "Mismatch in threshold steps"
         if not self.get_input_datatype().signed():
             # ensure all thresholds are nonnegative
             assert (orig_thres_matrix >= 0).all()
         # ensure all thresholds are integer
-        assert np.equal(
-            np.mod(orig_thres_matrix, 1), 0
-        ).all(), "Need int threshold tensor"
+        assert np.equal(np.mod(orig_thres_matrix, 1), 0).all(), "Need int threshold tensor"
         ret = orig_thres_matrix
         # ensure channels = mh , duplicating if necessary
         if ret.shape[0] == 1:
             ret = np.tile(ret, (mh, 1))
-        assert (
-            ret.shape[0] == mh
-        ), "Channels of threshold matrix are not as expected (mh)"
+        assert ret.shape[0] == mh, "Channels of threshold matrix are not as expected (mh)"
         # distribute rows between PEs
         ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
         assert (
@@ -268,18 +263,16 @@ def prepare_codegen_rtl_values(self):
 
         # Identify the module variables
         output_data_type = self.get_nodeattr("outputDataType")  # output precision
-        input_data_type = self.get_nodeattr(
-            "inputDataType"
-        )  # input/threshold precision
+        input_data_type = self.get_nodeattr("inputDataType")  # input/threshold precision
         num_channels = self.get_nodeattr("NumChannels")  # number of channels
         bias = self.get_nodeattr("activation_bias")  # activation bias value
         pe = self.get_nodeattr("PE")
+        o_bitwidth = DataType[output_data_type].bitwidth()
+        i_bitwidth = DataType[input_data_type].bitwidth()
 
-        code_gen_dict["$N$"] = [
-            str(DataType[output_data_type].bitwidth())
-        ]  # output precision - convert bitwidth to string
+        code_gen_dict["$N$"] = [str(o_bitwidth)]  # output precision - convert bitwidth to string
         code_gen_dict["$M$"] = [
-            str(DataType[input_data_type].bitwidth())
+            str(i_bitwidth)
         ]  # input/threshold precision - convert bitwidth to string
         code_gen_dict["$C$"] = [str(num_channels)]  # number of channels
         code_gen_dict["$BIAS$"] = [str(bias)]  # activation bias value
@@ -289,8 +282,13 @@ def prepare_codegen_rtl_values(self):
         # The thresholding core needs to know this when comparing weights to inputs
         if self.get_input_datatype().signed():
             code_gen_dict["$SIGNED$"] = [str(1)]
+            o_bits = 1 + math.ceil(
+                -bias if -bias >= 2 ** (o_bitwidth - 1) else 2**o_bitwidth + bias
+            )
         else:
             code_gen_dict["$SIGNED$"] = [str(0)]
+            o_bits = math.ceil(2**o_bitwidth + bias)
+        code_gen_dict["$O_BITS$"] = [str(o_bits)]
 
         return code_gen_dict
 
@@ -429,18 +427,14 @@ def execute_node(self, context, graph):
         # Create a PyVerilator wrapper of the RTLSim .so
         sim = self.get_rtlsim()
         nbits = self.get_instream_width()
-        inp = npy_to_rtlsim_input(
-            "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
-        )
+        inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
 
         super().reset_rtlsim(sim)
         super().toggle_clk(sim)
 
         wnbits = self.get_weightstream_width()
         export_wdt = self.get_weight_datatype()
-        wei = npy_to_rtlsim_input(
-            "{}/thresholds.npy".format(code_gen_dir), export_wdt, wnbits
-        )
+        wei = npy_to_rtlsim_input("{}/thresholds.npy".format(code_gen_dir), export_wdt, wnbits)
         num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
         io_dict = {
             "inputs": {"in0": inp, "weights": wei * num_w_reps},
@@ -456,9 +450,7 @@ def execute_node(self, context, graph):
         out_npy_path = "{}/output.npy".format(code_gen_dir)
         out_shape = self.get_folded_output_shape()
 
-        rtlsim_output_to_npy(
-            output, out_npy_path, odt, out_shape, packed_bits, target_bits
-        )
+        rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits)
 
         # load and reshape output
         output = np.load(out_npy_path)
@@ -475,9 +467,7 @@ def code_generation_ipi(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
 
         for rtl_file in rtl_file_list:
-            cmd.append(
-                "add_files -norecurse %s" % (os.path.join(code_gen_dir, rtl_file))
-            )
+            cmd.append("add_files -norecurse %s" % (os.path.join(code_gen_dir, rtl_file)))
 
         # Create an RTL block, not an IP core (-type ip)
         cmd.append(

From a62911cda8d882e4e4dbc662815053652cda4edd Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 20 Nov 2023 14:35:45 +0000
Subject: [PATCH 192/235] [mvu vvu axi]: minor fix -- define mvauin_weight_t

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 1 +
 1 file changed, 1 insertion(+)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index ddedec1e8a..8eb92a93e6 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -153,6 +153,7 @@ module mvu_vvu_axi #(
 	uwire ovld;
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
 	uwire mvauin_t amvau_i;
+	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
 
 	if (IS_MVU) begin : genMVUInput
 		assign  amvau_i = amvau;

From 4d4c61b80b01858c5da2b14a2125bd5e513a7c6b Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 20 Nov 2023 14:40:46 +0000
Subject: [PATCH 193/235] [specialize_to_rtl step]: add transformation to infer
 RTL-VVU

---
 src/finn/builder/build_dataflow_config.py | 2 +-
 src/finn/builder/build_dataflow_steps.py  | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index 073bc9e12b..0d6911035c 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -116,12 +116,12 @@ class VerificationStepType(str, Enum):
     "step_tidy_up",
     "step_streamline",
     "step_convert_to_hls",
+    "step_specialize_to_rtl",
     "step_create_dataflow_partition",
     "step_target_fps_parallelization",
     "step_apply_folding_config",
     "step_minimize_bit_width",
     "step_generate_estimate_reports",
-    "step_specialize_to_rtl",
     "step_hls_codegen",
     "step_hls_ipgen",
     "step_set_fifo_depths",
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 2629efef11..83278aae41 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -404,6 +404,7 @@ def step_target_fps_parallelization(model: ModelWrapper, cfg: DataflowBuildConfi
                 target_cycles_per_frame,
                 mvau_wwidth_max=cfg.mvau_wwidth_max,
                 two_pass_relaxation=cfg.folding_two_pass_relaxation,
+                fpga_part=cfg._resolve_fpga_part()
             )
         )
         # extract the suggested configuration and save it as json
@@ -476,7 +477,7 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig
 def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig):
     """Convert layers implemented in HLS to an equivalent specialized RTL
     implementation if possible."""
-    specialize_to_rtl_transforms = [to_rtl.InferRTLMatrixVectorActivation()]
+    specialize_to_rtl_transforms = [to_rtl.InferRTLMatrixVectorActivation(), to_rtl.InferRTLVectorVectorActivation()]
     for trn in specialize_to_rtl_transforms:
         model = model.transform(trn)
     return model

From 612ed8f7ee2869d98ec9ed4084b64932e7b76cb0 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 20 Nov 2023 14:41:37 +0000
Subject: [PATCH 194/235] [rtl vvu custom op]: clean-up of unused functions

---
 .../vectorvectoractivation_rtl.py             | 162 +-----------------
 1 file changed, 3 insertions(+), 159 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py
index 3ffef9c3a5..e3f0abb6c5 100644
--- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py
@@ -66,7 +66,7 @@ def get_nodeattr_types(self):
             "Dim": ("ints", True, []),  # [H, W]
             "Channels": ("i", True, 0),
             "Kernel": ("ints", True, []),  # [H, W]
-            "resType": ("s", False, "auto", {"auto", "lut", "dsp"}),
+            "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}),
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
             "weightDataType": ("s", True, ""),
@@ -77,7 +77,7 @@ def get_nodeattr_types(self):
             # const -- embedded weights, default, long compile/synth times
             # decoupled -- streaming weights with weight streamer packaged inside IP
             # external -- streaming weights with external streamer
-            "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}),
+            "mem_mode": ("s", False, "decoupled", {"const", "decoupled", "external"}),
             # (mem_mode = decoupled only) whether weights will be writable through
             # an AXI-lite interface during runtime
             # 1 for enabled, 0 for disabled.
@@ -303,45 +303,6 @@ def get_exp_cycles(self):
         exp_cycles = ((ch * k_h * k_w) / pe / simd) * batch_size * (dim_h * dim_w) / mmv
         return int(exp_cycles)
 
-    def get_template_param_values(self):
-        """Returns the template parameter values according to input, output and weight
-        data types."""
-        ret = dict()
-        inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
-        out_hls_str = self.get_output_datatype().get_hls_datatype_str()
-        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
-        # out_is_binary = self.get_output_datatype() == DataType["BINARY"]
-        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
-        bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
-        if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode):
-            raise Exception("True binary (non-bipolar) inputs not yet supported")
-        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
-        # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"]
-        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
-        # reinterpret inp/wt as bipolar if bin_xnor_mode is iset
-        inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
-        wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
-        # fill in TSrcI and TWeightI
-        # TODO check these with Giulio
-        # TODO handle non-bipolar binary inputs
-        if inp_is_bipolar and wt_is_bipolar:
-            ret["TSrcI"] = "Recast<XnorMul>"
-            ret["TWeightI"] = "Identity"
-        elif (not inp_is_bipolar) and wt_is_bipolar:
-            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
-            ret["TWeightI"] = "Recast<Binary>"
-        elif inp_is_bipolar and (not wt_is_bipolar):
-            ret["TSrcI"] = "Recast<Binary>"
-            ret["TWeightI"] = "Identity"
-        elif (not inp_is_bipolar) and (not wt_is_bipolar):
-            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
-            ret["TWeightI"] = "Identity"
-
-        # fill in TDstI
-        ret["TDstI"] = "Slice<%s>" % out_hls_str
-
-        return ret
-
     def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
         pe = self.get_nodeattr("PE")
         simd = self.get_nodeattr("SIMD")
@@ -365,57 +326,6 @@ def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
         ret = ret.reshape(1, pe, wmem, simd)
         return ret
 
-    def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
-        """Convert the original numpy weight matrix orig_weight_matrix into
-        a form suitable for passing to the hlslib call:
-        * ensure MH % PE == 0
-        * for bipolar weights&inputs, ensure thresholds are positive
-        * interleave rows between PEs
-        * reshape into (PE, TMEM, n_thres_steps) and return
-        """
-        ch = self.get_nodeattr("Channels")
-        pe = self.get_nodeattr("PE")
-        tmem = self.calc_tmem()
-        assert ch % pe == 0, "Requirement Channels divisable by PE is violated."
-        assert (
-            orig_thres_matrix.ndim == 2
-        ), """Threshold matrix dimension is
-        not as expected (2)."""
-        n_thres_steps = orig_thres_matrix.shape[1]
-        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
-        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
-        # reinterpret inp/wt as bipolar if bin_xnor_mode is iset
-        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
-        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
-        bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
-        inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
-        wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
-        if inp_is_bipolar and wt_is_bipolar:
-            # ensure all thresholds are nonnegative
-            assert (orig_thres_matrix >= 0).all()
-            # ensure all thresholds are integer
-            assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all()
-        ret = orig_thres_matrix
-        # ensure channels = mh , duplicating if necessary
-        if ret.shape[0] == 1:
-            ret = np.tile(ret, (ch, 1))
-        assert ret.shape[0] == ch, "Channels of threshold matrix are not as expected (ch)"
-        # distribute rows between PEs
-        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
-        assert (
-            ret.shape[0] == pe
-        ), """First dimension after distribution of the
-        rows between PEs is not as expected (pe)"""
-        assert (
-            ret.shape[1] == tmem
-        ), """Second dimension after distribution of the
-        rows between PEs is not as expected (tmem)"""
-        assert (
-            ret.shape[2] == n_thres_steps
-        ), """Third dimension after distribution of the
-        rows between PEs is not as expected (n_thres_steps)"""
-        return ret.reshape(1, pe, tmem, n_thres_steps)
-
     def make_weight_file(self, weights, weight_file_mode, weight_file_name):
         """Produce a file containing given weights in appropriate format for this
         layer. This file can be used for either synthesis or run-time reconfig
@@ -549,55 +459,6 @@ def generate_params(self, model, path):
                 currently no other parameter value is supported!"""
             )
 
-        # save thresholds in thresh.h
-        if len(self.onnx_node.input) > 2:
-            thresholds = model.get_initializer(self.onnx_node.input[2])
-            if thresholds is not None:
-                threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
-                # use UINT32 threshold export for bipolar times bipolar
-                inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
-                wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
-                # reinterpret inp/wt as bipolar if bin_xnor_mode is iset
-                inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
-                wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
-                bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
-                inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
-                wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
-                # get computed threshold datatype from attribute
-                tdt = DataType[self.get_nodeattr("accDataType")]
-
-                assert np.vectorize(tdt.allowed)(
-                    threshold_tensor
-                ).all(), "Thresholds in %s can't be expressed with type %s" % (
-                    self.onnx_node.name,
-                    str(tdt),
-                )
-                thresholds_hls_code = numpy_to_hls_code(
-                    threshold_tensor, tdt, "thresholds", False, True
-                )
-                # write thresholds into thresh.h
-                f_thresh = open("{}/thresh.h".format(code_gen_dir), "w")
-                tdt_hls = tdt.get_hls_datatype_str()
-                # use binary to export bipolar activations
-                export_odt = self.get_output_datatype()
-                if self.get_output_datatype() == DataType["BIPOLAR"]:
-                    export_odt = DataType["BINARY"]
-                odt_hls = export_odt.get_hls_datatype_str()
-                f_thresh.write(
-                    "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \
-                    = ".format(
-                        self.calc_tmem(),
-                        self.get_nodeattr("PE"),
-                        threshold_tensor.shape[-1],
-                        tdt_hls,
-                        odt_hls,
-                        self.get_nodeattr("ActVal"),
-                        "comp::less_equal<%s, %s>" % (tdt_hls, tdt_hls),
-                    )
-                )
-                f_thresh.write(thresholds_hls_code)
-                f_thresh.close()
-
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
         mem_mode = self.get_nodeattr("mem_mode")
@@ -1025,19 +886,8 @@ def lut_estimation(self):
             np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1),
         )
         acc_luts = acc_bits
-        # thresholds and threshold comparators
-        thr_luts = 0
-        comp_luts = 0
-        noact = self.get_nodeattr("noActivation")
-        # TODO - add 'ram_style_threshold' node attribute
-        if noact == 0:
-            odt = self.get_output_datatype()
-            B = odt.bitwidth()
-            thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64
-            comp_luts = (2**B - 1) * acc_bits
-
         return int(
-            c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2
+            c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2
         )
 
 # TODO: fix estimations
@@ -1091,12 +941,6 @@ def get_op_and_param_counts(self):
         weight_param_type = "param_weight_%db" % (weight_bits)
         weight_count = k_h * k_w * fm
         ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
-        if self.get_nodeattr("noActivation") == 0:
-            tdt = DataType[self.get_nodeattr("accDataType")]
-            thres_bits = tdt.bitwidth()
-            thres_param_type = "param_threshold_%db" % (thres_bits)
-            thres_count = fm
-            ret_dict[thres_param_type] = thres_count
         return ret_dict
 
     def derive_characteristic_fxns(self, period):

From 0b31a88be11bac6e545cfda91224200b72b8d468 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 20 Nov 2023 14:43:58 +0000
Subject: [PATCH 195/235] [folding]: first attempt to extend folding
 transformation to parallelize multi-packed DSPs in MVU/VVU more efficiently

---
 .../fpgadataflow/set_folding.py               | 75 +++++++++++++++----
 1 file changed, 60 insertions(+), 15 deletions(-)

diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py
index eca1053f8f..871919f3f2 100644
--- a/src/finn/transformation/fpgadataflow/set_folding.py
+++ b/src/finn/transformation/fpgadataflow/set_folding.py
@@ -31,6 +31,7 @@
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.base import Transformation
 from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.core.datatype import DataType
 
 from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
 from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
@@ -80,11 +81,12 @@ class SetFolding(Transformation):
       unfolded before SIMD is increased
     """
 
-    def __init__(self, target_cycles_per_frame=1000, mvau_wwidth_max=36, two_pass_relaxation=True):
+    def __init__(self, target_cycles_per_frame=1000, mvau_wwidth_max=36, two_pass_relaxation=True, fpga_part=None):
         super().__init__()
         self.target_cycles_per_frame = target_cycles_per_frame
         self.mvau_wwidth_max = mvau_wwidth_max
         self.two_pass_relaxation = two_pass_relaxation
+        self.fpga_part = fpga_part
 
     def optimize_attribute_val(self, node_inst, max_val, attr_name):
         node_inst.set_nodeattr(attr_name, 1)
@@ -95,6 +97,10 @@ def optimize_attribute_val(self, node_inst, max_val, attr_name):
                 # finish if target met
                 break
 
+    def _is_versal(self, fpga_part):  # True iff the part name starts with a listed Versal prefix
+        assert fpga_part is not None, "Please specify a target board before setting the folding configuration for a more efficient folding configuration for RTL-based MVU/VVU"
+        return fpga_part.startswith(("xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm", "xqrvc"))
+
     def apply(self, model):
         graph = model.graph
         # these ops use PE parallelism, up to a max value of NumChannels
@@ -112,13 +118,14 @@ def apply(self, model):
         simd_ops = [
             "DownSampler",
             "FMPadding_Batch",
+            "FMPadding_Batch_rtl",
             "ConvolutionInputGenerator",
             "ConvolutionInputGenerator1D",
             "ConvolutionInputGenerator_rtl",
         ]
         # these ops are preceded by depthwise SWG and have special behavior,
         # as explained in the SetFolding docstring
-        depthwise_op_exceptions = ["VectorVectorActivation", "Pool_Batch"]
+        depthwise_op_exceptions = ["VectorVectorActivation", "VectorVectorActivation_rtl", "Pool_Batch"]
         for node in graph.node:
             if not is_fpgadataflow_node(node):
                 continue
@@ -148,6 +155,37 @@ def apply(self, model):
                         break
                 # increase PE until target met or reached max_pe
                 self.optimize_attribute_val(node_inst, max_pe, "PE")
+            elif op_type == "MatrixVectorActivation_rtl":
+                max_simd = node_inst.get_nodeattr("MW")
+                max_pe = node_inst.get_nodeattr("MH")
+                node_inst.set_nodeattr("PE", 1)
+                node_inst.set_nodeattr("SIMD", 1)
+                # Depending on the board and the layer's config, either the
+                # SIMD or PE folding dimension would be preferred to enable efficient DSP-packing
+                act_width = DataType[node_inst.get_nodeattr("inputDataType")].bitwidth()
+                weight_width = DataType[node_inst.get_nodeattr("weightDataType")].bitwidth()
+                is_versal = self._is_versal(self.fpga_part)
+                is_dsp48 = (act_width < 5 and weight_width < 5) or not is_versal
+                preferred_folding_dimension = "PE" if is_dsp48 else "SIMD"
+                preferred_folding_max = max_pe if is_dsp48 else max_simd
+                second_folding_dimension = "SIMD" if is_dsp48 else "PE"
+                second_folding_max = max_simd if is_dsp48 else max_pe
+                for fold_val in divisors(preferred_folding_max):
+                    prev_fold_val = node_inst.get_nodeattr(preferred_folding_dimension)
+                    node_inst.set_nodeattr(preferred_folding_dimension, fold_val)
+                    cyc = node_inst.get_exp_cycles()
+                    if cyc < self.target_cycles_per_frame:
+                        # finish if target met
+                        break
+                    if (
+                        node_inst.get_weight_datatype().bitwidth() * node_inst.get_nodeattr(preferred_folding_dimension)
+                        > self.mvau_wwidth_max
+                    ):
+                        # revert if we've gone above width threshold
+                        node_inst.set_nodeattr(preferred_folding_dimension, prev_fold_val)
+                        break
+                # increase the second folding dimension until target met or its max is reached
+                self.optimize_attribute_val(node_inst, second_folding_max, second_folding_dimension)
             elif op_type in pe_ops:
                 max_pe = node_inst.get_nodeattr("NumChannels")
                 self.optimize_attribute_val(node_inst, max_pe, "PE")
@@ -156,37 +194,44 @@ def apply(self, model):
                 self.optimize_attribute_val(node_inst, max_pe, "PE")
             elif op_type in depthwise_op_exceptions:
                 # init/reset SIMD of VVAU
-                if op_type == "VectorVectorActivation":
-                    node_inst.set_nodeattr("SIMD", 1)
+                is_hls_vvu_or_pool = op_type in ["VectorVectorActivation", "Pool_Batch"]
                 max_pe = node_inst.get_nodeattr("Channels")
-                self.optimize_attribute_val(node_inst, max_pe, "PE")
-                # increase SIMD for VVAU once PE is exhausted
-                pe = node_inst.get_nodeattr("PE")
+                max_simd = np.prod(node_inst.get_nodeattr("Kernel")) if op_type.startswith("VectorVectorActivation") else 0
+                preferred_folding_dimension = "PE" if is_hls_vvu_or_pool else "SIMD"
+                preferred_folding_max = max_pe if is_hls_vvu_or_pool else max_simd
+                second_folding_dimension = "SIMD" if is_hls_vvu_or_pool else "PE"
+                second_folding_max = max_simd if is_hls_vvu_or_pool else max_pe
+                if op_type.startswith("VectorVectorActivation"):
+                    node_inst.set_nodeattr(second_folding_dimension, 1)
+                self.optimize_attribute_val(node_inst, preferred_folding_max, preferred_folding_dimension)
+                # increase SIMD(/PE) for VVAU once PE(/SIMD) is exhausted
+                fold_val = node_inst.get_nodeattr(preferred_folding_dimension)
                 cyc = node_inst.get_exp_cycles()
                 if (
-                    op_type == "VectorVectorActivation"
-                    and pe == max_pe
+                    op_type.startswith("VectorVectorActivation")
+                    and fold_val == preferred_folding_max
                     and cyc > self.target_cycles_per_frame
                 ):
-                    max_simd = np.prod(node_inst.get_nodeattr("Kernel"))
-                    self.optimize_attribute_val(node_inst, max_simd, "SIMD")
-                # also set the folding of the upsteam DW SWU
+                    self.optimize_attribute_val(node_inst, second_folding_max, second_folding_dimension)
+                # also set the folding of the upstream DW SWU (in case of HLS-based VVU)
                 # which must be identical to this node
                 swu_node = model.find_producer(node.input[0])
                 if swu_node.op_type.startswith("ConvolutionInputGenerator"):
                     swu_node_inst = getCustomOp(swu_node)
-                    swu_node_inst.set_nodeattr("SIMD", pe)
                     # enable parallel_window mode of RTL SWG if needed
                     if swu_node.op_type == "ConvolutionInputGenerator_rtl":
                         if (
-                            op_type == "VectorVectorActivation"
+                            op_type.startswith("VectorVectorActivation")
                             and node_inst.get_nodeattr("SIMD") > 1
                         ):
                             swu_node_inst.set_nodeattr("parallel_window", 1)
+                            swu_node_inst.set_nodeattr("SIMD", max_pe)
                         else:
                             swu_node_inst.set_nodeattr("parallel_window", 0)
+                            pe = node_inst.get_nodeattr("PE")
+                            swu_node_inst.set_nodeattr("SIMD", pe)
                 else:
-                    if op_type == "VectorVectorActivation":
+                    if op_type.startswith("VectorVectorActivation"):
                         ksize = np.prod(node_inst.get_nodeattr("Kernel"))
                     elif op_type == "Pool_Batch":
                         ksize = node_inst.get_nodeattr("KernelSize")

From 92bc515255114f23fd889f3010924c07a1018fb1 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 20 Nov 2023 14:45:28 +0000
Subject: [PATCH 196/235] [to-rtl transformation]: extended with additional
 checker to ensure the HLS-based MVU/VVU does not have the activation function
 embedded

---
 .../transformation/fpgadataflow/specialize_to_rtl_layers.py     | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
index 5061282695..1bd83217ab 100644
--- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
+++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
@@ -79,6 +79,7 @@ def apply(self, model):
                     ram_style = getCustomOp(n).get_nodeattr("ram_style")
                     resType = getCustomOp(n).get_nodeattr("resType")
                     runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights")
+                    assert(getCustomOp(n).get_nodeattr("noActivation")==1), "Layer {} currently has thresholds embedded. Please implement the Thresholding layer standalone to enable the RTL-based MatrixVector unit".format(n.name)
 
                     new_node = helper.make_node(
                         "MatrixVectorActivation_rtl",
@@ -156,6 +157,7 @@ def apply(self, model):
                     runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights")
                     ram_style = getCustomOp(n).get_nodeattr("ram_style")
                     resType = getCustomOp(n).get_nodeattr("resType")                    
+                    assert(getCustomOp(n).get_nodeattr("noActivation")==1), "Layer {} currently has thresholds embedded. Please implement the Thresholding layer standalone to enable the RTL-based MatrixVector unit".format(n.name)
 
                     new_node = helper.make_node(
                         "VectorVectorActivation_rtl",

From 31914b1d243a597f68192b932403d8b247047056 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 27 Nov 2023 11:17:44 +0000
Subject: [PATCH 197/235] [build steps]: move specialize_to_rtl step to be
 applied after convert_to_hls step

---
 src/finn/builder/build_dataflow_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index 073bc9e12b..0d6911035c 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -116,12 +116,12 @@ class VerificationStepType(str, Enum):
     "step_tidy_up",
     "step_streamline",
     "step_convert_to_hls",
+    "step_specialize_to_rtl",
     "step_create_dataflow_partition",
     "step_target_fps_parallelization",
     "step_apply_folding_config",
     "step_minimize_bit_width",
     "step_generate_estimate_reports",
-    "step_specialize_to_rtl",
     "step_hls_codegen",
     "step_hls_ipgen",
     "step_set_fifo_depths",

From fa1d11624bffc717bd82dc52748c97f73d574ef2 Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu <yamanu@xilinx.com>
Date: Fri, 24 Nov 2023 15:17:48 +0000
Subject: [PATCH 198/235] [Test] fix data layout for golden/ret comparison in
 RTL MVU test

---
 .../test_fpgadataflow_vvau_rtl.py             | 129 ++++++++++--------
 1 file changed, 74 insertions(+), 55 deletions(-)

diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py
index 29132da90e..25fad308ee 100644
--- a/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py
+++ b/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py
@@ -27,41 +27,46 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
-import os
 
 import numpy as np
+import os
+import pickle
 from onnx import TensorProto, helper
-from qonnx.util.basic import (
-    qonnx_make_model,
-    gen_finn_dt_tensor
-)
-from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.core.datatype import DataType
-from qonnx.transformation.general import GiveUniqueNodeNames
-import finn.core.onnx_exec as oxe
-import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-from qonnx.transformation.general import ApplyConfig
-import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
+from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.im2col import compute_conv_output_dim
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import (
+    ApplyConfig,
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+)
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
-from qonnx.transformation.general import GiveReadableTensorNames
-import pickle
-from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
-from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
+
+import finn.core.onnx_exec as oxe
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
-from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition
-from qonnx.custom_op.registry import getCustomOp
-from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth
-#import qonnx.core.data_layout as DataLayout
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.minimize_accumulator_width import (
+    MinimizeAccumulatorWidth,
+)
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
+
+# import qonnx.core.data_layout as DataLayout
 
 build_dir = os.environ["FINN_BUILD_DIR"]
 
+
 def make_single_dw_conv_modelwrapper(conv_config, idt, wdt):
     kernel_size, in_feature_dim, in_chn = conv_config
     stride = 1
@@ -69,7 +74,7 @@ def make_single_dw_conv_modelwrapper(conv_config, idt, wdt):
 
     out_feature_dim = compute_conv_output_dim(in_feature_dim, kernel_size, stride, pad)
     group = out_chn = in_chn
-    
+
     conv_param_shape = [out_chn, 1, kernel_size, kernel_size]
     input_shape = [1, in_chn, in_feature_dim, in_feature_dim]
     output_shape = [1, out_chn, out_feature_dim, out_feature_dim]
@@ -105,14 +110,16 @@ def make_single_dw_conv_modelwrapper(conv_config, idt, wdt):
 
     return model
 
+
 def prepare_inputs(input_tensor):
     return {"global_in": input_tensor}
 
+
 @pytest.mark.parametrize("kernel_size", [3])
 @pytest.mark.parametrize("in_feature_dim", [5])
 @pytest.mark.parametrize("in_chn", [4])
 @pytest.mark.parametrize("idt", [DataType["INT8"]])
-#@pytest.mark.parametrize("idt", [DataType["UINT8"]])
+# @pytest.mark.parametrize("idt", [DataType["UINT8"]])
 @pytest.mark.parametrize("wdt", [DataType["INT6"]])
 @pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"])
 @pytest.mark.parametrize("segmentlen", [1])
@@ -121,19 +128,23 @@ def prepare_inputs(input_tensor):
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, part, segmentlen, pe, simd):
+def test_fpgadataflow_vvau_rtl(
+    kernel_size, in_feature_dim, in_chn, idt, wdt, part, segmentlen, pe, simd
+):
     # Create depthwise-separable convolution
     conv_config = (kernel_size, in_feature_dim, in_chn)
     model = make_single_dw_conv_modelwrapper(conv_config, idt, wdt)
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(GiveReadableTensorNames())
-    model.save(build_dir+"/dw_conv.onnx")
+    model.save(build_dir + "/dw_conv.onnx")
 
     # Obtain golden reference output
-    golden_in = gen_finn_dt_tensor(model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in"))
+    golden_in = gen_finn_dt_tensor(
+        model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in")
+    )
     input_dict = prepare_inputs(golden_in)
     golden_out = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)
-    with open(build_dir+"/onnx_dws_conv.pkl", "wb") as f:
+    with open(build_dir + "/onnx_dws_conv.pkl", "wb") as f:
         pickle.dump(golden_out, f)
 
     # Convert to HLS custom-op first
@@ -143,26 +154,23 @@ def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, pa
     model = model.transform(MinimizeAccumulatorWidth())
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(GiveReadableTensorNames())
-    model.save(build_dir+"/hls_vvau.onnx")
+    model.save(build_dir + "/hls_vvau.onnx")
 
     # Apply folding (i.e. specify to use DSPs)
     folding_config = {
         "Defaults": {},
-        "ConvolutionInputGenerator_rtl_0": {
-            "SIMD" : 4,
-            "parallel_window" : 1
-        },
+        "ConvolutionInputGenerator_rtl_0": {"SIMD": 4, "parallel_window": 1},
         "VectorVectorActivation_0": {
-            "PE" : pe,
-            "SIMD" : simd,
-            "mem_mode" : "decoupled",
-            "ram_style" : "auto",
-            "resType" : "dsp",
-            "preferred_backend" : "rtl"
-        }
+            "PE": pe,
+            "SIMD": simd,
+            "mem_mode": "decoupled",
+            "ram_style": "auto",
+            "resType": "dsp",
+            "preferred_backend": "rtl",
+        },
     }
     model = model.transform(ApplyConfig(folding_config))
-    model.save(build_dir+"/hls_vvau_folded.onnx")
+    model.save(build_dir + "/hls_vvau_folded.onnx")
 
     # Obtain second reference from HLS-based VVAU layer
     model = model.transform(SetExecMode("rtlsim"))
@@ -170,28 +178,30 @@ def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, pa
     model = model.transform(HLSSynthIP())
     model = model.transform(PrepareRTLSim())
     conv_hls_out = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)
-    with open(build_dir+"/hls_vvau_folded_output.pkl", "wb") as f:
+    with open(build_dir + "/hls_vvau_folded_output.pkl", "wb") as f:
         pickle.dump(conv_hls_out, f)
 
     # Stitched-IP RTLsim
     model = model.transform(CreateDataflowPartition(partition_model_dir=build_dir))
-    model.save(build_dir+"/ip-stitched.onnx")
-    partition_model_path = getCustomOp(model.get_nodes_by_op_type("StreamingDataflowPartition")[0]).get_nodeattr("model")
+    model.save(build_dir + "/ip-stitched.onnx")
+    partition_model_path = getCustomOp(
+        model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    ).get_nodeattr("model")
     partitioned_model = ModelWrapper(partition_model_path)
     # FIFOs needed for stitched-ip RTLsim, DWC needed for VVU operating on SIMD parallelism
     partitioned_model = partitioned_model.transform(InsertAndSetFIFODepths(part, 5))
     partitioned_model = partitioned_model.transform(PrepareIP(part, 5))
     partitioned_model = partitioned_model.transform(HLSSynthIP())
-    partitioned_model.save(build_dir+"/partitioned_model.onnx")
+    partitioned_model.save(build_dir + "/partitioned_model.onnx")
     partitioned_model = partitioned_model.transform(CreateStitchedIP(part, 5))
     partitioned_model.save(partition_model_path)
-    partitioned_model.set_metadata_prop("rtlsim_trace", build_dir+"/hls-vvu.vcd")
+    partitioned_model.set_metadata_prop("rtlsim_trace", build_dir + "/hls-vvu.vcd")
     # set top-level prop for stitched-ip rtlsim and launch
     partitioned_model.set_metadata_prop("exec_mode", "rtlsim")
     # transpose input since we're now simulating HW layers (NCHW --> NHWC)
-    input_dict["global_in"] = np.transpose(input_dict["global_in"], (0,2,3,1))
+    input_dict["global_in"] = np.transpose(input_dict["global_in"], (0, 2, 3, 1))
     stitched_ip_out = oxe.execute_onnx(partitioned_model, input_dict, return_full_exec_context=True)
-    with open(build_dir+"/stitched_ip_output.pkl", "wb") as f:
+    with open(build_dir + "/stitched_ip_output.pkl", "wb") as f:
         pickle.dump(stitched_ip_out, f)
 
     # Apply convert-to-rtl step
@@ -201,15 +211,24 @@ def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, pa
     partitioned_model = partitioned_model.transform(PrepareIP(part, 5))
     partitioned_model = partitioned_model.transform(HLSSynthIP())
     partitioned_model = partitioned_model.transform(CreateStitchedIP(part, 5))
-    partitioned_model.save(build_dir+"/partition_rtl_vvau.onnx")
-    partitioned_model.set_metadata_prop("rtlsim_trace", build_dir+"/rtl-vvu.vcd")
+    partitioned_model.save(build_dir + "/partition_rtl_vvau.onnx")
+    partitioned_model.set_metadata_prop("rtlsim_trace", build_dir + "/rtl-vvu.vcd")
     # Reset rtlsim_so path to re-generate Pyverilator sim object
     partitioned_model.set_metadata_prop("rtlsim_so", "")
     # set top-level prop for stitched-ip rtlsim and launch
     partitioned_model.set_metadata_prop("exec_mode", "rtlsim")
     vvu_rtl_out = oxe.execute_onnx(partitioned_model, input_dict, return_full_exec_context=True)
-    with open(build_dir+"/rtl_vvau_output.pkl", "wb") as f:
+    with open(build_dir + "/rtl_vvau_output.pkl", "wb") as f:
         pickle.dump(vvu_rtl_out, f)
-    
-    assert (vvu_rtl_out["global_out"] == golden_out["global_out"]).all(), "Output of ONNX model not matching output of stitched-IP RTL model!"
-    assert (vvu_rtl_out["global_out"] == stitched_ip_out["global_out"]).all(), "Output of stitched-IP HLS model not matching output of stitched-IP RTL model!"
+
+    golden_ret = golden_out["global_out"]
+    # transpose hardware-generated outputs NHWC -> NCHW to be comparable
+    vvu_rtl_ret = vvu_rtl_out["global_out"].transpose(0, 3, 1, 2)
+    hls_ret = stitched_ip_out["global_out"].transpose(0, 3, 1, 2)
+
+    assert (
+        vvu_rtl_ret == golden_ret
+    ).all(), "Output of ONNX model not matching output of stitched-IP RTL model!"
+    assert (
+        vvu_rtl_ret == hls_ret
+    ).all(), "Output of stitched-IP HLS model not matching output of stitched-IP RTL model!"

From becaac706358c724c27717890b9cf6e0e1bbcef1 Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu <yamanu@xilinx.com>
Date: Fri, 24 Nov 2023 09:16:33 +0000
Subject: [PATCH 199/235] [RTLCustomOp] IP packaging fixes for pDWC+VVU, fix
 linting too

---
 ...datawidthconverter_parallelwindow_batch.py | 131 +++++++++++++++++-
 .../vectorvectoractivation_rtl.py             |  77 +++++-----
 2 files changed, 166 insertions(+), 42 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_parallelwindow_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_parallelwindow_batch.py
index 6a72f17555..2fe7ae8e54 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_parallelwindow_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_parallelwindow_batch.py
@@ -26,22 +26,27 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import math
 import numpy as np
 import os
 import warnings
 from qonnx.core.datatype import DataType
 
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
+
 # does not do anything at the ONNX node-by-node level, and input-output
 # tensor shapes are the same. performs data width conversion at the rtlsim level
 
 
 class StreamingDataWidthConverter_ParallelWindow_Batch(HLSCustomOp):
     """Class that corresponds to finn-hlslib StreamingDataWidthConverter_ParallelWindow_Batch
-    function. To be inserted between an RTL-SWG with parallel window mode enabled and a 
+    function. To be inserted between an RTL-SWG with parallel window mode enabled and a
     VVU."""
 
     def get_nodeattr_types(self):
@@ -57,6 +62,9 @@ def get_nodeattr_types(self):
             "PE": ("i", True, 0),
             "Channels": ("i", True, 0),
             "Kernel": ("ints", True, []),
+            "Mode": ("s", False, ""),
+            # attribute to save top module name - not user configurable
+            "gen_top_module": ("s", False, ""),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -373,8 +381,89 @@ def execute_node(self, context, graph):
         ), """Output
         shape doesn't match expected shape, should be same as input shape"""
 
+    def prepare_codegen_default(self):
+        template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/dwc/dwc_axi_wrapper.v"
+
+        code_gen_dict = {}
+        code_gen_dict["$IN_WIDTH$"] = [str(self.get_nodeattr("inWidth"))]
+        code_gen_dict["$OUT_WIDTH$"] = [str(self.get_nodeattr("outWidth"))]
+        code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))]
+        code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))]
+        code_gen_dict["$CHANNELS$"] = [str(self.get_nodeattr("Channels"))]
+        code_gen_dict["$KERNEL_PROD$"] = [str(np.prod(self.get_nodeattr("Kernel")))]
+        code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())]
+        code_gen_dict["$MODE$"] = [self.get_nodeattr("Mode")]
+
+        return template_path, code_gen_dict
+
+    def generate_hdl(self):
+        """Write the templated AXI wrapper and the DWC RTL sources into code_gen_dir_ipgen."""
+        import shutil  # function-scope import: this module has no top-level shutil import
+
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        template_path, code_gen_dict = self.prepare_codegen_default()
+        # add general parameters to dictionary
+        code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()]
+        # save top module name so we can refer to it after this node has been renamed
+        # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
+        self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
+
+        # apply code generation to template
+        with open(template_path, "r") as f:
+            template_wrapper = f.read()
+        for key in code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(code_gen_dict[key])
+            template_wrapper = template_wrapper.replace(key, code_gen_line)
+        with open(
+            os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), "w"
+        ) as f:
+            f.write(template_wrapper)
+
+        for sv_file in ("dwc_parallelwindow.sv", "dwc_upsample.sv"):
+            shutil.copy2(os.environ["FINN_ROOT"] + "/finn-rtllib/dwc/" + sv_file, code_gen_dir)
+
+        # set ipgen_path and ip_path so that HLS-Synth transformation
+        # and stitch_ip transformation do not complain
+        self.set_nodeattr("ipgen_path", code_gen_dir)
+        self.set_nodeattr("ip_path", code_gen_dir)
+
+    def get_all_verilog_paths(self):
+        "Return list of all folders containing Verilog code for this node."
+
+        rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/dwc/")
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        return [rtllib_dir, code_gen_dir]
+
     def code_generation_ipi(self):
-        return super().code_generation_ipi()
+        """Constructs and returns the TCL for node instantiation in Vivado IPI."""
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+
+        rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/dwc/")
+        source_files = [
+            os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"),
+            rtllib_dir + "dwc_parallelwindow.sv",
+            rtllib_dir + "dwc_upsample.sv",
+        ]
+
+        source_target = "./ip/verilog/rtl_ops/%s" % self.onnx_node.name
+        cmd = ["file mkdir %s" % source_target]
+
+        for f in source_files:
+            cmd.append("add_files -copy_to %s -norecurse %s" % (source_target, f))
+        cmd += [
+            "create_bd_cell -type module -reference %s %s"
+            % (self.get_nodeattr("gen_top_module"), self.onnx_node.name)
+        ]
+
+        return cmd
+
+    def hls_sname(self):
+        """Get the naming convention used by Vitis HLS for stream signals
+        Example: the TDATA for a stream called "out" would be out_V_TDATA.
+        """
+        # no additional prefix/suffix in interface names since this is an RTL component
+        return ""
 
     def lut_estimation(self):
         """Calculates resource estimations for LUTs"""
@@ -387,4 +476,38 @@ def code_generation_ipgen(self, model, fpgapart, clk):
         super().code_generation_ipgen(model, fpgapart, clk)
 
     def ipgen_singlenode_code(self):
-        super().ipgen_singlenode_code()
+        pass
+
+    def code_generation_cppsim(self, model):
+        """Normally: Generates C++ code for simulation (cppsim)."""
+        pass
+
+    def compile_singlenode_code(self):
+        pass
+
+    def global_includes(self):
+        pass
+
+    def defines(self, var):
+        pass
+
+    def read_npy_data(self):
+        pass
+
+    def strm_decl(self):
+        pass
+
+    def docompute(self):
+        pass
+
+    def dataoutstrm(self):
+        pass
+
+    def save_as_npy(self):
+        pass
+
+    def blackboxfunction(self):
+        pass
+
+    def pragmas(self):
+        pass
diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py
index e3f0abb6c5..8277f36b21 100644
--- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py
@@ -39,13 +39,13 @@
 )
 
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
     numpy_to_hls_code,
     pack_innermost_dim_as_hex_string,
     rtlsim_output_to_npy,
 )
-from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
 
 try:
     from pyverilator import PyVerilator
@@ -101,7 +101,7 @@ def get_nodeattr_types(self):
                 {"auto", "block", "distributed", "ultra"},
             ),
             # attribute to save top module name - not user configurable
-            "gen_top_module": ("s", False, "")
+            "gen_top_module": ("s", False, ""),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -114,10 +114,6 @@ def minimize_accumulator_width(self, model):
         fm = self.get_nodeattr("Channels")
         # put weights into the shape expected by calculate_matvec_accumulator_range
         weights = weights.reshape(fm, k_h * k_w).transpose()
-        if len(self.onnx_node.input) > 2:
-            thresholds = model.get_initializer(self.onnx_node.input[2])
-        else:
-            thresholds = None
         idt = self.get_input_datatype()
 
         (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt)
@@ -289,7 +285,7 @@ def get_number_output_values(self):
         nf = np.prod(self.get_folded_output_shape()[:-1])
         return nf
 
-# TODO: fix exp_cycles estimations --> depends on fpga_part and clk
+    # TODO: fix exp_cycles estimations --> depends on fpga_part and clk
     def get_exp_cycles(self):
         pe = self.get_nodeattr("PE")
         simd = self.get_nodeattr("SIMD")
@@ -451,7 +447,8 @@ def generate_params(self, model, path):
             if mem_mode == "decoupled":
                 # also save weights as Verilog .dat file
                 # This file will be ignored when synthesizing UltraScale memory.
-                weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir)
+                weight_filename_rtl = self.get_decoupled_weight_filename(abspath=False)
+                weight_filename_rtl = code_gen_dir + "/" + weight_filename_rtl
                 self.make_weight_file(weights, "decoupled_verilog_dat", weight_filename_rtl)
         else:
             raise Exception(
@@ -622,7 +619,8 @@ def get_verilog_top_module_intf_names(self):
         return intf_names
 
     def code_generation_ipi(self):
-        cmd = []
+        source_target = "./ip/verilog/rtl_ops/%s" % self.onnx_node.name
+        cmd = ["file mkdir %s" % source_target]
         # add streamer if needed
         mem_mode = self.get_nodeattr("mem_mode")
         if mem_mode == "decoupled":
@@ -653,9 +651,7 @@ def code_generation_ipi(self):
             code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
             rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
             sourcefiles = [
-                os.path.join(
-                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
-                ),
+                os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"),
                 rtllib_dir + "mvu_vvu_axi.sv",
                 rtllib_dir + "replay_buffer.sv",
                 rtllib_dir + "mvu_4sx4u.sv",
@@ -664,7 +660,7 @@ def code_generation_ipi(self):
                 rtllib_dir + "mvu_vvu_lut.sv",
             ]
             for f in sourcefiles:
-                cmd.append("add_files -norecurse %s" % (f))
+                cmd += ["add_files -copy_to %s -norecurse %s" % (source_target, f)]
             cmd.append(
                 "create_bd_cell -type hier -reference %s /%s/%s"
                 % (
@@ -690,7 +686,7 @@ def code_generation_ipi(self):
                 % (
                     self.calc_wmem(),
                     self.get_weightstream_width_padded(),
-                    self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat",
+                    self.get_decoupled_weight_filename(abspath=False),
                     self.get_nodeattr("ram_style"),
                     node_name,
                     strm_inst,
@@ -747,9 +743,7 @@ def code_generation_ipi(self):
             code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
             rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
             sourcefiles = [
-                os.path.join(
-                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
-                ),
+                os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"),
                 rtllib_dir + "mvu_vvu_axi.sv",
                 rtllib_dir + "replay_buffer.sv",
                 rtllib_dir + "mvu_4sx4u.sv",
@@ -836,7 +830,7 @@ def bram_efficiency_estimation(self):
         bram16_est_capacity = bram16_est * 36 * 512
         return wbits / bram16_est_capacity
 
-# TODO: fix estimations
+    # TODO: fix estimations
     def lut_estimation(self):
         """Calculates resource estimations for LUTs based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -886,11 +880,9 @@ def lut_estimation(self):
             np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1),
         )
         acc_luts = acc_bits
-        return int(
-            c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2
-        )
+        return int(c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2)
 
-# TODO: fix estimations
+    # TODO: fix estimations
     def dsp_estimation(self):
         # multiplication
         P = self.get_nodeattr("PE")
@@ -965,9 +957,7 @@ def generate_hdl(self, model, fpgapart, clk):
 
         template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk)
         # add general parameters to dictionary
-        code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [
-            self.get_verilog_top_module_name()
-        ]
+        code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()]
         # save top module name so we can refer to it after this node has been renamed
         # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
         self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
@@ -980,16 +970,12 @@ def generate_hdl(self, model, fpgapart, clk):
             code_gen_line = "\n".join(code_gen_dict[key])
             template_wrapper = template_wrapper.replace(key, code_gen_line)
         with open(
-            os.path.join(
-                code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
-            ),
+            os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"),
             "w",
         ) as f:
             f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(0)))
         with open(
-            os.path.join(
-                code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"
-            ),
+            os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"),
             "w",
         ) as f:
             f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(1)))
@@ -1019,9 +1005,10 @@ def _resolve_impl_style(self, fpgapart):
                 fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
                 or fpgapart[0:5] == "xqrvc"
             )
-            assert (is_dsp_targeted and is_versal), "DSP-based (RTL) VVU currently only supported on Versal (DSP58) devices"
+            assert (
+                is_dsp_targeted and is_versal
+            ), "DSP-based (RTL) VVU currently only supported on Versal (DSP58) devices"
             return "mvu_vvu_8sx9_dsp58"
-                
 
     def prepare_codegen_default(self, fpgapart, clk):
         template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v"
@@ -1034,16 +1021,14 @@ def prepare_codegen_default(self, fpgapart, clk):
         code_gen_dict["$MH$"] = [str(self.get_nodeattr("Channels"))]
         code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))]
         code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))]
-        code_gen_dict["$ACTIVATION_WIDTH$"] = [
-            str(self.get_input_datatype(0).bitwidth())
-        ]
+        code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())]
         code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())]
         code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())]
         code_gen_dict["$SIGNED_ACTIVATIONS$"] = (
             [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
         )
         code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))]
-        #code_gen_dict["$FORCE_BEHAVIORAL$"] = [str(0)]
+        # code_gen_dict["$FORCE_BEHAVIORAL$"] = [str(0)]
 
         return template_path, code_gen_dict
 
@@ -1071,4 +1056,20 @@ def prepare_rtlsim(self):
         # save generated lib filename in attribute
         self.set_nodeattr("rtlsim_so", sim.lib._name)
 
-        return sim
\ No newline at end of file
+        return sim
+
+    def get_all_verilog_paths(self):
+        "Return list of all folders containing Verilog code for this node."
+
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        # Path to (System-)Verilog files used by top-module & path to top-module
+        verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"]
+        return verilog_paths
+
+    def get_verilog_top_filename(self):
+        "Return the Verilog top module filename for this node."
+
+        verilog_file = "{}/{}_wrapper.v".format(
+            self.get_nodeattr("code_gen_dir_ipgen"), self.get_nodeattr("gen_top_module")
+        )
+        return verilog_file

From cf7f4946dc44f264de665e8a23893bd858277796 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 1 Nov 2023 15:20:07 +0000
Subject: [PATCH 200/235] [mvu vvu axi]: minor bugfixes to enable VVU

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index f0f75c633a..ddedec1e8a 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -64,7 +64,7 @@ module mvu_vvu_axi #(
 	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
 	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
 	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE),
+	localparam int unsigned SF = MW/SIMD,
 	localparam int unsigned NF = IS_MVU ? MH/PE : 1,
 	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
 )
@@ -91,11 +91,11 @@ module mvu_vvu_axi #(
 
 //-------------------- Parameter sanity checks --------------------\\
 	initial begin
-		if ((MW % SIMD != 0 && IS_MVU) || (MW % (SIMD*PE) != 0 && !IS_MVU)) begin
+		if (MW % SIMD != 0) begin
 			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
 			$finish;
 		end
-		if (MH % PE != 0 && IS_MVU) begin
+		if (MH % PE != 0) begin
 			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
 			$finish;
 		end
@@ -152,13 +152,10 @@ module mvu_vvu_axi #(
 //-------------------- Core MVU/VVU --------------------\\
 	uwire ovld;
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
-	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
 	uwire mvauin_t amvau_i;
-	uwire mvauin_weight_t wmvau_i;
 
 	if (IS_MVU) begin : genMVUInput
 		assign  amvau_i = amvau;
-		assign  wmvau_i = s_axis_weights_tdata;
 	end : genMVUInput
 	else begin : genVVUInput
 		// The input stream will have the channels interleaved for VVU when PE>1
@@ -169,11 +166,8 @@ module mvu_vvu_axi #(
 		localparam int num_of_elements = PE*SIMD;
 		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
 			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
-									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
+									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH -1: (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
 									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
-			assign  wmvau_i[i*WEIGHT_WIDTH +: WEIGHT_WIDTH] = (PE > 1) ? 
-									s_axis_weights_tdata[( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD + 1) * WEIGHT_WIDTH : ( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD ) * WEIGHT_WIDTH]
-									: s_axis_weights_tdata[i*WEIGHT_WIDTH +: WEIGHT_WIDTH];
 		end : genRewire
 	end : genVVUInput
 
@@ -183,7 +177,7 @@ module mvu_vvu_axi #(
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(wmvau_i), .a(amvau_i),
+			.last(alast && avld), .zero(!istb), .w(s_axis_weights_tdata), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
 	"mvu_4sx4u":

From 5ffc221eaa07828001e423551ad05f8207178656 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 20 Nov 2023 14:35:45 +0000
Subject: [PATCH 201/235] [mvu vvu axi]: minor fix -- define mvauin_weight_t

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 1 +
 1 file changed, 1 insertion(+)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index ddedec1e8a..8eb92a93e6 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -153,6 +153,7 @@ module mvu_vvu_axi #(
 	uwire ovld;
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
 	uwire mvauin_t amvau_i;
+	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
 
 	if (IS_MVU) begin : genMVUInput
 		assign  amvau_i = amvau;

From 40d652ccb817295e5668ed765f8e348346584465 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 29 Nov 2023 14:02:33 +0000
Subject: [PATCH 202/235] [rtl mvu op]: minor fix to chain length estimation
 and enabled behavioral mode for rtl sim

---
 .../custom_op/fpgadataflow/matrixvectoractivation_rtl.py   | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index d0a638475a..da560d73fd 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -966,10 +966,12 @@ def _resolve_segment_len(self, clk):
         # Insert pipeline registers in the DSP58 chain to meet target clock frequency
         # 0.741 ns seems the worst-case delay through first DSP
         # 0.605 ns seems to be (on average) delay for all subsequent DSPs
-        critical_path_dsps = np.floor((clk - 0.741) / 0.605)
+        # clk >= (critical_path_dsps - 1) * 0.605 + 0.741
+        assert (clk > 0.741), "Infeasible clk target of {} ns has been set, consider lowering the targeted clock frequency!".format(clk)
+        critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1)
         max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3)
         dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len
-        return max(1, dsp_chain_len)
+        return dsp_chain_len
 
     def _resolve_impl_style(self, fpgapart):
         # Based on target device and activation/weight-width, choose the
@@ -1051,7 +1053,6 @@ def prepare_codegen_default(self, fpgapart, clk):
             [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
         )
         code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))]
-        code_gen_dict["$FORCE_BEHAVIORAL$"] = [str(0)]
 
         return template_path, code_gen_dict
 

From 3a1d9d26c93451a7d8ec2c63b0832234fd10a598 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 29 Nov 2023 17:42:58 +0000
Subject: [PATCH 203/235] [mvu vvu axi]: minor changes to enable double-pumped
 DSPs for uneven SIMD

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 47 +++++++++++++---------------------
 1 file changed, 18 insertions(+), 29 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 0294d2ce88..98fd522306 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -63,13 +63,12 @@ module mvu_vvu_axi #(
 	bit M_REG_LUT = 1,
 
 	// Safely deducible parameters
-	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
-	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
-	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned SF = MW/SIMD,
-	localparam int unsigned NF = IS_MVU ? MH/PE : 1,
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	localparam int unsigned  INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
+	localparam int unsigned  WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
+	localparam int unsigned  INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
+	localparam int unsigned	 OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8,
+	localparam bit           SIMD_UNEVEN = SIMD % 2
 )
 (
 	// Global Control
@@ -129,17 +128,11 @@ module mvu_vvu_axi #(
 			end
 		end
 
-		//- Pumping Constraints ---------
-		if(PUMPED_COMPUTE) begin
-			if(SIMD % 2 != 0) begin
-				$error("Odd SIMD=%0d is incompatible with pumped compute.", SIMD);
-				$finish;
-			end
-		end
 	end
 
-	uwire clk = ap_clk;
-	uwire rst = !ap_rst_n;
+	uwire  clk = ap_clk;
+	uwire  clk2x = ap_clk2x;
+	uwire  rst = !ap_rst_n;
 
 	//- Replay to Accommodate Neuron Fold -----------------------------------
 	typedef logic [(IS_MVU? 1:PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0]  mvu_flatin_t;
@@ -175,17 +168,11 @@ module mvu_vvu_axi #(
 		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
 		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
 		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i)
-		for(genvar  pe = 0; pe < (IS_MVU? 1:PE); pe++) begin
+		for(genvar  pe = 0; pe < ACT_PE; pe++) begin
 			for(genvar  simd = 0; simd < SIMD; simd++) begin
-				assign	amvau_i[pe][simd] = amvau[];	// TODO: Do the right thing as below here.
+				assign	amvau_i[pe][simd] = amvau[simd*ACT_PE+pe];
 			end
 		end
-
-		localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH;
-		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
-			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] =
-									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH];
-		end : genRewire
 	end : genVVUInput
 
 	//- Flow Control Bracket around Compute Core ----------------------------
@@ -199,7 +186,8 @@ module mvu_vvu_axi #(
 	uwire  ovld;
 	uwire dsp_p_t  odat;
 	if(1) begin : blkDsp
-		localparam int unsigned  DSP_SIMD = SIMD/(PUMPED_COMPUTE+1);
+		localparam int unsigned  EFFECTIVE_SIMD = SIMD_UNEVEN && PUMPED_COMPUTE ? SIMD+1 : SIMD; 
+		localparam int unsigned  DSP_SIMD = EFFECTIVE_SIMD/(PUMPED_COMPUTE+1);
 		typedef logic [PE    -1:0][DSP_SIMD-1:0][WEIGHT_WIDTH    -1:0]  dsp_w_t;
 		typedef logic [ACT_PE-1:0][DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0]  dsp_a_t;
 
@@ -242,6 +230,7 @@ module mvu_vvu_axi #(
 			logic  Zero = 1;
 			dsp_w_t  W[1:0] = '{ default: 'x };
 			dsp_a_t  A[1:0] = '{ default: 'x };
+
 			always_ff @(posedge clk2x) begin
 				if(rst) begin
 					En   <= 0;
@@ -256,12 +245,12 @@ module mvu_vvu_axi #(
 						if(en) begin
 							Last <= '{ alast && avld, 1'b0 };
 							Zero <= !istb;
-							for(int unsigned  simd = 0; simd < SIMD; simd++) begin
+							for(int unsigned  simd = 0; simd < EFFECTIVE_SIMD; simd++) begin
 								for(int unsigned  pe = 0; pe < PE; pe++) begin
-									W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= mvu_w[pe][simd];
+									W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= (simd==EFFECTIVE_SIMD-1 && SIMD_UNEVEN) ? '0 : mvu_w[pe][simd];
 								end
 								for(int unsigned  pe = 0; pe < ACT_PE; pe++) begin
-									A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= amvau_i[pe][simd];
+									A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= (simd==EFFECTIVE_SIMD-1 && SIMD_UNEVEN) ? '0 : amvau_i[pe][simd];
 								end
 							end
 						end
@@ -293,7 +282,7 @@ module mvu_vvu_axi #(
 				end
 				else begin
 					if(dsp_vld)  P <= dsp_p;
-					Vld <= dsp_vld || (Vld && !Active);
+					Vld <= dsp_vld || (Vld && Active);
 				end
 			end
 			assign	ovld = Vld;

From 493bcfe26c55912efdb37ecb39412714f5a30235 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 29 Nov 2023 17:48:37 +0000
Subject: [PATCH 204/235] [axi wrapper]: add port for double-clock

---
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
index 01deb23840..11949dec24 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -34,6 +34,7 @@
 module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter	IS_MVU = $IS_MVU$,
 	parameter	COMPUTE_CORE = "$COMPUTE_CORE$",
+	parameter	PUMPED_COMPUTE = $PUMPED_COMPUTE$,
 	parameter	MW = $MW$,
 	parameter	MH = $MH$,
 	parameter	PE = $PE$,
@@ -54,6 +55,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
 	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
 	input	ap_clk,
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_RESET ap_rst_n" *)
+	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk2x CLK" *)
+	input   ap_clk2x,
 	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
 	input	ap_rst_n,
 
@@ -72,11 +76,12 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 );
 
 mvu_vvu_axi #(
-	.IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD),
+	.IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD),
 	.ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
 	.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
 	) inst (
 	.ap_clk(ap_clk),
+	.ap_clk2x(ap_clk2x),
 	.ap_rst_n(ap_rst_n),
 	.s_axis_weights_tdata(weights_V_TDATA),
 	.s_axis_weights_tvalid(weights_V_TVALID),

From 58f191e9cb0cf158db4a6dbc7b100cc0507d6ee6 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 1 Dec 2023 15:05:20 +0000
Subject: [PATCH 205/235] [builder]: add flag for enabling pumped compute

---
 src/finn/builder/build_dataflow_config.py | 3 +++
 src/finn/builder/build_dataflow_steps.py  | 6 ++++++
 2 files changed, 9 insertions(+)

diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index 0d6911035c..af1ce12dc0 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -230,6 +230,9 @@ class DataflowBuildConfig:
     #: very high performance.
     mvau_wwidth_max: Optional[int] = 36
 
+    #: (Optional) Double-pump DSP58s in MVU/VVU layers if possible
+    enable_pumped_compute: Optional[bool] = False
+
     #: (Optional) Whether thresholding layers (which implement quantized
     #: activations in FINN) will be implemented as stand-alone HLS layers,
     #: instead of being part of MatrixVectorActivation layer. This gives larger
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 83278aae41..7af3f4c3ab 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -480,6 +480,12 @@ def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig):
     specialize_to_rtl_transforms = [to_rtl.InferRTLMatrixVectorActivation(), to_rtl.InferRTLVectorVectorActivation()]
     for trn in specialize_to_rtl_transforms:
         model = model.transform(trn)
+    
+    # If double-pumping is enabled (DataflowBuildConfig.enable_pumped_compute), annotate MVU/VVU layers
+    if cfg.enable_pumped_compute:
+        for n in model.graph.node:
+            if n.op_type in ["MatrixVectorActivation_rtl", "VectorVectorActivation_rtl"]:
+                getCustomOp(n).set_nodeattr("pumpedCompute", 1)
     return model
 
 

From f435aed5b2a5ffc5ca03dd65d8d45a3a3c6bb2aa Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 1 Dec 2023 15:06:50 +0000
Subject: [PATCH 206/235] [hls custom op]: add clk2x interface

---
 src/finn/custom_op/fpgadataflow/hlscustomop.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index 4fed8ed4b5..c0b9f0735f 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -142,6 +142,7 @@ def get_verilog_top_module_intf_names(self):
         Each block must have at most one aximm and one axilite."""
         intf_names = {}
         intf_names["clk"] = ["ap_clk"]
+        intf_names["clk2x"] = ["ap_clk2x"]
         intf_names["rst"] = ["ap_rst_n"]
         sname = self.hls_sname()
         intf_names["s_axis"] = [("in0_" + sname, self.get_instream_width_padded())]

From 4a8ff5924868be72fdfcd64652181214f07ad388 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 1 Dec 2023 15:07:30 +0000
Subject: [PATCH 207/235] [mvu rtl]: add pumped compute attribute and fill out
 template accordingly

---
 .../fpgadataflow/matrixvectoractivation_rtl.py         | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index da560d73fd..a66c6f4b2f 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -73,6 +73,7 @@ def get_nodeattr_types(self):
             "MW": ("i", True, 0),
             "MH": ("i", True, 0),
             "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}),
+            "pumpedCompute": ("i", False, 0, {0, 1}),
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
             "weightDataType": ("s", True, ""),
@@ -779,6 +780,9 @@ def code_generation_ipi(self):
             din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
             cmd.append("create_bd_cell -type hier %s" % node_name)
             cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
+            if self.get_nodeattr("pumpedCompute"):
+                clk2x_name = self.get_verilog_top_module_intf_names()["clk2x"][0]
+                cmd.append("create_bd_pin -dir I -type clk2x /%s/%s" % (node_name, clk2x_name))
             cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
             cmd.append(
                 "create_bd_intf_pin -mode Master "
@@ -858,6 +862,11 @@ def code_generation_ipi(self):
                 "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
                 % (node_name, clk_name, node_name, node_name, clk_name)
             )
+            if self.get_nodeattr("pumpedCompute"):
+                cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+                % (node_name, clk2x_name, node_name, node_name, clk2x_name)
+            )               
             cmd.append(
                 "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
                 "[get_bd_intf_pins %s/%s/%s]"
@@ -1040,6 +1049,7 @@ def prepare_codegen_default(self, fpgapart, clk):
         code_gen_dict = {}
         code_gen_dict["$IS_MVU$"] = [str(1)]
         code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)]
+        code_gen_dict["$PUMPED_COMPUTE$"] = [str(self.get_nodeattr("pumpedCompute"))]
         code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))]
         code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))]
         code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))]

From f38fd6b5cc29b5c97e684e1d9f209148cc1e7344 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 1 Dec 2023 15:08:15 +0000
Subject: [PATCH 208/235] [stitched ip]: wire up clk2x interface

---
 .../fpgadataflow/create_stitched_ip.py         | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 9a653fe404..63f98e6156 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -102,6 +102,7 @@ def __init__(self, fpgapart, clk_ns, ip_name="finn_design", vitis=False, signatu
         # keep track of top-level interface names
         self.intf_names = {
             "clk": [],
+            "clk2x": [],
             "rst": [],
             "s_axis": [],
             "m_axis": [],
@@ -113,19 +114,25 @@ def connect_clk_rst(self, node):
         inst_name = node.name
         node_inst = getCustomOp(node)
         clock_intf_name = node_inst.get_verilog_top_module_intf_names()["clk"][0]
+        clock2x_intf_name = node_inst.get_verilog_top_module_intf_names()["clk2x"][0]
         reset_intf_name = node_inst.get_verilog_top_module_intf_names()["rst"][0]
         # make clock and reset external, if they aren't already
         if not self.clock_reset_are_external:
             self.connect_cmds.append(
                 "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, clock_intf_name)
             )
+            self.connect_cmds.append(
+                "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, clock2x_intf_name)
+            )
             self.connect_cmds.append("set_property name ap_clk [get_bd_ports ap_clk_0]")
+            self.connect_cmds.append("set_property name ap_clk2x [get_bd_ports ap_clk2x_0]")
             self.connect_cmds.append(
                 "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, reset_intf_name)
             )
             self.connect_cmds.append("set_property name ap_rst_n [get_bd_ports ap_rst_n_0]")
             self.clock_reset_are_external = True
             self.intf_names["clk"] = ["ap_clk"]
+            self.intf_names["clk2x"] = ["ap_clk2x"]
             self.intf_names["rst"] = ["ap_rst_n"]
         # otherwise connect clock and reset
         else:
@@ -137,6 +144,10 @@ def connect_clk_rst(self, node):
                 "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/%s]"
                 % (inst_name, clock_intf_name)
             )
+            self.connect_cmds.append(
+                "connect_bd_net [get_bd_ports ap_clk2x] [get_bd_pins %s/%s]"
+                % (inst_name, clock2x_intf_name)
+            )
 
     def connect_axi(self, node):
         inst_name = node.name
@@ -376,6 +387,13 @@ def apply(self, model):
         fclk_hz = fclk_mhz * 1000000
         model.set_metadata_prop("clk_ns", str(self.clk_ns))
         tcl.append("set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk]" % round(fclk_hz))
+        tcl.append("set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk2x]" % round(2*fclk_hz))
+        # tcl.append(
+        #     "set_property CONFIG.FREQ_HZ %d [get_bd_intf_pins MatrixVectorActivation_rtl_0/s_axilite_0]" % round(fclk_hz)
+        # )
+        # tcl.append(
+        #     "set_property CONFIG.FREQ_HZ %d [get_bd_intf_pins MatrixVectorActivation_rtl_0/in0_V]" % round(fclk_hz)
+        # )
         tcl.append("validate_bd_design")
         tcl.append("save_bd_design")
         # create wrapper hdl (for rtlsim later on)

From 078888af360baf455e99b473e367b3f5f4dbbaeb Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 1 Dec 2023 15:22:24 +0000
Subject: [PATCH 209/235] [mvu vvu axi]: removed SIMD%2 constraint for
 double-pumped DSP58

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 98fd522306..3379577046 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -63,14 +63,13 @@ module mvu_vvu_axi #(
 	bit M_REG_LUT = 1,
 
 	// Safely deducible parameters
-	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	localparam int unsigned  INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
-	localparam int unsigned  WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
-	localparam int unsigned  INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned	 OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8,
-	localparam bit           SIMD_UNEVEN = SIMD % 2
-)
-(
+	localparam int unsigned  WEIGHT_STREAM_WIDTH    = PE * SIMD * WEIGHT_WIDTH,
+	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7)/8 * 8,
+	localparam int unsigned  INPUT_STREAM_WIDTH     = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
+	localparam int unsigned  INPUT_STREAM_WIDTH_BA  = (INPUT_STREAM_WIDTH  + 7)/8 * 8,
+	localparam int unsigned  OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8,
+	localparam bit  		 SIMD_UNEVEN = SIMD % 2
+)(
 	// Global Control
 	input	logic  ap_clk,
 	input	logic  ap_clk2x,	// synchronous, double-speed clock; only used for PUMPED_COMPUTE
@@ -128,6 +127,13 @@ module mvu_vvu_axi #(
 			end
 		end
 
+		// //- Pumping Constraints ---------
+		// if(PUMPED_COMPUTE) begin
+		// 	if(SIMD % 2 != 0) begin
+		// 		$error("Odd SIMD=%0d is incompatible with pumped compute.", SIMD);
+		// 		$finish;
+		// 	end
+		// end
 	end
 
 	uwire  clk = ap_clk;

From bbcbb5a8819601263dd6260137f717c020103629 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 1 Dec 2023 15:42:48 +0000
Subject: [PATCH 210/235] [builder]: minor fix to attribute naming

---
 src/finn/builder/build_dataflow_steps.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 7af3f4c3ab..29401b8f52 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -482,7 +482,7 @@ def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig):
         model = model.transform(trn)
     
     # If double-pumping enabled, annotate relevant MVU/VVU layers
-    if cfg.enable_double_pump:
+    if cfg.enable_pumped_compute:
         for n in model.graph.node:
             if n.op_type in ["MatrixVectorActivation_rtl", "VectorVectorActivation_rtl"]:
                 getCustomOp(n).set_nodeattr("pumpedCompute", 1)

From b72d00de9bd2d5a947a50e18e2945b832488b471 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Sun, 3 Dec 2023 22:56:02 +0000
Subject: [PATCH 211/235] [stitched-ip]: minor fixes to creating valid
 stitched-ip with ap_clk2x interface

---
 .../fpgadataflow/create_stitched_ip.py        | 46 ++++++++++++-------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 63f98e6156..f797e3d841 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -110,32 +110,41 @@ def __init__(self, fpgapart, clk_ns, ip_name="finn_design", vitis=False, signatu
             "axilite": [],
         }
 
+    def _is_double_pumped(self, node):  # True iff node's pumpedCompute attribute is 1
+        try:
+            pumped_compute = getCustomOp(node).get_nodeattr("pumpedCompute")
+            return pumped_compute==1
+        except Exception:  # lookup raised: report not pumped, but let Ctrl-C/SystemExit through
+            return False
+
     def connect_clk_rst(self, node):
         inst_name = node.name
         node_inst = getCustomOp(node)
         clock_intf_name = node_inst.get_verilog_top_module_intf_names()["clk"][0]
-        clock2x_intf_name = node_inst.get_verilog_top_module_intf_names()["clk2x"][0]
+        if self._is_double_pumped(node):
+            clock2x_intf_name = node_inst.get_verilog_top_module_intf_names()["clk2x"][0]
         reset_intf_name = node_inst.get_verilog_top_module_intf_names()["rst"][0]
         # make clock and reset external, if they aren't already
         if not self.clock_reset_are_external:
             self.connect_cmds.append(
                 "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, clock_intf_name)
             )
-            self.connect_cmds.append(
-                "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, clock2x_intf_name)
-            )
             self.connect_cmds.append("set_property name ap_clk [get_bd_ports ap_clk_0]")
-            self.connect_cmds.append("set_property name ap_clk2x [get_bd_ports ap_clk2x_0]")
             self.connect_cmds.append(
                 "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, reset_intf_name)
             )
             self.connect_cmds.append("set_property name ap_rst_n [get_bd_ports ap_rst_n_0]")
             self.clock_reset_are_external = True
             self.intf_names["clk"] = ["ap_clk"]
-            self.intf_names["clk2x"] = ["ap_clk2x"]
             self.intf_names["rst"] = ["ap_rst_n"]
-        # otherwise connect clock and reset
-        else:
+        # make clk2x external, if it isn't already and connect clk and reset
+        elif self._is_double_pumped(node) and not self.clock2x_is_external:
+            self.connect_cmds.append(
+                "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, clock2x_intf_name)
+            )
+            self.connect_cmds.append("set_property name ap_clk2x [get_bd_ports ap_clk2x_0]")
+            self.clock2x_is_external = True
+            self.intf_names["clk2x"] = ["ap_clk2x"]
             self.connect_cmds.append(
                 "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/%s]"
                 % (inst_name, reset_intf_name)
@@ -144,10 +153,21 @@ def connect_clk_rst(self, node):
                 "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/%s]"
                 % (inst_name, clock_intf_name)
             )
+        # otherwise connect clock and reset
+        else:
             self.connect_cmds.append(
-                "connect_bd_net [get_bd_ports ap_clk2x] [get_bd_pins %s/%s]"
-                % (inst_name, clock2x_intf_name)
+                "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/%s]"
+                % (inst_name, reset_intf_name)
             )
+            self.connect_cmds.append(
+                "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/%s]"
+                % (inst_name, clock_intf_name)
+            )
+            if self._is_double_pumped(node):
+                self.connect_cmds.append(
+                    "connect_bd_net [get_bd_ports ap_clk2x] [get_bd_pins %s/%s]"
+                    % (inst_name, clock2x_intf_name)
+                )
 
     def connect_axi(self, node):
         inst_name = node.name
@@ -388,12 +408,6 @@ def apply(self, model):
         model.set_metadata_prop("clk_ns", str(self.clk_ns))
         tcl.append("set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk]" % round(fclk_hz))
         tcl.append("set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk2x]" % round(2*fclk_hz))
-        # tcl.append(
-        #     "set_property CONFIG.FREQ_HZ %d [get_bd_intf_pins MatrixVectorActivation_rtl_0/s_axilite_0]" % round(fclk_hz)
-        # )
-        # tcl.append(
-        #     "set_property CONFIG.FREQ_HZ %d [get_bd_intf_pins MatrixVectorActivation_rtl_0/in0_V]" % round(fclk_hz)
-        # )
         tcl.append("validate_bd_design")
         tcl.append("save_bd_design")
         # create wrapper hdl (for rtlsim later on)

From 04f5863706103a01a986497a7b6b721cb6fd5979 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Sun, 3 Dec 2023 23:00:54 +0000
Subject: [PATCH 212/235] [rtl-vvu]: add stitching support for pumped compute,
 minor fix to segment length estimation

---
 .../fpgadataflow/vectorvectoractivation_rtl.py  | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py
index 8277f36b21..63a00fc55f 100644
--- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py
@@ -67,6 +67,7 @@ def get_nodeattr_types(self):
             "Channels": ("i", True, 0),
             "Kernel": ("ints", True, []),  # [H, W]
             "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}),
+            "pumpedCompute": ("i", False, 0, {0, 1}),
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
             "weightDataType": ("s", True, ""),
@@ -638,6 +639,9 @@ def code_generation_ipi(self):
             din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
             cmd.append("create_bd_cell -type hier %s" % node_name)
             cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
+            if self.get_nodeattr("pumpedCompute"):
+                clk2x_name = self.get_verilog_top_module_intf_names()["clk2x"][0]
+                cmd.append("create_bd_pin -dir I -type clk2x /%s/%s" % (node_name, clk2x_name))
             cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
             cmd.append(
                 "create_bd_intf_pin -mode Master "
@@ -713,6 +717,11 @@ def code_generation_ipi(self):
                 "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
                 % (node_name, clk_name, node_name, node_name, clk_name)
             )
+            if self.get_nodeattr("pumpedCompute"):
+                cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+                % (node_name, clk2x_name, node_name, node_name, clk2x_name)
+            )  
             cmd.append(
                 "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
                 "[get_bd_intf_pins %s/%s/%s]"
@@ -989,10 +998,12 @@ def _resolve_segment_len(self, clk):
         # Insert pipeline registers in the DSP58 chain to meet target clock frequency
         # 0.741 ns seems the worst-case delay through first DSP
         # 0.605 ns seems to be (on average) delay for all subsequent DSPs
-        critical_path_dsps = np.floor((clk - 0.741) / 0.605)
+        # clk >= (critical_path_dsps - 1) * 0.605 + 0.741
+        assert (clk > 0.741), "Infeasible clk target of {} ns has been set, consider lowering the targeted clock frequency!".format(clk)
+        critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1)
         max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3)
         dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len
-        return max(1, dsp_chain_len)
+        return dsp_chain_len
 
     def _resolve_impl_style(self, fpgapart):
         # Based on target device and activation/weight-width, choose the
@@ -1016,6 +1027,7 @@ def prepare_codegen_default(self, fpgapart, clk):
         code_gen_dict = {}
         code_gen_dict["$IS_MVU$"] = [str(0)]
         code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)]
+        code_gen_dict["$PUMPED_COMPUTE$"] = [str(self.get_nodeattr("pumpedCompute"))]
         mw = int(np.prod(self.get_nodeattr("Kernel")))
         code_gen_dict["$MW$"] = [str(mw)]
         code_gen_dict["$MH$"] = [str(self.get_nodeattr("Channels"))]
@@ -1028,7 +1040,6 @@ def prepare_codegen_default(self, fpgapart, clk):
             [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
         )
         code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))]
-        # code_gen_dict["$FORCE_BEHAVIORAL$"] = [str(0)]
 
         return template_path, code_gen_dict
 

From 9b80ac1d0e8baaf4a1b55eabd87e60ebd4a50396 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Sun, 3 Dec 2023 13:30:02 +0000
Subject: [PATCH 213/235] Prevent output register slice from operating in
 unnecessary ping-pong mode.

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 45 +++++++++++++++-------------------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 3379577046..4f635bf78d 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -334,40 +334,35 @@ module mvu_vvu_axi #(
 	end : blkDsp
 
 //-------------------- Output register slice --------------------\\
-	struct packed {
+	// Make `en`computation independent from external inputs.
+	// Drive all outputs from registers.
+	typedef struct packed {
 		logic vld;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} A = '{ vld: 0, default: 'x};
+	} buf_t;
+	buf_t  A = '{ vld: 0, default: 'x };	// side-step register used when encountering backpressure
+	buf_t  B = '{ vld: 0, default: 'x };	// ultimate output register
 
-	assign en = !A.vld || !ovld;
+	assign	en = !A.vld || !ovld;
+	uwire  b_load = !B.vld || m_axis_output_tready;
 
-	uwire  b_load;
 	always_ff @(posedge clk) begin
-		if(rst)		A <= '{ vld: 0, default: 'x };
-		else if(!A.vld || b_load) begin
-			A.vld <= ovld && en;
-			for(int unsigned  i = 0; i < PE; i++) begin
-				// CR-1148862:
-				// A.dat[i] <= odat[i];
-				automatic logic [ACCU_WIDTH-1:0]  v = odat[i];
-				A.dat[i] <= v[ACCU_WIDTH-1:0];
-			end
+		if(rst) begin
+			A <= '{ vld: 0, default: 'x };
+			B <= '{ vld: 0, default: 'x };
 		end
-	end
-
-	struct packed {
-		logic vld;
-		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} B = '{ vld: 0, default: 'x};
-
-	assign	b_load = !B.vld || m_axis_output_tready;
-	always_ff @(posedge clk) begin
-		if(rst)		B <= '{ vld: 0, default: 'x };
 		else begin
-			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
+			if(!A.vld)  A.dat <= odat;
+			A.vld <= (ovld || A.vld) && !b_load;
+
+			if(b_load) begin
+				B <= '{
+					vld: A.vld || ovld,
+					dat: A.vld? A.dat : odat
+				};
+			end
 		end
 	end
-
 	assign	m_axis_output_tvalid = B.vld;
 	assign	m_axis_output_tdata  = B.dat;
 

From 60f483a3f8b753c310c1831be73f31f72db301d6 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 7 Dec 2023 15:20:26 +0000
Subject: [PATCH 214/235] [mvu vvu axi]: verilator BLKLOOPINIT-error workaround

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 55 ++++++++++++++++++++++++++--------
 1 file changed, 43 insertions(+), 12 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 4f635bf78d..114223052a 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -242,8 +242,6 @@ module mvu_vvu_axi #(
 					En   <= 0;
 					Last <= '{ default: 1'b0 };
 					Zero <=  1;
-					W <= '{ default: 'x };
-					A <= '{ default: 'x };
 				end
 				else begin
 					if(Active) begin
@@ -251,23 +249,56 @@ module mvu_vvu_axi #(
 						if(en) begin
 							Last <= '{ alast && avld, 1'b0 };
 							Zero <= !istb;
-							for(int unsigned  simd = 0; simd < EFFECTIVE_SIMD; simd++) begin
-								for(int unsigned  pe = 0; pe < PE; pe++) begin
-									W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= (simd==EFFECTIVE_SIMD-1 && SIMD_UNEVEN) ? '0 : mvu_w[pe][simd];
-								end
-								for(int unsigned  pe = 0; pe < ACT_PE; pe++) begin
-									A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= (simd==EFFECTIVE_SIMD-1 && SIMD_UNEVEN) ? '0 : amvau_i[pe][simd];
-								end
-							end
 						end
 					end
 					else if(En) begin
 						Last <= '{ 'x, Last[1] };
-						W    <= '{ 'x, W[1] };
-						A    <= '{ 'x, A[1] };
 					end
 				end
 			end
+
+			for(genvar  simd = 0; simd < EFFECTIVE_SIMD; simd++) begin : genSIMDRegW
+				for(genvar  pe = 0; pe < PE; pe++) begin : genPERegW
+					always_ff @(posedge clk2x) begin
+						if(rst) begin
+							W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= '{ default: 'x };
+						end
+						else begin
+							if(Active) begin
+								if(en) begin
+									W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= (simd==EFFECTIVE_SIMD-1 && SIMD_UNEVEN) ? '0 : mvu_w[pe][simd];
+								end
+							end
+							else if(En) begin
+								W[1][pe][simd % DSP_SIMD] <= 'x;
+								W[0][pe][simd % DSP_SIMD] <= W[1][pe][simd % DSP_SIMD];
+							end
+						end
+					end
+				end : genPERegW
+			end : genSIMDRegW
+
+			for(genvar  simd = 0; simd < EFFECTIVE_SIMD; simd++) begin : genSIMDRegA
+				for(genvar  pe = 0; pe < ACT_PE; pe++) begin : genPERegA
+					always_ff @(posedge clk2x) begin
+						if(rst) begin
+							A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= '{ default: 'x };
+						end
+						else begin
+							if(Active) begin
+								if(en) begin
+									A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= (simd==EFFECTIVE_SIMD-1 && SIMD_UNEVEN) ? '0 : amvau_i[pe][simd];
+								end
+							end
+							else if(En) begin
+								A[1][pe][simd % DSP_SIMD] <= 'x;
+								A[0][pe][simd % DSP_SIMD] <= A[1][pe][simd % DSP_SIMD];
+							end
+						end
+					end
+				end : genPERegA
+			end : genSIMDRegA
+
 			assign	dsp_en = En;
 
 			assign	dsp_last = Last[0];

From 23fb64f4c4e82af45d8ffac8dee1415fbbe44d25 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 8 Dec 2023 17:12:42 +0000
Subject: [PATCH 215/235] [mvu vvu axi]: sign extend output tdata
 (byte-aligned)

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 114223052a..b285be076f 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -67,7 +67,8 @@ module mvu_vvu_axi #(
 	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7)/8 * 8,
 	localparam int unsigned  INPUT_STREAM_WIDTH     = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
 	localparam int unsigned  INPUT_STREAM_WIDTH_BA  = (INPUT_STREAM_WIDTH  + 7)/8 * 8,
-	localparam int unsigned  OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8,
+	localparam int unsigned  OUTPUT_STREAM_WIDTH    = PE*ACCU_WIDTH,
+	localparam int unsigned  OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7)/8 * 8,
 	localparam bit  		 SIMD_UNEVEN = SIMD % 2
 )(
 	// Global Control
@@ -395,6 +396,6 @@ module mvu_vvu_axi #(
 		end
 	end
 	assign	m_axis_output_tvalid = B.vld;
-	assign	m_axis_output_tdata  = B.dat;
+	assign	m_axis_output_tdata  = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat};
 
 endmodule : mvu_vvu_axi

From fdca45b50d0334dcbc888c989ba348fcfe67f1fa Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 11 Dec 2023 16:19:00 +0000
Subject: [PATCH 216/235] [mvu-rtl]: default seglen to 1 for now

---
 .../custom_op/fpgadataflow/matrixvectoractivation_rtl.py     | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index a66c6f4b2f..968c880980 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -980,7 +980,8 @@ def _resolve_segment_len(self, clk):
         critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1)
         max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3)
         dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len
-        return dsp_chain_len
+        #return dsp_chain_len
+        return 1
 
     def _resolve_impl_style(self, fpgapart):
         # Based on target device and activation/weight-width, choose the
@@ -994,7 +995,7 @@ def _resolve_impl_style(self, fpgapart):
                 fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
                 or fpgapart[0:5] == "xqrvc"
             )
-            if act_width == 4 and weight_width == 4:
+            if (act_width == 4 and weight_width == 4) and not(is_versal):
                 return "mvu_4sx4u"
             else:
                 if is_versal:

From 45074d964b6405a41d71ed7e16fac5e5ef9b1269 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 11 Dec 2023 16:20:02 +0000
Subject: [PATCH 217/235] update test config

---
 .../test_fpgadataflow_mvau_rtl.py             | 65 ++++++++++++++-----
 1 file changed, 50 insertions(+), 15 deletions(-)

diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
index 3db7a718f5..ebcc87102d 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
@@ -28,6 +28,7 @@
 
 import pytest
 import os
+import pickle
 
 import numpy as np
 from onnx import TensorProto, helper
@@ -46,7 +47,8 @@
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from qonnx.transformation.general import ApplyConfig
 import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
-#import qonnx.core.data_layout as DataLayout
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from qonnx.custom_op.registry import getCustomOp
 
 build_dir = os.environ["FINN_BUILD_DIR"]
 
@@ -83,22 +85,28 @@ def make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt):
     return model
 
 def prepare_inputs(input_tensor):
-    return {"inp": input_tensor}
-
-@pytest.mark.parametrize("mh", [16])
-@pytest.mark.parametrize("mw", [32])
-@pytest.mark.parametrize("pe", [1, 4, 16])
-#@pytest.mark.parametrize("simd", [1, 30, 90])
-@pytest.mark.parametrize("simd", [1, 4, 32])
-@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
-@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]])
+    return {"ifm": input_tensor}
+
+@pytest.mark.parametrize("mh", [31])
+@pytest.mark.parametrize("mw", [279])
+#@pytest.mark.parametrize("pe", [1,2,4,8])
+@pytest.mark.parametrize("pe", [31])
+#@pytest.mark.parametrize("simd", [1,3,6,9,18,36])
+@pytest.mark.parametrize("simd", [9])
+#@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
+@pytest.mark.parametrize("idt", [DataType["UINT8"]])
+#@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]])
+@pytest.mark.parametrize("wdt", [DataType["INT8"]])
 #@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"])
-@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"])
+#@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"])
+@pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S"])
 @pytest.mark.parametrize("segmentlen", [1])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen):
+    # Synthesis constants
+    clk_ns = 5
     # Create test input vector (produced by SWG)
     ofm_shape = (5, 5)
     ofm_h, ofm_w = ofm_shape
@@ -125,6 +133,9 @@ def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen):
     ## Execute ONNX model
     output_matmul = oxe.execute_onnx(model, input_dict)
 
+    with open(build_dir + "/onnx_output.pkl", "wb") as f:
+        pickle.dump(output_matmul, f)
+
     # Create MVAU (HLS)
     model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled"))
     model = model.transform(GiveUniqueNodeNames())
@@ -138,30 +149,54 @@ def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen):
             "mem_mode" : "decoupled",
             "ram_style" : "auto",
             "resType" : "dsp",
-            "impl" : "rtl"
+            "preferred_backend" : "rtl"
         }
     }
     model = model.transform(ApplyConfig(folding_config))
     model.save(build_dir+"/mvau_hls.onnx")
 
     model = model.transform(SetExecMode("rtlsim"))
-    model = model.transform(PrepareIP(part, 5))
+    model = model.transform(PrepareIP(part, clk_ns))
     model = model.transform(HLSSynthIP())
     model = model.transform(PrepareRTLSim())
+    for n in model.graph.node:
+        getCustomOp(n).set_nodeattr("rtlsim_trace", "mvu_trace_hls.vcd")
     output_mvau_hls = oxe.execute_onnx(model, input_dict)["ofm"]
+    
 
     # Apply convert-to-rtl step
     model = model.transform(to_rtl.InferRTLMatrixVectorActivation())
     model = model.transform(GiveUniqueNodeNames())
+    for n in model.graph.node:
+        if n.op_type=="MatrixVectorActivation_rtl":
+            getCustomOp(n).set_nodeattr("pumpedCompute", 0)
     model.save(build_dir+"/mvau_rtl.onnx")
 
+    # Reset rtlsim_so and ip-related paths such that new Pyverilator SO and IP is generated
+    for n in model.graph.node:
+        getCustomOp(n).set_nodeattr("rtlsim_so", "")
+        getCustomOp(n).set_nodeattr("code_gen_dir_ipgen", "")
+        getCustomOp(n).set_nodeattr("ipgen_path", "")
+        getCustomOp(n).set_nodeattr("ip_path", "")
+        getCustomOp(n).set_nodeattr("rtlsim_trace", "mvu_trace_rtl.vcd")
     model = model.transform(SetExecMode("rtlsim"))
-    model = model.transform(PrepareIP("xcvm1802-vsvd1760-2MP-e-S", 5))
+    model = model.transform(PrepareIP(part, clk_ns))
     model = model.transform(HLSSynthIP())
     model = model.transform(PrepareRTLSim())
     output_mvau_rtl = oxe.execute_onnx(model, input_dict)["ofm"]
 
     model.save(build_dir+"/mvau_rtl_sim.onnx")
 
+    with open(build_dir + "/hls_output.pkl", "wb") as f:
+        pickle.dump(output_mvau_hls, f)
+
+    with open(build_dir + "/rtl_output.pkl", "wb") as f:
+        pickle.dump(output_mvau_rtl, f)
+
+    # model = model.transform(PrepareIP(part, clk_ns))
+    # model = model.transform(HLSSynthIP())
+    # model = model.transform(CreateStitchedIP(fpgapart=part, clk_ns=clk_ns, vitis=True))
+    # model.save(build_dir+"/stitched_ip.onnx")
+
     assert (output_mvau_hls == output_mvau_rtl).all()
-    assert (output_mvau_hls.size > 0)
+    # assert (output_mvau_hls.size > 0)

From 0ed36812a077cba17f5b8c6503540773a5ff6756 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Tue, 12 Dec 2023 11:27:00 +0000
Subject: [PATCH 218/235] updated test config

---
 .../test_fpgadataflow_mvau_rtl.py             | 22 +++++++++++++------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
index ebcc87102d..5091581d75 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
@@ -87,12 +87,13 @@ def make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt):
 def prepare_inputs(input_tensor):
     return {"ifm": input_tensor}
 
-@pytest.mark.parametrize("mh", [31])
-@pytest.mark.parametrize("mw", [279])
-#@pytest.mark.parametrize("pe", [1,2,4,8])
-@pytest.mark.parametrize("pe", [31])
-#@pytest.mark.parametrize("simd", [1,3,6,9,18,36])
-@pytest.mark.parametrize("simd", [9])
+@pytest.mark.parametrize("mh", [4])
+# @pytest.mark.parametrize("mw", [36])
+@pytest.mark.parametrize("mw", [18])
+# @pytest.mark.parametrize("pe", [1,2,4,8])
+@pytest.mark.parametrize("pe", [2])
+# @pytest.mark.parametrize("simd", [1,3,6,9,18,36])
+@pytest.mark.parametrize("simd", [6])
 #@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
 @pytest.mark.parametrize("idt", [DataType["UINT8"]])
 #@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]])
@@ -121,6 +122,9 @@ def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen):
         [mw, mh]
     )
     W = gen_finn_dt_tensor(wdt, (mw, mh))
+    # np.save("weights.npy", W)
+    ##
+    W = np.load("weights.npy")
     model = make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt)
     model = model.transform(GiveUniqueNodeNames())
 
@@ -128,6 +132,9 @@ def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen):
 
     # Create MatMul & obtain golden reference output
     A = gen_finn_dt_tensor(model.get_tensor_datatype("ifm"), model.get_tensor_shape("ifm"))
+    # np.save("activations.npy", A)
+    ##
+    # A = np.load("activations.npy")
     input_dict = prepare_inputs(A)
 
     ## Execute ONNX model
@@ -198,5 +205,6 @@ def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen):
     # model = model.transform(CreateStitchedIP(fpgapart=part, clk_ns=clk_ns, vitis=True))
     # model.save(build_dir+"/stitched_ip.onnx")
 
-    assert (output_mvau_hls == output_mvau_rtl).all()
+    #assert (output_mvau_hls == output_mvau_rtl).all()
+    assert (output_matmul['ofm'] == output_mvau_rtl).all()
     # assert (output_mvau_hls.size > 0)

From c39642510b8c55cd2173999c938947d7162371c4 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 13 Dec 2023 09:36:25 +0000
Subject: [PATCH 219/235] [rtlsim]: use pyverilator util functions

---
 src/finn/custom_op/fpgadataflow/hlscustomop.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index c0b9f0735f..75c9240aeb 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -31,7 +31,7 @@
 import subprocess
 import warnings
 from abc import abstractmethod
-from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io
+from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io, toggle_clk
 from qonnx.core.datatype import DataType
 from qonnx.custom_op.base import CustomOp
 from qonnx.util.basic import roundup_to_integer_multiple
@@ -492,15 +492,11 @@ def exec_precompiled_singlenode_model(self):
     def reset_rtlsim(self, sim):
         """Sets reset input in pyverilator to zero, toggles the clock and set it
         back to one"""
-        sim.io.ap_rst_n = 0
-        sim.io.ap_clk = 1
-        sim.io.ap_clk = 0
-        sim.io.ap_rst_n = 1
+        reset_rtlsim(sim)
 
     def toggle_clk(self, sim):
         """Toggles the clock input in pyverilator once."""
-        sim.io.ap_clk = 1
-        sim.io.ap_clk = 0
+        toggle_clk(sim)
 
     def hls_sname(self):
         """Get the naming convention used by Vitis HLS for stream signals

From 538852d4f42afc5ef4a4be6bc19567034d081727 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 13 Dec 2023 11:44:00 +0000
Subject: [PATCH 220/235] [mvu vvu axi]: fix multiple driver error

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index b285be076f..3fb8fd2455 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -271,8 +271,7 @@ module mvu_vvu_axi #(
 								end
 							end
 							else if(En) begin
-								W[1][pe][simd % DSP_SIMD] <= 'x;
-								W[0][pe][simd % DSP_SIMD] <= W[1][pe][simd % DSP_SIMD];
+								W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= simd / DSP_SIMD == 1 ? 'x : W[1][pe][simd % DSP_SIMD];
 							end
 						end
 					end
@@ -292,8 +291,7 @@ module mvu_vvu_axi #(
 								end
 							end
 							else if(En) begin
-								A[1][pe][simd % DSP_SIMD] <= 'x;
-								A[0][pe][simd % DSP_SIMD] <= A[1][pe][simd % DSP_SIMD];
+							  A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= simd / DSP_SIMD == 1 ? 'x : A[1][pe][simd % DSP_SIMD];
 							end
 						end
 					end

From 7e5306c6439bca6f7a1d1b209709f48e38d47f77 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Mon, 18 Dec 2023 09:02:54 +0000
Subject: [PATCH 221/235] Mitigate hold time issues on feed from fast clock
 net.

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 3fb8fd2455..813ffb69d7 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -226,7 +226,12 @@ module mvu_vvu_axi #(
 
 			// Identify second fast cycle before active slow clock edge
 			logic  Active = 0;
-			always_ff @(posedge clk2x)  Active <= clk;
+			if(1) begin : blkActive
+				uwire  clk_lut[2];	// Put some LUT delay on the input from the fast clock net
+				(* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut(.O(clk_lut[0]), .I0(clk));
+				(* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut(.O(clk_lut[1]), .I0(clk_lut[0]));
+				always_ff @(posedge clk2x)  Active <= clk_lut[1];
+			end : blkActive
 
 			// The input for a slow cycle is split across two fast cycles along the SIMD dimension.
 			//	- Both fast cycles are controlled by the same enable state.

From 256931fcf92ff629fd267e0d40efa93a480d4811 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 18 Dec 2023 11:49:24 +0000
Subject: [PATCH 222/235] toggle P and Vld only when no backpressure is applied

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 813ffb69d7..31d40b7cba 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -322,8 +322,10 @@ module mvu_vvu_axi #(
 					P   <= 'x;
 				end
 				else begin
-					if(dsp_vld)  P <= dsp_p;
-					Vld <= dsp_vld || (Vld && Active);
+					if (dsp_en) begin
+						if(dsp_vld)  P <= dsp_p;
+						Vld <= dsp_vld || (Vld && Active);
+					end
 				end
 			end
 			assign	ovld = Vld;

From 020c4e09ac3e5ba292a3fa43998b842b49909bf3 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 18 Dec 2023 11:56:57 +0000
Subject: [PATCH 223/235] change naming

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 31d40b7cba..8a593713a3 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -229,7 +229,7 @@ module mvu_vvu_axi #(
 			if(1) begin : blkActive
 				uwire  clk_lut[2];	// Put some LUT delay on the input from the fast clock net
 				(* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut(.O(clk_lut[0]), .I0(clk));
-				(* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut(.O(clk_lut[1]), .I0(clk_lut[0]));
+				(* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut2(.O(clk_lut[1]), .I0(clk_lut[0]));
 				always_ff @(posedge clk2x)  Active <= clk_lut[1];
 			end : blkActive
 

From 7e12ae4c0902882aa436c1b7b3b82dbdcc5f8dac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Tue, 19 Dec 2023 09:12:48 +0000
Subject: [PATCH 224/235] Reworking pumped DSP integration with simplified
 enable computation.

---
 finn-rtllib/mvu/mvu_vvu_axi.sv     | 141 ++++++++++++----------------
 finn-rtllib/mvu/tb/mvu_dsp58_tb.sv | 142 +++++++++++++++++++++++++++++
 2 files changed, 201 insertions(+), 82 deletions(-)
 create mode 100644 finn-rtllib/mvu/tb/mvu_dsp58_tb.sv

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 8a593713a3..d40c5e1b10 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -224,12 +224,12 @@ module mvu_vvu_axi #(
 		else begin : genPumpedCompute
 			assign	dsp_clk = clk2x;
 
-			// Identify second fast cycle before active slow clock edge
+			// Identify second fast cycle just before active slow clock edge
 			logic  Active = 0;
 			if(1) begin : blkActive
 				uwire  clk_lut[2];	// Put some LUT delay on the input from the fast clock net
-				(* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut(.O(clk_lut[0]), .I0(clk));
-				(* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut2(.O(clk_lut[1]), .I0(clk_lut[0]));
+				(* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut0(.O(clk_lut[0]), .I0(clk));
+				(* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut1(.O(clk_lut[1]), .I0(clk_lut[0]));
 				always_ff @(posedge clk2x)  Active <= clk_lut[1];
 			end : blkActive
 
@@ -237,78 +237,53 @@ module mvu_vvu_axi #(
 			//	- Both fast cycles are controlled by the same enable state.
 			//	- A zero cycle is duplicated across both fast cycles.
 			//	- The last flag must be restricted to the second fast cycle.
-			logic  En = 0;
-			logic  Last[1:0] = '{ default: 1'b0 };
-			logic  Zero = 1;
-			dsp_w_t  W[1:0] = '{ default: 'x };
-			dsp_a_t  A[1:0] = '{ default: 'x };
 
+			dsp_w_t  W = 'x;
+			for(genvar  pe = 0; pe < PE; pe++) begin : genPERegW
+
+				uwire [2*DSP_SIMD-1:0][WEIGHT_WIDTH-1:0]  w;
+				for(genvar  i =    0; i <       SIMD; i++)  assign  w[i] = mvu_w[pe][i];
+				for(genvar  i = SIMD; i < 2*DSP_SIMD; i++)  assign  w[i] = 0;
+
+				always_ff @(posedge clk2x) begin
+					if(rst)      W[pe] <= 'x;
+					else if(en)  W[pe] <= w[(Active? DSP_SIMD : 0) +: DSP_SIMD];
+				end
+
+			end : genPERegW
+
+			dsp_a_t  A = 'x;
+			for(genvar  pe = 0; pe < ACT_PE; pe++) begin : genPERegA
+
+				uwire [2*DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0]  a;
+				for(genvar  i =    0; i <       SIMD; i++)  assign  a[i] = amvau_i[pe][i];
+				for(genvar  i = SIMD; i < 2*DSP_SIMD; i++)  assign  a[i] = 0;
+
+				always_ff @(posedge clk2x) begin
+					if(rst)      A[pe] <= 'x;
+					else if(en)  A[pe] <= a[(Active? DSP_SIMD : 0) +: DSP_SIMD];
+				end
+
+			end : genPERegA
+
+			logic  Zero = 1;
+			logic  Last = 0;
 			always_ff @(posedge clk2x) begin
 				if(rst) begin
-					En   <= 0;
-					Last <= '{ default: 1'b0 };
-					Zero <=  1;
+					Zero <= 1;
+					Last <= 0;
 				end
-				else begin
-					if(Active) begin
-						En <= en;
-						if(en) begin
-							Last <= '{ alast && avld, 1'b0 };
-							Zero <= !istb;
-						end
-					end
-					else if(En) begin
-						Last <= '{ 'x, Last[1] };
-					end
+				else if(en) begin
+					Zero <= !istb;
+					Last <= alast && avld && Active;
 				end
 			end
 
-			for(genvar  simd = 0; simd < EFFECTIVE_SIMD; simd++) begin : genSIMDRegW
-				for(genvar  pe = 0; pe < PE; pe++) begin : genPERegW
-					always_ff @(posedge clk2x) begin
-						if(rst) begin
-							W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= '{ default: 'x };
-						end
-						else begin
-							if(Active) begin
-								if(en) begin
-									W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= (simd==EFFECTIVE_SIMD-1 && SIMD_UNEVEN) ? '0 : mvu_w[pe][simd];
-								end
-							end
-							else if(En) begin
-								W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= simd / DSP_SIMD == 1 ? 'x : W[1][pe][simd % DSP_SIMD];
-							end
-						end
-					end
-				end : genPERegW
-			end : genSIMDRegW
-
-			for(genvar  simd = 0; simd < EFFECTIVE_SIMD; simd++) begin : genSIMDRegA
-				for(genvar  pe = 0; pe < ACT_PE; pe++) begin : genPERegA
-					always_ff @(posedge clk2x) begin
-						if(rst) begin
-							A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= '{ default: 'x };
-						end
-						else begin
-							if(Active) begin
-								if(en) begin
-									A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= (simd==EFFECTIVE_SIMD-1 && SIMD_UNEVEN) ? '0 : amvau_i[pe][simd];
-								end
-							end
-							else if(En) begin
-							  A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= simd / DSP_SIMD == 1 ? 'x : A[1][pe][simd % DSP_SIMD];
-							end
-						end
-					end
-				end : genPERegA
-			end : genSIMDRegA
-
-			assign	dsp_en = En;
-
-			assign	dsp_last = Last[0];
+			assign	dsp_en = en;
+			assign	dsp_last = Last;
 			assign	dsp_zero = Zero;
-			assign	dsp_w = W[0];
-			assign	dsp_a = A[0];
+			assign	dsp_w = W;
+			assign	dsp_a = A;
 
 			// Since no two consecutive last cycles will ever be asserted on the input,
 			// valid outputs will also always be spaced by, at least, one other cycle.
@@ -321,11 +296,9 @@ module mvu_vvu_axi #(
 					Vld <= 0;
 					P   <= 'x;
 				end
-				else begin
-					if (dsp_en) begin
-						if(dsp_vld)  P <= dsp_p;
-						Vld <= dsp_vld || (Vld && Active);
-					end
+				else if(en) begin
+					if(dsp_vld)  P <= dsp_p;
+					Vld <= dsp_vld || (Vld && !Active);
 				end
 			end
 			assign	ovld = Vld;
@@ -373,34 +346,38 @@ module mvu_vvu_axi #(
 //-------------------- Output register slice --------------------\\
 	// Make `en`computation independent from external inputs.
 	// Drive all outputs from registers.
-	typedef struct packed {
+	struct packed {
+		logic rdy;
+		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+	}  A = '{ rdy: 1, default: 'x };	// side-step register used when encountering backpressure
+	struct packed {
 		logic vld;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} buf_t;
-	buf_t  A = '{ vld: 0, default: 'x };	// side-step register used when encountering backpressure
-	buf_t  B = '{ vld: 0, default: 'x };	// ultimate output register
+	}  B = '{ vld: 0, default: 'x };	// ultimate output register
 
-	assign	en = !A.vld || !ovld;
+	assign	en = A.rdy;
 	uwire  b_load = !B.vld || m_axis_output_tready;
 
 	always_ff @(posedge clk) begin
 		if(rst) begin
-			A <= '{ vld: 0, default: 'x };
+			A <= '{ rdy: 1, default: 'x };
 			B <= '{ vld: 0, default: 'x };
 		end
 		else begin
-			if(!A.vld)  A.dat <= odat;
-			A.vld <= (ovld || A.vld) && !b_load;
+			if(A.rdy)  A.dat <= odat;
+			A.rdy <= (A.rdy && !ovld) || b_load;
 
 			if(b_load) begin
 				B <= '{
-					vld: A.vld || ovld,
-					dat: A.vld? A.dat : odat
+					vld: ovld || !A.rdy,
+					dat: A.rdy? odat : A.dat
 				};
 			end
 		end
 	end
 	assign	m_axis_output_tvalid = B.vld;
+	// Why would we need a sign extension here potentially creating a higher signal load into the next FIFO?
+	// These extra bits should never be used. Why not 'x them out?
 	assign	m_axis_output_tdata  = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat};
 
 endmodule : mvu_vvu_axi
diff --git a/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv
new file mode 100644
index 0000000000..108980c497
--- /dev/null
+++ b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv
@@ -0,0 +1,142 @@
+module mvu_dsp58_tb;	// Feeds identical random stimuli to two mvu_vvu_axi instances (PUMPED_COMPUTE 0 and 1) and compares their outputs
+
+	localparam int unsigned  N = 1000;	// scales the repeat counts of the feed and capture loops below
+
+	localparam int unsigned  MW = 12;
+	localparam int unsigned  MH = 4;
+	localparam int unsigned  PE = 2;
+	localparam int unsigned  SIMD = 6;
+	localparam int unsigned  ACTIVATION_WIDTH = 8;
+	localparam int unsigned  WEIGHT_WIDTH = 8;
+	localparam int unsigned  ACCU_WIDTH = 24;
+
+	//- Global Control ------------------
+	logic  clk = 1;
+	logic  clk2x = 1;
+	always #5ns clk = !clk;	// 10ns period
+	always #2.5ns clk2x = !clk2x;	// 5ns period, i.e. twice the frequency of clk
+
+	logic  rst = 1;
+	initial begin
+		repeat(8) @(posedge clk);	// keep reset asserted across 8 rising clk edges
+		rst <= 0;
+	end
+
+	//- DUTs ----------------------------
+
+	// Weight Stream (tdata shared by both DUTs, handshake signals per DUT)
+	logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]  s_axis_weights_tdata;
+	logic  s_axis_weights_tvalid[2];
+	uwire  s_axis_weights_tready[2];
+
+	// Input Stream (tdata shared by both DUTs, handshake signals per DUT)
+	logic [SIMD-1:0][ACTIVATION_WIDTH-1:0]  s_axis_input_tdata;
+	logic  s_axis_input_tvalid[2];
+	uwire  s_axis_input_tready[2];
+
+	// Output Stream (all signals per DUT)
+	uwire [PE-1:0][ACCU_WIDTH-1:0]  m_axis_output_tdata[2];
+	uwire  m_axis_output_tvalid[2];
+	logic  m_axis_output_tready[2];
+
+	for(genvar  i = 0; i < 2; i++) begin : genDUTs	// the instance index doubles as the PUMPED_COMPUTE setting
+		mvu_vvu_axi #(
+			.IS_MVU(1),
+			.COMPUTE_CORE("mvu_vvu_8sx9_dsp58"),
+			.MW(MW), .MH(MH),
+			.PE(PE), .SIMD(SIMD),
+			.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+			.WEIGHT_WIDTH(WEIGHT_WIDTH),
+			.ACCU_WIDTH(ACCU_WIDTH),
+			.PUMPED_COMPUTE(i)
+		) dut (
+			.ap_clk(clk), .ap_clk2x(clk2x), .ap_rst_n(!rst),
+			.s_axis_weights_tdata,                        .s_axis_weights_tvalid(s_axis_weights_tvalid[i]), .s_axis_weights_tready(s_axis_weights_tready[i]),
+			.s_axis_input_tdata,                          .s_axis_input_tvalid  (s_axis_input_tvalid  [i]), .s_axis_input_tready  (s_axis_input_tready  [i]),
+			.m_axis_output_tdata(m_axis_output_tdata[i]), .m_axis_output_tvalid (m_axis_output_tvalid [i]), .m_axis_output_tready (m_axis_output_tready [i])
+		);
+	end : genDUTs
+
+
+	//- Stimuli -------------------------
+
+	// Weight Feed
+	initial begin
+		s_axis_weights_tvalid = '{ default: 0 };
+		s_axis_weights_tdata  = 'x;
+		@(posedge clk iff !rst);
+
+		repeat(N * (MH/PE)*(MW/SIMD)) begin
+			automatic type(s_axis_weights_tdata)  weights;
+			std::randomize(weights);
+			s_axis_weights_tdata <= weights;
+			s_axis_weights_tvalid <= '{ default: 1 };	// offer the same beat to both DUTs
+			fork	// retract each tvalid once its own DUT has signalled tready; join waits for both
+				begin
+					@(posedge clk iff s_axis_weights_tready[0]);
+					s_axis_weights_tvalid[0] <= 0;
+				end
+				begin
+					@(posedge clk iff s_axis_weights_tready[1]);
+					s_axis_weights_tvalid[1] <= 0;
+				end
+			join
+		end
+	end
+
+	// Input Feed
+	initial begin
+		s_axis_input_tvalid = '{ default: 0 };
+		s_axis_input_tdata  = 'x;
+		@(posedge clk iff !rst);
+
+		repeat(N * (MW/SIMD)) begin
+			automatic type(s_axis_input_tdata)  in;
+			std::randomize(in);
+			s_axis_input_tdata <= in;
+			s_axis_input_tvalid <= '{ default: 1 };	// offer the same beat to both DUTs
+			fork	// retract each tvalid once its own DUT has signalled tready; join waits for both
+				begin
+					@(posedge clk iff s_axis_input_tready[0]);
+					s_axis_input_tvalid[0] <= 0;
+				end
+				begin
+					@(posedge clk iff s_axis_input_tready[1]);
+					s_axis_input_tvalid[1] <= 0;
+				end
+			join
+		end
+	end
+
+	// Output Capture and Comparison
+	initial begin
+		m_axis_output_tready = '{ default: 0 };
+		@(posedge clk iff !rst);
+
+		repeat(N * (MH/PE)) begin
+			automatic type(m_axis_output_tdata)  res;
+			m_axis_output_tready <= '{ default: 1 };
+			fork	// capture one output beat from each DUT; join waits for both
+				begin
+					@(posedge clk iff m_axis_output_tvalid[0]);
+					m_axis_output_tready[0] <= 0;
+					res[0] = m_axis_output_tdata[0];
+				end
+				begin
+					@(posedge clk iff m_axis_output_tvalid[1]);
+					m_axis_output_tready[1] <= 0;
+					res[1] = m_axis_output_tdata[1];
+				end
+			join
+			assert(res[0] == res[1]) else begin	// both DUTs must deliver the same result
+				$error("Output mismatch: %0x <=> %0x", res[0], res[1]);
+				$stop;
+			end
+			while($urandom()%7 < MW/SIMD) @(posedge clk);	// Occasional backpressure
+		end
+
+		$display("Test completed.");
+		$finish;
+	end
+
+endmodule : mvu_dsp58_tb

From 6e98bac42f225e7ed8629e0cb67211e78db61d15 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 13 Dec 2023 09:36:25 +0000
Subject: [PATCH 225/235] [rtlsim]: use pyverilator util functions

---
 src/finn/custom_op/fpgadataflow/hlscustomop.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index 4fed8ed4b5..01b94c20ca 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -31,7 +31,7 @@
 import subprocess
 import warnings
 from abc import abstractmethod
-from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io
+from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io, toggle_clk
 from qonnx.core.datatype import DataType
 from qonnx.custom_op.base import CustomOp
 from qonnx.util.basic import roundup_to_integer_multiple
@@ -491,15 +491,11 @@ def exec_precompiled_singlenode_model(self):
     def reset_rtlsim(self, sim):
         """Sets reset input in pyverilator to zero, toggles the clock and set it
         back to one"""
-        sim.io.ap_rst_n = 0
-        sim.io.ap_clk = 1
-        sim.io.ap_clk = 0
-        sim.io.ap_rst_n = 1
+        reset_rtlsim(sim)
 
     def toggle_clk(self, sim):
         """Toggles the clock input in pyverilator once."""
-        sim.io.ap_clk = 1
-        sim.io.ap_clk = 0
+        toggle_clk(sim)
 
     def hls_sname(self):
         """Get the naming convention used by Vitis HLS for stream signals

From 5dd74ad1dede3bf2a0405de8c803a4adfb2e65d3 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 8 Dec 2023 17:12:42 +0000
Subject: [PATCH 226/235] [mvu vvu axi]: sign extend output tdata
 (byte-aligned)

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 8eb92a93e6..699662bd72 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -243,6 +243,6 @@ module mvu_vvu_axi #(
 	end
 
 	assign	m_axis_output_tvalid = B.vld;
-	assign	m_axis_output_tdata  = B.dat;
+	assign	m_axis_output_tdata  = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat};
 
 endmodule : mvu_vvu_axi

From b20410bfd968c27395537b60bba11849b599a33a Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 Jan 2024 14:55:56 +0000
Subject: [PATCH 227/235] [mvu core]: dsp48 convert unpacked array to packed
 array to work around limitation on max array indices in Verilator

---
 finn-rtllib/mvu/mvu_4sx4u.sv       | 4 ++--
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 706347d700..7a2af35742 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -309,7 +309,7 @@ module mvu_4sx4u #(
 			// Conclusive high part accumulation
 			if(i >= PE_REM && i < 3) begin : genHi
 				// Adder Tree across all SIMD high contributions, each from [-1:1]
-				uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
+				uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0]  tree;
 				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s][i];
 				for(genvar  n = 0; n < SIMD-1; n++) begin
 					// Sum truncated to actual maximum bit width at this node
@@ -333,7 +333,7 @@ module mvu_4sx4u #(
 			if(i >= PE_REM) begin : blkLo
 				// Adder Tree across all SIMD low contributions
 				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
-				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
+				uwire [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
 				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
 				for(genvar  n = 0; n < SIMD-1; n++) begin
 					// Sum truncated to actual maximum bit width at this node
diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 07c44cf89a..1e6855f779 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -335,7 +335,7 @@ module mvu_8sx8u_dsp48 #(
 			if(i >= PE_REM) begin : blkLo
 				// Adder Tree across all SIMD low contributions
 				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
-				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
+				uwire [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
 				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
 				for(genvar  n = 0; n < SIMD-1; n++) begin
 					// Sum truncated to actual maximum bit width at this node

From 1c2cc0c2c1d98d7cde569f65eb20873a10e1f12f Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 Jan 2024 14:57:19 +0000
Subject: [PATCH 228/235] [mvu axi]: update list of deduced parameters

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 699662bd72..dd357c94bb 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -60,13 +60,14 @@ module mvu_vvu_axi #(
 	bit M_REG_LUT = 1,
 
 	// Safely deducible parameters
-	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
-	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
-	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned SF = MW/SIMD,
-	localparam int unsigned NF = IS_MVU ? MH/PE : 1,
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+	localparam int unsigned  WEIGHT_STREAM_WIDTH	= PE * SIMD * WEIGHT_WIDTH,
+	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA	= (WEIGHT_STREAM_WIDTH + 7) / 8 * 8,
+	localparam int unsigned  INPUT_STREAM_WIDTH	= (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
+	localparam int unsigned  INPUT_STREAM_WIDTH_BA	= (INPUT_STREAM_WIDTH + 7) / 8 * 8,
+	localparam int unsigned  OUTPUT_STREAM_WIDTH	= PE * ACCU_WIDTH,
+	localparam int unsigned  OUTPUT_STREAM_WIDTH_BA	= (OUTPUT_STREAM_WIDTH + 7) / 8 * 8,
+	localparam int unsigned  SF = MW / SIMD,
+	localparam int unsigned  NF = IS_MVU ? MH / PE : 1
 )
 (
 	// Global Control

From eeb3cea623865a13d8da78acb5a9c7fc621caf0e Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 Jan 2024 14:58:02 +0000
Subject: [PATCH 229/235] [mvu custom-op]: remove lut-based implementation and
 update compute core selection

---
 .../matrixvectoractivation_rtl.py             | 39 ++++++++++---------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index da560d73fd..fcab06658c 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -191,7 +191,12 @@ def verify_node(self):
 
         if mem_mode not in ["decoupled", "external"]:
             info_messages.append(
-                "RTL-based MVAU supports only decoupled or external weights."
+                "RTL-based MVU only supports decoupled or external weights."
+            )
+
+        if self.get_nodeattr("resType") == "lut":
+            info_message.append(
+                "RTL-based MVU only supports DSP-based implementation"
             )
 
         return info_messages
@@ -635,7 +640,6 @@ def execute_node(self, context, graph):
         mem_mode = self.get_nodeattr("mem_mode")
         node = self.onnx_node
 
-        # TODO ensure codegen dir exists
         if mode == "cppsim":
             raise Exception(
                 "cppsim not possible for RTL MVAU, please set exec_mode to rtlsim"
@@ -801,7 +805,6 @@ def code_generation_ipi(self):
                 rtllib_dir + "mvu_4sx4u.sv",
                 rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
                 rtllib_dir + "mvu_8sx8u_dsp48.sv",
-                rtllib_dir + "mvu_vvu_lut.sv",
             ]
             for f in sourcefiles:
                 cmd.append("add_files -norecurse %s" % (f))
@@ -897,7 +900,6 @@ def code_generation_ipi(self):
                 rtllib_dir + "mvu_4sx4u.sv",
                 rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
                 rtllib_dir + "mvu_8sx8u_dsp48.sv",
-                rtllib_dir + "mvu_vvu_lut.sv",
             ]
             for f in sourcefiles:
                 cmd.append("add_files -norecurse %s" % (f))
@@ -964,8 +966,8 @@ def derive_characteristic_fxns(self, period):
 
     def _resolve_segment_len(self, clk):
         # Insert pipeline registers in the DSP58 chain to meet target clock frequency
-        # 0.741 ns seems the worst-case delay through first DSP
-        # 0.605 ns seems to be (on average) delay for all subsequent DSPs
+        # ~0.741 ns seems the worst-case delay through first DSP
+        # ~0.605 ns seems to be (on average) delay for all subsequent DSPs
         # clk >= (critical_path_dsps - 1) * 0.605 + 0.741
         assert (clk > 0.741), "Infeasible clk target of {} ns has been set, consider lowering the targeted clock frequency!".format(clk)
         critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1)
@@ -976,22 +978,23 @@ def _resolve_segment_len(self, clk):
     def _resolve_impl_style(self, fpgapart):
         # Based on target device and activation/weight-width, choose the
         # supported RTL compute core
-        if self.get_nodeattr("resType") == "lut":
-            return "mvu_vvu_lut"
+        
+        assert self.get_nodeattr("resType") != "lut", "LUT-based RTL-MVU implementation currently not supported! Please change resType for {}".format(self.onnx_node.name)
+
+        act_width = self.get_input_datatype(0).bitwidth()
+        weight_width = self.get_input_datatype(1).bitwidth()
+        is_versal = (
+            fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
+            or fpgapart[0:5] == "xqrvc"
+        )
+        
+        if is_versal:
+            return "mvu_vvu_8sx9_dsp58"
         else:
-            act_width = self.get_input_datatype(0).bitwidth()
-            weight_width = self.get_input_datatype(1).bitwidth()
-            is_versal = (
-                fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
-                or fpgapart[0:5] == "xqrvc"
-            )
             if act_width == 4 and weight_width == 4:
                 return "mvu_4sx4u"
             else:
-                if is_versal:
-                    return "mvu_vvu_8sx9_dsp58"
-                else:
-                    return "mvu_8sx8u_dsp48"
+                return "mvu_8sx8u_dsp48"
 
     def generate_hdl(self, model, fpgapart, clk):
         # Generate params as part of IP preparation

From 0813d1463a219384b4666fad2db93a4f7dee1a0f Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 Jan 2024 14:59:30 +0000
Subject: [PATCH 230/235] [mvu axi]: remove LUT-based compute core

---
 finn-rtllib/mvu/mvu_vvu_axi.sv |  11 +---
 finn-rtllib/mvu/mvu_vvu_lut.sv | 104 ---------------------------------
 2 files changed, 2 insertions(+), 113 deletions(-)
 delete mode 100644 finn-rtllib/mvu/mvu_vvu_lut.sv

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index dd357c94bb..a3b051c9a1 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -120,8 +120,8 @@ module mvu_vvu_axi #(
 			end
 		end
 		if (!IS_MVU) begin
-			if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin
-				$error("VVU only supported on DSP58 or LUT-based implementation");
+			if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58") begin
+				$error("VVU only supported on DSP58");
 				$finish;
 			end
 		end
@@ -195,13 +195,6 @@ module mvu_vvu_axi #(
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
-	"mvu_vvu_lut":
-		mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-		.WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
-		);
 	default: initial begin
 		$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
 		$finish;
diff --git a/finn-rtllib/mvu/mvu_vvu_lut.sv b/finn-rtllib/mvu/mvu_vvu_lut.sv
deleted file mode 100644
index c100910d75..0000000000
--- a/finn-rtllib/mvu/mvu_vvu_lut.sv
+++ /dev/null
@@ -1,104 +0,0 @@
-module mvu_vvu_lut #(
-    bit IS_MVU,
-    int unsigned  PE,
-    int unsigned  SIMD,
-	int unsigned  ACCU_WIDTH,
-    int unsigned  ACTIVATION_WIDTH,
-    int unsigned  WEIGHT_WIDTH,
-    bit  SIGNED_ACTIVATIONS,
-    bit  M_REG = 1,
-
-    localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH,
-    localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
-)(
-	// Global Control
-	input	logic  clk,
-	input	logic  rst,
-	input	logic  en,
-
-	// Input
-	input	logic  last,
-	input	logic  zero,	// ignore current inputs and force this partial product to zero
-	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]             w,	// signed weights
-	input	logic        [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0]  a,	// (un)signed activations
-
-	// Ouput
-	output	logic  vld,
-	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
-);
-
-    typedef int unsigned  leave_load_t[2*SIMD-1];
-    function leave_load_t init_leave_loads();
-        automatic leave_load_t  res;
-        for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;
-        for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];
-        return res;
-    endfunction : init_leave_loads
-
-    // Pipeline for last indicator flag
-    uwire last_i;
-    generate if (M_REG) begin
-        logic [0:1] L = '0;
-        always_ff @(posedge clk) begin
-            if(rst)       L <= '0;
-            else if (en)  L <= {last, L[0]};
-        end
-        assign  last_i = L[1];
-    end
-    else begin 
-        logic L = '0;
-        always_ff @(posedge clk) begin
-            if(rst)       L <= '0;
-            else if (en)  L <= last;
-        end
-        assign  last_i = L;
-    end
-    endgenerate
-
-    // For each PE generate
-    for (genvar  i = 0; i < PE; i++)  begin : genPE
-        // Stage #1: SIMD multipliers in parallel
-        uwire [MULT_WIDTH-1 : 0] m1 [SIMD];
-        for (genvar j = 0; j < SIMD; j++) begin : genSIMD
-            if (M_REG) begin : genMreg
-                logic [MULT_WIDTH-1 : 0] M [SIMD];
-                always_ff @(posedge clk) begin
-                    if(rst)         M[j] = '{ default : 0 };
-                    else if (en)    M[j] = zero ? 0 :
-                                            SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
-                                                                 $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); 
-                    // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication
-                end
-                assign  m1[j] = M[j];
-            end : genMreg
-            else begin : genNoMreg 
-                assign m1[j] = zero ? 0 :
-                               SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
-                                                    $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]);
-            end : genNoMreg
-        end : genSIMD
-
-        // Stage #2: Adder tree to reduce SIMD products
-        localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 };
-        localparam int unsigned  ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1));
-        uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
-        for(genvar s = 0; s < SIMD; s++)  assign  tree[SIMD-1+s] = $signed(m1[s]);
-        for(genvar n = 0; n < SIMD-1; n++) begin
-            // Sum truncated to actual maximum bit width at this node
-            localparam int unsigned  NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1));
-            uwire signed [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
-            assign tree[n] = s;
-        end
-
-        // Stage #3: Buffer output
-        logic [ACCU_WIDTH-1:0] P2 [PE];
-        always_ff @(posedge clk) begin
-            if(rst)         P2[i] = '{ default : 0};
-            else if (en)    P2[i] = (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]);
-        end
-
-        assign  vld = last_i;
-        assign  p[i] = P2[i];
-    end : genPE
-
-endmodule : mvu_vvu_lut

From 4892d6614b734a08315062b86ec6d5e1f1af0dc1 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 11 Jan 2024 12:02:38 +0000
Subject: [PATCH 231/235] [hls custom-op]: enable reset in sim

---
 src/finn/custom_op/fpgadataflow/hlscustomop.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index 01b94c20ca..bc59c69192 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -600,6 +600,7 @@ def rtlsim_multi_io(self, sim, io_dict):
             trace_file=trace_file,
             sname=sname,
             liveness_threshold=pyverilate_get_liveness_threshold_cycles(),
+            do_reset=True,
         )
         self.set_nodeattr("cycles_rtlsim", total_cycle_count)
 

From 44f6e0f3e70eea06408b94a31e555f0f6b9ea358 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 11 Jan 2024 12:21:00 +0000
Subject: [PATCH 232/235] [test mvu rtl]: updated test flow (DSP58 only)

---
 .../test_fpgadataflow_mvau_rtl.py             | 167 +++++++++---------
 1 file changed, 87 insertions(+), 80 deletions(-)

diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
index 3db7a718f5..1e9de44fb2 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
@@ -27,141 +27,148 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
-import os
 
 import numpy as np
+import os
+import pickle
 from onnx import TensorProto, helper
-from qonnx.util.basic import (
-    qonnx_make_model,
-    gen_finn_dt_tensor
-)
-from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.core.datatype import DataType
-from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import ApplyConfig, GiveUniqueNodeNames, GiveReadableTensorNames
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
+
 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
+
+
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-from qonnx.transformation.general import ApplyConfig
-import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
-#import qonnx.core.data_layout as DataLayout
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 
 build_dir = os.environ["FINN_BUILD_DIR"]
 
-def make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt):
-    (ofm_h, ofm_w) = ofm_shape
-    ofm = helper.make_tensor_value_info(
-        "ofm",
-        TensorProto.FLOAT,
-        (1, ofm_h, ofm_w, mh)
-    )
-
-    matmul_node = helper.make_node(
-        "MatMul",
-        ["ifm", "weights"],
-        ["ofm"]
-    )
-    graph = helper.make_graph(
-        nodes=[matmul_node],
-        name="matmul_graph",
-        inputs=[ifm],
-        outputs=[ofm]
-    )
+
+def make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W):
+    matmul_node = helper.make_node("MatMul", ["ifm", "weights"], ["ofm"])
+    graph = helper.make_graph(nodes=[matmul_node], name="matmul_graph", inputs=[ifm], outputs=[ofm])
 
     model = qonnx_make_model(graph, producer_name="fclayer-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("ifm", idt)
     model.set_tensor_datatype("weights", wdt)
-    model.set_tensor_datatype("ofm", DataType["INT32"]) # At this step, the MatMul layer does not optimize the bit-width of the output datatype
+    model.set_tensor_datatype(
+        "ofm", DataType["INT32"]
+    )  # At this step, the MatMul layer does not optimize the bit-width of the output datatype
     model.set_initializer("weights", W)
-
     # model.set_tensor_layout("ifm", DataLayout.NHWC)
 
     return model
 
+
 def prepare_inputs(input_tensor):
-    return {"inp": input_tensor}
+    return {"global_in": input_tensor}
+
 
-@pytest.mark.parametrize("mh", [16])
-@pytest.mark.parametrize("mw", [32])
-@pytest.mark.parametrize("pe", [1, 4, 16])
-#@pytest.mark.parametrize("simd", [1, 30, 90])
-@pytest.mark.parametrize("simd", [1, 4, 32])
+# @pytest.mark.parametrize("mh", [36])
+# @pytest.mark.parametrize("mw", [256])
+@pytest.mark.parametrize("mh", [9])
+@pytest.mark.parametrize("mw", [36])
+# @pytest.mark.parametrize("pe", [1, 4, 9, 36])
+# @pytest.mark.parametrize("simd", [1, 4, 16, 64, 256])
+@pytest.mark.parametrize("pe", [1, 3, 9])
+@pytest.mark.parametrize("simd", [1, 3, 6, 18, 36])
 @pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
-@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]])
-#@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"])
-@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"])
-@pytest.mark.parametrize("segmentlen", [1])
+@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT8"]])
+# @pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e"])
+@pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S"])
+@pytest.mark.parametrize("clk_ns", [1.66, 4])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen):
+def test_fpgadataflow_mvau_rtl(
+    mh, mw, pe, simd, idt, wdt, part, clk_ns
+):
+    if part == "xcku3p-ffva676-1-e" and clk_ns != 1.66:
+        pytest.skip("Skip test for varying clk for devices other than Versal, since this variable doesn't change anything for this test")
+
     # Create test input vector (produced by SWG)
     ofm_shape = (5, 5)
     ofm_h, ofm_w = ofm_shape
-    ifm = helper.make_tensor_value_info(
-        "ifm",
-        TensorProto.FLOAT,
-        [1, ofm_h, ofm_w, mw]
-    )
-    weights = helper.make_tensor_value_info(
-        "weights",
-        TensorProto.FLOAT,
-        [mw, mh]
-    )
+    ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw])
+    ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh))
     W = gen_finn_dt_tensor(wdt, (mw, mh))
-    model = make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt)
+    model = make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W)
     model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
 
-    model.save(build_dir+"/matmul.onnx")
+    model.save(build_dir + "/matmul.onnx")
 
     # Create MatMul & obtain golden reference output
-    A = gen_finn_dt_tensor(model.get_tensor_datatype("ifm"), model.get_tensor_shape("ifm"))
+    A = gen_finn_dt_tensor(model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in"))
     input_dict = prepare_inputs(A)
 
-    ## Execute ONNX model
-    output_matmul = oxe.execute_onnx(model, input_dict)
+    # Execute ONNX model
+    output_matmul = oxe.execute_onnx(model, input_dict)["global_out"]
+
+    with open(build_dir + "/onnx_output.pkl", "wb") as f:
+        pickle.dump(output_matmul, f)
 
     # Create MVAU (HLS)
     model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled"))
     model = model.transform(GiveUniqueNodeNames())
-    
+
     # Apply folding (i.e. specify to use DSPs)
     folding_config = {
         "Defaults": {},
         "MatrixVectorActivation_0": {
-            "PE" : pe,
-            "SIMD" : simd,
-            "mem_mode" : "decoupled",
-            "ram_style" : "auto",
-            "resType" : "dsp",
-            "impl" : "rtl"
-        }
+            "PE": pe,
+            "SIMD": simd,
+            "mem_mode": "decoupled",
+            "ram_style": "auto",
+            "resType": "dsp",
+            "preferred_backend" : "rtl"
+        },
     }
     model = model.transform(ApplyConfig(folding_config))
-    model.save(build_dir+"/mvau_hls.onnx")
-
-    model = model.transform(SetExecMode("rtlsim"))
-    model = model.transform(PrepareIP(part, 5))
-    model = model.transform(HLSSynthIP())
-    model = model.transform(PrepareRTLSim())
-    output_mvau_hls = oxe.execute_onnx(model, input_dict)["ofm"]
+    model.save(build_dir + "/mvau_hls.onnx")
 
     # Apply convert-to-rtl step
     model = model.transform(to_rtl.InferRTLMatrixVectorActivation())
     model = model.transform(GiveUniqueNodeNames())
-    model.save(build_dir+"/mvau_rtl.onnx")
+    model.save(build_dir + "/mvau_rtl.onnx")
 
+    # Request a VCD trace file from every node for the node-by-node rtlsim
+    for n in model.graph.node:
+        getCustomOp(n).set_nodeattr("rtlsim_trace", build_dir + "/mvu_trace_rtl_nodebynode.vcd")
+    
     model = model.transform(SetExecMode("rtlsim"))
-    model = model.transform(PrepareIP("xcvm1802-vsvd1760-2MP-e-S", 5))
+    model = model.transform(PrepareIP(part, clk_ns))
     model = model.transform(HLSSynthIP())
     model = model.transform(PrepareRTLSim())
-    output_mvau_rtl = oxe.execute_onnx(model, input_dict)["ofm"]
+    output_mvau_rtl = oxe.execute_onnx(model, input_dict)["global_out"]
+
+    with open(build_dir + "/mvau_rtl_output.pkl", "wb") as f:
+        pickle.dump(output_mvau_rtl, f)
+
+    model.save(build_dir + "/mvau_rtl_sim.onnx")
+    assert (output_matmul == output_mvau_rtl).all(), "Output of ONNX model not matching output of node-by-node sim!"
+
+    model = model.transform(InsertAndSetFIFODepths(part, clk_ns))
+    model = model.transform(PrepareIP(part, clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(CreateStitchedIP(part, clk_ns))
 
-    model.save(build_dir+"/mvau_rtl_sim.onnx")
+    os.environ["RTLSIM_TRACE_DEPTH"] = "3"
+    model.set_metadata_prop("rtlsim_so", "")
+    model.set_metadata_prop("exec_mode", "rtlsim")
+    model.set_metadata_prop("rtlsim_trace", build_dir + "/mvu_trace_rtl_stitch.vcd")
+    model.save(build_dir + "/stitched_ip.onnx")
+    output_mvau_rtl_stitch = oxe.execute_onnx(model, input_dict)["global_out"]
 
-    assert (output_mvau_hls == output_mvau_rtl).all()
-    assert (output_mvau_hls.size > 0)
+    assert (output_matmul == output_mvau_rtl_stitch).all(), "Output of ONNX model not matching output of stitched-IP RTL model!"
\ No newline at end of file

From 9b2ccebba2c3689d6a1e55b6df027f461244d216 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 11 Jan 2024 14:43:46 +0000
Subject: [PATCH 233/235] [mvu vvu axi]: reworked flow control and backpressure
 handling by tpreusser

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 130 ++++++++++++++++-----------------
 1 file changed, 61 insertions(+), 69 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index a3b051c9a1..0168f20563 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -62,12 +62,12 @@ module mvu_vvu_axi #(
 	// Safely deducible parameters
 	localparam int unsigned  WEIGHT_STREAM_WIDTH	= PE * SIMD * WEIGHT_WIDTH,
 	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA	= (WEIGHT_STREAM_WIDTH + 7) / 8 * 8,
-	localparam int unsigned  INPUT_STREAM_WIDTH	= (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
+	localparam int unsigned  INPUT_STREAM_WIDTH	= SIMD * ACTIVATION_WIDTH,
 	localparam int unsigned  INPUT_STREAM_WIDTH_BA	= (INPUT_STREAM_WIDTH + 7) / 8 * 8,
 	localparam int unsigned  OUTPUT_STREAM_WIDTH	= PE * ACCU_WIDTH,
 	localparam int unsigned  OUTPUT_STREAM_WIDTH_BA	= (OUTPUT_STREAM_WIDTH + 7) / 8 * 8,
 	localparam int unsigned  SF = MW / SIMD,
-	localparam int unsigned  NF = IS_MVU ? MH / PE : 1
+	localparam int unsigned  NF = MH / PE
 )
 (
 	// Global Control
@@ -119,81 +119,73 @@ module mvu_vvu_axi #(
 				$finish;
 			end
 		end
-		if (!IS_MVU) begin
-			if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58") begin
-				$error("VVU only supported on DSP58");
-				$finish;
-			end
-		end
 	end
 
 	uwire clk = ap_clk;
 	uwire rst = !ap_rst_n;
 
-	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;
-
-	uwire mvauin_t amvau;
+	//- Replay to Accommodate Neuron Fold -----------------------------------
+	typedef logic [PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t;
+	uwire mvu_flatin_t amvau;
 	uwire alast;
 	uwire afin;
 	uwire avld;
 	uwire ardy;
 
-	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay (
+	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvu_flatin_t))) activation_replay (
 	.clk, .rst,
-	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
+	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)),
 	.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
 	);
 
-//-------------------- Input control --------------------\\
+	//- Unflatten inputs into structured matrices ---------------------------
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH    -1:0]  mvu_w_t;
+	typedef logic         [SIMD-1:0][ACTIVATION_WIDTH-1:0]  mvu_a_t;
+
+	uwire  mvu_w_t  mvu_w = s_axis_weights_tdata;
+	uwire  mvu_a_t  mvu_a = amvau;
+
+	//- Flow Control Bracket around Compute Core ----------------------------
 	uwire en;
 	uwire istb = avld && s_axis_weights_tvalid;
 	assign ardy = en && s_axis_weights_tvalid;
 	assign s_axis_weights_tready = en && avld;
 
-//-------------------- Core MVU/VVU --------------------\\
-	uwire ovld;
-	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
-	uwire mvauin_t amvau_i;
-	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
-
-	if (IS_MVU) begin : genMVUInput
-		assign  amvau_i = amvau;
-	end : genMVUInput
-	else begin : genVVUInput
-		// The input stream will have the channels interleaved for VVU when PE>1
-		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
-		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
-		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
-		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i)
-		localparam int num_of_elements = PE*SIMD;
-		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
-			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
-									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH -1: (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
-									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
-		end : genRewire
-	end : genVVUInput
+	//- Instantiate compute core ----------------------------
+	typedef logic [PE-1:0][ACCU_WIDTH-1:0]  dsp_p_t;
+	uwire dsp_vld;
+	uwire dsp_p_t  dsp_p;
+
+	uwire dsp_clk = ap_clk;
+	uwire dsp_en = en;
+	uwire dsp_last = alast && avld;
+	uwire dsp_zero = !istb;
+	uwire mvu_w_t dsp_w = mvu_w;
+	uwire mvu_a_t dsp_a = mvu_a;
+	uwire ovld = dsp_vld;
+	uwire dsp_p_t  odat = dsp_p;
 
 	case(COMPUTE_CORE)
 	"mvu_vvu_8sx9_dsp58":
 		mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(s_axis_weights_tdata), .a(amvau_i),
-			.vld(ovld), .p(odat)
+			.clk(dsp_clk), .rst, .en(dsp_en),
+			.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+			.vld(dsp_vld), .p(dsp_p)
 		);
 	"mvu_4sx4u":
 		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
+			.clk(dsp_clk), .rst, .en(dsp_en),
+			.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+			.vld(dsp_vld), .p(dsp_p)
 		);
 	"mvu_8sx8u_dsp48":
 		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
+			.clk(dsp_clk), .rst, .en(dsp_en),
+			.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+			.vld(dsp_vld), .p(dsp_p)
 		);
 	default: initial begin
 		$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
@@ -202,41 +194,41 @@ module mvu_vvu_axi #(
 	endcase
 
 //-------------------- Output register slice --------------------\\
+	// Make `en` computation independent from external inputs.
+	// Drive all outputs from registers.
 	struct packed {
-		logic vld;
+		logic rdy;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} A = '{ vld: 0, default: 'x};
-
-	assign en = !A.vld || !ovld;
-
-	uwire  b_load;
-	always_ff @(posedge clk) begin
-		if(rst)		A <= '{ vld: 0, default: 'x };
-		else if(!A.vld || b_load) begin
-			A.vld <= ovld && en;
-			for(int unsigned  i = 0; i < PE; i++) begin
-				// CR-1148862:
-				// A.dat[i] <= odat[i];
-				automatic logic [ACCU_WIDTH-1:0]  v = odat[i];
-				A.dat[i] <= v[ACCU_WIDTH-1:0];
-			end
-		end
-	end
-
+	}  A = '{ rdy: 1, default: 'x };	// side-step register used when encountering backpressure
 	struct packed {
 		logic vld;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} B = '{ vld: 0, default: 'x};
+	}  B = '{ vld: 0, default: 'x };	// ultimate output register
+
+	assign	en = A.rdy;
+	uwire  b_load = !B.vld || m_axis_output_tready;
 
-	assign	b_load = !B.vld || m_axis_output_tready;
 	always_ff @(posedge clk) begin
-		if(rst)		B <= '{ vld: 0, default: 'x };
+		if(rst) begin
+			A <= '{ rdy: 1, default: 'x };
+			B <= '{ vld: 0, default: 'x };
+		end
 		else begin
-			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
+			if(A.rdy)  A.dat <= odat;
+			A.rdy <= (A.rdy && !ovld) || b_load;
+
+			if(b_load) begin
+				B <= '{
+					vld: ovld || !A.rdy,
+					dat: A.rdy? odat : A.dat
+				};
+			end
 		end
 	end
-
 	assign	m_axis_output_tvalid = B.vld;
+	// Why would we need a sign extension here potentially creating a higher signal load into the next FIFO?
+	// These extra bits should never be used. Why not 'x them out?
 	assign	m_axis_output_tdata  = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat};
 
+
 endmodule : mvu_vvu_axi

From ee9f027592e0f28deeab5cbe8d008f3be6076c92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 31 Jan 2024 09:59:17 +0000
Subject: [PATCH 234/235] Adding DSP48E1 support for 8-bit compute. Todo: finer
 core differentiation to select DSP48E2 explicitly again.

---
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 165 ++++++++++++++++++++++++-----
 1 file changed, 139 insertions(+), 26 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 1e6855f779..f3cde9dea9 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -5,10 +5,9 @@ module mvu_8sx8u_dsp48 #(
 	int unsigned  ACTIVATION_WIDTH,
 	int unsigned  WEIGHT_WIDTH,
 
+	int unsigned  VERSION = 1,
 	bit  SIGNED_ACTIVATIONS = 0,
-	bit  FORCE_BEHAVIORAL = 0,
-
-	localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH
+	bit  FORCE_BEHAVIORAL = 0
 )(
 	// Global Control
 	input	logic  clk,
@@ -49,6 +48,7 @@ module mvu_8sx8u_dsp48 #(
 	assign	vld = L[5];
 
 	// Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
+	localparam int unsigned  SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH;
 	localparam int unsigned  D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets
 
 	localparam int unsigned  PIPE_COUNT = (PE+1)/2;
@@ -63,8 +63,8 @@ module mvu_8sx8u_dsp48 #(
 		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
 
 			// Input Lane Assembly
-			uwire [23:0]  bb = { {(24-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] };
-			logic [33:0]  aa;
+			uwire [17:0]  bb = { {(18-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] };
+			logic [29:0]  aa;
 			logic [26:0]  dd;
 			logic [ 1:0]  xx;
 			if(1) begin : blkVectorize
@@ -99,14 +99,14 @@ module mvu_8sx8u_dsp48 #(
 				end
 			end : blkVectorize
 
-			uwire [57:0]  pp;
+			uwire [47:0]  pp;
 
 			// Note: Since the product B * AD is computed,
 			//       rst can be only applied to AD and zero only to B
 			//       with the same effect as zeroing both.
 			if(BEHAVIORAL) begin : genBehav
 				// Stage #1: Input Refine
-				logic signed [23:0]  B1  = 0;
+				logic signed [17:0]  B1  = 0;
 				always_ff @(posedge clk) begin
 					if(zero)     B1  <= 0;
 					else if(en)  B1  <= bb;
@@ -119,7 +119,7 @@ module mvu_8sx8u_dsp48 #(
 				end
 
 				// Stage #2: Multiply
-				logic signed [50:0]  M2 = 0;
+				logic signed [45:0]  M2 = 0;
 				always_ff @(posedge clk) begin
 					if(rst)      M2 <= 0;
 					else if(en)  M2 <=
@@ -130,7 +130,7 @@ module mvu_8sx8u_dsp48 #(
 				end
 
 				// Stage #3: Accumulate
-				logic signed [57:0]  P3 = 0;
+				logic signed [47:0]  P3 = 0;
 				always_ff @(posedge clk) begin
 					if(rst)      P3 <= 0;
 					else if(en)  P3 <= M2 + (L[3]? 0 : P3);
@@ -140,7 +140,115 @@ module mvu_8sx8u_dsp48 #(
 			end : genBehav
 `ifndef VERILATOR
 			else begin : genDSP
-				DSP48E2 #(
+				localparam logic [6:0]  OPMODE_INVERSION = 7'b010_01_01;
+				uwire [6:0]  opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 };
+				case(VERSION)
+				1: DSP48E1 #(
+					// Feature Control Attributes: Data Path Selection
+					.A_INPUT("DIRECT"),		// Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+					.B_INPUT("DIRECT"),		// Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+					.USE_DPORT("TRUE"),		// Select D port usage (TRUE or FALSE)
+					.USE_MULT("MULTIPLY"),	// Select multiplier usage ("MULTIPLY", "DYNAMIC", or "NONE")
+					.USE_SIMD("ONE48"),		// SIMD selection ("ONE48", "TWO24", "FOUR12")
+
+					// Pattern Detector Attributes: Pattern Detection Configuration
+					.AUTORESET_PATDET("NO_RESET"),		// "NO_RESET", "RESET_MATCH", "RESET_NOT_MATCH"
+					.MASK('1),							// 48-bit mask value for pattern detect (1=ignore)
+					.PATTERN('0),						// 48-bit pattern match for pattern detect
+					.SEL_MASK("MASK"),					// "C", "MASK", "ROUNDING_MODE1", "ROUNDING_MODE2"
+					.SEL_PATTERN("PATTERN"),			// Select pattern value ("PATTERN" or "C")
+					.USE_PATTERN_DETECT("NO_PATDET"),	// Enable pattern detect ("PATDET" or "NO_PATDET")
+
+					// Register Control Attributes: Pipeline Register Configuration
+					.ACASCREG(0),		// Number of pipeline stages between A/ACIN and ACOUT (0, 1 or 2)
+					.ADREG(1),			// Number of pipeline stages for pre-adder (0 or 1)
+					.ALUMODEREG(0),		// Number of pipeline stages for ALUMODE (0 or 1)
+					.AREG(0),			// Number of pipeline stages for A (0, 1 or 2)
+					.BCASCREG(1),		// Number of pipeline stages between B/BCIN and BCOUT (0, 1 or 2)
+					.BREG(1),			// Number of pipeline stages for B (0, 1 or 2)
+					.CARRYINREG(0),		// Number of pipeline stages for CARRYIN (0 or 1)
+					.CARRYINSELREG(0),	// Number of pipeline stages for CARRYINSEL (0 or 1)
+					.CREG(0),			// Number of pipeline stages for C (0 or 1)
+					.DREG(0),			// Number of pipeline stages for D (0 or 1)
+					.INMODEREG(0),		// Number of pipeline stages for INMODE (0 or 1)
+					.MREG(1),			// Number of multiplier pipeline stages (0 or 1)
+					.OPMODEREG(1),		// Number of pipeline stages for OPMODE (0 or 1)
+					.PREG(1)			// Number of pipeline stages for P (0 or 1)
+				) dsp (
+					// Cascade: 30-bit (each) output: Cascade Ports
+					.ACOUT(),			// 30-bit output: A port cascade output
+					.BCOUT(),			// 18-bit output: B port cascade output
+					.CARRYCASCOUT(),	// 1-bit output: Cascade carry output
+					.MULTSIGNOUT(),		// 1-bit output: Multiplier sign cascade output
+					.PCOUT(),			// 48-bit output: Cascade output
+
+					// Control: 1-bit (each) output: Control Inputs/Status Bits
+					.OVERFLOW(),		 // 1-bit output: Overflow in add/acc output
+					.PATTERNBDETECT(),	 // 1-bit output: Pattern bar detect output
+					.PATTERNDETECT(),	 // 1-bit output: Pattern detect output
+					.UNDERFLOW(),		 // 1-bit output: Underflow in add/acc output
+
+					// Data: 4-bit (each) output: Data Ports
+					.CARRYOUT(),	// 4-bit output: Carry output
+					.P(pp),			// 48-bit output: Primary data output
+
+					// Cascade: 30-bit (each) input: Cascade Ports
+					.ACIN('x),			 // 30-bit input: A cascade data input
+					.BCIN('x),			 // 18-bit input: B cascade input
+					.CARRYCASCIN('x),	 // 1-bit input: Cascade carry input
+					.MULTSIGNIN('x),	 // 1-bit input: Multiplier sign input
+					.PCIN('x),			 // 48-bit input: P cascade input
+
+					// Control: 4-bit (each) input: Control Inputs/Status Bits
+					.CLK(clk),				// 1-bit input: Clock input
+					.ALUMODE('0),			// 4-bit input: ALU control input
+					.CARRYINSEL('0),		// 3-bit input: Carry select input
+					.INMODE(5'b01100),		// 5-bit input: INMODE control input
+					.OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input
+
+					// Data: 30-bit (each) input: Data Ports
+					.A(aa),			// 30-bit input: A data input
+					.B(bb),			// 18-bit input: B data input
+					.C('x),			// 48-bit input: C data input
+					.CARRYIN('0),	// 1-bit input: Carry input signal
+					.D(dd),			// 25-bit input: D data input
+
+					// Reset/Clock Enable: 1-bit (each) input: Reset/Clock Enable Inputs
+					.CEA1('0),			// 1-bit input: Clock enable input for 1st stage AREG
+					.CEA2('0),			// 1-bit input: Clock enable input for 2nd stage AREG
+					.CEAD(en),			// 1-bit input: Clock enable input for ADREG
+					.CEALUMODE('0),		// 1-bit input: Clock enable input for ALUMODEREG
+					.CEB1('0),			// 1-bit input: Clock enable input for 1st stage BREG
+					.CEB2(en),			// 1-bit input: Clock enable input for 2nd stage BREG
+					.CEC('0),			// 1-bit input: Clock enable input for CREG
+					.CECARRYIN('0),		// 1-bit input: Clock enable input for CARRYINREG
+					.CECTRL(en),		// 1-bit input: Clock enable input for OPMODEREG and CARRYINSELREG
+					.CED('0),			// 1-bit input: Clock enable input for DREG
+					.CEINMODE('0),		// 1-bit input: Clock enable input for INMODEREG
+					.CEM(en),			// 1-bit input: Clock enable input for MREG
+					.CEP(en),			// 1-bit input: Clock enable input for PREG
+					.RSTA('0),			// 1-bit input: Reset input for AREG
+					.RSTB(				// 1-bit input: Reset for BREG
+// synthesis translate_off
+						rst ||
+// synthesis translate_on
+						zero
+					),
+					.RSTC('0),			// 1-bit input: Reset for CREG
+					.RSTD(				// 1-bit input: Reset for DREG and ADREG
+// synthesis translate_off
+						zero ||
+// synthesis translate_on
+						rst
+					),
+					.RSTALLCARRYIN('0),	// 1-bit input: Reset for CARRYINREG
+					.RSTALUMODE('0),	// 1-bit input: Reset for ALUMODEREG
+					.RSTCTRL('0),		// 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+					.RSTINMODE('0),		// 1-bit input: Reset for INMODE register
+					.RSTM(rst),			// 1-bit input: Reset for MREG
+					.RSTP(rst)			// 1-bit input: Reset for PREG
+				);
+				2: DSP48E2 #(
 					// Feature Control Attributes: Data Path Selection
 					.AMULTSEL("AD"),	// Selects A input to multiplier (A, AD)
 					.A_INPUT("DIRECT"),	// Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
@@ -163,21 +271,21 @@ module mvu_8sx8u_dsp48 #(
 					.USE_PATTERN_DETECT("NO_PATDET"),  // Enable pattern detect (NO_PATDET, PATDET)
 
 					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
-					.IS_ALUMODE_INVERTED('0),				// Optional inversion for ALUMODE
-					.IS_CARRYIN_INVERTED('0),				// Optional inversion for CARRYIN
-					.IS_CLK_INVERTED('0),					// Optional inversion for CLK
-					.IS_INMODE_INVERTED('0),				// Optional inversion for INMODE
-					.IS_OPMODE_INVERTED(9'b00_010_01_01),	// Optional inversion for OPMODE
-					.IS_RSTALLCARRYIN_INVERTED('0),			// Optional inversion for RSTALLCARRYIN
-					.IS_RSTALUMODE_INVERTED('0),			// Optional inversion for RSTALUMODE
-					.IS_RSTA_INVERTED('0),					// Optional inversion for RSTA
-					.IS_RSTB_INVERTED('0),					// Optional inversion for RSTB
-					.IS_RSTCTRL_INVERTED('0),				// Optional inversion for STCONJUGATE_A
-					.IS_RSTC_INVERTED('0),					// Optional inversion for RSTC
-					.IS_RSTD_INVERTED('0),					// Optional inversion for RSTD
-					.IS_RSTINMODE_INVERTED('0),				// Optional inversion for RSTINMODE
-					.IS_RSTM_INVERTED('0),					// Optional inversion for RSTM
-					.IS_RSTP_INVERTED('0),					// Optional inversion for RSTP
+					.IS_ALUMODE_INVERTED('0),							// Optional inversion for ALUMODE
+					.IS_CARRYIN_INVERTED('0),							// Optional inversion for CARRYIN
+					.IS_CLK_INVERTED('0),								// Optional inversion for CLK
+					.IS_INMODE_INVERTED('0),							// Optional inversion for INMODE
+					.IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}),	// Optional inversion for OPMODE
+					.IS_RSTALLCARRYIN_INVERTED('0),						// Optional inversion for RSTALLCARRYIN
+					.IS_RSTALUMODE_INVERTED('0),						// Optional inversion for RSTALUMODE
+					.IS_RSTA_INVERTED('0),								// Optional inversion for RSTA
+					.IS_RSTB_INVERTED('0),								// Optional inversion for RSTB
+					.IS_RSTCTRL_INVERTED('0),							// Optional inversion for RSTCTRL
+					.IS_RSTC_INVERTED('0),								// Optional inversion for RSTC
+					.IS_RSTD_INVERTED('0),								// Optional inversion for RSTD
+					.IS_RSTINMODE_INVERTED('0),							// Optional inversion for RSTINMODE
+					.IS_RSTM_INVERTED('0),								// Optional inversion for RSTM
+					.IS_RSTP_INVERTED('0),								// Optional inversion for RSTP
 
 					// Register Control Attributes: Pipeline Register Configuration
 					.ACASCREG(0),                      // Number of pipeline stages between A/ACIN and ACOUT (0-2)
@@ -225,7 +333,7 @@ module mvu_8sx8u_dsp48 #(
 					.ALUMODE(4'h0),				// 4-bit input: ALU control
 					.CARRYINSEL('0),			// 3-bit input: Carry select
 					.INMODE(5'b01100),			// 5-bit input: INMODE control
-					.OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }),	// 9-bit input: Operation mode
+					.OPMODE({ 2'b00, opmode }),	// 9-bit input: Operation mode
 
 					// Data inputs: Data Ports
 					.A(aa),						// 34-bit input: A data
@@ -269,6 +377,11 @@ module mvu_8sx8u_dsp48 #(
 					.RSTM(rst),			// 1-bit input: Reset for MREG
 					.RSTP(rst)			// 1-bit input: Reset for PREG
 				);
+				default: initial begin
+					$error("Unknown version DSP48E%0d.", VERSION);
+					$finish;
+				end
+				endcase
 			end : genDSP
 `endif
 

From 3ab82966e1af64aa6ddb75f88561c5e6c86196b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 31 Jan 2024 10:15:32 +0000
Subject: [PATCH 235/235] Adding DSP48E1 support for 4-bit compute. Todo: finer
 core differentiation to select DSP48E2 explicitly again.

---
 finn-rtllib/mvu/mvu_4sx4u.sv | 169 +++++++++++++++++++++++++++++------
 1 file changed, 142 insertions(+), 27 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 7a2af35742..b49315637f 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -2,8 +2,10 @@ module mvu_4sx4u #(
 	int unsigned  PE,
 	int unsigned  SIMD,
 	int unsigned  ACCU_WIDTH,
-	bit SIGNED_ACTIVATIONS = 0,
-	bit FORCE_BEHAVIORAL = 0
+
+	int unsigned  VERSION = 1,
+	bit  SIGNED_ACTIVATIONS = 0,
+	bit  FORCE_BEHAVIORAL = 0
 )(
 	// Global Control
 	input	logic  clk,
@@ -14,7 +16,7 @@ module mvu_4sx4u #(
 	input	logic  last,
 	input	logic  zero,	// ignore current inputs and force this partial product to zero
 	input	logic signed [PE-1:0][SIMD-1:0][3:0]  w,	// signed weights
-	input	logic                [SIMD-1:0][3:0]  a,	// unsigned activations
+	input	logic                [SIMD-1:0][3:0]  a,	// unsigned activations (interpreted as signed if SIGNED_ACTIVATIONS is set)
 
 	// Ouput
 	output	logic  vld,
@@ -58,8 +60,8 @@ module mvu_4sx4u #(
 		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
 
 			// Input Lane Assembly
-			uwire [23:0]  bb = { {(20){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] };
-			logic [33:0]  aa;
+			uwire [17:0]  bb = { {(14){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] };
+			logic [29:0]  aa;
 			logic [26:0]  dd;
 			logic [ 1:0]  xx[3:1];
 			if(1) begin : blkVectorize
@@ -94,14 +96,14 @@ module mvu_4sx4u #(
 				end
 			end : blkVectorize
 
-			uwire [57:0]  pp;
+			uwire [47:0]  pp;
 
 			// Note: Since the product B * AD is computed,
 			//       rst can be only applied to AD and zero only to B
 			//       with the same effect as zeroing both.
-			if (BEHAVIORAL) begin : genBehav
+			if(BEHAVIORAL) begin : genBehav
 				// Stage #1: Input Refine
-				logic signed [23:0]  B1  = 0;
+				logic signed [17:0]  B1  = 0;
 				always_ff @(posedge clk) begin
 					if(zero)     B1  <= 0;
 					else if(en)  B1  <= bb;
@@ -114,7 +116,7 @@ module mvu_4sx4u #(
 				end
 
 				// Stage #2: Multiply
-				logic signed [50:0]  M2 = 0;
+				logic signed [45:0]  M2 = 0;
 				always_ff @(posedge clk) begin
 					if(rst)      M2 <= 0;
 					else if(en)  M2 <=
@@ -125,7 +127,7 @@ module mvu_4sx4u #(
 				end
 
 				// Stage #3: Accumulate
-				logic signed [57:0]  P3 = 0;
+				logic signed [47:0]  P3 = 0;
 				always_ff @(posedge clk) begin
 					if(rst)      P3 <= 0;
 					else if(en)  P3 <= M2 + (L[3]? 0 : P3);
@@ -135,7 +137,115 @@ module mvu_4sx4u #(
 			end : genBehav
 `ifndef VERILATOR
 			else begin : genDSP
-				DSP48E2 #(
+				localparam logic [6:0]  OPMODE_INVERSION = 7'b010_01_01;
+				uwire [6:0]  opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 };
+				case(VERSION)
+				1: DSP48E1 #(	// DSP48E1 variant, selected for VERSION == 1
+					// Feature Control Attributes: Data Path Selection
+					.A_INPUT("DIRECT"),		// Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+					.B_INPUT("DIRECT"),		// Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+					.USE_DPORT("TRUE"),		// Select D port usage (TRUE or FALSE)
+					.USE_MULT("MULTIPLY"),	// Select multiplier usage ("MULTIPLY", "DYNAMIC", or "NONE")
+					.USE_SIMD("ONE48"),		// SIMD selection ("ONE48", "TWO24", "FOUR12")
+
+					// Pattern Detector Attributes: Pattern Detection Configuration
+					.AUTORESET_PATDET("NO_RESET"),		// "NO_RESET", "RESET_MATCH", "RESET_NOT_MATCH"
+					.MASK('1),							// 48-bit mask value for pattern detect (1=ignore)
+					.PATTERN('0),						// 48-bit pattern match for pattern detect
+					.SEL_MASK("MASK"),					// "C", "MASK", "ROUNDING_MODE1", "ROUNDING_MODE2"
+					.SEL_PATTERN("PATTERN"),			// Select pattern value ("PATTERN" or "C")
+					.USE_PATTERN_DETECT("NO_PATDET"),	// Enable pattern detect ("PATDET" or "NO_PATDET")
+
+					// Register Control Attributes: Pipeline Register Configuration
+					.ACASCREG(0),		// Number of pipeline stages between A/ACIN and ACOUT (0, 1 or 2)
+					.ADREG(1),			// Number of pipeline stages for pre-adder (0 or 1)
+					.ALUMODEREG(0),		// Number of pipeline stages for ALUMODE (0 or 1)
+					.AREG(0),			// Number of pipeline stages for A (0, 1 or 2)
+					.BCASCREG(1),		// Number of pipeline stages between B/BCIN and BCOUT (0, 1 or 2)
+					.BREG(1),			// Number of pipeline stages for B (0, 1 or 2)
+					.CARRYINREG(0),		// Number of pipeline stages for CARRYIN (0 or 1)
+					.CARRYINSELREG(0),	// Number of pipeline stages for CARRYINSEL (0 or 1)
+					.CREG(0),			// Number of pipeline stages for C (0 or 1)
+					.DREG(0),			// Number of pipeline stages for D (0 or 1)
+					.INMODEREG(0),		// Number of pipeline stages for INMODE (0 or 1)
+					.MREG(1),			// Number of multiplier pipeline stages (0 or 1)
+					.OPMODEREG(1),		// Number of pipeline stages for OPMODE (0 or 1)
+					.PREG(1)			// Number of pipeline stages for P (0 or 1)
+				) dsp (
+					// Cascade: 30-bit (each) output: Cascade Ports
+					.ACOUT(),			// 30-bit output: A port cascade output
+					.BCOUT(),			// 18-bit output: B port cascade output
+					.CARRYCASCOUT(),	// 1-bit output: Cascade carry output
+					.MULTSIGNOUT(),		// 1-bit output: Multiplier sign cascade output
+					.PCOUT(),			// 48-bit output: Cascade output
+
+					// Control: 1-bit (each) output: Control Inputs/Status Bits
+					.OVERFLOW(),		 // 1-bit output: Overflow in add/acc output
+					.PATTERNBDETECT(),	 // 1-bit output: Pattern bar detect output
+					.PATTERNDETECT(),	 // 1-bit output: Pattern detect output
+					.UNDERFLOW(),		 // 1-bit output: Underflow in add/acc output
+
+					// Data: 4-bit (each) output: Data Ports
+					.CARRYOUT(),	// 4-bit output: Carry output
+					.P(pp),			// 48-bit output: Primary data output
+
+					// Cascade: 30-bit (each) input: Cascade Ports
+					.ACIN('x),			 // 30-bit input: A cascade data input
+					.BCIN('x),			 // 18-bit input: B cascade input
+					.CARRYCASCIN('x),	 // 1-bit input: Cascade carry input
+					.MULTSIGNIN('x),	 // 1-bit input: Multiplier sign input
+					.PCIN('x),			 // 48-bit input: P cascade input
+
+					// Control: 4-bit (each) input: Control Inputs/Status Bits
+					.CLK(clk),				// 1-bit input: Clock input
+					.ALUMODE('0),			// 4-bit input: ALU control input
+					.CARRYINSEL('0),		// 3-bit input: Carry select input
+					.INMODE(5'b01100),		// 5-bit input: INMODE control input
+					.OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input; inversion XORed in here, whereas the DSP48E2 variant passes OPMODE_INVERSION via IS_OPMODE_INVERTED
+
+					// Data: 30-bit (each) input: Data Ports
+					.A(aa),			// 30-bit input: A data input
+					.B(bb),			// 18-bit input: B data input
+					.C('x),			// 48-bit input: C data input
+					.CARRYIN('0),	// 1-bit input: Carry input signal
+					.D(dd[24:0]),	// 25-bit input: D data input; dd is declared 27 bits wide, so slice explicitly instead of relying on implicit port truncation
+
+					// Reset/Clock Enable: 1-bit (each) input: Reset/Clock Enable Inputs
+					.CEA1('0),			// 1-bit input: Clock enable input for 1st stage AREG
+					.CEA2('0),			// 1-bit input: Clock enable input for 2nd stage AREG
+					.CEAD(en),			// 1-bit input: Clock enable input for ADREG
+					.CEALUMODE('0),		// 1-bit input: Clock enable input for ALUMODEREG
+					.CEB1('0),			// 1-bit input: Clock enable input for 1st stage BREG
+					.CEB2(en),			// 1-bit input: Clock enable input for 2nd stage BREG
+					.CEC('0),			// 1-bit input: Clock enable input for CREG
+					.CECARRYIN('0),		// 1-bit input: Clock enable input for CARRYINREG
+					.CECTRL(en),		// 1-bit input: Clock enable input for OPMODEREG and CARRYINSELREG
+					.CED('0),			// 1-bit input: Clock enable input for DREG
+					.CEINMODE('0),		// 1-bit input: Clock enable input for INMODEREG
+					.CEM(en),			// 1-bit input: Clock enable input for MREG
+					.CEP(en),			// 1-bit input: Clock enable input for PREG
+					.RSTA('0),			// 1-bit input: Reset input for AREG
+					.RSTB(				// 1-bit input: Reset for BREG; driven by zero, with rst ORed in only outside synthesis
+// synthesis translate_off
+						rst ||
+// synthesis translate_on
+						zero
+					),
+					.RSTC('0),			// 1-bit input: Reset for CREG
+					.RSTD(				// 1-bit input: Reset for DREG and ADREG; driven by rst, with zero ORed in only outside synthesis
+// synthesis translate_off
+						zero ||
+// synthesis translate_on
+						rst
+					),
+					.RSTALLCARRYIN('0),	// 1-bit input: Reset for CARRYINREG
+					.RSTALUMODE('0),	// 1-bit input: Reset for ALUMODEREG
+					.RSTCTRL('0),		// 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+					.RSTINMODE('0),		// 1-bit input: Reset for INMODE register
+					.RSTM(rst),			// 1-bit input: Reset for MREG
+					.RSTP(rst)			// 1-bit input: Reset for PREG
+				);
+				2: DSP48E2 #(
 					// Feature Control Attributes: Data Path Selection
 					.AMULTSEL("AD"),	// Selects A input to multiplier (A, AD)
 					.A_INPUT("DIRECT"),	// Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
@@ -158,21 +268,21 @@ module mvu_4sx4u #(
 					.USE_PATTERN_DETECT("NO_PATDET"),  // Enable pattern detect (NO_PATDET, PATDET)
 
 					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
-					.IS_ALUMODE_INVERTED('0),				// Optional inversion for ALUMODE
-					.IS_CARRYIN_INVERTED('0),				// Optional inversion for CARRYIN
-					.IS_CLK_INVERTED('0),					// Optional inversion for CLK
-					.IS_INMODE_INVERTED('0),				// Optional inversion for INMODE
-					.IS_OPMODE_INVERTED(9'b00_010_01_01),	// Optional inversion for OPMODE
-					.IS_RSTALLCARRYIN_INVERTED('0),			// Optional inversion for RSTALLCARRYIN
-					.IS_RSTALUMODE_INVERTED('0),			// Optional inversion for RSTALUMODE
-					.IS_RSTA_INVERTED('0),					// Optional inversion for RSTA
-					.IS_RSTB_INVERTED('0),					// Optional inversion for RSTB
-					.IS_RSTCTRL_INVERTED('0),				// Optional inversion for STCONJUGATE_A
-					.IS_RSTC_INVERTED('0),					// Optional inversion for RSTC
-					.IS_RSTD_INVERTED('0),					// Optional inversion for RSTD
-					.IS_RSTINMODE_INVERTED('0),				// Optional inversion for RSTINMODE
-					.IS_RSTM_INVERTED('0),					// Optional inversion for RSTM
-					.IS_RSTP_INVERTED('0),					// Optional inversion for RSTP
+					.IS_ALUMODE_INVERTED('0),							// Optional inversion for ALUMODE
+					.IS_CARRYIN_INVERTED('0),							// Optional inversion for CARRYIN
+					.IS_CLK_INVERTED('0),								// Optional inversion for CLK
+					.IS_INMODE_INVERTED('0),							// Optional inversion for INMODE
+					.IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}),	// Optional inversion for OPMODE
+					.IS_RSTALLCARRYIN_INVERTED('0),						// Optional inversion for RSTALLCARRYIN
+					.IS_RSTALUMODE_INVERTED('0),						// Optional inversion for RSTALUMODE
+					.IS_RSTA_INVERTED('0),								// Optional inversion for RSTA
+					.IS_RSTB_INVERTED('0),								// Optional inversion for RSTB
+					.IS_RSTCTRL_INVERTED('0),							// Optional inversion for RSTCTRL
+					.IS_RSTC_INVERTED('0),								// Optional inversion for RSTC
+					.IS_RSTD_INVERTED('0),								// Optional inversion for RSTD
+					.IS_RSTINMODE_INVERTED('0),							// Optional inversion for RSTINMODE
+					.IS_RSTM_INVERTED('0),								// Optional inversion for RSTM
+					.IS_RSTP_INVERTED('0),								// Optional inversion for RSTP
 
 					// Register Control Attributes: Pipeline Register Configuration
 					.ACASCREG(0),                      // Number of pipeline stages between A/ACIN and ACOUT (0-2)
@@ -220,7 +330,7 @@ module mvu_4sx4u #(
 					.ALUMODE(4'h0),				// 4-bit input: ALU control
 					.CARRYINSEL('0),			// 3-bit input: Carry select
 					.INMODE(5'b01100),			// 5-bit input: INMODE control
-					.OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }),	// 9-bit input: Operation mode
+					.OPMODE({ 2'b00, opmode }),	// 9-bit input: Operation mode
 
 					// Data inputs: Data Ports
 					.A(aa),						// 34-bit input: A data
@@ -264,6 +374,11 @@ module mvu_4sx4u #(
 					.RSTM(rst),			// 1-bit input: Reset for MREG
 					.RSTP(rst)			// 1-bit input: Reset for PREG
 				);
+				default: initial begin
+					$error("Unknown version DSP48E%0d.", VERSION);
+					$finish;
+				end
+				endcase
 			end : genDSP
 `endif