From be1503a0c78fd4c4d903b1ffbf61964659725bb6 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Tue, 3 Jan 2023 15:37:42 +0000
Subject: [PATCH 001/112] First changes to custom_op for RTL-based MVAU

---
 .../matrixvectoractivation_rtl.py             | 1036 +++++++++++++++++
 1 file changed, 1036 insertions(+)
 create mode 100644 src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
new file mode 100644
index 0000000000..c8a0aa675b
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -0,0 +1,1036 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import os
+import textwrap
+import warnings
+from qonnx.core.datatype import DataType
+from qonnx.util.basic import (
+    calculate_matvec_accumulator_range,
+    interleave_matrix_outer_dim_from_partitions,
+    roundup_to_integer_multiple,
+)
+
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    numpy_to_hls_code,
+    pack_innermost_dim_as_hex_string,
+    rtlsim_output_to_npy,
+)
+
+from . import templates
+
+# ONNX i/o tensor shape assumptions for MatrixVectorActivation:
+# input 0 is the input tensor, shape (..., i_size) = (..., MW)
+# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH)
+# (optional) input 2 is the thresholds tensor, shape (o_size, n_thres)
+# output 0 is the output tensor, shape (..., o_size) = (..., MH)
+# the ... here can be any shape (representing groups of vectors)
+
+
+class MatrixVectorActivation_rtl(HLSCustomOp):
+    """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch
+    function."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+        self.decoupled_wrapper = templates.decoupled_wrapper  # see .templates
+
+    def get_nodeattr_types(self):
+        """Returns this op's attribute specs, updated with the base class ones."""
+        attrs = {
+            "PE": ("i", True, 0),
+            "SIMD": ("i", True, 0),
+            "MW": ("i", True, 0),
+            "MH": ("i", True, 0),
+            "resType": ("s", False, "lut", {"auto", "lut", "dsp"}),
+            "ActVal": ("i", False, 0),
+            # FINN DataTypes for inputs, weights, outputs
+            "inputDataType": ("s", True, ""),
+            "weightDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+            # FINN DataType for accumulator -- auto-computed and updated
+            "accDataType": ("s", False, "INT32"),
+            # use xnor-popcount for binary weights/inputs, thus treating them
+            # as bipolar
+            "binaryXnorMode": ("i", False, 0, {0, 1}),
+            # no-activation mode (produce accumulators)
+            "noActivation": ("i", False, 0, {0, 1}),
+            # number of input vectors, examples:
+            # [1] is a single vector (like a FC layer with batch=1)
+            # [4] is four vectors (like a FC layer with batch=4)
+            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+            "numInputVectors": ("ints", False, [1]),
+            # memory mode for the FC weights
+            # const -- embedded weights, default, long compile/synth times
+            # decoupled -- streaming weights with weight streamer packaged inside IP
+            # external -- streaming weights with external streamer
+            "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}),
+            # FPGA resource type for memories in decoupled mode
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1
+            # see also https://www.xilinx.com/support/answers/38070.html
+            "ram_style": (
+                "s",
+                False,
+                "auto",
+                {"auto", "block", "distributed", "ultra"},
+            ),
+            # FPGA resource type for threshold memories (if noActivation is False)
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            "ram_style_thresholds": (
+                "s",
+                False,
+                "auto",
+                {"auto", "block", "distributed"},
+            ),
+            # (mem_mode = decoupled only) whether weights will be writable through
+            # an AXI-lite interface during runtime
+            # 1 for enabled, 0 for disabled.
+            # see finn-rtllib/memstream/doc/README for more about the memory
+            # address map used for writable weights
+            # IMPORTANT: After using AXI lite to either read or write the weights,
+            # always "flush" the accelerator by first passing a dummy input
+            # vector through the accelerator. This will get rid of any old
+            # weight data from the weight FIFOs.
+            "runtime_writeable_weights": ("i", False, 0, {0, 1}),
+        }
+        return {**attrs, **super().get_nodeattr_types()}
+
+    def calc_wmem(self):
+        """Returns WMEM = MW * MH // (PE * SIMD) after checking the folding."""
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        # both folding factors must divide the matrix dimensions evenly
+        assert mh % pe == 0, "Requirement MH divisable by PE is violated."
+        assert mw % simd == 0, "Requirement MW divisable by SIMD is violated."
+        return (mw * mh) // (pe * simd)
+
+    def calc_tmem(self):
+        """Returns TMEM, which is always 0 here; asserts that noActivation is 1."""
+        assert self.get_nodeattr("noActivation")==1, "RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer"
+        return 0
+
+    def make_shape_compatible_op(self, model):
+        """Delegates to make_const_shape_op with the normal output shape."""
+        return super().make_const_shape_op(self.get_normal_output_shape())
+
+    def infer_node_datatype(self, model):
+        """Syncs inputDataType with the model and annotates the output tensor."""
+        node = self.onnx_node
+        model_idt = model.get_tensor_datatype(node.input[0])
+        attr_idt = self.get_input_datatype()
+        if model_idt != attr_idt:
+            # the model annotation overrides the stored attribute, with a warning
+            warnings.warn(
+                "inputDataType changing for %s: %s -> %s "
+                % (node.name, str(attr_idt), str(model_idt))
+            )
+        self.set_nodeattr("inputDataType", model_idt.name)
+        # the output datatype is taken from the outputDataType attribute
+        model.set_tensor_datatype(node.output[0], self.get_output_datatype())
+
+    def verify_node(self):
+        """Collects info messages from checks on this node's attributes/inputs."""
+        info_messages = []
+        # verify that "backend" is set to "fpgadataflow"
+        if self.get_nodeattr("backend") == "fpgadataflow":
+            info_messages.append("Attribute backend is set correctly")
+        else:
+            info_messages.append('Attribute backend should be set to "fpgadataflow"')
+
+        # verify that all necessary attributes exist
+        # TODO collect automatically from get_nodeattr_types
+        required_attrs = (
+            "code_gen_dir_cppsim",
+            "executable_path",
+            "resType",
+            "MW",
+            "MH",
+            "SIMD",
+            "PE",
+            "inputDataType",
+            "weightDataType",
+            "outputDataType",
+        )
+        try:
+            for attr_name in required_attrs:
+                self.get_nodeattr(attr_name)
+            info_messages.append("All necessary attributes exist")
+        except Exception:
+            info_messages.append(
+                """The required MatrixVectorActivation attributes do not exist."""
+            )
+
+        # only no-activation mode, with data and weight inputs, passes this check
+        no_act = self.get_nodeattr("noActivation")
+        if no_act == 1:
+            if len(self.onnx_node.input) == 2:
+                info_messages.append("The number of inputs is correct")
+            else:
+                info_messages.append(
+                    """RTL-based MatrixVectorActivation needs in no
+                            activation mode 2 inputs (data input and weights)"""
+                )
+        elif no_act == 0:
+            info_messages.append("RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer")
+        else:
+            info_messages.append(
+                """noActivation attribute contains {} should
+                be 1 for RTL-based MatrixVectorActivation""".format(
+                    no_act
+                )
+            )
+
+        if self.get_nodeattr("mem_mode") != "decoupled":
+            info_messages.append("RTL-based MVAU supports only decoupled weights currently")
+
+        return info_messages
+
+    def uram_estimation(self):
+        """Estimates the URAM count, modelling each as 72 bit x 4096 entries."""
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        w_bits = self.get_weight_datatype().bitwidth()
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        mem_depth = (mw * mh) / (simd * pe)
+        mem_width = simd * w_bits * pe
+        mmode = self.get_nodeattr("mem_mode")
+        mstyle = self.get_nodeattr("ram_style")
+        # configurations for which this model assigns no URAM at all
+        no_uram = (
+            (mmode == "decoupled" and mstyle != "ultra")
+            or (mmode == "const" and self.calc_wmem() <= 128)
+            or (mmode == "external")
+        )
+        if no_uram:
+            return 0
+        return math.ceil(mem_width / 72) * math.ceil(mem_depth / 4096)
+
+    def bram_estimation(self):
+        """Estimates the BRAM count for the weight memory, following:
+        - FINN-R: An End-to-End Deep-Learning Framework for Fast
+        Exploration of Quantized Neural Networks
+        - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
+        Y. Umuroglu, M. Leeser and K. Vissers
+        - 12. Sep 2018
+        """
+        # TODO add in/out FIFO contributions
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        w_bits = self.get_weight_datatype().bitwidth()
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        mem_depth = (mw * mh) / (simd * pe)
+        mem_width = simd * w_bits * pe
+        mmode = self.get_nodeattr("mem_mode")
+        mstyle = self.get_nodeattr("ram_style")
+        # configurations for which this model assigns no BRAM at all
+        if (
+            (mmode == "decoupled" and mstyle in ["distributed", "ultra"])
+            or (mmode == "const" and self.calc_wmem() <= 128)
+            or (mmode == "external")
+        ):
+            return 0
+        # assuming SDP mode RAMB18s (see UG573 Table 1-10)
+        # assuming decoupled (RTL) memory, which is more efficient than const (HLS)
+        if mem_width == 1:
+            return math.ceil(mem_depth / 16384)
+        if mem_width == 2:
+            return math.ceil(mem_depth / 8192)
+        if mem_width <= 4:
+            return (math.ceil(mem_depth / 4096)) * (math.ceil(mem_width / 4))
+        if mem_width <= 9:
+            return (math.ceil(mem_depth / 2048)) * (math.ceil(mem_width / 9))
+        if mem_width <= 18 or mem_depth > 512:
+            return (math.ceil(mem_depth / 1024)) * (math.ceil(mem_width / 18))
+        # remaining case: wider than 18 bits and at most 512 entries deep
+        return (math.ceil(mem_depth / 512)) * (math.ceil(mem_width / 36))
+
+    def bram_efficiency_estimation(self):
+        """Returns weight bits / estimated BRAM capacity, or 1 without BRAMs."""
+        w_bits = self.get_weight_datatype().bitwidth()
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        n_brams = self.bram_estimation()
+        if n_brams == 0:
+            return 1
+        # each estimated BRAM is counted as 36 bit x 512 entries
+        stored_bits = w_bits * mw * mh
+        return stored_bits / (n_brams * 36 * 512)
+
+    def uram_efficiency_estimation(self):
+        """Returns the URAM efficiency: parameter bits actually needed divided
+        by the URAM capacity from uram_estimation (1 if no URAM is estimated)"""
+        w_bits = self.get_weight_datatype().bitwidth()
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        n_urams = self.uram_estimation()
+        if n_urams == 0:
+            return 1
+        # each estimated URAM is counted as 72 bit x 4096 entries
+        stored_bits = w_bits * mw * mh
+        allocated_bits = n_urams * 72 * 4096
+        return stored_bits / allocated_bits
+
+#TODO: FIX
+    def lut_estimation(self):
+        """Calculates resource estimations for LUTs based on:
+        - FINN-R: An End-to-End Deep-Learning Framework for Fast
+        Exploration of Quantized Neural Networks
+        - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
+        Y. Umuroglu, M. Leeser and K. Vissers
+        - 12. Sep 2018
+        """
+        # TODO add in/out FIFO contributions
+        P = self.get_nodeattr("PE")
+        Q = self.get_nodeattr("SIMD")
+        MW = self.get_nodeattr("MW")
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        # A is the bit width of the input (activation) datatype
+        idt = self.get_input_datatype()
+        A = idt.bitwidth()
+        # parameters from experiments in paper mentioned above
+        c0 = 300
+        c1 = 1.1
+        c2 = 0  # weight storage term, only set for the memory styles below
+        mmode = self.get_nodeattr("mem_mode")
+        mstyle = self.get_nodeattr("ram_style")
+        if (mmode == "decoupled" and mstyle == "distributed") or (
+            mmode == "const" and self.calc_wmem() <= 128
+        ):
+            c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64)
+
+        # multiplication (no LUTs are counted when resType is "dsp")
+        res_type = self.get_nodeattr("resType")
+        if res_type == "dsp":
+            mult_luts = 0
+        else:
+            mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A)
+        # adder tree
+        addertree_luts = (W + A) * (2 * Q - 1)
+        # accumulator: product bits (W + A) plus ceil(log2(MW)) extra bits
+        acc_bits = W + A + np.ceil(math.log(MW, 2))
+        acc_luts = acc_bits
+        # thresholds and threshold comparators (only when noActivation is 0)
+        thr_luts = 0
+        comp_luts = 0
+        noact = self.get_nodeattr("noActivation")
+        if noact == 0:  # NOTE(review): calc_tmem() asserts noActivation == 1
+            odt = self.get_output_datatype()
+            B = odt.bitwidth()
+            thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64)
+            comp_luts = (2**B - 1) * acc_bits
+
+        return int(
+            c0
+            + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts))
+            + c2
+        )
+
+#TODO: FIX
+    def dsp_estimation(self):
+        """Estimates the number of DSPs; 0 unless resType is "dsp"."""
+        pe = self.get_nodeattr("PE")
+        res_type = self.get_nodeattr("resType")
+        simd = self.get_nodeattr("SIMD")
+        w_bits = self.get_weight_datatype().bitwidth()
+        a_bits = self.get_input_datatype().bitwidth()
+        if res_type != "dsp":
+            # multiplications are not mapped to DSPs
+            return 0
+        # one multiplier per PE/SIMD lane, ceil((W + A) / 48) DSPs each
+        # TODO: more accurate modelling
+        dsps_per_mult = np.ceil((w_bits + a_bits) / 48)
+        return int(pe * simd * dsps_per_mult)
+
+#TODO: FIX
+    def get_exp_cycles(self):
+        """Returns the expected cycles: (MH/PE) * (MW/SIMD) per input vector."""
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        n_vecs = np.prod(self.get_nodeattr("numInputVectors"))
+        mh = self.get_nodeattr("MH")
+        mw = self.get_nodeattr("MW")
+        # since mmv != 1 is not supported yet, we set mmv for now to 1
+        mmv = 1
+        return int((mh / pe) * (mw / simd) * n_vecs / mmv)
+
+    def get_input_datatype(self, ind=0):
+        """Returns the FINN DataType of input ind (0: data, 1: weights)."""
+        # FIFO insertion on an FC layer with ext weights may ask for ind > 0
+        if ind == 0:
+            attr_name = "inputDataType"
+        elif ind == 1:
+            attr_name = "weightDataType"
+        else:
+            raise Exception("Undefined input ind for this layer type")
+        return DataType[self.get_nodeattr(attr_name)]
+
+    def get_weight_datatype(self):
+        """Returns the FINN DataType named by the weightDataType attribute."""
+        return DataType[self.get_nodeattr("weightDataType")]
+
+    def get_output_datatype(self, ind=0):
+        """Returns the FINN DataType named by outputDataType (ind is unused)."""
+        return DataType[self.get_nodeattr("outputDataType")]
+
+    def get_instream_width(self, ind=0):
+        """Returns the input stream width in bits: SIMD * input bit width."""
+        i_bits = self.get_input_datatype().bitwidth()
+        assert i_bits <= 9, "RTL-based MVAU only supports activations with bit-width up to 9-bits"
+        return i_bits * self.get_nodeattr("SIMD")
+
+    def get_outstream_width(self, ind=0):
+        """Returns the output stream width in bits: PE * output bit width."""
+        o_bits = self.get_output_datatype().bitwidth()
+        return o_bits * self.get_nodeattr("PE")
+
+    def get_weightstream_width(self):
+        """Returns the weight stream width in bits (PE * SIMD * weight bits).
+
+        Only decoupled and external mem_mode stream weights; otherwise 0.
+        """
+        if self.get_nodeattr("mem_mode") not in ("decoupled", "external"):
+            return 0
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        wp = self.get_weight_datatype().bitwidth()
+        w_width = pe * simd * wp
+        # the RTL implementation only accepts narrow weight datatypes
+        assert wp <= 8, "RTL-based MVAU only supports weights with bit-width up to 8-bits"
+        return w_width
+
+    def get_weightstream_width_padded(self):
+        """Returns the weight stream width rounded up to a multiple of 8 bits.
+        This is required by the AXI Stream spec. Used in decoupled mode.
+        """
+        return roundup_to_integer_multiple(self.get_weightstream_width(), 8)
+
+    def get_ap_int_max_w(self):
+        """Returns the largest of the i/o, weight stream and PE entry widths."""
+        # base class impl (max of inp/out stream widths)
+        io_width = super().get_ap_int_max_w()
+        # weight stream width (0 unless mem_mode is decoupled or external)
+        stream_width = self.get_weightstream_width()
+        # single PE weight entry: SIMD weights
+        w_bits = self.get_weight_datatype().bitwidth()
+        pe_entry_width = self.get_nodeattr("SIMD") * w_bits
+        return max(stream_width, io_width, pe_entry_width)
+
+    def get_folded_input_shape(self, ind=0):
+        """Returns the folded shape of input ind (1 only for external weights)."""
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        simd = self.get_nodeattr("SIMD")
+        pe = self.get_nodeattr("PE")
+        sf = mw // simd
+        nf = mh // pe
+        vecs = tuple(self.get_nodeattr("numInputVectors"))
+
+        if ind == 0:
+            # shape of input 0 (data)
+            return vecs + (sf, simd)
+        if ind == 1 and self.get_nodeattr("mem_mode") == "external":
+            # shape of input 1 (weights)
+            return vecs + (sf * nf, simd * pe)
+
+        # any other ind / mem_mode combination is rejected
+        raise Exception("Undefined input shape for requested input")
+
+    def get_folded_output_shape(self, ind=0):
+        """Returns the folded output shape: numInputVectors + (MH // PE, PE)."""
+        mh = self.get_nodeattr("MH")
+        pe = self.get_nodeattr("PE")
+        vecs = tuple(self.get_nodeattr("numInputVectors"))
+        # MH // PE groups, each PE elements wide
+        return vecs + (mh // pe, pe)
+
+    def get_normal_input_shape(self, ind=0):
+        """Returns the unfolded input shape: numInputVectors followed by MW."""
+        mw = self.get_nodeattr("MW")
+        vecs = tuple(self.get_nodeattr("numInputVectors"))
+        return vecs + (mw,)
+
+    def get_normal_output_shape(self, ind=0):
+        """Returns the unfolded output shape: numInputVectors followed by MH."""
+        mh = self.get_nodeattr("MH")
+        vecs = tuple(self.get_nodeattr("numInputVectors"))
+        return vecs + (mh,)
+
+    def get_number_output_values(self):
+        """Returns the product of all folded output dims except the innermost."""
+        return np.prod(self.get_folded_output_shape()[:-1])
+
+    def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
+        """Convert the original numpy weight matrix orig_weight_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure MH % PE == 0 and MW % SIMD == 0
+        * transpose from the ONNX (MW, MH) layout to (MH, MW)
+        * interleave rows between PEs
+        * reshape into (1, PE, WMEM, SIMD), reverse the SIMD dim and return
+        """
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        wmem = self.calc_wmem()
+        assert orig_weight_matrix.shape == (
+            mw,
+            mh,
+        ), """Weights matrix doesn't
+        have expected shape (mw, mh)"""
+        assert mw % simd == 0, "Requirement MW divisable by SIMD is violated."
+        assert mh % pe == 0, "Requirement MH divisable by PE is violated."
+        # start by transposing the original weight matrix, since ONNX and
+        # finn-hlslib use different assumptions
+        # ONNX uses (in_features, out_features) and matmul(x, W)
+        # finn-hlslib uses (out_features, in_features) and matmul(W, x)
+        ret = orig_weight_matrix.T
+        # interleave rows between PEs and reshape
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        # create SIMD as innermost dimension and add a dummy outer dim
+        ret = ret.reshape(1, pe, wmem, simd)
+        # reverse the SIMD dimension
+        ret = np.flip(ret, axis=-1)
+        return ret
+
+    def minimize_accumulator_width(self, model):
+        """Narrows accDataType/outputDataType to the weights' accumulator range."""
+        weights = model.get_initializer(self.onnx_node.input[1])
+        idt = self.get_input_datatype()
+        # calculate minimum and maximum values of accumulator
+        (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt)
+        if acc_min < 0 and abs(acc_min) > acc_max:
+            adt = DataType.get_smallest_possible(acc_min)
+        elif acc_min < 0:
+            adt = DataType.get_smallest_possible(-acc_max - 1)
+        else:
+            adt = DataType.get_smallest_possible(acc_max)
+        # ensure a datatype divisible by 8-bits in case this is the last node
+        bw = roundup_to_integer_multiple(adt.bitwidth(), 8)
+        new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw))
+        adt = DataType[new_adt_name]
+        self.set_nodeattr("accDataType", adt.name)
+        # for no-activation nodes, output dt = acc dt
+        self.set_nodeattr("outputDataType", adt.name)
+        return DataType[self.get_nodeattr("accDataType")]
+
+    def make_weight_file(self, weights, weight_file_mode, weight_file_name):
+        """Produce a file containing given weights in appropriate format for this
+        layer. This file can be used for either synthesis or run-time reconfig
+        of weights.
+
+        Arguments:
+        * weights : numpy array with weights to be put into the file
+        * weight_file_mode : one of {decoupled_npy, decoupled_verilog_dat,
+          decoupled_runtime}; any other value raises an Exception
+        * weight_file_name : filename for the weight file to be generated
+        """
+        # convert weights into hlslib-compatible format
+        weight_tensor = self.get_hls_compatible_weight_tensor(weights)
+        export_wdt = self.get_weight_datatype()
+        if "decoupled" in weight_file_mode:
+            # create a weight stream for various flavors of decoupled mode:
+            # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
+            weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
+            # reverse SIMD flip for saving weights in .npy
+            weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1)
+            # PE flip for saving weights in .dat
+            weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2)
+            # reshape weight tensor (simd_flipped and pe_flipped) to desired shape
+            pe = self.get_nodeattr("PE")
+            simd = self.get_nodeattr("SIMD")
+            # simd_flipped
+            weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape(
+                1, -1, pe * simd
+            )
+            weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy()
+            # flipped
+            weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(
+                1, -1, pe * simd
+            )
+            weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy()
+            if weight_file_mode == "decoupled_npy":
+                # weight stream as .npy; generate_params requests this mode
+                np.save(weight_file_name, weight_tensor_simd_flipped)
+            elif weight_file_mode == "decoupled_verilog_dat":
+                # convert weight values into hexstring
+                weight_width = self.get_weightstream_width()
+                # pad to nearest 4 bits to get hex strings
+                weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+                )
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                with open(weight_file_name, "w") as f:
+                    for val in weight_stream:
+                        f.write(val + "\n")
+            elif weight_file_mode == "decoupled_runtime":
+                # memstream axi-lite interface will map each mem line to
+                # one or multiple 32-bit words
+                weight_width = self.get_weightstream_width()
+                words_per_memwidth = 2 ** math.ceil(math.log2(weight_width / 32))
+                if words_per_memwidth < 1:
+                    words_per_memwidth = 1
+                weight_width_padded = words_per_memwidth * 32
+                # first, pack and ensure padding to 32 bits
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+                )
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                with open(weight_file_name, "w") as f:
+                    for val in weight_stream:
+                        # split into groups of 8 hex digits (= 32 bits)
+                        words_32b = textwrap.wrap(val, 8)
+                        words_32b.reverse()
+                        for word_32b in words_32b:
+                            f.write(word_32b + "\n")
+            else:
+                raise Exception("Unknown/unsupported weight_file_mode")
+
+        else:
+            raise Exception("Unknown/unsupported weight_file_mode")
+
+    def generate_params(self, model, path):
+        """Writes the weight files of this node into the directory path.
+
+        Only mem_mode "decoupled" is supported; any other value raises.
+        """
+        mem_mode = self.get_nodeattr("mem_mode")
+        # weights, if not external
+        weights = model.get_initializer(self.onnx_node.input[1])
+        if mem_mode != "decoupled":
+            raise Exception(
+                """Please set mem_mode to "decoupled",
+                currently no other parameter value is supported!"""
+            )
+        # weight stream as .npy, the file name execute_node loads for rtlsim
+        weight_filename_sim = "{}/weights.npy".format(path)
+        self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
+
+        # also save weights as Verilog .dat file
+        # note that we provide two different .dat files, one for simulation
+        # and one for synthesis. this is because URAM-based weights always
+        # need zero weights for synthesis, otherwise they get inferred
+        # as BRAM
+        weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(path)
+        weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(path)
+        # sim weights are always the true weights
+        self.make_weight_file(
+            weights, "decoupled_verilog_dat", weight_filename_rtl_sim
+        )
+        if self.get_nodeattr("ram_style") == "ultra":
+            # UltraRAM must have no memory initializer, or only zeroes
+            # otherwise BRAM will be inferred instead of URAM
+            # as a workaround we provide a zero-weight init here
+            synth_weights = np.zeros_like(weights, dtype=np.float32)
+        else:
+            synth_weights = weights
+        self.make_weight_file(
+            synth_weights, "decoupled_verilog_dat", weight_filename_rtl_synth
+        )
+
+    def execute_node(self, context, graph):
+        """Execute this node via rtlsim and store the result in ``context``.
+
+        The first node input is reshaped to the folded input shape and saved as
+        input_0.npy in code_gen_dir_ipgen; the simulation output is written to
+        output.npy there, reloaded, reshaped to the normal output shape and
+        stored in context under the node's first output name.
+
+        Raises:
+            Exception: if exec_mode is not "rtlsim" (cppsim is not available for
+                this op) or if the node has more than three inputs.
+        """
+        mode = self.get_nodeattr("exec_mode")
+        mem_mode = self.get_nodeattr("mem_mode")
+        node = self.onnx_node
+
+        # TODO ensure codegen dir exists
+        if mode == "cppsim":
+            raise Exception(
+                "cppsim not possible for RTL MVAU, please set exec_mode to rtlsim"
+            )
+        if mode != "rtlsim":
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        export_idt = self.get_input_datatype()
+
+        # create a npy file for the data input of the node (in_ind is input index)
+        for in_ind, inp_name in enumerate(node.input):
+            # it is assumed that the first input of the node is the data input
+            # the second input are the weights
+            # the third input are the thresholds
+            if in_ind == 0:
+                assert (
+                    str(context[inp_name].dtype) == "float32"
+                ), """Input datatype is
+                not float32 as expected."""
+                expected_inp_shape = self.get_folded_input_shape()
+                # reshape, then make a copy before saving the array
+                reshaped_input = context[inp_name].reshape(expected_inp_shape).copy()
+                np.save(
+                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                    reshaped_input,
+                )
+            elif in_ind > 2:
+                raise Exception("Unexpected input found for MatrixVectorActivation")
+
+        sim = self.get_rtlsim()
+        nbits = self.get_instream_width()
+        inp = npy_to_rtlsim_input(
+            "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+        )
+        super().reset_rtlsim(sim)
+        super().toggle_clk(sim)
+        if mem_mode in ("external", "decoupled"):
+            # the weight stream is repeated np.prod(numInputVectors) times
+            wnbits = self.get_weightstream_width()
+            export_wdt = self.get_weight_datatype()
+            wei = npy_to_rtlsim_input(
+                "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits
+            )
+            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+            io_dict = {
+                "inputs": {"in0": inp, "weights": wei * num_w_reps},
+                "outputs": {"out": []},
+            }
+            self.rtlsim_multi_io(sim, io_dict)
+            output = io_dict["outputs"]["out"]
+        else:
+            output = self.rtlsim(sim, inp)
+        odt = self.get_output_datatype()
+        target_bits = odt.bitwidth()
+        packed_bits = self.get_outstream_width()
+        out_npy_path = "{}/output.npy".format(code_gen_dir)
+        out_shape = self.get_folded_output_shape()
+        rtlsim_output_to_npy(
+            output, out_npy_path, odt, out_shape, packed_bits, target_bits
+        )
+        # load and reshape output
+        output = np.load(out_npy_path)
+        oshape = self.get_normal_output_shape()
+        output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+        context[node.output[0]] = output
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        """Normally: Generates C++ code and tcl script for IP generation.
+        Here: Emits (System-)Verilog via generate_hdl(); all arguments are unused."""
+        self.generate_hdl()
+
+    def ipgen_singlenode_code(self):
+        """Normally: Builds the bash script for IP generation.
+        Here: intentionally a no-op."""
+
+    def code_generation_cppsim(self, model):
+        """Normally: Generates C++ code for simulation (cppsim).
+        Here: intentionally a no-op (execute_node rejects exec_mode "cppsim")."""
+
+    def compile_singlenode_code(self):
+        """No-op (presumably a C++ build hook that this RTL op does not need)."""
+
+    def global_includes(self):
+        """No-op (presumably a C++ codegen hook that this RTL op does not need)."""
+
+    def defines(self, var):
+        """No-op; var is ignored (presumably a C++ codegen hook, unused here)."""
+
+    def read_npy_data(self):
+        """No-op (presumably a C++ codegen hook that this RTL op does not need)."""
+
+    def strm_decl(self):
+        """No-op (presumably a C++ codegen hook that this RTL op does not need)."""
+
+    def docompute(self):
+        """No-op (presumably a C++ codegen hook that this RTL op does not need)."""
+
+    def dataoutstrm(self):
+        """No-op (presumably a C++ codegen hook that this RTL op does not need)."""
+
+    def save_as_npy(self):
+        """No-op (presumably a C++ codegen hook that this RTL op does not need)."""
+
+    def blackboxfunction(self):
+        """No-op (presumably a C++ codegen hook that this RTL op does not need)."""
+
+    def pragmas(self):
+        """No-op (presumably a C++ codegen hook that this RTL op does not need)."""
+
+    def code_generation_ipi(self):
+        """Return the tcl commands that add this layer to a block design.
+
+        mem_mode "decoupled": a hierarchy named after the node is created that
+        holds the layer IP plus a memstream IP whose m_axis_0 is wired to the
+        layer's weights_<sname> port; clock, reset and the data streams are
+        routed to pins of the hierarchy, and an AXI-lite pin is added when
+        runtime_writeable_weights is set.
+        mem_mode "const" / "external": the base-class implementation is used.
+
+        Returns:
+            list of str, one tcl command per entry (decoupled mode).
+
+        Raises:
+            Exception: for any other mem_mode value.
+            AssertionError: for ram_style "ultra" without runtime-writable weights.
+        """
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode in ("const", "external"):
+            # base class impl sufficient for const/external modes
+            return super().code_generation_ipi()
+        if mem_mode != "decoupled":
+            raise Exception("Unrecognized mem_mode for MatrixVectorActivation")
+
+        # decoupled mode from here on: add a weight streamer next to the layer IP
+        runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
+        if self.get_nodeattr("ram_style") == "ultra":
+            assert (
+                runtime_writable
+            ), "Layer with URAM weights must have runtime_writeable_weights=1"
+        node_name = self.onnx_node.name
+        sname = self.hls_sname()
+        # interface names are looked up once and reused below
+        intf_names = self.get_verilog_top_module_intf_names()
+        clk_name = intf_names["clk"][0]
+        rst_name = intf_names["rst"][0]
+        dout_name = intf_names["m_axis"][0][0]
+        din_name = intf_names["s_axis"][0][0]
+        strm_vlnv = "xilinx.com:user:memstream:1.0"
+        strm_inst = node_name + "_wstrm"
+        wmem_depth = self.calc_wmem()
+        wstrm_width = self.get_weightstream_width_padded()
+        cmd = [
+            # create a hierarchy for this layer, with the same port names
+            "create_bd_cell -type hier %s" % node_name,
+            "create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name),
+            "create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name),
+            "create_bd_intf_pin -mode Master "
+            "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
+            % (node_name, dout_name),
+            "create_bd_intf_pin -mode Slave "
+            "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name),
+            # instantiate the layer IP
+            "create_bd_cell -type ip -vlnv %s /%s/%s"
+            % (self.get_nodeattr("ip_vlnv"), node_name, node_name),
+            # instantiate a streamer and connect it to the layer IP
+            "create_bd_cell -type ip -vlnv %s /%s/%s"
+            % (strm_vlnv, node_name, strm_inst),
+            # configure the streamer memory
+            "set_property -dict [list "
+            "CONFIG.NSTREAMS {1} "
+            "CONFIG.MEM_DEPTH {%d} "
+            "CONFIG.MEM_WIDTH {%d} "
+            "CONFIG.MEM_INIT {%s} "
+            "CONFIG.RAM_STYLE {%s} "
+            "CONFIG.STRM0_DEPTH {%d} "
+            "CONFIG.STRM0_WIDTH {%d} "
+            "CONFIG.STRM0_OFFSET {0} "
+            "] [get_bd_cells /%s/%s]"
+            % (
+                wmem_depth,
+                wstrm_width,
+                self.get_nodeattr("code_gen_dir_ipgen") + "/",
+                self.get_nodeattr("ram_style"),
+                wmem_depth,
+                wstrm_width,
+                node_name,
+                strm_inst,
+            ),
+            # streamer output feeds the layer's weight stream
+            "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] "
+            "[get_bd_intf_pins %s/%s/weights_%s]"
+            % (node_name, strm_inst, node_name, node_name, sname),
+            # reset and clock for the streamer
+            "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]"
+            % (node_name, rst_name, node_name, strm_inst),
+            "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]"
+            % (node_name, clk_name, node_name, strm_inst),
+            # reset and clock for the layer IP
+            "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+            % (node_name, rst_name, node_name, node_name, rst_name),
+            "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+            % (node_name, clk_name, node_name, node_name, clk_name),
+            # data input and output streams pass through the hierarchy
+            "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+            "[get_bd_intf_pins %s/%s/%s]"
+            % (node_name, din_name, node_name, node_name, din_name),
+            "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+            "[get_bd_intf_pins %s/%s/%s]"
+            % (node_name, dout_name, node_name, node_name, dout_name),
+        ]
+        if runtime_writable:
+            # expose axi lite interface for writeable weights
+            axilite_name = intf_names["axilite"][0]
+            cmd += [
+                "create_bd_intf_pin -mode Slave "
+                "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s"
+                % (node_name, axilite_name),
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                "[get_bd_intf_pins %s/%s/%s]"
+                % (node_name, axilite_name, node_name, strm_inst, axilite_name),
+                # TODO calculate and pass in segment size here
+                "assign_bd_address",
+            ]
+        cmd.append("save_bd_design")
+        return cmd
+
+    def get_verilog_top_module_intf_names(self):
+        """Extend the base-class interface names by the mem_mode-specific ones:
+        a weight stream for "external", s_axilite for writable "decoupled"."""
+        intf_names = super().get_verilog_top_module_intf_names()
+        mem_mode = self.get_nodeattr("mem_mode")
+        sname = self.hls_sname()
+        if mem_mode == "external":
+            weight_intf = ("weights_" + sname, self.get_weightstream_width_padded())
+            intf_names["s_axis"].append(weight_intf)
+        elif mem_mode == "decoupled":
+            # only expose axilite interface if attribute is set
+            if self.get_nodeattr("runtime_writeable_weights") == 1:
+                intf_names["axilite"] = ["s_axilite"]
+        return intf_names
+
+    def get_op_and_param_counts(self):
+        """Return a dict with MAC-op, weight and (optional) threshold counts.
+
+        Keys embed the bitwidths, e.g. "op_mac_4bx8b" or "param_weight_4b"; the
+        MAC count is MW * MH * prod(numInputVectors).
+        """
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        inp_bits = self.get_input_datatype().bitwidth()
+        num_repetitions = int(np.prod(self.get_nodeattr("numInputVectors")))
+        # canonicalize op type: lowest bitwidth operand first s.t.
+        # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types
+        bw_lo, bw_hi = sorted((inp_bits, weight_bits))
+        counts = {
+            "op_mac_%dbx%db" % (bw_lo, bw_hi): mw * mh * num_repetitions,
+            "param_weight_%db" % (weight_bits): mw * mh,
+        }
+        if self.get_nodeattr("noActivation") == 0:
+            # one threshold entry per output feature, sized by accDataType
+            thres_bits = DataType[self.get_nodeattr("accDataType")].bitwidth()
+            counts["param_threshold_%db" % (thres_bits)] = mh
+        return counts
+
+    def derive_characteristic_fxns(self, period):
+        """Run the base-class characterization with all-zero input streams.
+
+        "in0" gets np.prod(folded_input_shape[:-1]) zeros; for mem_mode "decoupled"
+        or "external" a "weights" stream of calc_wmem() * prod(numInputVectors)
+        zeros is added.
+        """
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        zero_streams = {"in0": [0 for _ in range(n_inps)]}
+        if self.get_nodeattr("mem_mode") in ("decoupled", "external"):
+            n_weight_inps = self.calc_wmem()
+            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+            zero_streams["weights"] = [0 for _ in range(num_w_reps * n_weight_inps)]
+        # hand the prepared streams to the generic implementation
+        io_dict = {"inputs": zero_streams, "outputs": {"out": []}}
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
+
+    def generate_hdl(self):
+        """Instantiate the HDL template for this node and write it to disk.
+
+        Every key of the code-generation dictionary is replaced in the template
+        text by its value(s), joined with newlines, and the result is written
+        to "<gen_top_module>_impl.sv" in code_gen_dir_ipgen. Afterwards
+        ipgen_path and ip_path are pointed at that directory. A "_wrapper.v"
+        file is not produced yet (see the TODO in the body).
+
+        Raises:
+            Exception: if ram_style is anything other than "auto".
+        """
+        # TODO: add distinction between (PE=MH or PE=1) and where MH dimension is folded
+        template_path, code_gen_dict = self.prepare_codegen_default()
+
+        # add general parameters to dictionary
+        code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()]
+        # save top module name so we can refer to it after this node has been renamed
+        # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
+        self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
+        # TODO: currently only ram_style=auto is supported
+        if self.get_nodeattr("ram_style") != "auto":
+            raise Exception("Unrecognized ram_style for MatrixVectorActivation")
+
+        # apply code generation to templates
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        with open(template_path, "r") as f:
+            template = f.read()
+        for key, value in code_gen_dict.items():
+            # accept a list of lines as well as a bare scalar (e.g. an int) and
+            # transform it into one long string separated by '\n'
+            lines = value if isinstance(value, (list, tuple)) else [value]
+            template = template.replace(key, "\n".join(str(ln) for ln in lines))
+
+        impl_name = self.get_nodeattr("gen_top_module") + "_impl.sv"
+        with open(os.path.join(code_gen_dir, impl_name), "w") as f:
+            f.write(template)
+        # TODO: no wrapper template is loaded anywhere yet, so the "_wrapper.v"
+        # file cannot be generated here until such a template is provided
+
+        # set ipgen_path and ip_path so that HLS-Synth transformation
+        # and stich_ip transformation do not complain
+        self.set_nodeattr("ipgen_path", code_gen_dir)
+        self.set_nodeattr("ip_path", code_gen_dir)
+
+    def prepare_codegen_default(self):
+        """Return (template_path, code_gen_dict) for the default MVAU template;
+        every dictionary value is a one-element list of str, ready to be joined."""
+        # TODO: Differentiate between PE folding and fully unrolled along MH dimension
+        root = os.environ["FINN_ROOT"]
+        template_path = root + "/finn-rtllib/mvau/dsp58_mvau_template.vhdl"
+        params = {
+            "$PE$": self.get_nodeattr("PE"),
+            "$SIMD$": self.get_nodeattr("SIMD"),
+            "$MW$": self.get_nodeattr("MW"),
+            "$MH$": self.get_nodeattr("MH"),
+            "$ACTIVATION_WIDTH$": self.get_input_datatype().bitwidth(),
+            "$WEIGHT_WIDTH$": self.get_weight_datatype().bitwidth(),
+            "$ACCU_WIDTH_BA$": self.get_output_datatype().bitwidth(),
+        }
+        return template_path, {key: [str(val)] for key, val in params.items()}
+

From afab9cd6543b4fe1f612c329074d30d59706ac08 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 6 Apr 2023 12:34:01 +0100
Subject: [PATCH 002/112] [rtl custom op]: initial implementation of mvu_8sx9

---
 finn-rtllib/mvu/mvu_8sx9.sv | 284 ++++++++++++++++++++++++++++++++++++
 1 file changed, 284 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_8sx9.sv

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
new file mode 100644
index 0000000000..c992990d9f
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -0,0 +1,284 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Matrix Vector Unit (MVU) core compute kernel utilizing DSP58.
+ *****************************************************************************/
+
+module mvu_8sx9 #(
+    int unsigned PE,                // number of DSP chains; chain j drives p[j]
+    int unsigned SIMD,              // number of activation/weight lanes; three lanes are packed per DSP
+    int unsigned ACTIVATION_WIDTH,  // activations are extended to 9 bits (PAD_BITS_ACT = 9 - ACTIVATION_WIDTH)
+    int unsigned WEIGHT_WIDTH,      // weights are sign-extended to 8 bits (PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH)
+    bit SIGNED_ACTIVATIONS = 0,     // 1: sign-extend activations, 0: zero-extend them
+    int unsigned SEGMENTLEN = 0 // Default to 0 (which implies a single segment)
+  )
+  (
+    input   logic clk,
+    input   logic rst,              // synchronous reset of the shift registers and DSP registers
+    input   logic en,               // clock enable of the shift registers and DSP registers
+    input   logic last,             // shifted through L; L[1] feeds OPMODE of the last DSP, L[0] is vld
+    input   logic zero,             // drives INMODE[1] of the DSPs, delayed through Z for later segments
+    input   logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a,
+    input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w,
+    output  logic vld,
+    output  logic [PE-1:0][57:0] p  // P output of the last DSP of each chain
+  );
+// Each of the PE chains consists of CHAINLEN cascaded DSP58s (PCOUT -> PCIN); the P output of the last DSP in chain j drives p[j].
+//-------------------- Declare global signals --------------------\\
+localparam int unsigned CHAINLEN = (SIMD+2)/3; // DSPs per chain: each DSP takes three SIMD lanes
+localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
+uwire [26:0] a_in_i [CHAINLEN];
+uwire [23:0] b_in_i [PE][CHAINLEN];
+uwire [57:0] pcout [PE][CHAINLEN];
+
+//-------------------- Shift register for opmode select signal --------------------\\
+localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
+logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric)
+
+always_ff @(posedge clk) begin
+  if(rst)     L <= '{default: 0};
+  else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last };
+end
+assign vld = L[0];
+
+//-------------------- Shift register for ZERO flag --------------------\\
+logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric)
+
+if (MAX_PIPELINE_STAGES > 1) begin : genZreg
+  always_ff @(posedge clk) begin
+    if (rst)      Z <= '{default: 0};
+    else if(en) begin
+        Z[0] <= zero;
+        if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3]; // shift within the declared range [0:MAX_PIPELINE_STAGES-2]
+    end
+  end
+end
+
+//-------------------- Buffer for input activations --------------------\\
+localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
+typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t;
+// NOTE(review): a[3*i+2] and w[j][3*i+2] are read for every i < CHAINLEN, which looks out of range when SIMD is not a multiple of 3 - confirm.
+for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
+  localparam int TOTAL_PREGS = i/SEGLEN;
+  localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
+
+  if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
+    a_buffer_t A [0:EXTERNAL_PREGS-1];
+    always_ff @(posedge clk) begin
+      if (rst)     A <= '{default: 0};
+      else if(en) begin
+        A[EXTERNAL_PREGS-1] <= a[3*i +: 3];
+        if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
+      end
+    end
+    assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]}
+                             : { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ;
+  end : genExternalPregAct
+  else begin : genInpDSPAct
+    assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]}
+                             : { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ;
+  end : genInpDSPAct
+
+end : genActSIMD
+
+//-------------------- Buffer for weights --------------------\\
+localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;
+typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t;
+
+for (genvar j=0; j<PE; j++) begin : genWeightPE
+  for (genvar i=0; i<CHAINLEN; i++) begin : genWeightSIMD
+    localparam int TOTAL_PREGS = i/SEGLEN;
+    localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
+
+    if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
+      b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1];
+      always_ff @(posedge clk) begin
+        if (rst)    B <= '{default: 0};
+        else if (en) begin
+          B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3];
+          if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1];
+        end
+      end
+      assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] };
+    end : genExternalPregWeight
+    else begin : genInpDSPWeight
+      assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] };
+    end : genInpDSPWeight
+  end : genWeightSIMD
+
+end : genWeightPE
+
+//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\
+for (genvar j=0; j<PE; j++) begin : genDSPPE
+  for (genvar i=0; i<CHAINLEN; i++) begin : genDSPChain
+    localparam int TOTAL_PREGS = i/SEGLEN;
+    localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1; // 1 : 0
+    localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1;
+    localparam bit FIRST = i == 0;
+    localparam bit LAST = i == CHAINLEN-1;
+    uwire [57:0] pp;
+
+    if (LAST) begin : genPOUT
+      assign p[j] = pp;
+    end
+
+    DSP58 #(
+      // Feature Control Attributes: Data Path Selection
+      .AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
+      .A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+      .BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
+      .B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+      .DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
+                                          // legacy mode.
+      .PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
+      .RND(58'h000000000000000),          // Rounding Constant
+      .USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+      .USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
+      .USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
+      .XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
+      // Pattern Detector Attributes: Pattern Detection Configuration
+      .AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+      .AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
+      .MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
+      .PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
+      .SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+      .SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
+      .USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
+      // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+      .IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
+      .IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
+      .IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
+      .IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
+      .IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
+      .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0
+                            FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN
+                            2'b01, // Y : M
+                            2'b01  // X: M
+        }), // Optional inversion for OPMODE
+      .IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
+      .IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
+      .IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
+      .IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
+      .IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for STCONJUGATE_A
+      .IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
+      .IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
+      .IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
+      .IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
+      .IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
+      // Register Control Attributes: Pipeline Register Configuration
+      .ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+      .ADREG(0),                          // Pipeline stages for pre-adder (0-1)
+      .ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
+      .AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
+      .BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+      .BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
+      .CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
+      .CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
+      .CREG(0),                           // Pipeline stages for C (0-1)
+      .DREG(0),                           // Pipeline stages for D (0-1)
+      .INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
+      .MREG(1),                           // Multiplier pipeline stages (0-1)
+      .OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
+      .PREG(PREG),                        // Number of pipeline stages for P (0-1)
+      .RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
+    )
+    DSP58_inst (
+      // Cascade outputs: Cascade Ports
+      .ACOUT(),                           // 34-bit output: A port cascade
+      .BCOUT(),                           // 24-bit output: B cascade
+      .CARRYCASCOUT(),                    // 1-bit output: Cascade carry
+      .MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
+      .PCOUT(pcout[j][i]),                // 58-bit output: Cascade output
+      // Control outputs: Control Inputs/Status Bits
+      .OVERFLOW(),                        // 1-bit output: Overflow in add/acc
+      .PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
+      .PATTERNDETECT(),                   // 1-bit output: Pattern detect
+      .UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
+      // Data outputs: Data Ports
+      .CARRYOUT(),                        // 4-bit output: Carry
+      .P(pp),                             // 58-bit output: Primary data
+      .XOROUT(),                          // 8-bit output: XOR data
+      // Cascade inputs: Cascade Ports
+      .ACIN('x),                          // 34-bit input: A cascade data
+      .BCIN('x),                          // 24-bit input: B cascade
+      .CARRYCASCIN('x),                   // 1-bit input: Cascade carry
+      .MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
+      .PCIN(FIRST ? 'x : pcout[j][i-1]),  // 58-bit input: P cascade
+      // Control inputs: Control Inputs/Status Bits
+      .ALUMODE(4'h0),                     // 4-bit input: ALU control
+      .CARRYINSEL('0),                    // 3-bit input: Carry select
+      .CLK(clk),                          // 1-bit input: Clock
+      .INMODE({
+              INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
+              2'b00,
+              TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
+              INTERNAL_PREGS==2 ? 1'b0 : 1'b1
+      }),                                 // 5-bit input: INMODE control
+      .NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
+      .OPMODE({
+              LAST ? {1'b0, L[1]} : 2'b00,
+              7'b000_0000
+      }), // 9-bit input: Operation mode
+      // Data inputs: Data Ports
+      .A({ 7'bx, a_in_i[i] }),            // 34-bit input: A data
+      .B(b_in_i[j][i]),                   // 24-bit input: B data
+      .C('x),                             // 58-bit input: C data
+      .CARRYIN('0),                       // 1-bit input: Carry-in
+      .D('x),                             // 27-bit input: D data
+      // Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+      .ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
+      .CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
+      .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
+      .CEAD('0),                          // 1-bit input: Clock enable for ADREG
+      .CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
+      .CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
+      .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
+      .CEC('0),                           // 1-bit input: Clock enable for CREG
+      .CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
+      .CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+      .CED('0),                           // 1-bit input: Clock enable for DREG
+      .CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
+      .CEM(en),                           // 1-bit input: Clock enable for MREG
+      .CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
+      .RSTA(rst),                         // 1-bit input: Reset for AREG
+      .RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
+      .RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
+      .RSTB(rst),                         // 1-bit input: Reset for BREG
+      .RSTC('0),                          // 1-bit input: Reset for CREG
+      .RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+      .RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
+      .RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
+      .RSTM(rst),                         // 1-bit input: Reset for MREG
+      .RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
+    );
+  end : genDSPChain
+end : genDSPPE
+
+endmodule

From a94fc3bb0759ecd4b9af212d1629236894a1b520 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 6 Apr 2023 12:34:22 +0100
Subject: [PATCH 003/112] [rtl custom op]: testbench for mvu_8sx9

---
 finn-rtllib/mvu/mvu_8sx9_tb.sv | 165 +++++++++++++++++++++++++++++++++
 1 file changed, 165 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_8sx9_tb.sv

diff --git a/finn-rtllib/mvu/mvu_8sx9_tb.sv b/finn-rtllib/mvu/mvu_8sx9_tb.sv
new file mode 100644
index 0000000000..ea3ecbbd70
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx9_tb.sv
@@ -0,0 +1,165 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for MVU core compute kernel.
+ *****************************************************************************/
+
+module mvu_8sx9_tb();
+  // Self-checking testbench: feeds random activations and weights to mvu_8sx9 and compares every valid output against a golden model.
+  //-------------------- Simulation parameters --------------------\\
+  // Matrix & parallelism config
+  localparam int unsigned MH = 256;
+  localparam int unsigned PE = 16;
+  localparam int unsigned MW = 600;
+  localparam int unsigned SIMD = 60;
+  localparam int unsigned SEGMENTLEN = 4;
+  // Bit-width config
+  localparam int unsigned ACTIVATION_WIDTH = 8;
+  localparam int unsigned WEIGHT_WIDTH = 4;
+  localparam bit SIGNED_ACTIVATIONS = 1;
+  // Simulation constants
+  localparam int unsigned NF = MH/PE;    // Number of PE-wide row groups of the weight matrix
+  localparam int unsigned SF = MW/SIMD;  // Number of SIMD-wide column groups of the weight matrix
+  localparam int unsigned NUM_OF_DSP = SIMD/3; // Not referenced elsewhere in this testbench
+
+  typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+  typedef activation_t activation_vector_t[SF];
+  // Returns a fully randomized activation vector (SF beats of SIMD elements each).
+  function activation_vector_t init_ACTIVATIONS;
+    automatic activation_vector_t res;
+    void'(std::randomize(res)); // Status result is intentionally discarded
+    return res;
+  endfunction : init_ACTIVATIONS
+
+  typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+  typedef weight_t weight_matrix_t[NF][SF];
+  // Returns a fully randomized weight matrix (NF x SF beats of PE x SIMD elements each).
+  function weight_matrix_t init_WEIGHTS;
+    automatic weight_matrix_t res;
+    void'(std::randomize(res)); // Status result is intentionally discarded
+    return res;
+  endfunction : init_WEIGHTS
+
+  typedef logic signed [PE-1:0][57:0] output_t;
+  typedef output_t output_vector_t [NF];
+  // Golden model: accumulates all MH x MW products; both operands are always interpreted as signed (SIGNED_ACTIVATIONS is not consulted here).
+  function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+    automatic output_vector_t res = '{default: 0};
+    for (int j = 0; j<MH; j++) begin
+      for (int i = 0; i<MW; i++) begin
+        res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+      end
+    end
+    return res;
+  endfunction : check_output
+
+  logic clk = 0;
+  always #5ns clk = !clk; // 10 ns clock period
+
+  logic rst;
+  initial begin
+    rst = 1;
+    repeat(16) @(posedge clk); // Hold reset for 16 rising clock edges
+    rst <= 0;
+  end
+
+  logic last;
+  logic zero;
+  logic vld;
+  activation_t a;
+  weight_t w;
+  output_t p;
+  // Reference signals
+  activation_vector_t ACTIVATIONS; //   [SF-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+  weight_matrix_t WEIGHTS; //           [NF-1:0][SF-1:0][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+  output_vector_t GOLDEN_OUTPUT; //     [NF-1:0][PE-1:0][57:0]
+  // Counter for number of outputs (NF dimension) that are produced
+  int NF_CNT = 0;
+  // Randomized clock enable (forced low during reset); declared ahead of the stimulus process that waits on it.
+  logic en = 0;
+  always_ff @(posedge clk) begin
+    en <= ($urandom()%7 > 1) && !rst;
+  end
+
+  initial begin
+    ACTIVATIONS = init_ACTIVATIONS();
+    WEIGHTS = init_WEIGHTS();
+    GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+    last = 0;
+    zero = 0;
+    a = 'x;
+    w = 'x;
+
+    @(posedge clk iff !rst);
+    // Replay the same SF activation beats for each of the NF weight row groups
+    for (int j=0; j<NF; j++) begin
+      for (int i=0; i<SF; i++) begin
+        last <= (i==SF-1) ? 1 : 0; // Flag the final beat of each dot product
+        a <= ACTIVATIONS[i];
+        w <= WEIGHTS[j][i];
+        @(posedge clk iff en);     // A beat only counts on an enabled clock edge
+      end
+    end
+
+    last <= 0;
+    zero <= 1;
+
+    // Continue until all NF outputs are produced & compared
+    @(posedge clk iff (NF_CNT==NF));
+
+    $finish;
+  end
+
+  // Compare computed output against golden output when vld flag is raised by DUT
+  always_ff @(posedge clk iff (vld && en)) begin
+    foreach(p[i]) begin
+      assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+      else begin
+        $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+        $stop;
+      end
+    end
+    NF_CNT <= NF_CNT + 1; // Non-blocking, so the stimulus process observes the new count on the following edge
+  end
+
+  // Instantiate DUT
+  mvu_8sx9 #(
+      .PE(PE),
+      .SIMD(SIMD),
+      .WEIGHT_WIDTH(WEIGHT_WIDTH),
+      .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+      .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+      .SEGMENTLEN(SEGMENTLEN)
+    )
+    dut (
+      .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p
+    );
+
+endmodule

From 98f9accb40bed3445215e15d30398e09948e0b9f Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 6 Apr 2023 12:35:30 +0100
Subject: [PATCH 004/112] [rtl custom op]: initial implementation of flow
 control component for mvu_8sx9

---
 finn-rtllib/mvu/mvu_8sx9_axi.sv | 179 ++++++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi.sv

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv
new file mode 100644
index 0000000000..8765c50a26
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx9_axi.sv
@@ -0,0 +1,179 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Matrix Vector Unit (MVU) AXI-Stream interface wrapper.
+ *****************************************************************************/
+
+module mvu_8sx9_axi #(
+    int unsigned MW,                 // Matrix width; checked below to be a multiple of SIMD
+    int unsigned MH,                 // Matrix height; checked below to be a multiple of PE
+    int unsigned PE,
+    int unsigned SIMD,
+    int unsigned ACTIVATION_WIDTH,   // Checked below: at most 9 bits (8 when unsigned)
+    int unsigned WEIGHT_WIDTH,       // Checked below: at most 8 bits
+    int unsigned ACCU_WIDTH,         // Low-order bits of each 58-bit core result lane forwarded to the output
+    bit SIGNED_ACTIVATIONS = 0,
+    int unsigned SEGMENTLEN = 0,     // 0 is reported below as defaulting to the chain length
+		parameter RAM_STYLE = "auto",
+    // Derived constants; the _BA suffix marks widths rounded up to a whole number of bytes
+    localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+    localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+		localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
+		localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
+    localparam int unsigned SF = MW/SIMD,
+		localparam int unsigned NF = MH/PE,
+    localparam int unsigned OUTPUT_LANES = PE,
+    localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+)
+(
+	// Global Control
+	input		logic  ap_clk,
+	input		logic  ap_rst_n,	// Active-low; inverted into rst below
+
+	// Weight Stream
+	input		logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input		logic  s_axis_weights_tvalid,
+	output	logic  s_axis_weights_tready,
+
+	// Input Stream
+	input		logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input		logic  s_axis_input_tvalid,
+	output	logic  s_axis_input_tready,
+
+	// Output Stream
+	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
+	output	logic  m_axis_output_tvalid,
+	input		logic  m_axis_output_tready
+);
+
+//-------------------- Parameter sanity checks --------------------\\
+	initial begin
+		if (MW % SIMD != 0) begin
+		$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
+		$finish;
+		end
+		if (MH % PE != 0) begin
+		$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
+		$finish;
+		end
+		if (ACTIVATION_WIDTH > 9) begin
+		$error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
+		$finish;
+		end
+		if (WEIGHT_WIDTH > 8) begin
+		$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
+		$finish;
+		end
+		if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin
+		$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
+		$finish;
+		end
+		if (SEGMENTLEN == 0) begin
+		$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
+		end
+		if (SEGMENTLEN > (SIMD+2)/3) begin
+		$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+		$finish;
+		end
+	end
+
+	uwire clk = ap_clk;
+	uwire rst = !ap_rst_n;
+
+	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;	// Input beat without the byte-alignment padding
+
+	uwire mvauin_t amvau;
+	uwire alast;
+	uwire afin;	// Connected to the replay buffer's ofin; not used further in this module
+	uwire avld;
+	uwire ardy;
+	// Replays each SF-beat input sequence NF times for the core
+	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay (
+		.clk, .rst,
+		.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
+		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
+	);
+
+	//-------------------- Input control --------------------\\
+	uwire en;
+	uwire istb = avld && s_axis_weights_tvalid;	// Both operands are available this cycle
+	assign ardy = en && s_axis_weights_tvalid;	// Consume an activation only together with a weight beat
+	assign s_axis_weights_tready = en && avld;	// Consume a weight beat only together with an activation
+
+	//-------------------- Core MVU --------------------\\
+	uwire ovld;
+	uwire [PE-1:0][57:0] odat;
+	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;	// Weight beat without the byte-alignment padding
+	mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+	.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core (
+		.clk, .rst, .en,
+		.last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+		.vld(ovld), .p(odat)
+	);
+
+	//-------------------- Output register slice --------------------\\
+	struct {
+		logic vld;
+		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+	} A = '{ vld: 0, default: 'x};	// Stage A: captures the core result
+
+	assign en = !A.vld || !ovld;	// Core is stalled only while it presents a result and stage A is still occupied
+
+	uwire  b_load;
+	always_ff @(posedge clk) begin
+		if(rst)  A <= '{ vld: 0, default: 'x };
+		else if(!A.vld || b_load) begin
+			A.vld <= ovld && en;
+			for(int unsigned  i = 0; i < PE; i++) begin
+				// CR-1148862:
+				// A.dat[i] <= odat[i];
+				automatic logic [57:0]  v = odat[i];
+				A.dat[i] <= v[ACCU_WIDTH-1:0];	// Keep only the low ACCU_WIDTH bits of each lane
+			end
+		end
+	end
+	
+	struct {
+		logic vld;
+		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+	} B = '{ vld: 0, default: 'x};	// Stage B: drives the output stream
+
+	assign	b_load = !B.vld || m_axis_output_tready;	// B reloads when it is empty or its content is being taken
+	always_ff @(posedge clk) begin
+		if(rst)		B <= '{ vld: 0, default: 'x };	// Clear vld explicitly so m_axis_output_tvalid is defined after reset (as for stage A)
+		else begin
+			if(b_load)	 B <= '{ vld: A.vld, dat: A.dat};
+		end	
+	end
+
+	assign	m_axis_output_tvalid = B.vld;
+	assign	m_axis_output_tdata  = B.dat;
+
+endmodule
\ No newline at end of file

From 96925a929877ce084466438128678250b09784a9 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 6 Apr 2023 12:36:00 +0100
Subject: [PATCH 005/112] [rtl custom op]: implementation of replay buffer for
 mvu

---
 finn-rtllib/mvu/replay_buffer.sv | 109 +++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 finn-rtllib/mvu/replay_buffer.sv

diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
new file mode 100644
index 0000000000..685ac03137
--- /dev/null
+++ b/finn-rtllib/mvu/replay_buffer.sv
@@ -0,0 +1,109 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Replay buffer for counted sequences on an AXI-lite stream.
+ * @author	Thomas B. Preußer <thomas.preusser@amd.com>
+ *****************************************************************************/
+
+module replay_buffer #(
+	int unsigned  LEN,	// Sequence length
+	int unsigned  REP,	// Sequence replay count
+	int unsigned  W,	// Data width
+	parameter RAM_STYLE = "auto" 	// ram style for buffer {block, distributed, ultra, auto}
+)(
+	input	logic  clk,
+	input	logic  rst,
+	// Input side: data is only accepted during the first repetition (see irdy below)
+	input	logic [W-1:0]  idat,
+	input	logic  ivld,
+	output	logic  irdy,
+	// Output side: olast follows done_len, ofin follows done_all
+	output	logic [W-1:0]  odat,
+	output	logic  olast,
+	output	logic  ofin,
+	output	logic  ovld,
+	input	logic  ordy
+);
+	// Count packs two fields: the low $clog2(LEN) bits feed done_len, the bits above feed done_rep.
+	typedef logic [$clog2(REP)+$clog2(LEN)-1:0]  count_t;
+	count_t  Count = 0;
+	uwire  done_len = ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0;	// True once every bit set in LEN-1 is also set in the low field. NOTE(review): for LEN == 1 this part-select is Count[-1:0] - confirm tool behavior
+	uwire  done_rep;
+	uwire  done_all = done_len && done_rep;
+
+	uwire  shift;	// One element is transferred on the output side this cycle
+	uwire  clr = rst || (done_all && shift);	// Restart on reset or when the final element of the final repetition is transferred
+	always_ff @(posedge clk) begin
+		if(clr)         Count <= 0;
+		else if(shift)  Count <= Count + ((REP > 1) && done_len? 2**$clog2(LEN)-LEN+1 : 1);	// At a sequence end with REP > 1 the larger step is applied: starting from a low field of LEN-1 it lands on the next multiple of 2**$clog2(LEN)
+	end
+
+	typedef logic [W-1:0]  data_t;
+	uwire data_t  rdat;	// Data read back from the buffer
+	uwire  first_rep;	// High while the sequence is being passed through from the input
+	if(REP == 1) begin
+		assign	done_rep  = 1;
+		assign	first_rep = 1;
+		assign	rdat = 'x;	// No buffer is generated in this branch
+	end
+	else begin
+		assign	done_rep = ((REP-1) & ~Count[$left(Count):$clog2(LEN)]) == 0;	// Same all-bits-set test as done_len, applied to the upper field against REP-1
+
+		logic  FirstRep = 1;
+		always_ff @(posedge clk) begin
+			if(clr)         FirstRep <= 1;
+			else if(shift)  FirstRep <= FirstRep && !done_len;	// Drops on the transfer that has done_len set and stays low until clr
+		end
+		assign	first_rep = FirstRep;
+
+		(* RAM_STYLE = RAM_STYLE *)
+		data_t  Buf[LEN];
+		if(LEN == 1) begin : genTrivial
+			always_ff @(posedge clk) begin
+				if(shift && FirstRep)  Buf[0] <= idat;	// Single entry: written only while FirstRep is set
+			end
+		end : genTrivial
+		else begin : genShift
+			always_ff @(posedge clk) begin
+				if(shift)  Buf <= { odat, Buf[0:LEN-2] };	// Insert the current output at index 0 and move every entry up by one
+			end
+		end : genShift
+
+		assign	rdat = Buf[LEN-1];	// Read from the last buffer entry
+	end
+
+	assign  irdy  = ordy && first_rep;	// Input is consumed only while first_rep is high
+	assign	odat  = first_rep? idat : rdat;	// Pass-through while first_rep is high, buffered data otherwise
+	assign	olast = done_len;
+	assign	ofin  = done_all;
+	assign	ovld  = first_rep? ivld : 1;	// Output is always valid once first_rep is low
+	assign	shift = ovld && ordy;
+
+endmodule : replay_buffer
\ No newline at end of file

From a3d11567468899bbcf33c83b509c26f908a807a3 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 6 Apr 2023 12:37:16 +0100
Subject: [PATCH 006/112] [rtl custom op]: testbench for mvu_8sx9_axi
 (including axi_wrapper & compute kernel)

---
 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv | 208 +++++++++++++++++++++++++++++
 1 file changed, 208 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
new file mode 100644
index 0000000000..ea97e0708c
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
@@ -0,0 +1,208 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for MVU AXI-Stream interface wrapper.
+ *****************************************************************************/
+
+module mvu_8sx9_axi_tb();
+  // Self-checking testbench: drives random weight and activation streams into mvu_8sx9_axi and compares each output beat against a golden model.
+  //-------------------- Simulation parameters --------------------\\
+  // Matrix & parallelism config
+  localparam int unsigned MW = 600;
+  localparam int unsigned MH = 256;
+  localparam int unsigned SIMD = 60;
+  localparam int unsigned PE = 16;
+  localparam int unsigned SEGMENTLEN = 4;
+  // Bit-width config
+  localparam int unsigned ACTIVATION_WIDTH = 8;
+  localparam int unsigned WEIGHT_WIDTH = 4;
+  localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
+  localparam bit SIGNED_ACTIVATIONS = 1;
+  // Simulation constants
+  localparam int unsigned NF = MH/PE;
+  localparam int unsigned SF = MW/SIMD;
+  localparam int unsigned NUM_OF_DSP = SIMD/3;
+  localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;        // Weight beat width rounded up to whole bytes
+  localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8;   // Activation beat width rounded up to whole bytes
+  localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;          // Padding bits added at the DUT port
+  localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; // Padding bits added at the DUT port
+  localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
+
+  // Generate clk and reset signal
+  logic clk = 0;
+  always #5ns clk = !clk;
+
+  logic ap_rst_n = 0;
+  initial begin
+    repeat(16) @(posedge clk); // Keep the DUT in reset for 16 rising clock edges
+    ap_rst_n <= 1;
+  end
+
+  uwire ap_clk = clk;
+
+  // Generate activations
+  typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+  typedef activation_t activation_vector_t[SF];
+
+  function activation_vector_t init_ACTIVATIONS;
+    automatic activation_vector_t res;
+    void'(std::randomize(res)); // Status result is intentionally discarded
+    return res;
+  endfunction : init_ACTIVATIONS
+
+  activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
+
+  struct {
+    activation_t dat;
+    logic vld;
+    logic rdy;
+  } activations;
+  // Activation driver: presents each of the SF beats once, with valid re-randomized every clock cycle
+  initial begin
+    activations.vld = 0;
+    activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
+    @(posedge clk iff ap_rst_n);
+
+    for (int i=0; i<SF; i++) begin
+      activations.dat <= ACTIVATIONS[i];
+      do begin
+        activations.vld <= $urandom()%7 > 1; // Non-blocking like dat, so the DUT's clocked logic never races this update
+        @(posedge clk);
+      end while (!(activations.vld === 1 && activations.rdy === 1)); // Repeat until the beat was accepted
+    end
+
+    activations.vld <= 0;
+    activations.dat <= 'x;
+  end
+
+  // Generate weights
+  typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+  typedef weight_t weight_matrix_t[NF][SF];
+
+  function weight_matrix_t init_WEIGHTS;
+    automatic weight_matrix_t res;
+    void'(std::randomize(res)); // Status result is intentionally discarded
+    return res;
+  endfunction : init_WEIGHTS
+
+  weight_matrix_t WEIGHTS = init_WEIGHTS();
+
+  struct {
+    weight_t dat;
+    logic vld;
+    logic rdy;
+  } weights;
+  // Weight driver: keeps valid high and steps through all NF x SF beats, advancing on clock edges where rdy is high
+  initial begin
+    weights.vld = 0;
+    weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
+    @(posedge clk iff ap_rst_n);
+
+    weights.vld <= 1;
+    for (int i=0; i<NF; i++) begin
+      for (int j=0; j<SF; j++) begin
+        weights.dat <= WEIGHTS[i][j];
+        @(posedge clk iff weights.rdy);
+      end
+    end
+
+    weights.vld <= 0;
+    weights.dat <= 'x;
+  end
+
+  // Function to compute golden output
+  // a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+  // w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+  typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
+  typedef output_t output_vector_t [NF];
+
+  struct {
+    output_t dat;
+    logic vld;
+    logic rdy;
+  } outputs;
+
+  function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+    automatic output_vector_t res = '{default: 0};
+    for (int j = 0; j<MH; j++) begin
+      for (int i = 0; i<MW; i++) begin
+        res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]); // Both operands are always treated as signed here
+      end
+    end
+    return res;
+  endfunction : check_output
+
+  output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+  // Output checker: randomizes ready, compares every accepted beat against the golden model, and finishes after NF beats
+  int unsigned NF_CNT = 0;
+  initial begin
+    outputs.rdy = 0;
+    while (NF_CNT < NF) begin
+      // Loop until both rdy & vld are asserted
+      do begin
+        outputs.rdy <= $urandom()%7 >= 1;
+        @(posedge clk iff ap_rst_n);
+      end while (!(outputs.rdy === 1 && outputs.vld === 1));
+
+      // Compare produced outputs against golden outputs
+      foreach(outputs.dat[i]) begin
+        assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+        else begin
+          $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+          $stop;
+        end
+      end
+
+      NF_CNT += 1;
+    end
+
+    $finish;
+  end
+
+  // Instantiate DUT
+  mvu_8sx9_axi #(
+      .MW(MW),
+      .MH(MH),
+      .PE(PE),
+      .SIMD(SIMD),
+      .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+      .WEIGHT_WIDTH(WEIGHT_WIDTH),
+      .ACCU_WIDTH(ACCU_WIDTH),
+      .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+      .SEGMENTLEN(SEGMENTLEN)
+    )
+    dut (
+      .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
+      .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
+      .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
+      .m_axis_output_tready(outputs.rdy)
+    );
+
+endmodule

From 2aea664b2260a4ea759909d0a3168b5f62b114a2 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 6 Apr 2023 12:37:55 +0100
Subject: [PATCH 007/112] [rtl custom op]: initial implementation of verilog
 wrapper for mvu_8sx9_axi

---
 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 90 ++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
new file mode 100644
index 0000000000..ff3779d211
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
@@ -0,0 +1,90 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Verilog AXI-Stream wrapper for MVU.
+ *****************************************************************************/
+
+module $MODULE_NAME_AXI_WRAPPER$ #(	// $...$ tokens are template placeholders, presumably substituted by the code generator before compilation
+	parameter 	MW = $MW$,
+	parameter		MH = $MH$,
+	parameter 	PE = $PE$,
+	parameter 	SIMD = $SIMD$,
+	parameter 	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
+	parameter 	WEIGHT_WIDTH = $WEIGHT_WIDTH$,
+	parameter 	ACCU_WIDTH = $ACCU_WIDTH$,
+	parameter 	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
+	parameter 	SEGMENTLEN = $SEGMENTLEN$,
+	parameter 	RAM_STYLE = $IBUF_RAM_STYLE$,
+
+	// Safely deducible parameters
+	parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,	// Rounded up to whole bytes
+	parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,	// Rounded up to whole bytes
+	parameter OUTPUT_LANES = PE,
+	parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8	// Rounded up to whole bytes
+)(
+  // Global Control
+	input		ap_clk,
+	input		ap_rst_n,
+
+	// Weight Stream
+	input		[WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input		s_axis_weights_tvalid,
+	output	s_axis_weights_tready,
+
+	// Input Stream
+	input		[INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input		s_axis_input_tvalid,
+	output	s_axis_input_tready,
+
+	// Output Stream
+	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
+	output	m_axis_output_tvalid,
+	input		m_axis_output_tready
+);
+// Pure pass-through: every port is wired one-to-one to the mvu_8sx9_axi instance; ports are plain nets so the file parses as Verilog.
+mvu_8sx9_axi #(
+	.MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+	.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+	.SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE)
+	) inst (
+	.ap_clk(ap_clk),
+	.ap_rst_n(ap_rst_n),
+	.s_axis_weights_tdata(s_axis_weights_tdata),
+	.s_axis_weights_tvalid(s_axis_weights_tvalid),
+	.s_axis_weights_tready(s_axis_weights_tready),
+	.s_axis_input_tdata(s_axis_input_tdata),
+	.s_axis_input_tvalid(s_axis_input_tvalid),
+	.s_axis_input_tready(s_axis_input_tready),
+	.m_axis_output_tdata(m_axis_output_tdata),
+	.m_axis_output_tvalid(m_axis_output_tvalid),
+	.m_axis_output_tready(m_axis_output_tready)
+);
+
+endmodule // $MODULE_NAME_AXI_WRAPPER$
\ No newline at end of file

From 8b57849bb47c3119b177e78dcbaa48954f69b811 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Tue, 11 Apr 2023 15:50:24 +0100
Subject: [PATCH 008/112] [rtl mvu]: fix tab indentation

---
 finn-rtllib/mvu/mvu_8sx9.sv            | 424 ++++++++++++-------------
 finn-rtllib/mvu/mvu_8sx9_axi.sv        |  32 +-
 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv     | 342 ++++++++++----------
 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v |  26 +-
 finn-rtllib/mvu/mvu_8sx9_tb.sv         | 258 +++++++--------
 5 files changed, 541 insertions(+), 541 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index c992990d9f..d082d4fb2e 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -52,233 +52,233 @@ module mvu_8sx9 #(
   );
 
 //-------------------- Declare global signals --------------------\\
-localparam int unsigned CHAINLEN = (SIMD+2)/3;
-localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
-uwire [26:0] a_in_i [CHAINLEN];
-uwire [23:0] b_in_i [PE][CHAINLEN];
-uwire [57:0] pcout [PE][CHAINLEN];
+	localparam int unsigned CHAINLEN = (SIMD+2)/3;
+	localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
+	uwire [26:0] a_in_i [CHAINLEN];
+	uwire [23:0] b_in_i [PE][CHAINLEN];
+	uwire [57:0] pcout [PE][CHAINLEN];
 
 //-------------------- Shift register for opmode select signal --------------------\\
-localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
-logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric)
+	localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
+	logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric)
 
-always_ff @(posedge clk) begin
-  if(rst)     L <= '{default: 0};
-  else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last };
-end  
-assign vld = L[0];
+	always_ff @(posedge clk) begin	// Shift register carrying 'last' towards the 'vld' output
+		if(rst)     L <= '{default: 0};
+		else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last };	// 'last' enters at the highest index; every entry moves one index towards 0 per enabled cycle
+	end  
+	assign vld = L[0];	// valid is the oldest entry of the shift register
 
 //-------------------- Shift register for ZERO flag --------------------\\
-logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric)
+	logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric)
 
-if (MAX_PIPELINE_STAGES > 1) begin : genZreg
-  always_ff @(posedge clk) begin
-    if (rst)      Z <= '{default: 0};
-    else if(en) begin
-        Z[0] <= zero;
-        if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2];
-    end    
-  end
-end;
+	if (MAX_PIPELINE_STAGES > 1) begin : genZreg	// Delay line for 'zero'; only generated when more than one pipeline stage exists
+		always_ff @(posedge clk) begin
+			if (rst)      Z <= '{default: 0};
+			else if(en) begin
+				Z[0] <= zero;	// newest flag enters at index 0
+				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3];	// shift by one index; bounds stay inside Z's declared range [0:MAX_PIPELINE_STAGES-2]
+			end
+		end
+	end;
 
 //-------------------- Buffer for input activations --------------------\\
-localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
-typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t;
+	localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
+	typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t;
 
-for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
-  localparam int TOTAL_PREGS = i/SEGLEN;
-  localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
-  
-  if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
-    a_buffer_t A [0:EXTERNAL_PREGS-1];
-    always_ff @(posedge clk) begin
-      if (rst)     A <= '{default: 0};
-      else if(en) begin
-        A[EXTERNAL_PREGS-1] <= a[3*i +: 3];
-        if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
-      end
-    end
-    assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]}
-                             : { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ;
-  end : genExternalPregAct
-  else begin : genInpDSPAct
-    assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]}
-                             : { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ;
-  end : genInpDSPAct
+	for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
+		localparam int TOTAL_PREGS = i/SEGLEN;
+		localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
 
-end : genActSIMD
+		if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
+			a_buffer_t A [0:EXTERNAL_PREGS-1];
+			always_ff @(posedge clk) begin
+				if (rst)     A <= '{default: 0};
+				else if(en) begin
+					A[EXTERNAL_PREGS-1] <= a[3*i +: 3];
+					if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
+				end
+			end
+			assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]}
+									: { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ;
+		end : genExternalPregAct
+		else begin : genInpDSPAct
+			assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]}
+									: { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ;
+		end : genInpDSPAct
+
+	end : genActSIMD
 
 //-------------------- Buffer for weights --------------------\\
-localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;
-typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t;
+	localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;
+	typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t;
 
-for (genvar j=0; j<PE; j++) begin : genWeightPE
-  for (genvar i=0; i<CHAINLEN; i++) begin : genWeightSIMD
-    localparam int TOTAL_PREGS = i/SEGLEN;
-    localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
-    
-    if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
-      b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1];
-      always_ff @(posedge clk) begin
-        if (rst)    B <= '{default: 0};
-        else if (en) begin
-          B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3];
-          if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1];
-        end
-      end
-      assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] };
-    end : genExternalPregWeight
-    else begin : genInpDSPWeight
-      assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] };
-    end : genInpDSPWeight
-  end : genWeightSIMD
+	for (genvar j=0; j<PE; j++) begin : genWeightPE
+		for (genvar i=0; i<CHAINLEN; i++) begin : genWeightSIMD
+			localparam int TOTAL_PREGS = i/SEGLEN;
+			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
 
-end : genWeightPE
+			if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
+				b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1];
+				always_ff @(posedge clk) begin
+					if (rst)    B <= '{default: 0};
+					else if (en) begin
+						B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3];
+						if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1];
+					end
+				end
+				assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] };
+			end : genExternalPregWeight
+			else begin : genInpDSPWeight
+				assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] };
+			end : genInpDSPWeight
+		end : genWeightSIMD
+
+	end : genWeightPE
 
 //-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\
-for (genvar j=0; j<PE; j++) begin : genDSPPE
-  for (genvar i=0; i<CHAINLEN; i++) begin : genDSPChain
-    localparam int TOTAL_PREGS = i/SEGLEN;
-    localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1; // 1 : 0
-    localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1;
-    localparam bit FIRST = i == 0;
-    localparam bit LAST = i == CHAINLEN-1;
-    uwire [57:0] pp;
-    
-    if (LAST) begin : genPOUT
-      assign p[j] = pp;
-    end      
-    
-    DSP58 #(
-      // Feature Control Attributes: Data Path Selection
-      .AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
-      .A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
-      .BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
-      .B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
-      .DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
-                                          // legacy mode.
-      .PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
-      .RND(58'h000000000000000),          // Rounding Constant
-      .USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
-      .USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
-      .USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
-      .XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
-      // Pattern Detector Attributes: Pattern Detection Configuration
-      .AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
-      .AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
-      .MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
-      .PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
-      .SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
-      .SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
-      .USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
-      // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
-      .IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
-      .IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
-      .IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
-      .IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
-      .IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
-      .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 
-                            FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN 
-                            2'b01, // Y : M
-                            2'b01  // X: M
-        }), // Optional inversion for OPMODE
-      .IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
-      .IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
-      .IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
-      .IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
-      .IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for STCONJUGATE_A
-      .IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
-      .IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
-      .IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
-      .IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
-      .IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
-      // Register Control Attributes: Pipeline Register Configuration
-      .ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
-      .ADREG(0),                          // Pipeline stages for pre-adder (0-1)
-      .ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
-      .AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
-      .BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
-      .BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
-      .CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
-      .CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
-      .CREG(0),                           // Pipeline stages for C (0-1)
-      .DREG(0),                           // Pipeline stages for D (0-1)
-      .INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
-      .MREG(1),                           // Multiplier pipeline stages (0-1)
-      .OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
-      .PREG(PREG),                        // Number of pipeline stages for P (0-1)
-      .RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
-    )
-    DSP58_inst (
-      // Cascade outputs: Cascade Ports
-      .ACOUT(),                           // 34-bit output: A port cascade
-      .BCOUT(),                           // 24-bit output: B cascade
-      .CARRYCASCOUT(),                    // 1-bit output: Cascade carry
-      .MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
-      .PCOUT(pcout[j][i]),                // 58-bit output: Cascade output
-      // Control outputs: Control Inputs/Status Bits
-      .OVERFLOW(),                        // 1-bit output: Overflow in add/acc
-      .PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
-      .PATTERNDETECT(),                   // 1-bit output: Pattern detect
-      .UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
-      // Data outputs: Data Ports
-      .CARRYOUT(),                        // 4-bit output: Carry
-      .P(pp),                             // 58-bit output: Primary data
-      .XOROUT(),                          // 8-bit output: XOR data
-      // Cascade inputs: Cascade Ports
-      .ACIN('x),                          // 34-bit input: A cascade data
-      .BCIN('x),                          // 24-bit input: B cascade
-      .CARRYCASCIN('x),                   // 1-bit input: Cascade carry
-      .MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
-      .PCIN(FIRST ? 'x : pcout[j][i-1]),  // 58-bit input: P cascade
-      // Control inputs: Control Inputs/Status Bits
-      .ALUMODE(4'h0),                     // 4-bit input: ALU control
-      .CARRYINSEL('0),                    // 3-bit input: Carry select
-      .CLK(clk),                          // 1-bit input: Clock
-      .INMODE({
-              INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
-              2'b00,
-              TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
-              INTERNAL_PREGS==2 ? 1'b0 : 1'b1        
-      }),                                 // 5-bit input: INMODE control
-      .NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
-      .OPMODE({
-              LAST ? {1'b0, L[1]} : 2'b00,
-              7'b000_0000
-      }), // 9-bit input: Operation mode
-      // Data inputs: Data Ports
-      .A({ 7'bx, a_in_i[i] }),            // 34-bit input: A data
-      .B(b_in_i[j][i]),                   // 24-bit input: B data
-      .C('x),                             // 58-bit input: C data
-      .CARRYIN('0),                       // 1-bit input: Carry-in
-      .D('x),                             // 27-bit input: D data
-      // Reset/Clock Enable inputs: Reset/Clock Enable Inputs
-      .ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
-      .CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
-      .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
-      .CEAD('0),                          // 1-bit input: Clock enable for ADREG
-      .CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
-      .CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
-      .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
-      .CEC('0),                           // 1-bit input: Clock enable for CREG
-      .CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
-      .CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
-      .CED('0),                           // 1-bit input: Clock enable for DREG
-      .CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
-      .CEM(en),                           // 1-bit input: Clock enable for MREG
-      .CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
-      .RSTA(rst),                         // 1-bit input: Reset for AREG
-      .RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
-      .RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
-      .RSTB(rst),                         // 1-bit input: Reset for BREG
-      .RSTC('0),                          // 1-bit input: Reset for CREG
-      .RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
-      .RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
-      .RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
-      .RSTM(rst),                         // 1-bit input: Reset for MREG
-      .RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
-    );
-  end : genDSPChain  
-end : genDSPPE
+	for (genvar j=0; j<PE; j++) begin : genDSPPE	// one chain of CHAINLEN DSP58 slices per PE, linked through PCOUT -> PCIN
+		for (genvar i=0; i<CHAINLEN; i++) begin : genDSPChain
+			localparam int TOTAL_PREGS = i/SEGLEN;	// number of complete segments that precede slice i
+			localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1; // A/B register depth inside the DSP (feeds AREG/BREG/ACASCREG/BCASCREG); NOTE(review): former remark "1 : 0" looks like a discarded alternative - confirm
+			localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1;	// P register enabled on the last slice of each segment and on the last slice of the chain
+			localparam bit FIRST = i == 0;
+			localparam bit LAST = i == CHAINLEN-1;
+			uwire [57:0] pp;
+
+			if (LAST) begin : genPOUT
+				assign p[j] = pp;
+			end      
+
+			DSP58 #(
+				// Feature Control Attributes: Data Path Selection
+				.AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
+				.A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+				.BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
+				.B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+				.DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
+													// legacy mode.
+				.PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
+				.RND(58'h000000000000000),          // Rounding Constant
+				.USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+				.USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
+				.USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
+				.XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
+				// Pattern Detector Attributes: Pattern Detection Configuration
+				.AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+				.AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
+				.MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
+				.PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
+				.SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+				.SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
+				.USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
+				// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+				.IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
+				.IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
+				.IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
+				.IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
+				.IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
+				.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 
+									FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN 
+									2'b01, // Y : M
+									2'b01  // X: M
+				}), // Optional inversion for OPMODE
+				.IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
+				.IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
+				.IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
+				.IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
+				.IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for RSTCTRL
+				.IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
+				.IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
+				.IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
+				.IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
+				.IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
+				// Register Control Attributes: Pipeline Register Configuration
+				.ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+				.ADREG(0),                          // Pipeline stages for pre-adder (0-1)
+				.ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
+				.AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
+				.BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+				.BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
+				.CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
+				.CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
+				.CREG(0),                           // Pipeline stages for C (0-1)
+				.DREG(0),                           // Pipeline stages for D (0-1)
+				.INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
+				.MREG(1),                           // Multiplier pipeline stages (0-1)
+				.OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
+				.PREG(PREG),                        // Number of pipeline stages for P (0-1)
+				.RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
+			)
+			DSP58_inst (
+				// Cascade outputs: Cascade Ports
+				.ACOUT(),                           // 34-bit output: A port cascade
+				.BCOUT(),                           // 24-bit output: B cascade
+				.CARRYCASCOUT(),                    // 1-bit output: Cascade carry
+				.MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
+				.PCOUT(pcout[j][i]),                // 58-bit output: Cascade output
+				// Control outputs: Control Inputs/Status Bits
+				.OVERFLOW(),                        // 1-bit output: Overflow in add/acc
+				.PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
+				.PATTERNDETECT(),                   // 1-bit output: Pattern detect
+				.UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
+				// Data outputs: Data Ports
+				.CARRYOUT(),                        // 4-bit output: Carry
+				.P(pp),                             // 58-bit output: Primary data
+				.XOROUT(),                          // 8-bit output: XOR data
+				// Cascade inputs: Cascade Ports
+				.ACIN('x),                          // 34-bit input: A cascade data
+				.BCIN('x),                          // 24-bit input: B cascade
+				.CARRYCASCIN('x),                   // 1-bit input: Cascade carry
+				.MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
+				.PCIN(FIRST ? 'x : pcout[j][i-1]),  // 58-bit input: P cascade
+				// Control inputs: Control Inputs/Status Bits
+				.ALUMODE(4'h0),                     // 4-bit input: ALU control
+				.CARRYINSEL('0),                    // 3-bit input: Carry select
+				.CLK(clk),                          // 1-bit input: Clock
+				.INMODE({
+						INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
+						2'b00,
+						TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
+						INTERNAL_PREGS==2 ? 1'b0 : 1'b1        
+				}),                                 // 5-bit input: INMODE control
+				.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
+				.OPMODE({
+						LAST ? {1'b0, L[1]} : 2'b00,
+						7'b000_0000
+				}), // 9-bit input: Operation mode
+				// Data inputs: Data Ports
+				.A({ 7'bx, a_in_i[i] }),            // 34-bit input: A data
+				.B(b_in_i[j][i]),                   // 24-bit input: B data
+				.C('x),                             // 58-bit input: C data
+				.CARRYIN('0),                       // 1-bit input: Carry-in
+				.D('x),                             // 27-bit input: D data
+				// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+				.ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
+				.CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
+				.CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
+				.CEAD('0),                          // 1-bit input: Clock enable for ADREG
+				.CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
+				.CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
+				.CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
+				.CEC('0),                           // 1-bit input: Clock enable for CREG
+				.CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
+				.CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+				.CED('0),                           // 1-bit input: Clock enable for DREG
+				.CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
+				.CEM(en),                           // 1-bit input: Clock enable for MREG
+				.CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
+				.RSTA(rst),                         // 1-bit input: Reset for AREG
+				.RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
+				.RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
+				.RSTB(rst),                         // 1-bit input: Reset for BREG
+				.RSTC('0),                          // 1-bit input: Reset for CREG
+				.RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+				.RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
+				.RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
+				.RSTM(rst),                         // 1-bit input: Reset for MREG
+				.RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
+			);
+		end : genDSPChain  
+	end : genDSPPE
     
 endmodule
diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv
index 8765c50a26..6c7eaeaeca 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi.sv
+++ b/finn-rtllib/mvu/mvu_8sx9_axi.sv
@@ -41,36 +41,36 @@ module mvu_8sx9_axi #(
     int unsigned ACCU_WIDTH,
     bit SIGNED_ACTIVATIONS = 0,
     int unsigned SEGMENTLEN = 0,
-		parameter RAM_STYLE = "auto",
+	parameter RAM_STYLE = "auto",
 
     localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
     localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
-		localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
-		localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
+	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
+	localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
     localparam int unsigned SF = MW/SIMD,
-		localparam int unsigned NF = MH/PE,
+	localparam int unsigned NF = MH/PE,
     localparam int unsigned OUTPUT_LANES = PE,
     localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )
 (
 	// Global Control
-	input		logic  ap_clk,
-	input		logic  ap_rst_n,
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,
 
 	// Weight Stream
-	input		logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input		logic  s_axis_weights_tvalid,
+	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	logic  s_axis_weights_tvalid,
 	output	logic  s_axis_weights_tready,
 
 	// Input Stream
-	input		logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input		logic  s_axis_input_tvalid,
+	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	logic  s_axis_input_tvalid,
 	output	logic  s_axis_input_tready,
 
 	// Output Stream
 	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
 	output	logic  m_axis_output_tvalid,
-	input		logic  m_axis_output_tready
+	input	logic  m_axis_output_tready
 );
 
 //-------------------- Parameter sanity checks --------------------\\
@@ -121,13 +121,13 @@ module mvu_8sx9_axi #(
 		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
 	);
 
-	//-------------------- Input control --------------------\\
+//-------------------- Input control --------------------\\
 	uwire en;
 	uwire istb = avld && s_axis_weights_tvalid;
 	assign ardy = en && s_axis_weights_tvalid;
 	assign s_axis_weights_tready = en && avld;
 
-	//-------------------- Core MVU --------------------\\
+//-------------------- Core MVU --------------------\\
 	uwire ovld;
 	uwire [PE-1:0][57:0] odat;
 	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
@@ -138,7 +138,7 @@ module mvu_8sx9_axi #(
 		.vld(ovld), .p(odat)
 	);
 
-	//-------------------- Output register slice --------------------\\
+//-------------------- Output register slice --------------------\\
 	struct {
 		logic vld;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
@@ -148,7 +148,7 @@ module mvu_8sx9_axi #(
 
 	uwire  b_load;
 	always_ff @(posedge clk) begin
-		if(rst)  A <= '{ vld: 0, default: 'x };
+		if(rst)		A <= '{ vld: 0, default: 'x };
 		else if(!A.vld || b_load) begin
 			A.vld <= ovld && en;
 			for(int unsigned  i = 0; i < PE; i++) begin
@@ -169,7 +169,7 @@ module mvu_8sx9_axi #(
 	always_ff @(posedge clk) begin
 		if(rst)		B <= '{ default: 'x };
 		else begin
-			if(b_load)	 B <= '{ vld: A.vld, dat: A.dat};
+			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
 		end	
 	end
 
diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
index ea97e0708c..70ffa096ef 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
@@ -33,176 +33,176 @@
 
 module mvu_8sx9_axi_tb();
 
-  //-------------------- Simulation parameters --------------------\\
-  // Matrix & parallelism config
-  localparam int unsigned MW = 600;
-  localparam int unsigned MH = 256;
-  localparam int unsigned SIMD = 60;
-  localparam int unsigned PE = 16;
-  localparam int unsigned SEGMENTLEN = 4;
-  // Bit-width config  
-  localparam int unsigned ACTIVATION_WIDTH = 8;
-  localparam int unsigned WEIGHT_WIDTH = 4;
-  localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
-  localparam bit SIGNED_ACTIVATIONS = 1;
-  // Simulation constants  
-  localparam int unsigned NF = MH/PE;
-  localparam int unsigned SF = MW/SIMD;
-  localparam int unsigned NUM_OF_DSP = SIMD/3;
-  localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
-  localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8;
-  localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
-  localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
-  localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
-
-  // Generate clk and reset signal   
-  logic clk = 0;
-  always #5ns clk = !clk;
-  
-  logic ap_rst_n = 0;
-  initial begin
-    repeat(16) @(posedge clk);
-    ap_rst_n <= 1;
-  end
-
-  uwire ap_clk = clk;
-
-  // Generate activations  
-  typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-  typedef activation_t activation_vector_t[SF];
-    
-  function activation_vector_t init_ACTIVATIONS;
-    automatic activation_vector_t res;
-    std::randomize(res);
-    return res;
-  endfunction : init_ACTIVATIONS
-
-  activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
-
-  struct {
-    activation_t dat;
-    logic vld;
-    logic rdy;
-  } activations;
-
-  initial begin
-    activations.vld = 0;
-    activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
-    @(posedge clk iff ap_rst_n);
-
-    for (int i=0; i<SF; i++) begin
-      activations.dat <= ACTIVATIONS[i];
-      do begin 
-        activations.vld = $urandom()%7 > 1;
-        @(posedge clk);
-      end while (!(activations.vld === 1 && activations.rdy === 1));
-    end
-    
-    activations.vld <= 0;
-    activations.dat <= 'x;
-  end
-   
-  // Generate weights   
-  typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-  typedef weight_t weight_matrix_t[NF][SF]; 
-  
-  function weight_matrix_t init_WEIGHTS;
-    automatic weight_matrix_t res;
-    std::randomize(res);
-    return res;
-  endfunction : init_WEIGHTS;
-
-  weight_matrix_t WEIGHTS = init_WEIGHTS();
-
-  struct {
-    weight_t dat;
-    logic vld;
-    logic rdy;
-  } weights;
-
-  initial begin
-    weights.vld = 0;
-    weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
-    @(posedge clk iff ap_rst_n);
-
-    weights.vld <= 1;
-    for (int i=0; i<NF; i++) begin
-      for (int j=0; j<SF; j++) begin
-        weights.dat <= WEIGHTS[i][j];
-        @(posedge clk iff weights.rdy);
-      end
-    end
-
-    weights.vld <= 0;
-    weights.dat <= 'x;
-  end
-  
-  // Function to compute golden output  
-  // a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
-  // w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
-  typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
-  typedef output_t output_vector_t [NF];
-
-  struct {
-    output_t dat;
-    logic vld;
-    logic rdy;
-  } outputs;
-
-  function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
-    automatic output_vector_t res = '{default: 0};
-    for (int j = 0; j<MH; j++) begin
-      for (int i = 0; i<MW; i++) begin
-        res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-      end
-    end  
-    return res;
-  endfunction : check_output;
-
-  output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
-
-  int unsigned NF_CNT = 0;
-  initial begin
-    outputs.rdy = 0;
-    while (NF_CNT < NF) begin
-      // Loop until both rdy & vld are asserted
-      do begin
-        outputs.rdy <= $urandom()%7 >= 1;
-        @(posedge clk iff ap_rst_n);
-      end while (!(outputs.rdy === 1 && outputs.vld === 1));
-
-      // Compare produced outputs against golden outputs
-      foreach(outputs.dat[i]) begin
-        assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-        else begin 
-          $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-          $stop;
-        end  
-      end
-      
-      NF_CNT += 1;
-    end
-    
-    $finish;  
-  end
-
-  // Instantiate DUT
-  mvu_8sx9_axi #(
-      .MW(MW),
-      .MH(MH),
-      .PE(PE),
-      .SIMD(SIMD),
-      .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-      .WEIGHT_WIDTH(WEIGHT_WIDTH),
-      .ACCU_WIDTH(ACCU_WIDTH),
-      .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-      .SEGMENTLEN(SEGMENTLEN)
-    )
-    dut (
-      .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
-      .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
-      .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
-      .m_axis_output_tready(outputs.rdy)
-    );
+//-------------------- Simulation parameters --------------------\\
+	// Matrix & parallelism config
+	localparam int unsigned MW = 600;
+	localparam int unsigned MH = 256;
+	localparam int unsigned SIMD = 60;
+	localparam int unsigned PE = 16;
+	localparam int unsigned SEGMENTLEN = 4;
+	// Bit-width config  
+	localparam int unsigned ACTIVATION_WIDTH = 8;
+	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
+	localparam bit SIGNED_ACTIVATIONS = 1;
+	// Simulation constants  
+	localparam int unsigned NF = MH/PE;
+	localparam int unsigned SF = MW/SIMD;
+	localparam int unsigned NUM_OF_DSP = SIMD/3;
+	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
+	localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8;
+	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
+	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
+
+	// Generate clk and reset signal   
+	logic clk = 0;
+	always #5ns clk = !clk;
+
+	logic ap_rst_n = 0;
+	initial begin
+		repeat(16) @(posedge clk);
+		ap_rst_n <= 1;
+	end
+
+	uwire ap_clk = clk;
+
+	// Generate activations  
+	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+	typedef activation_t activation_vector_t[SF];
+
+	function activation_vector_t init_ACTIVATIONS;
+		automatic activation_vector_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_ACTIVATIONS
+
+	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
+
+	struct {
+		activation_t dat;
+		logic vld;
+		logic rdy;
+	} activations;
+
+	initial begin
+		activations.vld = 0;
+		activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
+		@(posedge clk iff ap_rst_n);
+
+		for (int i=0; i<SF; i++) begin
+			activations.dat <= ACTIVATIONS[i];
+			do begin 
+				activations.vld = $urandom()%7 > 1;
+				@(posedge clk);
+			end while (!(activations.vld === 1 && activations.rdy === 1));
+		end
+
+		activations.vld <= 0;
+		activations.dat <= 'x;
+	end
+
+	// Generate weights   
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+	typedef weight_t weight_matrix_t[NF][SF]; 
+
+	function weight_matrix_t init_WEIGHTS;
+		automatic weight_matrix_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_WEIGHTS;
+
+	weight_matrix_t WEIGHTS = init_WEIGHTS();
+
+	struct {
+		weight_t dat;
+		logic vld;
+		logic rdy;
+	} weights;
+
+	initial begin
+		weights.vld = 0;
+		weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
+		@(posedge clk iff ap_rst_n);
+
+		weights.vld <= 1;
+		for (int i=0; i<NF; i++) begin
+			for (int j=0; j<SF; j++) begin
+				weights.dat <= WEIGHTS[i][j];
+				@(posedge clk iff weights.rdy);
+			end
+		end
+
+		weights.vld <= 0;
+		weights.dat <= 'x;
+	end
+
+	// Function to compute golden output  
+	// a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
+	typedef output_t output_vector_t [NF];
+
+	struct {
+		output_t dat;
+		logic vld;
+		logic rdy;
+	} outputs;
+
+	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+		automatic output_vector_t res = '{default: 0};
+		for (int j = 0; j<MH; j++) begin
+			for (int i = 0; i<MW; i++) begin
+				res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+			end
+		end  
+		return res;
+	endfunction : check_output;
+
+	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+
+	int unsigned NF_CNT = 0;
+	initial begin
+		outputs.rdy = 0;
+		while (NF_CNT < NF) begin
+			// Loop until both rdy & vld are asserted
+			do begin
+				outputs.rdy <= $urandom()%7 >= 1;
+				@(posedge clk iff ap_rst_n);
+			end while (!(outputs.rdy === 1 && outputs.vld === 1));
+
+			// Compare produced outputs against golden outputs
+			foreach(outputs.dat[i]) begin
+				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+				else begin 
+					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+					$stop;
+				end  
+			end
+			
+			NF_CNT += 1;
+		end
+
+		$finish;  
+	end
+
+	// Instantiate DUT
+	mvu_8sx9_axi #(
+		.MW(MW),
+		.MH(MH),
+		.PE(PE),
+		.SIMD(SIMD),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.SEGMENTLEN(SEGMENTLEN)
+	)
+	dut (
+		.ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
+		.s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
+		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
+		.m_axis_output_tready(outputs.rdy)
+	);
   
 endmodule
diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
index ff3779d211..2456eb3a47 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
@@ -33,7 +33,7 @@
 
 module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	MW = $MW$,
-	parameter		MH = $MH$,
+	parameter	MH = $MH$,
 	parameter 	PE = $PE$,
 	parameter 	SIMD = $SIMD$,
 	parameter 	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
@@ -44,29 +44,29 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	RAM_STYLE = $IBUF_RAM_STYLE$,
 
 	// Safely deducible parameters
-	parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
-	parameter OUTPUT_LANES = PE,
-	parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	parameter 	INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+	parameter 	OUTPUT_LANES = PE,
+	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )(
-  // Global Control
-	input		logic  ap_clk,
-	input		logic  ap_rst_n,
+  	// Global Control
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,
 
 	// Weight Stream
-	input		logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input		logic  s_axis_weights_tvalid,
+	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	logic  s_axis_weights_tvalid,
 	output	logic  s_axis_weights_tready,
 
 	// Input Stream
-	input		logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input		logic  s_axis_input_tvalid,
+	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	logic  s_axis_input_tvalid,
 	output	logic  s_axis_input_tready,
 
 	// Output Stream
 	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
 	output	logic  m_axis_output_tvalid,
-	input		logic  m_axis_output_tready
+	input	logic  m_axis_output_tready
 );
 
 mvu_8sx9_axi #(
diff --git a/finn-rtllib/mvu/mvu_8sx9_tb.sv b/finn-rtllib/mvu/mvu_8sx9_tb.sv
index ea3ecbbd70..adf6a8f9c2 100644
--- a/finn-rtllib/mvu/mvu_8sx9_tb.sv
+++ b/finn-rtllib/mvu/mvu_8sx9_tb.sv
@@ -33,133 +33,133 @@
 
 module mvu_8sx9_tb();
 
-  //-------------------- Simulation parameters --------------------\\
-  // Matrix & parallelism config
-  localparam int unsigned MH = 256;
-  localparam int unsigned PE = 16;
-  localparam int unsigned MW = 600;
-  localparam int unsigned SIMD = 60;
-  localparam int unsigned SEGMENTLEN = 4;
-  // Bit-width config  
-  localparam int unsigned ACTIVATION_WIDTH = 8;
-  localparam int unsigned WEIGHT_WIDTH = 4;
-  localparam bit SIGNED_ACTIVATIONS = 1;
-  // Simulation constants
-  localparam int unsigned NF = MH/PE;
-  localparam int unsigned SF = MW/SIMD;
-  localparam int unsigned NUM_OF_DSP = SIMD/3;
-  
-  typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-  typedef activation_t activation_vector_t[SF];
-
-  function activation_vector_t init_ACTIVATIONS;
-    automatic activation_vector_t res;
-    std::randomize(res);
-    return res;
-  endfunction : init_ACTIVATIONS
-
-  typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-  typedef weight_t weight_matrix_t[NF][SF];
-  
-  function weight_matrix_t init_WEIGHTS;
-    automatic weight_matrix_t res;
-    std::randomize(res);
-    return res;
-  endfunction : init_WEIGHTS;
-  
-  typedef logic signed [PE-1:0][57:0] output_t;
-  typedef output_t output_vector_t [NF];
-
-  function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
-    automatic output_vector_t res = '{default: 0};
-    for (int j = 0; j<MH; j++) begin
-      for (int i = 0; i<MW; i++) begin
-        res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-      end
-    end  
-    return res;
-  endfunction : check_output;
-  
-  logic clk = 0;
-  always #5ns clk = !clk;
-  
-  logic rst;
-  initial begin
-    rst = 1;
-    repeat(16) @(posedge clk);
-    rst <= 0;
-  end
-   
-  logic last;
-  logic zero;
-  logic vld;
-  activation_t a;
-  weight_t w;
-  output_t p;
-  // Reference signals
-  activation_vector_t ACTIVATIONS; //   [SF-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0]
-  weight_matrix_t WEIGHTS; //           [NF-1:0][SF-1:0][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
-  output_vector_t GOLDEN_OUTPUT; //     [NF-1:0][PE-1:0][57:0]
-  // Counter for number of outputs (NF dimension) that are produced
-  int NF_CNT = 0;
-  
-  initial begin
-    ACTIVATIONS = init_ACTIVATIONS();
-    WEIGHTS = init_WEIGHTS();
-    GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
-    last = 0;
-    zero = 0;
-    a = 'x;
-    w = 'x;
-    
-    @(posedge clk iff !rst);
-
-    for (int j=0; j<NF; j++) begin
-      for (int i=0; i<SF; i++) begin
-        last <= (i==SF-1) ? 1 : 0;
-        a <= ACTIVATIONS[i];
-        w <= WEIGHTS[j][i];
-        @(posedge clk iff en);
-      end
-    end
-
-    last <= 0;
-    zero <= 1;  
-
-    // Continue until all NF outputs are produced & compared
-    @(posedge clk && (NF_CNT==NF));
-
-    $finish;
-  end
-
-  logic en = 0;
-  always_ff @(posedge clk) begin
-    en <= ($urandom()%7 > 1) && !rst;
-  end
-
-  // Compare computed output against golden output when vld flag is raised by DUT
-  always_ff @(posedge clk iff (vld && en)) begin
-    foreach(p[i]) begin
-      assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-      else begin 
-        $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-        $stop;
-      end  
-    end
-    NF_CNT += 1;
-  end
-
-  // Instantiate DUT
-  mvu_8sx9 #(
-      .PE(PE),
-      .SIMD(SIMD),
-      .WEIGHT_WIDTH(WEIGHT_WIDTH),
-      .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-      .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-      .SEGMENTLEN(SEGMENTLEN)
-    )
-    dut (
-      .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p
-    );
-  
+//-------------------- Simulation parameters --------------------\\
+	// Matrix & parallelism config
+	localparam int unsigned MH = 256;
+	localparam int unsigned PE = 16;
+	localparam int unsigned MW = 600;
+	localparam int unsigned SIMD = 60;
+	localparam int unsigned SEGMENTLEN = 4;
+	// Bit-width config  
+	localparam int unsigned ACTIVATION_WIDTH = 8;
+	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam bit SIGNED_ACTIVATIONS = 1;
+	// Simulation constants
+	localparam int unsigned NF = MH/PE;
+	localparam int unsigned SF = MW/SIMD;
+	localparam int unsigned NUM_OF_DSP = SIMD/3;
+
+	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+	typedef activation_t activation_vector_t[SF];
+
+	function activation_vector_t init_ACTIVATIONS;
+		automatic activation_vector_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_ACTIVATIONS
+
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+	typedef weight_t weight_matrix_t[NF][SF];
+
+	function weight_matrix_t init_WEIGHTS;
+		automatic weight_matrix_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_WEIGHTS;
+
+	typedef logic signed [PE-1:0][57:0] output_t;
+	typedef output_t output_vector_t [NF];
+
+	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+		automatic output_vector_t res = '{default: 0};
+		for (int j = 0; j<MH; j++) begin
+			for (int i = 0; i<MW; i++) begin
+				res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+			end
+		end  
+		return res;
+	endfunction : check_output;
+
+	logic clk = 0;
+	always #5ns clk = !clk;
+
+	logic rst;
+	initial begin
+		rst = 1;
+		repeat(16) @(posedge clk);
+		rst <= 0;
+	end
+
+	logic last;
+	logic zero;
+	logic vld;
+	activation_t a;
+	weight_t w;
+	output_t p;
+	// Reference signals
+	activation_vector_t ACTIVATIONS; //   [SF-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	weight_matrix_t WEIGHTS; //           [NF-1:0][SF-1:0][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+	output_vector_t GOLDEN_OUTPUT; //     [NF-1:0][PE-1:0][57:0]
+	// Counter for number of outputs (NF dimension) that are produced
+	int NF_CNT = 0;
+
+	initial begin
+		ACTIVATIONS = init_ACTIVATIONS();
+		WEIGHTS = init_WEIGHTS();
+		GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+		last = 0;
+		zero = 0;
+		a = 'x;
+		w = 'x;
+
+		@(posedge clk iff !rst);
+
+		for (int j=0; j<NF; j++) begin
+			for (int i=0; i<SF; i++) begin
+				last <= (i==SF-1) ? 1 : 0;
+				a <= ACTIVATIONS[i];
+				w <= WEIGHTS[j][i];
+				@(posedge clk iff en);
+			end
+		end
+
+		last <= 0;
+		zero <= 1;  
+
+		// Continue until all NF outputs are produced & compared
+		@(posedge clk && (NF_CNT==NF));
+
+		$finish;
+	end
+
+	logic en = 0;
+	always_ff @(posedge clk) begin
+		en <= ($urandom()%7 > 1) && !rst;
+	end
+
+	// Compare computed output against golden output when vld flag is raised by DUT
+	always_ff @(posedge clk iff (vld && en)) begin
+		foreach(p[i]) begin
+			assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+			else begin 
+				$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+				$stop;
+			end  
+		end
+		NF_CNT += 1;
+	end
+
+	// Instantiate DUT
+	mvu_8sx9 #(
+		.PE(PE),
+		.SIMD(SIMD),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.SEGMENTLEN(SEGMENTLEN)
+	)
+	dut (
+		.clk, .rst, .en, .last, .zero, .a, .w, .vld, .p
+	);
+
 endmodule

From 5e61f42afd991233153ee8b7fe0fb6e9e8ac562d Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 12 Apr 2023 08:54:45 +0100
Subject: [PATCH 009/112] [rtl custom op]: fix to indentation

---
 finn-rtllib/mvu/mvu_8sx9_axi.sv | 54 ++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv
index 6c7eaeaeca..5f215927d8 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi.sv
+++ b/finn-rtllib/mvu/mvu_8sx9_axi.sv
@@ -32,25 +32,25 @@
  *****************************************************************************/
 
 module mvu_8sx9_axi #(
-    int unsigned MW,
-    int unsigned MH,
-    int unsigned PE,
-    int unsigned SIMD,
-    int unsigned ACTIVATION_WIDTH,
-    int unsigned WEIGHT_WIDTH,
-    int unsigned ACCU_WIDTH,
-    bit SIGNED_ACTIVATIONS = 0,
-    int unsigned SEGMENTLEN = 0,
+	int unsigned MW,
+	int unsigned MH,
+	int unsigned PE,
+	int unsigned SIMD,
+	int unsigned ACTIVATION_WIDTH,
+	int unsigned WEIGHT_WIDTH,
+	int unsigned ACCU_WIDTH,
+	bit SIGNED_ACTIVATIONS = 0,
+	int unsigned SEGMENTLEN = 0,
 	parameter RAM_STYLE = "auto",
 
-    localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-    localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
 	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
 	localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
-    localparam int unsigned SF = MW/SIMD,
+	localparam int unsigned SF = MW/SIMD,
 	localparam int unsigned NF = MH/PE,
-    localparam int unsigned OUTPUT_LANES = PE,
-    localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+	localparam int unsigned OUTPUT_LANES = PE,
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )
 (
 	// Global Control
@@ -76,31 +76,31 @@ module mvu_8sx9_axi #(
 //-------------------- Parameter sanity checks --------------------\\
 	initial begin
 		if (MW % SIMD != 0) begin
-		$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
-		$finish;
+			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
+			$finish;
 		end
 		if (MH % PE != 0) begin
-		$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
-		$finish;
+			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
+			$finish;
 		end
 		if (ACTIVATION_WIDTH > 9) begin
-		$error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
-		$finish;
+			$error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
+			$finish;
 		end
 		if (WEIGHT_WIDTH > 8) begin
-		$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
-		$finish;
+			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
+			$finish;
 		end
 		if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin
-		$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
-		$finish;
+			$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
+			$finish;
 		end
 		if (SEGMENTLEN == 0) begin
-		$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
+			$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
 		end
 		if (SEGMENTLEN > (SIMD+2)/3) begin
-		$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
-		$finish;
+			$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+			$finish;
 		end
 	end
 

From cbee193d746763044a870bdf1af248bbe8d31156 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 12 Apr 2023 14:33:13 +0100
Subject: [PATCH 010/112] [rtl custom-op]: minor changes for compiler
 integration

---
 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
index 2456eb3a47..502a72d3f2 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
@@ -41,7 +41,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	ACCU_WIDTH = $ACCU_WIDTH$,
 	parameter 	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
 	parameter 	SEGMENTLEN = $SEGMENTLEN$,
-	parameter 	RAM_STYLE = $IBUF_RAM_STYLE$,
+	parameter 	RAM_STYLE = "$IBUF_RAM_STYLE$",
 
 	// Safely deducible parameters
 	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
@@ -85,6 +85,6 @@ mvu_8sx9_axi #(
 	.m_axis_output_tdata(m_axis_output_tdata),
 	.m_axis_output_tvalid(m_axis_output_tvalid),
 	.m_axis_output_tready(m_axis_output_tready)
-)
+);
 
-endmodule : mvau_8sx9_axi_wrapper
\ No newline at end of file
+endmodule : $MODULE_NAME_AXI_WRAPPER$
\ No newline at end of file

From ba5e77bde008fff2a445d6ef469072dd67f67f42 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 12 Apr 2023 23:26:05 +0100
Subject: [PATCH 011/112] [rtl custom op]: moved testbenches to separate
 directory

---
 finn-rtllib/mvu/tb/mvu_8sx9_tb.sv | 165 +++++++++++++++++++++++
 finn-rtllib/mvu/tb/mvu_axi_tb.sv  | 213 ++++++++++++++++++++++++++++++
 2 files changed, 378 insertions(+)
 create mode 100644 finn-rtllib/mvu/tb/mvu_8sx9_tb.sv
 create mode 100644 finn-rtllib/mvu/tb/mvu_axi_tb.sv

diff --git a/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv
new file mode 100644
index 0000000000..c8bfe5370a
--- /dev/null
+++ b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv
@@ -0,0 +1,165 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for MVU core compute kernel.
+ *****************************************************************************/
+
+module mvu_8sx9_tb(); // Self-checking testbench: feeds random activations/weights to mvu_8sx9 and compares p with a golden model
+
+//-------------------- Simulation parameters --------------------\\
+	// Matrix & parallelism config
+	localparam int unsigned MH = 256; // Matrix height: number of dot products computed by the golden model
+	localparam int unsigned PE = 16; // Lanes of the weight and output vectors (outer packed dimension of weight_t/output_t)
+	localparam int unsigned MW = 600; // Matrix width: number of terms per dot product
+	localparam int unsigned SIMD = 60; // Elements per activation beat (outer packed dimension of activation_t)
+	localparam int unsigned SEGMENTLEN = 4; // Forwarded unchanged to the DUT parameter of the same name
+	// Bit-width config  
+	localparam int unsigned ACTIVATION_WIDTH = 8;
+	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam bit SIGNED_ACTIVATIONS = 1; // NOTE(review): check_output() always applies $signed to activations, independent of this flag
+	// Simulation constants
+	localparam int unsigned NF = MH/PE; // Number of PE-wide output vectors per matrix
+	localparam int unsigned SF = MW/SIMD; // Number of SIMD-wide input beats per output vector
+	localparam int unsigned NUM_OF_DSP = SIMD/3; // Not referenced anywhere else in this testbench
+
+	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; // One input beat
+	typedef activation_t activation_vector_t[SF]; // Complete input vector: SF beats
+
+	function activation_vector_t init_ACTIVATIONS; // Returns a randomized activation vector
+		automatic activation_vector_t res;
+		std::randomize(res); // Status returned by randomize is not checked
+		return res;
+	endfunction : init_ACTIVATIONS
+
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; // One weight tile: PE rows x SIMD columns
+	typedef weight_t weight_matrix_t[NF][SF]; // Complete matrix as NF x SF tiles
+
+	function weight_matrix_t init_WEIGHTS; // Returns a randomized weight matrix
+		automatic weight_matrix_t res;
+		std::randomize(res); // Status returned by randomize is not checked
+		return res;
+	endfunction : init_WEIGHTS;
+
+	typedef logic signed [PE-1:0][57:0] output_t; // PE lanes of 58 bits each, the type connected to DUT port p
+	typedef output_t output_vector_t [NF]; // All NF expected output vectors
+
+	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); // Golden model: signed matrix-vector product
+		automatic output_vector_t res = '{default: 0};
+		for (int j = 0; j<MH; j++) begin // Row j maps to output vector j/PE, lane j%PE
+			for (int i = 0; i<MW; i++) begin // Column i maps to input beat i/SIMD, element i%SIMD
+				res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+			end
+		end  
+		return res;
+	endfunction : check_output;
+
+	logic clk = 0;
+	always #5ns clk = !clk; // Toggles every 5 ns, i.e. a 10 ns clock period
+
+	logic rst; // Active-high reset
+	initial begin
+		rst = 1;
+		repeat(16) @(posedge clk); // Keep reset asserted for 16 rising clock edges
+		rst <= 0;
+	end
+
+	logic last; // Driven high together with the final beat (i == SF-1) of each output vector
+	logic zero; // Driven high once all stimulus beats have been issued
+	logic vld; // DUT output, gates the comparison against the golden output
+	activation_t a;
+	weight_t w;
+	output_t p;
+	// Reference signals
+	activation_vector_t ACTIVATIONS; //   [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	weight_matrix_t WEIGHTS; //           [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+	output_vector_t GOLDEN_OUTPUT; //     [NF][PE-1:0][57:0]
+	// Counter for number of outputs (NF dimension) that are produced
+	int NF_CNT = 0;
+
+	initial begin // Stimulus process
+		ACTIVATIONS = init_ACTIVATIONS();
+		WEIGHTS = init_WEIGHTS();
+		GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS); // Expected results are computed once, before any stimulus
+		last = 0;
+		zero = 0;
+		a = 'x;
+		w = 'x;
+
+		@(posedge clk iff !rst); // Wait for the first rising edge at which reset is deasserted
+
+		for (int j=0; j<NF; j++) begin // One pass over the activations per output vector
+			for (int i=0; i<SF; i++) begin
+				last <= (i==SF-1) ? 1 : 0;
+				a <= ACTIVATIONS[i];
+				w <= WEIGHTS[j][i];
+				@(posedge clk iff en); // Hold this beat until a rising edge at which en is high
+			end
+		end
+
+		last <= 0;
+		zero <= 1;  // NOTE(review): presumably lets the DUT drain with neutral input; confirm against mvu_8sx9
+
+		// Continue until all NF outputs are produced & compared
+		@(posedge clk && (NF_CNT==NF)); // NOTE(review): rising edge of the whole expression, not "posedge clk iff (...)"; confirm intent
+
+		$finish;
+	end
+
+	logic en = 0; // Randomized enable, redrawn on every rising clock edge
+	always_ff @(posedge clk) begin
+		en <= ($urandom()%7 > 1) && !rst; // High for remainders 2..6 of 7 and never while rst is asserted
+	end
+
+	// Compare computed output against golden output when vld flag is raised by DUT
+	always_ff @(posedge clk iff (vld && en)) begin // Checked only on edges where both vld and en are high
+		foreach(p[i]) begin
+			assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+			else begin 
+				$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+				$stop;
+			end  
+		end
+		NF_CNT += 1; // Blocking update inside always_ff; the stimulus process waits on this counter reaching NF
+	end
+
+	// Instantiate DUT
+	mvu_8sx9 #(
+		.PE(PE),
+		.SIMD(SIMD),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.SEGMENTLEN(SEGMENTLEN)
+	)
+	dut (
+		.clk, .rst, .en, .last, .zero, .a, .w, .vld, .p // Implicit named connections to the same-named testbench signals
+	);
+
+endmodule : mvu_8sx9_tb
diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
new file mode 100644
index 0000000000..08a349da84
--- /dev/null
+++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
@@ -0,0 +1,213 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for MVU AXI-lite interface wrapper.
+ *****************************************************************************/
+
+module mvu_axi_tb(); // Self-checking testbench: streams random weights/activations into mvu_axi and compares against a golden model
+
+//-------------------- Simulation parameters --------------------\\
+	// Matrix & parallelism config
+	localparam int unsigned MW = 90; // matrix width; mvu_axi requires a multiple of SIMD
+	localparam int unsigned MH = 16; // matrix height; mvu_axi requires a multiple of PE
+	localparam int unsigned SIMD = 9; // activation elements per input beat
+	localparam int unsigned PE = 4; // output lanes per output beat
+	localparam int unsigned SEGMENTLEN = 1;
+	localparam string MVU_IMPL_STYLE = "mvu_8sx9"; // compute core selected inside mvu_axi
+	// Bit-width config
+	localparam int unsigned ACTIVATION_WIDTH = 8;
+	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); // product width plus growth over MW accumulated terms
+	localparam bit SIGNED_ACTIVATIONS = 1;
+	// Simulation constants
+	localparam int unsigned NF = MH/PE; // output beats per matrix-vector product
+	localparam int unsigned SF = MW/SIMD; // input beats per activation vector
+	localparam int unsigned NUM_OF_DSP = SIMD/3; // not referenced elsewhere in this testbench
+	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; // weight beat width rounded up to whole bytes
+	localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; // activation beat width rounded up to whole bytes
+	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; // padding bits on the weight stream
+	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; // padding bits on the input stream
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; // not referenced elsewhere in this testbench
+
+	// Generate clk and reset signal
+	logic clk = 0;
+	always #5ns clk = !clk; // toggles every 5 ns, i.e. a 10 ns clock period
+
+	logic ap_rst_n = 0;
+	initial begin
+		repeat(16) @(posedge clk); // keep the active-low reset asserted for 16 rising edges
+		ap_rst_n <= 1;
+	end
+
+	uwire ap_clk = clk;
+
+	// Generate activations: SF beats of SIMD random elements each
+	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+	typedef activation_t activation_vector_t[SF];
+
+	function activation_vector_t init_ACTIVATIONS;
+		automatic activation_vector_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_ACTIVATIONS
+
+	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
+
+	struct {
+		activation_t dat;
+		logic vld;
+		logic rdy;
+	} activations;
+
+	initial begin
+		activations.vld = 0;
+		activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
+		@(posedge clk iff ap_rst_n);
+
+		for (int i=0; i<SF; i++) begin
+			activations.dat <= ACTIVATIONS[i];
+			do begin
+				activations.vld <= $urandom()%7 > 1; // nonblocking (like the other drivers) so the DUT cannot see the new value within the current clock edge
+				@(posedge clk);
+			end while (!(activations.vld === 1 && activations.rdy === 1)); // repeat until this beat was actually transferred
+		end
+
+		activations.vld <= 0;
+		activations.dat <= 'x;
+	end
+
+	// Generate weights: NF x SF beats of PE x SIMD random elements each
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+	typedef weight_t weight_matrix_t[NF][SF]; 
+
+	function weight_matrix_t init_WEIGHTS;
+		automatic weight_matrix_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_WEIGHTS;
+
+	weight_matrix_t WEIGHTS = init_WEIGHTS();
+
+	struct {
+		weight_t dat;
+		logic vld;
+		logic rdy;
+	} weights;
+
+	initial begin
+		weights.vld = 0;
+		weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
+		@(posedge clk iff ap_rst_n);
+
+		weights.vld <= 1; // weights are offered continuously; only the DUT's ready paces this stream
+		for (int i=0; i<NF; i++) begin
+			for (int j=0; j<SF; j++) begin
+				weights.dat <= WEIGHTS[i][j];
+				@(posedge clk iff weights.rdy); // advance once the DUT accepted the current beat
+			end
+		end
+
+		weights.vld <= 0;
+		weights.dat <= 'x;
+	end
+
+	// Function to compute golden output (reference matrix-vector product)
+	// a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
+	typedef output_t output_vector_t [NF];
+
+	struct {
+		output_t dat;
+		logic vld;
+		logic rdy;
+	} outputs;
+
+	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+		automatic output_vector_t res = '{default: 0};
+		for (int j = 0; j<MH; j++) begin // j: matrix row, stored in beat j/PE, lane j%PE
+			for (int i = 0; i<MW; i++) begin // i: matrix column, stored in beat i/SIMD, element i%SIMD
+				if (SIGNED_ACTIVATIONS==1) 
+					res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+				else // prepend a zero bit so an unsigned activation is not reinterpreted as negative
+					res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+			end
+		end  
+		return res;
+	endfunction : check_output;
+
+	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+
+	int unsigned NF_CNT = 0; // number of output beats checked so far
+	initial begin
+		outputs.rdy = 0;
+		while (NF_CNT < NF) begin
+			// Loop until both rdy & vld are asserted
+			do begin
+				outputs.rdy <= $urandom()%7 >= 1; // randomized backpressure on the output stream
+				@(posedge clk iff ap_rst_n);
+			end while (!(outputs.rdy === 1 && outputs.vld === 1));
+
+			// Compare produced outputs against golden outputs
+			foreach(outputs.dat[i]) begin
+				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+				else begin 
+					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+					$stop;
+				end  
+			end
+			
+			NF_CNT += 1;
+		end
+
+		$finish; // all NF output beats were received and compared
+	end
+
+	// Instantiate DUT; stream data is zero-padded up to the byte-aligned port widths
+	mvu_axi #(
+		.MW(MW),
+		.MH(MH),
+		.PE(PE),
+		.SIMD(SIMD),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.SEGMENTLEN(SEGMENTLEN),
+		.MVU_IMPL_STYLE(MVU_IMPL_STYLE)
+	)
+	dut (
+		.ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
+		.s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
+		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
+		.m_axis_output_tready(outputs.rdy)
+	);
+  
+endmodule : mvu_axi_tb

From 69310b4e6d2ee4bf2e60b236582656fd7f364a6d Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 12 Apr 2023 23:27:50 +0100
Subject: [PATCH 012/112] [rtl custom op]: fixed output width to ACCU_WIDTH

---
 finn-rtllib/mvu/mvu_8sx9.sv | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index d082d4fb2e..5af27ab0ce 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -36,19 +36,25 @@ module mvu_8sx9 #(
     int unsigned SIMD,
     int unsigned ACTIVATION_WIDTH,
     int unsigned WEIGHT_WIDTH,
+	int unsigned ACCU_WIDTH,
     bit SIGNED_ACTIVATIONS = 0,
     int unsigned SEGMENTLEN = 0 // Default to 0 (which implies a single segment)
   )
   (
-    input   logic clk,
+    // Global Control
+	input   logic clk,
     input   logic rst,
     input   logic en,
+
+	// Input
     input   logic last,
-    input   logic zero,
-    input   logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a,
-    input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w,
-    output  logic vld,
-    output  logic [PE-1:0][57:0] p 
+    input   logic zero, // ignore current inputs and force this partial product to zero
+    input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights
+	input   logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // activations
+    
+	// Output
+	output  logic vld,
+    output  logic [PE-1:0][ACCU_WIDTH-1:0] p
   );
 
 //-------------------- Declare global signals --------------------\\
@@ -146,7 +152,7 @@ module mvu_8sx9 #(
 			uwire [57:0] pp;
 
 			if (LAST) begin : genPOUT
-				assign p[j] = pp;
+				assign p[j] = pp[ACCU_WIDTH-1:0];
 			end      
 
 			DSP58 #(
@@ -281,4 +287,4 @@ module mvu_8sx9 #(
 		end : genDSPChain  
 	end : genDSPPE
     
-endmodule
+endmodule : mvu_8sx9

From cfcff0040c85a76d7c5a16b2bf1b6b966b62e87d Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 12 Apr 2023 23:29:06 +0100
Subject: [PATCH 013/112] [rtl custom op]: renamed file and added generic to
 switch between compute kernels

---
 finn-rtllib/mvu/mvu_axi.sv | 194 +++++++++++++++++++++++++++++++++++++
 1 file changed, 194 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_axi.sv

diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
new file mode 100644
index 0000000000..5d8700738f
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_axi.sv
@@ -0,0 +1,194 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Matrix Vector Unit (MVU) AXI-lite interface wrapper.
+ *****************************************************************************/
+
+module mvu_axi #( // Stream wrapper around an MVU compute core: activation replay, handshake control, output register slice
+	int unsigned MW, // matrix width; must be a multiple of SIMD (checked below)
+	int unsigned MH, // matrix height; must be a multiple of PE (checked below)
+	int unsigned PE,
+	int unsigned SIMD,
+	int unsigned ACTIVATION_WIDTH, // at most 9 bits, 8 bits when unsigned (checked below)
+	int unsigned WEIGHT_WIDTH, // at most 8 bits (checked below)
+	int unsigned ACCU_WIDTH, // width of each of the PE output lanes
+	bit SIGNED_ACTIVATIONS = 0,
+	int unsigned SEGMENTLEN = 0, // 0 defaults to the chain length (see the warning below)
+	parameter RAM_STYLE = "auto", // forwarded to the activation replay buffer
+	parameter MVU_IMPL_STYLE, // "mvu_8sx9" or "mvu_4sx4u"; has no default, so the instantiator must set it
+
+	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, // rounded up to whole bytes
+	localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, // rounded up to whole bytes
+	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, // payload bits without padding
+	localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH, // payload bits without padding
+	localparam int unsigned SF = MW/SIMD,
+	localparam int unsigned NF = MH/PE,
+	localparam int unsigned OUTPUT_LANES = PE,
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 // rounded up to whole bytes
+)
+(
+	// Global Control
+	input	logic  ap_clk,
+	input	logic  ap_rst_n, // active-low; inverted into the internal `rst`
+
+	// Weight Stream
+	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	logic  s_axis_weights_tvalid,
+	output	logic  s_axis_weights_tready,
+
+	// Input Stream
+	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	logic  s_axis_input_tvalid,
+	output	logic  s_axis_input_tready,
+
+	// Output Stream
+	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
+	output	logic  m_axis_output_tvalid,
+	input	logic  m_axis_output_tready
+);
+
+//-------------------- Parameter sanity checks --------------------\\
+	initial begin
+		if (MW % SIMD != 0) begin
+			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
+			$finish;
+		end
+		if (MH % PE != 0) begin
+			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
+			$finish;
+		end
+		if (ACTIVATION_WIDTH > 9) begin
+			$error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
+			$finish;
+		end
+		if (WEIGHT_WIDTH > 8) begin
+			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
+			$finish;
+		end
+		if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin
+			$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
+			$finish;
+		end
+		if (SEGMENTLEN == 0) begin
+			$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
+		end
+		if (SEGMENTLEN > (SIMD+2)/3) begin
+			$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+			$finish;
+		end
+	end
+
+	uwire clk = ap_clk;
+	uwire rst = !ap_rst_n;
+
+	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;
+
+	uwire mvauin_t amvau; // activation beat presented to the core
+	uwire alast; // forwarded to the core as `last`
+	uwire afin; // not used further in this module
+	uwire avld;
+	uwire ardy;
+
+	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay ( // presumably replays each SF-beat vector NF times - confirm against replay_buffer
+		.clk, .rst,
+		.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), // the cast drops the byte-alignment padding bits
+		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
+	);
+
+//-------------------- Input control --------------------\\
+	uwire en;
+	uwire istb = avld && s_axis_weights_tvalid; // both operands are available this cycle
+	assign ardy = en && s_axis_weights_tvalid; // consume an activation only together with a weight beat
+	assign s_axis_weights_tready = en && avld; // consume a weight beat only together with an activation
+
+//-------------------- Core MVU --------------------\\
+	uwire ovld;
+	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
+	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
+	
+	if (MVU_IMPL_STYLE == "mvu_8sx9") begin : genMVU8sx9
+		mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core (
+			.clk, .rst, .en,
+			.last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), // zero is asserted whenever an operand is missing
+			.vld(ovld), .p(odat)
+		);
+	end
+	else if (MVU_IMPL_STYLE == "mvu_4sx4u") begin : genMVU4sx4u
+		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(0)) core (
+			.clk, .rst, .en,
+			.last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), // zero is asserted whenever an operand is missing
+			.vld(ovld), .p(odat)
+		);
+	end
+	//else begin
+	//	$error("Unrecognized MVU_IMPL_STYLE!");
+	//	$finish;
+	//end
+
+//-------------------- Output register slice --------------------\\
+	struct {
+		logic vld;
+		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+	} A = '{ vld: 0, default: 'x}; // first stage: captures the core output
+
+	assign en = !A.vld || !ovld; // the core is stalled only while A is occupied and the core presents another valid output
+
+	uwire  b_load;
+	always_ff @(posedge clk) begin
+		if(rst)		A <= '{ vld: 0, default: 'x };
+		else if(!A.vld || b_load) begin // A takes a new entry when it is empty or its content is being moved into B
+			A.vld <= ovld && en;
+			for(int unsigned  i = 0; i < PE; i++) begin
+				// CR-1148862:
+				// A.dat[i] <= odat[i];
+				automatic logic [ACCU_WIDTH-1:0]  v = odat[i];
+				A.dat[i] <= v[ACCU_WIDTH-1:0];
+			end
+		end
+	end
+	
+	struct {
+		logic vld;
+		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+	} B = '{ vld: 0, default: 'x}; // second stage: drives the output stream
+
+	assign	b_load = !B.vld || m_axis_output_tready; // B can be (re)loaded when empty or when its beat is being accepted
+	always_ff @(posedge clk) begin
+		if(rst)		B <= '{ vld: 0, default: 'x }; // vld must reset to 0 (as for A) so that tvalid is deasserted, not unknown, after reset
+		else begin
+			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
+		end
+	end
+
+	assign	m_axis_output_tvalid = B.vld;
+	assign	m_axis_output_tdata  = B.dat;
+
+endmodule : mvu_axi
\ No newline at end of file

From 72b519691369b9ebc31983a6723485860837e37b Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 12 Apr 2023 23:29:45 +0100
Subject: [PATCH 014/112] [rtl custom op]: renamed file and added generic to
 switch between compute kernels

---
 finn-rtllib/mvu/mvu_axi_wrapper.v | 90 +++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_axi_wrapper.v

diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v
new file mode 100644
index 0000000000..323d2711e4
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_axi_wrapper.v
@@ -0,0 +1,90 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Verilog AXI-lite wrapper for MVU.
+ *****************************************************************************/
+
+module $MODULE_NAME_AXI_WRAPPER$ #( // Thin wrapper that forwards all parameters and ports to mvu_axi; $NAME$ tokens are presumably substituted by a generator before compilation - confirm
+	parameter 	MW = $MW$,
+	parameter	MH = $MH$,
+	parameter 	PE = $PE$,
+	parameter 	SIMD = $SIMD$,
+	parameter 	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
+	parameter 	WEIGHT_WIDTH = $WEIGHT_WIDTH$,
+	parameter 	ACCU_WIDTH = $ACCU_WIDTH$,
+	parameter 	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
+	parameter 	SEGMENTLEN = $SEGMENTLEN$,
+	parameter 	RAM_STYLE = "$IBUF_RAM_STYLE$",
+
+	// Safely deducible parameters (stream widths are rounded up to whole bytes)
+	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	parameter 	INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+	parameter 	OUTPUT_LANES = PE,
+	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+)(
+  	// Global Control (NOTE(review): `logic` is a SystemVerilog keyword - confirm the flow compiles this .v file as SystemVerilog)
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,
+
+	// Weight Stream
+	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	logic  s_axis_weights_tvalid,
+	output	logic  s_axis_weights_tready,
+
+	// Input Stream
+	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	logic  s_axis_input_tvalid,
+	output	logic  s_axis_input_tready,
+
+	// Output Stream
+	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
+	output	logic  m_axis_output_tvalid,
+	input	logic  m_axis_output_tready
+);
+
+mvu_axi #(
+	.MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+	.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+	.SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE) // NOTE(review): mvu_axi declares MVU_IMPL_STYLE without a default and it is not overridden here - confirm
+	) inst (
+	.ap_clk(ap_clk),
+	.ap_rst_n(ap_rst_n),
+	.s_axis_weights_tdata(s_axis_weights_tdata),
+	.s_axis_weights_tvalid(s_axis_weights_tvalid),
+	.s_axis_weights_tready(s_axis_weights_tready),
+	.s_axis_input_tdata(s_axis_input_tdata),
+	.s_axis_input_tvalid(s_axis_input_tvalid),
+	.s_axis_input_tready(s_axis_input_tready),
+	.m_axis_output_tdata(m_axis_output_tdata),
+	.m_axis_output_tvalid(m_axis_output_tvalid),
+	.m_axis_output_tready(m_axis_output_tready)
+);
+
+endmodule : $MODULE_NAME_AXI_WRAPPER$
\ No newline at end of file

From c068bb65c6a4b877876c5b1278e7b2663b81d8e1 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:15:16 +0100
Subject: [PATCH 015/112] [rtl mvu]: added behavioral model DSP58

---
 finn-rtllib/mvu/mvu_8sx9.sv | 343 ++++++++++++++++++++++--------------
 1 file changed, 212 insertions(+), 131 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index 5af27ab0ce..2d1da26efb 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -38,7 +38,8 @@ module mvu_8sx9 #(
     int unsigned WEIGHT_WIDTH,
 	int unsigned ACCU_WIDTH,
     bit SIGNED_ACTIVATIONS = 0,
-    int unsigned SEGMENTLEN = 0 // Default to 0 (which implies a single segment)
+    int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
+	bit FORCE_BEHAVIORAL = 0
   )
   (
     // Global Control
@@ -70,7 +71,10 @@ module mvu_8sx9 #(
 
 	always_ff @(posedge clk) begin
 		if(rst)     L <= '{default: 0};
-		else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last };
+		else if(en) begin
+			L[1+MAX_PIPELINE_STAGES] <= last;
+			L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES];
+		end
 	end  
 	assign vld = L[0];
 
@@ -155,135 +159,212 @@ module mvu_8sx9 #(
 				assign p[j] = pp[ACCU_WIDTH-1:0];
 			end      
 
-			DSP58 #(
-				// Feature Control Attributes: Data Path Selection
-				.AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
-				.A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
-				.BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
-				.B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
-				.DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
-													// legacy mode.
-				.PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
-				.RND(58'h000000000000000),          // Rounding Constant
-				.USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
-				.USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
-				.USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
-				.XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
-				// Pattern Detector Attributes: Pattern Detection Configuration
-				.AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
-				.AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
-				.MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
-				.PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
-				.SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
-				.SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
-				.USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
-				// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
-				.IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
-				.IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
-				.IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
-				.IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
-				.IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
-				.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 
-									FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN 
-									2'b01, // Y : M
-									2'b01  // X: M
-				}), // Optional inversion for OPMODE
-				.IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
-				.IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
-				.IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
-				.IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
-				.IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for STCONJUGATE_A
-				.IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
-				.IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
-				.IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
-				.IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
-				.IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
-				// Register Control Attributes: Pipeline Register Configuration
-				.ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
-				.ADREG(0),                          // Pipeline stages for pre-adder (0-1)
-				.ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
-				.AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
-				.BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
-				.BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
-				.CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
-				.CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
-				.CREG(0),                           // Pipeline stages for C (0-1)
-				.DREG(0),                           // Pipeline stages for D (0-1)
-				.INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
-				.MREG(1),                           // Multiplier pipeline stages (0-1)
-				.OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
-				.PREG(PREG),                        // Number of pipeline stages for P (0-1)
-				.RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
-			)
-			DSP58_inst (
-				// Cascade outputs: Cascade Ports
-				.ACOUT(),                           // 34-bit output: A port cascade
-				.BCOUT(),                           // 24-bit output: B cascade
-				.CARRYCASCOUT(),                    // 1-bit output: Cascade carry
-				.MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
-				.PCOUT(pcout[j][i]),                // 58-bit output: Cascade output
-				// Control outputs: Control Inputs/Status Bits
-				.OVERFLOW(),                        // 1-bit output: Overflow in add/acc
-				.PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
-				.PATTERNDETECT(),                   // 1-bit output: Pattern detect
-				.UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
-				// Data outputs: Data Ports
-				.CARRYOUT(),                        // 4-bit output: Carry
-				.P(pp),                             // 58-bit output: Primary data
-				.XOROUT(),                          // 8-bit output: XOR data
-				// Cascade inputs: Cascade Ports
-				.ACIN('x),                          // 34-bit input: A cascade data
-				.BCIN('x),                          // 24-bit input: B cascade
-				.CARRYCASCIN('x),                   // 1-bit input: Cascade carry
-				.MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
-				.PCIN(FIRST ? 'x : pcout[j][i-1]),  // 58-bit input: P cascade
-				// Control inputs: Control Inputs/Status Bits
-				.ALUMODE(4'h0),                     // 4-bit input: ALU control
-				.CARRYINSEL('0),                    // 3-bit input: Carry select
-				.CLK(clk),                          // 1-bit input: Clock
-				.INMODE({
-						INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
-						2'b00,
-						TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
-						INTERNAL_PREGS==2 ? 1'b0 : 1'b1        
-				}),                                 // 5-bit input: INMODE control
-				.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
-				.OPMODE({
-						LAST ? {1'b0, L[1]} : 2'b00,
-						7'b000_0000
-				}), // 9-bit input: Operation mode
-				// Data inputs: Data Ports
-				.A({ 7'bx, a_in_i[i] }),            // 34-bit input: A data
-				.B(b_in_i[j][i]),                   // 24-bit input: B data
-				.C('x),                             // 58-bit input: C data
-				.CARRYIN('0),                       // 1-bit input: Carry-in
-				.D('x),                             // 27-bit input: D data
-				// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
-				.ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
-				.CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
-				.CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
-				.CEAD('0),                          // 1-bit input: Clock enable for ADREG
-				.CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
-				.CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
-				.CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
-				.CEC('0),                           // 1-bit input: Clock enable for CREG
-				.CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
-				.CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
-				.CED('0),                           // 1-bit input: Clock enable for DREG
-				.CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
-				.CEM(en),                           // 1-bit input: Clock enable for MREG
-				.CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
-				.RSTA(rst),                         // 1-bit input: Reset for AREG
-				.RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
-				.RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
-				.RSTB(rst),                         // 1-bit input: Reset for BREG
-				.RSTC('0),                          // 1-bit input: Reset for CREG
-				.RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
-				.RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
-				.RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
-				.RSTM(rst),                         // 1-bit input: Reset for MREG
-				.RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
-			);
+			// Note: Since the product B * AD is computed,
+			//       rst can be only applied to AD and zero only to B
+			//       with the same effect as zeroing both.
+			if (FORCE_BEHAVIORAL) begin : genBehav
+				// Stage #1: Input A/B
+				logic signed [33:0] Areg [INTERNAL_PREGS];
+				always_ff @(posedge clk) begin
+					if (rst)	Areg <= '{ default : 0};
+					else if (en) begin
+						Areg[0] <= { 7'bx, a_in_i[i] };
+						if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0];
+					end
+				end
+				logic signed [23:0] Breg [INTERNAL_PREGS];
+				always_ff @(posedge clk) begin
+					if (rst)	Breg <= '{ default : 0};
+					else if (en) begin
+						Breg[0] <= b_in_i[j][i];
+						if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0];
+					end
+				end
+
+				// Stage #2: Multiply-Accumulate
+				logic signed [57:0] Mreg;
+				logic InmodeZero = 0;
+				always_ff @(posedge clk) begin
+					if (rst)		InmodeZero <= 0;
+					else if (en)	InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero );
+				end
+				always_ff @(posedge clk) begin
+					if (rst)	Mreg <= 0;
+					else if (en) begin
+						automatic logic signed [57:0] m = 0;
+						for (int k = 0; k < 3; k++) begin
+							m = m + (InmodeZero ? 0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8]));
+						end
+						Mreg <= m;
+					end
+				end
+
+				// Stage #3: Accumulate
+				logic signed [57:0] Preg;
+				logic Opmode = 0;
+				if (FIRST && !LAST) begin : genFirst
+					if (PREG) begin : genPregBehav
+						always_ff @(posedge clk) begin
+							if (rst)		Preg <= 0;
+							else if (en)	Preg <= Mreg;
+						end
+					end
+					else	assign Preg = Mreg;
+				end
+				else if (LAST) begin : genLast
+					always_ff @(posedge clk) begin
+						if (rst)		Opmode <= 0;
+						else if (en)	Opmode <= L[1];
+					end
+					always_ff @(posedge clk) begin
+						if (rst) 		Preg <= 0;
+						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg + (FIRST ? 0 : pcout[j][i-1]); // FIRST && LAST (single-DSP chain): no upstream cascade, add 0 like the DSP58 path (Z = 0 for FIRST)
+					end
+				end
+				else begin : genMid
+					if (PREG) begin : genPregBehav
+						always_ff @(posedge clk) begin
+							if (rst)		Preg <= 0;
+							else if (en)	Preg <= Mreg + pcout[j][i-1];
+						end
+					end
+					else	assign Preg = Mreg + pcout[j][i-1];
+				end
+				assign pp = Preg;
+				assign pcout[j][i] = pp;
+			end : genBehav
+
+			else begin: genDSP
+				DSP58 #(
+					// Feature Control Attributes: Data Path Selection
+					.AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
+					.A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+					.BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
+					.B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+					.DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
+														// legacy mode.
+					.PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
+					.RND(58'h000000000000000),          // Rounding Constant
+					.USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+					.USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
+					.USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
+					.XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
+					// Pattern Detector Attributes: Pattern Detection Configuration
+					.AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+					.AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
+					.MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
+					.PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
+					.SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+					.SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
+					.USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
+					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+					.IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
+					.IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
+					.IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
+					.IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
+					.IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
+					.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 
+										FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN 
+										2'b01, // Y : M
+										2'b01  // X: M
+					}), // Optional inversion for OPMODE
+					.IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
+					.IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
+					.IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
+					.IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
+					.IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for RSTCTRL
+					.IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
+					.IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
+					.IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
+					.IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
+					.IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
+					// Register Control Attributes: Pipeline Register Configuration
+					.ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+					.ADREG(0),                          // Pipeline stages for pre-adder (0-1)
+					.ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
+					.AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
+					.BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+					.BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
+					.CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
+					.CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
+					.CREG(0),                           // Pipeline stages for C (0-1)
+					.DREG(0),                           // Pipeline stages for D (0-1)
+					.INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
+					.MREG(1),                           // Multiplier pipeline stages (0-1)
+					.OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
+					.PREG(PREG),                        // Number of pipeline stages for P (0-1)
+					.RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
+				)
+				DSP58_inst (
+					// Cascade outputs: Cascade Ports
+					.ACOUT(),                           // 34-bit output: A port cascade
+					.BCOUT(),                           // 24-bit output: B cascade
+					.CARRYCASCOUT(),                    // 1-bit output: Cascade carry
+					.MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
+					.PCOUT(pcout[j][i]),                // 58-bit output: Cascade output
+					// Control outputs: Control Inputs/Status Bits
+					.OVERFLOW(),                        // 1-bit output: Overflow in add/acc
+					.PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
+					.PATTERNDETECT(),                   // 1-bit output: Pattern detect
+					.UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
+					// Data outputs: Data Ports
+					.CARRYOUT(),                        // 4-bit output: Carry
+					.P(pp),                             // 58-bit output: Primary data
+					.XOROUT(),                          // 8-bit output: XOR data
+					// Cascade inputs: Cascade Ports
+					.ACIN('x),                          // 34-bit input: A cascade data
+					.BCIN('x),                          // 24-bit input: B cascade
+					.CARRYCASCIN('x),                   // 1-bit input: Cascade carry
+					.MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
+					.PCIN(FIRST ? 'x : pcout[j][i-1]),  // 58-bit input: P cascade
+					// Control inputs: Control Inputs/Status Bits
+					.ALUMODE(4'h0),                     // 4-bit input: ALU control
+					.CARRYINSEL('0),                    // 3-bit input: Carry select
+					.CLK(clk),                          // 1-bit input: Clock
+					.INMODE({
+							INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
+							2'b00,
+							TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
+							INTERNAL_PREGS==2 ? 1'b0 : 1'b1        
+					}),                                 // 5-bit input: INMODE control
+					.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
+					.OPMODE({
+							LAST ? {1'b0, L[1]} : 2'b00,
+							7'b000_0000
+					}), // 9-bit input: Operation mode
+					// Data inputs: Data Ports
+					.A({ 7'bx, a_in_i[i] }),            // 34-bit input: A data
+					.B(b_in_i[j][i]),                   // 24-bit input: B data
+					.C('x),                             // 58-bit input: C data
+					.CARRYIN('0),                       // 1-bit input: Carry-in
+					.D('x),                             // 27-bit input: D data
+					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+					.ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
+					.CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
+					.CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
+					.CEAD('0),                          // 1-bit input: Clock enable for ADREG
+					.CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
+					.CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
+					.CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
+					.CEC('0),                           // 1-bit input: Clock enable for CREG
+					.CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
+					.CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+					.CED('0),                           // 1-bit input: Clock enable for DREG
+					.CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
+					.CEM(en),                           // 1-bit input: Clock enable for MREG
+					.CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
+					.RSTA(rst),                         // 1-bit input: Reset for AREG
+					.RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
+					.RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
+					.RSTB(rst),                         // 1-bit input: Reset for BREG
+					.RSTC('0),                          // 1-bit input: Reset for CREG
+					.RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+					.RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
+					.RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
+					.RSTM(rst),                         // 1-bit input: Reset for MREG
+					.RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
+				);
+			end : genDSP
 		end : genDSPChain  
 	end : genDSPPE
     

From 18f94e7ab03a3034083680faa91a80359858589e Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:18:58 +0100
Subject: [PATCH 016/112] [rtl mvu]: extended flow control wrapper with
 additional compute core and other minor changes

---
 finn-rtllib/mvu/mvu_axi.sv        | 51 +++++++++++++++++++------------
 finn-rtllib/mvu/mvu_axi_wrapper.v | 48 ++++++++++++++---------------
 2 files changed, 54 insertions(+), 45 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
index 5d8700738f..e4a919ba88 100644
--- a/finn-rtllib/mvu/mvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_axi.sv
@@ -41,8 +41,8 @@ module mvu_axi #(
 	int unsigned ACCU_WIDTH,
 	bit SIGNED_ACTIVATIONS = 0,
 	int unsigned SEGMENTLEN = 0,
-	parameter RAM_STYLE = "auto",
-	parameter MVU_IMPL_STYLE,
+	bit FORCE_BEHAVIORAL = 0,
+	string MVU_IMPL_STYLE,
 
 	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
 	localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
@@ -96,12 +96,14 @@ module mvu_axi #(
 			$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
 			$finish;
 		end
-		if (SEGMENTLEN == 0) begin
-			$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
-		end
-		if (SEGMENTLEN > (SIMD+2)/3) begin
-			$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
-			$finish;
+		if (MVU_IMPL_STYLE == "mvu_8sx9_dsp58") begin // SEGMENTLEN is only consumed by the mvu_8sx9 core; key must match the genMVU8sx9 generate branch
+			if (SEGMENTLEN == 0) begin
+				$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
+			end
+			if (SEGMENTLEN > (SIMD+2)/3) begin
+				$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+				$finish;
+			end
 		end
 	end
 
@@ -116,7 +118,7 @@ module mvu_axi #(
 	uwire avld;
 	uwire ardy;
 
-	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay (
+	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay (
 		.clk, .rst,
 		.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
 		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
@@ -133,28 +135,37 @@ module mvu_axi #(
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
 	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
 	
-	if (MVU_IMPL_STYLE == "mvu_8sx9") begin : genMVU8sx9
+	if (MVU_IMPL_STYLE == "mvu_8sx9_dsp58") begin : genMVU8sx9
 		mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core (
+		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
 			.vld(ovld), .p(odat)
 		);
 	end
 	else if (MVU_IMPL_STYLE == "mvu_4sx4u") begin : genMVU4sx4u
-		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(0)) core (
+		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
 			.vld(ovld), .p(odat)
 		);
 	end
-	//else begin
-	//	$error("Unrecognized MVU_IMPL_STYLE!");
-	//	$finish;
-	//end
+	else if (MVU_IMPL_STYLE == "mvu_8sx8u_dsp48") begin : genMVU8sx8u
+		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		 .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+			.clk, .rst, .en,
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.vld(ovld), .p(odat)
+		);
+	end
+	else initial begin
+		$error("Unrecognized MVU_IMPL_STYLE!");
+		$finish;
+	end
 
 //-------------------- Output register slice --------------------\\
-	struct {
+	struct packed {
 		logic vld;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
 	} A = '{ vld: 0, default: 'x};
@@ -175,7 +186,7 @@ module mvu_axi #(
 		end
 	end
 	
-	struct {
+	struct packed {
 		logic vld;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
 	} B = '{ vld: 0, default: 'x};
diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v
index 323d2711e4..b79ba6bbd1 100644
--- a/finn-rtllib/mvu/mvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_axi_wrapper.v
@@ -41,7 +41,8 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	ACCU_WIDTH = $ACCU_WIDTH$,
 	parameter 	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
 	parameter 	SEGMENTLEN = $SEGMENTLEN$,
-	parameter 	RAM_STYLE = "$IBUF_RAM_STYLE$",
+	parameter	MVU_IMPL_STYLE = "$MVU_IMPL_STYLE$",
+	parameter	FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$,
 
 	// Safely deducible parameters
 	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
@@ -50,41 +51,38 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )(
   	// Global Control
-	input	logic  ap_clk,
-	input	logic  ap_rst_n,
-
+	input	ap_clk,
+	input	ap_rst_n,
 	// Weight Stream
-	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input	logic  s_axis_weights_tvalid,
-	output	logic  s_axis_weights_tready,
-
+	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  weights_V_TDATA,
+	input   weights_V_TVALID,
+	output  weights_V_TREADY,
 	// Input Stream
-	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input	logic  s_axis_input_tvalid,
-	output	logic  s_axis_input_tready,
-
+	input	[INPUT_STREAM_WIDTH_BA-1:0]  in0_V_TDATA,
+	input	in0_V_TVALID,
+	output	in0_V_TREADY,
 	// Output Stream
-	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
-	output	logic  m_axis_output_tvalid,
-	input	logic  m_axis_output_tready
+	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  out_V_TDATA,
+	output	out_V_TVALID,
+	input	out_V_TREADY
 );
 
 mvu_axi #(
 	.MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
 	.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-	.SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE)
+	.SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), .MVU_IMPL_STYLE(MVU_IMPL_STYLE)
 	) inst (
 	.ap_clk(ap_clk),
 	.ap_rst_n(ap_rst_n),
-	.s_axis_weights_tdata(s_axis_weights_tdata),
-	.s_axis_weights_tvalid(s_axis_weights_tvalid),
-	.s_axis_weights_tready(s_axis_weights_tready),
-	.s_axis_input_tdata(s_axis_input_tdata),
-	.s_axis_input_tvalid(s_axis_input_tvalid),
-	.s_axis_input_tready(s_axis_input_tready),
-	.m_axis_output_tdata(m_axis_output_tdata),
-	.m_axis_output_tvalid(m_axis_output_tvalid),
-	.m_axis_output_tready(m_axis_output_tready)
+	.s_axis_weights_tdata(weights_V_TDATA),
+	.s_axis_weights_tvalid(weights_V_TVALID),
+	.s_axis_weights_tready(weights_V_TREADY),
+	.s_axis_input_tdata(in0_V_TDATA),
+	.s_axis_input_tvalid(in0_V_TVALID),
+	.s_axis_input_tready(in0_V_TREADY),
+	.m_axis_output_tdata(out_V_TDATA),
+	.m_axis_output_tvalid(out_V_TVALID),
+	.m_axis_output_tready(out_V_TREADY)
 );
 
 endmodule : $MODULE_NAME_AXI_WRAPPER$
\ No newline at end of file

From 6d4a0a764e0e6ded16d7034e0d69f5408c76ca75 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:22:51 +0100
Subject: [PATCH 017/112] [rtl mvu]: fix to done_len flag when SIMD dimension
 fully unrolled and PyVerilator-related syntax change

---
 finn-rtllib/mvu/replay_buffer.sv | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
index 685ac03137..89bbbdb88f 100644
--- a/finn-rtllib/mvu/replay_buffer.sv
+++ b/finn-rtllib/mvu/replay_buffer.sv
@@ -35,8 +35,7 @@
 module replay_buffer #(
 	int unsigned  LEN,	// Sequence length
 	int unsigned  REP,	// Sequence replay count
-	int unsigned  W,	// Data width
-	parameter RAM_STYLE = "auto" 	// ram style for buffer {block, distributed, ultra, auto}
+	int unsigned  W 	// Data width
 )(
 	input	logic  clk,
 	input	logic  rst,
@@ -54,7 +53,7 @@ module replay_buffer #(
 
 	typedef logic [$clog2(REP)+$clog2(LEN)-1:0]  count_t;
 	count_t  Count = 0;
-	uwire  done_len = ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0;
+	uwire  done_len = LEN == 1 ? 1 : ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0;
 	uwire  done_rep;
 	uwire  done_all = done_len && done_rep;
 
@@ -83,7 +82,6 @@ module replay_buffer #(
 		end
 		assign	first_rep = FirstRep;
 
-		(* RAM_STYLE = RAM_STYLE *)
 		data_t  Buf[LEN];
 		if(LEN == 1) begin : genTrivial
 			always_ff @(posedge clk) begin
@@ -92,7 +90,10 @@ module replay_buffer #(
 		end : genTrivial
 		else begin : genShift
 			always_ff @(posedge clk) begin
-				if(shift)  Buf <= { odat, Buf[0:LEN-2] };
+				if(shift) begin
+					Buf[0] <= odat;
+					Buf[1:LEN-1] <= Buf[0:LEN-2];
+				end
 			end
 		end : genShift
 

From 90c547d54756aed2aa101862fb6f55c05149173c Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:23:22 +0100
Subject: [PATCH 018/112] [rtl mvu tb]: updated testbench

---
 finn-rtllib/mvu/tb/mvu_axi_tb.sv | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
index 08a349da84..ef5fa7d682 100644
--- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv
+++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
@@ -35,17 +35,18 @@ module mvu_axi_tb();
 
 //-------------------- Simulation parameters --------------------\\
 	// Matrix & parallelism config
-	localparam int unsigned MW = 90;
-	localparam int unsigned MH = 16;
-	localparam int unsigned SIMD = 9;
-	localparam int unsigned PE = 4;
-	localparam int unsigned SEGMENTLEN = 1;
-	localparam string MVU_IMPL_STYLE = "mvu_8sx9";
+	localparam int unsigned MW = 50;
+	localparam int unsigned MH = 8;
+	localparam int unsigned SIMD = 10;
+	localparam int unsigned PE = 2;
+	localparam int unsigned SEGMENTLEN = 2;
+	localparam string MVU_IMPL_STYLE = "mvu_8sx8u_dsp48";
+	localparam bit FORCE_BEHAVIORAL = 1;
 	// Bit-width config  
 	localparam int unsigned ACTIVATION_WIDTH = 8;
-	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam int unsigned WEIGHT_WIDTH = 8;
 	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
-	localparam bit SIGNED_ACTIVATIONS = 1;
+	localparam bit SIGNED_ACTIVATIONS = 0;
 	// Simulation constants  
 	localparam int unsigned NF = MH/PE;
 	localparam int unsigned SF = MW/SIMD;
@@ -94,7 +95,7 @@ module mvu_axi_tb();
 		for (int i=0; i<SF; i++) begin
 			activations.dat <= ACTIVATIONS[i];
 			do begin 
-				activations.vld = $urandom()%7 > 1;
+				activations.vld = $urandom()%7 >= 1;
 				@(posedge clk);
 			end while (!(activations.vld === 1 && activations.rdy === 1));
 		end
@@ -201,6 +202,7 @@ module mvu_axi_tb();
 		.ACCU_WIDTH(ACCU_WIDTH),
 		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
 		.SEGMENTLEN(SEGMENTLEN),
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL),
 		.MVU_IMPL_STYLE(MVU_IMPL_STYLE)
 	)
 	dut (

From 0c37f1f7bed1143833649accceb59bd6821bed3c Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:25:10 +0100
Subject: [PATCH 019/112] [builder]: added specialize_to_rtl step and changed
 standalone threshold layers to be by default true

---
 src/finn/builder/build_dataflow_config.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index 4c3e4ff899..24940489df 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -121,6 +121,7 @@ class VerificationStepType(str, Enum):
     "step_apply_folding_config",
     "step_minimize_bit_width",
     "step_generate_estimate_reports",
+    "step_specialize_to_rtl",
     "step_hls_codegen",
     "step_hls_ipgen",
     "step_set_fifo_depths",
@@ -233,7 +234,7 @@ class DataflowBuildConfig:
     #: activations in FINN) will be implemented as stand-alone HLS layers,
     #: instead of being part of MatrixVectorActivation layer. This gives larger
     #: flexibility, and makes it possible to have runtime-writable thresholds.
-    standalone_thresholds: Optional[bool] = False
+    standalone_thresholds: Optional[bool] = True
 
     #: (Optional) Whether optimizations that minimize the bit width of the
     #: weights and accumulator will be applied. Because this optimization relies

From 5ccb016a640dbed6818a9f1f3ef46136ce949c0d Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:26:03 +0100
Subject: [PATCH 020/112] [builder]: added specialize_to_rtl step

---
 src/finn/builder/build_dataflow_steps.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index e43a29d632..3e4d047a51 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -123,6 +123,7 @@
 )
 from finn.util.pyverilator import verilator_fifosim
 from finn.util.test import execute_parent
+import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
 
 
 def verify_step(
@@ -483,6 +484,16 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig
     return model
 
 
+def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Convert layers implemented in HLS to an equivalent specialized RTL
+    implementation if possible. ``cfg`` is not read here; it is accepted so
+    the signature matches the other build steps. Returns the new model."""
+    specialize_to_rtl_transforms = [to_rtl.InferRTLMatrixVectorActivation()]
+    for trn in specialize_to_rtl_transforms:
+        model = model.transform(trn)
+    return model
+
+
 def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig):
     """Tighten the weight and accumulator bit widths for each layer."""
     if cfg.minimize_bit_width:
@@ -855,6 +866,7 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig):
     "step_apply_folding_config": step_apply_folding_config,
     "step_minimize_bit_width": step_minimize_bit_width,
     "step_generate_estimate_reports": step_generate_estimate_reports,
+    "step_specialize_to_rtl": step_specialize_to_rtl,
     "step_hls_codegen": step_hls_codegen,
     "step_hls_ipgen": step_hls_ipgen,
     "step_set_fifo_depths": step_set_fifo_depths,

From f099f4bbfd01b628a89c6099f637a4a85a8158ca Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:26:44 +0100
Subject: [PATCH 021/112] [custom op]: added custom op
 MatrixVectorActivation_rtl

---
 src/finn/custom_op/fpgadataflow/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 56d4230a3a..19c0ddd999 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -49,6 +49,7 @@
 from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
 from finn.custom_op.fpgadataflow.lookup import Lookup
 from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation
+from finn.custom_op.fpgadataflow.matrixvectoractivation_rtl import MatrixVectorActivation_rtl
 from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch
 from finn.custom_op.fpgadataflow.streamingdataflowpartition import (
     StreamingDataflowPartition,
@@ -70,6 +71,7 @@
 custom_op["DownSampler"] = DownSampler
 custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch
 custom_op["MatrixVectorActivation"] = MatrixVectorActivation
+custom_op["MatrixVectorActivation_rtl"] = MatrixVectorActivation_rtl
 custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator
 custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D
 custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl

From 9a3b0fdc54f8c7c1b541c8cfdaaf6e96315da092 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:28:34 +0100
Subject: [PATCH 022/112] [custom op]: added additional attribute to enable
 conversion to RTL (custom-op)

---
 src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
index aa987384dd..e54abb0c3f 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
@@ -70,7 +70,7 @@ def get_nodeattr_types(self):
             "SIMD": ("i", True, 0),
             "MW": ("i", True, 0),
             "MH": ("i", True, 0),
-            "resType": ("s", False, "lut", {"auto", "lut", "dsp"}),
+            "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}),
             "ActVal": ("i", False, 0),
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
@@ -125,6 +125,8 @@ def get_nodeattr_types(self):
             # vector through the accelerator. This will get rid of any old
             # weight data from the weight FIFOs.
             "runtime_writeable_weights": ("i", False, 0, {0, 1}),
+            # Flag to specify whether RTL-based or HLS-based implementation is preferred
+            "impl": ("s", False, "rtl", {"hls", "rtl"})
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs

From 38aa930baa1296a7099f9df22e3d0d000c8d5a05 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:30:15 +0100
Subject: [PATCH 023/112] [custom op]: modified ip-stitching and code
 generation

---
 .../matrixvectoractivation_rtl.py             | 231 ++++++++++--------
 1 file changed, 127 insertions(+), 104 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index c8a0aa675b..6b1c2f3be7 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import math
+from shutil import copy
 import numpy as np
 import os
 import textwrap
@@ -45,6 +46,12 @@
     pack_innermost_dim_as_hex_string,
     rtlsim_output_to_npy,
 )
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
+
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
 
 from . import templates
 
@@ -60,8 +67,8 @@ class MatrixVectorActivation_rtl(HLSCustomOp):
     """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch
     function."""
 
-    def __init__(self, onnx_node):
-        super().__init__(onnx_node)
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
         self.decoupled_wrapper = templates.decoupled_wrapper
 
     def get_nodeattr_types(self):
@@ -78,11 +85,6 @@ def get_nodeattr_types(self):
             "outputDataType": ("s", True, ""),
             # FINN DataType for accumulator -- auto-computed and updated
             "accDataType": ("s", False, "INT32"),
-            # use xnor-popcount for binary weights/inputs, thus treating them
-            # as bipolar
-            "binaryXnorMode": ("i", False, 0, {0, 1}),
-            # no-activation mode (produce accumulators)
-            "noActivation": ("i", False, 0, {0, 1}),
             # number of input vectors, examples:
             # [1] is a single vector (like a FC layer with batch=1)
             # [4] is four vectors (like a FC layer with batch=4)
@@ -105,16 +107,6 @@ def get_nodeattr_types(self):
                 "auto",
                 {"auto", "block", "distributed", "ultra"},
             ),
-            # FPGA resource type for threshold memories (if noActivation is False)
-            # auto -- let Vivado decide
-            # block -- use BRAM
-            # distributed -- use LUTRAM
-            "ram_style_thresholds": (
-                "s",
-                False,
-                "auto",
-                {"auto", "block", "distributed"},
-            ),
             # (mem_mode = decoupled only) whether weights will be writable through
             # an AXI-lite interface during runtime
             # 1 for enabled, 0 for disabled.
@@ -125,6 +117,8 @@ def get_nodeattr_types(self):
             # vector through the accelerator. This will get rid of any old
             # weight data from the weight FIFOs.
             "runtime_writeable_weights": ("i", False, 0, {0, 1}),
+            # attribute to save top module name - not user configurable
+            "gen_top_module": ("s", False, ""),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -142,7 +136,6 @@ def calc_wmem(self):
 
     def calc_tmem(self):
         """Calculates and returns TMEM."""
-        assert self.get_nodeattr("noActivation")==1, "RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer"
         return 0
 
     def make_shape_compatible_op(self, model):
@@ -192,27 +185,9 @@ def verify_node(self):
                 """The required MatrixVectorActivation attributes do not exist."""
             )
 
-        # verify the number of inputs depending on noActivation value
-        # check noActivation value to determine the number of inputs
-        no_act = self.get_nodeattr("noActivation")
-
-        if no_act == 1:
-            if len(self.onnx_node.input) == 2:
-                info_messages.append("The number of inputs is correct")
-            else:
-                info_messages.append(
-                    """RTL-based MatrixVectorActivation needs in no
-                            activation mode 2 inputs (data input and weights)"""
-                )
-        elif no_act == 0:
-            info_messages.append("RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer")
-        else:
-            info_messages.append(
-                """noActivation attribute contains {} should
-                be 1 for RTL-based MatrixVectorActivation""".format(
-                    no_act
-                )
-            )
+        num_of_inputs = len(self.onnx_node.input)
+        if num_of_inputs!=2:
+            info_messages.append("RTL-based MatrixVectorActivation expects two inputs (weights and activation), but got {} inputs.".format(len(self.onnx_node.input)))
 
         mem_mode = self.get_nodeattr("mem_mode")
 
@@ -221,6 +196,7 @@ def verify_node(self):
 
         return info_messages
 
+# TODO: Add in replay_buffer estimation
     def uram_estimation(self):
         P = self.get_nodeattr("PE")
         Q = self.get_nodeattr("SIMD")
@@ -242,6 +218,7 @@ def uram_estimation(self):
         depth_multiplier = math.ceil(omega / 4096)
         return width_multiplier * depth_multiplier
 
+# TODO: Add in replay_buffer estimation
     def bram_estimation(self):
         """Calculates resource estimation for BRAM based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -268,7 +245,7 @@ def bram_estimation(self):
         ):
             return 0
         # assuming SDP mode RAMB18s (see UG573 Table 1-10)
-        # assuming decoupled (RTL) memory, which is more efficient than const (HLS)
+        # assuming decoupled (RTL) memory
         if mem_width == 1:
             return math.ceil(omega / 16384)
         elif mem_width == 2:
@@ -282,6 +259,7 @@ def bram_estimation(self):
         else:
             return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36))
 
+# TODO: Add in replay_buffer estimation
     def bram_efficiency_estimation(self):
         wdt = self.get_weight_datatype()
         W = wdt.bitwidth()
@@ -294,6 +272,7 @@ def bram_efficiency_estimation(self):
         bram16_est_capacity = bram16_est * 36 * 512
         return wbits / bram16_est_capacity
 
+# TODO: Add in replay_buffer estimation
     def uram_efficiency_estimation(self):
         """Function for URAM efficiency estimation: actual parameter storage
         needed divided by the allocated URAM storage (from estimation)"""
@@ -308,7 +287,7 @@ def uram_efficiency_estimation(self):
         uram_est_capacity = uram_est * 72 * 4096
         return wbits / uram_est_capacity
 
-#TODO: FIX
+#TODO: FIX: worst case estimates since segmentlen is not known at this point?
     def lut_estimation(self):
         """Calculates resource estimations for LUTs based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -348,23 +327,14 @@ def lut_estimation(self):
         # accumulator
         acc_bits = W + A + np.ceil(math.log(MW, 2))
         acc_luts = acc_bits
-        # thresholds and threshold comparators
-        thr_luts = 0
-        comp_luts = 0
-        noact = self.get_nodeattr("noActivation")
-        if noact == 0:
-            odt = self.get_output_datatype()
-            B = odt.bitwidth()
-            thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64)
-            comp_luts = (2**B - 1) * acc_bits
 
         return int(
             c0
-            + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts))
+            + c1 * (P * (mult_luts + addertree_luts + acc_luts))
             + c2
         )
 
-#TODO: FIX
+#TODO: FIX: worst case estimates since segmentlen is not known at this point?
     def dsp_estimation(self):
         # multiplication
         P = self.get_nodeattr("PE")
@@ -380,7 +350,7 @@ def dsp_estimation(self):
             mult_dsp = 0
         return int(mult_dsp)
 
-#TODO: FIX
+#TODO: FIX: worst case estimates since segmentlen is not known at this point
     def get_exp_cycles(self):
         pe = self.get_nodeattr("PE")
         simd = self.get_nodeattr("SIMD")
@@ -389,6 +359,7 @@ def get_exp_cycles(self):
         mw = self.get_nodeattr("MW")
         # since mmv != 1 is not supported yet, we set mmv for now to 1
         mmv = 1
+        # Actual exp_cycles is probably slightly larger: roughly 3 cycles for the DSP registers (A/B, M, P) plus additional pipeline buffer cycles, most probably fewer than 10 extra cycles
         exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv
         return int(exp_cycles)
 
@@ -413,7 +384,7 @@ def get_output_datatype(self, ind=0):
 
     def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
-        assert i_bits<=9, "RTL-based MVAU only supports activations with bit-width up to 9-bits"
+        assert (i_bits<=9), "RTL-based MVAU only supports activations with bit-width up to 9-bits"
         in_width = i_bits * self.get_nodeattr("SIMD")
         return in_width
 
@@ -431,8 +402,8 @@ def get_weightstream_width(self):
             pe = self.get_nodeattr("PE")
             simd = self.get_nodeattr("SIMD")
             wp = self.get_weight_datatype().bitwidth()
+            assert (wp <= 8), "RTL-based MVAU only supports weights with bit-width up to 8-bits"
             w_width = pe * simd * wp
-            assert wp<=8, "RTL-based MVAU only supports weights with bit-width up to 8-bits"
             return w_width
         else:
             return 0
@@ -544,10 +515,8 @@ def minimize_accumulator_width(self, model):
                 adt = DataType.get_smallest_possible(-acc_max - 1)
         else:
             adt = DataType.get_smallest_possible(acc_max)
-        # ensure a datatype divisible by 8-bits in case this is the last node
-        bw = roundup_to_integer_multiple(adt.bitwidth(), 8)
-        new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw))
-        adt = DataType[new_adt_name]
+        # Note: we are interested in simply the width of the output dot product.
+        # Padding the actual output stream to a multiple of 8-bits is done in the RTL component
         self.set_nodeattr("accDataType", adt.name)
         # for no-activation nodes, output dt = acc dt
         self.set_nodeattr("outputDataType", adt.name)
@@ -588,7 +557,10 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
                 1, -1, pe * simd
             )
             weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy()
-            if weight_file_mode == "decoupled_verilog_dat":
+            if weight_file_mode == "decoupled_npy":
+                # save weight stream into npy for cppsim
+                np.save(weight_file_name, weight_tensor_simd_flipped)
+            elif weight_file_mode == "decoupled_verilog_dat":
                 # convert weight values into hexstring
                 weight_width = self.get_weightstream_width()
                 # pad to nearest 4 bits to get hex strings
@@ -638,7 +610,7 @@ def generate_params(self, model, path):
             weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
             # save decoupled weights for cppsim
             self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
-            # also save weights as Verilog .dat file
+            # Also save weights as Verilog .dat file
             # note that we provide two different .dat files, one for synth
             # and one for synthesis. this is because URAM-based weights always
             # need zero weights for synthesis, otherwise they get inferred
@@ -693,7 +665,6 @@ def execute_node(self, context, graph):
         for inputs in node.input:
             # it is assumed that the first input of the node is the data input
             # the second input are the weights
-            # the third input are the thresholds
             if in_ind == 0:
                 assert (
                     str(context[inputs].dtype) == "float32"
@@ -709,7 +680,7 @@ def execute_node(self, context, graph):
                     reshaped_input,
                 )
             elif in_ind > 2:
-                raise Exception("Unexpected input found for MatrixVectorActivation")
+                raise Exception("Unexpected input found for MatrixVectorActivation_rtl")
             in_ind += 1
 
         if mode == "rtlsim":
@@ -759,7 +730,7 @@ def execute_node(self, context, graph):
     def code_generation_ipgen(self, model, fpgapart, clk):
         """Normally: Generates C++ code and tcl script for IP generation.
         Here: Generates (System-)Verilog code for IP generation."""
-        self.generate_hdl()
+        self.generate_hdl(model, fpgapart, clk)
 
     def ipgen_singlenode_code(self):
         """Normally: Builds the bash script for IP generation."""
@@ -828,11 +799,21 @@ def code_generation_ipi(self):
                 "create_bd_intf_pin -mode Slave "
                 "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
             )
-            # instantiate the hls ip
-            cmd.append(
-                "create_bd_cell -type ip -vlnv %s /%s/%s"
-                % (self.get_nodeattr("ip_vlnv"), node_name, node_name)
-            )
+            # instantiate the RTL block
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+            rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+            sourcefiles = [
+                os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"),
+                rtllib_dir + "mvu_axi.sv",
+                rtllib_dir + "replay_buffer.sv",
+                rtllib_dir + "mvu_4sx4u.sv",
+                rtllib_dir + "mvu_8sx9.sv",
+                rtllib_dir + "mvu_8sx8u_dsp48.sv"
+            ]
+            for f in sourcefiles:
+                cmd.append("add_files -norecurse %s" % (f))
+            cmd.append("create_bd_cell -type hier -reference %s /%s/%s" % (self.get_nodeattr("gen_top_module"), self.onnx_node.name, self.onnx_node.name))
+
             # instantiate a streamer and connect it to the HLS IP
             strm_vlnv = "xilinx.com:user:memstream:1.0"
             strm_inst = node_name + "_wstrm"
@@ -947,12 +928,6 @@ def get_op_and_param_counts(self):
         weight_param_type = "param_weight_%db" % (weight_bits)
         weight_count = in_features * out_features
         ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
-        if self.get_nodeattr("noActivation") == 0:
-            tdt = DataType[self.get_nodeattr("accDataType")]
-            thres_bits = tdt.bitwidth()
-            thres_param_type = "param_threshold_%db" % (thres_bits)
-            thres_count = out_features
-            ret_dict[thres_param_type] = thres_count
         return ret_dict
 
     def derive_characteristic_fxns(self, period):
@@ -972,65 +947,113 @@ def derive_characteristic_fxns(self, period):
             ]
         super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
 
-    def generate_hdl(self):
-#TODO: add distinction between (PE=MH or PE=1) and where MH dimension is folded
-        template_path, code_gen_dict = self.prepare_codegen_default()
+# TODO: characterize max_clk and implement this function in look-up style
+    def _resolve_segment_len(self, clk):
+        """Segment length for inserting pipeline registers in the DSP chain to
+        meet the target clock frequency; currently always 0, ``clk`` is unused."""
+        return 0
+
+    def _resolve_impl_style(self, fpgapart):
+        """Choose the RTL compute core from the operand bit-widths and, for
+        non 4x4-bit operands, from whether ``fpgapart`` names a Versal part."""
+        act_width = self.get_input_datatype(0).bitwidth()
+        weight_width = self.get_input_datatype(1).bitwidth()
+        if act_width == 4 and weight_width == 4:
+            return "mvu_4sx4u"
+        versal_prefixes = ("xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm", "xqrvc")
+        if fpgapart.startswith(versal_prefixes):
+            # NOTE(review): code_generation_ipi adds mvu_8sx9.sv; confirm this name
+            return "mvu_8sx9_dsp58"
+        return "mvu_8sx8u_dsp48"
+
+    def generate_hdl(self, model, fpgapart, clk):
+        # Generate params as part of IP preparation
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        self.generate_params(model, code_gen_dir)
 
+        template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk)
         # add general parameters to dictionary
-        code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()]
+        code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()]
         # save top module name so we can refer to it after this node has been renamed
         # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
         self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
-#TODO: currently only ram_style=auto is supported
+
         ram_style = self.get_nodeattr("ram_style")
-        if ram_style == "auto":
-            continue
-        else:
-            raise Exception("Unrecognized ram_style for MatrixVectorActivation")
+        assert (ram_style=="auto"), "Unrecognized ram_style for MatrixVectorActivation_rtl"
 
-        # apply code generation to templates
-        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        # apply code generation to template
         with open(template_path, "r") as f:
-            template = f.read()
+            template_wrapper = f.read()
         for key in code_gen_dict:
             # transform list into long string separated by '\n'
             code_gen_line = "\n".join(code_gen_dict[key])
-            template = template.replace(key, code_gen_line)
             template_wrapper = template_wrapper.replace(key, code_gen_line)
         with open(
             os.path.join(
-                code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv"
+                code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
             ),
             "w",
         ) as f:
-            f.write(template)
+            f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(0)))
         with open(
             os.path.join(
-                code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+                code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"
             ),
             "w",
         ) as f:
-            f.write(template_wrapper)
+            f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(1)))
 
         # set ipgen_path and ip_path so that HLS-Synth transformation
         # and stich_ip transformation do not complain
         self.set_nodeattr("ipgen_path", code_gen_dir)
-        self.set_nodeattr("ip_path", code_gen_dir)    
+        self.set_nodeattr("ip_path", code_gen_dir)
 
-    def prepare_codegen_default(self):
-        # TODO: Differentiate between PE folding and fully unrolled along MH dimension
+    def prepare_codegen_default(self, fpgapart, clk):
         template_path = (
-            os.environ["FINN_ROOT"] + "/finn-rtllib/mvau/dsp58_mvau_template.vhdl"
+            os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v"
         )
+        
         code_gen_dict = {}
-
-        code_gen_dict["$PE$"] = self.get_nodeattr("PE")
-        code_gen_dict["$SIMD$"] = self.get_nodeattr("SIMD")
-        code_gen_dict["$MW$"] = self.get_nodeattr("MW")
-        code_gen_dict["$MH$"] = self.get_nodeattr("MH")
-        code_gen_dict["$ACTIVATION_WIDTH$"] = self.get_input_datatype(0).bitwidth()
-        code_gen_dict["$WEIGHT_WIDTH$"] = self.get_input_datatype(1).bitwidth()
-        code_gen_dict["$ACCU_WIDTH_BA$"] = self.get_output_datatype().bitwidth()
+        code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))]
+        code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))]
+        code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))]
+        code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))]
+        code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())]
+        code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())]
+        code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())]
+        code_gen_dict["$SIGNED_ACTIVATIONS$"] = [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
+        code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))]
+        code_gen_dict["$MVU_IMPL_STYLE$"] = [self._resolve_impl_style(fpgapart)]
 
         return template_path, code_gen_dict
 
+    def prepare_rtlsim(self):
+        """Build the Verilator emulation library for this node's generated RTL.
+
+        Stores the path of the built library in the ``rtlsim_so`` attribute
+        and returns the PyVerilator wrapper around it. Raises ImportError
+        when PyVerilator could not be imported."""
+        if PyVerilator is None:
+            raise ImportError("Installation of PyVerilator is required.")
+
+        top_module = self.get_nodeattr("gen_top_module")
+        # wrapper variant that generate_hdl writes with FORCE_BEHAVIORAL set to 1
+        sim_wrapper = top_module + "_wrapper_sim.v"
+        # search the generated code directory and the MVU RTL library directory
+        search_dirs = [
+            self.get_nodeattr("code_gen_dir_ipgen"),
+            os.environ["FINN_ROOT"] + "/finn-rtllib/mvu",
+        ]
+
+        # build the Verilator emu library
+        sim = PyVerilator.build(
+            [sim_wrapper],
+            build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
+            verilog_path=search_dirs,
+            trace_depth=get_rtlsim_trace_depth(),
+            top_module_name=self.get_verilog_top_module_name(),
+        )
+        # save generated lib filename in attribute
+        self.set_nodeattr("rtlsim_so", sim.lib._name)
+
+        return sim
\ No newline at end of file

From 4e44934c3001174e52c62caf5d320104a308e611 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:31:35 +0100
Subject: [PATCH 024/112] [tests]: initial version of unit test for RTL custom
 op and specialize_to_rtl transformation for MVU

---
 .../test_fpgadataflow_mvau_rtl.py             | 172 ++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py

diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
new file mode 100644
index 0000000000..20a249bd08
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
@@ -0,0 +1,172 @@
+# Copyright (C) 2022, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+import os
+
+import numpy as np
+from onnx import TensorProto, helper
+from qonnx.util.basic import (
+    qonnx_make_model,
+    gen_finn_dt_tensor
+)
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.core.datatype import DataType
+from qonnx.transformation.general import GiveUniqueNodeNames
+import finn.core.onnx_exec as oxe
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from qonnx.transformation.general import ApplyConfig
+import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
+#import qonnx.core.data_layout as DataLayout
+
+build_dir = os.environ["FINN_BUILD_DIR"]
+
+def make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt):
+    (ofm_h, ofm_w) = ofm_shape
+    ofm = helper.make_tensor_value_info(
+        "ofm",
+        TensorProto.FLOAT,
+        (1, ofm_h, ofm_w, mh)
+    )
+
+    matmul_node = helper.make_node(
+        "MatMul",
+        ["ifm", "weights"],
+        ["ofm"]
+    )
+    graph = helper.make_graph(
+        nodes=[matmul_node],
+        name="matmul_graph",
+        inputs=[ifm],
+        outputs=[ofm]
+    )
+
+    model = qonnx_make_model(graph, producer_name="fclayer-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("ifm", idt)
+    model.set_tensor_datatype("weights", wdt)
+    model.set_tensor_datatype("ofm", DataType["INT32"]) # At this step, the MatMul layer does not optimize the bit-width of the output datatype
+    model.set_initializer("weights", W)
+
+    # model.set_tensor_layout("ifm", DataLayout.NHWC)
+
+    return model
+
+def prepare_inputs(input_tensor):
+    return {"inp": input_tensor}
+
+@pytest.mark.parametrize("mh", [16])
+@pytest.mark.parametrize("mw", [90])
+#@pytest.mark.parametrize("pe", [1, 2, 4, 8, 16])
+@pytest.mark.parametrize("pe", [16])
+#@pytest.mark.parametrize("simd", [1, 30, 90])
+@pytest.mark.parametrize("simd", [90])
+@pytest.mark.parametrize("idt", [DataType["INT8"]])
+@pytest.mark.parametrize("wdt", [DataType["UINT4"]])
+#@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"])
+@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"])
+@pytest.mark.parametrize("segmentlen", [1])
+@pytest.mark.fpgadataflow
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen):
+    # Create test input vector (produced by SWG)
+    ofm_shape = (5, 5)
+    ofm_h, ofm_w = ofm_shape
+    ifm = helper.make_tensor_value_info(
+        "ifm",
+        TensorProto.FLOAT,
+        [1, ofm_h, ofm_w, mw]
+    )
+    weights = helper.make_tensor_value_info(
+        "weights",
+        TensorProto.FLOAT,
+        [mw, mh]
+    )
+    W = gen_finn_dt_tensor(wdt, (mw, mh))
+    model = make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt)
+    model = model.transform(GiveUniqueNodeNames())
+
+    model.save(build_dir+"/matmul.onnx")
+
+    # Create MatMul & obtain golden reference output
+    A = gen_finn_dt_tensor(model.get_tensor_datatype("ifm"), model.get_tensor_shape("ifm"))
+    input_dict = prepare_inputs(A)
+
+    ## Execute ONNX model
+    output_matmul = oxe.execute_onnx(model, input_dict)
+
+    # Create MVAU (HLS)
+    model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled"))
+    model = model.transform(GiveUniqueNodeNames())
+    
+    # Apply folding (i.e. specify to use DSPs)
+    folding_config = {
+        "Defaults": {},
+        "MatrixVectorActivation_0": {
+            "PE" : pe,
+            "SIMD" : simd,
+            "mem_mode" : "decoupled",
+            "ram_style" : "auto",
+            "resType" : "dsp",
+            "impl" : "rtl"
+        }
+    }
+    model = model.transform(ApplyConfig(folding_config))
+    model.save(build_dir+"/mvau_hls.onnx")
+
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(PrepareIP(part, 5))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+    output_mvau_hls = oxe.execute_onnx(model, input_dict)["ofm"]
+
+    # Apply convert-to-rtl step
+    model = model.transform(to_rtl.InferRTLMatrixVectorActivation())
+    model = model.transform(GiveUniqueNodeNames())
+    model.save(build_dir+"/mvau_rtl.onnx")
+
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(PrepareIP("xcvm1802-vsvd1760-2MP-e-S", 5))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+    output_mvau_rtl = oxe.execute_onnx(model, input_dict)["ofm"]
+
+    model.save(build_dir+"/mvau_rtl_sim.onnx")
+
+    assert (output_mvau_hls == output_mvau_rtl).all()
+    assert (output_mvau_hls.size > 0)
+
+
+# python setup.py test --addopts "-k test_fpgadataflow_mvau_rtl"
+# python setup.py test --addopts "-k test_fpgadataflow_fclayer_rtlsim"
\ No newline at end of file

From cc361d9fd4ea082e04d7a1a6bc3932406b0a4f14 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:32:52 +0100
Subject: [PATCH 025/112] [rtl mvu]: specialized compute core for 4-bit weights
 and activations for DSP48/DSP58

---
 finn-rtllib/mvu/mvu_4sx4u.sv | 359 +++++++++++++++++++++++++++++++++++
 1 file changed, 359 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_4sx4u.sv

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
new file mode 100644
index 0000000000..5993154355
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -0,0 +1,359 @@
+module mvu_4sx4u #(	// MVU compute core: signed 4-bit weights x unsigned 4-bit activations
+	int unsigned  PE,	// number of weight rows / output lanes (first dimension of w and p)
+	int unsigned  SIMD,	// number of weight/activation pairs consumed per row and cycle
+	int unsigned  ACCU_WIDTH,	// bit width of each accumulator output in p
+	bit FORCE_BEHAVIORAL = 0	// 1: use the behavioral pipeline instead of the DSP48E2 primitive
+)(
+	// Global Control
+	input	logic  clk,
+	input	logic  rst,
+	input	logic  en,
+
+	// Input
+	input	logic  last,
+	input	logic  zero,	// ignore current inputs and force this partial product to zero
+	input	logic signed [PE-1:0][SIMD-1:0][3:0]  w,	// signed weights
+	input	logic                [SIMD-1:0][3:0]  a,	// unsigned activations
+
+	// Output
+	output	logic  vld,	// L[5], i.e. last delayed by five enabled clock cycles
+	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
+);
+
+	typedef int unsigned  leave_load_t[2*SIMD-1];	// one entry per node of a binary tree with SIMD leaves
+	function leave_load_t init_leave_loads();	// leaf count below each node; children of node i are 2*i+1 and 2*i+2
+		automatic leave_load_t  res;
+		for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;	// leaves
+		for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];	// inner nodes, bottom-up
+		return  res;
+	endfunction : init_leave_loads
+
+	// Pipeline for last indicator flag: L[k] holds last delayed by k enabled cycles
+	logic [1:5] L = '0;
+	always_ff @(posedge clk) begin
+		if(rst)      L <= '0;
+		else if(en)  L <= { last, L[1:4] };
+	end
+	assign	vld = L[5];
+
+	// Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
+	localparam int unsigned  D[4:0] = '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; // Lane offsets: D[0]=0, D[1]=8, D[2]=15, D[3]=22, D[4]=ACCU_WIDTH+22
+
+	localparam int unsigned  PIPE_COUNT = (PE+3)/4;	// up to four PEs share one pipe
+	for(genvar  c = 0; c < PIPE_COUNT; c++) begin : genPipes
+
+		localparam int unsigned  PE_BEG = 4*c;
+		localparam int unsigned  PE_END = PE < 4*(c+1)? PE : 4*(c+1);	// exclusive; the last pipe may serve fewer than four PEs
+
+		uwire        [57:0]  p3[SIMD];	// accumulator output of each SIMD position
+		uwire signed [ 1:0]  h3[SIMD][3];	// per-SIMD corrections at the three lane boundaries
+		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
+
+			// Input Lane Assembly - NOTE(review): bb/aa (and pp below) are sized 24/34/58 bits, wider than the 18/30/48-bit B/A/P ports of the DSP48E2 instantiated below; confirm
+			uwire [23:0]  bb = a[s];
+			logic [33:0]  aa;
+			logic [26:0]  dd;
+			logic [ 1:0]  xx[3:1];
+			if(1) begin : blkVectorize
+				uwire [3:0]  ww[PE_END - PE_BEG];
+				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
+					assign	ww[pe] = w[PE_BEG + pe][s];
+					if(pe) begin
+//						assign  xx[pe] = zero? 0 : ww[pe] * a[s];
+						LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
+							.O6(xx[pe][1]),
+							.O5(xx[pe][0]),
+							.I5(1'b1),
+							.I4(zero),
+							.I3(ww[pe][1]),
+							.I2(a[s][1]),
+							.I1(ww[pe][0]),
+							.I0(a[s][0])
+						);
+					end
+				end
+				always_comb begin
+					dd = '0;
+					aa = '0;
+					for(int unsigned  pe = 0; pe < PE_END - PE_BEG; pe++) begin
+						dd[D[pe]+:3] = ww[pe];	// low three weight bits of the lane
+						aa[D[pe]+ 3] = ww[pe][3];	// sign bit, subtracted later (AD = dd - aa)
+					end
+				end
+			end : blkVectorize
+
+			uwire [57:0]  pp;
+
+			// Note: Since the product B * AD is computed,
+			//       rst can be only applied to AD and zero only to B
+			//       with the same effect as zeroing both.
+			if (FORCE_BEHAVIORAL) begin : genBehav
+				// Stage #1: Input Refine
+				logic signed [23:0]  B1  = 0;
+				always_ff @(posedge clk) begin
+					if(zero)     B1  <= 0;
+					else if(en)  B1  <= bb;
+				end
+
+				logic signed [26:0]  AD1 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      AD1 <= 0;
+					else if(en)  AD1 <= dd - aa;
+				end
+
+				// Stage #2: Multiply
+				logic signed [50:0]  M2 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      M2 <= 0;
+					else if(en)  M2 <=
+// synthesis translate_off
+						(B1 === '0) || (AD1 === '0)? 0 :
+// synthesis translate_on
+						B1 * AD1;
+				end
+
+				// Stage #3: Accumulate
+				logic signed [57:0]  P3 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      P3 <= 0;
+					else if(en)  P3 <= M2 + (L[3]? 0 : P3);	// L[3] set: restart the accumulation from M2 alone
+				end
+
+				assign	pp = P3;
+			end : genBehav
+			else begin : genDSP
+				DSP48E2 #(
+					// Feature Control Attributes: Data Path Selection
+					.AMULTSEL("AD"),	// Selects A input to multiplier (A, AD)
+					.A_INPUT("DIRECT"),	// Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+					.BMULTSEL("B"),		// Selects B input to multiplier (AD, B)
+					.B_INPUT("DIRECT"),	// Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+					.PREADDINSEL("A"),                 // Selects input to pre-adder (A, B)
+					.RND('0),                          // Rounding Constant
+					.USE_MULT("MULTIPLY"),             // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+					.USE_SIMD("ONE48"),                // SIMD selection (FOUR12, ONE48, TWO24)
+					.USE_WIDEXOR("FALSE"),             // Use the Wide XOR function (FALSE, TRUE)
+					.XORSIMD("XOR24_48_96"),       // Mode of operation for the Wide XOR (XOR12, XOR24_48_96)
+
+					// Pattern Detector Attributes: Pattern Detection Configuration
+					.AUTORESET_PATDET("NO_RESET"),     // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+					.AUTORESET_PRIORITY("RESET"),      // Priority of AUTORESET vs. CEP (CEP, RESET).
+					.MASK('1),                         // 48-bit mask value for pattern detect (1=ignore)
+					.PATTERN('0),                      // 48-bit pattern match for pattern detect
+					.SEL_MASK("MASK"),                 // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+					.SEL_PATTERN("PATTERN"),           // Select pattern value (C, PATTERN)
+					.USE_PATTERN_DETECT("NO_PATDET"),  // Enable pattern detect (NO_PATDET, PATDET)
+
+					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+					.IS_ALUMODE_INVERTED('0),				// Optional inversion for ALUMODE
+					.IS_CARRYIN_INVERTED('0),				// Optional inversion for CARRYIN
+					.IS_CLK_INVERTED('0),					// Optional inversion for CLK
+					.IS_INMODE_INVERTED('0),				// Optional inversion for INMODE
+					.IS_OPMODE_INVERTED(9'b00_010_01_01),	// Optional inversion for OPMODE: effective X = Y = 01, Z = { 0, !L[2], 0 }
+					.IS_RSTALLCARRYIN_INVERTED('0),			// Optional inversion for RSTALLCARRYIN
+					.IS_RSTALUMODE_INVERTED('0),			// Optional inversion for RSTALUMODE
+					.IS_RSTA_INVERTED('0),					// Optional inversion for RSTA
+					.IS_RSTB_INVERTED('0),					// Optional inversion for RSTB
+					.IS_RSTCTRL_INVERTED('0),				// Optional inversion for RSTCTRL
+					.IS_RSTC_INVERTED('0),					// Optional inversion for RSTC
+					.IS_RSTD_INVERTED('0),					// Optional inversion for RSTD
+					.IS_RSTINMODE_INVERTED('0),				// Optional inversion for RSTINMODE
+					.IS_RSTM_INVERTED('0),					// Optional inversion for RSTM
+					.IS_RSTP_INVERTED('0),					// Optional inversion for RSTP
+
+					// Register Control Attributes: Pipeline Register Configuration
+					.ACASCREG(0),                      // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+					.ADREG(1),                         // Pipeline stages for pre-adder (0-1)
+					.ALUMODEREG(0),                    // Pipeline stages for ALUMODE (0-1)
+					.AREG(0),                          // Pipeline stages for A (0-2)
+					.BCASCREG(1),                      // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+					.BREG(1),                          // Pipeline stages for B (0-2)
+					.CARRYINREG(0),                    // Pipeline stages for CARRYIN (0-1)
+					.CARRYINSELREG(0),                 // Pipeline stages for CARRYINSEL (0-1)
+					.CREG(0),                          // Pipeline stages for C (0-1)
+					.DREG(0),                          // Pipeline stages for D (0-1)
+					.INMODEREG(0),                     // Pipeline stages for INMODE (0-1)
+					.MREG(1),                          // Multiplier pipeline stages (0-1)
+					.OPMODEREG(1),                     // Pipeline stages for OPMODE (0-1)
+					.PREG(1)                          // Number of pipeline stages for P (0-1)
+				) dsp (
+					// Cascade outputs: Cascade Ports
+					.ACOUT(),			// 30-bit output: A port cascade
+					.BCOUT(),			// 18-bit output: B cascade
+					.CARRYCASCOUT(),	// 1-bit output: Cascade carry
+					.MULTSIGNOUT(),		// 1-bit output: Multiplier sign cascade
+					.PCOUT(),			// 48-bit output: Cascade output
+
+					// Control outputs: Control Inputs/Status Bits
+					.OVERFLOW(),		// 1-bit output: Overflow in add/acc
+					.PATTERNBDETECT(),	// 1-bit output: Pattern bar detect
+					.PATTERNDETECT(),	// 1-bit output: Pattern detect
+					.UNDERFLOW(),		// 1-bit output: Underflow in add/acc
+
+					// Data outputs: Data Ports
+					.CARRYOUT(),		// 4-bit output: Carry
+					.P(pp),				// 48-bit output: Primary data
+					.XOROUT(),			// 8-bit output: XOR data
+
+					// Cascade inputs: Cascade Ports
+					.ACIN('x),			// 30-bit input: A cascade data
+					.BCIN('x),			// 18-bit input: B cascade
+					.CARRYCASCIN('x),	// 1-bit input: Cascade carry
+					.MULTSIGNIN('x),	// 1-bit input: Multiplier sign cascade
+					.PCIN('x),			// 48-bit input: P cascade
+
+					// Control inputs: Control Inputs/Status Bits
+					.CLK(clk),					// 1-bit input: Clock
+					.ALUMODE(4'h0),				// 4-bit input: ALU control
+					.CARRYINSEL('0),			// 3-bit input: Carry select
+					.INMODE(5'b01100),			// 5-bit input: INMODE control
+					.OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }),	// 9-bit input: Operation mode
+
+					// Data inputs: Data Ports
+					.A(aa),						// 30-bit input: A data
+					.B(bb),						// 18-bit input: B data
+					.C('x),						// 48-bit input: C data
+					.CARRYIN('0),				// 1-bit input: Carry-in
+					.D(dd),						// 27-bit input: D data
+
+					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+					.CEA1('0),			// 1-bit input: Clock enable for 1st stage AREG
+					.CEA2('0),			// 1-bit input: Clock enable for 2nd stage AREG
+					.CEAD(en),			// 1-bit input: Clock enable for ADREG
+					.CEALUMODE('0),		// 1-bit input: Clock enable for ALUMODE
+					.CEB1('0),			// 1-bit input: Clock enable for 1st stage BREG
+					.CEB2(en),			// 1-bit input: Clock enable for 2nd stage BREG
+					.CEC('0),			// 1-bit input: Clock enable for CREG
+					.CECARRYIN('0),		// 1-bit input: Clock enable for CARRYINREG
+					.CECTRL(en),		// 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+					.CED('0),			// 1-bit input: Clock enable for DREG
+					.CEINMODE('0),		// 1-bit input: Clock enable for INMODEREG
+					.CEM(en),			// 1-bit input: Clock enable for MREG
+					.CEP(en),			// 1-bit input: Clock enable for PREG
+					.RSTA('0),			// 1-bit input: Reset for AREG
+					.RSTB(				// 1-bit input: Reset for BREG
+// synthesis translate_off
+						rst ||
+// synthesis translate_on
+						zero
+					),
+					.RSTC('0),			// 1-bit input: Reset for CREG
+					.RSTD(				// 1-bit input: Reset for DREG and ADREG
+// synthesis translate_off
+						zero ||
+// synthesis translate_on
+						rst
+					),
+					.RSTALLCARRYIN('0),	// 1-bit input: Reset for CARRYINREG
+					.RSTALUMODE('0),	// 1-bit input: Reset for ALUMODEREG
+					.RSTCTRL('0),		// 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+					.RSTINMODE('0),		// 1-bit input: Reset for INMODE register
+					.RSTM(rst),			// 1-bit input: Reset for MREG
+					.RSTP(rst)			// 1-bit input: Reset for PREG
+				);
+			end : genDSP
+
+			// External Canary Pipeline: accumulates the two LSBs of the lane 1-3 products in step with the pp pipeline
+			logic [1:0]  X1[3:1] = '{ default: 0 };
+			logic [1:0]  X2[3:1] = '{ default: 0 };
+			logic [1:0]  X3[3:1] = '{ default: 0 };
+			always_ff @(posedge clk) begin
+				if(rst) begin
+					X1 <= '{ default: 0 };
+					X2 <= '{ default: 0 };
+					X3 <= '{ default: 0 };
+				end
+				else if(en) begin
+					X1 <= xx;
+					X2 <= X1;
+					foreach(X3[i]) begin
+						X3[i] <= X2[i] + (L[3]? 2'h0 : pp[D[i]+:2]);
+					end
+				end
+			end
+
+			// Derive actual cross-lane overflows: difference between the two lane LSBs of pp and the canary
+			for(genvar  i = 0; i < 3; i++) begin
+				assign	h3[s][i] = pp[D[i+1]+:2] - X3[i+1];
+			end
+			assign	p3[s] = pp;
+
+		end : genSIMD
+
+		// Stage #4: Cross-SIMD Reduction
+
+		// Count leaves reachable from each node
+		localparam leave_load_t  LEAVE_LOAD = init_leave_loads();
+
+		uwire signed [ACCU_WIDTH  -1:0]  up4;	// registered cross-SIMD sum of the topmost lane
+		uwire signed [ACCU_WIDTH  -8:0]  hi4[3];	// accumulated cross-lane corrections per lane boundary
+		uwire        [$clog2(SIMD)+7:0]  lo4[3];	// registered cross-SIMD low-part sums of lanes 0-2
+		for(genvar  i = 0; i < 4; i++) begin
+			localparam int unsigned  LO_WIDTH = D[i+1] - D[i];
+			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - LO_WIDTH;
+
+			// Conclusive high part accumulation
+			if(i < 3) begin : genHi
+				// Adder Tree across all SIMD high contributions, each from [-1:1]
+				uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
+				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s][i];
+				for(genvar  n = 0; n < SIMD-1; n++) begin
+					// Sum truncated to actual maximum bit width at this node
+					uwire signed [$clog2(1+LEAVE_LOAD[n]):0]  s = tree[2*n+1] + tree[2*n+2];
+					assign  tree[n] = s;
+				end
+
+				// High Sideband Accumulation
+				logic signed [HI_WIDTH-1:0]  Hi4 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      Hi4 <= 0;
+					else if(en)  Hi4 <= (L[4]? 0 : Hi4) + tree[0];
+				end
+				assign	hi4[i] = Hi4;
+			end : genHi
+
+			// Conclusive low part accumulation
+			if(1) begin : blkLo
+				// Adder Tree across all SIMD low contributions
+				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
+				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
+				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
+				for(genvar  n = 0; n < SIMD-1; n++) begin
+					// Sum truncated to actual maximum bit width at this node
+					localparam int unsigned  NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1));
+					uwire [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
+					assign  tree[n] = s;
+				end
+
+				logic [ROOT_WIDTH-1:0]  Lo4 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      Lo4 <= 0;
+					else if(en)  Lo4 <= tree[0];
+				end
+
+				if(i == 3)  assign  up4 = Lo4;
+				else  assign  lo4[i] = Lo4;
+			end : blkLo
+
+		end
+
+		// Stage #5: Resolve lane totals: own high part (shifted by the lane width) + own low part - high part of the lane below
+		logic signed [3:0][ACCU_WIDTH-1:0]  Res5 = '{ default: 0 };
+		always_ff @(posedge clk) begin
+			if(rst)  Res5 <= '{ default: 0 };
+			else if(en) begin
+				Res5[3] <= up4 - hi4[2];
+				Res5[2] <= $signed({ hi4[2], {(D[3] - D[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1];
+				Res5[1] <= $signed({ hi4[1], {(D[2] - D[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0];
+				Res5[0] <= $signed({ hi4[0], {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] });
+			end
+		end
+
+		// Output: map the lanes of this pipe back to their global PE index
+		for(genvar  pe = PE_BEG; pe < PE_END; pe++) begin
+			assign	p[pe] = Res5[pe - PE_BEG];
+		end
+
+	end : genPipes
+
+endmodule : mvu_4sx4u
\ No newline at end of file

From 8eefb535c3da6482f95465df05b8d3e1c610be21 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:33:31 +0100
Subject: [PATCH 026/112] [rtl mvu]: specialized compute core for > 4-bit
 weights and activations for DSP48

---
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 358 +++++++++++++++++++++++++++++
 1 file changed, 358 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv

diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
new file mode 100644
index 0000000000..e06a92c8fa
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -0,0 +1,358 @@
+module mvu_8sx8u_dsp48 #(
+	int unsigned  PE,
+	int unsigned  SIMD,
+	int unsigned  ACCU_WIDTH,
+	int unsigned  ACTIVATION_WIDTH,
+	int unsigned  WEIGHT_WIDTH,
+	bit FORCE_BEHAVIORAL = 0,
+
+	localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH
+)(
+	// Global Control
+	input	logic  clk,
+	input	logic  rst,
+	input	logic  en,
+
+	// Input
+	input	logic  last,
+	input	logic  zero,	// ignore current inputs and force this partial product to zero
+	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]  w,	// signed weights
+	input	logic                [SIMD-1:0][ACTIVATION_WIDTH-1:0]  a,	// unsigned activations
+
+	// Output
+	output	logic  vld,
+	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
+);
+
+	typedef int unsigned  leave_load_t[2*SIMD-1];
+	function leave_load_t init_leave_loads();
+		automatic leave_load_t  res;
+		for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;
+		for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];
+		return  res;
+	endfunction : init_leave_loads
+
+	// Pipeline for last indicator flag
+	logic [1:5] L = '0;
+	always_ff @(posedge clk) begin
+		if(rst)      L <= '0;
+		else if(en)  L <= { last, L[1:4] };
+	end
+	assign	vld = L[5];
+
+	// Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
+    localparam int unsigned  D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets
+
+	localparam int unsigned  PIPE_COUNT = (PE+1)/2;
+	for(genvar  c = 0; c < PIPE_COUNT; c++) begin : genPipes
+
+		localparam int unsigned  PE_BEG = 2*c;
+		localparam int unsigned  PE_END = PE < 2*(c+1)? PE : 2*(c+1);
+
+		uwire        [57:0]  p3[SIMD];
+		uwire signed [ 1:0]  h3[SIMD];
+		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
+
+			// Input Lane Assembly
+			uwire [23:0]  bb = a[s];
+			logic [33:0]  aa;
+			logic [26:0]  dd;
+			logic [ 1:0]  xx;
+			if(1) begin : blkVectorize
+				uwire [WEIGHT_WIDTH-1:0]  ww[PE_END - PE_BEG];
+				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
+					assign	ww[pe] = w[PE_BEG + pe][s];
+					if(pe) begin
+//						assign  xx[pe] = zero? 0 : ww[pe] * a[s];
+						LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
+							.O6(xx[1]),
+							.O5(xx[0]),
+							.I5(1'b1),
+							.I4(zero),
+							.I3(ww[pe][1]),
+							.I2(a[s][1]),
+							.I1(ww[pe][0]),
+							.I0(a[s][0])
+						);
+					end
+				end
+				always_comb begin
+					dd = '0;
+					aa = '0;
+					for(int unsigned  pe = 0; pe < PE_END - PE_BEG; pe++) begin
+						dd[D[pe] +: WEIGHT_WIDTH-1] = ww[pe];
+						aa[D[pe] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];
+					end
+				end
+			end : blkVectorize
+
+			uwire [57:0]  pp;
+
+			// Note: Since the product B * AD is computed,
+			//       rst can be only applied to AD and zero only to B
+			//       with the same effect as zeroing both.
+			if (FORCE_BEHAVIORAL) begin : genBehav
+				// Stage #1: Input Refine
+				logic signed [23:0]  B1  = 0;
+				always_ff @(posedge clk) begin
+					if(zero)     B1  <= 0;
+					else if(en)  B1  <= bb;
+				end
+
+				logic signed [26:0]  AD1 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      AD1 <= 0;
+					else if(en)  AD1 <= dd - aa;
+				end
+
+				// Stage #2: Multiply
+				logic signed [50:0]  M2 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      M2 <= 0;
+					else if(en)  M2 <=
+// synthesis translate off
+						(B1 === '0) || (AD1 === '0)? 0 :
+// synthesis translate on
+						B1 * AD1;
+				end
+
+				// Stage #3: Accumulate
+				logic signed [57:0]  P3 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      P3 <= 0;
+					else if(en)  P3 <= M2 + (L[3]? 0 : P3);
+				end
+
+				assign	pp = P3;
+			end : genBehav
+			else begin : genDSP
+				DSP48E2 #(
+					// Feature Control Attributes: Data Path Selection
+					.AMULTSEL("AD"),	// Selects A input to multiplier (A, AD)
+					.A_INPUT("DIRECT"),	// Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+					.BMULTSEL("B"),		// Selects B input to multiplier (AD, B)
+					.B_INPUT("DIRECT"),	// Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+					.PREADDINSEL("A"),                 // Selects input to pre-adder (A, B)
+					.RND('0),                          // Rounding Constant
+					.USE_MULT("MULTIPLY"),             // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+					.USE_SIMD("ONE48"),                // SIMD selection (FOUR12, ONE58, TWO24)
+					.USE_WIDEXOR("FALSE"),             // Use the Wide XOR function (FALSE, TRUE)
+					.XORSIMD("XOR24_48_96"),       // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
+
+					// Pattern Detector Attributes: Pattern Detection Configuration
+					.AUTORESET_PATDET("NO_RESET"),     // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+					.AUTORESET_PRIORITY("RESET"),      // Priority of AUTORESET vs. CEP (CEP, RESET).
+					.MASK('1),                         // 58-bit mask value for pattern detect (1=ignore)
+					.PATTERN('0),                      // 58-bit pattern match for pattern detect
+					.SEL_MASK("MASK"),                 // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+					.SEL_PATTERN("PATTERN"),           // Select pattern value (C, PATTERN)
+					.USE_PATTERN_DETECT("NO_PATDET"),  // Enable pattern detect (NO_PATDET, PATDET)
+
+					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+					.IS_ALUMODE_INVERTED('0),				// Optional inversion for ALUMODE
+					.IS_CARRYIN_INVERTED('0),				// Optional inversion for CARRYIN
+					.IS_CLK_INVERTED('0),					// Optional inversion for CLK
+					.IS_INMODE_INVERTED('0),				// Optional inversion for INMODE
+					.IS_OPMODE_INVERTED(9'b00_010_01_01),	// Optional inversion for OPMODE
+					.IS_RSTALLCARRYIN_INVERTED('0),			// Optional inversion for RSTALLCARRYIN
+					.IS_RSTALUMODE_INVERTED('0),			// Optional inversion for RSTALUMODE
+					.IS_RSTA_INVERTED('0),					// Optional inversion for RSTA
+					.IS_RSTB_INVERTED('0),					// Optional inversion for RSTB
+					.IS_RSTCTRL_INVERTED('0),				// Optional inversion for RSTCTRL
+					.IS_RSTC_INVERTED('0),					// Optional inversion for RSTC
+					.IS_RSTD_INVERTED('0),					// Optional inversion for RSTD
+					.IS_RSTINMODE_INVERTED('0),				// Optional inversion for RSTINMODE
+					.IS_RSTM_INVERTED('0),					// Optional inversion for RSTM
+					.IS_RSTP_INVERTED('0),					// Optional inversion for RSTP
+
+					// Register Control Attributes: Pipeline Register Configuration
+					.ACASCREG(0),                      // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+					.ADREG(1),                         // Pipeline stages for pre-adder (0-1)
+					.ALUMODEREG(0),                    // Pipeline stages for ALUMODE (0-1)
+					.AREG(0),                          // Pipeline stages for A (0-2)
+					.BCASCREG(1),                      // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+					.BREG(1),                          // Pipeline stages for B (0-2)
+					.CARRYINREG(0),                    // Pipeline stages for CARRYIN (0-1)
+					.CARRYINSELREG(0),                 // Pipeline stages for CARRYINSEL (0-1)
+					.CREG(0),                          // Pipeline stages for C (0-1)
+					.DREG(0),                          // Pipeline stages for D (0-1)
+					.INMODEREG(0),                     // Pipeline stages for INMODE (0-1)
+					.MREG(1),                          // Multiplier pipeline stages (0-1)
+					.OPMODEREG(1),                     // Pipeline stages for OPMODE (0-1)
+					.PREG(1)                          // Number of pipeline stages for P (0-1)
+				) dsp (
+					// Cascade outputs: Cascade Ports
+					.ACOUT(),			// 34-bit output: A port cascade
+					.BCOUT(),			// 24-bit output: B cascade
+					.CARRYCASCOUT(),	// 1-bit output: Cascade carry
+					.MULTSIGNOUT(),		// 1-bit output: Multiplier sign cascade
+					.PCOUT(),			// 58-bit output: Cascade output
+
+					// Control outputs: Control Inputs/Status Bits
+					.OVERFLOW(),		// 1-bit output: Overflow in add/acc
+					.PATTERNBDETECT(),	// 1-bit output: Pattern bar detect
+					.PATTERNDETECT(),	// 1-bit output: Pattern detect
+					.UNDERFLOW(),		// 1-bit output: Underflow in add/acc
+
+					// Data outputs: Data Ports
+					.CARRYOUT(),		// 4-bit output: Carry
+					.P(pp),				// 58-bit output: Primary data
+					.XOROUT(),			// 8-bit output: XOR data
+
+					// Cascade inputs: Cascade Ports
+					.ACIN('x),			// 34-bit input: A cascade data
+					.BCIN('x),			// 24-bit input: B cascade
+					.CARRYCASCIN('x),	// 1-bit input: Cascade carry
+					.MULTSIGNIN('x),	// 1-bit input: Multiplier sign cascade
+					.PCIN('x),			// 58-bit input: P cascade
+
+					// Control inputs: Control Inputs/Status Bits
+					.CLK(clk),					// 1-bit input: Clock
+					.ALUMODE(4'h0),				// 4-bit input: ALU control
+					.CARRYINSEL('0),			// 3-bit input: Carry select
+					.INMODE(5'b01100),			// 5-bit input: INMODE control
+					.OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }),	// 9-bit input: Operation mode
+
+					// Data inputs: Data Ports
+					.A(aa),						// 34-bit input: A data
+					.B(bb),						// 24-bit input: B data
+					.C('x),						// 58-bit input: C data
+					.CARRYIN('0),				// 1-bit input: Carry-in
+					.D(dd),						// 27-bit input: D data
+
+					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+					.CEA1('0),			// 1-bit input: Clock enable for 1st stage AREG
+					.CEA2('0),			// 1-bit input: Clock enable for 2nd stage AREG
+					.CEAD(en),			// 1-bit input: Clock enable for ADREG
+					.CEALUMODE('0),		// 1-bit input: Clock enable for ALUMODE
+					.CEB1('0),			// 1-bit input: Clock enable for 1st stage BREG
+					.CEB2(en),			// 1-bit input: Clock enable for 2nd stage BREG
+					.CEC('0),			// 1-bit input: Clock enable for CREG
+					.CECARRYIN('0),		// 1-bit input: Clock enable for CARRYINREG
+					.CECTRL(en),		// 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+					.CED('0),			// 1-bit input: Clock enable for DREG
+					.CEINMODE('0),		// 1-bit input: Clock enable for INMODEREG
+					.CEM(en),			// 1-bit input: Clock enable for MREG
+					.CEP(en),			// 1-bit input: Clock enable for PREG
+					.RSTA('0),			// 1-bit input: Reset for AREG
+					.RSTB(				// 1-bit input: Reset for BREG
+// synthesis translate_off
+						rst ||
+// synthesis translate_on
+						zero
+					),
+					.RSTC('0),			// 1-bit input: Reset for CREG
+					.RSTD(				// 1-bit input: Reset for DREG and ADREG
+// synthesis translate_off
+						zero ||
+// synthesis translate_on
+						rst
+					),
+					.RSTALLCARRYIN('0),	// 1-bit input: Reset for CARRYINREG
+					.RSTALUMODE('0),	// 1-bit input: Reset for ALUMODEREG
+					.RSTCTRL('0),		// 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+					.RSTINMODE('0),		// 1-bit input: Reset for INMODE register
+					.RSTM(rst),			// 1-bit input: Reset for MREG
+					.RSTP(rst)			// 1-bit input: Reset for PREG
+				);
+			end : genDSP
+
+			// External Canary Pipeline
+			logic [1:0]  X1 = '{ default: 0 };
+			logic [1:0]  X2 = '{ default: 0 };
+			logic [1:0]  X3 = '{ default: 0 };
+			always_ff @(posedge clk) begin
+				if(rst) begin
+					X1 <= '{ default: 0 };
+					X2 <= '{ default: 0 };
+					X3 <= '{ default: 0 };
+				end
+				else if(en) begin
+					X1 <= xx;
+					X2 <= X1;
+					X3 <= X2 + (L[3]? 2'h0 : pp[D[1]+:2]);
+				end
+			end
+
+			// Derive actual cross-lane overflows
+			assign  h3[s] = pp[D[1]+:2] - X3;
+
+			assign	p3[s] = pp;
+
+		end : genSIMD
+
+		// Stage #4: Cross-SIMD Reduction
+
+		// Count leaves reachable from each node
+		localparam leave_load_t  LEAVE_LOAD = init_leave_loads();
+
+		uwire signed [ACCU_WIDTH  -1:0]  up4;
+		uwire signed [ACCU_WIDTH  -SINGLE_PROD_WIDTH:0]  hi4;
+		uwire        [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0]  lo4;
+		for(genvar  i = 0; i < 2; i++) begin
+			localparam int unsigned  LO_WIDTH = D[i+1] - D[i];
+			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - LO_WIDTH;
+
+			// Conclusive high part accumulation
+			if(i == 0) begin : genHi
+				// Adder Tree across all SIMD high contributions, each from [-1:1]
+				uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
+				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s];
+				for(genvar  n = 0; n < SIMD-1; n++) begin
+					// Sum truncated to actual maximum bit width at this node
+					uwire signed [$clog2(1+LEAVE_LOAD[n]):0]  s = tree[2*n+1] + tree[2*n+2];
+					assign  tree[n] = s;
+				end
+
+				// High Sideband Accumulation
+				logic signed [HI_WIDTH-1:0]  Hi4 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      Hi4 <= 0;
+					else if(en)  Hi4 <= (L[4]? 0 : Hi4) + tree[0];
+				end
+				assign	hi4 = Hi4;
+			end : genHi
+
+			// Conclusive low part accumulation
+			if(1) begin : blkLo
+				// Adder Tree across all SIMD low contributions
+				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
+				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
+				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
+				for(genvar  n = 0; n < SIMD-1; n++) begin
+					// Sum truncated to actual maximum bit width at this node
+					localparam int unsigned  NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1));
+					uwire [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
+					assign  tree[n] = s;
+				end
+
+				logic [ROOT_WIDTH-1:0]  Lo4 = 0;
+				always_ff @(posedge clk) begin
+					if(rst)      Lo4 <= 0;
+					else if(en)  Lo4 <= tree[0];
+				end
+
+				if(i == 1)  assign  up4 = Lo4;
+				else  assign  lo4 = Lo4;
+			end : blkLo
+
+		end
+
+		// Stage #5: Resolve lane totals
+		logic signed [1:0][ACCU_WIDTH-1:0]  Res5 = '{ default: 0 };
+		always_ff @(posedge clk) begin
+			if(rst)  Res5 <= '{ default: 0 };
+			else if(en) begin
+				Res5[1] <= up4 - hi4;
+				Res5[0] <= $signed({ hi4, {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4 });
+			end
+		end
+
+		// Output
+		for(genvar  pe = PE_BEG; pe < PE_END; pe++) begin
+			assign	p[pe] = Res5[pe - PE_BEG];
+		end
+
+	end : genPipes
+
+endmodule : mvu_8sx8u_dsp48
\ No newline at end of file

From e7109e75161774280b24e5884f6c9b9c17a07f7b Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 May 2023 10:34:23 +0100
Subject: [PATCH 027/112] [fpgadataflow transform]: initial
 specialize_to_rtl_layers-transform for MVU

---
 .../fpgadataflow/specialize_to_rtl_layers.py  | 105 ++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py

diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
new file mode 100644
index 0000000000..7d677ec216
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2023, AMD
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from qonnx.transformation.base import Transformation
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.core.datatype import DataType
+from onnx import helper
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth
+
+class InferRTLMatrixVectorActivation(Transformation):
+    """Replace HLS-based MatrixVectorActivation nodes by RTL ones if supported.
+
+    Only nodes that request it (impl == "rtl" and resType == "dsp") and that
+    pass _is_rtl_variant_compatible are replaced; all others are left untouched.
+    """
+
+    def _is_rtl_variant_compatible(self, n):
+        """Return True if node n fits the constraints of the RTL-based MVAU.
+
+        Requires noActivation == 1, activations of at most 8 bits (9 bits if
+        signed), weights of at most 8 bits, MH % PE == 0 and MW % SIMD == 0.
+        """
+        # wrap the node once instead of once per attribute lookup
+        inst = getCustomOp(n)
+        no_act = inst.get_nodeattr("noActivation") == 1
+        idt = DataType[inst.get_nodeattr("inputDataType")]
+        act_bits = idt.bitwidth()
+        act_ok = act_bits <= 8 or (act_bits == 9 and idt.min() < 0)
+        wgt_ok = DataType[inst.get_nodeattr("weightDataType")].bitwidth() <= 8
+        folding_ok = (
+            inst.get_nodeattr("MH") % inst.get_nodeattr("PE") == 0
+            and inst.get_nodeattr("MW") % inst.get_nodeattr("SIMD") == 0
+        )
+        return bool(no_act and act_ok and wgt_ok and folding_ok)
+
+    def apply(self, model):
+        """Convert every eligible node; returns (model, graph_modified)."""
+        graph = model.graph
+        # running index of the slot right behind the node currently visited
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type != "MatrixVectorActivation":
+                continue
+            inst = getCustomOp(n)
+            preferred_in_rtl = (
+                inst.get_nodeattr("impl") == "rtl"
+                and inst.get_nodeattr("resType") == "dsp"
+            )
+            supported_in_rtl = self._is_rtl_variant_compatible(n)
+            if not (preferred_in_rtl and supported_in_rtl):
+                continue
+            # node attributes that are carried over unchanged to the RTL node
+            attr_names = ["MW", "MH", "SIMD", "PE", "numInputVectors", "mem_mode"]
+            attr_names += ["inputDataType", "weightDataType", "outputDataType"]
+            attrs = {key: inst.get_nodeattr(key) for key in attr_names}
+            new_node = helper.make_node(
+                "MatrixVectorActivation_rtl",
+                [n.input[0], n.input[1]],
+                [n.output[0]],
+                domain="finn.custom_op.fpgadataflow",
+                backend="fpgadataflow",
+                name=n.name + "_rtl",
+                **attrs,
+            )
+            # put the replacement right behind the old node, then drop the latter
+            graph.node.insert(node_ind, new_node)
+            graph.node.remove(n)
+            graph_modified = True
+
+        if graph_modified:
+            # re-run accumulator minimization plus shape/datatype inference
+            model = model.transform(MinimizeAccumulatorWidth())
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+
+        return (model, graph_modified)
\ No newline at end of file

From 5a868d19e5955abdb894bf1e8b93d2d1f6f8410d Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu <maltanar@gmail.com>
Date: Tue, 9 May 2023 09:41:15 +0200
Subject: [PATCH 028/112] [rtl mvu] fixes for latest memstream + linting

---
 .../matrixvectoractivation_rtl.py             | 136 ++++++++++--------
 1 file changed, 77 insertions(+), 59 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index 6b1c2f3be7..8fd261d395 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -27,7 +27,6 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import math
-from shutil import copy
 import numpy as np
 import os
 import textwrap
@@ -40,20 +39,18 @@
 )
 
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
-    numpy_to_hls_code,
     pack_innermost_dim_as_hex_string,
     rtlsim_output_to_npy,
 )
-from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
 
 try:
     from pyverilator import PyVerilator
 except ModuleNotFoundError:
     PyVerilator = None
 
-from . import templates
 
 # ONNX i/o tensor shape assumptions for MatrixVectorActivation:
 # input 0 is the input tensor, shape (.., i_size) = (..., MW)
@@ -69,7 +66,6 @@ class MatrixVectorActivation_rtl(HLSCustomOp):
 
     def __init__(self, onnx_node, **kwargs):
         super().__init__(onnx_node, **kwargs)
-        self.decoupled_wrapper = templates.decoupled_wrapper
 
     def get_nodeattr_types(self):
         my_attrs = {
@@ -186,17 +182,24 @@ def verify_node(self):
             )
 
         num_of_inputs = len(self.onnx_node.input)
-        if num_of_inputs!=2:
-            info_messages.append("RTL-based MatrixVectorActivation expects two inputs (weights and activation), but got {} inputs.".format(len(self.onnx_node.input)))
+        if num_of_inputs != 2:
+            info_messages.append(
+                "RTL-based MatrixVectorActivation expects two inputs "
+                "(weights and activation), but got {} inputs.".format(
+                    len(self.onnx_node.input)
+                )
+            )
 
         mem_mode = self.get_nodeattr("mem_mode")
 
         if mem_mode != "decoupled":
-            info_messages.append("RTL-based MVAU supports only decoupled weights currently")
+            info_messages.append(
+                "RTL-based MVAU supports only decoupled weights currently"
+            )
 
         return info_messages
 
-# TODO: Add in replay_buffer estimation
+    # TODO: Add in replay_buffer estimation
     def uram_estimation(self):
         P = self.get_nodeattr("PE")
         Q = self.get_nodeattr("SIMD")
@@ -218,7 +221,7 @@ def uram_estimation(self):
         depth_multiplier = math.ceil(omega / 4096)
         return width_multiplier * depth_multiplier
 
-# TODO: Add in replay_buffer estimation
+    # TODO: Add in replay_buffer estimation
     def bram_estimation(self):
         """Calculates resource estimation for BRAM based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -259,7 +262,7 @@ def bram_estimation(self):
         else:
             return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36))
 
-# TODO: Add in replay_buffer estimation
+    # TODO: Add in replay_buffer estimation
     def bram_efficiency_estimation(self):
         wdt = self.get_weight_datatype()
         W = wdt.bitwidth()
@@ -272,7 +275,7 @@ def bram_efficiency_estimation(self):
         bram16_est_capacity = bram16_est * 36 * 512
         return wbits / bram16_est_capacity
 
-# TODO: Add in replay_buffer estimation
+    # TODO: Add in replay_buffer estimation
     def uram_efficiency_estimation(self):
         """Function for URAM efficiency estimation: actual parameter storage
         needed divided by the allocated URAM storage (from estimation)"""
@@ -287,7 +290,7 @@ def uram_efficiency_estimation(self):
         uram_est_capacity = uram_est * 72 * 4096
         return wbits / uram_est_capacity
 
-#TODO: FIX: worst case estimates since segmentlen is not known at this point?
+    # TODO: FIX: worst case estimates since segmentlen is not known at this point?
     def lut_estimation(self):
         """Calculates resource estimations for LUTs based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -328,13 +331,9 @@ def lut_estimation(self):
         acc_bits = W + A + np.ceil(math.log(MW, 2))
         acc_luts = acc_bits
 
-        return int(
-            c0
-            + c1 * (P * (mult_luts + addertree_luts + acc_luts))
-            + c2
-        )
+        return int(c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2)
 
-#TODO: FIX: worst case estimates since segmentlen is not known at this point?
+    # TODO: FIX: worst case estimates since segmentlen is not known at this point?
     def dsp_estimation(self):
         # multiplication
         P = self.get_nodeattr("PE")
@@ -350,7 +349,7 @@ def dsp_estimation(self):
             mult_dsp = 0
         return int(mult_dsp)
 
-#TODO: FIX: worst case estimates since segmentlen is not known at this point
+    # TODO: FIX: worst case estimates since segmentlen is not known at this point
     def get_exp_cycles(self):
         pe = self.get_nodeattr("PE")
         simd = self.get_nodeattr("SIMD")
@@ -359,7 +358,9 @@ def get_exp_cycles(self):
         mw = self.get_nodeattr("MW")
         # since mmv != 1 is not supported yet, we set mmv for now to 1
         mmv = 1
-        # Actual exp_cycles is probably slightly larger (say 3 cycles (DSP A/B, M, P - reg) + additional pipeline buffer cycles. Most probably <10)
+        # Actual exp_cycles is probably slightly larger (say 3 cycles
+        # (DSP A/B, M, P - reg) + additional pipeline buffer cycles.
+        # Most probably <10)
         exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv
         return int(exp_cycles)
 
@@ -384,7 +385,9 @@ def get_output_datatype(self, ind=0):
 
     def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
-        assert (i_bits<=9), "RTL-based MVAU only supports activations with bit-width up to 9-bits"
+        assert (
+            i_bits <= 9
+        ), "RTL-based MVAU only supports activations with bit-width up to 9-bits"
         in_width = i_bits * self.get_nodeattr("SIMD")
         return in_width
 
@@ -402,7 +405,9 @@ def get_weightstream_width(self):
             pe = self.get_nodeattr("PE")
             simd = self.get_nodeattr("SIMD")
             wp = self.get_weight_datatype().bitwidth()
-            assert (wp <= 8), "RTL-based MVAU only supports weights with bit-width up to 8-bits"
+            assert (
+                wp <= 8
+            ), "RTL-based MVAU only supports weights with bit-width up to 8-bits"
             w_width = pe * simd * wp
             return w_width
         else:
@@ -516,7 +521,8 @@ def minimize_accumulator_width(self, model):
         else:
             adt = DataType.get_smallest_possible(acc_max)
         # Note: we are interested in simply the width of the output dot product.
-        # Padding the actual output stream to a multiple of 8-bits is done in the RTL component
+        # Padding the actual output stream to a multiple of 8-bits is done in
+        # the RTL component
         self.set_nodeattr("accDataType", adt.name)
         # for no-activation nodes, output dt = acc dt
         self.set_nodeattr("outputDataType", adt.name)
@@ -615,9 +621,7 @@ def generate_params(self, model, path):
             # and one for synthesis. this is because URAM-based weights always
             # need zero weights for synthesis, otherwise they get inferred
             # as BRAM
-            weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(
-                code_gen_dir
-            )
+            weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir)
             weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir)
             # sim weights are always the true weights
             self.make_weight_file(
@@ -734,11 +738,11 @@ def code_generation_ipgen(self, model, fpgapart, clk):
 
     def ipgen_singlenode_code(self):
         """Normally: Builds the bash script for IP generation."""
-        pass   
+        pass
 
     def code_generation_cppsim(self, model):
         """Normally: Generates C++ code for simulation (cppsim)."""
-        pass     
+        pass
 
     def compile_singlenode_code(self):
         pass
@@ -803,19 +807,28 @@ def code_generation_ipi(self):
             code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
             rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
             sourcefiles = [
-                os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"),
+                os.path.join(
+                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+                ),
                 rtllib_dir + "mvu_axi.sv",
                 rtllib_dir + "replay_buffer.sv",
                 rtllib_dir + "mvu_4sx4u.sv",
                 rtllib_dir + "mvu_8sx9.sv",
-                rtllib_dir + "mvu_8sx8u_dsp48.sv"
+                rtllib_dir + "mvu_8sx8u_dsp48.sv",
             ]
             for f in sourcefiles:
                 cmd.append("add_files -norecurse %s" % (f))
-            cmd.append("create_bd_cell -type hier -reference %s /%s/%s" % (self.get_nodeattr("gen_top_module"), self.onnx_node.name, self.onnx_node.name))
+            cmd.append(
+                "create_bd_cell -type hier -reference %s /%s/%s"
+                % (
+                    self.get_nodeattr("gen_top_module"),
+                    self.onnx_node.name,
+                    self.onnx_node.name,
+                )
+            )
 
             # instantiate a streamer and connect it to the HLS IP
-            strm_vlnv = "xilinx.com:user:memstream:1.0"
+            strm_vlnv = "amd.com:FINN:memstream:1.0"
             strm_inst = node_name + "_wstrm"
             cmd.append(
                 "create_bd_cell -type ip -vlnv %s /%s/%s"
@@ -849,11 +862,11 @@ def code_generation_ipi(self):
                 % (node_name, strm_inst, node_name, node_name, sname)
             )
             cmd.append(
-                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]"
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]"
                 % (node_name, rst_name, node_name, strm_inst)
             )
             cmd.append(
-                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]"
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]"
                 % (node_name, clk_name, node_name, strm_inst)
             )
             cmd.append(
@@ -947,21 +960,25 @@ def derive_characteristic_fxns(self, period):
             ]
         super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
 
-# TODO: characterize max_clk and implement this function in look-up style
+    # TODO: characterize max_clk and implement this function in look-up style
     def _resolve_segment_len(self, clk):
         # Insert pipeline registers in the DSP chain to meet target clock frequency
         segmentlen = 0
         return segmentlen
 
     def _resolve_impl_style(self, fpgapart):
-        # Based on target device and activation/weight-width, choose the supported RTL module
+        # Based on target device and activation/weight-width, choose the
+        # supported RTL module
         act_width = self.get_input_datatype(0).bitwidth()
         weight_width = self.get_input_datatype(1).bitwidth()
-        is_versal = fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpgapart[0:5] == "xqrvc"
-        if (act_width == 4 and weight_width == 4):
+        is_versal = (
+            fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
+            or fpgapart[0:5] == "xqrvc"
+        )
+        if act_width == 4 and weight_width == 4:
             return "mvu_4sx4u"
         else:
-            if (is_versal):
+            if is_versal:
                 return "mvu_8sx9_dsp58"
             else:
                 return "mvu_8sx8u_dsp48"
@@ -973,13 +990,17 @@ def generate_hdl(self, model, fpgapart, clk):
 
         template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk)
         # add general parameters to dictionary
-        code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()]
+        code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [
+            self.get_verilog_top_module_name()
+        ]
         # save top module name so we can refer to it after this node has been renamed
         # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
         self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
 
         ram_style = self.get_nodeattr("ram_style")
-        assert (ram_style=="auto"), "Unrecognized ram_style for MatrixVectorActivation_rtl"
+        assert (
+            ram_style == "auto"
+        ), "Unrecognized ram_style for MatrixVectorActivation_rtl"
 
         # apply code generation to template
         with open(template_path, "r") as f:
@@ -1009,19 +1030,21 @@ def generate_hdl(self, model, fpgapart, clk):
         self.set_nodeattr("ip_path", code_gen_dir)
 
     def prepare_codegen_default(self, fpgapart, clk):
-        template_path = (
-            os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v"
-        )
-        
+        template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v"
+
         code_gen_dict = {}
         code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))]
         code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))]
         code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))]
         code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))]
-        code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())]
+        code_gen_dict["$ACTIVATION_WIDTH$"] = [
+            str(self.get_input_datatype(0).bitwidth())
+        ]
         code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())]
         code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())]
-        code_gen_dict["$SIGNED_ACTIVATIONS$"] = [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
+        code_gen_dict["$SIGNED_ACTIVATIONS$"] = (
+            [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
+        )
         code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))]
         code_gen_dict["$MVU_IMPL_STYLE$"] = [self._resolve_impl_style(fpgapart)]
 
@@ -1035,15 +1058,10 @@ def prepare_rtlsim(self):
         if PyVerilator is None:
             raise ImportError("Installation of PyVerilator is required.")
 
-        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")        
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
         # Path to (System-)Verilog files used by top-module & path to top-module
-        verilog_paths = [
-            code_gen_dir,
-            os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"
-        ]
-        verilog_files = [
-            self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"
-        ]
+        verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"]
+        verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"]
 
         # build the Verilator emu library
         sim = PyVerilator.build(
@@ -1051,9 +1069,9 @@ def prepare_rtlsim(self):
             build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
             verilog_path=verilog_paths,
             trace_depth=get_rtlsim_trace_depth(),
-            top_module_name=self.get_verilog_top_module_name()
+            top_module_name=self.get_verilog_top_module_name(),
         )
         # save generated lib filename in attribute
         self.set_nodeattr("rtlsim_so", sim.lib._name)
-        
-        return sim
\ No newline at end of file
+
+        return sim

From 4a9cfa1c7a17497578faad3f76c25b80c116ba58 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 11 May 2023 10:56:07 +0100
Subject: [PATCH 029/112] [rtl custom_op]: add support for external weights

---
 .../matrixvectoractivation_rtl.py             | 67 ++++++++++---------
 1 file changed, 37 insertions(+), 30 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index 8fd261d395..162b5e2e16 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -192,9 +192,9 @@ def verify_node(self):
 
         mem_mode = self.get_nodeattr("mem_mode")
 
-        if mem_mode != "decoupled":
+        if mem_mode not in ["decoupled", "external"]:
             info_messages.append(
-                "RTL-based MVAU supports only decoupled weights currently"
+                "RTL-based MVAU supports only decoupled or external weights."
             )
 
         return info_messages
@@ -612,35 +612,20 @@ def generate_params(self, model, path):
         code_gen_dir = path
         # weights, if not external
         weights = model.get_initializer(self.onnx_node.input[1])
-        if mem_mode == "decoupled":
+        if mem_mode in ["decoupled", "external"]:
             weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
             # save decoupled weights for cppsim
             self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
-            # Also save weights as Verilog .dat file
-            # note that we provide two different .dat files, one for synth
-            # and one for synthesis. this is because URAM-based weights always
-            # need zero weights for synthesis, otherwise they get inferred
-            # as BRAM
-            weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir)
-            weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir)
-            # sim weights are always the true weights
-            self.make_weight_file(
-                weights, "decoupled_verilog_dat", weight_filename_rtl_sim
-            )
-            ram_style = self.get_nodeattr("ram_style")
-            if ram_style == "ultra":
-                # UltraRAM must have no memory initializer, or only zeroes
-                # otherwise BRAM will be inferred instead of URAM
-                # as a workaround we provide a zero-weight init here
-                synth_weights = np.zeros_like(weights, dtype=np.float32)
-            else:
-                synth_weights = weights
-            self.make_weight_file(
-                synth_weights, "decoupled_verilog_dat", weight_filename_rtl_synth
-            )
+            if mem_mode == "decoupled":
+                # also save weights as Verilog .dat file
+                # This file will be ignored when synthesizing UltraScale memory.
+                weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir)
+                self.make_weight_file(
+                    weights, "decoupled_verilog_dat", weight_filename_rtl
+                )
         else:
             raise Exception(
-                """Please set mem_mode to "decoupled",
+                """Please set mem_mode to "const", "decoupled", or "external",
                 currently no other parameter value is supported!"""
             )
 
@@ -695,7 +680,7 @@ def execute_node(self, context, graph):
             )
             super().reset_rtlsim(sim)
             super().toggle_clk(sim)
-            if mem_mode == "external" or mem_mode == "decoupled":
+            if mem_mode in ["external", "decoupled"]:
                 wnbits = self.get_weightstream_width()
                 export_wdt = self.get_weight_datatype()
                 wei = npy_to_rtlsim_input(
@@ -903,9 +888,31 @@ def code_generation_ipi(self):
                 # TODO calculate and pass in segment size here
                 cmd.append("assign_bd_address")
             cmd.append("save_bd_design")
-        elif mem_mode == "const" or mem_mode == "external":
-            # base class impl sufficient for const/external modes
-            return super().code_generation_ipi()
+        elif mem_mode == "external":
+            # instantiate the RTL block
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+            rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+            sourcefiles = [
+                os.path.join(
+                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+                ),
+                rtllib_dir + "mvu_axi.sv",
+                rtllib_dir + "replay_buffer.sv",
+                rtllib_dir + "mvu_4sx4u.sv",
+                rtllib_dir + "mvu_8sx9.sv",
+                rtllib_dir + "mvu_8sx8u_dsp48.sv",
+            ]
+            for f in sourcefiles:
+                cmd.append("add_files -norecurse %s" % (f))
+            cmd.append(
+                "create_bd_cell -type module -reference %s %s"
+                % (
+                    self.get_nodeattr("gen_top_module"),
+                    self.onnx_node.name,
+                )
+            )
+            cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/in0_V]" % (self.onnx_node.name))
+            cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/out_V]" % (self.onnx_node.name))
         else:
             raise Exception("Unrecognized mem_mode for MatrixVectorActivation")
         return cmd

From 8a9ac1af4d6c62e7c9557ab41992b84cf2c37ae1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Thu, 11 May 2023 11:04:28 +0100
Subject: [PATCH 030/112] Specify clock and reset associations of bus
 interfaces.

---
 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 4 +++-
 finn-rtllib/mvu/mvu_axi_wrapper.v      | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
index 502a72d3f2..fb3c62a15a 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
@@ -49,8 +49,10 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	OUTPUT_LANES = PE,
 	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )(
-  	// Global Control
+	// Global Control
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *)
 	input	logic  ap_clk,
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *)
 	input	logic  ap_rst_n,
 
 	// Weight Stream
diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v
index b79ba6bbd1..d8acaefcc7 100644
--- a/finn-rtllib/mvu/mvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_axi_wrapper.v
@@ -50,8 +50,10 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	OUTPUT_LANES = PE,
 	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )(
-  	// Global Control
+	// Global Control
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *)
 	input	ap_clk,
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *)
 	input	ap_rst_n,
 	// Weight Stream
 	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  weights_V_TDATA,

From d9b90793bd54a5e112531c737fa7c60a51b21d34 Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu <maltanar@gmail.com>
Date: Mon, 15 May 2023 10:16:48 +0200
Subject: [PATCH 031/112] [rtlmvu] More fixes for memstream and param gen

---
 .../fpgadataflow/matrixvectoractivation_rtl.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index 162b5e2e16..1791327e78 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -612,7 +612,11 @@ def generate_params(self, model, path):
         code_gen_dir = path
         # weights, if not external
         weights = model.get_initializer(self.onnx_node.input[1])
+<<<<<<< HEAD
         if mem_mode in ["decoupled", "external"]:
+=======
+        if mem_mode == "decoupled" or mem_mode == "external":
+>>>>>>> 72fe4c5b ([rtlmvu] More fixes for memstream and param gen)
             weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
             # save decoupled weights for cppsim
             self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
@@ -821,22 +825,16 @@ def code_generation_ipi(self):
             )
             cmd.append(
                 "set_property -dict [list "
-                "CONFIG.NSTREAMS {1} "
-                "CONFIG.MEM_DEPTH {%d} "
-                "CONFIG.MEM_WIDTH {%d} "
-                "CONFIG.MEM_INIT {%s} "
+                "CONFIG.DEPTH {%d} "
+                "CONFIG.WIDTH {%d} "
+                "CONFIG.INIT_FILE {%s} "
                 "CONFIG.RAM_STYLE {%s} "
-                "CONFIG.STRM0_DEPTH {%d} "
-                "CONFIG.STRM0_WIDTH {%d} "
-                "CONFIG.STRM0_OFFSET {0} "
                 "] [get_bd_cells /%s/%s]"
                 % (
                     self.calc_wmem(),
                     self.get_weightstream_width_padded(),
-                    self.get_nodeattr("code_gen_dir_ipgen") + "/",
+                    self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat",
                     self.get_nodeattr("ram_style"),
-                    self.calc_wmem(),
-                    self.get_weightstream_width_padded(),
                     node_name,
                     strm_inst,
                 )

From a5f2a83897e33acb4b3e2231d9bfa534e56bb6b2 Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu <maltanar@gmail.com>
Date: Thu, 11 May 2023 23:49:10 +0200
Subject: [PATCH 032/112] [Build] apply config to only FIFO nodes in
 step_set_fifo_depths

---
 src/finn/builder/build_dataflow_steps.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 65ab2b0b93..d4af757491 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -53,6 +53,7 @@
 from shutil import copy
 
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
 import finn.transformation.streamline.absorb as absorb
 from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -123,7 +124,6 @@
 )
 from finn.util.pyverilator import verilator_fifosim
 from finn.util.test import execute_parent
-import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
 
 
 def verify_step(
@@ -486,14 +486,13 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig
 
 
 def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig):
-    """Convert layers implemented in HLS to an equivalent specialized RTL implementation if possible."""
-    specialize_to_rtl_transforms = [
-        to_rtl.InferRTLMatrixVectorActivation()
-    ]
+    """Convert layers implemented in HLS to an equivalent specialized RTL
+    implementation if possible."""
+    specialize_to_rtl_transforms = [to_rtl.InferRTLMatrixVectorActivation()]
     for trn in specialize_to_rtl_transforms:
         model = model.transform(trn)
     return model
-    
+
 
 def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig):
     """Tighten the weight and accumulator bit widths for each layer."""
@@ -594,7 +593,12 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
         model = model.transform(GiveUniqueNodeNames())
         model = model.transform(GiveReadableTensorNames())
         if cfg.folding_config_file is not None:
-            model = model.transform(ApplyConfig(cfg.folding_config_file))
+            model = model.transform(
+                ApplyConfig(
+                    cfg.folding_config_file,
+                    node_filter=lambda x: x.op_type == "StreamingFIFO",
+                )
+            )
 
     # extract the final configuration and save it as json
     hw_attrs = [

From 08cbdc59a95ed6281c3234c5e8b0b9d7327a2988 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 24 May 2023 07:58:41 +0100
Subject: [PATCH 033/112] Revised control interface attributes.

---
 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 29 +++++++++++++-------------
 finn-rtllib/mvu/mvu_axi_wrapper.v      |  8 ++++---
 2 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
index fb3c62a15a..e15f77fbae 100644
--- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
@@ -50,25 +50,26 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )(
 	// Global Control
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *)
-	input	logic  ap_clk,
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *)
-	input	logic  ap_rst_n,
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output, ASSOCIATED_RESET ap_rst_n" *)
+	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+	input	ap_clk,
+	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+	input	ap_rst_n,
 
 	// Weight Stream
-	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input	logic  s_axis_weights_tvalid,
-	output	logic  s_axis_weights_tready,
+	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	s_axis_weights_tvalid,
+	output	s_axis_weights_tready,
 
 	// Input Stream
-	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input	logic  s_axis_input_tvalid,
-	output	logic  s_axis_input_tready,
+	input	[INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	s_axis_input_tvalid,
+	output	s_axis_input_tready,
 
 	// Output Stream
-	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
-	output	logic  m_axis_output_tvalid,
-	input	logic  m_axis_output_tready
+	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
+	output	m_axis_output_tvalid,
+	input	m_axis_output_tready
 );
 
 mvu_8sx9_axi #(
@@ -89,4 +90,4 @@ mvu_8sx9_axi #(
 	.m_axis_output_tready(m_axis_output_tready)
 );
 
-endmodule : $MODULE_NAME_AXI_WRAPPER$
\ No newline at end of file
+endmodule : $MODULE_NAME_AXI_WRAPPER$
diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v
index d8acaefcc7..239c5bbacd 100644
--- a/finn-rtllib/mvu/mvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_axi_wrapper.v
@@ -51,10 +51,12 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
 )(
 	// Global Control
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *)
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
+	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
 	input	ap_clk,
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *)
+	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
 	input	ap_rst_n,
+
 	// Weight Stream
 	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  weights_V_TDATA,
 	input   weights_V_TVALID,
@@ -87,4 +89,4 @@ mvu_axi #(
 	.m_axis_output_tready(out_V_TREADY)
 );
 
-endmodule : $MODULE_NAME_AXI_WRAPPER$
\ No newline at end of file
+endmodule : $MODULE_NAME_AXI_WRAPPER$

From d058cc2a5c1ed71a2c2ea12034cfa921818381ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 24 May 2023 09:16:50 +0100
Subject: [PATCH 034/112] Mask device primitives from Verilator in favor of
 using behavioral code.

---
 finn-rtllib/mvu/mvu_4sx4u.sv       | 38 ++++++++++++++++++++----------
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 38 ++++++++++++++++++++----------
 finn-rtllib/mvu/mvu_8sx9.sv        | 29 ++++++++++++++---------
 3 files changed, 68 insertions(+), 37 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 5993154355..21594e46ac 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -19,6 +19,12 @@ module mvu_4sx4u #(
 	output	logic  vld,
 	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
 );
+	// Verilator always to use behavioral code
+	localparam bit  BEHAVIORAL =
+`ifdef VERILATOR
+		1 ||
+`endif
+		FORCE_BEHAVIORAL;
 
 	typedef int unsigned  leave_load_t[2*SIMD-1];
 	function leave_load_t init_leave_loads();
@@ -59,17 +65,21 @@ module mvu_4sx4u #(
 				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
 					assign	ww[pe] = w[PE_BEG + pe][s];
 					if(pe) begin
-//						assign  xx[pe] = zero? 0 : ww[pe] * a[s];
-						LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
-							.O6(xx[pe][1]),
-							.O5(xx[pe][0]),
-							.I5(1'b1),
-							.I4(zero),
-							.I3(ww[pe][1]),
-							.I2(a[s][1]),
-							.I1(ww[pe][0]),
-							.I0(a[s][0])
-						);
+						if(BEHAVIORAL)  assign  xx[pe] = zero? 0 : ww[pe] * a[s];
+`ifndef VERILATOR
+						else begin
+							LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
+								.O6(xx[pe][1]),
+								.O5(xx[pe][0]),
+								.I5(1'b1),
+								.I4(zero),
+								.I3(ww[pe][1]),
+								.I2(a[s][1]),
+								.I1(ww[pe][0]),
+								.I0(a[s][0])
+							);
+						end
+`endif
 					end
 				end
 				always_comb begin
@@ -87,7 +97,7 @@ module mvu_4sx4u #(
 			// Note: Since the product B * AD is computed,
 			//       rst can be only applied to AD and zero only to B
 			//       with the same effect as zeroing both.
-			if (FORCE_BEHAVIORAL) begin : genBehav
+			if (BEHAVIORAL) begin : genBehav
 				// Stage #1: Input Refine
 				logic signed [23:0]  B1  = 0;
 				always_ff @(posedge clk) begin
@@ -121,6 +131,7 @@ module mvu_4sx4u #(
 
 				assign	pp = P3;
 			end : genBehav
+`ifndef VERILATOR
 			else begin : genDSP
 				DSP48E2 #(
 					// Feature Control Attributes: Data Path Selection
@@ -252,6 +263,7 @@ module mvu_4sx4u #(
 					.RSTP(rst)			// 1-bit input: Reset for PREG
 				);
 			end : genDSP
+`endif
 
 			// External Canary Pipeline
 			logic [1:0]  X1[3:1] = '{ default: 0 };
@@ -356,4 +368,4 @@ module mvu_4sx4u #(
 
 	end : genPipes
 
-endmodule : mvu_4sx4u
\ No newline at end of file
+endmodule : mvu_4sx4u
diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index e06a92c8fa..09db360b77 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -23,6 +23,12 @@ module mvu_8sx8u_dsp48 #(
 	output	logic  vld,
 	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
 );
+	// Verilator always to use behavioral code
+	localparam bit  BEHAVIORAL =
+`ifdef VERILATOR
+		1 ||
+`endif
+		FORCE_BEHAVIORAL;
 
 	typedef int unsigned  leave_load_t[2*SIMD-1];
 	function leave_load_t init_leave_loads();
@@ -63,17 +69,21 @@ module mvu_8sx8u_dsp48 #(
 				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
 					assign	ww[pe] = w[PE_BEG + pe][s];
 					if(pe) begin
-//						assign  xx[pe] = zero? 0 : ww[pe] * a[s];
-						LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
-							.O6(xx[1]),
-							.O5(xx[0]),
-							.I5(1'b1),
-							.I4(zero),
-							.I3(ww[pe][1]),
-							.I2(a[s][1]),
-							.I1(ww[pe][0]),
-							.I0(a[s][0])
-						);
+						if(BEHAVIORAL)  assign  xx[pe] = zero? 0 : ww[pe] * a[s];
+`ifndef VERILATOR
+						else begin
+							LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
+								.O6(xx[1]),
+								.O5(xx[0]),
+								.I5(1'b1),
+								.I4(zero),
+								.I3(ww[pe][1]),
+								.I2(a[s][1]),
+								.I1(ww[pe][0]),
+								.I0(a[s][0])
+							);
+						end
+`endif
 					end
 				end
 				always_comb begin
@@ -91,7 +101,7 @@ module mvu_8sx8u_dsp48 #(
 			// Note: Since the product B * AD is computed,
 			//       rst can be only applied to AD and zero only to B
 			//       with the same effect as zeroing both.
-			if (FORCE_BEHAVIORAL) begin : genBehav
+			if(BEHAVIORAL) begin : genBehav
 				// Stage #1: Input Refine
 				logic signed [23:0]  B1  = 0;
 				always_ff @(posedge clk) begin
@@ -125,6 +135,7 @@ module mvu_8sx8u_dsp48 #(
 
 				assign	pp = P3;
 			end : genBehav
+`ifndef VERILATOR
 			else begin : genDSP
 				DSP48E2 #(
 					// Feature Control Attributes: Data Path Selection
@@ -256,6 +267,7 @@ module mvu_8sx8u_dsp48 #(
 					.RSTP(rst)			// 1-bit input: Reset for PREG
 				);
 			end : genDSP
+`endif
 
 			// External Canary Pipeline
 			logic [1:0]  X1 = '{ default: 0 };
@@ -355,4 +367,4 @@ module mvu_8sx8u_dsp48 #(
 
 	end : genPipes
 
-endmodule : mvu_8sx8u_dsp48
\ No newline at end of file
+endmodule : mvu_8sx8u_dsp48
diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index 2d1da26efb..f8e2ab3985 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -52,11 +52,17 @@ module mvu_8sx9 #(
     input   logic zero, // ignore current inputs and force this partial product to zero
     input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights
 	input   logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // activations
-    
+
 	// Ouput
 	output  logic vld,
     output  logic [PE-1:0][ACCU_WIDTH-1:0] p
   );
+	// Verilator always to use behavioral code
+	localparam bit  BEHAVIORAL =
+`ifdef VERILATOR
+		1 ||
+`endif
+		FORCE_BEHAVIORAL;
 
 //-------------------- Declare global signals --------------------\\
 	localparam int unsigned CHAINLEN = (SIMD+2)/3;
@@ -75,7 +81,7 @@ module mvu_8sx9 #(
 			L[1+MAX_PIPELINE_STAGES] <= last;
 			L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES];
 		end
-	end  
+	end
 	assign vld = L[0];
 
 //-------------------- Shift register for ZERO flag --------------------\\
@@ -87,7 +93,7 @@ module mvu_8sx9 #(
 			else if(en) begin
 				Z[0] <= zero;
 				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2];
-			end    
+			end
 		end
 	end;
 
@@ -157,12 +163,12 @@ module mvu_8sx9 #(
 
 			if (LAST) begin : genPOUT
 				assign p[j] = pp[ACCU_WIDTH-1:0];
-			end      
+			end
 
 			// Note: Since the product B * AD is computed,
 			//       rst can be only applied to AD and zero only to B
 			//       with the same effect as zeroing both.
-			if (FORCE_BEHAVIORAL) begin : genBehav
+			if(BEHAVIORAL) begin : genBehav
 				// Stage #1: Input A/B
 				logic signed [33:0] Areg [INTERNAL_PREGS];
 				always_ff @(posedge clk) begin
@@ -233,7 +239,7 @@ module mvu_8sx9 #(
 				assign pp = Preg;
 				assign pcout[j][i] = pp;
 			end : genBehav
-
+`ifndef VERILATOR
 			else begin: genDSP
 				DSP58 #(
 					// Feature Control Attributes: Data Path Selection
@@ -263,8 +269,8 @@ module mvu_8sx9 #(
 					.IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
 					.IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
 					.IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
-					.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 
-										FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN 
+					.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0
+										FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN
 										2'b01, // Y : M
 										2'b01  // X: M
 					}), // Optional inversion for OPMODE
@@ -325,7 +331,7 @@ module mvu_8sx9 #(
 							INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
 							2'b00,
 							TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
-							INTERNAL_PREGS==2 ? 1'b0 : 1'b1        
+							INTERNAL_PREGS==2 ? 1'b0 : 1'b1
 					}),                                 // 5-bit input: INMODE control
 					.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
 					.OPMODE({
@@ -365,7 +371,8 @@ module mvu_8sx9 #(
 					.RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
 				);
 			end : genDSP
-		end : genDSPChain  
+`endif
+		end : genDSPChain
 	end : genDSPPE
-    
+
 endmodule : mvu_8sx9

From a66f38f2d06901fd27cf874701572268ea4793d6 Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu <maltanar@gmail.com>
Date: Thu, 11 May 2023 23:48:36 +0200
Subject: [PATCH 035/112] [Deps] update qonnx

---
 fetch-repos.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fetch-repos.sh b/fetch-repos.sh
index e039ca9144..f1cf8754f2 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -27,7 +27,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-QONNX_COMMIT="20a34289cf2297d2b2bbbe75d6ac152ece86e3b4"
+QONNX_COMMIT="bc36fd56bf1e4abfcf98cd76a001cad13d57baac"
 FINN_EXP_COMMIT="0aa7e1c44b20cf085b6fe42cff360f0a832afd2c"
 BREVITAS_COMMIT="c65f9c13dc124971f14739349531bbcda5c2a4aa"
 PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f"

From 8f9bd04b3311e56da4684a58d4de868d61f342ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 24 May 2023 12:44:53 +0100
Subject: [PATCH 036/112] Adding folding hints. Impl selection by case
 statement.

---
 finn-rtllib/mvu/mvu_axi.sv | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
index e4a919ba88..a181f54ac5 100644
--- a/finn-rtllib/mvu/mvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_axi.sv
@@ -29,6 +29,14 @@
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * @brief	Matrix Vector Unit (MVU) AXI-lite interface wrapper.
+ * @details
+ *  Folding hints:
+ *	 - 4-bit MVU:          PE scaling should aim at a full multiple of 4.
+ *	 - 8-bit MVU - DSP48:  PE scaling should aim at a full multiple of 2.
+ *	 - 8-bit MVU - DSP58:  SIMD scaling should aim at a full multiple of 3.
+ *	 - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to
+ *	   impact critical paths more than PE scaling. PE scaling implies a
+ *	   bigger fanout on the input activations.
  *****************************************************************************/
 
 module mvu_axi #(
@@ -134,8 +142,9 @@ module mvu_axi #(
 	uwire ovld;
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
 	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
-	
-	if (MVU_IMPL_STYLE == "mvu_8sx9_dsp58") begin : genMVU8sx9
+
+	case(MVU_IMPL_STYLE)
+	"mvu_8sx9_dsp58":
 		mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
@@ -143,26 +152,27 @@ module mvu_axi #(
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
 			.vld(ovld), .p(odat)
 		);
-	end
-	else if (MVU_IMPL_STYLE == "mvu_4sx4u") begin : genMVU4sx4u
+
+	"mvu_4sx4u":
 		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
 			.vld(ovld), .p(odat)
 		);
-	end
-	else if (MVU_IMPL_STYLE == "mvu_8sx8u_dsp48") begin : genMVU8sx8u
+
+	"mvu_8sx8u_dsp48":
 		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		 .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
 			.vld(ovld), .p(odat)
 		);
-	end
-	else initial begin
-		$error("Unrecognized MVU_IMPL_STYLE!");
+
+	default: initial begin
+		$error("Unrecognized MVU_IMPL_STYLE '%s'", MVU_IMPL_STYLE);
 		$finish;
 	end
+	endcase
 
 //-------------------- Output register slice --------------------\\
 	struct packed {
@@ -185,7 +195,7 @@ module mvu_axi #(
 			end
 		end
 	end
-	
+
 	struct packed {
 		logic vld;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
@@ -196,10 +206,10 @@ module mvu_axi #(
 		if(rst)		B <= '{ default: 'x };
 		else begin
 			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
-		end	
+		end
 	end
 
 	assign	m_axis_output_tvalid = B.vld;
 	assign	m_axis_output_tdata  = B.dat;
 
-endmodule : mvu_axi
\ No newline at end of file
+endmodule : mvu_axi

From 9de5ed6f7b459f37bb127f0cd105e6f927d25611 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 24 May 2023 13:52:40 +0100
Subject: [PATCH 037/112] Fixed behavioral sideband prediction.

---
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 09db360b77..bd1f813af6 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -69,7 +69,7 @@ module mvu_8sx8u_dsp48 #(
 				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
 					assign	ww[pe] = w[PE_BEG + pe][s];
 					if(pe) begin
-						if(BEHAVIORAL)  assign  xx[pe] = zero? 0 : ww[pe] * a[s];
+						if(BEHAVIORAL)  assign  xx = zero? 0 : ww[pe] * a[s];
 `ifndef VERILATOR
 						else begin
 							LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (

From 239759a6a4b8cb008aa9b80d52d15f53f77e5965 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 24 May 2023 15:49:19 +0100
Subject: [PATCH 038/112] [rtl mvu]: extension to allow selecting PE values
 that are not multiples of 4

---
 finn-rtllib/mvu/mvu_4sx4u.sv | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 21594e46ac..111d651cf5 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -50,6 +50,7 @@ module mvu_4sx4u #(
 
 		localparam int unsigned  PE_BEG = 4*c;
 		localparam int unsigned  PE_END = PE < 4*(c+1)? PE : 4*(c+1);
+		localparam int unsigned  PE_REM = 4*(c+1) - PE_END;
 
 		uwire        [57:0]  p3[SIMD];
 		uwire signed [ 1:0]  h3[SIMD][3];
@@ -65,12 +66,12 @@ module mvu_4sx4u #(
 				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
 					assign	ww[pe] = w[PE_BEG + pe][s];
 					if(pe) begin
-						if(BEHAVIORAL)  assign  xx[pe] = zero? 0 : ww[pe] * a[s];
+						if(BEHAVIORAL)  assign  xx[pe + PE_REM] = zero? 0 : ww[pe] * a[s];
 `ifndef VERILATOR
 						else begin
 							LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
-								.O6(xx[pe][1]),
-								.O5(xx[pe][0]),
+								.O6(xx[pe + PE_REM][1]),
+								.O5(xx[pe + PE_REM][0]),
 								.I5(1'b1),
 								.I4(zero),
 								.I3(ww[pe][1]),
@@ -86,8 +87,8 @@ module mvu_4sx4u #(
 					dd = '0;
 					aa = '0;
 					for(int unsigned  pe = 0; pe < PE_END - PE_BEG; pe++) begin
-						dd[D[pe]+:3] = ww[pe];
-						aa[D[pe]+ 3] = ww[pe][3];
+						dd[D[pe + PE_REM]+:3] = ww[pe];
+						aa[D[pe + PE_REM]+ 3] = ww[pe][3];
 					end
 				end
 			end : blkVectorize
@@ -305,7 +306,7 @@ module mvu_4sx4u #(
 			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - LO_WIDTH;
 
 			// Conclusive high part accumulation
-			if(i < 3) begin : genHi
+			if(i >= PE_REM && i < 3) begin : genHi
 				// Adder Tree across all SIMD high contributions, each from [-1:1]
 				uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
 				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s][i];
@@ -323,9 +324,12 @@ module mvu_4sx4u #(
 				end
 				assign	hi4[i] = Hi4;
 			end : genHi
+			else begin : genHiZero
+				assign hi4[i] = '0;
+			end : genHiZero
 
 			// Conclusive low part accumulation
-			if(1) begin : blkLo
+			if(i >= PE_REM) begin : blkLo
 				// Adder Tree across all SIMD low contributions
 				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
 				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
@@ -346,6 +350,9 @@ module mvu_4sx4u #(
 				if(i == 3)  assign  up4 = Lo4;
 				else  assign  lo4[i] = Lo4;
 			end : blkLo
+			else begin : blkLoZero
+				assign lo4[i] = '0;
+			end : blkLoZero
 
 		end
 
@@ -363,7 +370,7 @@ module mvu_4sx4u #(
 
 		// Output
 		for(genvar  pe = PE_BEG; pe < PE_END; pe++) begin
-			assign	p[pe] = Res5[pe - PE_BEG];
+			assign	p[pe] = Res5[pe - PE_BEG + PE_REM];
 		end
 
 	end : genPipes

From 8d3247ccf7657aeb534147a5dd9511fa397d4eb2 Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu <maltanar@gmail.com>
Date: Wed, 24 May 2023 15:56:07 +0200
Subject: [PATCH 039/112] [rtlmvu] Avoid unintentional verilator metacomments

---
 finn-rtllib/mvu/mvu_4sx4u.sv       | 2 +-
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +-
 finn-rtllib/mvu/mvu_8sx9.sv        | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 21594e46ac..9f101e8c29 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -19,7 +19,7 @@ module mvu_4sx4u #(
 	output	logic  vld,
 	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
 );
-	// Verilator always to use behavioral code
+	// for verilator always use behavioral code
 	localparam bit  BEHAVIORAL =
 `ifdef VERILATOR
 		1 ||
diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index bd1f813af6..6b54e91b6a 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -23,7 +23,7 @@ module mvu_8sx8u_dsp48 #(
 	output	logic  vld,
 	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
 );
-	// Verilator always to use behavioral code
+	// for verilator always use behavioral code
 	localparam bit  BEHAVIORAL =
 `ifdef VERILATOR
 		1 ||
diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index f8e2ab3985..a601066cfd 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -57,7 +57,7 @@ module mvu_8sx9 #(
 	output  logic vld,
     output  logic [PE-1:0][ACCU_WIDTH-1:0] p
   );
-	// Verilator always to use behavioral code
+	// for verilator always use behavioral code
 	localparam bit  BEHAVIORAL =
 `ifdef VERILATOR
 		1 ||

From c8663505dcd2c2eeb3ddad05d361f82be32040eb Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 24 May 2023 17:14:23 +0100
Subject: [PATCH 040/112] [rtl mvu]: extension to allow selecting PE values
 that are not multiples of 2

---
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 57 +++++++++++++++++-------------
 1 file changed, 32 insertions(+), 25 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 6b54e91b6a..5cc3fa4c49 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -54,6 +54,7 @@ module mvu_8sx8u_dsp48 #(
 
 		localparam int unsigned  PE_BEG = 2*c;
 		localparam int unsigned  PE_END = PE < 2*(c+1)? PE : 2*(c+1);
+		localparam int unsigned  PE_RES = 2*(c+1) - PE_END;
 
 		uwire        [57:0]  p3[SIMD];
 		uwire signed [ 1:0]  h3[SIMD];
@@ -90,8 +91,8 @@ module mvu_8sx8u_dsp48 #(
 					dd = '0;
 					aa = '0;
 					for(int unsigned  pe = 0; pe < PE_END - PE_BEG; pe++) begin
-						dd[D[pe] +: WEIGHT_WIDTH-1] = ww[pe];
-						aa[D[pe] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];
+						dd[D[pe + PE_RES] +: WEIGHT_WIDTH-1] = ww[pe];
+						aa[D[pe + PE_RES] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];
 					end
 				end
 			end : blkVectorize
@@ -301,32 +302,35 @@ module mvu_8sx8u_dsp48 #(
 		uwire signed [ACCU_WIDTH  -1:0]  up4;
 		uwire signed [ACCU_WIDTH  -SINGLE_PROD_WIDTH:0]  hi4;
 		uwire        [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0]  lo4;
-		for(genvar  i = 0; i < 2; i++) begin
-			localparam int unsigned  LO_WIDTH = D[i+1] - D[i];
-			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - LO_WIDTH;
 
-			// Conclusive high part accumulation
-			if(i == 0) begin : genHi
-				// Adder Tree across all SIMD high contributions, each from [-1:1]
-				uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
-				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s];
-				for(genvar  n = 0; n < SIMD-1; n++) begin
-					// Sum truncated to actual maximum bit width at this node
-					uwire signed [$clog2(1+LEAVE_LOAD[n]):0]  s = tree[2*n+1] + tree[2*n+2];
-					assign  tree[n] = s;
-				end
+		// Conclusive high part accumulation
+		if(PE_RES == 0) begin : genHi
+			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - D[1];
+			// Adder Tree across all SIMD high contributions, each from [-1:1]
+			uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
+			for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s];
+			for(genvar  n = 0; n < SIMD-1; n++) begin
+				// Sum truncated to actual maximum bit width at this node
+				uwire signed [$clog2(1+LEAVE_LOAD[n]):0]  s = tree[2*n+1] + tree[2*n+2];
+				assign  tree[n] = s;
+			end
 
-				// High Sideband Accumulation
-				logic signed [HI_WIDTH-1:0]  Hi4 = 0;
-				always_ff @(posedge clk) begin
-					if(rst)      Hi4 <= 0;
-					else if(en)  Hi4 <= (L[4]? 0 : Hi4) + tree[0];
-				end
-				assign	hi4 = Hi4;
-			end : genHi
+			// High Sideband Accumulation
+			logic signed [HI_WIDTH-1:0]  Hi4 = 0;
+			always_ff @(posedge clk) begin
+				if(rst)      Hi4 <= 0;
+				else if(en)  Hi4 <= (L[4]? 0 : Hi4) + tree[0];
+			end
+			assign	hi4 = Hi4;
+		end : genHi
+		else begin : genHiZero
+			assign hi4 = '0;
+		end : genHiZero
 
+		for(genvar  i = 0; i < 2; i++) begin
+			localparam int unsigned  LO_WIDTH = D[i+1] - D[i];
 			// Conclusive low part accumulation
-			if(1) begin : blkLo
+			if(i >= PE_RES) begin : blkLo
 				// Adder Tree across all SIMD low contributions
 				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
 				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
@@ -347,6 +351,9 @@ module mvu_8sx8u_dsp48 #(
 				if(i == 1)  assign  up4 = Lo4;
 				else  assign  lo4 = Lo4;
 			end : blkLo
+			else begin : blkLoZero
+				assign lo4 = '0;
+			end : blkLoZero
 
 		end
 
@@ -362,7 +369,7 @@ module mvu_8sx8u_dsp48 #(
 
 		// Output
 		for(genvar  pe = PE_BEG; pe < PE_END; pe++) begin
-			assign	p[pe] = Res5[pe - PE_BEG];
+			assign	p[pe] = Res5[pe - PE_BEG + PE_RES];
 		end
 
 	end : genPipes

From fd1e038c643c05199b38320f8815f430e538d936 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 24 May 2023 17:21:56 +0100
Subject: [PATCH 041/112] [rtl mvu axi]: updated comments on folding hints

---
 finn-rtllib/mvu/mvu_axi.sv | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
index a181f54ac5..cef55949ed 100644
--- a/finn-rtllib/mvu/mvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_axi.sv
@@ -31,12 +31,13 @@
  * @brief	Matrix Vector Unit (MVU) AXI-lite interface wrapper.
  * @details
  *  Folding hints:
- *	 - 4-bit MVU:          PE scaling should aim at a full multiple of 4.
- *	 - 8-bit MVU - DSP48:  PE scaling should aim at a full multiple of 2.
- *	 - 8-bit MVU - DSP58:  SIMD scaling should aim at a full multiple of 3.
+ *	 - 4-bit MVU:          PE scaling should divide MH.
+ *	 - 8-bit MVU - DSP48:  PE scaling should divide MH.
+ *	 - 8-bit MVU - DSP58:  SIMD scaling should aim at a full multiple of 3 and divide MW.
  *	 - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to
  *	   impact critical paths more than PE scaling. PE scaling implies a
  *	   bigger fanout on the input activations.
+ *	 - Full unfolding along MH (PE=MH) results in no replay buffer instantiated
  *****************************************************************************/
 
 module mvu_axi #(

From f60d4c6fa105bd29689b93aafd880ec92c32358c Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 2 Jun 2023 11:48:26 +0100
Subject: [PATCH 042/112] [rtl custom op]: minor fixes to codegen

---
 .../fpgadataflow/matrixvectoractivation_rtl.py     | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index 1791327e78..9f8130806b 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -612,11 +612,7 @@ def generate_params(self, model, path):
         code_gen_dir = path
         # weights, if not external
         weights = model.get_initializer(self.onnx_node.input[1])
-<<<<<<< HEAD
-        if mem_mode in ["decoupled", "external"]:
-=======
         if mem_mode == "decoupled" or mem_mode == "external":
->>>>>>> 72fe4c5b ([rtlmvu] More fixes for memstream and param gen)
             weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
             # save decoupled weights for cppsim
             self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
@@ -909,8 +905,6 @@ def code_generation_ipi(self):
                     self.onnx_node.name,
                 )
             )
-            cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/in0_V]" % (self.onnx_node.name))
-            cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/out_V]" % (self.onnx_node.name))
         else:
             raise Exception("Unrecognized mem_mode for MatrixVectorActivation")
         return cmd
@@ -968,8 +962,7 @@ def derive_characteristic_fxns(self, period):
     # TODO: characterize max_clk and implement this function in look-up style
     def _resolve_segment_len(self, clk):
         # Insert pipeline registers in the DSP chain to meet target clock frequency
-        segmentlen = 0
-        return segmentlen
+        return 4 # default to 4 for now
 
     def _resolve_impl_style(self, fpgapart):
         # Based on target device and activation/weight-width, choose the
@@ -1002,11 +995,6 @@ def generate_hdl(self, model, fpgapart, clk):
         # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
         self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
 
-        ram_style = self.get_nodeattr("ram_style")
-        assert (
-            ram_style == "auto"
-        ), "Unrecognized ram_style for MatrixVectorActivation_rtl"
-
         # apply code generation to template
         with open(template_path, "r") as f:
             template_wrapper = f.read()

From a1ad304a42bf89b36d6507cf9f749a7a1a7d130a Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 2 Jun 2023 11:48:58 +0100
Subject: [PATCH 043/112] [specialize-to-rtl]: add ram_style and
 rt_writeable_weights support

---
 .../transformation/fpgadataflow/specialize_to_rtl_layers.py   | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
index 7d677ec216..23b6e59abe 100644
--- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
+++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
@@ -74,6 +74,8 @@ def apply(self, model):
                     simd = getCustomOp(n).get_nodeattr("SIMD")
                     pe = getCustomOp(n).get_nodeattr("PE")
                     mem_mode = getCustomOp(n).get_nodeattr("mem_mode")
+                    ram_style = getCustomOp(n).get_nodeattr("ram_style")
+                    runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights")
 
                     new_node = helper.make_node(
                         "MatrixVectorActivation_rtl",
@@ -91,6 +93,8 @@ def apply(self, model):
                         numInputVectors=numInputVectors,
                         mem_mode=mem_mode,
                         name=n.name + "_rtl",
+                        ram_style=ram_style,
+                        runtime_writeable_weights=runtime_writeable_weights
                     )
                     graph.node.insert(node_ind, new_node)
                     # remove old node

From 2cbb68fe016ff7ea292ffa071741b352222d1a4c Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 2 Jun 2023 11:50:05 +0100
Subject: [PATCH 044/112] [rtllib]: change string type to parameter type due to
 Vivado error

---
 finn-rtllib/mvu/mvu_axi.sv | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
index cef55949ed..46167af95b 100644
--- a/finn-rtllib/mvu/mvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_axi.sv
@@ -51,7 +51,7 @@ module mvu_axi #(
 	bit SIGNED_ACTIVATIONS = 0,
 	int unsigned SEGMENTLEN = 0,
 	bit FORCE_BEHAVIORAL = 0,
-	string MVU_IMPL_STYLE,
+	parameter MVU_IMPL_STYLE, // string type causes error in Vivado
 
 	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
 	localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
@@ -163,12 +163,11 @@ module mvu_axi #(
 
 	"mvu_8sx8u_dsp48":
 		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		 .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
 			.vld(ovld), .p(odat)
 		);
-
 	default: initial begin
 		$error("Unrecognized MVU_IMPL_STYLE '%s'", MVU_IMPL_STYLE);
 		$finish;

From 92eb0edba2d059b8b170ed7e6d8ac7a224c9208c Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 2 Jun 2023 11:51:40 +0100
Subject: [PATCH 045/112] [rtllib]: renamed variable for consistency

---
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 5cc3fa4c49..3cd9cef560 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -54,7 +54,7 @@ module mvu_8sx8u_dsp48 #(
 
 		localparam int unsigned  PE_BEG = 2*c;
 		localparam int unsigned  PE_END = PE < 2*(c+1)? PE : 2*(c+1);
-		localparam int unsigned  PE_RES = 2*(c+1) - PE_END;
+		localparam int unsigned  PE_REM = 2*(c+1) - PE_END;
 
 		uwire        [57:0]  p3[SIMD];
 		uwire signed [ 1:0]  h3[SIMD];
@@ -91,8 +91,8 @@ module mvu_8sx8u_dsp48 #(
 					dd = '0;
 					aa = '0;
 					for(int unsigned  pe = 0; pe < PE_END - PE_BEG; pe++) begin
-						dd[D[pe + PE_RES] +: WEIGHT_WIDTH-1] = ww[pe];
-						aa[D[pe + PE_RES] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];
+						dd[D[pe + PE_REM] +: WEIGHT_WIDTH-1] = ww[pe];
+						aa[D[pe + PE_REM] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];
 					end
 				end
 			end : blkVectorize
@@ -304,7 +304,7 @@ module mvu_8sx8u_dsp48 #(
 		uwire        [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0]  lo4;
 
 		// Conclusive high part accumulation
-		if(PE_RES == 0) begin : genHi
+		if(PE_REM == 0) begin : genHi
 			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - D[1];
 			// Adder Tree across all SIMD high contributions, each from [-1:1]
 			uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
@@ -330,7 +330,7 @@ module mvu_8sx8u_dsp48 #(
 		for(genvar  i = 0; i < 2; i++) begin
 			localparam int unsigned  LO_WIDTH = D[i+1] - D[i];
 			// Conclusive low part accumulation
-			if(i >= PE_RES) begin : blkLo
+			if(i >= PE_REM) begin : blkLo
 				// Adder Tree across all SIMD low contributions
 				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
 				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
@@ -369,7 +369,7 @@ module mvu_8sx8u_dsp48 #(
 
 		// Output
 		for(genvar  pe = PE_BEG; pe < PE_END; pe++) begin
-			assign	p[pe] = Res5[pe - PE_BEG + PE_RES];
+			assign	p[pe] = Res5[pe - PE_BEG + PE_REM];
 		end
 
 	end : genPipes

From 471a221b975e549e462e7ff9488c65ad182fe278 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Fri, 2 Jun 2023 12:39:14 +0100
Subject: [PATCH 046/112] Fix improper blocking assignment & linting.

---
 finn-rtllib/mvu/tb/mvu_axi_tb.sv | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
index ef5fa7d682..b89b58f55b 100644
--- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv
+++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
@@ -42,12 +42,12 @@ module mvu_axi_tb();
 	localparam int unsigned SEGMENTLEN = 2;
 	localparam string MVU_IMPL_STYLE = "mvu_8sx8u_dsp48";
 	localparam bit FORCE_BEHAVIORAL = 1;
-	// Bit-width config  
+	// Bit-width config
 	localparam int unsigned ACTIVATION_WIDTH = 8;
 	localparam int unsigned WEIGHT_WIDTH = 8;
 	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
 	localparam bit SIGNED_ACTIVATIONS = 0;
-	// Simulation constants  
+	// Simulation constants
 	localparam int unsigned NF = MH/PE;
 	localparam int unsigned SF = MW/SIMD;
 	localparam int unsigned NUM_OF_DSP = SIMD/3;
@@ -57,7 +57,7 @@ module mvu_axi_tb();
 	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
 	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
 
-	// Generate clk and reset signal   
+	// Generate clk and reset signal
 	logic clk = 0;
 	always #5ns clk = !clk;
 
@@ -69,7 +69,7 @@ module mvu_axi_tb();
 
 	uwire ap_clk = clk;
 
-	// Generate activations  
+	// Generate activations
 	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
 	typedef activation_t activation_vector_t[SF];
 
@@ -94,8 +94,8 @@ module mvu_axi_tb();
 
 		for (int i=0; i<SF; i++) begin
 			activations.dat <= ACTIVATIONS[i];
-			do begin 
-				activations.vld = $urandom()%7 >= 1;
+			do begin
+				activations.vld <= $urandom()%7 >= 1;
 				@(posedge clk);
 			end while (!(activations.vld === 1 && activations.rdy === 1));
 		end
@@ -104,9 +104,9 @@ module mvu_axi_tb();
 		activations.dat <= 'x;
 	end
 
-	// Generate weights   
+	// Generate weights
 	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-	typedef weight_t weight_matrix_t[NF][SF]; 
+	typedef weight_t weight_matrix_t[NF][SF];
 
 	function weight_matrix_t init_WEIGHTS;
 		automatic weight_matrix_t res;
@@ -139,7 +139,7 @@ module mvu_axi_tb();
 		weights.dat <= 'x;
 	end
 
-	// Function to compute golden output  
+	// Function to compute golden output
 	// a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
 	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
 	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
@@ -155,12 +155,12 @@ module mvu_axi_tb();
 		automatic output_vector_t res = '{default: 0};
 		for (int j = 0; j<MH; j++) begin
 			for (int i = 0; i<MW; i++) begin
-				if (SIGNED_ACTIVATIONS==1) 
+				if (SIGNED_ACTIVATIONS)
 					res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
 				else
 					res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
 			end
-		end  
+		end
 		return res;
 	endfunction : check_output;
 
@@ -179,16 +179,16 @@ module mvu_axi_tb();
 			// Compare produced outputs against golden outputs
 			foreach(outputs.dat[i]) begin
 				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-				else begin 
+				else begin
 					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
 					$stop;
-				end  
+				end
 			end
-			
+
 			NF_CNT += 1;
 		end
 
-		$finish;  
+		$finish;
 	end
 
 	// Instantiate DUT
@@ -211,5 +211,5 @@ module mvu_axi_tb();
 		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
 		.m_axis_output_tready(outputs.rdy)
 	);
-  
+
 endmodule : mvu_axi_tb

From 5c5dc09c98d4e1a07a7e4cae17ca358b197a57c8 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 2 Jun 2023 13:35:04 +0100
Subject: [PATCH 047/112] [test rtl mvu]: modified/extended test cases

---
 tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
index 20a249bd08..3db7a718f5 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
@@ -86,13 +86,12 @@ def prepare_inputs(input_tensor):
     return {"inp": input_tensor}
 
 @pytest.mark.parametrize("mh", [16])
-@pytest.mark.parametrize("mw", [90])
-#@pytest.mark.parametrize("pe", [1, 2, 4, 8, 16])
-@pytest.mark.parametrize("pe", [16])
+@pytest.mark.parametrize("mw", [32])
+@pytest.mark.parametrize("pe", [1, 4, 16])
 #@pytest.mark.parametrize("simd", [1, 30, 90])
-@pytest.mark.parametrize("simd", [90])
-@pytest.mark.parametrize("idt", [DataType["INT8"]])
-@pytest.mark.parametrize("wdt", [DataType["UINT4"]])
+@pytest.mark.parametrize("simd", [1, 4, 32])
+@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
+@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]])
 #@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"])
 @pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"])
 @pytest.mark.parametrize("segmentlen", [1])
@@ -166,7 +165,3 @@ def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen):
 
     assert (output_mvau_hls == output_mvau_rtl).all()
     assert (output_mvau_hls.size > 0)
-
-
-# python setup.py test --addopts "-k test_fpgadataflow_mvau_rtl"
-# python setup.py test --addopts "-k test_fpgadataflow_fclayer_rtlsim"
\ No newline at end of file

From b4eb9b69a8a6920fdb3141752395e672f78479e3 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 30 Jun 2023 15:36:17 +0100
Subject: [PATCH 048/112] [rtl mvu]: updated DSP58 >4-bit variant to lift
 SIMD%3==0 restriction

---
 finn-rtllib/mvu/mvu_8sx9.sv | 103 +++++++++++++++++++++++-------------
 1 file changed, 65 insertions(+), 38 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index a601066cfd..439fbc44f9 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -92,77 +92,95 @@ module mvu_8sx9 #(
 			if (rst)      Z <= '{default: 0};
 			else if(en) begin
 				Z[0] <= zero;
-				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2];
+				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3];
 			end
 		end
 	end;
 
 //-------------------- Buffer for input activations --------------------\\
 	localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
-	typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t;
 
 	for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
 		localparam int TOTAL_PREGS = i/SEGLEN;
 		localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
+		localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3;
 
 		if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
-			a_buffer_t A [0:EXTERNAL_PREGS-1];
+			logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0};
 			always_ff @(posedge clk) begin
 				if (rst)     A <= '{default: 0};
 				else if(en) begin
-					A[EXTERNAL_PREGS-1] <= a[3*i +: 3];
+					A[EXTERNAL_PREGS-1] <= a[3*i +: LANES_OCCUPIED];
 					if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
 				end
 			end
-			assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]}
-									: { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ;
+			for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+				assign a_in_i[i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
+											: PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
+			end : genAin
+			for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+				assign a_in_i[i][9*j +: 9] = 9'b0;
+			end : genAinZero
 		end : genExternalPregAct
 		else begin : genInpDSPAct
-			assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]}
-									: { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ;
+			for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+				assign a_in_i[i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{a[3*i+j][ACTIVATION_WIDTH-1]}}, a[3*i+j] }
+											: PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[3*i+j] } ;
+			end : genAin
+			for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+				assign a_in_i[i][9*j +: 9] = 9'b0;
+			end : genAinZero
 		end : genInpDSPAct
 
 	end : genActSIMD
 
 //-------------------- Buffer for weights --------------------\\
 	localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;
-	typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t;
 
-	for (genvar j=0; j<PE; j++) begin : genWeightPE
-		for (genvar i=0; i<CHAINLEN; i++) begin : genWeightSIMD
-			localparam int TOTAL_PREGS = i/SEGLEN;
+	for (genvar i=0; i<PE; i++) begin : genWeightPE
+		for (genvar j=0; j<CHAINLEN; j++) begin : genWeightSIMD
+			localparam int TOTAL_PREGS = j/SEGLEN;
 			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
+			localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3;
 
 			if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
-				b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1];
+				logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0};
 				always_ff @(posedge clk) begin
 					if (rst)    B <= '{default: 0};
 					else if (en) begin
-						B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3];
-						if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1];
+						B[i][EXTERNAL_PREGS-1] <= w[i][3*j +: LANES_OCCUPIED];
+						if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1];
 					end
 				end
-				assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] };
+				for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin
+					assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] };
+				end : genBin
+				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
+					assign b_in_i[i][j][8*k +: 8] = 8'b0;
+				end : genBinZero
 			end : genExternalPregWeight
 			else begin : genInpDSPWeight
-				assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] };
+				for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin
+					assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
+				end : genBin
+				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
+					assign b_in_i[i][j][8*k +: 8] = 8'b0;
+				end : genBinZero
 			end : genInpDSPWeight
 		end : genWeightSIMD
-
 	end : genWeightPE
 
 //-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\
-	for (genvar j=0; j<PE; j++) begin : genDSPPE
-		for (genvar i=0; i<CHAINLEN; i++) begin : genDSPChain
-			localparam int TOTAL_PREGS = i/SEGLEN;
+	for (genvar i=0; i<PE; i++) begin : genDSPPE
+		for (genvar j=0; j<CHAINLEN; j++) begin : genDSPChain
+			localparam int TOTAL_PREGS = j/SEGLEN;
 			localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1; // 1 : 0
-			localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1;
-			localparam bit FIRST = i == 0;
-			localparam bit LAST = i == CHAINLEN-1;
-			uwire [57:0] pp;
+			localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1;
+			localparam bit FIRST = j == 0;
+			localparam bit LAST = j == CHAINLEN-1;
 
 			if (LAST) begin : genPOUT
-				assign p[j] = pp[ACCU_WIDTH-1:0];
+				assign p[i] = pcout[i][j][ACCU_WIDTH-1:0];
 			end
 
 			// Note: Since the product B * AD is computed,
@@ -174,7 +192,7 @@ module mvu_8sx9 #(
 				always_ff @(posedge clk) begin
 					if (rst)	Areg <= '{ default : 0};
 					else if (en) begin
-						Areg[0] <= { 7'bx, a_in_i[i] };
+						Areg[0] <= { 7'bx, a_in_i[j] };
 						if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0];
 					end
 				end
@@ -182,7 +200,7 @@ module mvu_8sx9 #(
 				always_ff @(posedge clk) begin
 					if (rst)	Breg <= '{ default : 0};
 					else if (en) begin
-						Breg[0] <= b_in_i[j][i];
+						Breg[0] <= b_in_i[i][j];
 						if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0];
 					end
 				end
@@ -217,27 +235,36 @@ module mvu_8sx9 #(
 					end
 					else	assign Preg = Mreg;
 				end
-				else if (LAST) begin : genLast
+				else if (FIRST && LAST) begin : genSingle
+					always_ff @(posedge clk) begin
+						if (rst)		Opmode <= 0;
+						else if (en)	Opmode <= L[1];
+					end
+					always_ff @(posedge clk) begin
+						if (rst) 		Preg <= 0;
+						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg;
+					end
+				end
+				else if (!FIRST && LAST) begin : genLast
 					always_ff @(posedge clk) begin
 						if (rst)		Opmode <= 0;
 						else if (en)	Opmode <= L[1];
 					end
 					always_ff @(posedge clk) begin
 						if (rst) 		Preg <= 0;
-						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[j][i-1];
+						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1];
 					end
 				end
 				else begin : genMid
 					if (PREG) begin : genPregBehav
 						always_ff @(posedge clk) begin
 							if (rst)		Preg <= 0;
-							else if (en)	Preg <= Mreg + pcout[j][i-1];
+							else if (en)	Preg <= Mreg + pcout[i][j-1];
 						end
 					end
-					else	assign Preg = Mreg + pcout[j][i-1];
+					else	assign Preg = Mreg + pcout[i][j-1];
 				end
-				assign pp = Preg;
-				assign pcout[j][i] = pp;
+				assign pcout[i][j] = Preg;
 			end : genBehav
 `ifndef VERILATOR
 			else begin: genDSP
@@ -307,7 +334,7 @@ module mvu_8sx9 #(
 					.BCOUT(),                           // 24-bit output: B cascade
 					.CARRYCASCOUT(),                    // 1-bit output: Cascade carry
 					.MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
-					.PCOUT(pcout[j][i]),                // 58-bit output: Cascade output
+					.PCOUT(pcout[i][j]),                // 58-bit output: Cascade output
 					// Control outputs: Control Inputs/Status Bits
 					.OVERFLOW(),                        // 1-bit output: Overflow in add/acc
 					.PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
@@ -322,7 +349,7 @@ module mvu_8sx9 #(
 					.BCIN('x),                          // 24-bit input: B cascade
 					.CARRYCASCIN('x),                   // 1-bit input: Cascade carry
 					.MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
-					.PCIN(FIRST ? 'x : pcout[j][i-1]),  // 58-bit input: P cascade
+					.PCIN(FIRST ? 'x : pcout[i][j-1]),  // 58-bit input: P cascade
 					// Control inputs: Control Inputs/Status Bits
 					.ALUMODE(4'h0),                     // 4-bit input: ALU control
 					.CARRYINSEL('0),                    // 3-bit input: Carry select
@@ -339,8 +366,8 @@ module mvu_8sx9 #(
 							7'b000_0000
 					}), // 9-bit input: Operation mode
 					// Data inputs: Data Ports
-					.A({ 7'bx, a_in_i[i] }),            // 34-bit input: A data
-					.B(b_in_i[j][i]),                   // 24-bit input: B data
+					.A({ 7'bx, a_in_i[j] }),            // 34-bit input: A data
+					.B(b_in_i[i][j]),                   // 24-bit input: B data
 					.C('x),                             // 58-bit input: C data
 					.CARRYIN('0),                       // 1-bit input: Carry-in
 					.D('x),                             // 27-bit input: D data

From ad63673cda849ecf0df993bc83d00e676998ab03 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 30 Jun 2023 15:45:26 +0100
Subject: [PATCH 049/112] [rtl mvu]: bug fix for SIMD=1 init_leave_loads

---
 finn-rtllib/mvu/mvu_4sx4u.sv       | 2 +-
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 4674576d23..ac95b5f8a9 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -296,7 +296,7 @@ module mvu_4sx4u #(
 		// Stage #4: Cross-SIMD Reduction
 
 		// Count leaves reachable from each node
-		localparam leave_load_t  LEAVE_LOAD = init_leave_loads();
+		localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree, so fill with a constant placeholder; otherwise init_leave_loads ends up in an infinite loop
 
 		uwire signed [ACCU_WIDTH  -1:0]  up4;
 		uwire signed [ACCU_WIDTH  -8:0]  hi4[3];
diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 3cd9cef560..416c12c1cc 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -297,7 +297,7 @@ module mvu_8sx8u_dsp48 #(
 		// Stage #4: Cross-SIMD Reduction
 
 		// Count leaves reachable from each node
-		localparam leave_load_t  LEAVE_LOAD = init_leave_loads();
+		localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop
 
 		uwire signed [ACCU_WIDTH  -1:0]  up4;
 		uwire signed [ACCU_WIDTH  -SINGLE_PROD_WIDTH:0]  hi4;

From 79e8a5ef208f7bcdeafa231a5a3dff74177008c9 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 13 Jul 2023 18:34:05 +0100
Subject: [PATCH 050/112] [mvu rtl]: restrict index i to be less than 3 (within
 bounds of hi4)

---
 finn-rtllib/mvu/mvu_4sx4u.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index ac95b5f8a9..88985312c9 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -324,7 +324,7 @@ module mvu_4sx4u #(
 				end
 				assign	hi4[i] = Hi4;
 			end : genHi
-			else begin : genHiZero
+			else if (i < 3) begin : genHiZero
 				assign hi4[i] = '0;
 			end : genHiZero
 

From e3493c30529949a77a3f384fd75c030c551cd2cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Fri, 2 Jun 2023 12:47:53 +0100
Subject: [PATCH 051/112] Rewrite replay_buffer for input elasticity.

---
 finn-rtllib/mvu/replay_buffer.sv       | 153 ++++++++++++++++++-------
 finn-rtllib/mvu/tb/replay_buffer_tb.sv | 130 +++++++++++++++++++++
 2 files changed, 242 insertions(+), 41 deletions(-)
 create mode 100644 finn-rtllib/mvu/tb/replay_buffer_tb.sv

diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
index 89bbbdb88f..3dfe72d6c6 100644
--- a/finn-rtllib/mvu/replay_buffer.sv
+++ b/finn-rtllib/mvu/replay_buffer.sv
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -51,60 +51,131 @@ module replay_buffer #(
 	input	logic  ordy
 );
 
-	typedef logic [$clog2(REP)+$clog2(LEN)-1:0]  count_t;
-	count_t  Count = 0;
-	uwire  done_len = LEN == 1 ? 1 : ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0;
-	uwire  done_rep;
-	uwire  done_all = done_len && done_rep;
+	if(LEN == 0)  initial begin
+		$error("%m: Illegal zero sequence LEN.");
+		$finish;
+	end
+	if(REP == 0) initial begin
+		$error("%m: Illegal zero REP count.");
+		$finish;
+	end
 
+	// Track position in Sequence
+	uwire  last_item;
 	uwire  shift;
-	uwire  clr = rst || (done_all && shift);
-	always_ff @(posedge clk) begin
-		if(clr)         Count <= 0;
-		else if(shift)  Count <= Count + ((REP > 1) && done_len? 2**$clog2(LEN)-LEN+1 : 1);
+	if(LEN == 1)  assign  last_item = 1;
+	else begin
+		typedef logic [$clog2(LEN)-1:0]  count_t;
+		count_t  Count = 0;
+		logic    Last  = 0;
+		always_ff @(posedge clk) begin
+			if(rst) begin
+				Count <= 0;
+				Last  <= 0;
+			end
+			else if(shift) begin
+				Count <= Count + (Last? 2**$clog2(LEN)-LEN+1 : 1);
+				Last  <= (((LEN-2) & ~Count) == 0) && ((LEN&1) || !Last);
+			end
+		end
+		assign	last_item = Last;
 	end
 
-	typedef logic [W-1:0]  data_t;
-	uwire data_t  rdat;
-	uwire  first_rep;
 	if(REP == 1) begin
-		assign	done_rep  = 1;
-		assign	first_rep = 1;
-		assign	rdat = 'x;
+		assign	shift = ivld && ordy;
+
+		assign	irdy  = ordy;
+		assign	odat  = idat;
+		assign	olast = last_item;
+		assign	ofin  = last_item;
+		assign	ovld  = ivld;
 	end
 	else begin
-		assign	done_rep = ((REP-1) & ~Count[$left(Count):$clog2(LEN)]) == 0;
 
-		logic  FirstRep = 1;
+		// Track Repetitions
+		uwire  last_rep;
+		if(1) begin : blkRep
+			typedef logic [$clog2(REP)-1:0]  rep_t;
+			rep_t  RepCnt = 0;
+			logic  RepLst = 0;
+			always_ff @(posedge clk) begin
+				if(rst) begin
+					RepCnt <= 0;
+					RepLst <= 0;
+				end
+				else if(last_item && shift) begin
+					RepCnt <= RepCnt + (RepLst? 2**$clog2(REP)-REP+1 : 1);
+					RepLst <= (((REP-2) & ~RepCnt) == 0) && ((REP&1) || !RepLst);
+				end
+			end
+			assign	last_rep = RepLst;
+		end : blkRep
+
+		localparam int unsigned  AWIDTH = $clog2(LEN);
+		typedef logic [AWIDTH  :0]  ptr_t;	// pointers with additional generational MSB
+		typedef logic [W     -1:0]  data_t;
+
+		// Output Registers
+		data_t  ODat;
+		logic   OVld =  0;
+		logic   OLst = 'x;
+		logic   OFin = 'x;
+		assign	odat  = ODat;
+		assign	olast = OLst;
+		assign	ofin  = OFin;
+		assign	ovld  = OVld;
+
+		// Buffer Memory Management
+		data_t  Mem[2**AWIDTH];
+		ptr_t  WP = 0;	// Write Pointer
+		ptr_t  RP = 0;	// Read Pointer
+		ptr_t  FP = 0;	// Free Pointer
+
+		// Operational Guards
+		//	Occupancy:    WP-FP
+		//	  WP-FP < 2**AWIDTH -> writing allowed
+		//		- increments WP
+		//	Availability: WP-RP
+		//	  WP-RP > 0         -> reading allowed
+		//		- increments RP, last in sequence rewinds to FP for non-final repetition
+		//		- increments FP in last repetition
+		assign	irdy = !((WP-FP) >> AWIDTH);
+
+		uwire  wr = irdy && ivld;
+		uwire  rd = !OVld || ordy;
 		always_ff @(posedge clk) begin
-			if(clr)         FirstRep <= 1;
-			else if(shift)  FirstRep <= FirstRep && !done_len;
+			if(wr)  Mem[WP[AWIDTH-1:0]] <= idat;
+			if(rd)  ODat <= Mem[RP[AWIDTH-1:0]];
 		end
-		assign	first_rep = FirstRep;
 
-		data_t  Buf[LEN];
-		if(LEN == 1) begin : genTrivial
-			always_ff @(posedge clk) begin
-				if(shift && FirstRep)  Buf[0] <= idat;
+		uwire  vld = (RP != WP);
+		assign	shift = rd && vld;
+		always_ff @(posedge clk) begin
+			if(rst) begin
+				WP <= 0;
+				RP <= 0;
+				FP <= 0;
+
+				OVld <=  0;
+				OLst <= 'x;
+				OFin <= 'x;
 			end
-		end : genTrivial
-		else begin : genShift
-			always_ff @(posedge clk) begin
-				if(shift) begin
-					Buf[0] <= odat;
-					Buf[1:LEN-1] <= Buf[0:LEN-2];
+			else begin
+				if(wr)  WP <= WP + 1;
+				if(rd) begin
+					if(vld) begin
+						automatic logic  rewind = last_item && !last_rep;
+						RP <= RP + (rewind? 2**(AWIDTH+1)-LEN+1 : 1);
+						FP <= FP + last_rep;
+					end
+
+					OVld <= vld;
+					OLst <= last_item;
+					OFin <= last_rep && last_item;
 				end
 			end
-		end : genShift
+		end
 
-		assign	rdat = Buf[LEN-1];
 	end
 
-	assign  irdy  = ordy && first_rep;
-	assign	odat  = first_rep? idat : rdat;
-	assign	olast = done_len;
-	assign	ofin  = done_all;
-	assign	ovld  = first_rep? ivld : 1;
-	assign	shift = ovld && ordy;
-
-endmodule : replay_buffer
\ No newline at end of file
+endmodule : replay_buffer
diff --git a/finn-rtllib/mvu/tb/replay_buffer_tb.sv b/finn-rtllib/mvu/tb/replay_buffer_tb.sv
new file mode 100644
index 0000000000..5581354e0e
--- /dev/null
+++ b/finn-rtllib/mvu/tb/replay_buffer_tb.sv
@@ -0,0 +1,130 @@
+/******************************************************************************
+ * Copyright (C) 2023, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for replay_buffer module.
+ * @author	Thomas B. Preußer <thomas.preusser@amd.com>
+ *****************************************************************************/
+
+module replay_buffer_tb;
+
+	// Global Control
+	logic  clk = 0;
+	always #5ns clk = !clk;
+	uwire  rst = 0;
+
+	// DUT Geometries
+	localparam int unsigned  DIMS[3] = '{ 7, 8, 10 };
+	localparam int unsigned  W = 8;
+	typedef logic [W-1:0]  data_t;
+
+	bit [$size(DIMS)*$size(DIMS)-1:0]  done = 0;	// one completion flag per (REP, LEN) DUT pair, indexed $size(DIMS)*r + l
+	always_comb begin
+		if(&done) begin	// finish only once every DUT checker has completed
+			$display("Test completed.");
+			$finish;
+		end
+	end
+
+	// Parallel DUT Instantiations: one replay_buffer per (REP, LEN) pair drawn from DIMS
+	for(genvar  r = 0; r < $size(DIMS); r++) begin
+		for(genvar  l = 0; l < $size(DIMS); l++) begin
+			localparam int unsigned  REP = DIMS[r];	// repetition count of this DUT
+			localparam int unsigned  LEN = DIMS[l];	// sequence length of this DUT
+
+			data_t  idat;
+			logic  ivld;
+			uwire  irdy;
+
+			uwire data_t  odat;
+			uwire  olast;
+			uwire  ofin;
+			uwire  ovld;
+			logic  ordy;
+
+			replay_buffer #(.LEN(LEN), .REP(REP), .W(W)) dut (
+				.clk, .rst,
+				.idat, .ivld, .irdy,
+				.odat, .olast, .ofin, .ovld, .ordy
+			);
+
+			// Input Feed: 0, 1, ..., 10*LEN-1 (ten sequences of LEN items each)
+			initial begin
+				idat = 'x;
+				ivld =  0;
+				@(posedge clk iff !rst);
+
+				for(int unsigned  i = 0; i < 10*LEN; i++) begin
+					idat <= i;
+					ivld <= 1;
+					@(posedge clk iff irdy);	// hold the item until the DUT accepts it
+					idat <= 'x;
+					ivld <=  0;
+					while($urandom()%(REP-1) != 0) @(posedge clk);	// random idle gap between items
+				end
+			end
+
+			// Output Check: each of the 10 sequences must be seen REP times with matching olast/ofin
+			initial begin
+				automatic int unsigned  base = 0;	// value of the first item of the current sequence
+
+				ordy = 0;
+				@(posedge clk iff !rst);
+
+				for(int unsigned  k = 0; k < 10; k++) begin
+					for(int unsigned  j = 0; j < REP; j++) begin
+						for(int unsigned  i = 0; i < LEN; i++) begin
+							ordy <= 1;
+							@(posedge clk iff ovld);	// wait for the next valid output item
+							assert(odat == base+i) else begin
+								$error("#%0d.%0d: Data mismatch: %0d instead of %0d.", r, l, odat, base+i);
+								$stop;
+							end
+							assert(olast == (i == LEN-1)) else begin	// olast expected on the final item of every pass
+								$error("#%0d.%0d: Last mismatch.", r, l);
+								$stop;
+							end
+							assert(ofin == ((i == LEN-1) && (j == REP-1))) else begin	// ofin expected only on the final item of the final repetition
+								$error("#%0d.%0d: Fin mismatch.", r, l);
+								$stop;
+							end
+
+							ordy <= 0;
+							while($urandom()%13 == 0) @(posedge clk);	// occasional random stall with ordy deasserted
+						end
+					end
+					base += LEN;
+				end
+
+				done[$size(DIMS)*r + l] <= 1;	// flag this DUT instance as finished
+			end
+		end
+	end
+
+endmodule : replay_buffer_tb

From 2efba6854267873c84d58f6d8fe6b64f649eaa99 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Tue, 5 Sep 2023 13:53:01 +0100
Subject: [PATCH 052/112] [to-rtl]: Infer unique node names after
 transformation is applied

---
 .../transformation/fpgadataflow/specialize_to_rtl_layers.py     | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
index 23b6e59abe..47ed5ce863 100644
--- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
+++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
@@ -32,6 +32,7 @@
 from onnx import helper
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.general import GiveUniqueNodeNames
 from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth
 
 class InferRTLMatrixVectorActivation(Transformation):
@@ -105,5 +106,6 @@ def apply(self, model):
             model = model.transform(MinimizeAccumulatorWidth())
             model = model.transform(InferShapes())
             model = model.transform(InferDataTypes())
+            model = model.transform(GiveUniqueNodeNames())
         
         return (model, graph_modified)
\ No newline at end of file

From 114ea1bfed2dd2f14196f98aea97d6cac9d1d57e Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 18 Sep 2023 14:56:07 +0100
Subject: [PATCH 053/112] [mvu rtl]: add synthesis directive to handle 'X in
 simulation

---
 finn-rtllib/mvu/mvu_8sx9.sv | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index 439fbc44f9..34aa856b1b 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -110,13 +110,17 @@ module mvu_8sx9 #(
 			always_ff @(posedge clk) begin
 				if (rst)     A <= '{default: 0};
 				else if(en) begin
-					A[EXTERNAL_PREGS-1] <= a[3*i +: LANES_OCCUPIED];
+					A[EXTERNAL_PREGS-1] <= 
+// synthesis translate_off
+						zero ? '1 : 
+// synthesis translate_on						
+						a[3*i +: LANES_OCCUPIED];
 					if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
 				end
 			end
 			for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-				assign a_in_i[i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
-											: PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
+			assign a_in_i[i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
+												: PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
 			end : genAin
 			for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
 				assign a_in_i[i][9*j +: 9] = 9'b0;
@@ -124,8 +128,12 @@ module mvu_8sx9 #(
 		end : genExternalPregAct
 		else begin : genInpDSPAct
 			for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-				assign a_in_i[i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{a[3*i+j][ACTIVATION_WIDTH-1]}}, a[3*i+j] }
-											: PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[3*i+j] } ;
+				assign a_in_i[i][9*j +: 9] = 
+// synthesis translate_off
+					zero ? '1 : 				
+// synthesis translate_on
+					SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{a[3*i+j][ACTIVATION_WIDTH-1]}}, a[3*i+j] }
+												: PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[3*i+j] } ;
 			end : genAin
 			for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
 				assign a_in_i[i][9*j +: 9] = 9'b0;
@@ -148,7 +156,11 @@ module mvu_8sx9 #(
 				always_ff @(posedge clk) begin
 					if (rst)    B <= '{default: 0};
 					else if (en) begin
-						B[i][EXTERNAL_PREGS-1] <= w[i][3*j +: LANES_OCCUPIED];
+						B[i][EXTERNAL_PREGS-1] <= 
+// synthesis translate_off
+							zero ? '1 : 						
+// synthesis translate_on							
+							w[i][3*j +: LANES_OCCUPIED];
 						if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1];
 					end
 				end
@@ -161,7 +173,11 @@ module mvu_8sx9 #(
 			end : genExternalPregWeight
 			else begin : genInpDSPWeight
 				for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin
-					assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
+					assign b_in_i[i][j][8*k +: 8] = 
+// synthesis translate_off					
+						zero ? '1 : 
+// synthesis translate_on					
+						PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
 				end : genBin
 				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
 					assign b_in_i[i][j][8*k +: 8] = 8'b0;
@@ -178,9 +194,10 @@ module mvu_8sx9 #(
 			localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1;
 			localparam bit FIRST = j == 0;
 			localparam bit LAST = j == CHAINLEN-1;
+			uwire [57:0] pp;
 
 			if (LAST) begin : genPOUT
-				assign p[i] = pcout[i][j][ACCU_WIDTH-1:0];
+				assign p[i] = pp[ACCU_WIDTH-1:0];
 			end
 
 			// Note: Since the product B * AD is computed,
@@ -264,6 +281,7 @@ module mvu_8sx9 #(
 					end
 					else	assign Preg = Mreg + pcout[i][j-1];
 				end
+				assign pp = Preg;
 				assign pcout[i][j] = Preg;
 			end : genBehav
 `ifndef VERILATOR

From 79fafdb25a8707f740a0a7e21aa4f55ef7101882 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 18 Sep 2023 15:06:36 +0100
Subject: [PATCH 054/112] [replay buffer rtl]: minor fix to when LEN=1 (=
 AWIDTH=0)

---
 finn-rtllib/mvu/replay_buffer.sv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
index 3dfe72d6c6..942f1823ca 100644
--- a/finn-rtllib/mvu/replay_buffer.sv
+++ b/finn-rtllib/mvu/replay_buffer.sv
@@ -144,8 +144,8 @@ module replay_buffer #(
 		uwire  wr = irdy && ivld;
 		uwire  rd = !OVld || ordy;
 		always_ff @(posedge clk) begin
-			if(wr)  Mem[WP[AWIDTH-1:0]] <= idat;
-			if(rd)  ODat <= Mem[RP[AWIDTH-1:0]];
+			if(wr)  Mem[WP[AWIDTH:0]] <= idat;
+			if(rd)  ODat <= Mem[RP[AWIDTH:0]];
 		end
 
 		uwire  vld = (RP != WP);

From 619d9db0d5872d1afd72b1d1df841e1f87a9f33a Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 18 Sep 2023 15:09:45 +0100
Subject: [PATCH 055/112] [mvu lut]: LUT-based MVU compute core

---
 finn-rtllib/mvu/mvu_lut.sv | 102 +++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_lut.sv

diff --git a/finn-rtllib/mvu/mvu_lut.sv b/finn-rtllib/mvu/mvu_lut.sv
new file mode 100644
index 0000000000..b100a589e8
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_lut.sv
@@ -0,0 +1,102 @@
+module mvu_lut #(
+	int unsigned  PE,
+	int unsigned  SIMD,
+	int unsigned  ACCU_WIDTH,
+    int unsigned  ACTIVATION_WIDTH,
+    int unsigned  WEIGHT_WIDTH,
+    bit  SIGNED_ACTIVATIONS,
+    bit  M_REG = 1,	// 1: register each product before the adder tree
+
+    localparam unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH
+)(
+	// Global Control
+	input	logic  clk,
+	input	logic  rst,
+	input	logic  en,
+
+	// Input
+	input	logic  last,
+	input	logic  zero,	// ignore current inputs and force this partial product to zero
+	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]      w,	// signed weights
+	input	logic                [SIMD-1:0][ACTIVATION_WIDTH-1:0]  a,	// (un)signed activations
+
+	// Output
+	output	logic  vld,
+	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
+);
+
+    typedef int unsigned  leave_load_t[2*SIMD-1];
+    function leave_load_t init_leave_loads();	// number of leaves below each node of the heap-indexed adder tree
+        automatic leave_load_t  res;
+        for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;
+        for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];
+        return res;
+    endfunction : init_leave_loads
+
+    // Pipeline for last indicator flag: two stages with M_REG, one stage without
+    uwire last_i;
+    generate if (M_REG) begin
+        logic [0:1] L = '0;
+        always_ff @(posedge clk) begin
+            if(rst)       L <= '0;
+            else if (en)  L <= {last, L[0]};
+        end
+        assign  last_i = L[1];
+    end
+    else begin
+        logic L = '0;
+        always_ff @(posedge clk) begin
+            if(rst)       L <= '0;
+            else if (en)  L <= last;
+        end
+        assign  last_i = L;
+    end
+    endgenerate
+
+    // For each PE generate
+    for (genvar  i = 0; i < PE; i++)  begin : genPE
+        // Stage #1: SIMD multipliers in parallel
+        uwire [MULT_WIDTH-1 : 0] m1 [SIMD];
+        for (genvar j = 0; j < SIMD; j++) begin : genSIMD
+            if (M_REG) begin : genMreg
+                logic [MULT_WIDTH-1 : 0] M [SIMD];
+                always_ff @(posedge clk) begin	// non-blocking updates: M is read by the adder tree outside this process
+                    if(rst)         M[j] <= '{ default : 0 };
+                    else if (en)    M[j] <= zero ? 0 :
+                                            SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) :
+                                                                 $signed({1'b0, a[j]}) * $signed(w[i][j]);
+                    // (SIGNED_ACTIVATIONS ? $signed(a[j]) : a[j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication
+                end
+                assign  m1[j] = M[j];
+            end : genMreg
+            else begin : genNoMreg
+                assign m1[j] = zero ? 0 :
+                               SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) :
+                                                    $signed({1'b0, a[j]}) * $signed(w[i][j]);
+            end : genNoMreg
+        end : genSIMD
+
+        // Stage #2: Adder tree to reduce SIMD products
+        localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 };
+        localparam int unsigned  ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1));
+        uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
+        for(genvar s = 0; s < SIMD; s++)  assign  tree[SIMD-1+s] = $signed(m1[s]);
+        for(genvar n = 0; n < SIMD-1; n++) begin
+            // Sum truncated to actual maximum bit width at this node
+            localparam int unsigned  NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1));
+            uwire signed [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
+            assign tree[n] = s;
+        end
+
+        // Stage #3: Buffer output; when last_i is set the running sum is dropped and restarts from the current tree output
+        logic [ACCU_WIDTH-1:0] P2 [PE];
+        always_ff @(posedge clk) begin	// non-blocking updates keep the accumulator free of evaluation-order races
+            if(rst)         P2[i] <= '{ default : 0};
+            else if (en)    P2[i] <= (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]);
+        end
+
+        assign  p[i] = P2[i];
+    end : genPE
+
+    assign  vld = last_i;	// driven once, outside genPE, so PE > 1 does not create multiple drivers on vld
+endmodule : mvu_lut

From 090f2ac4adf4b0523b23b27fce05f7422269d72a Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Tue, 19 Sep 2023 12:23:55 +0100
Subject: [PATCH 056/112] [custom op]: add preferred_backend attribute

---
 src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
index 73d39ce642..4f24d71ccc 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
@@ -123,7 +123,7 @@ def get_nodeattr_types(self):
             # weight data from the weight FIFOs.
             "runtime_writeable_weights": ("i", False, 0, {0, 1}),
             # Flag to specify whether RTL-based or HLS-based implementation is preferred
-            "impl": ("s", False, "rtl", {"hls", "rtl"})
+            "preferred_backend": ("s", False, "rtl", {"hls", "rtl"})
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs

From ac5e82d9944f5b7475eb13546affd1bc03d57f4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Thu, 21 Sep 2023 13:03:27 +0100
Subject: [PATCH 057/112] Ensure a minimum of two buffer slots even for
 length-1 sequences.

---
 finn-rtllib/mvu/replay_buffer.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
index 942f1823ca..d4342f705c 100644
--- a/finn-rtllib/mvu/replay_buffer.sv
+++ b/finn-rtllib/mvu/replay_buffer.sv
@@ -111,7 +111,7 @@ module replay_buffer #(
 			assign	last_rep = RepLst;
 		end : blkRep
 
-		localparam int unsigned  AWIDTH = $clog2(LEN);
+		localparam int unsigned  AWIDTH = LEN < 2? 1 : $clog2(LEN);
 		typedef logic [AWIDTH  :0]  ptr_t;	// pointers with additional generational MSB
 		typedef logic [W     -1:0]  data_t;
 

From 85156935163fc803d453db5ce2c1c5163808bc9f Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 15:07:12 +0100
Subject: [PATCH 058/112] [rtl mvu wrapper]: support for vvu layer and rename

---
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 92 +++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v

diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
new file mode 100644
index 0000000000..6dbf82cb7b
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -0,0 +1,92 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Verilog AXI-lite wrapper for MVU.
+ *****************************************************************************/
+
+module $MODULE_NAME_AXI_WRAPPER$ #(
+	parameter	IS_MVU = $IS_MVU$,	// numeric flag, not a string: a quoted value can never satisfy the IS_MVU == 1 test below
+	parameter	COMPUTE_CORE = "$COMPUTE_CORE$",
+	parameter	MW = $MW$,
+	parameter	MH = $MH$,
+	parameter	PE = $PE$,
+	parameter	SIMD = $SIMD$,
+	parameter	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
+	parameter	WEIGHT_WIDTH = $WEIGHT_WIDTH$,
+	parameter	ACCU_WIDTH = $ACCU_WIDTH$,
+	parameter	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
+	parameter	SEGMENTLEN = $SEGMENTLEN$,
+	parameter	FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$,
+
+	// Safely deducible parameters: stream widths rounded up to a multiple of 8 bits
+	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	parameter 	INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,	// MVU: SIMD activations; otherwise PE*SIMD activations
+	parameter 	OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+)(
+	// Global Control
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
+	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+	input	ap_clk,
+	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+	input	ap_rst_n,
+
+	// Weight Stream
+	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  weights_V_TDATA,
+	input   weights_V_TVALID,
+	output  weights_V_TREADY,
+	// Input Stream
+	input	[INPUT_STREAM_WIDTH_BA-1:0]  in0_V_TDATA,
+	input	in0_V_TVALID,
+	output	in0_V_TREADY,
+	// Output Stream
+	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  out_V_TDATA,
+	output	out_V_TVALID,
+	input	out_V_TREADY
+);
+
+mvu_vvu_axi #(	// pure pass-through: parameters and stream ports are forwarded unchanged
+	.IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD),
+	.ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
+	.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
+	) inst (
+	.ap_clk(ap_clk),
+	.ap_rst_n(ap_rst_n),
+	.s_axis_weights_tdata(weights_V_TDATA),
+	.s_axis_weights_tvalid(weights_V_TVALID),
+	.s_axis_weights_tready(weights_V_TREADY),
+	.s_axis_input_tdata(in0_V_TDATA),
+	.s_axis_input_tvalid(in0_V_TVALID),
+	.s_axis_input_tready(in0_V_TREADY),
+	.m_axis_output_tdata(out_V_TDATA),
+	.m_axis_output_tvalid(out_V_TVALID),
+	.m_axis_output_tready(out_V_TREADY)
+);
+
+endmodule // $MODULE_NAME_AXI_WRAPPER$

From cf28d780041fec1effdf743e62390eebc5c81f98 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:24:18 +0100
Subject: [PATCH 059/112] [mvu vvu tb]: modified testbench to also support
 testing VVU on DSP58

---
 finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv | 222 +++++++++++++++++++++++++++
 1 file changed, 222 insertions(+)
 create mode 100644 finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv

diff --git a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
new file mode 100644
index 0000000000..82c2e8e7b0
--- /dev/null
+++ b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
@@ -0,0 +1,222 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Testbench for MVU AXI-lite interface wrapper.
+ *****************************************************************************/
+
+module mvu_vvu_axi_tb();
+
+//-------------------- Simulation parameters --------------------\\
+	// Matrix & parallelism config
+	localparam bit IS_MVU = 1;
+	localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58";
+	localparam int unsigned MW = 1500;
+	localparam int unsigned MH = 256;
+	localparam int unsigned SIMD = 60;
+	localparam int unsigned PE = 16;
+	localparam int unsigned SEGMENTLEN = 2.0;
+	localparam bit FORCE_BEHAVIORAL = 1;
+	localparam bit M_REG_LUT = 1;
+	// Bit-width config
+	localparam int unsigned ACTIVATION_WIDTH = 4;
+	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam int unsigned ACCU_WIDTH = 21; // >= ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW) = 4+4+11 = 19
+	localparam bit SIGNED_ACTIVATIONS = 0;
+	// Simulation constants
+	localparam int unsigned NF = MH/PE;	// MH divided by PE
+	localparam int unsigned SF = MW/SIMD;	// MW divided by SIMD
+	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;	// weight stream width rounded up to a multiple of 8
+	localparam int unsigned ACTIVATION_WIDTH_BA = ((IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH+7)/8*8;	// input stream width rounded up to a multiple of 8
+	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;	// number of padding bits on the weight stream
+	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - (IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH;	// number of padding bits on the input stream
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
+
+	// Generate clk and reset signal
+	logic clk = 0;
+	always #5ns clk = !clk;
+
+	logic ap_rst_n = 0;
+	initial begin
+		repeat(16) @(posedge clk);
+		ap_rst_n <= 1;
+	end
+
+	uwire ap_clk = clk;
+
+	// Generate activations
+	typedef logic [(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+	typedef activation_t activation_vector_t[(IS_MVU ? 1 : NF)*SF];
+
+	function activation_vector_t init_ACTIVATIONS;	// returns a randomly filled activation vector
+		automatic activation_vector_t res;
+		std::randomize(res);	// return status of std::randomize is not checked
+		return res;
+	endfunction : init_ACTIVATIONS
+
+	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
+
+	struct {
+		activation_t dat;
+		logic vld;
+		logic rdy;
+	} activations;
+
+	initial begin	// drives the activation stream from ACTIVATIONS
+		activations.vld = 0;
+		activations.dat = 'X;
+		@(posedge clk iff ap_rst_n);	// wait for reset release
+
+		for (int j=0; j<(IS_MVU ? 1 : NF); j++) begin
+			for (int i=0; i<SF; i++) begin
+				activations.dat <= ACTIVATIONS[SF*j+i];
+				do begin
+					activations.vld <= $urandom()%7 >= 0;	// NOTE(review): $urandom() is unsigned, so ">= 0" always holds and vld is never randomly deasserted -- confirm whether "> 0" was intended
+					@(posedge clk);
+				end while (!(activations.vld === 1 && activations.rdy === 1));	// repeat until the vld/rdy handshake completes
+			end
+		end
+
+		activations.vld <= 0;
+		activations.dat <= 'x;
+	end
+
+	// Generate weights
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+	typedef weight_t weight_matrix_t[NF][SF];
+
+	function weight_matrix_t init_WEIGHTS;	// returns a randomly filled NF x SF weight matrix
+		automatic weight_matrix_t res;
+		std::randomize(res);	// return status of std::randomize is not checked
+		return res;
+	endfunction : init_WEIGHTS;
+
+	weight_matrix_t WEIGHTS = init_WEIGHTS();
+
+	struct {
+		weight_t dat;
+		logic vld;
+		logic rdy;
+	} weights;
+
+	initial begin	// drives the weight stream from WEIGHTS with vld held high throughout
+		weights.vld = 0;
+		weights.dat = 'X;
+		@(posedge clk iff ap_rst_n);	// wait for reset release
+
+		weights.vld <= 1;
+		for (int i=0; i<NF; i++) begin
+			for (int j=0; j<SF; j++) begin
+				weights.dat <= WEIGHTS[i][j];
+				@(posedge clk iff weights.rdy);	// advance to the next tile once the DUT accepts this one
+			end
+		end
+
+		weights.vld <= 0;
+		weights.dat <= 'x;
+	end
+
+	// Function to compute golden output
+	// a: [(IS_MVU?1:NF)*SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
+	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
+	typedef output_t output_vector_t [NF];
+
+	struct {
+		output_t dat;
+		logic vld;
+		logic rdy;
+	} outputs;
+
+	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);	// golden model: res[j/PE][j%PE] accumulates a*w over i = 0..MW-1
+		automatic output_vector_t res = '{default: 0};
+		for (int j = 0; j<MH; j++) begin
+			for (int i = 0; i<MW; i++) begin
+				if (SIGNED_ACTIVATIONS)
+					res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) :
+											   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed(a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]) : $signed(a[j/PE*SF+i/SIMD][i%SIMD]) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+				else	// unsigned activations are zero-extended by one bit before the signed multiply
+					res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) :
+											   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]}) : $signed({1'b0, a[j/PE*SF+i/SIMD][i%SIMD]}) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+			end
+		end
+		return res;
+	endfunction : check_output;
+
+	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+
+	int unsigned NF_CNT = 0;	// number of output words checked so far
+	initial begin
+		outputs.rdy = 0;
+		while (NF_CNT < NF) begin
+			// Loop until both rdy & vld are asserted
+			do begin
+				outputs.rdy <= $urandom()%7 >= 0;	// NOTE(review): always true for the unsigned $urandom() result, so no backpressure is ever applied -- confirm whether "> 0" was intended
+				@(posedge clk iff ap_rst_n);
+			end while (!(outputs.rdy === 1 && outputs.vld === 1));
+
+			// Compare produced outputs against golden outputs
+			foreach(outputs.dat[i]) begin
+				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+				else begin
+					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+					$stop;	// halt at the first mismatch
+				end
+			end
+
+			NF_CNT += 1;
+		end
+
+		$finish;	// all NF output words matched
+	end
+
+	// Instantiate DUT
+	mvu_vvu_axi #(
+		.IS_MVU(IS_MVU),
+		.COMPUTE_CORE(COMPUTE_CORE),
+		.MW(MW),
+		.MH(MH),
+		.PE(PE),
+		.SIMD(SIMD),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.SEGMENTLEN(SEGMENTLEN),
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL),
+		.M_REG_LUT(M_REG_LUT)
+	)
+	dut (
+		.ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
+		.s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
+		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
+		.m_axis_output_tready(outputs.rdy)
+	);
+
+endmodule : mvu_vvu_axi_tb

From 2617c391e1d2c9b19fb881acb6012fc56df35eae Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:25:22 +0100
Subject: [PATCH 060/112] [axi wrapper]: minor modification to comment
 description

---
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
index 6dbf82cb7b..788e49a71b 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -28,7 +28,7 @@
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * @brief	Verilog AXI-lite wrapper for MVU.
+ * @brief	Verilog AXI-lite wrapper for MVU & VVU.
  *****************************************************************************/
 
 module $MODULE_NAME_AXI_WRAPPER$ #(

From 8ca5fe73c003aec3e7998d83e233102c012dd531 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:34:12 +0100
Subject: [PATCH 061/112] [mvu axi]: add support for VVU on DSP58

---
 finn-rtllib/mvu/mvu_axi.sv | 105 ++++++++++++++++++++++++-------------
 1 file changed, 69 insertions(+), 36 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
index 46167af95b..07ad32e6c8 100644
--- a/finn-rtllib/mvu/mvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_axi.sv
@@ -28,19 +28,25 @@
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * @brief	Matrix Vector Unit (MVU) AXI-lite interface wrapper.
+ * @brief	Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper.
  * @details
+ *	 The following compute cores are supported:
+ *   - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, 
+ *     (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP,
+ *     [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP,
+ *     'unconstrained' LUT-based MVU and VVU.
  *  Folding hints:
- *	 - 4-bit MVU:          PE scaling should divide MH.
- *	 - 8-bit MVU - DSP48:  PE scaling should divide MH.
- *	 - 8-bit MVU - DSP58:  SIMD scaling should aim at a full multiple of 3 and divide MW.
+ *	 - PE scaling should divide MH.
+ *   - SIMD scaling should divide MW.
  *	 - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to
  *	   impact critical paths more than PE scaling. PE scaling implies a
  *	   bigger fanout on the input activations.
  *	 - Full unfolding along MH (PE=MH) results in no replay buffer instantiated
  *****************************************************************************/
 
-module mvu_axi #(
+module mvu_vvu_axi #(
+	bit IS_MVU, // string type causes error in Vivado
+	parameter COMPUTE_CORE,
 	int unsigned MW,
 	int unsigned MH,
 	int unsigned PE,
@@ -51,16 +57,16 @@ module mvu_axi #(
 	bit SIGNED_ACTIVATIONS = 0,
 	int unsigned SEGMENTLEN = 0,
 	bit FORCE_BEHAVIORAL = 0,
-	parameter MVU_IMPL_STYLE, // string type causes error in Vivado
+	bit M_REG_LUT = 1,
 
+	// Safely deducible parameters
 	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
 	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
-	localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
+	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
 	localparam int unsigned SF = MW/SIMD,
 	localparam int unsigned NF = MH/PE,
-	localparam int unsigned OUTPUT_LANES = PE,
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
 )
 (
 	// Global Control
@@ -93,27 +99,31 @@ module mvu_axi #(
 			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
 			$finish;
 		end
-		if (ACTIVATION_WIDTH > 9) begin
-			$error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
-			$finish;
-		end
 		if (WEIGHT_WIDTH > 8) begin
 			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
 			$finish;
 		end
-		if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin
-			$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
-			$finish;
+		if (ACTIVATION_WIDTH > 8) begin
+			if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin
+				$error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH);
+				$finish;
+			end
 		end
-		if (MVU_IMPL_STYLE == "mvu_8sx9") begin
+		if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin
 			if (SEGMENTLEN == 0) begin
-				$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
+				$warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
 			end
 			if (SEGMENTLEN > (SIMD+2)/3) begin
 				$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
 				$finish;
 			end
 		end
+		if (!IS_MVU) begin
+			if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin
+				$error("VVU only supported on DSP58 or LUT-based implementation");
+				$finish;
+			end
+		end
 	end
 
 	uwire clk = ap_clk;
@@ -127,10 +137,10 @@ module mvu_axi #(
 	uwire avld;
 	uwire ardy;
 
-	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay (
-		.clk, .rst,
-		.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
-		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
+	replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay (
+	.clk, .rst,
+	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
+	.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
 	);
 
 //-------------------- Input control --------------------\\
@@ -139,37 +149,60 @@ module mvu_axi #(
 	assign ardy = en && s_axis_weights_tvalid;
 	assign s_axis_weights_tready = en && avld;
 
-//-------------------- Core MVU --------------------\\
+//-------------------- Core MVU/VVU --------------------\\
 	uwire ovld;
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
 	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
-
-	case(MVU_IMPL_STYLE)
-	"mvu_8sx9_dsp58":
-		mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+	uwire mvauin_t amvau_i;
+
+	if (IS_MVU) begin : genMVUInput
+		assign  amvau_i = amvau;
+	end : genMVUInput
+	else begin : genVVUInput
+		// The input stream will have the channels interleaved for VVU when PE>1
+		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
+		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
+		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
+		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i)
+		localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH;
+		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
+			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
+									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
+									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
+		end : genRewire
+	end : genVVUInput
+
+	case(COMPUTE_CORE)
+	"mvu_vvu_8sx9_dsp58":
+		mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
-
 	"mvu_4sx4u":
 		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
-
 	"mvu_8sx8u_dsp48":
 		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
+			.vld(ovld), .p(odat)
+		);
+	"mvu_vvu_lut":
+		mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
+			.clk, .rst, .en,
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
 	default: initial begin
-		$error("Unrecognized MVU_IMPL_STYLE '%s'", MVU_IMPL_STYLE);
+		$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
 		$finish;
 	end
 	endcase
@@ -203,7 +236,7 @@ module mvu_axi #(
 
 	assign	b_load = !B.vld || m_axis_output_tready;
 	always_ff @(posedge clk) begin
-		if(rst)		B <= '{ default: 'x };
+		if(rst)		B <= '{ vld: 0, default: 'x };
 		else begin
 			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
 		end
@@ -212,4 +245,4 @@ module mvu_axi #(
 	assign	m_axis_output_tvalid = B.vld;
 	assign	m_axis_output_tdata  = B.dat;
 
-endmodule : mvu_axi
+endmodule : mvu_vvu_axi

From 32d6338c626b26d2e48cdb21cde438d557cc9bcd Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:34:36 +0100
Subject: [PATCH 062/112] [mvu vvu axi]: renamed file for consistency purposes

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 248 +++++++++++++++++++++++++++++++++
 1 file changed, 248 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_vvu_axi.sv

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
new file mode 100644
index 0000000000..07ad32e6c8
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -0,0 +1,248 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-Stream interface wrapper.
+ * @details
+ *	 The following compute cores are supported:
+ *	 - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP,
+ *	 - (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP,
+ *	 - [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP,
+ *	 - 'unconstrained' LUT-based MVU and VVU.
+ *  Folding hints:
+ *	 - PE scaling should divide MH.
+ *	 - SIMD scaling should divide MW.
+ *	 - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to
+ *	   impact critical paths more than PE scaling. PE scaling implies a
+ *	   bigger fanout on the input activations.
+ *	 - Full unfolding along MH (PE=MH) results in no replay buffer instantiated
+ *****************************************************************************/
+
+module mvu_vvu_axi #(
+	bit IS_MVU, // 1 selects the MVU datapath, 0 the VVU datapath; string type causes error in Vivado
+	parameter COMPUTE_CORE, // core selector, matched against string literals; left untyped (presumably because of the Vivado string-type issue - confirm)
+	int unsigned MW, // matrix width, must be a multiple of SIMD
+	int unsigned MH, // matrix height, must be a multiple of PE
+	int unsigned PE, // number of parallel output lanes (core output is PE x ACCU_WIDTH)
+	int unsigned SIMD, // activation elements per input word (times PE in VVU mode)
+	int unsigned ACTIVATION_WIDTH, // bits per activation element
+	int unsigned WEIGHT_WIDTH, // bits per weight element
+	int unsigned ACCU_WIDTH, // bits per output (accumulator) element
+	bit SIGNED_ACTIVATIONS = 0, // forwarded to the DSP58 and LUT-based cores
+	int unsigned SEGMENTLEN = 0, // DSP58 core only; 0 defaults to the chain length (SIMD+2)/3
+	bit FORCE_BEHAVIORAL = 0, // forwarded to the DSP-based cores
+	bit M_REG_LUT = 1, // forwarded as M_REG to the LUT-based core
+
+	// Safely deducible parameters (the *_BA widths are rounded up to a multiple of 8 bits)
+	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
+	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
+	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
+	localparam int unsigned SF = MW/SIMD, // used as LEN of the activation replay buffer
+	localparam int unsigned NF = MH/PE, // used as REP of the activation replay buffer in MVU mode
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+)
+(
+	// Global Control
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,
+
+	// Weight Stream
+	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
+	input	logic  s_axis_weights_tvalid,
+	output	logic  s_axis_weights_tready,
+
+	// Input Stream
+	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
+	input	logic  s_axis_input_tvalid,
+	output	logic  s_axis_input_tready,
+
+	// Output Stream
+	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
+	output	logic  m_axis_output_tvalid,
+	input	logic  m_axis_output_tready
+);
+
+//-------------------- Parameter sanity checks --------------------\\
+	initial begin	// every failed check reports via $error and then ends the run with $finish
+		if (MW % SIMD != 0) begin	// SF = MW/SIMD must be exact
+			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
+			$finish;
+		end
+		if (MH % PE != 0) begin	// NF = MH/PE must be exact
+			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
+			$finish;
+		end
+		if (WEIGHT_WIDTH > 8) begin
+			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
+			$finish;
+		end
+		if (ACTIVATION_WIDTH > 8) begin	// only exception: signed 9-bit activations on the DSP58 core
+			if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin
+				$error("Activation width of %0d-bits exceeds maximum of 8-bits (9-bits only for signed activations on DSP58)", ACTIVATION_WIDTH);
+				$finish;
+			end
+		end
+		if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin	// SEGMENTLEN is only handed to this core
+			if (SEGMENTLEN == 0) begin
+				$warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+			end
+			if (SEGMENTLEN > (SIMD+2)/3) begin
+				$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+				$finish;
+			end
+		end
+		if (!IS_MVU) begin	// VVU mode is restricted to the two cores named mvu_vvu_*
+			if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin
+				$error("VVU only supported on DSP58 or LUT-based implementation");
+				$finish;
+			end
+		end
+	end
+
+	uwire clk = ap_clk;
+	uwire rst = !ap_rst_n;
+
+	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;
+
+	uwire mvauin_t amvau;
+	uwire alast;
+	uwire afin;
+	uwire avld;
+	uwire ardy;
+
+	replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay (
+	.clk, .rst,
+	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
+	.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
+	);
+
+//-------------------- Input control --------------------\\
+	uwire en;
+	uwire istb = avld && s_axis_weights_tvalid;
+	assign ardy = en && s_axis_weights_tvalid;
+	assign s_axis_weights_tready = en && avld;
+
+//-------------------- Core MVU/VVU --------------------\\
+	uwire ovld;
+	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
+	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
+	uwire mvauin_t amvau_i;
+
+	if (IS_MVU) begin : genMVUInput
+		assign  amvau_i = amvau;
+	end : genMVUInput
+	else begin : genVVUInput
+		// The input stream will have the channels interleaved for VVU when PE>1
+		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
+		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
+		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
+		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i); each select is exactly ACTIVATION_WIDTH bits
+		localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH;
+		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
+			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
+									amvau[(i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH +: ACTIVATION_WIDTH]
+									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
+		end : genRewire
+	end : genVVUInput
+
+	case(COMPUTE_CORE)
+	"mvu_vvu_8sx9_dsp58":
+		mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+			.clk, .rst, .en,
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
+			.vld(ovld), .p(odat)
+		);
+	"mvu_4sx4u":
+		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+			.clk, .rst, .en,
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
+			.vld(ovld), .p(odat)
+		);
+	"mvu_8sx8u_dsp48":
+		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+			.clk, .rst, .en,
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
+			.vld(ovld), .p(odat)
+		);
+	"mvu_vvu_lut":
+		mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
+			.clk, .rst, .en,
+			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
+			.vld(ovld), .p(odat)
+		);
+	default: initial begin
+		$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
+		$finish;
+	end
+	endcase
+
+//-------------------- Output register slice --------------------\\
+	struct packed {
+		logic vld;
+		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+	} A = '{ vld: 0, default: 'x};	// first stage: captures the core output (ovld/odat)
+
+	assign en = !A.vld || !ovld;	// core is stalled only while A is occupied and the core presents another result
+
+	uwire  b_load;
+	always_ff @(posedge clk) begin
+		if(rst)		A <= '{ vld: 0, default: 'x };
+		else if(!A.vld || b_load) begin	// A is empty or its content moves on to B in this cycle
+			A.vld <= ovld && en;
+			for(int unsigned  i = 0; i < PE; i++) begin
+				// CR-1148862: presumably a tool workaround - copy through a local instead of the direct assignment below
+				// A.dat[i] <= odat[i];
+				automatic logic [ACCU_WIDTH-1:0]  v = odat[i];
+				A.dat[i] <= v[ACCU_WIDTH-1:0];
+			end
+		end
+	end
+
+	struct packed {
+		logic vld;
+		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+	} B = '{ vld: 0, default: 'x};	// second stage: drives the output stream
+
+	assign	b_load = !B.vld || m_axis_output_tready;	// B loads when it is empty or its word is accepted downstream
+	always_ff @(posedge clk) begin
+		if(rst)		B <= '{ vld: 0, default: 'x };
+		else begin
+			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
+		end
+	end
+
+	assign	m_axis_output_tvalid = B.vld;
+	assign	m_axis_output_tdata  = B.dat;	// B.dat is PE*ACCU_WIDTH bits wide; the port is OUTPUT_STREAM_WIDTH_BA bits wide
+
+endmodule : mvu_vvu_axi

From 031406d73fa36a02638a94affd6a0bef36956c3c Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:39:22 +0100
Subject: [PATCH 063/112] [mvu 8sx9]: added support for VVU on DSP58, resolved
 PyVerilator-caused error and added synthesis directive to handle 'X in input
 data

---
 finn-rtllib/mvu/mvu_8sx9.sv | 100 +++++++++++++++++++-----------------
 1 file changed, 52 insertions(+), 48 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
index 34aa856b1b..52a93739d6 100644
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_8sx9.sv
@@ -31,7 +31,8 @@
  * @brief	Matrix Vector Unit (MVU) core compute kernel utilizing DSP58.
  *****************************************************************************/
 
-module mvu_8sx9 #(
+module mvu_vvu_8sx9 #(
+	parameter IS_MVU,
     int unsigned PE,
     int unsigned SIMD,
     int unsigned ACTIVATION_WIDTH,
@@ -39,7 +40,9 @@ module mvu_8sx9 #(
 	int unsigned ACCU_WIDTH,
     bit SIGNED_ACTIVATIONS = 0,
     int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
-	bit FORCE_BEHAVIORAL = 0
+	bit FORCE_BEHAVIORAL = 0,
+
+	int unsigned  ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
   )
   (
     // Global Control
@@ -51,7 +54,7 @@ module mvu_8sx9 #(
     input   logic last,
     input   logic zero, // ignore current inputs and force this partial product to zero
     input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights
-	input   logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // activations
+	input   logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations
 
 	// Ouput
 	output  logic vld,
@@ -67,9 +70,10 @@ module mvu_8sx9 #(
 //-------------------- Declare global signals --------------------\\
 	localparam int unsigned CHAINLEN = (SIMD+2)/3;
 	localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
-	uwire [26:0] a_in_i [CHAINLEN];
+	localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE;
+	uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN];
 	uwire [23:0] b_in_i [PE][CHAINLEN];
-	uwire [57:0] pcout [PE][CHAINLEN];
+	uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator
 
 //-------------------- Shift register for opmode select signal --------------------\\
 	localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
@@ -99,48 +103,48 @@ module mvu_8sx9 #(
 
 //-------------------- Buffer for input activations --------------------\\
 	localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
+	for (genvar k=0; k<PE_ACTIVATION; k++) begin : genActPE
+		for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
+			localparam int TOTAL_PREGS = i/SEGLEN;
+			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
+			localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3;
 
-	for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
-		localparam int TOTAL_PREGS = i/SEGLEN;
-		localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
-		localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3;
-
-		if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
-			logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0};
-			always_ff @(posedge clk) begin
-				if (rst)     A <= '{default: 0};
-				else if(en) begin
-					A[EXTERNAL_PREGS-1] <= 
-// synthesis translate_off
-						zero ? '1 : 
-// synthesis translate_on						
-						a[3*i +: LANES_OCCUPIED];
-					if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
+			if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
+				logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0};
+				always_ff @(posedge clk) begin
+					if (rst)     A <= '{default: 0};
+					else if(en) begin
+						A[EXTERNAL_PREGS-1] <= 
+	// synthesis translate_off
+							zero ? '1 : 
+	// synthesis translate_on						
+							a[SIMD*k + 3*i +: LANES_OCCUPIED];
+						if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
+					end
 				end
-			end
-			for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-			assign a_in_i[i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
-												: PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
-			end : genAin
-			for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
-				assign a_in_i[i][9*j +: 9] = 9'b0;
-			end : genAinZero
-		end : genExternalPregAct
-		else begin : genInpDSPAct
-			for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-				assign a_in_i[i][9*j +: 9] = 
-// synthesis translate_off
-					zero ? '1 : 				
-// synthesis translate_on
-					SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{a[3*i+j][ACTIVATION_WIDTH-1]}}, a[3*i+j] }
-												: PAD_BITS_ACT == 0 ? a[3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[3*i+j] } ;
-			end : genAin
-			for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
-				assign a_in_i[i][9*j +: 9] = 9'b0;
-			end : genAinZero
-		end : genInpDSPAct
-
-	end : genActSIMD
+				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+				assign a_in_i[CHAINLEN*k+i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
+													  : PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
+				end : genAin
+				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0;
+				end : genAinZero
+			end : genExternalPregAct
+			else begin : genInpDSPAct
+				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 
+	// synthesis translate_off
+						zero ? '1 : 				
+	// synthesis translate_on
+						SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{a[SIMD*k+3*i+j][ACTIVATION_WIDTH-1]}}, a[SIMD*k+3*i+j] }
+													: PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[SIMD*k+3*i+j] } ;
+				end : genAin
+				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0;
+				end : genAinZero
+			end : genInpDSPAct
+		end : genActSIMD
+	end : genActPE
 
 //-------------------- Buffer for weights --------------------\\
 	localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;
@@ -209,7 +213,7 @@ module mvu_8sx9 #(
 				always_ff @(posedge clk) begin
 					if (rst)	Areg <= '{ default : 0};
 					else if (en) begin
-						Areg[0] <= { 7'bx, a_in_i[j] };
+						Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] };
 						if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0];
 					end
 				end
@@ -384,7 +388,7 @@ module mvu_8sx9 #(
 							7'b000_0000
 					}), // 9-bit input: Operation mode
 					// Data inputs: Data Ports
-					.A({ 7'bx, a_in_i[j] }),            // 34-bit input: A data
+					.A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }),            // 34-bit input: A data
 					.B(b_in_i[i][j]),                   // 24-bit input: B data
 					.C('x),                             // 58-bit input: C data
 					.CARRYIN('0),                       // 1-bit input: Carry-in
@@ -420,4 +424,4 @@ module mvu_8sx9 #(
 		end : genDSPChain
 	end : genDSPPE
 
-endmodule : mvu_8sx9
+endmodule : mvu_vvu_8sx9

From e2c1f1589c374a2fd7d0eb17621568621ea88bda Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:39:52 +0100
Subject: [PATCH 064/112] [mvu vvu 8sx9]: renamed compute core for consistency

---
 finn-rtllib/mvu/mvu_vvu_8sx9.sv | 427 ++++++++++++++++++++++++++++++++
 1 file changed, 427 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_vvu_8sx9.sv

diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9.sv b/finn-rtllib/mvu/mvu_vvu_8sx9.sv
new file mode 100644
index 0000000000..52a93739d6
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_vvu_8sx9.sv
@@ -0,0 +1,427 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Matrix Vector Unit (MVU) / VVU core compute kernel utilizing DSP58 (mode selected by IS_MVU).
+ *****************************************************************************/
+
+module mvu_vvu_8sx9 #(
+	parameter IS_MVU,
+    int unsigned PE,
+    int unsigned SIMD,
+    int unsigned ACTIVATION_WIDTH,
+    int unsigned WEIGHT_WIDTH, // width of one weight; weights are sign-extended to 8 bits per lane
+	int unsigned ACCU_WIDTH, // width of each PE's output, taken from the low bits of the 58-bit result
+    bit SIGNED_ACTIVATIONS = 0, // 1: activations are sign-extended to 9 bits per lane, 0: zero-extended
+    int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
+	bit FORCE_BEHAVIORAL = 0,
+
+	int unsigned  ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
+  )
+  (
+    // Global Control
+	input   logic clk,
+    input   logic rst,
+    input   logic en,
+
+	// Input
+    input   logic last, // flags the final input of an accumulation; a delayed copy becomes vld
+    input   logic zero, // ignore current inputs and force this partial product to zero
+    input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights
+	input   logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations (one SIMD vector if IS_MVU, else one per PE)
+
+	// Output
+	output  logic vld,
+    output  logic [PE-1:0][ACCU_WIDTH-1:0] p // one accumulated result per PE
+  );
+	// for verilator always use behavioral code
+	localparam bit  BEHAVIORAL =
+`ifdef VERILATOR
+		1 ||
+`endif
+		FORCE_BEHAVIORAL;
+
+//-------------------- Declare global signals --------------------\\
+	localparam int unsigned CHAINLEN = (SIMD+2)/3; // DSPs per PE chain: each DSP takes up to 3 SIMD lanes
+	localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
+	localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE; // number of distinct activation vectors (shared across PEs if IS_MVU)
+	uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN]; // per DSP: 3 lanes x 9 bits of extended activations
+	uwire [23:0] b_in_i [PE][CHAINLEN]; // per DSP: 3 lanes x 8 bits of sign-extended weights
+	uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator
+
+//-------------------- Shift register for opmode select signal --------------------\\
+	localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
+	logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric)
+
+	always_ff @(posedge clk) begin
+		if(rst)     L <= '{default: 0};
+		else if(en) begin
+			L[1+MAX_PIPELINE_STAGES] <= last; // new flag enters at the highest index ...
+			L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES]; // ... and moves one index down per enabled cycle
+		end
+	end
+	assign vld = L[0];
+
+//-------------------- Shift register for ZERO flag --------------------\\
+	logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric)
+
+	if (MAX_PIPELINE_STAGES > 1) begin : genZreg
+		always_ff @(posedge clk) begin
+			if (rst)      Z <= '{default: 0};
+			else if(en) begin
+				Z[0] <= zero;
+				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3];
+			end
+		end
+	end;
+
+//-------------------- Buffer for input activations --------------------\\
+	localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; // extension bits needed to fill a 9-bit activation lane
+	for (genvar k=0; k<PE_ACTIVATION; k++) begin : genActPE
+		for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
+			localparam int TOTAL_PREGS = i/SEGLEN; // index of the segment this DSP belongs to
+			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0; // stages built from registers declared here (TOTAL_PREGS minus one)
+			localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3; // the last DSP of the chain may use fewer than 3 lanes
+
+			if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
+				logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0};
+				always_ff @(posedge clk) begin
+					if (rst)     A <= '{default: 0};
+					else if(en) begin
+						A[EXTERNAL_PREGS-1] <= 
+	// synthesis translate_off
+							zero ? '1 : 
+	// synthesis translate_on						
+							a[SIMD*k + 3*i +: LANES_OCCUPIED];
+						if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
+					end
+				end
+				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+				assign a_in_i[CHAINLEN*k+i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
+													  : PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
+				end : genAin
+				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0; // unused lanes are tied to zero
+				end : genAinZero
+			end : genExternalPregAct
+			else begin : genInpDSPAct
+				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 
+	// synthesis translate_off
+						zero ? '1 : 				
+	// synthesis translate_on
+						SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{a[SIMD*k+3*i+j][ACTIVATION_WIDTH-1]}}, a[SIMD*k+3*i+j] }
+													: PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[SIMD*k+3*i+j] } ;
+				end : genAin
+				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0; // unused lanes are tied to zero
+				end : genAinZero
+			end : genInpDSPAct
+		end : genActSIMD
+	end : genActPE
+
+//-------------------- Buffer for weights --------------------\\
+	localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH; // sign-extension bits needed to fill an 8-bit weight lane
+
+	for (genvar i=0; i<PE; i++) begin : genWeightPE
+		for (genvar j=0; j<CHAINLEN; j++) begin : genWeightSIMD
+			localparam int TOTAL_PREGS = j/SEGLEN; // index of the segment this DSP belongs to
+			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0; // stages built from registers declared here (TOTAL_PREGS minus one)
+			localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3; // the last DSP of the chain may use fewer than 3 lanes
+
+			if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
+				logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0}; // NOTE(review): declared once per PE iteration with a PE dimension, but only B[i] is written and read here
+				always_ff @(posedge clk) begin
+					if (rst)    B <= '{default: 0};
+					else if (en) begin
+						B[i][EXTERNAL_PREGS-1] <= 
+// synthesis translate_off
+							zero ? '1 : 						
+// synthesis translate_on							
+							w[i][3*j +: LANES_OCCUPIED];
+						if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1];
+					end
+				end
+				for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin
+					assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] };
+				end : genBin
+				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
+					assign b_in_i[i][j][8*k +: 8] = 8'b0; // unused lanes are tied to zero
+				end : genBinZero
+			end : genExternalPregWeight
+			else begin : genInpDSPWeight
+				for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin
+					assign b_in_i[i][j][8*k +: 8] = 
+// synthesis translate_off					
+						zero ? '1 : 
+// synthesis translate_on					
+						PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
+				end : genBin
+				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
+					assign b_in_i[i][j][8*k +: 8] = 8'b0; // unused lanes are tied to zero
+				end : genBinZero
+			end : genInpDSPWeight
+		end : genWeightSIMD
+	end : genWeightPE
+
+//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\
+	for (genvar i=0; i<PE; i++) begin : genDSPPE
+		for (genvar j=0; j<CHAINLEN; j++) begin : genDSPChain
+			localparam int TOTAL_PREGS = j/SEGLEN; // index of the segment this DSP belongs to
+			localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1; // number of A/B input register stages used inside the DSP (AREG/BREG)
+			localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1; // P is registered at the end of each segment and at the end of the chain
+			localparam bit FIRST = j == 0;
+			localparam bit LAST = j == CHAINLEN-1;
+			uwire [57:0] pp;
+
+			if (LAST) begin : genPOUT
+				assign p[i] = pp[ACCU_WIDTH-1:0]; // only the last DSP of a chain drives the PE output
+			end
+
+			// Note: Since the product B * AD is computed,
+			//       rst can be only applied to AD and zero only to B
+			//       with the same effect as zeroing both.
+			if(BEHAVIORAL) begin : genBehav
+				// Stage #1: Input A/B
+				logic signed [33:0] Areg [INTERNAL_PREGS];
+				always_ff @(posedge clk) begin
+					if (rst)	Areg <= '{ default : 0};
+					else if (en) begin
+						Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }; // IS_MVU: every PE reads the shared activations, otherwise PE i reads its own
+						if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0];
+					end
+				end
+				logic signed [23:0] Breg [INTERNAL_PREGS];
+				always_ff @(posedge clk) begin
+					if (rst)	Breg <= '{ default : 0};
+					else if (en) begin
+						Breg[0] <= b_in_i[i][j];
+						if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0];
+					end
+				end
+
+				// Stage #2: Multiply-Accumulate
+				logic signed [57:0] Mreg;
+				logic InmodeZero = 0;
+				always_ff @(posedge clk) begin
+					if (rst)		InmodeZero <= 0;
+					else if (en)	InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero );
+				end
+				always_ff @(posedge clk) begin
+					if (rst)	Mreg <= 0;
+					else if (en) begin
+						automatic logic signed [57:0] m = 0;
+						for (int k = 0; k < 3; k++) begin // sum of the three signed 9-bit x 8-bit lane products
+							m = m + (InmodeZero ? 0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8]));
+						end
+						Mreg <= m;
+					end
+				end
+
+				// Stage #3: Accumulate
+				logic signed [57:0] Preg;
+				logic Opmode = 0;
+				if (FIRST && !LAST) begin : genFirst
+					if (PREG) begin : genPregBehav
+						always_ff @(posedge clk) begin
+							if (rst)		Preg <= 0;
+							else if (en)	Preg <= Mreg;
+						end
+					end
+					else	assign Preg = Mreg;
+				end
+				else if (FIRST && LAST) begin : genSingle
+					always_ff @(posedge clk) begin
+						if (rst)		Opmode <= 0;
+						else if (en)	Opmode <= L[1];
+					end
+					always_ff @(posedge clk) begin
+						if (rst) 		Preg <= 0;
+						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg; // Opmode set: start a new accumulation instead of adding to Preg
+					end
+				end
+				else if (!FIRST && LAST) begin : genLast
+					always_ff @(posedge clk) begin
+						if (rst)		Opmode <= 0;
+						else if (en)	Opmode <= L[1];
+					end
+					always_ff @(posedge clk) begin
+						if (rst) 		Preg <= 0;
+						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1]; // as above, plus the partial sum cascaded from the previous DSP
+					end
+				end
+				else begin : genMid
+					if (PREG) begin : genPregBehav
+						always_ff @(posedge clk) begin
+							if (rst)		Preg <= 0;
+							else if (en)	Preg <= Mreg + pcout[i][j-1];
+						end
+					end
+					else	assign Preg = Mreg + pcout[i][j-1];
+				end
+				assign pp = Preg;
+				assign pcout[i][j] = Preg;
+			end : genBehav
+`ifndef VERILATOR
+			else begin: genDSP
+				DSP58 #(
+					// Feature Control Attributes: Data Path Selection
+					.AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
+					.A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+					.BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
+					.B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+					.DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
+														// legacy mode.
+					.PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
+					.RND(58'h000000000000000),          // Rounding Constant
+					.USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+					.USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
+					.USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
+					.XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
+					// Pattern Detector Attributes: Pattern Detection Configuration
+					.AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+					.AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
+					.MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
+					.PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
+					.SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+					.SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
+					.USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
+					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+					.IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
+					.IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
+					.IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
+					.IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
+					.IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
+					.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0
+										FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN
+										2'b01, // Y : M
+										2'b01  // X: M
+					}), // Optional inversion for OPMODE
+					.IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
+					.IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
+					.IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
+					.IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
+					.IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for RSTCTRL
+					.IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
+					.IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
+					.IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
+					.IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
+					.IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
+					// Register Control Attributes: Pipeline Register Configuration
+					.ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+					.ADREG(0),                          // Pipeline stages for pre-adder (0-1)
+					.ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
+					.AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
+					.BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+					.BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
+					.CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
+					.CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
+					.CREG(0),                           // Pipeline stages for C (0-1)
+					.DREG(0),                           // Pipeline stages for D (0-1)
+					.INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
+					.MREG(1),                           // Multiplier pipeline stages (0-1)
+					.OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
+					.PREG(PREG),                        // Number of pipeline stages for P (0-1)
+					.RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
+				)
+				DSP58_inst (
+					// Cascade outputs: Cascade Ports
+					.ACOUT(),                           // 34-bit output: A port cascade
+					.BCOUT(),                           // 24-bit output: B cascade
+					.CARRYCASCOUT(),                    // 1-bit output: Cascade carry
+					.MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
+					.PCOUT(pcout[i][j]),                // 58-bit output: Cascade output
+					// Control outputs: Control Inputs/Status Bits
+					.OVERFLOW(),                        // 1-bit output: Overflow in add/acc
+					.PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
+					.PATTERNDETECT(),                   // 1-bit output: Pattern detect
+					.UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
+					// Data outputs: Data Ports
+					.CARRYOUT(),                        // 4-bit output: Carry
+					.P(pp),                             // 58-bit output: Primary data
+					.XOROUT(),                          // 8-bit output: XOR data
+					// Cascade inputs: Cascade Ports
+					.ACIN('x),                          // 34-bit input: A cascade data
+					.BCIN('x),                          // 24-bit input: B cascade
+					.CARRYCASCIN('x),                   // 1-bit input: Cascade carry
+					.MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
+					.PCIN(FIRST ? 'x : pcout[i][j-1]),  // 58-bit input: P cascade
+					// Control inputs: Control Inputs/Status Bits
+					.ALUMODE(4'h0),                     // 4-bit input: ALU control
+					.CARRYINSEL('0),                    // 3-bit input: Carry select
+					.CLK(clk),                          // 1-bit input: Clock
+					.INMODE({
+							INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
+							2'b00,
+							TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
+							INTERNAL_PREGS==2 ? 1'b0 : 1'b1
+					}),                                 // 5-bit input: INMODE control
+					.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
+					.OPMODE({
+							LAST ? {1'b0, L[1]} : 2'b00,
+							7'b000_0000
+					}), // 9-bit input: Operation mode
+					// Data inputs: Data Ports
+					.A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }),            // 34-bit input: A data
+					.B(b_in_i[i][j]),                   // 24-bit input: B data
+					.C('x),                             // 58-bit input: C data
+					.CARRYIN('0),                       // 1-bit input: Carry-in
+					.D('x),                             // 27-bit input: D data
+					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+					.ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
+					.CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
+					.CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
+					.CEAD('0),                          // 1-bit input: Clock enable for ADREG
+					.CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
+					.CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
+					.CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
+					.CEC('0),                           // 1-bit input: Clock enable for CREG
+					.CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
+					.CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+					.CED('0),                           // 1-bit input: Clock enable for DREG
+					.CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
+					.CEM(en),                           // 1-bit input: Clock enable for MREG
+					.CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
+					.RSTA(rst),                         // 1-bit input: Reset for AREG
+					.RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
+					.RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
+					.RSTB(rst),                         // 1-bit input: Reset for BREG
+					.RSTC('0),                          // 1-bit input: Reset for CREG
+					.RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+					.RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
+					.RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
+					.RSTM(rst),                         // 1-bit input: Reset for MREG
+					.RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
+				);
+			end : genDSP
+`endif
+		end : genDSPChain
+	end : genDSPPE
+
+endmodule : mvu_vvu_8sx9

From adb58694be36bd0fa2e8558f760d1642f14a2a38 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:58:20 +0100
Subject: [PATCH 065/112] [axi wrapper]: changed parameter to localparam

---
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
index 788e49a71b..270fe7351f 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -46,9 +46,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter	FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$,
 
 	// Safely deducible parameters
-	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	parameter 	INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
-	parameter 	OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+	localparam	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	localparam 	INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
+	localparam 	OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
 )(
 	// Global Control
 	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)

From f54d438f78fe4ce78c84fdd7bcbc514048bd2fe0 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 16:59:32 +0100
Subject: [PATCH 066/112] [axi]: added support for LUT-based VVU

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 07ad32e6c8..ff677fc244 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -195,8 +195,8 @@ module mvu_vvu_axi #(
 			.vld(ovld), .p(odat)
 		);
 	"mvu_vvu_lut":
-		mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
+		mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
 			.clk, .rst, .en,
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)

From a4e2ac7146afeab4271344785f638c88cf78da73 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 17:00:07 +0100
Subject: [PATCH 067/112] [mvu vvu 8sx9]: minor change to list of generics

---
 finn-rtllib/mvu/mvu_vvu_8sx9.sv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9.sv b/finn-rtllib/mvu/mvu_vvu_8sx9.sv
index 52a93739d6..2aa9d71b6c 100644
--- a/finn-rtllib/mvu/mvu_vvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_vvu_8sx9.sv
@@ -32,7 +32,7 @@
  *****************************************************************************/
 
 module mvu_vvu_8sx9 #(
-	parameter IS_MVU,
+	bit IS_MVU,
     int unsigned PE,
     int unsigned SIMD,
     int unsigned ACTIVATION_WIDTH,
@@ -42,7 +42,7 @@ module mvu_vvu_8sx9 #(
     int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
 	bit FORCE_BEHAVIORAL = 0,
 
-	int unsigned  ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
+	localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
   )
   (
     // Global Control

From 40ad0b46c03b10b47ec4d72dd04a4ad96149fa89 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 17:00:51 +0100
Subject: [PATCH 068/112] [mvu lut]: added support for VVU

---
 finn-rtllib/mvu/mvu_lut.sv | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_lut.sv b/finn-rtllib/mvu/mvu_lut.sv
index b100a589e8..c100910d75 100644
--- a/finn-rtllib/mvu/mvu_lut.sv
+++ b/finn-rtllib/mvu/mvu_lut.sv
@@ -1,13 +1,15 @@
-module mvu_lut #(
-	int unsigned  PE,
-	int unsigned  SIMD,
+module mvu_vvu_lut #(
+    bit IS_MVU,
+    int unsigned  PE,
+    int unsigned  SIMD,
 	int unsigned  ACCU_WIDTH,
     int unsigned  ACTIVATION_WIDTH,
     int unsigned  WEIGHT_WIDTH,
     bit  SIGNED_ACTIVATIONS,
     bit  M_REG = 1,
 
-    localparam unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH
+    localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH,
+    localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
 )(
 	// Global Control
 	input	logic  clk,
@@ -17,8 +19,8 @@ module mvu_lut #(
 	// Input
 	input	logic  last,
 	input	logic  zero,	// ignore current inputs and force this partial product to zero
-	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]      w,	// signed weights
-	input	logic                [SIMD-1:0][ACTIVATION_WIDTH-1:0]  a,	// (un)signed activations
+	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]             w,	// signed weights
+	input	logic        [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0]  a,	// (un)signed activations
 
 	// Ouput
 	output	logic  vld,
@@ -63,16 +65,16 @@ module mvu_lut #(
                 always_ff @(posedge clk) begin
                     if(rst)         M[j] = '{ default : 0 };
                     else if (en)    M[j] = zero ? 0 :
-                                            SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) :
-                                                                 $signed({1'b0, a[j]}) * $signed(w[i][j]); 
-                    // (SIGNED_ACTIVATIONS ? $signed(a[j]) : a[j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication
+                                            SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
+                                                                 $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); 
+                    // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication
                 end
                 assign  m1[j] = M[j];
             end : genMreg
             else begin : genNoMreg 
                 assign m1[j] = zero ? 0 :
-                               SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) :
-                                                    $signed({1'b0, a[j]}) * $signed(w[i][j]);
+                               SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
+                                                    $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]);
             end : genNoMreg
         end : genSIMD
 
@@ -99,4 +101,4 @@ module mvu_lut #(
         assign  p[i] = P2[i];
     end : genPE
 
-endmodule : mvu_lut
+endmodule : mvu_vvu_lut

From 30fcb5b734f86d0032549a4efe29d96b13ee5451 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 17:01:10 +0100
Subject: [PATCH 069/112] [mvu vvu lut]: renamed file for consistency

---
 finn-rtllib/mvu/mvu_vvu_lut.sv | 104 +++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 finn-rtllib/mvu/mvu_vvu_lut.sv

diff --git a/finn-rtllib/mvu/mvu_vvu_lut.sv b/finn-rtllib/mvu/mvu_vvu_lut.sv
new file mode 100644
index 0000000000..c100910d75
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_vvu_lut.sv
@@ -0,0 +1,104 @@
+module mvu_vvu_lut #(	// Multiplier/adder-tree compute core: per PE, a dot product over SIMD elements accumulated until `last`
+    bit IS_MVU,	// 1: all PEs share one activation vector, 0: each PE reads its own SIMD activations
+    int unsigned  PE,
+    int unsigned  SIMD,
+	int unsigned  ACCU_WIDTH,
+    int unsigned  ACTIVATION_WIDTH,
+    int unsigned  WEIGHT_WIDTH,
+    bit  SIGNED_ACTIVATIONS,
+    bit  M_REG = 1,	// 1: register the products before the adder tree (adds one cycle of latency)
+
+    localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH,
+    localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
+)(
+	// Global Control
+	input	logic  clk,
+	input	logic  rst,
+	input	logic  en,
+
+	// Input
+	input	logic  last,	// flags the final input of an accumulation
+	input	logic  zero,	// ignore current inputs and force this partial product to zero
+	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]             w,	// signed weights
+	input	logic        [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0]  a,	// (un)signed activations
+
+	// Output
+	output	logic  vld,	// high while p holds a completed accumulation
+	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
+);
+
+    typedef int unsigned  leave_load_t[2*SIMD-1];
+    function leave_load_t init_leave_loads();	// number of leaves (products) below each node of the adder tree
+        automatic leave_load_t  res;
+        for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;	// leaf nodes
+        for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];	// inner nodes: sum of both children
+        return res;
+    endfunction : init_leave_loads
+
+    // Pipeline for last indicator flag
+    uwire last_i;
+    generate if (M_REG) begin
+        logic [0:1] L = '0;
+        always_ff @(posedge clk) begin
+            if(rst)       L <= '0;
+            else if (en)  L <= {last, L[0]};	// two stages: product register + accumulator
+        end
+        assign  last_i = L[1];
+    end
+    else begin 
+        logic L = '0;
+        always_ff @(posedge clk) begin
+            if(rst)       L <= '0;
+            else if (en)  L <= last;	// one stage: accumulator only
+        end
+        assign  last_i = L;
+    end
+    endgenerate
+    assign  vld = last_i;	// driven once here: a continuous assignment per PE would put multiple drivers on vld
+
+    // For each PE generate
+    for (genvar  i = 0; i < PE; i++)  begin : genPE
+        // Stage #1: SIMD multipliers in parallel
+        uwire [MULT_WIDTH-1 : 0] m1 [SIMD];
+        for (genvar j = 0; j < SIMD; j++) begin : genSIMD
+            if (M_REG) begin : genMreg
+                logic [MULT_WIDTH-1 : 0] M [SIMD];
+                always_ff @(posedge clk) begin	// non-blocking: the accumulator below samples the old products on the same edge
+                    if(rst)         M[j] <= '{ default : 0 };
+                    else if (en)    M[j] <= zero ? 0 :
+                                            SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
+                                                                 $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); 
+                    // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication
+                end
+                assign  m1[j] = M[j];
+            end : genMreg
+            else begin : genNoMreg 
+                assign m1[j] = zero ? 0 :
+                               SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
+                                                    $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]);
+            end : genNoMreg
+        end : genSIMD
+
+        // Stage #2: Adder tree to reduce SIMD products
+        localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 };
+        localparam int unsigned  ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1));
+        uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;	// heap layout: node n has children 2n+1 and 2n+2, leaves start at SIMD-1
+        for(genvar s = 0; s < SIMD; s++)  assign  tree[SIMD-1+s] = $signed(m1[s]);
+        for(genvar n = 0; n < SIMD-1; n++) begin
+            // Sum truncated to actual maximum bit width at this node
+            localparam int unsigned  NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1));
+            uwire signed [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
+            assign tree[n] = s;
+        end
+
+        // Stage #3: Buffer output
+        logic [ACCU_WIDTH-1:0] P2 [PE];
+        always_ff @(posedge clk) begin	// non-blocking, as for every clocked register
+            if(rst)         P2[i] <= '{ default : 0};
+            else if (en)    P2[i] <= (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]);	// last_i: completed result is on p, restart from the new sum
+        end
+
+        assign  p[i] = P2[i];
+    end : genPE
+
+endmodule : mvu_vvu_lut

From cb434386fa8bf6f63964dd889c8025c3e9616a6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Thu, 21 Sep 2023 15:58:34 +0100
Subject: [PATCH 070/112] Revert to proper address truncation without
 generation bit.

---
 finn-rtllib/mvu/replay_buffer.sv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
index d4342f705c..3e2766f63d 100644
--- a/finn-rtllib/mvu/replay_buffer.sv
+++ b/finn-rtllib/mvu/replay_buffer.sv
@@ -144,8 +144,8 @@ module replay_buffer #(
 		uwire  wr = irdy && ivld;
 		uwire  rd = !OVld || ordy;
 		always_ff @(posedge clk) begin
-			if(wr)  Mem[WP[AWIDTH:0]] <= idat;
-			if(rd)  ODat <= Mem[RP[AWIDTH:0]];
+			if(wr)  Mem[WP[AWIDTH-1:0]] <= idat;
+			if(rd)  ODat <= Mem[RP[AWIDTH-1:0]];
 		end
 
 		uwire  vld = (RP != WP);

From b4b69f3fa7caae4be9357abf596aff4a66561228 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 17:04:05 +0100
Subject: [PATCH 071/112] remove deleted/renamed files

---
 finn-rtllib/mvu/mvu_8sx9.sv            | 427 -------------------------
 finn-rtllib/mvu/mvu_8sx9_axi.sv        | 179 -----------
 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv     | 208 ------------
 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v |  93 ------
 finn-rtllib/mvu/mvu_8sx9_tb.sv         | 165 ----------
 finn-rtllib/mvu/mvu_axi.sv             | 248 --------------
 finn-rtllib/mvu/mvu_axi_wrapper.v      |  92 ------
 finn-rtllib/mvu/mvu_lut.sv             | 104 ------
 finn-rtllib/mvu/tb/mvu_axi_tb.sv       | 215 -------------
 9 files changed, 1731 deletions(-)
 delete mode 100644 finn-rtllib/mvu/mvu_8sx9.sv
 delete mode 100644 finn-rtllib/mvu/mvu_8sx9_axi.sv
 delete mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
 delete mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
 delete mode 100644 finn-rtllib/mvu/mvu_8sx9_tb.sv
 delete mode 100644 finn-rtllib/mvu/mvu_axi.sv
 delete mode 100644 finn-rtllib/mvu/mvu_axi_wrapper.v
 delete mode 100644 finn-rtllib/mvu/mvu_lut.sv
 delete mode 100644 finn-rtllib/mvu/tb/mvu_axi_tb.sv

diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv
deleted file mode 100644
index 52a93739d6..0000000000
--- a/finn-rtllib/mvu/mvu_8sx9.sv
+++ /dev/null
@@ -1,427 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Matrix Vector Unit (MVU) core compute kernel utilizing DSP58.
- *****************************************************************************/
-
-module mvu_vvu_8sx9 #(
-	parameter IS_MVU,
-    int unsigned PE,
-    int unsigned SIMD,
-    int unsigned ACTIVATION_WIDTH,
-    int unsigned WEIGHT_WIDTH,
-	int unsigned ACCU_WIDTH,
-    bit SIGNED_ACTIVATIONS = 0,
-    int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
-	bit FORCE_BEHAVIORAL = 0,
-
-	int unsigned  ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
-  )
-  (
-    // Global Control
-	input   logic clk,
-    input   logic rst,
-    input   logic en,
-
-	// Input
-    input   logic last,
-    input   logic zero, // ignore current inputs and force this partial product to zero
-    input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights
-	input   logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations
-
-	// Ouput
-	output  logic vld,
-    output  logic [PE-1:0][ACCU_WIDTH-1:0] p
-  );
-	// for verilator always use behavioral code
-	localparam bit  BEHAVIORAL =
-`ifdef VERILATOR
-		1 ||
-`endif
-		FORCE_BEHAVIORAL;
-
-//-------------------- Declare global signals --------------------\\
-	localparam int unsigned CHAINLEN = (SIMD+2)/3;
-	localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
-	localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE;
-	uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN];
-	uwire [23:0] b_in_i [PE][CHAINLEN];
-	uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator
-
-//-------------------- Shift register for opmode select signal --------------------\\
-	localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
-	logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric)
-
-	always_ff @(posedge clk) begin
-		if(rst)     L <= '{default: 0};
-		else if(en) begin
-			L[1+MAX_PIPELINE_STAGES] <= last;
-			L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES];
-		end
-	end
-	assign vld = L[0];
-
-//-------------------- Shift register for ZERO flag --------------------\\
-	logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric)
-
-	if (MAX_PIPELINE_STAGES > 1) begin : genZreg
-		always_ff @(posedge clk) begin
-			if (rst)      Z <= '{default: 0};
-			else if(en) begin
-				Z[0] <= zero;
-				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3];
-			end
-		end
-	end;
-
-//-------------------- Buffer for input activations --------------------\\
-	localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
-	for (genvar k=0; k<PE_ACTIVATION; k++) begin : genActPE
-		for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
-			localparam int TOTAL_PREGS = i/SEGLEN;
-			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
-			localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3;
-
-			if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
-				logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0};
-				always_ff @(posedge clk) begin
-					if (rst)     A <= '{default: 0};
-					else if(en) begin
-						A[EXTERNAL_PREGS-1] <= 
-	// synthesis translate_off
-							zero ? '1 : 
-	// synthesis translate_on						
-							a[SIMD*k + 3*i +: LANES_OCCUPIED];
-						if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
-					end
-				end
-				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-				assign a_in_i[CHAINLEN*k+i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
-													  : PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ;
-				end : genAin
-				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
-					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0;
-				end : genAinZero
-			end : genExternalPregAct
-			else begin : genInpDSPAct
-				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 
-	// synthesis translate_off
-						zero ? '1 : 				
-	// synthesis translate_on
-						SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{a[SIMD*k+3*i+j][ACTIVATION_WIDTH-1]}}, a[SIMD*k+3*i+j] }
-													: PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[SIMD*k+3*i+j] } ;
-				end : genAin
-				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
-					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0;
-				end : genAinZero
-			end : genInpDSPAct
-		end : genActSIMD
-	end : genActPE
-
-//-------------------- Buffer for weights --------------------\\
-	localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;
-
-	for (genvar i=0; i<PE; i++) begin : genWeightPE
-		for (genvar j=0; j<CHAINLEN; j++) begin : genWeightSIMD
-			localparam int TOTAL_PREGS = j/SEGLEN;
-			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
-			localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3;
-
-			if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
-				logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0};
-				always_ff @(posedge clk) begin
-					if (rst)    B <= '{default: 0};
-					else if (en) begin
-						B[i][EXTERNAL_PREGS-1] <= 
-// synthesis translate_off
-							zero ? '1 : 						
-// synthesis translate_on							
-							w[i][3*j +: LANES_OCCUPIED];
-						if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1];
-					end
-				end
-				for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin
-					assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] };
-				end : genBin
-				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
-					assign b_in_i[i][j][8*k +: 8] = 8'b0;
-				end : genBinZero
-			end : genExternalPregWeight
-			else begin : genInpDSPWeight
-				for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin
-					assign b_in_i[i][j][8*k +: 8] = 
-// synthesis translate_off					
-						zero ? '1 : 
-// synthesis translate_on					
-						PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
-				end : genBin
-				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
-					assign b_in_i[i][j][8*k +: 8] = 8'b0;
-				end : genBinZero
-			end : genInpDSPWeight
-		end : genWeightSIMD
-	end : genWeightPE
-
-//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\
-	for (genvar i=0; i<PE; i++) begin : genDSPPE
-		for (genvar j=0; j<CHAINLEN; j++) begin : genDSPChain
-			localparam int TOTAL_PREGS = j/SEGLEN;
-			localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1; // 1 : 0
-			localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1;
-			localparam bit FIRST = j == 0;
-			localparam bit LAST = j == CHAINLEN-1;
-			uwire [57:0] pp;
-
-			if (LAST) begin : genPOUT
-				assign p[i] = pp[ACCU_WIDTH-1:0];
-			end
-
-			// Note: Since the product B * AD is computed,
-			//       rst can be only applied to AD and zero only to B
-			//       with the same effect as zeroing both.
-			if(BEHAVIORAL) begin : genBehav
-				// Stage #1: Input A/B
-				logic signed [33:0] Areg [INTERNAL_PREGS];
-				always_ff @(posedge clk) begin
-					if (rst)	Areg <= '{ default : 0};
-					else if (en) begin
-						Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] };
-						if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0];
-					end
-				end
-				logic signed [23:0] Breg [INTERNAL_PREGS];
-				always_ff @(posedge clk) begin
-					if (rst)	Breg <= '{ default : 0};
-					else if (en) begin
-						Breg[0] <= b_in_i[i][j];
-						if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0];
-					end
-				end
-
-				// Stage #2: Multiply-Accumulate
-				logic signed [57:0] Mreg;
-				logic InmodeZero = 0;
-				always_ff @(posedge clk) begin
-					if (rst)		InmodeZero <= 0;
-					else if (en)	InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero );
-				end
-				always_ff @(posedge clk) begin
-					if (rst)	Mreg <= 0;
-					else if (en) begin
-						automatic logic signed [57:0] m = 0;
-						for (int k = 0; k < 3; k++) begin
-							m = m + (InmodeZero ? 0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8]));
-						end
-						Mreg <= m;
-					end
-				end
-
-				// Stage #3: Accumulate
-				logic signed [57:0] Preg;
-				logic Opmode = 0;
-				if (FIRST && !LAST) begin : genFirst
-					if (PREG) begin : genPregBehav
-						always_ff @(posedge clk) begin
-							if (rst)		Preg <= 0;
-							else if (en)	Preg <= Mreg;
-						end
-					end
-					else	assign Preg = Mreg;
-				end
-				else if (FIRST && LAST) begin : genSingle
-					always_ff @(posedge clk) begin
-						if (rst)		Opmode <= 0;
-						else if (en)	Opmode <= L[1];
-					end
-					always_ff @(posedge clk) begin
-						if (rst) 		Preg <= 0;
-						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg;
-					end
-				end
-				else if (!FIRST && LAST) begin : genLast
-					always_ff @(posedge clk) begin
-						if (rst)		Opmode <= 0;
-						else if (en)	Opmode <= L[1];
-					end
-					always_ff @(posedge clk) begin
-						if (rst) 		Preg <= 0;
-						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1];
-					end
-				end
-				else begin : genMid
-					if (PREG) begin : genPregBehav
-						always_ff @(posedge clk) begin
-							if (rst)		Preg <= 0;
-							else if (en)	Preg <= Mreg + pcout[i][j-1];
-						end
-					end
-					else	assign Preg = Mreg + pcout[i][j-1];
-				end
-				assign pp = Preg;
-				assign pcout[i][j] = Preg;
-			end : genBehav
-`ifndef VERILATOR
-			else begin: genDSP
-				DSP58 #(
-					// Feature Control Attributes: Data Path Selection
-					.AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
-					.A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
-					.BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
-					.B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
-					.DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
-														// legacy mode.
-					.PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
-					.RND(58'h000000000000000),          // Rounding Constant
-					.USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
-					.USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
-					.USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
-					.XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
-					// Pattern Detector Attributes: Pattern Detection Configuration
-					.AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
-					.AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
-					.MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
-					.PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
-					.SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
-					.SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
-					.USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
-					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
-					.IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
-					.IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
-					.IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
-					.IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
-					.IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
-					.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0
-										FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN
-										2'b01, // Y : M
-										2'b01  // X: M
-					}), // Optional inversion for OPMODE
-					.IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
-					.IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
-					.IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
-					.IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
-					.IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for STCONJUGATE_A
-					.IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
-					.IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
-					.IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
-					.IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
-					.IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
-					// Register Control Attributes: Pipeline Register Configuration
-					.ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
-					.ADREG(0),                          // Pipeline stages for pre-adder (0-1)
-					.ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
-					.AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
-					.BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
-					.BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
-					.CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
-					.CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
-					.CREG(0),                           // Pipeline stages for C (0-1)
-					.DREG(0),                           // Pipeline stages for D (0-1)
-					.INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
-					.MREG(1),                           // Multiplier pipeline stages (0-1)
-					.OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
-					.PREG(PREG),                        // Number of pipeline stages for P (0-1)
-					.RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
-				)
-				DSP58_inst (
-					// Cascade outputs: Cascade Ports
-					.ACOUT(),                           // 34-bit output: A port cascade
-					.BCOUT(),                           // 24-bit output: B cascade
-					.CARRYCASCOUT(),                    // 1-bit output: Cascade carry
-					.MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
-					.PCOUT(pcout[i][j]),                // 58-bit output: Cascade output
-					// Control outputs: Control Inputs/Status Bits
-					.OVERFLOW(),                        // 1-bit output: Overflow in add/acc
-					.PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
-					.PATTERNDETECT(),                   // 1-bit output: Pattern detect
-					.UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
-					// Data outputs: Data Ports
-					.CARRYOUT(),                        // 4-bit output: Carry
-					.P(pp),                             // 58-bit output: Primary data
-					.XOROUT(),                          // 8-bit output: XOR data
-					// Cascade inputs: Cascade Ports
-					.ACIN('x),                          // 34-bit input: A cascade data
-					.BCIN('x),                          // 24-bit input: B cascade
-					.CARRYCASCIN('x),                   // 1-bit input: Cascade carry
-					.MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
-					.PCIN(FIRST ? 'x : pcout[i][j-1]),  // 58-bit input: P cascade
-					// Control inputs: Control Inputs/Status Bits
-					.ALUMODE(4'h0),                     // 4-bit input: ALU control
-					.CARRYINSEL('0),                    // 3-bit input: Carry select
-					.CLK(clk),                          // 1-bit input: Clock
-					.INMODE({
-							INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
-							2'b00,
-							TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
-							INTERNAL_PREGS==2 ? 1'b0 : 1'b1
-					}),                                 // 5-bit input: INMODE control
-					.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
-					.OPMODE({
-							LAST ? {1'b0, L[1]} : 2'b00,
-							7'b000_0000
-					}), // 9-bit input: Operation mode
-					// Data inputs: Data Ports
-					.A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }),            // 34-bit input: A data
-					.B(b_in_i[i][j]),                   // 24-bit input: B data
-					.C('x),                             // 58-bit input: C data
-					.CARRYIN('0),                       // 1-bit input: Carry-in
-					.D('x),                             // 27-bit input: D data
-					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
-					.ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
-					.CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
-					.CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
-					.CEAD('0),                          // 1-bit input: Clock enable for ADREG
-					.CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
-					.CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
-					.CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
-					.CEC('0),                           // 1-bit input: Clock enable for CREG
-					.CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
-					.CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
-					.CED('0),                           // 1-bit input: Clock enable for DREG
-					.CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
-					.CEM(en),                           // 1-bit input: Clock enable for MREG
-					.CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
-					.RSTA(rst),                         // 1-bit input: Reset for AREG
-					.RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
-					.RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
-					.RSTB(rst),                         // 1-bit input: Reset for BREG
-					.RSTC('0),                          // 1-bit input: Reset for CREG
-					.RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
-					.RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
-					.RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
-					.RSTM(rst),                         // 1-bit input: Reset for MREG
-					.RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
-				);
-			end : genDSP
-`endif
-		end : genDSPChain
-	end : genDSPPE
-
-endmodule : mvu_vvu_8sx9
diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv
deleted file mode 100644
index 5f215927d8..0000000000
--- a/finn-rtllib/mvu/mvu_8sx9_axi.sv
+++ /dev/null
@@ -1,179 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Matrix Vector Unit (MVU) AXI-lite interface wrapper.
- *****************************************************************************/
-
-module mvu_8sx9_axi #(
-	int unsigned MW,
-	int unsigned MH,
-	int unsigned PE,
-	int unsigned SIMD,
-	int unsigned ACTIVATION_WIDTH,
-	int unsigned WEIGHT_WIDTH,
-	int unsigned ACCU_WIDTH,
-	bit SIGNED_ACTIVATIONS = 0,
-	int unsigned SEGMENTLEN = 0,
-	parameter RAM_STYLE = "auto",
-
-	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
-	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
-	localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
-	localparam int unsigned SF = MW/SIMD,
-	localparam int unsigned NF = MH/PE,
-	localparam int unsigned OUTPUT_LANES = PE,
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
-)
-(
-	// Global Control
-	input	logic  ap_clk,
-	input	logic  ap_rst_n,
-
-	// Weight Stream
-	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input	logic  s_axis_weights_tvalid,
-	output	logic  s_axis_weights_tready,
-
-	// Input Stream
-	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input	logic  s_axis_input_tvalid,
-	output	logic  s_axis_input_tready,
-
-	// Output Stream
-	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
-	output	logic  m_axis_output_tvalid,
-	input	logic  m_axis_output_tready
-);
-
-//-------------------- Parameter sanity checks --------------------\\
-	initial begin
-		if (MW % SIMD != 0) begin
-			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
-			$finish;
-		end
-		if (MH % PE != 0) begin
-			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
-			$finish;
-		end
-		if (ACTIVATION_WIDTH > 9) begin
-			$error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
-			$finish;
-		end
-		if (WEIGHT_WIDTH > 8) begin
-			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
-			$finish;
-		end
-		if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin
-			$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
-			$finish;
-		end
-		if (SEGMENTLEN == 0) begin
-			$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
-		end
-		if (SEGMENTLEN > (SIMD+2)/3) begin
-			$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
-			$finish;
-		end
-	end
-
-	uwire clk = ap_clk;
-	uwire rst = !ap_rst_n;
-
-	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;
-
-	uwire mvauin_t amvau;
-	uwire alast;
-	uwire afin;
-	uwire avld;
-	uwire ardy;
-
-	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay (
-		.clk, .rst,
-		.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
-		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
-	);
-
-//-------------------- Input control --------------------\\
-	uwire en;
-	uwire istb = avld && s_axis_weights_tvalid;
-	assign ardy = en && s_axis_weights_tvalid;
-	assign s_axis_weights_tready = en && avld;
-
-//-------------------- Core MVU --------------------\\
-	uwire ovld;
-	uwire [PE-1:0][57:0] odat;
-	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
-	mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-	.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core (
-		.clk, .rst, .en,
-		.last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
-		.vld(ovld), .p(odat)
-	);
-
-//-------------------- Output register slice --------------------\\
-	struct {
-		logic vld;
-		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} A = '{ vld: 0, default: 'x};
-
-	assign en = !A.vld || !ovld;
-
-	uwire  b_load;
-	always_ff @(posedge clk) begin
-		if(rst)		A <= '{ vld: 0, default: 'x };
-		else if(!A.vld || b_load) begin
-			A.vld <= ovld && en;
-			for(int unsigned  i = 0; i < PE; i++) begin
-				// CR-1148862:
-				// A.dat[i] <= odat[i];
-				automatic logic [57:0]  v = odat[i];
-				A.dat[i] <= v[ACCU_WIDTH-1:0];
-			end
-		end
-	end
-	
-	struct {
-		logic vld;
-		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} B = '{ vld: 0, default: 'x};
-
-	assign	b_load = !B.vld || m_axis_output_tready;
-	always_ff @(posedge clk) begin
-		if(rst)		B <= '{ default: 'x };
-		else begin
-			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
-		end	
-	end
-
-	assign	m_axis_output_tvalid = B.vld;
-	assign	m_axis_output_tdata  = B.dat;
-
-endmodule
\ No newline at end of file
diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
deleted file mode 100644
index 70ffa096ef..0000000000
--- a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv
+++ /dev/null
@@ -1,208 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Testbench for MVU AXI-lite interface wrapper.
- *****************************************************************************/
-
-module mvu_8sx9_axi_tb();
-
-//-------------------- Simulation parameters --------------------\\
-	// Matrix & parallelism config
-	localparam int unsigned MW = 600;
-	localparam int unsigned MH = 256;
-	localparam int unsigned SIMD = 60;
-	localparam int unsigned PE = 16;
-	localparam int unsigned SEGMENTLEN = 4;
-	// Bit-width config  
-	localparam int unsigned ACTIVATION_WIDTH = 8;
-	localparam int unsigned WEIGHT_WIDTH = 4;
-	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
-	localparam bit SIGNED_ACTIVATIONS = 1;
-	// Simulation constants  
-	localparam int unsigned NF = MH/PE;
-	localparam int unsigned SF = MW/SIMD;
-	localparam int unsigned NUM_OF_DSP = SIMD/3;
-	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
-	localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8;
-	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
-	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
-
-	// Generate clk and reset signal   
-	logic clk = 0;
-	always #5ns clk = !clk;
-
-	logic ap_rst_n = 0;
-	initial begin
-		repeat(16) @(posedge clk);
-		ap_rst_n <= 1;
-	end
-
-	uwire ap_clk = clk;
-
-	// Generate activations  
-	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-	typedef activation_t activation_vector_t[SF];
-
-	function activation_vector_t init_ACTIVATIONS;
-		automatic activation_vector_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_ACTIVATIONS
-
-	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
-
-	struct {
-		activation_t dat;
-		logic vld;
-		logic rdy;
-	} activations;
-
-	initial begin
-		activations.vld = 0;
-		activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
-		@(posedge clk iff ap_rst_n);
-
-		for (int i=0; i<SF; i++) begin
-			activations.dat <= ACTIVATIONS[i];
-			do begin 
-				activations.vld = $urandom()%7 > 1;
-				@(posedge clk);
-			end while (!(activations.vld === 1 && activations.rdy === 1));
-		end
-
-		activations.vld <= 0;
-		activations.dat <= 'x;
-	end
-
-	// Generate weights   
-	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-	typedef weight_t weight_matrix_t[NF][SF]; 
-
-	function weight_matrix_t init_WEIGHTS;
-		automatic weight_matrix_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_WEIGHTS;
-
-	weight_matrix_t WEIGHTS = init_WEIGHTS();
-
-	struct {
-		weight_t dat;
-		logic vld;
-		logic rdy;
-	} weights;
-
-	initial begin
-		weights.vld = 0;
-		weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
-		@(posedge clk iff ap_rst_n);
-
-		weights.vld <= 1;
-		for (int i=0; i<NF; i++) begin
-			for (int j=0; j<SF; j++) begin
-				weights.dat <= WEIGHTS[i][j];
-				@(posedge clk iff weights.rdy);
-			end
-		end
-
-		weights.vld <= 0;
-		weights.dat <= 'x;
-	end
-
-	// Function to compute golden output  
-	// a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
-	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
-	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
-	typedef output_t output_vector_t [NF];
-
-	struct {
-		output_t dat;
-		logic vld;
-		logic rdy;
-	} outputs;
-
-	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
-		automatic output_vector_t res = '{default: 0};
-		for (int j = 0; j<MH; j++) begin
-			for (int i = 0; i<MW; i++) begin
-				res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-			end
-		end  
-		return res;
-	endfunction : check_output;
-
-	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
-
-	int unsigned NF_CNT = 0;
-	initial begin
-		outputs.rdy = 0;
-		while (NF_CNT < NF) begin
-			// Loop until both rdy & vld are asserted
-			do begin
-				outputs.rdy <= $urandom()%7 >= 1;
-				@(posedge clk iff ap_rst_n);
-			end while (!(outputs.rdy === 1 && outputs.vld === 1));
-
-			// Compare produced outputs against golden outputs
-			foreach(outputs.dat[i]) begin
-				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-				else begin 
-					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-					$stop;
-				end  
-			end
-			
-			NF_CNT += 1;
-		end
-
-		$finish;  
-	end
-
-	// Instantiate DUT
-	mvu_8sx9_axi #(
-		.MW(MW),
-		.MH(MH),
-		.PE(PE),
-		.SIMD(SIMD),
-		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-		.WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.ACCU_WIDTH(ACCU_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-		.SEGMENTLEN(SEGMENTLEN)
-	)
-	dut (
-		.ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
-		.s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
-		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
-		.m_axis_output_tready(outputs.rdy)
-	);
-  
-endmodule
diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
deleted file mode 100644
index e15f77fbae..0000000000
--- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v
+++ /dev/null
@@ -1,93 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Verilog AXI-lite wrapper for MVU.
- *****************************************************************************/
-
-module $MODULE_NAME_AXI_WRAPPER$ #(
-	parameter 	MW = $MW$,
-	parameter	MH = $MH$,
-	parameter 	PE = $PE$,
-	parameter 	SIMD = $SIMD$,
-	parameter 	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
-	parameter 	WEIGHT_WIDTH = $WEIGHT_WIDTH$,
-	parameter 	ACCU_WIDTH = $ACCU_WIDTH$,
-	parameter 	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
-	parameter 	SEGMENTLEN = $SEGMENTLEN$,
-	parameter 	RAM_STYLE = "$IBUF_RAM_STYLE$",
-
-	// Safely deducible parameters
-	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	parameter 	INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
-	parameter 	OUTPUT_LANES = PE,
-	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
-)(
-	// Global Control
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output, ASSOCIATED_RESET ap_rst_n" *)
-	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
-	input	ap_clk,
-	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
-	input	ap_rst_n,
-
-	// Weight Stream
-	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input	s_axis_weights_tvalid,
-	output	s_axis_weights_tready,
-
-	// Input Stream
-	input	[INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input	s_axis_input_tvalid,
-	output	s_axis_input_tready,
-
-	// Output Stream
-	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
-	output	m_axis_output_tvalid,
-	input	m_axis_output_tready
-);
-
-mvu_8sx9_axi #(
-	.MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-	.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-	.SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE)
-	) inst (
-	.ap_clk(ap_clk),
-	.ap_rst_n(ap_rst_n),
-	.s_axis_weights_tdata(s_axis_weights_tdata),
-	.s_axis_weights_tvalid(s_axis_weights_tvalid),
-	.s_axis_weights_tready(s_axis_weights_tready),
-	.s_axis_input_tdata(s_axis_input_tdata),
-	.s_axis_input_tvalid(s_axis_input_tvalid),
-	.s_axis_input_tready(s_axis_input_tready),
-	.m_axis_output_tdata(m_axis_output_tdata),
-	.m_axis_output_tvalid(m_axis_output_tvalid),
-	.m_axis_output_tready(m_axis_output_tready)
-);
-
-endmodule : $MODULE_NAME_AXI_WRAPPER$
diff --git a/finn-rtllib/mvu/mvu_8sx9_tb.sv b/finn-rtllib/mvu/mvu_8sx9_tb.sv
deleted file mode 100644
index adf6a8f9c2..0000000000
--- a/finn-rtllib/mvu/mvu_8sx9_tb.sv
+++ /dev/null
@@ -1,165 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Testbench for MVU core compute kernel.
- *****************************************************************************/
-
-module mvu_8sx9_tb();
-
-//-------------------- Simulation parameters --------------------\\
-	// Matrix & parallelism config
-	localparam int unsigned MH = 256;
-	localparam int unsigned PE = 16;
-	localparam int unsigned MW = 600;
-	localparam int unsigned SIMD = 60;
-	localparam int unsigned SEGMENTLEN = 4;
-	// Bit-width config  
-	localparam int unsigned ACTIVATION_WIDTH = 8;
-	localparam int unsigned WEIGHT_WIDTH = 4;
-	localparam bit SIGNED_ACTIVATIONS = 1;
-	// Simulation constants
-	localparam int unsigned NF = MH/PE;
-	localparam int unsigned SF = MW/SIMD;
-	localparam int unsigned NUM_OF_DSP = SIMD/3;
-
-	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-	typedef activation_t activation_vector_t[SF];
-
-	function activation_vector_t init_ACTIVATIONS;
-		automatic activation_vector_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_ACTIVATIONS
-
-	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-	typedef weight_t weight_matrix_t[NF][SF];
-
-	function weight_matrix_t init_WEIGHTS;
-		automatic weight_matrix_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_WEIGHTS;
-
-	typedef logic signed [PE-1:0][57:0] output_t;
-	typedef output_t output_vector_t [NF];
-
-	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
-		automatic output_vector_t res = '{default: 0};
-		for (int j = 0; j<MH; j++) begin
-			for (int i = 0; i<MW; i++) begin
-				res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-			end
-		end  
-		return res;
-	endfunction : check_output;
-
-	logic clk = 0;
-	always #5ns clk = !clk;
-
-	logic rst;
-	initial begin
-		rst = 1;
-		repeat(16) @(posedge clk);
-		rst <= 0;
-	end
-
-	logic last;
-	logic zero;
-	logic vld;
-	activation_t a;
-	weight_t w;
-	output_t p;
-	// Reference signals
-	activation_vector_t ACTIVATIONS; //   [SF-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0]
-	weight_matrix_t WEIGHTS; //           [NF-1:0][SF-1:0][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
-	output_vector_t GOLDEN_OUTPUT; //     [NF-1:0][PE-1:0][57:0]
-	// Counter for number of outputs (NF dimension) that are produced
-	int NF_CNT = 0;
-
-	initial begin
-		ACTIVATIONS = init_ACTIVATIONS();
-		WEIGHTS = init_WEIGHTS();
-		GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
-		last = 0;
-		zero = 0;
-		a = 'x;
-		w = 'x;
-
-		@(posedge clk iff !rst);
-
-		for (int j=0; j<NF; j++) begin
-			for (int i=0; i<SF; i++) begin
-				last <= (i==SF-1) ? 1 : 0;
-				a <= ACTIVATIONS[i];
-				w <= WEIGHTS[j][i];
-				@(posedge clk iff en);
-			end
-		end
-
-		last <= 0;
-		zero <= 1;  
-
-		// Continue until all NF outputs are produced & compared
-		@(posedge clk && (NF_CNT==NF));
-
-		$finish;
-	end
-
-	logic en = 0;
-	always_ff @(posedge clk) begin
-		en <= ($urandom()%7 > 1) && !rst;
-	end
-
-	// Compare computed output against golden output when vld flag is raised by DUT
-	always_ff @(posedge clk iff (vld && en)) begin
-		foreach(p[i]) begin
-			assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-			else begin 
-				$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-				$stop;
-			end  
-		end
-		NF_CNT += 1;
-	end
-
-	// Instantiate DUT
-	mvu_8sx9 #(
-		.PE(PE),
-		.SIMD(SIMD),
-		.WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-		.SEGMENTLEN(SEGMENTLEN)
-	)
-	dut (
-		.clk, .rst, .en, .last, .zero, .a, .w, .vld, .p
-	);
-
-endmodule
diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv
deleted file mode 100644
index 07ad32e6c8..0000000000
--- a/finn-rtllib/mvu/mvu_axi.sv
+++ /dev/null
@@ -1,248 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper.
- * @details
- *	 The following compute cores are supported:
- *   - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, 
- *     (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP,
- *     [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP,
- *     'unconstrained' LUT-based MVU and VVU.
- *  Folding hints:
- *	 - PE scaling should divide MH.
- *   - SIMD scaling should divide MW.
- *	 - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to
- *	   impact critical paths more than PE scaling. PE scaling implies a
- *	   bigger fanout on the input activations.
- *	 - Full unfolding along MH (PE=MH) results in no replay buffer instantiated
- *****************************************************************************/
-
-module mvu_vvu_axi #(
-	bit IS_MVU, // string type causes error in Vivado
-	parameter COMPUTE_CORE,
-	int unsigned MW,
-	int unsigned MH,
-	int unsigned PE,
-	int unsigned SIMD,
-	int unsigned ACTIVATION_WIDTH,
-	int unsigned WEIGHT_WIDTH,
-	int unsigned ACCU_WIDTH,
-	bit SIGNED_ACTIVATIONS = 0,
-	int unsigned SEGMENTLEN = 0,
-	bit FORCE_BEHAVIORAL = 0,
-	bit M_REG_LUT = 1,
-
-	// Safely deducible parameters
-	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
-	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
-	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned SF = MW/SIMD,
-	localparam int unsigned NF = MH/PE,
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
-)
-(
-	// Global Control
-	input	logic  ap_clk,
-	input	logic  ap_rst_n,
-
-	// Weight Stream
-	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input	logic  s_axis_weights_tvalid,
-	output	logic  s_axis_weights_tready,
-
-	// Input Stream
-	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input	logic  s_axis_input_tvalid,
-	output	logic  s_axis_input_tready,
-
-	// Output Stream
-	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
-	output	logic  m_axis_output_tvalid,
-	input	logic  m_axis_output_tready
-);
-
-//-------------------- Parameter sanity checks --------------------\\
-	initial begin
-		if (MW % SIMD != 0) begin
-			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
-			$finish;
-		end
-		if (MH % PE != 0) begin
-			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
-			$finish;
-		end
-		if (WEIGHT_WIDTH > 8) begin
-			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
-			$finish;
-		end
-		if (ACTIVATION_WIDTH > 8) begin
-			if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin
-				$error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH);
-				$finish;
-			end
-		end
-		if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin
-			if (SEGMENTLEN == 0) begin
-				$warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
-			end
-			if (SEGMENTLEN > (SIMD+2)/3) begin
-				$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
-				$finish;
-			end
-		end
-		if (!IS_MVU) begin
-			if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin
-				$error("VVU only supported on DSP58 or LUT-based implementation");
-				$finish;
-			end
-		end
-	end
-
-	uwire clk = ap_clk;
-	uwire rst = !ap_rst_n;
-
-	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;
-
-	uwire mvauin_t amvau;
-	uwire alast;
-	uwire afin;
-	uwire avld;
-	uwire ardy;
-
-	replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay (
-	.clk, .rst,
-	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
-	.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
-	);
-
-//-------------------- Input control --------------------\\
-	uwire en;
-	uwire istb = avld && s_axis_weights_tvalid;
-	assign ardy = en && s_axis_weights_tvalid;
-	assign s_axis_weights_tready = en && avld;
-
-//-------------------- Core MVU/VVU --------------------\\
-	uwire ovld;
-	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
-	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
-	uwire mvauin_t amvau_i;
-
-	if (IS_MVU) begin : genMVUInput
-		assign  amvau_i = amvau;
-	end : genMVUInput
-	else begin : genVVUInput
-		// The input stream will have the channels interleaved for VVU when PE>1
-		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
-		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
-		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
-		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i)
-		localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH;
-		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
-			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
-									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
-									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
-		end : genRewire
-	end : genVVUInput
-
-	case(COMPUTE_CORE)
-	"mvu_vvu_8sx9_dsp58":
-		mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
-		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
-		);
-	"mvu_4sx4u":
-		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
-		);
-	"mvu_8sx8u_dsp48":
-		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
-		);
-	"mvu_vvu_lut":
-		mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
-		);
-	default: initial begin
-		$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
-		$finish;
-	end
-	endcase
-
-//-------------------- Output register slice --------------------\\
-	struct packed {
-		logic vld;
-		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} A = '{ vld: 0, default: 'x};
-
-	assign en = !A.vld || !ovld;
-
-	uwire  b_load;
-	always_ff @(posedge clk) begin
-		if(rst)		A <= '{ vld: 0, default: 'x };
-		else if(!A.vld || b_load) begin
-			A.vld <= ovld && en;
-			for(int unsigned  i = 0; i < PE; i++) begin
-				// CR-1148862:
-				// A.dat[i] <= odat[i];
-				automatic logic [ACCU_WIDTH-1:0]  v = odat[i];
-				A.dat[i] <= v[ACCU_WIDTH-1:0];
-			end
-		end
-	end
-
-	struct packed {
-		logic vld;
-		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} B = '{ vld: 0, default: 'x};
-
-	assign	b_load = !B.vld || m_axis_output_tready;
-	always_ff @(posedge clk) begin
-		if(rst)		B <= '{ vld: 0, default: 'x };
-		else begin
-			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
-		end
-	end
-
-	assign	m_axis_output_tvalid = B.vld;
-	assign	m_axis_output_tdata  = B.dat;
-
-endmodule : mvu_vvu_axi
diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v
deleted file mode 100644
index 239c5bbacd..0000000000
--- a/finn-rtllib/mvu/mvu_axi_wrapper.v
+++ /dev/null
@@ -1,92 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Verilog AXI-lite wrapper for MVU.
- *****************************************************************************/
-
-module $MODULE_NAME_AXI_WRAPPER$ #(
-	parameter 	MW = $MW$,
-	parameter	MH = $MH$,
-	parameter 	PE = $PE$,
-	parameter 	SIMD = $SIMD$,
-	parameter 	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
-	parameter 	WEIGHT_WIDTH = $WEIGHT_WIDTH$,
-	parameter 	ACCU_WIDTH = $ACCU_WIDTH$,
-	parameter 	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
-	parameter 	SEGMENTLEN = $SEGMENTLEN$,
-	parameter	MVU_IMPL_STYLE = "$MVU_IMPL_STYLE$",
-	parameter	FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$,
-
-	// Safely deducible parameters
-	parameter 	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	parameter 	INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
-	parameter 	OUTPUT_LANES = PE,
-	parameter 	OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
-)(
-	// Global Control
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
-	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
-	input	ap_clk,
-	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
-	input	ap_rst_n,
-
-	// Weight Stream
-	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  weights_V_TDATA,
-	input   weights_V_TVALID,
-	output  weights_V_TREADY,
-	// Input Stream
-	input	[INPUT_STREAM_WIDTH_BA-1:0]  in0_V_TDATA,
-	input	in0_V_TVALID,
-	output	in0_V_TREADY,
-	// Output Stream
-	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  out_V_TDATA,
-	output	out_V_TVALID,
-	input	out_V_TREADY
-);
-
-mvu_axi #(
-	.MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-	.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-	.SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), .MVU_IMPL_STYLE(MVU_IMPL_STYLE)
-	) inst (
-	.ap_clk(ap_clk),
-	.ap_rst_n(ap_rst_n),
-	.s_axis_weights_tdata(weights_V_TDATA),
-	.s_axis_weights_tvalid(weights_V_TVALID),
-	.s_axis_weights_tready(weights_V_TREADY),
-	.s_axis_input_tdata(in0_V_TDATA),
-	.s_axis_input_tvalid(in0_V_TVALID),
-	.s_axis_input_tready(in0_V_TREADY),
-	.m_axis_output_tdata(out_V_TDATA),
-	.m_axis_output_tvalid(out_V_TVALID),
-	.m_axis_output_tready(out_V_TREADY)
-);
-
-endmodule : $MODULE_NAME_AXI_WRAPPER$
diff --git a/finn-rtllib/mvu/mvu_lut.sv b/finn-rtllib/mvu/mvu_lut.sv
deleted file mode 100644
index c100910d75..0000000000
--- a/finn-rtllib/mvu/mvu_lut.sv
+++ /dev/null
@@ -1,104 +0,0 @@
-module mvu_vvu_lut #(
-    bit IS_MVU,
-    int unsigned  PE,
-    int unsigned  SIMD,
-	int unsigned  ACCU_WIDTH,
-    int unsigned  ACTIVATION_WIDTH,
-    int unsigned  WEIGHT_WIDTH,
-    bit  SIGNED_ACTIVATIONS,
-    bit  M_REG = 1,
-
-    localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH,
-    localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
-)(
-	// Global Control
-	input	logic  clk,
-	input	logic  rst,
-	input	logic  en,
-
-	// Input
-	input	logic  last,
-	input	logic  zero,	// ignore current inputs and force this partial product to zero
-	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]             w,	// signed weights
-	input	logic        [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0]  a,	// (un)signed activations
-
-	// Ouput
-	output	logic  vld,
-	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
-);
-
-    typedef int unsigned  leave_load_t[2*SIMD-1];
-    function leave_load_t init_leave_loads();
-        automatic leave_load_t  res;
-        for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;
-        for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];
-        return res;
-    endfunction : init_leave_loads
-
-    // Pipeline for last indicator flag
-    uwire last_i;
-    generate if (M_REG) begin
-        logic [0:1] L = '0;
-        always_ff @(posedge clk) begin
-            if(rst)       L <= '0;
-            else if (en)  L <= {last, L[0]};
-        end
-        assign  last_i = L[1];
-    end
-    else begin 
-        logic L = '0;
-        always_ff @(posedge clk) begin
-            if(rst)       L <= '0;
-            else if (en)  L <= last;
-        end
-        assign  last_i = L;
-    end
-    endgenerate
-
-    // For each PE generate
-    for (genvar  i = 0; i < PE; i++)  begin : genPE
-        // Stage #1: SIMD multipliers in parallel
-        uwire [MULT_WIDTH-1 : 0] m1 [SIMD];
-        for (genvar j = 0; j < SIMD; j++) begin : genSIMD
-            if (M_REG) begin : genMreg
-                logic [MULT_WIDTH-1 : 0] M [SIMD];
-                always_ff @(posedge clk) begin
-                    if(rst)         M[j] = '{ default : 0 };
-                    else if (en)    M[j] = zero ? 0 :
-                                            SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
-                                                                 $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); 
-                    // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication
-                end
-                assign  m1[j] = M[j];
-            end : genMreg
-            else begin : genNoMreg 
-                assign m1[j] = zero ? 0 :
-                               SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
-                                                    $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]);
-            end : genNoMreg
-        end : genSIMD
-
-        // Stage #2: Adder tree to reduce SIMD products
-        localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 };
-        localparam int unsigned  ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1));
-        uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
-        for(genvar s = 0; s < SIMD; s++)  assign  tree[SIMD-1+s] = $signed(m1[s]);
-        for(genvar n = 0; n < SIMD-1; n++) begin
-            // Sum truncated to actual maximum bit width at this node
-            localparam int unsigned  NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1));
-            uwire signed [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
-            assign tree[n] = s;
-        end
-
-        // Stage #3: Buffer output
-        logic [ACCU_WIDTH-1:0] P2 [PE];
-        always_ff @(posedge clk) begin
-            if(rst)         P2[i] = '{ default : 0};
-            else if (en)    P2[i] = (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]);
-        end
-
-        assign  vld = last_i;
-        assign  p[i] = P2[i];
-    end : genPE
-
-endmodule : mvu_vvu_lut
diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
deleted file mode 100644
index b89b58f55b..0000000000
--- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv
+++ /dev/null
@@ -1,215 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Testbench for MVU AXI-lite interface wrapper.
- *****************************************************************************/
-
-module mvu_axi_tb();
-
-//-------------------- Simulation parameters --------------------\\
-	// Matrix & parallelism config
-	localparam int unsigned MW = 50;
-	localparam int unsigned MH = 8;
-	localparam int unsigned SIMD = 10;
-	localparam int unsigned PE = 2;
-	localparam int unsigned SEGMENTLEN = 2;
-	localparam string MVU_IMPL_STYLE = "mvu_8sx8u_dsp48";
-	localparam bit FORCE_BEHAVIORAL = 1;
-	// Bit-width config
-	localparam int unsigned ACTIVATION_WIDTH = 8;
-	localparam int unsigned WEIGHT_WIDTH = 8;
-	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
-	localparam bit SIGNED_ACTIVATIONS = 0;
-	// Simulation constants
-	localparam int unsigned NF = MH/PE;
-	localparam int unsigned SF = MW/SIMD;
-	localparam int unsigned NUM_OF_DSP = SIMD/3;
-	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
-	localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8;
-	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
-	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
-
-	// Generate clk and reset signal
-	logic clk = 0;
-	always #5ns clk = !clk;
-
-	logic ap_rst_n = 0;
-	initial begin
-		repeat(16) @(posedge clk);
-		ap_rst_n <= 1;
-	end
-
-	uwire ap_clk = clk;
-
-	// Generate activations
-	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-	typedef activation_t activation_vector_t[SF];
-
-	function activation_vector_t init_ACTIVATIONS;
-		automatic activation_vector_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_ACTIVATIONS
-
-	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
-
-	struct {
-		activation_t dat;
-		logic vld;
-		logic rdy;
-	} activations;
-
-	initial begin
-		activations.vld = 0;
-		activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
-		@(posedge clk iff ap_rst_n);
-
-		for (int i=0; i<SF; i++) begin
-			activations.dat <= ACTIVATIONS[i];
-			do begin
-				activations.vld <= $urandom()%7 >= 1;
-				@(posedge clk);
-			end while (!(activations.vld === 1 && activations.rdy === 1));
-		end
-
-		activations.vld <= 0;
-		activations.dat <= 'x;
-	end
-
-	// Generate weights
-	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-	typedef weight_t weight_matrix_t[NF][SF];
-
-	function weight_matrix_t init_WEIGHTS;
-		automatic weight_matrix_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_WEIGHTS;
-
-	weight_matrix_t WEIGHTS = init_WEIGHTS();
-
-	struct {
-		weight_t dat;
-		logic vld;
-		logic rdy;
-	} weights;
-
-	initial begin
-		weights.vld = 0;
-		weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain
-		@(posedge clk iff ap_rst_n);
-
-		weights.vld <= 1;
-		for (int i=0; i<NF; i++) begin
-			for (int j=0; j<SF; j++) begin
-				weights.dat <= WEIGHTS[i][j];
-				@(posedge clk iff weights.rdy);
-			end
-		end
-
-		weights.vld <= 0;
-		weights.dat <= 'x;
-	end
-
-	// Function to compute golden output
-	// a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
-	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
-	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
-	typedef output_t output_vector_t [NF];
-
-	struct {
-		output_t dat;
-		logic vld;
-		logic rdy;
-	} outputs;
-
-	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
-		automatic output_vector_t res = '{default: 0};
-		for (int j = 0; j<MH; j++) begin
-			for (int i = 0; i<MW; i++) begin
-				if (SIGNED_ACTIVATIONS)
-					res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-				else
-					res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-			end
-		end
-		return res;
-	endfunction : check_output;
-
-	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
-
-	int unsigned NF_CNT = 0;
-	initial begin
-		outputs.rdy = 0;
-		while (NF_CNT < NF) begin
-			// Loop until both rdy & vld are asserted
-			do begin
-				outputs.rdy <= $urandom()%7 >= 1;
-				@(posedge clk iff ap_rst_n);
-			end while (!(outputs.rdy === 1 && outputs.vld === 1));
-
-			// Compare produced outputs against golden outputs
-			foreach(outputs.dat[i]) begin
-				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-				else begin
-					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-					$stop;
-				end
-			end
-
-			NF_CNT += 1;
-		end
-
-		$finish;
-	end
-
-	// Instantiate DUT
-	mvu_axi #(
-		.MW(MW),
-		.MH(MH),
-		.PE(PE),
-		.SIMD(SIMD),
-		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-		.WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.ACCU_WIDTH(ACCU_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-		.SEGMENTLEN(SEGMENTLEN),
-		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL),
-		.MVU_IMPL_STYLE(MVU_IMPL_STYLE)
-	)
-	dut (
-		.ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
-		.s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
-		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
-		.m_axis_output_tready(outputs.rdy)
-	);
-
-endmodule : mvu_axi_tb

From 14c5fa902820396e3489a244dc4d705fd1ebe532 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 17:12:47 +0100
Subject: [PATCH 072/112] [mvu vvu 8sx9]: renamed for consistency

---
 finn-rtllib/mvu/{mvu_vvu_8sx9.sv => mvu_vvu_8sx9_dsp58.sv} | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename finn-rtllib/mvu/{mvu_vvu_8sx9.sv => mvu_vvu_8sx9_dsp58.sv} (99%)

diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
similarity index 99%
rename from finn-rtllib/mvu/mvu_vvu_8sx9.sv
rename to finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
index 2aa9d71b6c..6ae117e3ab 100644
--- a/finn-rtllib/mvu/mvu_vvu_8sx9.sv
+++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
@@ -31,7 +31,7 @@
  * @brief	Matrix Vector Unit (MVU) core compute kernel utilizing DSP58.
  *****************************************************************************/
 
-module mvu_vvu_8sx9 #(
+module mvu_vvu_8sx9_dsp58 #(
 	bit IS_MVU,
     int unsigned PE,
     int unsigned SIMD,
@@ -424,4 +424,4 @@ module mvu_vvu_8sx9 #(
 		end : genDSPChain
 	end : genDSPPE
 
-endmodule : mvu_vvu_8sx9
+endmodule : mvu_vvu_8sx9_dsp58

From 3a3758826512fd3d5ed0bcdd23358d5fd5b724cd Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 21 Sep 2023 17:13:25 +0100
Subject: [PATCH 073/112] [mvu vvu axi]: changes for renamed module

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index ff677fc244..416480da79 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -174,7 +174,7 @@ module mvu_vvu_axi #(
 
 	case(COMPUTE_CORE)
 	"mvu_vvu_8sx9_dsp58":
-		mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+		mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,

From afe36baa134b947718db34d140c8d6500b91cb2a Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 25 Sep 2023 13:44:17 +0100
Subject: [PATCH 074/112] [mvu vvu wrapper]: convert localparam to param

---
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
index 270fe7351f..9c65dbc06e 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -46,9 +46,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter	FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$,
 
 	// Safely deducible parameters
-	localparam	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	localparam 	INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
-	localparam 	OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+	parameter	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+	parameter 	INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
+	parameter 	OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
 )(
 	// Global Control
 	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)

From e4f2f9e0e4f1cb0bae2bf7e439c57356b3670620 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 25 Sep 2023 13:45:48 +0100
Subject: [PATCH 075/112] [mvau-rtl custom-op]: bugfix to instantiate
 memstreamer, modified renamed files and axi wrapper template fill-out

---
 .../matrixvectoractivation_rtl.py             | 92 ++++++++++---------
 1 file changed, 51 insertions(+), 41 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index 9f8130806b..c7fb855884 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -61,8 +61,7 @@
 
 
 class MatrixVectorActivation_rtl(HLSCustomOp):
-    """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch
-    function."""
+    """Class that corresponds to finn-rtl Matrix Vector Unit."""
 
     def __init__(self, onnx_node, **kwargs):
         super().__init__(onnx_node, **kwargs)
@@ -73,8 +72,7 @@ def get_nodeattr_types(self):
             "SIMD": ("i", True, 0),
             "MW": ("i", True, 0),
             "MH": ("i", True, 0),
-            "resType": ("s", False, "lut", {"auto", "lut", "dsp"}),
-            "ActVal": ("i", False, 0),
+            "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}),
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
             "weightDataType": ("s", True, ""),
@@ -165,7 +163,6 @@ def verify_node(self):
         # verify that all necessary attributes exist
         # TODO collect automatically from get_nodeattr_types
         try:
-            self.get_nodeattr("code_gen_dir_cppsim")
             self.get_nodeattr("executable_path")
             self.get_nodeattr("resType")
             self.get_nodeattr("MW")
@@ -199,7 +196,6 @@ def verify_node(self):
 
         return info_messages
 
-    # TODO: Add in replay_buffer estimation
     def uram_estimation(self):
         P = self.get_nodeattr("PE")
         Q = self.get_nodeattr("SIMD")
@@ -213,7 +209,6 @@ def uram_estimation(self):
         mstyle = self.get_nodeattr("ram_style")
         if (
             (mmode == "decoupled" and mstyle != "ultra")
-            or (mmode == "const" and self.calc_wmem() <= 128)
             or (mmode == "external")
         ):
             return 0
@@ -221,7 +216,6 @@ def uram_estimation(self):
         depth_multiplier = math.ceil(omega / 4096)
         return width_multiplier * depth_multiplier
 
-    # TODO: Add in replay_buffer estimation
     def bram_estimation(self):
         """Calculates resource estimation for BRAM based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -243,7 +237,6 @@ def bram_estimation(self):
         mstyle = self.get_nodeattr("ram_style")
         if (
             (mmode == "decoupled" and mstyle in ["distributed", "ultra"])
-            or (mmode == "const" and self.calc_wmem() <= 128)
             or (mmode == "external")
         ):
             return 0
@@ -262,7 +255,6 @@ def bram_estimation(self):
         else:
             return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36))
 
-    # TODO: Add in replay_buffer estimation
     def bram_efficiency_estimation(self):
         wdt = self.get_weight_datatype()
         W = wdt.bitwidth()
@@ -275,7 +267,6 @@ def bram_efficiency_estimation(self):
         bram16_est_capacity = bram16_est * 36 * 512
         return wbits / bram16_est_capacity
 
-    # TODO: Add in replay_buffer estimation
     def uram_efficiency_estimation(self):
         """Function for URAM efficiency estimation: actual parameter storage
         needed divided by the allocated URAM storage (from estimation)"""
@@ -290,7 +281,7 @@ def uram_efficiency_estimation(self):
         uram_est_capacity = uram_est * 72 * 4096
         return wbits / uram_est_capacity
 
-    # TODO: FIX: worst case estimates since segmentlen is not known at this point?
+# TODO: fix lut estimations 
     def lut_estimation(self):
         """Calculates resource estimations for LUTs based on:
         - FINN-R: An End-to-End Deep-Learning Framework for Fast
@@ -333,9 +324,13 @@ def lut_estimation(self):
 
         return int(c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2)
 
-    # TODO: FIX: worst case estimates since segmentlen is not known at this point?
+# TODO: fix DSP estimations --> depends on fpga_part
     def dsp_estimation(self):
         # multiplication
+        # mvu_8sx9 (DSP58): ceil(SIMD/3)
+        # mvu_4sx4u (DSP48/DSP58): ceil(PE/4)
+        # mvu_8sx8u (DSP48): ceil(PE/2)
+        # mvu_lut: 0
         P = self.get_nodeattr("PE")
         res_type = self.get_nodeattr("resType")
         Q = self.get_nodeattr("SIMD")
@@ -349,18 +344,24 @@ def dsp_estimation(self):
             mult_dsp = 0
         return int(mult_dsp)
 
-    # TODO: FIX: worst case estimates since segmentlen is not known at this point
+# TODO: fix exp_cycles estimations --> depends on fpga_part and clk
     def get_exp_cycles(self):
+        # mvu_8sx9 (DSP58):
+        # 2 (replay_buffer) + ceil(chainlen/seglen) + 2 (MREG, PREG) + 2 (output reg slice)
+        # + MW/SIMD * MH/PE
+        # mvu_4sx4u (DSP48/DSP58) / mvu_8sx8u (DSP48): 
+        # 3 (IN_REG, MREG, PREG) + 2 (replay_buffer) + 2 (output reg slice) + 1 (adder tree SIMD) + 1 (output lane)
+        # + MW/SIMD * MH/PE
+        # mvu_lut:
+        # 2 (replay_buffer) + 1 OR 2 (no MREG OR MREG) + 2 (output reg slice) 
+        # + MW/SIMD * MH/PE
         pe = self.get_nodeattr("PE")
         simd = self.get_nodeattr("SIMD")
         num_inp_vec = self.get_nodeattr("numInputVectors")
         mh = self.get_nodeattr("MH")
         mw = self.get_nodeattr("MW")
         # since mmv != 1 is not supported yet, we set mmv for now to 1
-        mmv = 1
-        # Actual exp_cycles is probably slightly larger (say 3 cycles
-        # (DSP A/B, M, P - reg) + additional pipeline buffer cycles.
-        # Most probably <10)
+        mmv = 1     
         exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv
         return int(exp_cycles)
 
@@ -711,7 +712,7 @@ def execute_node(self, context, graph):
         else:
             raise Exception(
                 """Invalid value for attribute exec_mode! Is currently set to: {}
-            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+            has to be set to "rtlsim" """.format(
                     mode
                 )
             )
@@ -795,11 +796,12 @@ def code_generation_ipi(self):
                 os.path.join(
                     code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
                 ),
-                rtllib_dir + "mvu_axi.sv",
+                rtllib_dir + "mvu_vvu_axi.sv",
                 rtllib_dir + "replay_buffer.sv",
                 rtllib_dir + "mvu_4sx4u.sv",
-                rtllib_dir + "mvu_8sx9.sv",
+                rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
                 rtllib_dir + "mvu_8sx8u_dsp48.sv",
+                rtllib_dir + "mvu_vvu_lut.sv",
             ]
             for f in sourcefiles:
                 cmd.append("add_files -norecurse %s" % (f))
@@ -813,7 +815,7 @@ def code_generation_ipi(self):
             )
 
             # instantiate a streamer and connect it to the HLS IP
-            strm_vlnv = "amd.com:FINN:memstream:1.0"
+            strm_vlnv = "amd.com:finn:memstream:1.0"
             strm_inst = node_name + "_wstrm"
             cmd.append(
                 "create_bd_cell -type ip -vlnv %s /%s/%s"
@@ -890,11 +892,12 @@ def code_generation_ipi(self):
                 os.path.join(
                     code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
                 ),
-                rtllib_dir + "mvu_axi.sv",
+                rtllib_dir + "mvu_vvu_axi.sv",
                 rtllib_dir + "replay_buffer.sv",
                 rtllib_dir + "mvu_4sx4u.sv",
-                rtllib_dir + "mvu_8sx9.sv",
+                rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
                 rtllib_dir + "mvu_8sx8u_dsp48.sv",
+                rtllib_dir + "mvu_vvu_lut.sv",
             ]
             for f in sourcefiles:
                 cmd.append("add_files -norecurse %s" % (f))
@@ -959,27 +962,32 @@ def derive_characteristic_fxns(self, period):
             ]
         super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
 
-    # TODO: characterize max_clk and implement this function in look-up style
     def _resolve_segment_len(self, clk):
-        # Insert pipeline registers in the DSP chain to meet target clock frequency
-        return 4 # default to 4 for now
+        # Insert pipeline registers in the DSP58 chain to meet target clock frequency
+        # 0.741 ns seems the worst-case delay through first DSP
+        # 0.605 ns seems to be (on average) delay for all subsequent DSPs
+        dsp_chain_len = np.floor((clk - 0.741) / 0.605)
+        return max(1, dsp_chain_len)
 
     def _resolve_impl_style(self, fpgapart):
         # Based on target device and activation/weight-width, choose the
-        # supported RTL module
-        act_width = self.get_input_datatype(0).bitwidth()
-        weight_width = self.get_input_datatype(1).bitwidth()
-        is_versal = (
-            fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
-            or fpgapart[0:5] == "xqrvc"
-        )
-        if act_width == 4 and weight_width == 4:
-            return "mvu_4sx4u"
+        # supported RTL compute core
+        if self.get_nodeattr("resType") == "lut":
+            return "mvu_vvu_lut"
         else:
-            if is_versal:
-                return "mvu_8sx9_dsp58"
+            act_width = self.get_input_datatype(0).bitwidth()
+            weight_width = self.get_input_datatype(1).bitwidth()
+            is_versal = (
+                fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
+                or fpgapart[0:5] == "xqrvc"
+            )
+            if act_width == 4 and weight_width == 4:
+                return "mvu_4sx4u"
             else:
-                return "mvu_8sx8u_dsp48"
+                if is_versal:
+                    return "mvu_vvu_8sx9_dsp58"
+                else:
+                    return "mvu_8sx8u_dsp48"
 
     def generate_hdl(self, model, fpgapart, clk):
         # Generate params as part of IP preparation
@@ -1023,9 +1031,11 @@ def generate_hdl(self, model, fpgapart, clk):
         self.set_nodeattr("ip_path", code_gen_dir)
 
     def prepare_codegen_default(self, fpgapart, clk):
-        template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v"
+        template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v"
 
         code_gen_dict = {}
+        code_gen_dict["$IS_MVU$"] = [str(1)]
+        code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)]
         code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))]
         code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))]
         code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))]
@@ -1039,7 +1049,7 @@ def prepare_codegen_default(self, fpgapart, clk):
             [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
         )
         code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))]
-        code_gen_dict["$MVU_IMPL_STYLE$"] = [self._resolve_impl_style(fpgapart)]
+        code_gen_dict["$FORCE_BEHAVIORAL$"] = [str(0)]
 
         return template_path, code_gen_dict
 

From b49b79a0a669caad9355e59e1ee877ca59b65d27 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 25 Sep 2023 13:47:50 +0100
Subject: [PATCH 076/112] [specialize to rtl]: fix to changed attribute name
 and added support for converting HLS-based VVU custom-ops to RTL-based
 custom-ops

---
 .../fpgadataflow/specialize_to_rtl_layers.py  | 82 ++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
index 47ed5ce863..5061282695 100644
--- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
+++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
@@ -26,6 +26,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import numpy as np
 from qonnx.transformation.base import Transformation
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.core.datatype import DataType
@@ -60,7 +61,7 @@ def apply(self, model):
         for n in graph.node:
             node_ind += 1
             if n.op_type == "MatrixVectorActivation":
-                preferred_in_rtl = getCustomOp(n).get_nodeattr("impl") == "rtl" and getCustomOp(n).get_nodeattr("resType") == "dsp"
+                preferred_in_rtl = getCustomOp(n).get_nodeattr("preferred_backend") == "rtl"
                 supported_in_rtl = self._is_rtl_variant_compatible(n)
                 if (preferred_in_rtl and supported_in_rtl):
                     mvau_input = n.input[0]
@@ -76,6 +77,7 @@ def apply(self, model):
                     pe = getCustomOp(n).get_nodeattr("PE")
                     mem_mode = getCustomOp(n).get_nodeattr("mem_mode")
                     ram_style = getCustomOp(n).get_nodeattr("ram_style")
+                    resType = getCustomOp(n).get_nodeattr("resType")
                     runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights")
 
                     new_node = helper.make_node(
@@ -93,6 +95,7 @@ def apply(self, model):
                         outputDataType=outputDataType,
                         numInputVectors=numInputVectors,
                         mem_mode=mem_mode,
+                        resType=resType,
                         name=n.name + "_rtl",
                         ram_style=ram_style,
                         runtime_writeable_weights=runtime_writeable_weights
@@ -108,4 +111,81 @@ def apply(self, model):
             model = model.transform(InferDataTypes())
             model = model.transform(GiveUniqueNodeNames())
         
+        return (model, graph_modified)
+
+class InferRTLVectorVectorActivation(Transformation):
+    """Convert (HLS-based) VectorVectorActivation layers to specialized RTL layers if supported."""
+
+    def __init__(self):
+        super().__init__()
+
+    def _is_rtl_variant_compatible(self, n):
+        no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1
+        act_width_in_range = (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8) or (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0)
+        weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8
+        folding_supported = (getCustomOp(n).get_nodeattr("Channels") % getCustomOp(n).get_nodeattr("PE") == 0) and (np.prod(getCustomOp(n).get_nodeattr("Kernel")) % getCustomOp(n).get_nodeattr("SIMD") == 0)
+        
+        if (no_activation and act_width_in_range and weight_width_in_range and folding_supported):
+            return True
+        else:
+            return False
+    
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "VectorVectorActivation":
+                preferred_in_rtl = getCustomOp(n).get_nodeattr("preferred_backend") == "rtl"
+                supported_in_rtl = self._is_rtl_variant_compatible(n)
+                if (preferred_in_rtl and supported_in_rtl):
+                    vvau_input = n.input[0]
+                    vvau_weight = n.input[1]
+                    vvau_output = n.output[0]
+                    inputDataType = getCustomOp(n).get_nodeattr("inputDataType")
+                    weightDataType = getCustomOp(n).get_nodeattr("weightDataType")
+                    outputDataType = getCustomOp(n).get_nodeattr("outputDataType")
+                    pe = getCustomOp(n).get_nodeattr("PE")
+                    simd = getCustomOp(n).get_nodeattr("SIMD")
+                    dim = getCustomOp(n).get_nodeattr("Dim")
+                    channels = getCustomOp(n).get_nodeattr("Channels")
+                    kernel = getCustomOp(n).get_nodeattr("Kernel")
+                    resType = getCustomOp(n).get_nodeattr("resType")
+                    mem_mode = getCustomOp(n).get_nodeattr("mem_mode")
+                    runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights")
+                    ram_style = getCustomOp(n).get_nodeattr("ram_style")
+                    resType = getCustomOp(n).get_nodeattr("resType")                    
+
+                    new_node = helper.make_node(
+                        "VectorVectorActivation_rtl",
+                        [vvau_input, vvau_weight],
+                        [vvau_output],
+                        domain="finn.custom_op.fpgadataflow",
+                        backend="fpgadataflow",
+                        name=n.name + "_rtl",
+                        PE=pe,
+                        SIMD=simd,
+                        Dim=dim,
+                        Channels=channels,
+                        Kernel=kernel,
+                        resType=resType,
+                        inputDataType=inputDataType,
+                        weightDataType=weightDataType,
+                        outputDataType=outputDataType,
+                        mem_mode=mem_mode,
+                        runtime_writeable_weights=runtime_writeable_weights,
+                        ram_style=ram_style
+                    )
+                    graph.node.insert(node_ind, new_node)
+                    # remove old node
+                    graph.node.remove(n)
+                    graph_modified=True
+        
+        if graph_modified:
+            model = model.transform(MinimizeAccumulatorWidth())
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+            model = model.transform(GiveUniqueNodeNames())
+        
         return (model, graph_modified)
\ No newline at end of file

From 9bdba031df228a2afbe99b8ea2fb576b678bba86 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Tue, 19 Sep 2023 15:27:28 +0100
Subject: [PATCH 077/112] Adding core for DSP48 backport.

---
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 416c12c1cc..07c44cf89a 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -4,7 +4,9 @@ module mvu_8sx8u_dsp48 #(
 	int unsigned  ACCU_WIDTH,
 	int unsigned  ACTIVATION_WIDTH,
 	int unsigned  WEIGHT_WIDTH,
-	bit FORCE_BEHAVIORAL = 0,
+
+	bit  SIGNED_ACTIVATIONS = 0,
+	bit  FORCE_BEHAVIORAL = 0,
 
 	localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH
 )(
@@ -16,8 +18,8 @@ module mvu_8sx8u_dsp48 #(
 	// Input
 	input	logic  last,
 	input	logic  zero,	// ignore current inputs and force this partial product to zero
-	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]  w,	// signed weights
-	input	logic                [SIMD-1:0][ACTIVATION_WIDTH-1:0]  a,	// unsigned activations
+	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH    -1:0]  w,	// signed weights
+	input	logic                [SIMD-1:0][ACTIVATION_WIDTH-1:0]  a,	// unsigned activations (override by SIGNED_ACTIVATIONS)
 
 	// Ouput
 	output	logic  vld,
@@ -47,7 +49,7 @@ module mvu_8sx8u_dsp48 #(
 	assign	vld = L[5];
 
 	// Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
-    localparam int unsigned  D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets
+	localparam int unsigned  D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets
 
 	localparam int unsigned  PIPE_COUNT = (PE+1)/2;
 	for(genvar  c = 0; c < PIPE_COUNT; c++) begin : genPipes
@@ -61,7 +63,7 @@ module mvu_8sx8u_dsp48 #(
 		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
 
 			// Input Lane Assembly
-			uwire [23:0]  bb = a[s];
+			uwire [23:0]  bb = { {(24-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] };
 			logic [33:0]  aa;
 			logic [26:0]  dd;
 			logic [ 1:0]  xx;

From 2cf1ef70306339b1409ed61d8e18eda243bf56ad Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 25 Sep 2023 14:48:34 +0100
Subject: [PATCH 078/112] [mvu rtl core]: added support for signed activations
 for DSP48-based MVUs

---
 finn-rtllib/mvu/mvu_4sx4u.sv   | 3 ++-
 finn-rtllib/mvu/mvu_vvu_axi.sv | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 88985312c9..706347d700 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -2,6 +2,7 @@ module mvu_4sx4u #(
 	int unsigned  PE,
 	int unsigned  SIMD,
 	int unsigned  ACCU_WIDTH,
+	bit SIGNED_ACTIVATIONS = 0,
 	bit FORCE_BEHAVIORAL = 0
 )(
 	// Global Control
@@ -57,7 +58,7 @@ module mvu_4sx4u #(
 		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
 
 			// Input Lane Assembly
-			uwire [23:0]  bb = a[s];
+			uwire [23:0]  bb = { {(20){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] };
 			logic [33:0]  aa;
 			logic [26:0]  dd;
 			logic [ 1:0]  xx[3:1];
diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 416480da79..da7e00cc55 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -182,14 +182,14 @@ module mvu_vvu_axi #(
 			.vld(ovld), .p(odat)
 		);
 	"mvu_4sx4u":
-		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
 	"mvu_8sx8u_dsp48":
 		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)

From ab8d4a8e075ac9b3ccf78d2a08907d5dcc116fdb Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 25 Sep 2023 16:17:38 +0100
Subject: [PATCH 079/112] [rtl mvu custom-op]: add upper bound to SEGMENTLEN
 equal to number of DSP58s chained together

---
 src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index c7fb855884..d0a638475a 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -966,7 +966,9 @@ def _resolve_segment_len(self, clk):
         # Insert pipeline registers in the DSP58 chain to meet target clock frequency
         # 0.741 ns seems the worst-case delay through first DSP
         # 0.605 ns seems to be (on average) delay for all subsequent DSPs
-        dsp_chain_len = np.floor((clk - 0.741) / 0.605)
+        critical_path_dsps = np.floor((clk - 0.741) / 0.605)
+        max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3)
+        dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len
         return max(1, dsp_chain_len)
 
     def _resolve_impl_style(self, fpgapart):

From 5a429fcbe14ca6177082fab472549407f47f97d6 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 13 Oct 2023 23:29:39 +0100
Subject: [PATCH 080/112] [mvu_vvu dsp58]: change weight input to 2D instead of
 3D array

---
 finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
index 6ae117e3ab..53cf71fd5f 100644
--- a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
+++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
@@ -42,7 +42,8 @@ module mvu_vvu_8sx9_dsp58 #(
     int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
 	bit FORCE_BEHAVIORAL = 0,
 
-	localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
+	localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD,
+	localparam int unsigned WEIGHT_ELEMENTS = PE*SIMD
   )
   (
     // Global Control
@@ -53,7 +54,7 @@ module mvu_vvu_8sx9_dsp58 #(
 	// Input
     input   logic last,
     input   logic zero, // ignore current inputs and force this partial product to zero
-    input   logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights
+    input   logic [WEIGHT_ELEMENTS-1:0][WEIGHT_WIDTH-1:0] w, // weights
 	input   logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations
 
 	// Ouput
@@ -164,7 +165,8 @@ module mvu_vvu_8sx9_dsp58 #(
 // synthesis translate_off
 							zero ? '1 : 						
 // synthesis translate_on							
-							w[i][3*j +: LANES_OCCUPIED];
+							//w[i][3*j +: LANES_OCCUPIED];
+							w[SIMD*i+3*j +: LANES_OCCUPIED];
 						if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1];
 					end
 				end
@@ -181,7 +183,8 @@ module mvu_vvu_8sx9_dsp58 #(
 // synthesis translate_off					
 						zero ? '1 : 
 // synthesis translate_on					
-						PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
+						//PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
+						PAD_BITS_WEIGHT == 0 ? w[SIMD*i+3*j+k] : { {PAD_BITS_WEIGHT{w[SIMD*i+3*j+k][WEIGHT_WIDTH-1]}}, w[SIMD*i+3*j+k] };
 				end : genBin
 				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
 					assign b_in_i[i][j][8*k +: 8] = 8'b0;

From a4a18bb08cef96bb52c02096d54b573b421bcd12 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 13 Oct 2023 23:30:55 +0100
Subject: [PATCH 081/112] [mvu_vvu axi]: re-wire weights appropriately for VVU
 DSP58

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index da7e00cc55..f0f75c633a 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -45,7 +45,7 @@
  *****************************************************************************/
 
 module mvu_vvu_axi #(
-	bit IS_MVU, // string type causes error in Vivado
+	bit IS_MVU,
 	parameter COMPUTE_CORE,
 	int unsigned MW,
 	int unsigned MH,
@@ -64,8 +64,8 @@ module mvu_vvu_axi #(
 	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
 	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
 	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned SF = MW/SIMD,
-	localparam int unsigned NF = MH/PE,
+	localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE),
+	localparam int unsigned NF = IS_MVU ? MH/PE : 1,
 	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
 )
 (
@@ -91,11 +91,11 @@ module mvu_vvu_axi #(
 
 //-------------------- Parameter sanity checks --------------------\\
 	initial begin
-		if (MW % SIMD != 0) begin
+		if ((MW % SIMD != 0 && IS_MVU) || (MW % (SIMD*PE) != 0 && !IS_MVU)) begin
 			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
 			$finish;
 		end
-		if (MH % PE != 0) begin
+		if (MH % PE != 0 && IS_MVU) begin
 			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
 			$finish;
 		end
@@ -137,7 +137,7 @@ module mvu_vvu_axi #(
 	uwire avld;
 	uwire ardy;
 
-	replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay (
+	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay (
 	.clk, .rst,
 	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
 	.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
@@ -154,9 +154,11 @@ module mvu_vvu_axi #(
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
 	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
 	uwire mvauin_t amvau_i;
+	uwire mvauin_weight_t wmvau_i;
 
 	if (IS_MVU) begin : genMVUInput
 		assign  amvau_i = amvau;
+		assign  wmvau_i = s_axis_weights_tdata;
 	end : genMVUInput
 	else begin : genVVUInput
 		// The input stream will have the channels interleaved for VVU when PE>1
@@ -164,11 +166,14 @@ module mvu_vvu_axi #(
 		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
 		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
 		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i)
-		localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH;
+		localparam int num_of_elements = PE*SIMD;
 		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
 			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
 									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
 									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
+			assign  wmvau_i[i*WEIGHT_WIDTH +: WEIGHT_WIDTH] = (PE > 1) ? 
+									s_axis_weights_tdata[( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD + 1) * WEIGHT_WIDTH : ( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD ) * WEIGHT_WIDTH]
+									: s_axis_weights_tdata[i*WEIGHT_WIDTH +: WEIGHT_WIDTH];
 		end : genRewire
 	end : genVVUInput
 
@@ -178,7 +183,7 @@ module mvu_vvu_axi #(
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
+			.last(alast && avld), .zero(!istb), .w(wmvau_i), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
 	"mvu_4sx4u":

From cc0737bcd00cdd6df6e3d4ff38215ac5d9eb42e6 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 13 Oct 2023 23:31:35 +0100
Subject: [PATCH 082/112] [mvu_vvu axi wrapper]: fix to IS_MVU parameter

---
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
index 9c65dbc06e..01deb23840 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -32,7 +32,7 @@
  *****************************************************************************/
 
 module $MODULE_NAME_AXI_WRAPPER$ #(
-	parameter	IS_MVU = "$IS_MVU$",
+	parameter	IS_MVU = $IS_MVU$,
 	parameter	COMPUTE_CORE = "$COMPUTE_CORE$",
 	parameter	MW = $MW$,
 	parameter	MH = $MH$,

From c0eff0b819828a5e1d1ef80815f63be0042ce742 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 13 Oct 2023 23:32:47 +0100
Subject: [PATCH 083/112] [mvu_vvu tb]: WIP -- changes to self-checker and
 shape of input data

---
 finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv | 79 +++++++++++++++++-----------
 1 file changed, 49 insertions(+), 30 deletions(-)

diff --git a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
index 82c2e8e7b0..b46fc588c9 100644
--- a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
+++ b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
@@ -35,23 +35,23 @@ module mvu_vvu_axi_tb();
 
 //-------------------- Simulation parameters --------------------\\
 	// Matrix & parallelism config
-	localparam bit IS_MVU = 1;
+	localparam bit IS_MVU = 0;
 	localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58";
-	localparam int unsigned MW = 1500;
-	localparam int unsigned MH = 256;
-	localparam int unsigned SIMD = 60;
-	localparam int unsigned PE = 16;
-	localparam int unsigned SEGMENTLEN = 2.0;
+	localparam int unsigned MW = 36;
+	localparam int unsigned MH = 1;
+	localparam int unsigned SIMD = 3;
+	localparam int unsigned PE = 4;
+	localparam int unsigned SEGMENTLEN = 1.0;
 	localparam bit FORCE_BEHAVIORAL = 1;
 	localparam bit M_REG_LUT = 1;
 	// Bit-width config
-	localparam int unsigned ACTIVATION_WIDTH = 4;
-	localparam int unsigned WEIGHT_WIDTH = 4;
-	localparam int unsigned ACCU_WIDTH = 21; // == ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW)
-	localparam bit SIGNED_ACTIVATIONS = 0;
+	localparam int unsigned ACTIVATION_WIDTH = 8;
+	localparam int unsigned WEIGHT_WIDTH = 6;
+	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
+	localparam bit SIGNED_ACTIVATIONS = 1;
 	// Simulation constants
-	localparam int unsigned NF = MH/PE;
-	localparam int unsigned SF = MW/SIMD;
+	localparam int unsigned NF = IS_MVU ? MH/PE : 1;
+	localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE);
 	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
 	localparam int unsigned ACTIVATION_WIDTH_BA = ((IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH+7)/8*8;
 	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
@@ -72,7 +72,7 @@ module mvu_vvu_axi_tb();
 
 	// Generate activations
 	typedef logic [(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-	typedef activation_t activation_vector_t[(IS_MVU ? 1 : NF)*SF];
+	typedef activation_t activation_vector_t[SF];
 
 	function activation_vector_t init_ACTIVATIONS;
 		automatic activation_vector_t res;
@@ -93,14 +93,12 @@ module mvu_vvu_axi_tb();
 		activations.dat = 'X;
 		@(posedge clk iff ap_rst_n);
 
-		for (int j=0; j<(IS_MVU ? 1 : NF); j++) begin
-			for (int i=0; i<SF; i++) begin
-				activations.dat <= ACTIVATIONS[SF*j+i];
-				do begin
-					activations.vld <= $urandom()%7 >= 0;
-					@(posedge clk);
-				end while (!(activations.vld === 1 && activations.rdy === 1));
-			end
+		for (int i=0; i<SF; i++) begin
+			activations.dat <= ACTIVATIONS[i];
+			do begin
+				activations.vld <= $urandom()%7 >= 0;
+				@(posedge clk);
+			end while (!(activations.vld === 1 && activations.rdy === 1));
 		end
 
 		activations.vld <= 0;
@@ -143,7 +141,9 @@ module mvu_vvu_axi_tb();
 	end
 
 	// Function to compute golden output
-	// a: [(IS_MVU?1:NF)*SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	// a: [SF][(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0], i.e.
+	//    MVU: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
+	//    VVU: [SF][PE*SIMD-1:0][ACTIVATION_WIDTH-1:0]
 	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
 	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
 	typedef output_t output_vector_t [NF];
@@ -156,14 +156,33 @@ module mvu_vvu_axi_tb();
 
 	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
 		automatic output_vector_t res = '{default: 0};
-		for (int j = 0; j<MH; j++) begin
-			for (int i = 0; i<MW; i++) begin
-				if (SIGNED_ACTIVATIONS)
-					res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
-											   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed(a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]) : $signed(a[j/PE*SF+i/SIMD][i%SIMD]) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-				else
-					res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
-											   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]}) : $signed({1'b0, a[j/PE+SF+i/SIMD][i%SIMD]}) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
+		// for (int j = 0; j<MH; j++) begin
+		// 	for (int i = 0; i<MW; i++) begin
+		// 		if (SIGNED_ACTIVATIONS)
+		// 			res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
+		// 									   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed(a[i/SIMD/PE][i % (SIMD*PE)]) : $signed(a[i/SIMD/PE][(i)%(SIMD*PE)]) ) * $signed(w[0][i/SIMD/PE][i/PE][i%SIMD]);
+		// 		else
+		// 			res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
+		// 									   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[i/SIMD/PE][i % (SIMD*PE)]}) : $signed({1'b0, a[i/SIMD/PE][i%(SIMD*PE)]}) ) * $signed(w[0][i/SIMD][0][i%SIMD]);
+		// 	end
+		// end
+		// The input stream will have the channels interleaved for VVU when PE>1
+		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
+		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
+		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
+		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i)
+		for (int i = 0; i < NF; i++) begin
+			for (int j = 0; j < SF; j++) begin
+				for (int k = 0; k < PE; k++) begin
+					for (int l = 0; l < SIMD; l++) begin
+						if (SIGNED_ACTIVATIONS)
+							res[i][k] = IS_MVU ? $signed(res[i][k]) + $signed(a[j][l]) * $signed(w[i][j][k][l]) :
+												 $signed(res[i][k]) + $signed(a[j][k + l*PE]) * $signed(w[i][j][k][l]);
+						else
+							res[i][k] = IS_MVU ? $signed(res[i][k]) + $signed({1'b0, a[j][l]}) * $signed(w[i][j][k][l]) :
+												 $signed(res[i][k]) + $signed({1'b0, a[j][k + l*PE]}) * $signed(w[i][j][k][l]);
+					end
+				end
 			end
 		end
 		return res;

From cf7f4946dc44f264de665e8a23893bd858277796 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 1 Nov 2023 15:20:07 +0000
Subject: [PATCH 084/112] [mvu vvu axi]: minor bugfixes to enable VVU

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index f0f75c633a..ddedec1e8a 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -64,7 +64,7 @@ module mvu_vvu_axi #(
 	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
 	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
 	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE),
+	localparam int unsigned SF = MW/SIMD,
 	localparam int unsigned NF = IS_MVU ? MH/PE : 1,
 	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
 )
@@ -91,11 +91,11 @@ module mvu_vvu_axi #(
 
 //-------------------- Parameter sanity checks --------------------\\
 	initial begin
-		if ((MW % SIMD != 0 && IS_MVU) || (MW % (SIMD*PE) != 0 && !IS_MVU)) begin
+		if (MW % SIMD != 0) begin
 			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
 			$finish;
 		end
-		if (MH % PE != 0 && IS_MVU) begin
+		if (MH % PE != 0) begin
 			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
 			$finish;
 		end
@@ -152,13 +152,10 @@ module mvu_vvu_axi #(
 //-------------------- Core MVU/VVU --------------------\\
 	uwire ovld;
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
-	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
 	uwire mvauin_t amvau_i;
-	uwire mvauin_weight_t wmvau_i;
 
 	if (IS_MVU) begin : genMVUInput
 		assign  amvau_i = amvau;
-		assign  wmvau_i = s_axis_weights_tdata;
 	end : genMVUInput
 	else begin : genVVUInput
 		// The input stream will have the channels interleaved for VVU when PE>1
@@ -169,11 +166,8 @@ module mvu_vvu_axi #(
 		localparam int num_of_elements = PE*SIMD;
 		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
 			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
-									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
+									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH -1: (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
 									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
-			assign  wmvau_i[i*WEIGHT_WIDTH +: WEIGHT_WIDTH] = (PE > 1) ? 
-									s_axis_weights_tdata[( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD + 1) * WEIGHT_WIDTH : ( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD ) * WEIGHT_WIDTH]
-									: s_axis_weights_tdata[i*WEIGHT_WIDTH +: WEIGHT_WIDTH];
 		end : genRewire
 	end : genVVUInput
 
@@ -183,7 +177,7 @@ module mvu_vvu_axi #(
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
 			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(wmvau_i), .a(amvau_i),
+			.last(alast && avld), .zero(!istb), .w(s_axis_weights_tdata), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
 	"mvu_4sx4u":

From 5ffc221eaa07828001e423551ad05f8207178656 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 20 Nov 2023 14:35:45 +0000
Subject: [PATCH 085/112] [mvu vvu axi]: minor fix -- define mvauin_weight_t

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 1 +
 1 file changed, 1 insertion(+)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index ddedec1e8a..8eb92a93e6 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -153,6 +153,7 @@ module mvu_vvu_axi #(
 	uwire ovld;
 	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
 	uwire mvauin_t amvau_i;
+	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
 
 	if (IS_MVU) begin : genMVUInput
 		assign  amvau_i = amvau;

From 40d652ccb817295e5668ed765f8e348346584465 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 29 Nov 2023 14:02:33 +0000
Subject: [PATCH 086/112] [rtl mvu op]: minor fix to chain length estimation
 and enabled behavioral mode for rtl sim

---
 .../custom_op/fpgadataflow/matrixvectoractivation_rtl.py   | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index d0a638475a..da560d73fd 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -966,10 +966,12 @@ def _resolve_segment_len(self, clk):
         # Insert pipeline registers in the DSP58 chain to meet target clock frequency
         # 0.741 ns seems the worst-case delay through first DSP
         # 0.605 ns seems to be (on average) delay for all subsequent DSPs
-        critical_path_dsps = np.floor((clk - 0.741) / 0.605)
+        # clk >= (critical_path_dsps - 1) * 0.605 + 0.741
+        assert (clk > 0.741), "Infeasible clk target of {} ns has been set, consider lowering the targeted clock frequency!".format(clk)
+        critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1)
         max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3)
         dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len
-        return max(1, dsp_chain_len)
+        return dsp_chain_len
 
     def _resolve_impl_style(self, fpgapart):
         # Based on target device and activation/weight-width, choose the
@@ -1051,7 +1053,6 @@ def prepare_codegen_default(self, fpgapart, clk):
             [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
         )
         code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))]
-        code_gen_dict["$FORCE_BEHAVIORAL$"] = [str(0)]
 
         return template_path, code_gen_dict
 

From 6e98bac42f225e7ed8629e0cb67211e78db61d15 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Wed, 13 Dec 2023 09:36:25 +0000
Subject: [PATCH 087/112] [rtlsim]: use pyverilator util functions

---
 src/finn/custom_op/fpgadataflow/hlscustomop.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index 4fed8ed4b5..01b94c20ca 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -31,7 +31,7 @@
 import subprocess
 import warnings
 from abc import abstractmethod
-from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io
+from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io, toggle_clk
 from qonnx.core.datatype import DataType
 from qonnx.custom_op.base import CustomOp
 from qonnx.util.basic import roundup_to_integer_multiple
@@ -491,15 +491,11 @@ def exec_precompiled_singlenode_model(self):
     def reset_rtlsim(self, sim):
         """Sets reset input in pyverilator to zero, toggles the clock and set it
         back to one"""
-        sim.io.ap_rst_n = 0
-        sim.io.ap_clk = 1
-        sim.io.ap_clk = 0
-        sim.io.ap_rst_n = 1
+        reset_rtlsim(sim)
 
     def toggle_clk(self, sim):
         """Toggles the clock input in pyverilator once."""
-        sim.io.ap_clk = 1
-        sim.io.ap_clk = 0
+        toggle_clk(sim)
 
     def hls_sname(self):
         """Get the naming convention used by Vitis HLS for stream signals

From 5dd74ad1dede3bf2a0405de8c803a4adfb2e65d3 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 8 Dec 2023 17:12:42 +0000
Subject: [PATCH 088/112] [mvu vvu axi]: sign extend output tdata
 (byte-aligned)

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 8eb92a93e6..699662bd72 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -243,6 +243,6 @@ module mvu_vvu_axi #(
 	end
 
 	assign	m_axis_output_tvalid = B.vld;
-	assign	m_axis_output_tdata  = B.dat;
+	assign	m_axis_output_tdata  = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat};
 
 endmodule : mvu_vvu_axi

From b20410bfd968c27395537b60bba11849b599a33a Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 Jan 2024 14:55:56 +0000
Subject: [PATCH 089/112] [mvu core]: dsp48 convert unpacked array to packed
 array to work around limitation on max array indices in Verilator

---
 finn-rtllib/mvu/mvu_4sx4u.sv       | 4 ++--
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 706347d700..7a2af35742 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -309,7 +309,7 @@ module mvu_4sx4u #(
 			// Conclusive high part accumulation
 			if(i >= PE_REM && i < 3) begin : genHi
 				// Adder Tree across all SIMD high contributions, each from [-1:1]
-				uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
+				uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0]  tree;
 				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s][i];
 				for(genvar  n = 0; n < SIMD-1; n++) begin
 					// Sum truncated to actual maximum bit width at this node
@@ -333,7 +333,7 @@ module mvu_4sx4u #(
 			if(i >= PE_REM) begin : blkLo
 				// Adder Tree across all SIMD low contributions
 				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
-				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
+				uwire [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
 				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
 				for(genvar  n = 0; n < SIMD-1; n++) begin
 					// Sum truncated to actual maximum bit width at this node
diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index 07c44cf89a..1e6855f779 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -335,7 +335,7 @@ module mvu_8sx8u_dsp48 #(
 			if(i >= PE_REM) begin : blkLo
 				// Adder Tree across all SIMD low contributions
 				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
-				uwire [ROOT_WIDTH-1:0]  tree[2*SIMD-1];
+				uwire [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
 				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
 				for(genvar  n = 0; n < SIMD-1; n++) begin
 					// Sum truncated to actual maximum bit width at this node

From 1c2cc0c2c1d98d7cde569f65eb20873a10e1f12f Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 Jan 2024 14:57:19 +0000
Subject: [PATCH 090/112] [mvu axi]: update list of deduced parameters

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 699662bd72..dd357c94bb 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -60,13 +60,14 @@ module mvu_vvu_axi #(
 	bit M_REG_LUT = 1,
 
 	// Safely deducible parameters
-	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
-	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
-	localparam int unsigned INPUT_STREAM_WIDTH =  (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned SF = MW/SIMD,
-	localparam int unsigned NF = IS_MVU ? MH/PE : 1,
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
+	localparam int unsigned  WEIGHT_STREAM_WIDTH	= PE * SIMD * WEIGHT_WIDTH,
+	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA	= (WEIGHT_STREAM_WIDTH + 7) / 8 * 8,
+	localparam int unsigned  INPUT_STREAM_WIDTH	= (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
+	localparam int unsigned  INPUT_STREAM_WIDTH_BA	= (INPUT_STREAM_WIDTH + 7) / 8 * 8,
+	localparam int unsigned  OUTPUT_STREAM_WIDTH	= PE * ACCU_WIDTH,
+	localparam int unsigned  OUTPUT_STREAM_WIDTH_BA	= (OUTPUT_STREAM_WIDTH + 7) / 8 * 8,
+	localparam int unsigned  SF = MW / SIMD,
+	localparam int unsigned  NF = IS_MVU ? MH / PE : 1
 )
 (
 	// Global Control

From eeb3cea623865a13d8da78acb5a9c7fc621caf0e Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 Jan 2024 14:58:02 +0000
Subject: [PATCH 091/112] [mvu custom-op]: remove lut-based implementation and
 update compute core selection

---
 .../matrixvectoractivation_rtl.py             | 39 ++++++++++---------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
index da560d73fd..fcab06658c 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
@@ -191,7 +191,12 @@ def verify_node(self):
 
         if mem_mode not in ["decoupled", "external"]:
             info_messages.append(
-                "RTL-based MVAU supports only decoupled or external weights."
+                "RTL-based MVU only supports decoupled or external weights."
+            )
+
+        if self.get_nodeattr("resType") == "lut":
+            info_messages.append(
+                "RTL-based MVU only supports DSP-based implementation"
+            )
 
         return info_messages
@@ -635,7 +640,6 @@ def execute_node(self, context, graph):
         mem_mode = self.get_nodeattr("mem_mode")
         node = self.onnx_node
 
-        # TODO ensure codegen dir exists
         if mode == "cppsim":
             raise Exception(
                 "cppsim not possible for RTL MVAU, please set exec_mode to rtlsim"
@@ -801,7 +805,6 @@ def code_generation_ipi(self):
                 rtllib_dir + "mvu_4sx4u.sv",
                 rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
                 rtllib_dir + "mvu_8sx8u_dsp48.sv",
-                rtllib_dir + "mvu_vvu_lut.sv",
             ]
             for f in sourcefiles:
                 cmd.append("add_files -norecurse %s" % (f))
@@ -897,7 +900,6 @@ def code_generation_ipi(self):
                 rtllib_dir + "mvu_4sx4u.sv",
                 rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
                 rtllib_dir + "mvu_8sx8u_dsp48.sv",
-                rtllib_dir + "mvu_vvu_lut.sv",
             ]
             for f in sourcefiles:
                 cmd.append("add_files -norecurse %s" % (f))
@@ -964,8 +966,8 @@ def derive_characteristic_fxns(self, period):
 
     def _resolve_segment_len(self, clk):
         # Insert pipeline registers in the DSP58 chain to meet target clock frequency
-        # 0.741 ns seems the worst-case delay through first DSP
-        # 0.605 ns seems to be (on average) delay for all subsequent DSPs
+        # ~0.741 ns seems the worst-case delay through first DSP
+        # ~0.605 ns seems to be (on average) delay for all subsequent DSPs
         # clk >= (critical_path_dsps - 1) * 0.605 + 0.741
         assert (clk > 0.741), "Infeasible clk target of {} ns has been set, consider lowering the targeted clock frequency!".format(clk)
         critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1)
@@ -976,22 +978,23 @@ def _resolve_segment_len(self, clk):
     def _resolve_impl_style(self, fpgapart):
         # Based on target device and activation/weight-width, choose the
         # supported RTL compute core
-        if self.get_nodeattr("resType") == "lut":
-            return "mvu_vvu_lut"
+        
+        assert self.get_nodeattr("resType") != "lut", "LUT-based RTL-MVU implementation currently not supported! Please change resType for {}".format(self.onnx_node.name)
+
+        act_width = self.get_input_datatype(0).bitwidth()
+        weight_width = self.get_input_datatype(1).bitwidth()
+        is_versal = (
+            fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
+            or fpgapart[0:5] == "xqrvc"
+        )
+        
+        if is_versal:
+            return "mvu_vvu_8sx9_dsp58"
         else:
-            act_width = self.get_input_datatype(0).bitwidth()
-            weight_width = self.get_input_datatype(1).bitwidth()
-            is_versal = (
-                fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
-                or fpgapart[0:5] == "xqrvc"
-            )
             if act_width == 4 and weight_width == 4:
                 return "mvu_4sx4u"
             else:
-                if is_versal:
-                    return "mvu_vvu_8sx9_dsp58"
-                else:
-                    return "mvu_8sx8u_dsp48"
+                return "mvu_8sx8u_dsp48"
 
     def generate_hdl(self, model, fpgapart, clk):
         # Generate params as part of IP preparation

From 0813d1463a219384b4666fad2db93a4f7dee1a0f Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 8 Jan 2024 14:59:30 +0000
Subject: [PATCH 092/112] [mvu axi]: remove LUT-based compute core

---
 finn-rtllib/mvu/mvu_vvu_axi.sv |  11 +---
 finn-rtllib/mvu/mvu_vvu_lut.sv | 104 ---------------------------------
 2 files changed, 2 insertions(+), 113 deletions(-)
 delete mode 100644 finn-rtllib/mvu/mvu_vvu_lut.sv

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index dd357c94bb..a3b051c9a1 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -120,8 +120,8 @@ module mvu_vvu_axi #(
 			end
 		end
 		if (!IS_MVU) begin
-			if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin
-				$error("VVU only supported on DSP58 or LUT-based implementation");
+			if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58") begin
+				$error("VVU only supported on DSP58");
 				$finish;
 			end
 		end
@@ -195,13 +195,6 @@ module mvu_vvu_axi #(
 			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
 			.vld(ovld), .p(odat)
 		);
-	"mvu_vvu_lut":
-		mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-		.WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
-		);
 	default: initial begin
 		$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
 		$finish;
diff --git a/finn-rtllib/mvu/mvu_vvu_lut.sv b/finn-rtllib/mvu/mvu_vvu_lut.sv
deleted file mode 100644
index c100910d75..0000000000
--- a/finn-rtllib/mvu/mvu_vvu_lut.sv
+++ /dev/null
@@ -1,104 +0,0 @@
-module mvu_vvu_lut #(
-    bit IS_MVU,
-    int unsigned  PE,
-    int unsigned  SIMD,
-	int unsigned  ACCU_WIDTH,
-    int unsigned  ACTIVATION_WIDTH,
-    int unsigned  WEIGHT_WIDTH,
-    bit  SIGNED_ACTIVATIONS,
-    bit  M_REG = 1,
-
-    localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH,
-    localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD
-)(
-	// Global Control
-	input	logic  clk,
-	input	logic  rst,
-	input	logic  en,
-
-	// Input
-	input	logic  last,
-	input	logic  zero,	// ignore current inputs and force this partial product to zero
-	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]             w,	// signed weights
-	input	logic        [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0]  a,	// (un)signed activations
-
-	// Ouput
-	output	logic  vld,
-	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
-);
-
-    typedef int unsigned  leave_load_t[2*SIMD-1];
-    function leave_load_t init_leave_loads();
-        automatic leave_load_t  res;
-        for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;
-        for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];
-        return res;
-    endfunction : init_leave_loads
-
-    // Pipeline for last indicator flag
-    uwire last_i;
-    generate if (M_REG) begin
-        logic [0:1] L = '0;
-        always_ff @(posedge clk) begin
-            if(rst)       L <= '0;
-            else if (en)  L <= {last, L[0]};
-        end
-        assign  last_i = L[1];
-    end
-    else begin 
-        logic L = '0;
-        always_ff @(posedge clk) begin
-            if(rst)       L <= '0;
-            else if (en)  L <= last;
-        end
-        assign  last_i = L;
-    end
-    endgenerate
-
-    // For each PE generate
-    for (genvar  i = 0; i < PE; i++)  begin : genPE
-        // Stage #1: SIMD multipliers in parallel
-        uwire [MULT_WIDTH-1 : 0] m1 [SIMD];
-        for (genvar j = 0; j < SIMD; j++) begin : genSIMD
-            if (M_REG) begin : genMreg
-                logic [MULT_WIDTH-1 : 0] M [SIMD];
-                always_ff @(posedge clk) begin
-                    if(rst)         M[j] = '{ default : 0 };
-                    else if (en)    M[j] = zero ? 0 :
-                                            SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
-                                                                 $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); 
-                    // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication
-                end
-                assign  m1[j] = M[j];
-            end : genMreg
-            else begin : genNoMreg 
-                assign m1[j] = zero ? 0 :
-                               SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) :
-                                                    $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]);
-            end : genNoMreg
-        end : genSIMD
-
-        // Stage #2: Adder tree to reduce SIMD products
-        localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 };
-        localparam int unsigned  ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1));
-        uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
-        for(genvar s = 0; s < SIMD; s++)  assign  tree[SIMD-1+s] = $signed(m1[s]);
-        for(genvar n = 0; n < SIMD-1; n++) begin
-            // Sum truncated to actual maximum bit width at this node
-            localparam int unsigned  NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1));
-            uwire signed [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
-            assign tree[n] = s;
-        end
-
-        // Stage #3: Buffer output
-        logic [ACCU_WIDTH-1:0] P2 [PE];
-        always_ff @(posedge clk) begin
-            if(rst)         P2[i] = '{ default : 0};
-            else if (en)    P2[i] = (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]);
-        end
-
-        assign  vld = last_i;
-        assign  p[i] = P2[i];
-    end : genPE
-
-endmodule : mvu_vvu_lut

From 4892d6614b734a08315062b86ec6d5e1f1af0dc1 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 11 Jan 2024 12:02:38 +0000
Subject: [PATCH 093/112] [hls custom-op]: enable reset in sim

---
 src/finn/custom_op/fpgadataflow/hlscustomop.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index 01b94c20ca..bc59c69192 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -600,6 +600,7 @@ def rtlsim_multi_io(self, sim, io_dict):
             trace_file=trace_file,
             sname=sname,
             liveness_threshold=pyverilate_get_liveness_threshold_cycles(),
+            do_reset=True,
         )
         self.set_nodeattr("cycles_rtlsim", total_cycle_count)
 

From 44f6e0f3e70eea06408b94a31e555f0f6b9ea358 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 11 Jan 2024 12:21:00 +0000
Subject: [PATCH 094/112] [test mvu rtl]: updated test flow (DSP58 only)

---
 .../test_fpgadataflow_mvau_rtl.py             | 167 +++++++++---------
 1 file changed, 87 insertions(+), 80 deletions(-)

diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
index 3db7a718f5..1e9de44fb2 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
@@ -27,141 +27,148 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pytest
-import os
 
 import numpy as np
+import os
+import pickle
 from onnx import TensorProto, helper
-from qonnx.util.basic import (
-    qonnx_make_model,
-    gen_finn_dt_tensor
-)
-from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.core.datatype import DataType
-from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import ApplyConfig, GiveUniqueNodeNames, GiveReadableTensorNames
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
+
 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
+
+
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-from qonnx.transformation.general import ApplyConfig
-import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
-#import qonnx.core.data_layout as DataLayout
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 
 build_dir = os.environ["FINN_BUILD_DIR"]
 
-def make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt):
-    (ofm_h, ofm_w) = ofm_shape
-    ofm = helper.make_tensor_value_info(
-        "ofm",
-        TensorProto.FLOAT,
-        (1, ofm_h, ofm_w, mh)
-    )
-
-    matmul_node = helper.make_node(
-        "MatMul",
-        ["ifm", "weights"],
-        ["ofm"]
-    )
-    graph = helper.make_graph(
-        nodes=[matmul_node],
-        name="matmul_graph",
-        inputs=[ifm],
-        outputs=[ofm]
-    )
+
+def make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W):
+    matmul_node = helper.make_node("MatMul", ["ifm", "weights"], ["ofm"])
+    graph = helper.make_graph(nodes=[matmul_node], name="matmul_graph", inputs=[ifm], outputs=[ofm])
 
     model = qonnx_make_model(graph, producer_name="fclayer-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("ifm", idt)
     model.set_tensor_datatype("weights", wdt)
-    model.set_tensor_datatype("ofm", DataType["INT32"]) # At this step, the MatMul layer does not optimize the bit-width of the output datatype
+    model.set_tensor_datatype(
+        "ofm", DataType["INT32"]
+    )  # At this step, the MatMul layer does not optimize the bit-width of the output datatype
     model.set_initializer("weights", W)
-
     # model.set_tensor_layout("ifm", DataLayout.NHWC)
 
     return model
 
+
 def prepare_inputs(input_tensor):
-    return {"inp": input_tensor}
+    return {"global_in": input_tensor}
+
 
-@pytest.mark.parametrize("mh", [16])
-@pytest.mark.parametrize("mw", [32])
-@pytest.mark.parametrize("pe", [1, 4, 16])
-#@pytest.mark.parametrize("simd", [1, 30, 90])
-@pytest.mark.parametrize("simd", [1, 4, 32])
+# @pytest.mark.parametrize("mh", [36])
+# @pytest.mark.parametrize("mw", [256])
+@pytest.mark.parametrize("mh", [9])
+@pytest.mark.parametrize("mw", [36])
+# @pytest.mark.parametrize("pe", [1, 4, 9, 36])
+# @pytest.mark.parametrize("simd", [1, 4, 16, 64, 256])
+@pytest.mark.parametrize("pe", [1, 3, 9])
+@pytest.mark.parametrize("simd", [1, 3, 6, 18, 36])
 @pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
-@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]])
-#@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"])
-@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"])
-@pytest.mark.parametrize("segmentlen", [1])
+@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT8"]])
+# @pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e"])
+@pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S"])
+@pytest.mark.parametrize("clk_ns", [1.66, 4])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen):
+def test_fpgadataflow_mvau_rtl(
+    mh, mw, pe, simd, idt, wdt, part, clk_ns
+):
+    if part == "xcku3p-ffva676-1-e" and clk_ns != 1.66:
+        pytest.skip("Skip test for varying clk for devices other than Versal, since this variable doesn't change anything for this test")
+
     # Create test input vector (produced by SWG)
     ofm_shape = (5, 5)
     ofm_h, ofm_w = ofm_shape
-    ifm = helper.make_tensor_value_info(
-        "ifm",
-        TensorProto.FLOAT,
-        [1, ofm_h, ofm_w, mw]
-    )
-    weights = helper.make_tensor_value_info(
-        "weights",
-        TensorProto.FLOAT,
-        [mw, mh]
-    )
+    ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw])
+    ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh))
     W = gen_finn_dt_tensor(wdt, (mw, mh))
-    model = make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt)
+    model = make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W)
     model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
 
-    model.save(build_dir+"/matmul.onnx")
+    model.save(build_dir + "/matmul.onnx")
 
     # Create MatMul & obtain golden reference output
-    A = gen_finn_dt_tensor(model.get_tensor_datatype("ifm"), model.get_tensor_shape("ifm"))
+    A = gen_finn_dt_tensor(model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in"))
     input_dict = prepare_inputs(A)
 
-    ## Execute ONNX model
-    output_matmul = oxe.execute_onnx(model, input_dict)
+    # Execute ONNX model
+    output_matmul = oxe.execute_onnx(model, input_dict)["global_out"]
+
+    with open(build_dir + "/onnx_output.pkl", "wb") as f:
+        pickle.dump(output_matmul, f)
 
     # Create MVAU (HLS)
     model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled"))
     model = model.transform(GiveUniqueNodeNames())
-    
+
     # Apply folding (i.e. specify to use DSPs)
     folding_config = {
         "Defaults": {},
         "MatrixVectorActivation_0": {
-            "PE" : pe,
-            "SIMD" : simd,
-            "mem_mode" : "decoupled",
-            "ram_style" : "auto",
-            "resType" : "dsp",
-            "impl" : "rtl"
-        }
+            "PE": pe,
+            "SIMD": simd,
+            "mem_mode": "decoupled",
+            "ram_style": "auto",
+            "resType": "dsp",
+            "preferred_backend" : "rtl"
+        },
     }
     model = model.transform(ApplyConfig(folding_config))
-    model.save(build_dir+"/mvau_hls.onnx")
-
-    model = model.transform(SetExecMode("rtlsim"))
-    model = model.transform(PrepareIP(part, 5))
-    model = model.transform(HLSSynthIP())
-    model = model.transform(PrepareRTLSim())
-    output_mvau_hls = oxe.execute_onnx(model, input_dict)["ofm"]
+    model.save(build_dir + "/mvau_hls.onnx")
 
     # Apply convert-to-rtl step
     model = model.transform(to_rtl.InferRTLMatrixVectorActivation())
     model = model.transform(GiveUniqueNodeNames())
-    model.save(build_dir+"/mvau_rtl.onnx")
+    model.save(build_dir + "/mvau_rtl.onnx")
 
+    # Set a VCD trace file path on every node so the node-by-node rtlsim run below is traced
+    for n in model.graph.node:
+        getCustomOp(n).set_nodeattr("rtlsim_trace", build_dir + "/mvu_trace_rtl_nodebynode.vcd")
+    
     model = model.transform(SetExecMode("rtlsim"))
-    model = model.transform(PrepareIP("xcvm1802-vsvd1760-2MP-e-S", 5))
+    model = model.transform(PrepareIP(part, clk_ns))
     model = model.transform(HLSSynthIP())
     model = model.transform(PrepareRTLSim())
-    output_mvau_rtl = oxe.execute_onnx(model, input_dict)["ofm"]
+    output_mvau_rtl = oxe.execute_onnx(model, input_dict)["global_out"]
+
+    with open(build_dir + "/mvau_rtl_output.pkl", "wb") as f:
+        pickle.dump(output_mvau_rtl, f)
+
+    model.save(build_dir + "/mvau_rtl_sim.onnx")
+    assert (output_matmul == output_mvau_rtl).all(), "Output of ONNX model not matching output of node-by-node sim!"
+
+    model = model.transform(InsertAndSetFIFODepths(part, clk_ns))
+    model = model.transform(PrepareIP(part, clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(CreateStitchedIP(part, clk_ns))
 
-    model.save(build_dir+"/mvau_rtl_sim.onnx")
+    os.environ["RTLSIM_TRACE_DEPTH"] = "3"
+    model.set_metadata_prop("rtlsim_so", "")
+    model.set_metadata_prop("exec_mode", "rtlsim")
+    model.set_metadata_prop("rtlsim_trace", build_dir + "/mvu_trace_rtl_stitch.vcd")
+    model.save(build_dir + "/stitched_ip.onnx")
+    output_mvau_rtl_stitch = oxe.execute_onnx(model, input_dict)["global_out"]
 
-    assert (output_mvau_hls == output_mvau_rtl).all()
-    assert (output_mvau_hls.size > 0)
+    assert (output_matmul == output_mvau_rtl_stitch).all(), "Output of ONNX model not matching output of stitched-IP RTL model!"
\ No newline at end of file

From 9b2ccebba2c3689d6a1e55b6df027f461244d216 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 11 Jan 2024 14:43:46 +0000
Subject: [PATCH 095/112] [mvu vvu axi]: reworked flow control and backpressure
 handling by tpreusser

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 130 ++++++++++++++++-----------------
 1 file changed, 61 insertions(+), 69 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index a3b051c9a1..0168f20563 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -62,12 +62,12 @@ module mvu_vvu_axi #(
 	// Safely deducible parameters
 	localparam int unsigned  WEIGHT_STREAM_WIDTH	= PE * SIMD * WEIGHT_WIDTH,
 	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA	= (WEIGHT_STREAM_WIDTH + 7) / 8 * 8,
-	localparam int unsigned  INPUT_STREAM_WIDTH	= (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
+	localparam int unsigned  INPUT_STREAM_WIDTH	= SIMD * ACTIVATION_WIDTH,
 	localparam int unsigned  INPUT_STREAM_WIDTH_BA	= (INPUT_STREAM_WIDTH + 7) / 8 * 8,
 	localparam int unsigned  OUTPUT_STREAM_WIDTH	= PE * ACCU_WIDTH,
 	localparam int unsigned  OUTPUT_STREAM_WIDTH_BA	= (OUTPUT_STREAM_WIDTH + 7) / 8 * 8,
 	localparam int unsigned  SF = MW / SIMD,
-	localparam int unsigned  NF = IS_MVU ? MH / PE : 1
+	localparam int unsigned  NF = MH / PE
 )
 (
 	// Global Control
@@ -119,81 +119,73 @@ module mvu_vvu_axi #(
 				$finish;
 			end
 		end
-		if (!IS_MVU) begin
-			if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58") begin
-				$error("VVU only supported on DSP58");
-				$finish;
-			end
-		end
 	end
 
 	uwire clk = ap_clk;
 	uwire rst = !ap_rst_n;
 
-	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;
-
-	uwire mvauin_t amvau;
+	//- Replay to Accommodate Neuron Fold -----------------------------------
+	typedef logic [PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t;
+	uwire mvu_flatin_t amvau;
 	uwire alast;
 	uwire afin;
 	uwire avld;
 	uwire ardy;
 
-	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay (
+	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvu_flatin_t))) activation_replay (
 	.clk, .rst,
-	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
+	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)),
 	.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
 	);
 
-//-------------------- Input control --------------------\\
+	//- Unflatten inputs into structured matrices ---------------------------
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH    -1:0]  mvu_w_t;
+	typedef logic         [SIMD-1:0][ACTIVATION_WIDTH-1:0]  mvu_a_t;
+
+	uwire  mvu_w_t  mvu_w = s_axis_weights_tdata;
+	uwire  mvu_a_t  mvu_a = amvau;
+
+	//- Flow Control Bracket around Compute Core ----------------------------
 	uwire en;
 	uwire istb = avld && s_axis_weights_tvalid;
 	assign ardy = en && s_axis_weights_tvalid;
 	assign s_axis_weights_tready = en && avld;
 
-//-------------------- Core MVU/VVU --------------------\\
-	uwire ovld;
-	uwire [PE-1:0][ACCU_WIDTH-1:0] odat;
-	uwire mvauin_t amvau_i;
-	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
-
-	if (IS_MVU) begin : genMVUInput
-		assign  amvau_i = amvau;
-	end : genMVUInput
-	else begin : genVVUInput
-		// The input stream will have the channels interleaved for VVU when PE>1
-		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
-		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
-		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
-		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i)
-		localparam int num_of_elements = PE*SIMD;
-		for (genvar i=0; i<num_of_elements; i++) begin : genRewire
-			assign  amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = (PE > 1) ?
-									amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH -1: (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]
-									: amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH];
-		end : genRewire
-	end : genVVUInput
+	//- Instantiate compute core ----------------------------
+	typedef logic [PE-1:0][ACCU_WIDTH-1:0]  dsp_p_t;
+	uwire dsp_vld;
+	uwire dsp_p_t  dsp_p;
+
+	uwire dsp_clk = ap_clk;
+	uwire dsp_en = en;
+	uwire dsp_last = alast && avld;
+	uwire dsp_zero = !istb;
+	uwire mvu_w_t dsp_w = mvu_w;
+	uwire mvu_a_t dsp_a = mvu_a;
+	uwire ovld = dsp_vld;
+	uwire dsp_p_t  odat = dsp_p;
 
 	case(COMPUTE_CORE)
 	"mvu_vvu_8sx9_dsp58":
 		mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(s_axis_weights_tdata), .a(amvau_i),
-			.vld(ovld), .p(odat)
+			.clk(dsp_clk), .rst, .en(dsp_en),
+			.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+			.vld(dsp_vld), .p(dsp_p)
 		);
 	"mvu_4sx4u":
 		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
+			.clk(dsp_clk), .rst, .en(dsp_en),
+			.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+			.vld(dsp_vld), .p(dsp_p)
 		);
 	"mvu_8sx8u_dsp48":
 		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
 		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk, .rst, .en,
-			.last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i),
-			.vld(ovld), .p(odat)
+			.clk(dsp_clk), .rst, .en(dsp_en),
+			.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+			.vld(dsp_vld), .p(dsp_p)
 		);
 	default: initial begin
 		$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
@@ -202,41 +194,41 @@ module mvu_vvu_axi #(
 	endcase
 
 //-------------------- Output register slice --------------------\\
+	// Make `en` computation independent from external inputs.
+	// Drive all outputs from registers.
 	struct packed {
-		logic vld;
+		logic rdy;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} A = '{ vld: 0, default: 'x};
-
-	assign en = !A.vld || !ovld;
-
-	uwire  b_load;
-	always_ff @(posedge clk) begin
-		if(rst)		A <= '{ vld: 0, default: 'x };
-		else if(!A.vld || b_load) begin
-			A.vld <= ovld && en;
-			for(int unsigned  i = 0; i < PE; i++) begin
-				// CR-1148862:
-				// A.dat[i] <= odat[i];
-				automatic logic [ACCU_WIDTH-1:0]  v = odat[i];
-				A.dat[i] <= v[ACCU_WIDTH-1:0];
-			end
-		end
-	end
-
+	}  A = '{ rdy: 1, default: 'x };	// side-step register used when encountering backpressure
 	struct packed {
 		logic vld;
 		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	} B = '{ vld: 0, default: 'x};
+	}  B = '{ vld: 0, default: 'x };	// ultimate output register
+
+	assign	en = A.rdy;
+	uwire  b_load = !B.vld || m_axis_output_tready;
 
-	assign	b_load = !B.vld || m_axis_output_tready;
 	always_ff @(posedge clk) begin
-		if(rst)		B <= '{ vld: 0, default: 'x };
+		if(rst) begin
+			A <= '{ rdy: 1, default: 'x };
+			B <= '{ vld: 0, default: 'x };
+		end
 		else begin
-			if(b_load)	B <= '{ vld: A.vld, dat: A.dat};
+			if(A.rdy)  A.dat <= odat;
+			A.rdy <= (A.rdy && !ovld) || b_load;
+
+			if(b_load) begin
+				B <= '{
+					vld: ovld || !A.rdy,
+					dat: A.rdy? odat : A.dat
+				};
+			end
 		end
 	end
-
 	assign	m_axis_output_tvalid = B.vld;
+	// Why would we need a sign extension here potentially creating a higher signal load into the next FIFO?
+	// These extra bits should never be used. Why not 'x them out?
 	assign	m_axis_output_tdata  = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat};
 
+
 endmodule : mvu_vvu_axi

From 4ab65960c6e6acff1cbf9974704b17ab4e5446a5 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 11 Jan 2024 15:07:05 +0000
Subject: [PATCH 096/112] [hlsbackend]: update limit HLS axi streams (8k-1)

---
 src/finn/custom_op/fpgadataflow/hlsbackend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py
index 403b992a05..1b37cf138b 100644
--- a/src/finn/custom_op/fpgadataflow/hlsbackend.py
+++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py
@@ -415,5 +415,5 @@ def get_ap_int_max_w(self):
         instream = self.get_instream_width()
         outstream = self.get_outstream_width()
         ret = max([instream, outstream])
-        assert ret <= 32768, "AP_INT_MAX_W=%d is larger than allowed maximum of 32768" % ret
+        assert ret <= 8191, "AP_INT_MAX_W=%d is larger than allowed maximum of 8191" % ret
         return ret

From 72ccc83afd36aa4cfb88b8cc5cee5af75a01db69 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 26 Jan 2024 11:11:05 +0000
Subject: [PATCH 097/112] [mvau hls]: refactored MVAU_hls custom_op

---
 .../hls/matrixvectoractivation_hls.py         | 522 ++++++++++++++++++
 1 file changed, 522 insertions(+)
 create mode 100644 src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py

diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py
new file mode 100644
index 0000000000..2ad9fefc07
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py
@@ -0,0 +1,522 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import os
+import textwrap
+import warnings
+from qonnx.core.datatype import DataType
+from qonnx.util.basic import (
+    calculate_matvec_accumulator_range,
+    interleave_matrix_outer_dim_from_partitions,
+    roundup_to_integer_multiple,
+)
+
+from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    numpy_to_hls_code,
+    pack_innermost_dim_as_hex_string,
+    rtlsim_output_to_npy,
+)
+
+# ONNX i/o tensor shape assumptions for MatrixVectorActivation:
+# input 0 is the input tensor, shape (..., i_size) = (..., MW)
+# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH)
+# (optional) input 2 is the thresholds tensor, shape (o_size, n_thres)
+# output 0 is the output tensor, shape (..., o_size) = (..., MH)
+# the ... here can be any shape (representing groups of vectors)
+
+
+class MatrixVectorActivation_hls(MatrixVectorActivation, HLSBackend):
+    """Corresponds to finn-hlslib MatrixVectorActivation_Batch function."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+    
+    def get_nodeattr_types(self):
+        my_attrs = {}
+        my_attrs.update(MatrixVectorActivation.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def get_template_param_values(self):
+        """Returns the template parameter values according to input, output and weight
+        data types."""
+        ret = dict()
+        inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
+        out_hls_str = self.get_output_datatype().get_hls_datatype_str()
+        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+        # out_is_binary = self.get_output_datatype() == DataType["BINARY"]
+        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+        bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+        if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode):
+            raise Exception("True binary (non-bipolar) inputs not yet supported")
+        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+        # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"]
+        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+        # reinterpret inp/wt as bipolar if bin_xnor_mode is set
+        inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+        wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
+        # fill in TSrcI and TWeightI
+        # TODO check these with Giulio
+        # TODO handle non-bipolar binary inputs
+        if inp_is_bipolar and wt_is_bipolar:
+            ret["TSrcI"] = "Recast<XnorMul>"
+            ret["TWeightI"] = "Identity"
+        elif (not inp_is_bipolar) and wt_is_bipolar:
+            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+            ret["TWeightI"] = "Recast<Binary>"
+        elif inp_is_bipolar and (not wt_is_bipolar):
+            ret["TSrcI"] = "Recast<Binary>"
+            ret["TWeightI"] = "Identity"
+        elif (not inp_is_bipolar) and (not wt_is_bipolar):
+            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+            ret["TWeightI"] = "Identity"
+
+        # fill in TDstI
+        ret["TDstI"] = "Slice<%s>" % out_hls_str
+
+        return ret
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        mem_mode = self.get_nodeattr("mem_mode")
+        sname = self.hls_sname()
+        if mem_mode == "external":
+            intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded()))
+        if mem_mode == "decoupled":
+            # only expose axilite interface if attribute is set
+            runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
+            if runtime_writable:
+                intf_names["axilite"] = ["s_axilite"]
+        return intf_names
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"']
+        self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"']
+
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode not in ["const", "decoupled", "external"]:
+            raise Exception(
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
+            )
+        self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"']
+        if self.calc_tmem() != 0:
+            # TODO find a better way of checking for no pregenerated thresholds
+            self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
+
+    def defines(self, var):
+        # Only ipgen mode: Make sure that SIMD parameter satisfies minimum requirements.
+        if var == "ipgen":
+            SIMD = self.get_nodeattr("SIMD")
+            MW = self.get_nodeattr("MW")
+            condition = SIMD >= (MW / 1024)
+            msg = (
+                f"HLS synthesis of MatrixVectorActivation requires: "
+                f"SIMD >= MW / 1024. This is not fulfilled with: SIMD={SIMD} "
+                f"and MW={MW} for node: {self.onnx_node.name}."
+            )
+            assert condition, msg
+        mem_mode = self.get_nodeattr("mem_mode")
+        numInputVectors = list(self.get_nodeattr("numInputVectors"))
+        numReps = np.prod(numInputVectors)
+        self.code_gen_dict["$DEFINES$"] = [
+            """#define MW1 {}\n #define MH1 {}\n
+            #define SIMD1 {}\n #define PE1 {}\n #define WMEM1 {}\n
+            #define TMEM1 {}\n #define numReps {}""".format(
+                self.get_nodeattr("MW"),
+                self.get_nodeattr("MH"),
+                self.get_nodeattr("SIMD"),
+                self.get_nodeattr("PE"),
+                self.calc_wmem(),
+                self.calc_tmem(),
+                numReps,
+            )
+        ]
+        if mem_mode == "decoupled" or mem_mode == "external":
+            wdt = self.get_weight_datatype()
+            self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth()))
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        # note: the innermost dim is reversed for the input
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                npy_in,
+                self.hls_sname(),
+            )
+        )
+
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "decoupled" or mem_mode == "external":
+            wdt = self.get_weight_datatype()
+            elem_bits = wdt.bitwidth()
+            packed_bits = self.get_weightstream_width()
+            packed_hls_type = "ap_uint<%d>" % packed_bits
+            elem_hls_type = wdt.get_hls_datatype_str()
+            npy_type = "float"
+            npy_in = "%s/weights.npy" % code_gen_dir
+
+            self.code_gen_dict["$READNPYDATA$"].append(
+                'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);'
+                % (
+                    packed_hls_type,
+                    elem_hls_type,
+                    elem_bits,
+                    npy_type,
+                    npy_in,
+                    self.hls_sname(),
+                )
+            )
+
+    def strm_decl(self):
+        mem_mode = self.get_nodeattr("mem_mode")
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0_{} ("in0_{}");'.format(
+                self.get_instream_width(), self.hls_sname(), self.hls_sname()
+            )
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
+                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
+            )
+        )
+
+        if mem_mode == "decoupled" or mem_mode == "external":
+            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+                'hls::stream<ap_uint<{}>> weights_{} ("weights_{}");'.format(
+                    self.get_weightstream_width(), self.hls_sname(), self.hls_sname()
+                )
+            )
+
+    def docompute(self):
+        mem_mode = self.get_nodeattr("mem_mode")
+        map_to_hls_mult_style = {
+            "auto": "ap_resource_dflt()",
+            "lut": "ap_resource_lut()",
+            "dsp": "ap_resource_dsp()",
+        }
+        tmpl_args = self.get_template_param_values()
+        if self.calc_tmem() == 0:
+            odtype_hls_str = self.get_output_datatype().get_hls_datatype_str()
+            threshs = "PassThroughActivation<%s>()" % odtype_hls_str
+        else:
+            threshs = "threshs"
+        if mem_mode == "const":
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """Matrix_Vector_Activate_Batch<MW1, MH1, SIMD1, PE1, 1, {}, {}, {}>
+                (in0_{}, out_{}, weights, {}, numReps, {});""".format(
+                    tmpl_args["TSrcI"],
+                    tmpl_args["TDstI"],
+                    tmpl_args["TWeightI"],
+                    self.hls_sname(),
+                    self.hls_sname(),
+                    threshs,
+                    map_to_hls_mult_style[self.get_nodeattr("resType")],
+                )
+            ]
+        elif mem_mode == "decoupled" or mem_mode == "external":
+            wdt = self.get_weight_datatype()
+            if wdt == DataType["BIPOLAR"]:
+                export_wdt = DataType["BINARY"]
+            else:
+                export_wdt = wdt
+            wdtype_hls_str = export_wdt.get_hls_datatype_str()
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """Matrix_Vector_Activate_Stream_Batch<MW1, MH1, SIMD1, PE1, {}, {}, {}, {} >
+                (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format(
+                    tmpl_args["TSrcI"],
+                    tmpl_args["TDstI"],
+                    tmpl_args["TWeightI"],
+                    wdtype_hls_str,
+                    self.hls_sname(),
+                    self.hls_sname(),
+                    self.hls_sname(),
+                    threshs,
+                    map_to_hls_mult_style[self.get_nodeattr("resType")],
+                )
+            ]
+
+        else:
+            raise Exception(
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
+            )
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        shape = self.get_folded_output_shape()
+        shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
+
+        # note: the innermost dim is not reversed for the output
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                self.hls_sname(),
+                shape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        """Store the HLS top-function signature matching mem_mode in code_gen_dict."""
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "const":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(hls::stream<ap_uint<{}>> &in0_{},
+                    hls::stream<ap_uint<{}>> &out_{}
+                    )""".format(
+                    self.onnx_node.name,
+                    self.get_instream_width(),
+                    self.hls_sname(),
+                    self.get_outstream_width(),
+                    self.hls_sname(),
+                )
+            ]
+        elif mem_mode == "decoupled" or mem_mode == "external":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(
+                    hls::stream<ap_uint<{}>> &in0_{},
+                    hls::stream<ap_uint<{}>> &weights_{},
+                    hls::stream<ap_uint<{}>> &out_{}
+                    )""".format(
+                    self.onnx_node.name,
+                    self.get_instream_width(),
+                    self.hls_sname(),
+                    self.get_weightstream_width(),
+                    self.hls_sname(),
+                    self.get_outstream_width(),
+                    self.hls_sname(),
+                )
+            ]
+        else:
+            raise Exception(
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
+            )
+
+    def pragmas(self):
+        mem_mode = self.get_nodeattr("mem_mode")
+        ram_style_thresholds = self.get_nodeattr("ram_style_thresholds")
+        self.code_gen_dict["$PRAGMAS$"] = [
+            "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
+        ]
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
+        )
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
+
+        if mem_mode == "const":
+            self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"')
+            # the weight tensor is ap_uint<simd*prec> [PE][WMEM]
+            # partition for parallel access along the PE dimension (dim 1)
+            self.code_gen_dict["$PRAGMAS$"].append(
+                ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1")
+            )
+        elif mem_mode == "decoupled" or mem_mode == "external":
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname()
+            )
+
+        else:
+            raise Exception(
+                """Please set mem_mode to "const", "decoupled", or external,
+                currently no other parameter value is supported!"""
+            )
+
+        # the threshold tensor is acc_type [PE][TMEM][N_THRES]
+        # partition for parallel access along PE and N_THRES
+        # dimensions (dims 1 and 3)
+        if self.calc_tmem() != 0:
+            # TODO find a better way of checking for no pregenerated thresholds
+            self.code_gen_dict["$PRAGMAS$"].append(
+                ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1")
+            )
+            self.code_gen_dict["$PRAGMAS$"].append(
+                ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3")
+            )
+            # add resource pragma for thresholds if set
+            if ram_style_thresholds == "distributed":
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM")
+                )
+            elif ram_style_thresholds == "block":
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM")
+                )
+            elif ram_style_thresholds == "auto":
+                # no pragma needed
+                pass
+            else:
+                raise Exception("Unrecognized ram_style_thresholds value:" + ram_style_thresholds)
+
+    def get_ap_int_max_w(self):
+        # base class impl (max of inp/out stream widths)
+        max_of_io = super().get_ap_int_max_w()
+        # decoupled mode weight stream
+        weightstream = self.get_weightstream_width()
+        # single PE weight entry
+        weight_bits = self.get_weight_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        single_pe_w = simd * weight_bits
+        return max([weightstream, max_of_io, single_pe_w])
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        mem_mode = self.get_nodeattr("mem_mode")
+        node = self.onnx_node
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        # create an npy file for each input of the node (in_ind is input index)
+        in_ind = 0
+        for inputs in node.input:
+            # it is assumed that the first input of the node is the data input
+            # the second input are the weights
+            if in_ind == 0:
+                assert (
+                    str(context[inputs].dtype) == "float32"
+                ), """Input datatype is
+                not float32 as expected."""
+                expected_inp_shape = self.get_folded_input_shape()
+                reshaped_input = context[inputs].reshape(expected_inp_shape)
+                export_idt = self.get_input_datatype()
+                # make copy before saving the array
+                reshaped_input = reshaped_input.copy()
+                np.save(
+                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                    reshaped_input,
+                )
+            elif in_ind > 2:
+                raise Exception("Unexpected input found for MatrixVectorActivation")
+            in_ind += 1
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            # reinterpret binary output as bipolar where needed
+            if self.get_output_datatype() == DataType["BIPOLAR"]:
+                out = context[node.output[0]]
+                out = 2 * out - 1
+                context[node.output[0]] = out
+            assert (
+                context[node.output[0]].shape == self.get_normal_output_shape()
+            ), "cppsim did not produce expected output shape"
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            self.reset_rtlsim(sim)
+            self.toggle_clk(sim)
+            if mem_mode in ["external", "decoupled"]:
+                wnbits = self.get_weightstream_width()
+                export_wdt = self.get_weight_datatype()
+                wei = npy_to_rtlsim_input(
+                    "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits
+                )
+                num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+                io_dict = {
+                    "inputs": {"in0": inp, "weights": wei * num_w_reps},
+                    "outputs": {"out": []},
+                }
+                self.rtlsim_multi_io(sim, io_dict)
+                output = io_dict["outputs"]["out"]
+            else:
+                output = self.rtlsim(sim, inp)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            oshape = self.get_normal_output_shape()
+            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to "rtlsim" """.format(
+                    mode
+                )
+            )
\ No newline at end of file

From 7a9b82babacdbf3730e602630e03ce614f88e965 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 26 Jan 2024 11:45:12 +0000
Subject: [PATCH 098/112] [refactor]: call to base_op_type method instead of
 custom_op type

---
 src/finn/analysis/fpgadataflow/res_estimation.py           | 2 +-
 src/finn/transformation/fpgadataflow/create_stitched_ip.py | 3 ++-
 src/finn/transformation/fpgadataflow/floorplan.py          | 2 +-
 src/finn/transformation/fpgadataflow/insert_dwc.py         | 2 +-
 src/finn/transformation/fpgadataflow/insert_iodma.py       | 2 +-
 src/finn/transformation/fpgadataflow/insert_tlastmarker.py | 4 ++--
 src/finn/transformation/fpgadataflow/make_pynq_driver.py   | 2 +-
 src/finn/transformation/fpgadataflow/make_zynq_proj.py     | 2 +-
 src/finn/transformation/fpgadataflow/set_fifo_depths.py    | 6 +++---
 src/finn/transformation/fpgadataflow/set_folding.py        | 2 +-
 10 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py
index be4cf417bc..a7f220daa9 100644
--- a/src/finn/analysis/fpgadataflow/res_estimation.py
+++ b/src/finn/analysis/fpgadataflow/res_estimation.py
@@ -60,8 +60,8 @@ def res_estimation_complete(model):
     res_dict = {}
     for node in model.graph.node:
         if is_fpgadataflow_node(node) is True:
-            op_type = node.op_type
             inst = registry.getCustomOp(node)
+            op_type = inst.base_op_type()
             if op_type == "MatrixVectorActivation" or op_type == "VectorVectorActivation":
                 orig_restype = inst.get_nodeattr("resType")
                 res_dict[node.name] = []
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 1a182c7f4f..81c5848d57 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -48,12 +48,13 @@ def is_external_input(model, node, i):
     # True only if input is unconnected and has no initializer
     # Only esception is second input of FC layers when mem_mode is external
     node_inst = getCustomOp(node)
+    op_type = node_inst.base_op_type()
     producer = model.find_producer(node.input[i])
     if producer is None:
         if model.get_initializer(node.input[i]) is None:
             return True
         else:
-            if node.op_type == "MatrixVectorActivation":
+            if op_type == "MatrixVectorActivation":
                 if node_inst.get_nodeattr("mem_mode") == "external":
                     return True
     return False
diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py
index ceb2bdb5c9..56e644f2b8 100644
--- a/src/finn/transformation/fpgadataflow/floorplan.py
+++ b/src/finn/transformation/fpgadataflow/floorplan.py
@@ -150,7 +150,7 @@ def apply(self, model):
                 continue
 
             elif not (
-                node.op_type == "MatrixVectorActivation"
+                node_inst.base_op_type() == "MatrixVectorActivation"
                 and node_inst.get_nodeattr("mem_mode") is not None
                 and node_inst.get_nodeattr("mem_mode") == "external"
             ):
diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index 81cee8dae4..d0029cb630 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -88,7 +88,7 @@ def apply(self, model):
                         # - if FC and external mem, it could be connected to input 1
                         # - if concat, could be connected to any input
                         if (
-                            consumer.op_type == "MatrixVectorActivation"
+                            n1.base_op_type() == "MatrixVectorActivation"
                             and n1.get_nodeattr("mem_mode") == "external"
                         ) or (consumer.op_type == "StreamingConcat"):
                             # get input idx
diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py
index 93e3226b2a..fd546459fa 100644
--- a/src/finn/transformation/fpgadataflow/insert_iodma.py
+++ b/src/finn/transformation/fpgadataflow/insert_iodma.py
@@ -199,7 +199,7 @@ def apply(self, model):
             # attached IODMA
             fc_extw_nodes = list(
                 filter(
-                    lambda x: x.op_type in ["MatrixVectorActivation", "VectorVectorActivation"]
+                    lambda x: getCustomOp(x).base_op_type() in ["MatrixVectorActivation", "VectorVectorActivation"]
                     and getCustomOp(x).get_nodeattr("mem_mode") == "external"
                     and model.find_producer(x.input[1]) is None,
                     all_nodes,
diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
index 157df46d71..ab5142e4d8 100644
--- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
+++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
@@ -103,7 +103,7 @@ def apply(self, model):
                 #    the input is in the list of graph inputs because it has an
                 #    initializer (TODO: fix this with a clean-up transform)
                 if (
-                    first_node.op_type == "MatrixVectorActivation"
+                    getCustomOp(first_node).base_op_type() == "MatrixVectorActivation"
                     and get_by_name(first_node.attribute, "mem_mode").s.decode("UTF-8")
                     != "external"
                 ):
@@ -117,7 +117,7 @@ def apply(self, model):
                     num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1])
                     inp_idx = list(first_node.input).index(graph_in_name)
                     if inp_idx > 0:
-                        if first_node.op_type == "MatrixVectorActivation" and inp_idx == 1:
+                        if getCustomOp(first_node).base_op_type() == "MatrixVectorActivation" and inp_idx == 1:
                             stream_width = int(custom_op.get_weightstream_width())
                         elif first_node.op_type == "AddStreams_Batch" and inp_idx == 1:
                             stream_width = int(custom_op.get_instream_width())
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
index d5c2d8f2b5..e66236bf39 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -282,7 +282,7 @@ def apply(self, model):
             dataflow_model = ModelWrapper(dataflow_model_filename)
             rt_layer_ind = 0
             for node in dataflow_model.graph.node:
-                if node.op_type in ["MatrixVectorActivation", "Thresholding_Batch"]:
+                if getCustomOp(node).base_op_type() == "MatrixVectorActivation" or node.op_type == "Thresholding_Batch":
                     node_inst = getCustomOp(node)
                     is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights")
                     if is_rt_weights == 1:
diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
index 989eb62a88..193e6e8b42 100644
--- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
@@ -62,7 +62,7 @@ def collect_ip_dirs(model, ipstitch_path):
         ), """The directory that should
         contain the generated ip blocks doesn't exist."""
         ip_dirs += [ip_dir_value]
-        if node.op_type in ["MatrixVectorActivation", "Thresholding_Batch"]:
+        if getCustomOp(node).base_op_type() == "MatrixVectorActivation" or node.op_type == "Thresholding_Batch":
             if node_inst.get_nodeattr("mem_mode") == "decoupled":
                 need_memstreamer = True
     ip_dirs += [ipstitch_path + "/ip"]
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index da6099ab9a..8db8e4c549 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -173,7 +173,7 @@ def apply(self, model):
                     continue
                 if fifo_cons is None:
                     continue
-                if fifo_cons.op_type != "MatrixVectorActivation":
+                if getCustomOp(fifo_cons).base_op_type() != "MatrixVectorActivation":
                     continue
                 op_inst = getCustomOp(node)
                 depth = op_inst.get_nodeattr("depth")
@@ -280,7 +280,7 @@ def apply(self, model):
             node.set_nodeattr("inFIFODepths", ifd)
             node.set_nodeattr("outFIFODepths", ofd)
 
-            if node.onnx_node.op_type in extw_optypes:
+            if getCustomOp(node).base_op_type() in extw_optypes:
                 mmode = node.get_nodeattr("mem_mode")
                 if mmode == "external":
                     modified_fc_nodes.append(node.onnx_node.name)
@@ -421,7 +421,7 @@ def apply(self, model):
                 # (removed setting of node FIFO size attributes to 0 here)
                 # for every extw node we changed from external to decoupled,
                 # change back and reset implementation
-                if node.op_type in extw_optypes:
+                if getCustomOp(node).base_op_type() in extw_optypes:
                     if node.name in modified_fc_nodes:
                         node_inst = getCustomOp(node)
                         node_inst.set_nodeattr("mem_mode", "external")
diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py
index 4045a28e16..7b65023abc 100644
--- a/src/finn/transformation/fpgadataflow/set_folding.py
+++ b/src/finn/transformation/fpgadataflow/set_folding.py
@@ -125,7 +125,7 @@ def apply(self, model):
                 continue
             op_type = node.op_type
             node_inst = getCustomOp(node)
-            if op_type == "MatrixVectorActivation":
+            if node_inst.base_op_type() == "MatrixVectorActivation":
                 max_simd = node_inst.get_nodeattr("MW")
                 max_pe = node_inst.get_nodeattr("MH")
                 node_inst.set_nodeattr("PE", 1)

From 4556d2d8973f39279ca248a38e383e90ff042c08 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 26 Jan 2024 11:46:01 +0000
Subject: [PATCH 099/112] [hls custom-op]: add mvau_hls

---
 src/finn/custom_op/fpgadataflow/hls/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py
index 3e31c9785e..8aebcdf54f 100644
--- a/src/finn/custom_op/fpgadataflow/hls/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py
@@ -49,6 +49,7 @@
 from finn.custom_op.fpgadataflow.hls.streamingmaxpool_hls import StreamingMaxPool_hls
 from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls
 from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls
+from finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls import MatrixVectorActivation_hls
 
 custom_op = dict()
 
@@ -73,3 +74,4 @@
 custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls
 custom_op["TLastMarker_hls"] = TLastMarker_hls
 custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls
+custom_op["MatrixVectorActivation_hls"] = MatrixVectorActivation_hls
\ No newline at end of file

From 0b2fc98e7c15dac6f359ba0a3b76a61d562c18a5 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 26 Jan 2024 11:46:17 +0000
Subject: [PATCH 100/112] [hw custom-op]: refactor MVAU

---
 .../fpgadataflow/matrixvectoractivation.py    | 822 ++++++------------
 1 file changed, 274 insertions(+), 548 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
index 4f24d71ccc..fd5751ef7d 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
@@ -31,20 +31,32 @@
 import os
 import textwrap
 import warnings
+from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
+import qonnx.custom_op.general.xnorpopcount as xp
+from qonnx.custom_op.general.multithreshold import multithreshold
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
 from qonnx.util.basic import (
     calculate_matvec_accumulator_range,
     interleave_matrix_outer_dim_from_partitions,
     roundup_to_integer_multiple,
+    qonnx_make_model
 )
 
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
     numpy_to_hls_code,
     pack_innermost_dim_as_hex_string,
     rtlsim_output_to_npy,
 )
+import qonnx.core.data_layout as DataLayout
+import finn.core.onnx_exec as oxe
+from qonnx.transformation.infer_shapes import InferShapes
+import onnx.numpy_helper as np_helper
+from qonnx.transformation.general import GiveUniqueNodeNames
+
 
 # ONNX i/o tensor shape assumptions for MatrixVectorActivation:
 # input 0 is the input tensor, shape (.., i_size) = (..., MW)
@@ -54,9 +66,8 @@
 # the ... here can be any shape (representing groups of vectors)
 
 
-class MatrixVectorActivation(HLSCustomOp):
-    """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch
-    function."""
+class MatrixVectorActivation(HWCustomOp):
+    """Abstraction layer for HW implementation of MatrixVectorActivation layers."""
 
     def __init__(self, onnx_node, **kwargs):
         super().__init__(onnx_node, **kwargs)
@@ -122,12 +133,14 @@ def get_nodeattr_types(self):
             # vector through the accelerator. This will get rid of any old
             # weight data from the weight FIFOs.
             "runtime_writeable_weights": ("i", False, 0, {0, 1}),
-            # Flag to specify whether RTL-based or HLS-based implementation is preferred
-            "preferred_backend": ("s", False, "rtl", {"hls", "rtl"})
-        }
+            "preferred_impl_style" : ("s", False, "hls", {"hls", "rtl"}),
+            }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
+    def base_op_type(self):
+        return "MatrixVectorActivation"
+
     def calc_wmem(self):
         """Calculates and returns WMEM."""
         mw = self.get_nodeattr("MW")
@@ -167,6 +180,61 @@ def infer_node_datatype(self, model):
         odt = self.get_output_datatype()
         model.set_tensor_datatype(node.output[0], odt)
 
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input."""
+        # when performing FIFO insertion on an FC layer with ext weights, the ind
+        # parameter can be > 0 (referring to the weights) so handle that here
+        if ind == 0:
+            return DataType[self.get_nodeattr("inputDataType")]
+        elif ind == 1:
+            return DataType[self.get_nodeattr("weightDataType")]
+        else:
+            raise Exception("Undefined input ind for this layer type")
+
+    def get_weight_datatype(self):
+        """Returns FINN DataType of weights."""
+        return DataType[self.get_nodeattr("weightDataType")]
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output."""
+        return DataType[self.get_nodeattr("outputDataType")]
+
+    def get_instream_width(self, ind=0):
+        i_bits = self.get_input_datatype().bitwidth()
+        assert (
+            i_bits <= 9
+        ), "RTL-based MVAU only supports activations with bit-width up to 9-bits"
+        in_width = i_bits * self.get_nodeattr("SIMD")
+        return in_width
+
+    def get_weightstream_width(self):
+        """Returns weight stream width (0 unless mem_mode is decoupled or external)."""
+        if (
+            self.get_nodeattr("mem_mode") == "decoupled"
+            or self.get_nodeattr("mem_mode") == "external"
+        ):
+            pe = self.get_nodeattr("PE")
+            simd = self.get_nodeattr("SIMD")
+            wp = self.get_weight_datatype().bitwidth()
+            assert (
+                wp <= 8
+            ), "RTL-based MVAU only supports weights with bit-width up to 8-bits"
+            w_width = pe * simd * wp
+            return w_width
+        else:
+            return 0
+
+    def get_outstream_width(self, ind=0):
+        o_bits = self.get_output_datatype().bitwidth()
+        out_width = o_bits * self.get_nodeattr("PE")
+        return out_width
+
+    def get_weightstream_width_padded(self):
+        """Returns weight stream width padded to a multiple of 8. This is required
+        by the AXI Stream spec. Used in decoupled mode."""
+        weight_width = self.get_weightstream_width()
+        return roundup_to_integer_multiple(weight_width, 8)
+
     def verify_node(self):
         info_messages = []
         # verify that "backend" is set to "fpgadataflow"
@@ -387,6 +455,25 @@ def dsp_estimation(self):
         else:
             mult_dsp = 0
         return int(mult_dsp)
+# # TODO: fix DSP estimations --> depends on fpga_part
+#     def dsp_estimation(self):
+#         # multiplication
+#         # mvu_8sx9 (DSP58): ceil(SIMD/3)
+#         # mvu_4sx4u (DSP48/DSP58): ceil(PE/4)
+#         # mvu_8sx8u (DSP48): ceil(PE/2)
+#         # mvu_lut: 0
+#         P = self.get_nodeattr("PE")
+#         res_type = self.get_nodeattr("resType")
+#         Q = self.get_nodeattr("SIMD")
+#         wdt = self.get_weight_datatype()
+#         W = wdt.bitwidth()
+#         idt = self.get_input_datatype()
+#         A = idt.bitwidth()
+#         if res_type == "dsp":
+#             mult_dsp = P * Q * np.ceil((W + A) / 48)  # TODO: more accurate modelling
+#         else:
+#             mult_dsp = 0
+#         return int(mult_dsp)
 
     def get_exp_cycles(self):
         pe = self.get_nodeattr("PE")
@@ -399,6 +486,27 @@ def get_exp_cycles(self):
         exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv
         return int(exp_cycles)
 
+# # TODO: fix exp_cycles estimations --> depends on fpga_part and clk
+#     def get_exp_cycles(self):
+#         # mvu_8sx9 (DSP58):
+#         # 2 (replay_buffer) + ceil(chainlen/seglen) + 2 (MREG, PREG) + 2 (output reg slice)
+#         # + MW/SIMD * MH/PE
+#         # mvu_4sx4u (DSP48/DSP58) / mvu_8sx8u (DSP48): 
+#         # 3 (IN_REG, MREG, PREG) + 2 (replay_buffer) + 2 (output reg slice) + 1 (adder tree SIMD) + 1 (output lane)
+#         # + MW/SIMD * MH/PE
+#         # mvu_lut:
+#         # 2 (replay_buffer) + 1 OR 2 (no MREG OR MREG) + 2 (output reg slice) 
+#         # + MW/SIMD * MH/PE
+#         pe = self.get_nodeattr("PE")
+#         simd = self.get_nodeattr("SIMD")
+#         num_inp_vec = self.get_nodeattr("numInputVectors")
+#         mh = self.get_nodeattr("MH")
+#         mw = self.get_nodeattr("MW")
+#         # since mmv != 1 is not supported yet, we set mmv for now to 1
+#         mmv = 1     
+#         exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv
+#         return int(exp_cycles)
+
     def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         # when performing FIFO insertion on an FC layer with ext weights, the ind
@@ -452,17 +560,6 @@ def get_weightstream_width_padded(self):
         weight_width = self.get_weightstream_width()
         return roundup_to_integer_multiple(weight_width, 8)
 
-    def get_ap_int_max_w(self):
-        # base class impl (max of inp/out stream widths)
-        max_of_io = super().get_ap_int_max_w()
-        # decoupled mode weight stream
-        weightstream = self.get_weightstream_width()
-        # single PE weight entry
-        weight_bits = self.get_weight_datatype().bitwidth()
-        simd = self.get_nodeattr("SIMD")
-        single_pe_w = simd * weight_bits
-        return max([weightstream, max_of_io, single_pe_w])
-
     def get_folded_input_shape(self, ind=0):
         mw = self.get_nodeattr("MW")
         mh = self.get_nodeattr("MH")
@@ -507,82 +604,6 @@ def get_number_output_values(self):
         nf = np.prod(self.get_folded_output_shape()[:-1])
         return nf
 
-    def get_template_param_values(self):
-        """Returns the template parameter values according to input, output and weight
-        data types."""
-        ret = dict()
-        inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
-        out_hls_str = self.get_output_datatype().get_hls_datatype_str()
-        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
-        # out_is_binary = self.get_output_datatype() == DataType["BINARY"]
-        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
-        bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
-        if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode):
-            raise Exception("True binary (non-bipolar) inputs not yet supported")
-        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
-        # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"]
-        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
-        # reinterpret inp/wt as bipolar if bin_xnor_mode is iset
-        inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
-        wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
-        # fill in TSrcI and TWeightI
-        # TODO check these with Giulio
-        # TODO handle non-bipolar binary inputs
-        if inp_is_bipolar and wt_is_bipolar:
-            ret["TSrcI"] = "Recast<XnorMul>"
-            ret["TWeightI"] = "Identity"
-        elif (not inp_is_bipolar) and wt_is_bipolar:
-            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
-            ret["TWeightI"] = "Recast<Binary>"
-        elif inp_is_bipolar and (not wt_is_bipolar):
-            ret["TSrcI"] = "Recast<Binary>"
-            ret["TWeightI"] = "Identity"
-        elif (not inp_is_bipolar) and (not wt_is_bipolar):
-            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
-            ret["TWeightI"] = "Identity"
-
-        # fill in TDstI
-        ret["TDstI"] = "Slice<%s>" % out_hls_str
-
-        return ret
-
-    def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
-        """Convert the original numpy weight matrix orig_weight_matrix into
-        a form suitable for passing to the hlslib call:
-        * ensure MH % PE == 0 and MW % SIMD == 0
-        * for bipolar {-1,+1} weights, convert to binary {0, 1}
-        * interleave rows between PEs
-        * reshape into (1, PE, WMEM, SIMD) and return
-        """
-        mw = self.get_nodeattr("MW")
-        mh = self.get_nodeattr("MH")
-        pe = self.get_nodeattr("PE")
-        simd = self.get_nodeattr("SIMD")
-        wmem = self.calc_wmem()
-        assert orig_weight_matrix.shape == (
-            mw,
-            mh,
-        ), """Weights matrix doesn't
-        have expected shape (mw, mh)"""
-        assert mw % simd == 0, "Requirement MH divisable by SIMD is violated."
-        assert mh % pe == 0, "Requirement MH divisable by PE is violated."
-        # start by transposing the original weight matrix, since ONNX and
-        # finn-hlslib use different assumptions
-        # ONNX uses (in_features, out_features) and matmul(x, W)
-        # finn-hlslib uses (out_features, in_features) and matmul(W, x)
-        ret = orig_weight_matrix.T
-        if self.get_weight_datatype() == DataType["BIPOLAR"]:
-            # convert bipolar to binary
-            ret = (ret + 1) / 2
-        # interleave rows between PEs and reshape
-        # distribute rows between PEs
-        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
-        # create SIMD as innermost dimension and add a dummy outer dim
-        ret = ret.reshape(1, pe, wmem, simd)
-        # reverse the SIMD dimension
-        ret = np.flip(ret, axis=-1)
-        return ret
-
     def minimize_accumulator_width(self, model):
         """Minimize the accumulator bit width according to the weight values,
         input data types, and size of dot product"""
@@ -730,6 +751,43 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
         rows between PEs is not as expected (n_thres_steps)"""
         return ret.reshape(1, pe, tmem, n_thres_steps)
 
+    def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
+        """Convert the original numpy weight matrix orig_weight_matrix, of
+        shape (MW, MH), into a form suitable for passing to the hlslib call:
+        * ensure MH % PE == 0 and MW % SIMD == 0
+        * transpose to (MH, MW)
+        * for bipolar {-1,+1} weights, convert to binary {0, 1}
+        * interleave rows between PEs
+        * reshape into (1, PE, WMEM, SIMD), flip the SIMD axis and return
+        """
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        wmem = self.calc_wmem()
+        assert orig_weight_matrix.shape == (
+            mw,
+            mh,
+        ), """Weights matrix doesn't
+        have expected shape (mw, mh)"""
+        assert mw % simd == 0, "Requirement MW divisible by SIMD is violated."
+        assert mh % pe == 0, "Requirement MH divisible by PE is violated."
+        # start by transposing the original weight matrix, since ONNX and
+        # finn-hlslib use different assumptions
+        # ONNX uses (in_features, out_features) and matmul(x, W)
+        # finn-hlslib uses (out_features, in_features) and matmul(W, x)
+        ret = orig_weight_matrix.T
+        if self.get_weight_datatype() == DataType["BIPOLAR"]:
+            # convert bipolar to binary
+            ret = (ret + 1) / 2
+        # interleave rows between PEs and reshape
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        # create SIMD as innermost dimension and add a dummy outer dim
+        ret = ret.reshape(1, pe, wmem, simd)
+        # reverse the SIMD dimension
+        ret = np.flip(ret, axis=-1)
+        return ret
+
     def make_weight_file(self, weights, weight_file_mode, weight_file_name):
         """Produce a file containing given weights in appropriate format for this
         layer. This file can be used for either synthesis or run-time reconfig
@@ -907,402 +965,68 @@ def generate_params(self, model, path):
                 f_thresh.write(thresholds_hls_code)
                 f_thresh.close()
 
-    def execute_node(self, context, graph):
-        mode = self.get_nodeattr("exec_mode")
-        mem_mode = self.get_nodeattr("mem_mode")
-        node = self.onnx_node
-
-        # TODO ensure codegen dir exists
-        if mode == "cppsim":
-            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        elif mode == "rtlsim":
-            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        else:
-            raise Exception(
-                """Invalid value for attribute exec_mode! Is currently set to: {}
-            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
-                    mode
-                )
-            )
-
-        # create a npy file fore each input of the node (in_ind is input index)
-        in_ind = 0
-        for inputs in node.input:
-            # it is assumed that the first input of the node is the data input
-            # the second input are the weights
-            # the third input are the thresholds
-            if in_ind == 0:
-                assert (
-                    str(context[inputs].dtype) == "float32"
-                ), """Input datatype is
-                not float32 as expected."""
-                expected_inp_shape = self.get_folded_input_shape()
-                reshaped_input = context[inputs].reshape(expected_inp_shape)
-                if self.get_input_datatype() == DataType["BIPOLAR"]:
-                    # store bipolar activations as binary
-                    reshaped_input = (reshaped_input + 1) / 2
-                    export_idt = DataType["BINARY"]
-                else:
-                    export_idt = self.get_input_datatype()
-                # make copy before saving the array
-                reshaped_input = reshaped_input.copy()
-                np.save(
-                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
-                    reshaped_input,
-                )
-            elif in_ind > 2:
-                raise Exception("Unexpected input found for MatrixVectorActivation")
-            in_ind += 1
-
-        if mode == "cppsim":
-            # execute the precompiled model
-            super().exec_precompiled_singlenode_model()
-            # load output npy file
-            super().npy_to_dynamic_output(context)
-            # reinterpret binary output as bipolar where needed
-            if self.get_output_datatype() == DataType["BIPOLAR"]:
-                out = context[node.output[0]]
-                out = 2 * out - 1
-                context[node.output[0]] = out
-            assert (
-                context[node.output[0]].shape == self.get_normal_output_shape()
-            ), "cppsim did not produce expected output shape"
-        elif mode == "rtlsim":
-            sim = self.get_rtlsim()
-            nbits = self.get_instream_width()
-            inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
-            super().reset_rtlsim(sim)
-            super().toggle_clk(sim)
-            if mem_mode == "external" or mem_mode == "decoupled":
-                wnbits = self.get_weightstream_width()
-                export_wdt = self.get_weight_datatype()
-                # we have converted bipolar weights to binary for export,
-                # so use it as such for weight generation
-                if self.get_weight_datatype() == DataType["BIPOLAR"]:
-                    export_wdt = DataType["BINARY"]
-                wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits)
-                num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
-                io_dict = {
-                    "inputs": {"in0": inp, "weights": wei * num_w_reps},
-                    "outputs": {"out": []},
-                }
-                self.rtlsim_multi_io(sim, io_dict)
-                output = io_dict["outputs"]["out"]
-            else:
-                output = self.rtlsim(sim, inp)
-            odt = self.get_output_datatype()
-            target_bits = odt.bitwidth()
-            packed_bits = self.get_outstream_width()
-            out_npy_path = "{}/output.npy".format(code_gen_dir)
-            out_shape = self.get_folded_output_shape()
-            rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits)
-
-            # load and reshape output
-            output = np.load(out_npy_path)
-            oshape = self.get_normal_output_shape()
-            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
-            context[node.output[0]] = output
-        else:
-            raise Exception(
-                """Invalid value for attribute exec_mode! Is currently set to: {}
-            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
-                    mode
-                )
-            )
-
-    def global_includes(self):
-        self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"']
-        self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"']
-
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode not in ["const", "decoupled", "external"]:
-            raise Exception(
-                """Please set mem_mode to "const", "decoupled", or "external",
-                currently no other parameter value is supported!"""
-            )
-        self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"']
-        if self.calc_tmem() != 0:
-            # TODO find a better way of checking for no pregenerated thresholds
-            self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
-
-    def defines(self, var):
-        # Only ipgen mode: Make sure that SIMD parameter satisfies minimum requirements.
-        if var == "ipgen":
-            SIMD = self.get_nodeattr("SIMD")
-            MW = self.get_nodeattr("MW")
-            condition = SIMD >= (MW / 1024)
-            msg = (
-                f"HLS synthesis of MatrixVectorActivation requires: "
-                f"SIMD >= MW / 1024. This is not fulfilled with: SIMD={SIMD} "
-                f"and MW={MW} for node: {self.onnx_node.name}."
-            )
-            assert condition, msg
-        mem_mode = self.get_nodeattr("mem_mode")
-        numInputVectors = list(self.get_nodeattr("numInputVectors"))
-        numReps = np.prod(numInputVectors)
-        self.code_gen_dict["$DEFINES$"] = [
-            """#define MW1 {}\n #define MH1 {}\n
-            #define SIMD1 {}\n #define PE1 {}\n #define WMEM1 {}\n
-            #define TMEM1 {}\n #define numReps {}""".format(
-                self.get_nodeattr("MW"),
-                self.get_nodeattr("MH"),
-                self.get_nodeattr("SIMD"),
-                self.get_nodeattr("PE"),
-                self.calc_wmem(),
-                self.calc_tmem(),
-                numReps,
-            )
-        ]
-        if mem_mode == "decoupled" or mem_mode == "external":
-            wdt = self.get_weight_datatype()
-            self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth()))
-
-    def read_npy_data(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        dtype = self.get_input_datatype()
-        if dtype == DataType["BIPOLAR"]:
-            # use binary for bipolar storage
-            dtype = DataType["BINARY"]
-        elem_bits = dtype.bitwidth()
-        packed_bits = self.get_instream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
-        elem_hls_type = dtype.get_hls_datatype_str()
-        npy_type = "float"
-        npy_in = "%s/input_0.npy" % code_gen_dir
-        self.code_gen_dict["$READNPYDATA$"] = []
-        # note: the innermost dim is reversed for the input
-        self.code_gen_dict["$READNPYDATA$"].append(
-            'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);'
-            % (
-                packed_hls_type,
-                elem_hls_type,
-                elem_bits,
-                npy_type,
-                npy_in,
-                self.hls_sname(),
-            )
-        )
-
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "decoupled" or mem_mode == "external":
-            wdt = self.get_weight_datatype()
-            elem_bits = wdt.bitwidth()
-            packed_bits = self.get_weightstream_width()
-            packed_hls_type = "ap_uint<%d>" % packed_bits
-            elem_hls_type = wdt.get_hls_datatype_str()
-            npy_type = "float"
-            npy_in = "%s/weights.npy" % code_gen_dir
-
-            self.code_gen_dict["$READNPYDATA$"].append(
-                'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);'
-                % (
-                    packed_hls_type,
-                    elem_hls_type,
-                    elem_bits,
-                    npy_type,
-                    npy_in,
-                    self.hls_sname(),
-                )
-            )
-
-    def strm_decl(self):
-        mem_mode = self.get_nodeattr("mem_mode")
-        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> in0_{} ("in0_{}");'.format(
-                self.get_instream_width(), self.hls_sname(), self.hls_sname()
-            )
-        )
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
-                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
-            )
-        )
-
-        if mem_mode == "decoupled" or mem_mode == "external":
-            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-                'hls::stream<ap_uint<{}>> weights_{} ("weights_{}");'.format(
-                    self.get_weightstream_width(), self.hls_sname(), self.hls_sname()
-                )
-            )
+    def get_op_and_param_counts(self):
+        in_features = self.get_nodeattr("MW")
+        out_features = self.get_nodeattr("MH")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        inp_bits = self.get_input_datatype().bitwidth()
+        num_inp_vec = self.get_nodeattr("numInputVectors")
+        num_repetitions = int(np.prod(num_inp_vec))
+        mac_count = in_features * out_features * num_repetitions
+        # canonicalize op type: lowest bitwidth operand first s.t.
+        # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types
+        bw1 = min(inp_bits, weight_bits)
+        bw2 = max(inp_bits, weight_bits)
+        mac_op_type = "op_mac_%dbx%db" % (bw1, bw2)
+        weight_param_type = "param_weight_%db" % (weight_bits)
+        weight_count = in_features * out_features
+        ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
+        if self.get_nodeattr("noActivation") == 0:
+            tdt = DataType[self.get_nodeattr("accDataType")]
+            thres_bits = tdt.bitwidth()
+            thres_param_type = "param_threshold_%db" % (thres_bits)
+            thres_count = out_features
+            ret_dict[thres_param_type] = thres_count
+        return ret_dict
 
-    def docompute(self):
-        mem_mode = self.get_nodeattr("mem_mode")
-        map_to_hls_mult_style = {
-            "auto": "ap_resource_dflt()",
-            "lut": "ap_resource_lut()",
-            "dsp": "ap_resource_dsp()",
+    def derive_characteristic_fxns(self, period):
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out": []},
         }
-        tmpl_args = self.get_template_param_values()
-        if self.calc_tmem() == 0:
-            odtype_hls_str = self.get_output_datatype().get_hls_datatype_str()
-            threshs = "PassThroughActivation<%s>()" % odtype_hls_str
-        else:
-            threshs = "threshs"
-        if mem_mode == "const":
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """Matrix_Vector_Activate_Batch<MW1, MH1, SIMD1, PE1, 1, {}, {}, {}>
-                (in0_{}, out_{}, weights, {}, numReps, {});""".format(
-                    tmpl_args["TSrcI"],
-                    tmpl_args["TDstI"],
-                    tmpl_args["TWeightI"],
-                    self.hls_sname(),
-                    self.hls_sname(),
-                    threshs,
-                    map_to_hls_mult_style[self.get_nodeattr("resType")],
-                )
-            ]
-        elif mem_mode == "decoupled" or mem_mode == "external":
-            wdt = self.get_weight_datatype()
-            if wdt == DataType["BIPOLAR"]:
-                export_wdt = DataType["BINARY"]
-            else:
-                export_wdt = wdt
-            wdtype_hls_str = export_wdt.get_hls_datatype_str()
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """Matrix_Vector_Activate_Stream_Batch<MW1, MH1, SIMD1, PE1, {}, {}, {}, {} >
-                (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format(
-                    tmpl_args["TSrcI"],
-                    tmpl_args["TDstI"],
-                    tmpl_args["TWeightI"],
-                    wdtype_hls_str,
-                    self.hls_sname(),
-                    self.hls_sname(),
-                    self.hls_sname(),
-                    threshs,
-                    map_to_hls_mult_style[self.get_nodeattr("resType")],
-                )
-            ]
-
-        else:
-            raise Exception(
-                """Please set mem_mode to "const", "decoupled", or "external",
-                currently no other parameter value is supported!"""
-            )
-
-    def dataoutstrm(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        dtype = self.get_output_datatype()
-        if dtype == DataType["BIPOLAR"]:
-            # use binary for bipolar storage
-            dtype = DataType["BINARY"]
-        elem_bits = dtype.bitwidth()
-        packed_bits = self.get_outstream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
-        elem_hls_type = dtype.get_hls_datatype_str()
-        npy_type = "float"
-        npy_out = "%s/output.npy" % code_gen_dir
-        shape = self.get_folded_output_shape()
-        shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
-
-        # note: the innermost dim is not reversed for the output
-        self.code_gen_dict["$DATAOUTSTREAM$"] = [
-            'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);'
-            % (
-                packed_hls_type,
-                elem_hls_type,
-                elem_bits,
-                npy_type,
-                self.hls_sname(),
-                shape_cpp_str,
-                npy_out,
-            )
-        ]
-
-    def save_as_npy(self):
-        self.code_gen_dict["$SAVEASCNPY$"] = []
-
-    def blackboxfunction(self):
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "const":
-            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-                """void {}(hls::stream<ap_uint<{}>> &in0_{},
-                    hls::stream<ap_uint<{}>> &out_{}
-                    )""".format(
-                    self.onnx_node.name,
-                    self.get_instream_width(),
-                    self.hls_sname(),
-                    self.get_outstream_width(),
-                    self.hls_sname(),
-                )
-            ]
-        elif mem_mode == "decoupled" or mem_mode == "external":
-            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-                """void {}(
-                    hls::stream<ap_uint<{}>> &in0_{},
-                    hls::stream<ap_uint<{}>> &weights_{},
-                    hls::stream<ap_uint<{}>> &out_{}
-                    )""".format(
-                    self.onnx_node.name,
-                    self.get_instream_width(),
-                    self.hls_sname(),
-                    self.get_weightstream_width(),
-                    self.hls_sname(),
-                    self.get_outstream_width(),
-                    self.hls_sname(),
-                )
-            ]
-
-        else:
-            raise Exception(
-                """Please set mem_mode to "const" or "decoupled", currently no other
-                    parameter value is supported!"""
-            )
-
-    def pragmas(self):
         mem_mode = self.get_nodeattr("mem_mode")
-        ram_style_thresholds = self.get_nodeattr("ram_style_thresholds")
-        self.code_gen_dict["$PRAGMAS$"] = [
-            "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
-        ]
-        self.code_gen_dict["$PRAGMAS$"].append(
-            "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
-        )
-        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
-
-        if mem_mode == "const":
-            self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"')
-            # the weight tensor is ap_uint<simd*prec> [PE][WMEM]
-            # partition for parallel access along the PE dimension (dim 1)
-            self.code_gen_dict["$PRAGMAS$"].append(
-                ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1")
-            )
-        elif mem_mode == "decoupled" or mem_mode == "external":
-            self.code_gen_dict["$PRAGMAS$"].append(
-                "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname()
-            )
+        if mem_mode in ["decoupled", "external"]:
+            n_weight_inps = self.calc_wmem()
+            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+            io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)]
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
 
+    def execute_node(self, context, graph):
+        node = self.onnx_node
+        in_act = context[node.input[0]]
+        mvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0]
+        mvau_w = np_helper.to_array(mvau_w_init)
+        # Matrix multiplication
+        if self.get_nodeattr("binaryXnorMode"):
+            # Note: activations/weights are expected to be binary here (by design, as produced by the transformation that infers this operation mode)
+            result = xp.xnorpopcountmatmul(in_act, mvau_w)
+        elif (self.get_nodeattr("inputDataType") == "BIPOLAR" and self.get_nodeattr("weightDataType") == "BIPOLAR"):
+            result = xp.xnorpopcountmatmul((in_act+1)/2, (mvau_w+1)/2)
         else:
-            raise Exception(
-                """Please set mem_mode to "const", "decoupled", or external,
-                currently no other parameter value is supported!"""
-            )
-
-        # the threshold tensor is acc_type [PE][TMEM][N_THRES]
-        # partition for parallel access along PE and N_THRES
-        # dimensions (dims 1 and 3)
-        if self.calc_tmem() != 0:
-            # TODO find a better way of checking for no pregenerated thresholds
-            self.code_gen_dict["$PRAGMAS$"].append(
-                ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1")
-            )
-            self.code_gen_dict["$PRAGMAS$"].append(
-                ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3")
-            )
-            # add resource pragma for thresholds if set
-            if ram_style_thresholds == "distributed":
-                self.code_gen_dict["$PRAGMAS$"].append(
-                    ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM")
-                )
-            elif ram_style_thresholds == "block":
-                self.code_gen_dict["$PRAGMAS$"].append(
-                    ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM")
-                )
-            elif ram_style_thresholds == "auto":
-                # no pragma needed
-                pass
-            else:
-                raise Exception("Unrecognized ram_style_thresholds value:" + ram_style_thresholds)
+            result = np.matmul(in_act, mvau_w)
+        # Thresholding if noActivation==0
+        if self.get_nodeattr("noActivation") == 0:
+            mvau_thr_init = [x for x in graph.initializer if x.name == node.input[2]][0]
+            mvau_thr = np_helper.to_array(mvau_thr_init)
+            odt_is_bipolar = self.get_nodeattr("outputDataType") == DataType["BIPOLAR"]
+            out_scale = 2 if odt_is_bipolar else 1
+            out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal")
+            result = multithreshold(result, mvau_thr, out_scale, out_bias)
+        
+        context[node.output[0]] = result
 
     def code_generation_ipi(self):
         cmd = []
@@ -1326,22 +1050,51 @@ def code_generation_ipi(self):
             cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
             cmd.append(
                 "create_bd_intf_pin -mode Master "
-                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name)
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
+                % (node_name, dout_name)
             )
             cmd.append(
                 "create_bd_intf_pin -mode Slave "
                 "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
             )
-            # instantiate the hls ip
-            cmd.append(
-                "create_bd_cell -type ip -vlnv %s /%s/%s"
-                % (self.get_nodeattr("ip_vlnv"), node_name, node_name)
-            )
+            is_rtl_op = self.__class__.__name__ == "MatrixVectorActivation_rtl"
+            if is_rtl_op:
+                # instantiate the RTL block
+                code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+                rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+                sourcefiles = [
+                    os.path.join(
+                        code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+                    ),
+                    rtllib_dir + "mvu_vvu_axi.sv",
+                    rtllib_dir + "replay_buffer.sv",
+                    rtllib_dir + "mvu_4sx4u.sv",
+                    rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
+                    rtllib_dir + "mvu_8sx8u_dsp48.sv",
+                ]
+                for f in sourcefiles:
+                    cmd.append("add_files -norecurse %s" % (f))
+                cmd.append(
+                    "create_bd_cell -type hier -reference %s /%s/%s"
+                    % (
+                        self.get_nodeattr("gen_top_module"),
+                        self.onnx_node.name,
+                        self.onnx_node.name,
+                    )
+                )
+            else:
+                # instantiate the hls ip
+                cmd.append(
+                    "create_bd_cell -type ip -vlnv %s /%s/%s"
+                    % (self.get_nodeattr("ip_vlnv"), node_name, node_name)
+                )
+
             # instantiate a streamer and connect it to the HLS IP
             strm_vlnv = "amd.com:finn:memstream:1.0"
             strm_inst = node_name + "_wstrm"
             cmd.append(
-                "create_bd_cell -type ip -vlnv %s /%s/%s" % (strm_vlnv, node_name, strm_inst)
+                "create_bd_cell -type ip -vlnv %s /%s/%s"
+                % (strm_vlnv, node_name, strm_inst)
             )
             cmd.append(
                 "set_property -dict [list "
@@ -1395,7 +1148,8 @@ def code_generation_ipi(self):
                 axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0]
                 cmd.append(
                     "create_bd_intf_pin -mode Slave "
-                    "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" % (node_name, axilite_name)
+                    "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s"
+                    % (node_name, axilite_name)
                 )
                 cmd.append(
                     "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
@@ -1406,60 +1160,32 @@ def code_generation_ipi(self):
                 cmd.append("assign_bd_address")
             cmd.append("save_bd_design")
         elif mem_mode == "const" or mem_mode == "external":
-            # base class impl sufficient for const/external modes
-            return super().code_generation_ipi()
+            if is_rtl_op and mem_mode == "external":
+                # instantiate the RTL block
+                code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+                rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+                sourcefiles = [
+                    os.path.join(
+                        code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+                    ),
+                    rtllib_dir + "mvu_vvu_axi.sv",
+                    rtllib_dir + "replay_buffer.sv",
+                    rtllib_dir + "mvu_4sx4u.sv",
+                    rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
+                    rtllib_dir + "mvu_8sx8u_dsp48.sv",
+                ]
+                for f in sourcefiles:
+                    cmd.append("add_files -norecurse %s" % (f))
+                cmd.append(
+                    "create_bd_cell -type module -reference %s %s"
+                    % (
+                        self.get_nodeattr("gen_top_module"),
+                        self.onnx_node.name,
+                    )
+                )
+            else:
+                # base class impl sufficient for const/external modes
+                return super().code_generation_ipi()
         else:
             raise Exception("Unrecognized mem_mode for MatrixVectorActivation")
-        return cmd
-
-    def get_verilog_top_module_intf_names(self):
-        intf_names = super().get_verilog_top_module_intf_names()
-        mem_mode = self.get_nodeattr("mem_mode")
-        sname = self.hls_sname()
-        if mem_mode == "external":
-            intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded()))
-        if mem_mode == "decoupled":
-            # only expose axilite interface if attribute is set
-            runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
-            if runtime_writable:
-                intf_names["axilite"] = ["s_axilite"]
-        return intf_names
-
-    def get_op_and_param_counts(self):
-        in_features = self.get_nodeattr("MW")
-        out_features = self.get_nodeattr("MH")
-        weight_bits = self.get_weight_datatype().bitwidth()
-        inp_bits = self.get_input_datatype().bitwidth()
-        num_inp_vec = self.get_nodeattr("numInputVectors")
-        num_repetitions = int(np.prod(num_inp_vec))
-        mac_count = in_features * out_features * num_repetitions
-        # cannonicalize op type: highest bitwidth operand first s.t.
-        # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types
-        bw1 = min(inp_bits, weight_bits)
-        bw2 = max(inp_bits, weight_bits)
-        mac_op_type = "op_mac_%dbx%db" % (bw1, bw2)
-        weight_param_type = "param_weight_%db" % (weight_bits)
-        weight_count = in_features * out_features
-        ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
-        if self.get_nodeattr("noActivation") == 0:
-            tdt = DataType[self.get_nodeattr("accDataType")]
-            thres_bits = tdt.bitwidth()
-            thres_param_type = "param_threshold_%db" % (thres_bits)
-            thres_count = out_features
-            ret_dict[thres_param_type] = thres_count
-        return ret_dict
-
-    def derive_characteristic_fxns(self, period):
-        n_inps = np.prod(self.get_folded_input_shape()[:-1])
-        io_dict = {
-            "inputs": {
-                "in0": [0 for i in range(n_inps)],
-            },
-            "outputs": {"out": []},
-        }
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode in ["decoupled", "external"]:
-            n_weight_inps = self.calc_wmem()
-            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
-            io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)]
-        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
+        return cmd
\ No newline at end of file

From 1a40e6a5ac5670a04d74ab893b82dab59e0538f9 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 26 Jan 2024 11:46:51 +0000
Subject: [PATCH 101/112] [VVAU hw custom-op]: add base_op_type method

---
 src/finn/custom_op/fpgadataflow/vectorvectoractivation.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
index bd5bb75f1d..891730ece3 100644
--- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
@@ -104,6 +104,9 @@ def get_nodeattr_types(self):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
+    def base_op_type(self):
+        return "VectorVectorActivation"
+
     def minimize_accumulator_width(self, model):
         """Minimize the accumulator bit width according to the weight values,
         input data types, and size of dot product"""

From 5e1ed9be433f8ced81de8dbfd1bb52f5a505a6b4 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 26 Jan 2024 11:47:27 +0000
Subject: [PATCH 102/112] [transform]: add transformation to infer MVAU hw
 custom-op

---
 .../fpgadataflow/convert_to_hw_layers.py      | 136 ++++++++++++++++++
 1 file changed, 136 insertions(+)

diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index 2b8433e59c..4ea7f9298a 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -1196,3 +1196,139 @@ def apply(self, model):
                 graph_modified = True
 
         return (model, graph_modified)
+
+class InferQuantizedMatrixVectorActivation(Transformation):
+    """Convert MatMul layers with quantized inputs and weights to
+    MatrixVectorActivation layers."""
+
+    def __init__(self, mem_mode="const"):
+        super().__init__()
+        self.mem_mode = mem_mode
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is None:
+                mm_input = n.input[0]
+                mm_weight = n.input[1]
+                mm_output = n.output[0]
+                mm_in_shape = model.get_tensor_shape(mm_input)
+                mm_out_shape = model.get_tensor_shape(mm_output)
+                idt = model.get_tensor_datatype(mm_input)
+                wdt = model.get_tensor_datatype(mm_weight)
+                if idt.is_integer() and wdt.is_integer():
+                    mm_output = n.output[0]
+                    W = model.get_initializer(mm_weight)
+                    # extract weight shape, note that ONNX and finn-hlslib
+                    # make different assumptions about dim order here
+                    # ONNX assumes W has (in, out) shape
+                    # finn-hlslib assumes W has (out, in) shape
+                    mh = int(W.shape[1])
+                    mw = int(W.shape[0])
+                    # create node with no parallelization first
+                    pe = 1
+                    simd = 1
+                    wmem = mw * mh // (pe * simd)
+                    assert mw * mh == wmem * pe * simd, (
+                        n.name
+                        + """: Requirement (MW * MH) divisible by
+                    (WMEM * PE * SIMD) is violated."""
+                    )
+                    # see if we have any following thresholds
+                    consumer = model.find_consumer(mm_output)
+                    if consumer is not None and consumer.op_type == "MultiThreshold":
+                        # TODO ensure integer thresholds?
+                        # create MVTU (i.e. including activation)
+                        mt_output = consumer.output[0]
+                        mt_out_shape = model.get_tensor_shape(mt_output)
+                        mt_thres = consumer.input[1]
+                        T = model.get_initializer(mt_thres)
+                        assert T.shape[0] == 1 or T.shape[0] == mh, (
+                            consumer.name
+                            + """: First dimension of
+                        thresholds neither 1 nor MH."""
+                        )
+                        odt = model.get_tensor_datatype(mt_output)
+                        scale = getCustomOp(consumer).get_nodeattr("out_scale")
+                        actval = getCustomOp(consumer).get_nodeattr("out_bias")
+                        assert int(actval) == actval, (
+                            consumer.name + ": out_bias must be integer for HLS conversion."
+                        )
+                        actval = int(actval)
+                        odt_is_bipolar = odt == DataType["BIPOLAR"]
+                        bipolar_ok = odt_is_bipolar and (scale == 2.0) and (actval == -1)
+                        assert scale == 1.0 or bipolar_ok, (
+                            consumer.name + ": out_scale=1 or bipolar output needed for conversion."
+                        )
+                        assert (not odt.signed()) or (actval < 0), (
+                            consumer.name + ": Signed output requres actval < 0"
+                        )
+                        model.set_tensor_shape(mm_input, mm_in_shape)
+                        model.set_tensor_shape(mt_output, mt_out_shape)
+                        if bipolar_ok:
+                            # remove bias for bipolar, since
+                            # binary->bipolar is achieved by reinterpretation
+                            actval = 0
+                        # create and insert new MatrixVectorActivation node
+                        new_node = helper.make_node(
+                            "MatrixVectorActivation",
+                            [mm_input, mm_weight, mt_thres],
+                            [mt_output],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            MW=mw,
+                            MH=mh,
+                            SIMD=simd,
+                            PE=pe,
+                            inputDataType=idt.name,
+                            weightDataType=wdt.name,
+                            outputDataType=odt.name,
+                            ActVal=actval,
+                            binaryXnorMode=0,
+                            noActivation=0,
+                            numInputVectors=list(mm_in_shape[:-1]),
+                            mem_mode=self.mem_mode,
+                            name="MatrixVectorActivation_" + n.name,
+                        )
+                        graph.node.insert(node_ind, new_node)
+                        # remove old nodes
+                        graph.node.remove(n)
+                        graph.node.remove(consumer)
+                        graph_modified = True
+                    else:
+                        # no activation, matmul only
+                        odt = model.get_tensor_datatype(mm_output)
+                        model.set_tensor_shape(mm_input, mm_in_shape)
+                        model.set_tensor_shape(mm_output, mm_out_shape)
+                        # create and insert new MatrixVectorActivation node
+                        new_node = helper.make_node(
+                            "MatrixVectorActivation",
+                            [mm_input, mm_weight],
+                            [mm_output],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            MW=mw,
+                            MH=mh,
+                            SIMD=simd,
+                            PE=pe,
+                            inputDataType=idt.name,
+                            weightDataType=wdt.name,
+                            outputDataType=odt.name,
+                            ActVal=0,
+                            binaryXnorMode=0,
+                            noActivation=1,
+                            numInputVectors=list(mm_in_shape[:-1]),
+                            mem_mode=self.mem_mode,
+                            name="MatrixVectorActivation_" + n.name,
+                        )
+                        graph.node.insert(node_ind, new_node)
+                        # remove old node
+                        graph.node.remove(n)
+                        graph_modified = True
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
\ No newline at end of file

From 63c73c2970609ccc999df1bb122501e94f606ebb Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 26 Jan 2024 12:22:52 +0000
Subject: [PATCH 103/112] removed mvu rtl code to clean up PR

---
 finn-rtllib/mvu/mvu_4sx4u.sv           | 379 ----------------------
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv     | 379 ----------------------
 finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv  | 430 -------------------------
 finn-rtllib/mvu/mvu_vvu_axi.sv         | 234 --------------
 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v  |  92 ------
 finn-rtllib/mvu/replay_buffer.sv       | 181 -----------
 finn-rtllib/mvu/tb/mvu_8sx9_tb.sv      | 165 ----------
 finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv   | 241 --------------
 finn-rtllib/mvu/tb/replay_buffer_tb.sv | 130 --------
 9 files changed, 2231 deletions(-)
 delete mode 100644 finn-rtllib/mvu/mvu_4sx4u.sv
 delete mode 100644 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
 delete mode 100644 finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
 delete mode 100644 finn-rtllib/mvu/mvu_vvu_axi.sv
 delete mode 100644 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
 delete mode 100644 finn-rtllib/mvu/replay_buffer.sv
 delete mode 100644 finn-rtllib/mvu/tb/mvu_8sx9_tb.sv
 delete mode 100644 finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
 delete mode 100644 finn-rtllib/mvu/tb/replay_buffer_tb.sv

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
deleted file mode 100644
index 7a2af35742..0000000000
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ /dev/null
@@ -1,379 +0,0 @@
-module mvu_4sx4u #(
-	int unsigned  PE,
-	int unsigned  SIMD,
-	int unsigned  ACCU_WIDTH,
-	bit SIGNED_ACTIVATIONS = 0,
-	bit FORCE_BEHAVIORAL = 0
-)(
-	// Global Control
-	input	logic  clk,
-	input	logic  rst,
-	input	logic  en,
-
-	// Input
-	input	logic  last,
-	input	logic  zero,	// ignore current inputs and force this partial product to zero
-	input	logic signed [PE-1:0][SIMD-1:0][3:0]  w,	// signed weights
-	input	logic                [SIMD-1:0][3:0]  a,	// unsigned activations
-
-	// Ouput
-	output	logic  vld,
-	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
-);
-	// for verilator always use behavioral code
-	localparam bit  BEHAVIORAL =
-`ifdef VERILATOR
-		1 ||
-`endif
-		FORCE_BEHAVIORAL;
-
-	typedef int unsigned  leave_load_t[2*SIMD-1];
-	function leave_load_t init_leave_loads();
-		automatic leave_load_t  res;
-		for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;
-		for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];
-		return  res;
-	endfunction : init_leave_loads
-
-	// Pipeline for last indicator flag
-	logic [1:5] L = '0;
-	always_ff @(posedge clk) begin
-		if(rst)      L <= '0;
-		else if(en)  L <= { last, L[1:4] };
-	end
-	assign	vld = L[5];
-
-	// Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
-	localparam int unsigned  D[4:0] = '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; // Lane offsets
-
-	localparam int unsigned  PIPE_COUNT = (PE+3)/4;
-	for(genvar  c = 0; c < PIPE_COUNT; c++) begin : genPipes
-
-		localparam int unsigned  PE_BEG = 4*c;
-		localparam int unsigned  PE_END = PE < 4*(c+1)? PE : 4*(c+1);
-		localparam int unsigned  PE_REM = 4*(c+1) - PE_END;
-
-		uwire        [57:0]  p3[SIMD];
-		uwire signed [ 1:0]  h3[SIMD][3];
-		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
-
-			// Input Lane Assembly
-			uwire [23:0]  bb = { {(20){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] };
-			logic [33:0]  aa;
-			logic [26:0]  dd;
-			logic [ 1:0]  xx[3:1];
-			if(1) begin : blkVectorize
-				uwire [3:0]  ww[PE_END - PE_BEG];
-				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
-					assign	ww[pe] = w[PE_BEG + pe][s];
-					if(pe) begin
-						if(BEHAVIORAL)  assign  xx[pe + PE_REM] = zero? 0 : ww[pe] * a[s];
-`ifndef VERILATOR
-						else begin
-							LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
-								.O6(xx[pe + PE_REM][1]),
-								.O5(xx[pe + PE_REM][0]),
-								.I5(1'b1),
-								.I4(zero),
-								.I3(ww[pe][1]),
-								.I2(a[s][1]),
-								.I1(ww[pe][0]),
-								.I0(a[s][0])
-							);
-						end
-`endif
-					end
-				end
-				always_comb begin
-					dd = '0;
-					aa = '0;
-					for(int unsigned  pe = 0; pe < PE_END - PE_BEG; pe++) begin
-						dd[D[pe + PE_REM]+:3] = ww[pe];
-						aa[D[pe + PE_REM]+ 3] = ww[pe][3];
-					end
-				end
-			end : blkVectorize
-
-			uwire [57:0]  pp;
-
-			// Note: Since the product B * AD is computed,
-			//       rst can be only applied to AD and zero only to B
-			//       with the same effect as zeroing both.
-			if (BEHAVIORAL) begin : genBehav
-				// Stage #1: Input Refine
-				logic signed [23:0]  B1  = 0;
-				always_ff @(posedge clk) begin
-					if(zero)     B1  <= 0;
-					else if(en)  B1  <= bb;
-				end
-
-				logic signed [26:0]  AD1 = 0;
-				always_ff @(posedge clk) begin
-					if(rst)      AD1 <= 0;
-					else if(en)  AD1 <= dd - aa;
-				end
-
-				// Stage #2: Multiply
-				logic signed [50:0]  M2 = 0;
-				always_ff @(posedge clk) begin
-					if(rst)      M2 <= 0;
-					else if(en)  M2 <=
-// synthesis translate off
-						(B1 === '0) || (AD1 === '0)? 0 :
-// synthesis translate on
-						B1 * AD1;
-				end
-
-				// Stage #3: Accumulate
-				logic signed [57:0]  P3 = 0;
-				always_ff @(posedge clk) begin
-					if(rst)      P3 <= 0;
-					else if(en)  P3 <= M2 + (L[3]? 0 : P3);
-				end
-
-				assign	pp = P3;
-			end : genBehav
-`ifndef VERILATOR
-			else begin : genDSP
-				DSP48E2 #(
-					// Feature Control Attributes: Data Path Selection
-					.AMULTSEL("AD"),	// Selects A input to multiplier (A, AD)
-					.A_INPUT("DIRECT"),	// Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
-					.BMULTSEL("B"),		// Selects B input to multiplier (AD, B)
-					.B_INPUT("DIRECT"),	// Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
-					.PREADDINSEL("A"),                 // Selects input to pre-adder (A, B)
-					.RND('0),                          // Rounding Constant
-					.USE_MULT("MULTIPLY"),             // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
-					.USE_SIMD("ONE48"),                // SIMD selection (FOUR12, ONE58, TWO24)
-					.USE_WIDEXOR("FALSE"),             // Use the Wide XOR function (FALSE, TRUE)
-					.XORSIMD("XOR24_48_96"),       // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
-
-					// Pattern Detector Attributes: Pattern Detection Configuration
-					.AUTORESET_PATDET("NO_RESET"),     // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
-					.AUTORESET_PRIORITY("RESET"),      // Priority of AUTORESET vs. CEP (CEP, RESET).
-					.MASK('1),                         // 58-bit mask value for pattern detect (1=ignore)
-					.PATTERN('0),                      // 58-bit pattern match for pattern detect
-					.SEL_MASK("MASK"),                 // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
-					.SEL_PATTERN("PATTERN"),           // Select pattern value (C, PATTERN)
-					.USE_PATTERN_DETECT("NO_PATDET"),  // Enable pattern detect (NO_PATDET, PATDET)
-
-					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
-					.IS_ALUMODE_INVERTED('0),				// Optional inversion for ALUMODE
-					.IS_CARRYIN_INVERTED('0),				// Optional inversion for CARRYIN
-					.IS_CLK_INVERTED('0),					// Optional inversion for CLK
-					.IS_INMODE_INVERTED('0),				// Optional inversion for INMODE
-					.IS_OPMODE_INVERTED(9'b00_010_01_01),	// Optional inversion for OPMODE
-					.IS_RSTALLCARRYIN_INVERTED('0),			// Optional inversion for RSTALLCARRYIN
-					.IS_RSTALUMODE_INVERTED('0),			// Optional inversion for RSTALUMODE
-					.IS_RSTA_INVERTED('0),					// Optional inversion for RSTA
-					.IS_RSTB_INVERTED('0),					// Optional inversion for RSTB
-					.IS_RSTCTRL_INVERTED('0),				// Optional inversion for STCONJUGATE_A
-					.IS_RSTC_INVERTED('0),					// Optional inversion for RSTC
-					.IS_RSTD_INVERTED('0),					// Optional inversion for RSTD
-					.IS_RSTINMODE_INVERTED('0),				// Optional inversion for RSTINMODE
-					.IS_RSTM_INVERTED('0),					// Optional inversion for RSTM
-					.IS_RSTP_INVERTED('0),					// Optional inversion for RSTP
-
-					// Register Control Attributes: Pipeline Register Configuration
-					.ACASCREG(0),                      // Number of pipeline stages between A/ACIN and ACOUT (0-2)
-					.ADREG(1),                         // Pipeline stages for pre-adder (0-1)
-					.ALUMODEREG(0),                    // Pipeline stages for ALUMODE (0-1)
-					.AREG(0),                          // Pipeline stages for A (0-2)
-					.BCASCREG(1),                      // Number of pipeline stages between B/BCIN and BCOUT (0-2)
-					.BREG(1),                          // Pipeline stages for B (0-2)
-					.CARRYINREG(0),                    // Pipeline stages for CARRYIN (0-1)
-					.CARRYINSELREG(0),                 // Pipeline stages for CARRYINSEL (0-1)
-					.CREG(0),                          // Pipeline stages for C (0-1)
-					.DREG(0),                          // Pipeline stages for D (0-1)
-					.INMODEREG(0),                     // Pipeline stages for INMODE (0-1)
-					.MREG(1),                          // Multiplier pipeline stages (0-1)
-					.OPMODEREG(1),                     // Pipeline stages for OPMODE (0-1)
-					.PREG(1)                          // Number of pipeline stages for P (0-1)
-				) dsp (
-					// Cascade outputs: Cascade Ports
-					.ACOUT(),			// 34-bit output: A port cascade
-					.BCOUT(),			// 24-bit output: B cascade
-					.CARRYCASCOUT(),	// 1-bit output: Cascade carry
-					.MULTSIGNOUT(),		// 1-bit output: Multiplier sign cascade
-					.PCOUT(),			// 58-bit output: Cascade output
-
-					// Control outputs: Control Inputs/Status Bits
-					.OVERFLOW(),		// 1-bit output: Overflow in add/acc
-					.PATTERNBDETECT(),	// 1-bit output: Pattern bar detect
-					.PATTERNDETECT(),	// 1-bit output: Pattern detect
-					.UNDERFLOW(),		// 1-bit output: Underflow in add/acc
-
-					// Data outputs: Data Ports
-					.CARRYOUT(),		// 4-bit output: Carry
-					.P(pp),				// 58-bit output: Primary data
-					.XOROUT(),			// 8-bit output: XOR data
-
-					// Cascade inputs: Cascade Ports
-					.ACIN('x),			// 34-bit input: A cascade data
-					.BCIN('x),			// 24-bit input: B cascade
-					.CARRYCASCIN('x),	// 1-bit input: Cascade carry
-					.MULTSIGNIN('x),	// 1-bit input: Multiplier sign cascade
-					.PCIN('x),			// 58-bit input: P cascade
-
-					// Control inputs: Control Inputs/Status Bits
-					.CLK(clk),					// 1-bit input: Clock
-					.ALUMODE(4'h0),				// 4-bit input: ALU control
-					.CARRYINSEL('0),			// 3-bit input: Carry select
-					.INMODE(5'b01100),			// 5-bit input: INMODE control
-					.OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }),	// 9-bit input: Operation mode
-
-					// Data inputs: Data Ports
-					.A(aa),						// 34-bit input: A data
-					.B(bb),						// 24-bit input: B data
-					.C('x),						// 58-bit input: C data
-					.CARRYIN('0),				// 1-bit input: Carry-in
-					.D(dd),						// 27-bit input: D data
-
-					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
-					.CEA1('0),			// 1-bit input: Clock enable for 1st stage AREG
-					.CEA2('0),			// 1-bit input: Clock enable for 2nd stage AREG
-					.CEAD(en),			// 1-bit input: Clock enable for ADREG
-					.CEALUMODE('0),		// 1-bit input: Clock enable for ALUMODE
-					.CEB1('0),			// 1-bit input: Clock enable for 1st stage BREG
-					.CEB2(en),			// 1-bit input: Clock enable for 2nd stage BREG
-					.CEC('0),			// 1-bit input: Clock enable for CREG
-					.CECARRYIN('0),		// 1-bit input: Clock enable for CARRYINREG
-					.CECTRL(en),		// 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
-					.CED('0),			// 1-bit input: Clock enable for DREG
-					.CEINMODE('0),		// 1-bit input: Clock enable for INMODEREG
-					.CEM(en),			// 1-bit input: Clock enable for MREG
-					.CEP(en),			// 1-bit input: Clock enable for PREG
-					.RSTA('0),			// 1-bit input: Reset for AREG
-					.RSTB(				// 1-bit input: Reset for BREG
-// synthesis translate_off
-						rst ||
-// synthesis translate_on
-						zero
-					),
-					.RSTC('0),			// 1-bit input: Reset for CREG
-					.RSTD(				// 1-bit input: Reset for DREG and ADREG
-// synthesis translate_off
-						zero ||
-// synthesis translate_on
-						rst
-					),
-					.RSTALLCARRYIN('0),	// 1-bit input: Reset for CARRYINREG
-					.RSTALUMODE('0),	// 1-bit input: Reset for ALUMODEREG
-					.RSTCTRL('0),		// 1-bit input: Reset for OPMODEREG and CARRYINSELREG
-					.RSTINMODE('0),		// 1-bit input: Reset for INMODE register
-					.RSTM(rst),			// 1-bit input: Reset for MREG
-					.RSTP(rst)			// 1-bit input: Reset for PREG
-				);
-			end : genDSP
-`endif
-
-			// External Canary Pipeline
-			logic [1:0]  X1[3:1] = '{ default: 0 };
-			logic [1:0]  X2[3:1] = '{ default: 0 };
-			logic [1:0]  X3[3:1] = '{ default: 0 };
-			always_ff @(posedge clk) begin
-				if(rst) begin
-					X1 <= '{ default: 0 };
-					X2 <= '{ default: 0 };
-					X3 <= '{ default: 0 };
-				end
-				else if(en) begin
-					X1 <= xx;
-					X2 <= X1;
-					foreach(X3[i]) begin
-						X3[i] <= X2[i] + (L[3]? 2'h0 : pp[D[i]+:2]);
-					end
-				end
-			end
-
-			// Derive actual cross-lane overflows
-			for(genvar  i = 0; i < 3; i++) begin
-				assign	h3[s][i] = pp[D[i+1]+:2] - X3[i+1];
-			end
-			assign	p3[s] = pp;
-
-		end : genSIMD
-
-		// Stage #4: Cross-SIMD Reduction
-
-		// Count leaves reachable from each node
-		localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop
-
-		uwire signed [ACCU_WIDTH  -1:0]  up4;
-		uwire signed [ACCU_WIDTH  -8:0]  hi4[3];
-		uwire        [$clog2(SIMD)+7:0]  lo4[3];
-		for(genvar  i = 0; i < 4; i++) begin
-			localparam int unsigned  LO_WIDTH = D[i+1] - D[i];
-			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - LO_WIDTH;
-
-			// Conclusive high part accumulation
-			if(i >= PE_REM && i < 3) begin : genHi
-				// Adder Tree across all SIMD high contributions, each from [-1:1]
-				uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0]  tree;
-				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s][i];
-				for(genvar  n = 0; n < SIMD-1; n++) begin
-					// Sum truncated to actual maximum bit width at this node
-					uwire signed [$clog2(1+LEAVE_LOAD[n]):0]  s = tree[2*n+1] + tree[2*n+2];
-					assign  tree[n] = s;
-				end
-
-				// High Sideband Accumulation
-				logic signed [HI_WIDTH-1:0]  Hi4 = 0;
-				always_ff @(posedge clk) begin
-					if(rst)      Hi4 <= 0;
-					else if(en)  Hi4 <= (L[4]? 0 : Hi4) + tree[0];
-				end
-				assign	hi4[i] = Hi4;
-			end : genHi
-			else if (i < 3) begin : genHiZero
-				assign hi4[i] = '0;
-			end : genHiZero
-
-			// Conclusive low part accumulation
-			if(i >= PE_REM) begin : blkLo
-				// Adder Tree across all SIMD low contributions
-				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
-				uwire [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
-				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
-				for(genvar  n = 0; n < SIMD-1; n++) begin
-					// Sum truncated to actual maximum bit width at this node
-					localparam int unsigned  NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1));
-					uwire [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
-					assign  tree[n] = s;
-				end
-
-				logic [ROOT_WIDTH-1:0]  Lo4 = 0;
-				always_ff @(posedge clk) begin
-					if(rst)      Lo4 <= 0;
-					else if(en)  Lo4 <= tree[0];
-				end
-
-				if(i == 3)  assign  up4 = Lo4;
-				else  assign  lo4[i] = Lo4;
-			end : blkLo
-			else begin : blkLoZero
-				assign lo4[i] = '0;
-			end : blkLoZero
-
-		end
-
-		// Stage #5: Resolve lane totals
-		logic signed [3:0][ACCU_WIDTH-1:0]  Res5 = '{ default: 0 };
-		always_ff @(posedge clk) begin
-			if(rst)  Res5 <= '{ default: 0 };
-			else if(en) begin
-				Res5[3] <= up4 - hi4[2];
-				Res5[2] <= $signed({ hi4[2], {(D[3] - D[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1];
-				Res5[1] <= $signed({ hi4[1], {(D[2] - D[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0];
-				Res5[0] <= $signed({ hi4[0], {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] });
-			end
-		end
-
-		// Output
-		for(genvar  pe = PE_BEG; pe < PE_END; pe++) begin
-			assign	p[pe] = Res5[pe - PE_BEG + PE_REM];
-		end
-
-	end : genPipes
-
-endmodule : mvu_4sx4u
diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
deleted file mode 100644
index 1e6855f779..0000000000
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ /dev/null
@@ -1,379 +0,0 @@
-module mvu_8sx8u_dsp48 #(
-	int unsigned  PE,
-	int unsigned  SIMD,
-	int unsigned  ACCU_WIDTH,
-	int unsigned  ACTIVATION_WIDTH,
-	int unsigned  WEIGHT_WIDTH,
-
-	bit  SIGNED_ACTIVATIONS = 0,
-	bit  FORCE_BEHAVIORAL = 0,
-
-	localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH
-)(
-	// Global Control
-	input	logic  clk,
-	input	logic  rst,
-	input	logic  en,
-
-	// Input
-	input	logic  last,
-	input	logic  zero,	// ignore current inputs and force this partial product to zero
-	input	logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH    -1:0]  w,	// signed weights
-	input	logic                [SIMD-1:0][ACTIVATION_WIDTH-1:0]  a,	// unsigned activations (override by SIGNED_ACTIVATIONS)
-
-	// Ouput
-	output	logic  vld,
-	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
-);
-	// for verilator always use behavioral code
-	localparam bit  BEHAVIORAL =
-`ifdef VERILATOR
-		1 ||
-`endif
-		FORCE_BEHAVIORAL;
-
-	typedef int unsigned  leave_load_t[2*SIMD-1];
-	function leave_load_t init_leave_loads();
-		automatic leave_load_t  res;
-		for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;
-		for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];
-		return  res;
-	endfunction : init_leave_loads
-
-	// Pipeline for last indicator flag
-	logic [1:5] L = '0;
-	always_ff @(posedge clk) begin
-		if(rst)      L <= '0;
-		else if(en)  L <= { last, L[1:4] };
-	end
-	assign	vld = L[5];
-
-	// Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
-	localparam int unsigned  D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets
-
-	localparam int unsigned  PIPE_COUNT = (PE+1)/2;
-	for(genvar  c = 0; c < PIPE_COUNT; c++) begin : genPipes
-
-		localparam int unsigned  PE_BEG = 2*c;
-		localparam int unsigned  PE_END = PE < 2*(c+1)? PE : 2*(c+1);
-		localparam int unsigned  PE_REM = 2*(c+1) - PE_END;
-
-		uwire        [57:0]  p3[SIMD];
-		uwire signed [ 1:0]  h3[SIMD];
-		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
-
-			// Input Lane Assembly
-			uwire [23:0]  bb = { {(24-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] };
-			logic [33:0]  aa;
-			logic [26:0]  dd;
-			logic [ 1:0]  xx;
-			if(1) begin : blkVectorize
-				uwire [WEIGHT_WIDTH-1:0]  ww[PE_END - PE_BEG];
-				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
-					assign	ww[pe] = w[PE_BEG + pe][s];
-					if(pe) begin
-						if(BEHAVIORAL)  assign  xx = zero? 0 : ww[pe] * a[s];
-`ifndef VERILATOR
-						else begin
-							LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
-								.O6(xx[1]),
-								.O5(xx[0]),
-								.I5(1'b1),
-								.I4(zero),
-								.I3(ww[pe][1]),
-								.I2(a[s][1]),
-								.I1(ww[pe][0]),
-								.I0(a[s][0])
-							);
-						end
-`endif
-					end
-				end
-				always_comb begin
-					dd = '0;
-					aa = '0;
-					for(int unsigned  pe = 0; pe < PE_END - PE_BEG; pe++) begin
-						dd[D[pe + PE_REM] +: WEIGHT_WIDTH-1] = ww[pe];
-						aa[D[pe + PE_REM] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];
-					end
-				end
-			end : blkVectorize
-
-			uwire [57:0]  pp;
-
-			// Note: Since the product B * AD is computed,
-			//       rst can be only applied to AD and zero only to B
-			//       with the same effect as zeroing both.
-			if(BEHAVIORAL) begin : genBehav
-				// Stage #1: Input Refine
-				logic signed [23:0]  B1  = 0;
-				always_ff @(posedge clk) begin
-					if(zero)     B1  <= 0;
-					else if(en)  B1  <= bb;
-				end
-
-				logic signed [26:0]  AD1 = 0;
-				always_ff @(posedge clk) begin
-					if(rst)      AD1 <= 0;
-					else if(en)  AD1 <= dd - aa;
-				end
-
-				// Stage #2: Multiply
-				logic signed [50:0]  M2 = 0;
-				always_ff @(posedge clk) begin
-					if(rst)      M2 <= 0;
-					else if(en)  M2 <=
-// synthesis translate off
-						(B1 === '0) || (AD1 === '0)? 0 :
-// synthesis translate on
-						B1 * AD1;
-				end
-
-				// Stage #3: Accumulate
-				logic signed [57:0]  P3 = 0;
-				always_ff @(posedge clk) begin
-					if(rst)      P3 <= 0;
-					else if(en)  P3 <= M2 + (L[3]? 0 : P3);
-				end
-
-				assign	pp = P3;
-			end : genBehav
-`ifndef VERILATOR
-			else begin : genDSP
-				DSP48E2 #(
-					// Feature Control Attributes: Data Path Selection
-					.AMULTSEL("AD"),	// Selects A input to multiplier (A, AD)
-					.A_INPUT("DIRECT"),	// Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
-					.BMULTSEL("B"),		// Selects B input to multiplier (AD, B)
-					.B_INPUT("DIRECT"),	// Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
-					.PREADDINSEL("A"),                 // Selects input to pre-adder (A, B)
-					.RND('0),                          // Rounding Constant
-					.USE_MULT("MULTIPLY"),             // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
-					.USE_SIMD("ONE48"),                // SIMD selection (FOUR12, ONE58, TWO24)
-					.USE_WIDEXOR("FALSE"),             // Use the Wide XOR function (FALSE, TRUE)
-					.XORSIMD("XOR24_48_96"),       // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
-
-					// Pattern Detector Attributes: Pattern Detection Configuration
-					.AUTORESET_PATDET("NO_RESET"),     // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
-					.AUTORESET_PRIORITY("RESET"),      // Priority of AUTORESET vs. CEP (CEP, RESET).
-					.MASK('1),                         // 58-bit mask value for pattern detect (1=ignore)
-					.PATTERN('0),                      // 58-bit pattern match for pattern detect
-					.SEL_MASK("MASK"),                 // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
-					.SEL_PATTERN("PATTERN"),           // Select pattern value (C, PATTERN)
-					.USE_PATTERN_DETECT("NO_PATDET"),  // Enable pattern detect (NO_PATDET, PATDET)
-
-					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
-					.IS_ALUMODE_INVERTED('0),				// Optional inversion for ALUMODE
-					.IS_CARRYIN_INVERTED('0),				// Optional inversion for CARRYIN
-					.IS_CLK_INVERTED('0),					// Optional inversion for CLK
-					.IS_INMODE_INVERTED('0),				// Optional inversion for INMODE
-					.IS_OPMODE_INVERTED(9'b00_010_01_01),	// Optional inversion for OPMODE
-					.IS_RSTALLCARRYIN_INVERTED('0),			// Optional inversion for RSTALLCARRYIN
-					.IS_RSTALUMODE_INVERTED('0),			// Optional inversion for RSTALUMODE
-					.IS_RSTA_INVERTED('0),					// Optional inversion for RSTA
-					.IS_RSTB_INVERTED('0),					// Optional inversion for RSTB
-					.IS_RSTCTRL_INVERTED('0),				// Optional inversion for STCONJUGATE_A
-					.IS_RSTC_INVERTED('0),					// Optional inversion for RSTC
-					.IS_RSTD_INVERTED('0),					// Optional inversion for RSTD
-					.IS_RSTINMODE_INVERTED('0),				// Optional inversion for RSTINMODE
-					.IS_RSTM_INVERTED('0),					// Optional inversion for RSTM
-					.IS_RSTP_INVERTED('0),					// Optional inversion for RSTP
-
-					// Register Control Attributes: Pipeline Register Configuration
-					.ACASCREG(0),                      // Number of pipeline stages between A/ACIN and ACOUT (0-2)
-					.ADREG(1),                         // Pipeline stages for pre-adder (0-1)
-					.ALUMODEREG(0),                    // Pipeline stages for ALUMODE (0-1)
-					.AREG(0),                          // Pipeline stages for A (0-2)
-					.BCASCREG(1),                      // Number of pipeline stages between B/BCIN and BCOUT (0-2)
-					.BREG(1),                          // Pipeline stages for B (0-2)
-					.CARRYINREG(0),                    // Pipeline stages for CARRYIN (0-1)
-					.CARRYINSELREG(0),                 // Pipeline stages for CARRYINSEL (0-1)
-					.CREG(0),                          // Pipeline stages for C (0-1)
-					.DREG(0),                          // Pipeline stages for D (0-1)
-					.INMODEREG(0),                     // Pipeline stages for INMODE (0-1)
-					.MREG(1),                          // Multiplier pipeline stages (0-1)
-					.OPMODEREG(1),                     // Pipeline stages for OPMODE (0-1)
-					.PREG(1)                          // Number of pipeline stages for P (0-1)
-				) dsp (
-					// Cascade outputs: Cascade Ports
-					.ACOUT(),			// 34-bit output: A port cascade
-					.BCOUT(),			// 24-bit output: B cascade
-					.CARRYCASCOUT(),	// 1-bit output: Cascade carry
-					.MULTSIGNOUT(),		// 1-bit output: Multiplier sign cascade
-					.PCOUT(),			// 58-bit output: Cascade output
-
-					// Control outputs: Control Inputs/Status Bits
-					.OVERFLOW(),		// 1-bit output: Overflow in add/acc
-					.PATTERNBDETECT(),	// 1-bit output: Pattern bar detect
-					.PATTERNDETECT(),	// 1-bit output: Pattern detect
-					.UNDERFLOW(),		// 1-bit output: Underflow in add/acc
-
-					// Data outputs: Data Ports
-					.CARRYOUT(),		// 4-bit output: Carry
-					.P(pp),				// 58-bit output: Primary data
-					.XOROUT(),			// 8-bit output: XOR data
-
-					// Cascade inputs: Cascade Ports
-					.ACIN('x),			// 34-bit input: A cascade data
-					.BCIN('x),			// 24-bit input: B cascade
-					.CARRYCASCIN('x),	// 1-bit input: Cascade carry
-					.MULTSIGNIN('x),	// 1-bit input: Multiplier sign cascade
-					.PCIN('x),			// 58-bit input: P cascade
-
-					// Control inputs: Control Inputs/Status Bits
-					.CLK(clk),					// 1-bit input: Clock
-					.ALUMODE(4'h0),				// 4-bit input: ALU control
-					.CARRYINSEL('0),			// 3-bit input: Carry select
-					.INMODE(5'b01100),			// 5-bit input: INMODE control
-					.OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }),	// 9-bit input: Operation mode
-
-					// Data inputs: Data Ports
-					.A(aa),						// 34-bit input: A data
-					.B(bb),						// 24-bit input: B data
-					.C('x),						// 58-bit input: C data
-					.CARRYIN('0),				// 1-bit input: Carry-in
-					.D(dd),						// 27-bit input: D data
-
-					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
-					.CEA1('0),			// 1-bit input: Clock enable for 1st stage AREG
-					.CEA2('0),			// 1-bit input: Clock enable for 2nd stage AREG
-					.CEAD(en),			// 1-bit input: Clock enable for ADREG
-					.CEALUMODE('0),		// 1-bit input: Clock enable for ALUMODE
-					.CEB1('0),			// 1-bit input: Clock enable for 1st stage BREG
-					.CEB2(en),			// 1-bit input: Clock enable for 2nd stage BREG
-					.CEC('0),			// 1-bit input: Clock enable for CREG
-					.CECARRYIN('0),		// 1-bit input: Clock enable for CARRYINREG
-					.CECTRL(en),		// 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
-					.CED('0),			// 1-bit input: Clock enable for DREG
-					.CEINMODE('0),		// 1-bit input: Clock enable for INMODEREG
-					.CEM(en),			// 1-bit input: Clock enable for MREG
-					.CEP(en),			// 1-bit input: Clock enable for PREG
-					.RSTA('0),			// 1-bit input: Reset for AREG
-					.RSTB(				// 1-bit input: Reset for BREG
-// synthesis translate_off
-						rst ||
-// synthesis translate_on
-						zero
-					),
-					.RSTC('0),			// 1-bit input: Reset for CREG
-					.RSTD(				// 1-bit input: Reset for DREG and ADREG
-// synthesis translate_off
-						zero ||
-// synthesis translate_on
-						rst
-					),
-					.RSTALLCARRYIN('0),	// 1-bit input: Reset for CARRYINREG
-					.RSTALUMODE('0),	// 1-bit input: Reset for ALUMODEREG
-					.RSTCTRL('0),		// 1-bit input: Reset for OPMODEREG and CARRYINSELREG
-					.RSTINMODE('0),		// 1-bit input: Reset for INMODE register
-					.RSTM(rst),			// 1-bit input: Reset for MREG
-					.RSTP(rst)			// 1-bit input: Reset for PREG
-				);
-			end : genDSP
-`endif
-
-			// External Canary Pipeline
-			logic [1:0]  X1 = '{ default: 0 };
-			logic [1:0]  X2 = '{ default: 0 };
-			logic [1:0]  X3 = '{ default: 0 };
-			always_ff @(posedge clk) begin
-				if(rst) begin
-					X1 <= '{ default: 0 };
-					X2 <= '{ default: 0 };
-					X3 <= '{ default: 0 };
-				end
-				else if(en) begin
-					X1 <= xx;
-					X2 <= X1;
-					X3 <= X2 + (L[3]? 2'h0 : pp[D[1]+:2]);
-				end
-			end
-
-			// Derive actual cross-lane overflows
-			assign  h3[s] = pp[D[1]+:2] - X3;
-
-			assign	p3[s] = pp;
-
-		end : genSIMD
-
-		// Stage #4: Cross-SIMD Reduction
-
-		// Count leaves reachable from each node
-		localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop
-
-		uwire signed [ACCU_WIDTH  -1:0]  up4;
-		uwire signed [ACCU_WIDTH  -SINGLE_PROD_WIDTH:0]  hi4;
-		uwire        [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0]  lo4;
-
-		// Conclusive high part accumulation
-		if(PE_REM == 0) begin : genHi
-			localparam int unsigned  HI_WIDTH = ACCU_WIDTH - D[1];
-			// Adder Tree across all SIMD high contributions, each from [-1:1]
-			uwire signed [$clog2(1+SIMD):0]  tree[2*SIMD-1];
-			for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s];
-			for(genvar  n = 0; n < SIMD-1; n++) begin
-				// Sum truncated to actual maximum bit width at this node
-				uwire signed [$clog2(1+LEAVE_LOAD[n]):0]  s = tree[2*n+1] + tree[2*n+2];
-				assign  tree[n] = s;
-			end
-
-			// High Sideband Accumulation
-			logic signed [HI_WIDTH-1:0]  Hi4 = 0;
-			always_ff @(posedge clk) begin
-				if(rst)      Hi4 <= 0;
-				else if(en)  Hi4 <= (L[4]? 0 : Hi4) + tree[0];
-			end
-			assign	hi4 = Hi4;
-		end : genHi
-		else begin : genHiZero
-			assign hi4 = '0;
-		end : genHiZero
-
-		for(genvar  i = 0; i < 2; i++) begin
-			localparam int unsigned  LO_WIDTH = D[i+1] - D[i];
-			// Conclusive low part accumulation
-			if(i >= PE_REM) begin : blkLo
-				// Adder Tree across all SIMD low contributions
-				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
-				uwire [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
-				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
-				for(genvar  n = 0; n < SIMD-1; n++) begin
-					// Sum truncated to actual maximum bit width at this node
-					localparam int unsigned  NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1));
-					uwire [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
-					assign  tree[n] = s;
-				end
-
-				logic [ROOT_WIDTH-1:0]  Lo4 = 0;
-				always_ff @(posedge clk) begin
-					if(rst)      Lo4 <= 0;
-					else if(en)  Lo4 <= tree[0];
-				end
-
-				if(i == 1)  assign  up4 = Lo4;
-				else  assign  lo4 = Lo4;
-			end : blkLo
-			else begin : blkLoZero
-				assign lo4 = '0;
-			end : blkLoZero
-
-		end
-
-		// Stage #5: Resolve lane totals
-		logic signed [1:0][ACCU_WIDTH-1:0]  Res5 = '{ default: 0 };
-		always_ff @(posedge clk) begin
-			if(rst)  Res5 <= '{ default: 0 };
-			else if(en) begin
-				Res5[1] <= up4 - hi4;
-				Res5[0] <= $signed({ hi4, {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4 });
-			end
-		end
-
-		// Output
-		for(genvar  pe = PE_BEG; pe < PE_END; pe++) begin
-			assign	p[pe] = Res5[pe - PE_BEG + PE_REM];
-		end
-
-	end : genPipes
-
-endmodule : mvu_8sx8u_dsp48
diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
deleted file mode 100644
index 53cf71fd5f..0000000000
--- a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
+++ /dev/null
@@ -1,430 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Matrix Vector Unit (MVU) core compute kernel utilizing DSP58.
- *****************************************************************************/
-
-module mvu_vvu_8sx9_dsp58 #(	// PE chains of CHAINLEN DSPs; each DSP multiplies up to 3 activation/weight lane pairs
-	bit IS_MVU,	// 1: all PEs share one SIMD-wide activation vector; 0: each PE gets its own (see ACTIVATION_ELEMENTS)
-    int unsigned PE,
-    int unsigned SIMD,
-    int unsigned ACTIVATION_WIDTH,
-    int unsigned WEIGHT_WIDTH,
-	int unsigned ACCU_WIDTH,	// number of low bits of the last DSP's 58-bit P output exposed per PE
-    bit SIGNED_ACTIVATIONS = 0,
-    int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
-	bit FORCE_BEHAVIORAL = 0,	// 1: use the behavioral model below instead of the DSP58 primitive
-
-	localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD,
-	localparam int unsigned WEIGHT_ELEMENTS = PE*SIMD
-  )
-  (
-    // Global Control
-	input   logic clk,
-    input   logic rst,
-    input   logic en,
-
-	// Input
-    input   logic last,
-    input   logic zero, // ignore current inputs and force this partial product to zero
-    input   logic [WEIGHT_ELEMENTS-1:0][WEIGHT_WIDTH-1:0] w, // weights
-	input   logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations
-
-	// Output
-	output  logic vld,	// 'last' after travelling through the L shift register below
-    output  logic [PE-1:0][ACCU_WIDTH-1:0] p
-  );
-	// for verilator always use behavioral code
-	localparam bit  BEHAVIORAL =
-`ifdef VERILATOR
-		1 ||
-`endif
-		FORCE_BEHAVIORAL;
-
-//-------------------- Declare global signals --------------------\\
-	localparam int unsigned CHAINLEN = (SIMD+2)/3;	// ceil(SIMD/3): three lanes per DSP
-	localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
-	localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE;
-	uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN];	// per DSP: 3 activation lanes of 9 bits each
-	uwire [23:0] b_in_i [PE][CHAINLEN];	// per DSP: 3 weight lanes of 8 bits each
-	uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator
-
-//-------------------- Shift register for opmode select signal --------------------\\
-	localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
-	logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric)
-
-	always_ff @(posedge clk) begin
-		if(rst)     L <= '{default: 0};
-		else if(en) begin
-			L[1+MAX_PIPELINE_STAGES] <= last;	// 'last' enters at the highest index ...
-			L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES];	// ... and shifts towards index 0
-		end
-	end
-	assign vld = L[0];
-
-//-------------------- Shift register for ZERO flag --------------------\\
-	logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric). NOTE(review): for MAX_PIPELINE_STAGES == 1 this range is [0:-1]; Z is then neither written nor read
-
-	if (MAX_PIPELINE_STAGES > 1) begin : genZreg
-		always_ff @(posedge clk) begin
-			if (rst)      Z <= '{default: 0};
-			else if(en) begin
-				Z[0] <= zero;
-				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3];
-			end
-		end
-	end;
-
-//-------------------- Buffer for input activations --------------------\\
-	localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;	// extension bits needed to fill a 9-bit lane
-	for (genvar k=0; k<PE_ACTIVATION; k++) begin : genActPE
-		for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
-			localparam int TOTAL_PREGS = i/SEGLEN;	// number of complete segments preceding DSP i in the chain
-			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;	// depth of the register chain A built in this module
-			localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3;	// the last DSP of the chain takes the remaining lanes
-
-			if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
-				logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0};
-				always_ff @(posedge clk) begin
-					if (rst)     A <= '{default: 0};
-					else if(en) begin
-						A[EXTERNAL_PREGS-1] <= 
-	// synthesis translate_off
-							zero ? '1 : 
-	// synthesis translate_on						
-							a[SIMD*k + 3*i +: LANES_OCCUPIED];
-						if (EXTERNAL_PREGS > 1)   A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
-					end
-				end
-				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-				assign a_in_i[CHAINLEN*k+i][9*j +: 9] = SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{A[0][j][ACTIVATION_WIDTH-1]}}, A[0][j] } 
-													  : PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{1'b0}}, A[0][j] } ; // sign- or zero-extend the oldest buffered value to 9 bits
-				end : genAin
-				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
-					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0;	// unused lanes are tied to zero
-				end : genAinZero
-			end : genExternalPregAct
-			else begin : genInpDSPAct
-				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
-					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 
-	// synthesis translate_off
-						zero ? '1 : 				
-	// synthesis translate_on
-						SIGNED_ACTIVATIONS ? PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{a[SIMD*k+3*i+j][ACTIVATION_WIDTH-1]}}, a[SIMD*k+3*i+j] }
-													: PAD_BITS_ACT == 0 ? a[SIMD*k+3*i+j] : { {PAD_BITS_ACT{1'b0}}, a[SIMD*k+3*i+j] } ;
-				end : genAin
-				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
-					assign a_in_i[CHAINLEN*k+i][9*j +: 9] = 9'b0;
-				end : genAinZero
-			end : genInpDSPAct
-		end : genActSIMD
-	end : genActPE
-
-//-------------------- Buffer for weights --------------------\\
-	localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;	// sign-extension bits needed to fill an 8-bit lane
-
-	for (genvar i=0; i<PE; i++) begin : genWeightPE
-		for (genvar j=0; j<CHAINLEN; j++) begin : genWeightSIMD
-			localparam int TOTAL_PREGS = j/SEGLEN;	// number of complete segments preceding DSP j in the chain
-			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;	// depth of the register chain B built in this module
-			localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3;	// the last DSP of the chain takes the remaining lanes
-
-			if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
-				logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0};	// NOTE(review): declared per generate iteration with a PE dimension, yet only B[i] is accessed here
-				always_ff @(posedge clk) begin
-					if (rst)    B <= '{default: 0};
-					else if (en) begin
-						B[i][EXTERNAL_PREGS-1] <= 
-// synthesis translate_off
-							zero ? '1 : 						
-// synthesis translate_on							
-							//w[i][3*j +: LANES_OCCUPIED];
-							w[SIMD*i+3*j +: LANES_OCCUPIED];
-						if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1];
-					end
-				end
-				for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin
-					assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] };	// weights are always sign-extended
-				end : genBin
-				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
-					assign b_in_i[i][j][8*k +: 8] = 8'b0;	// unused lanes are tied to zero
-				end : genBinZero
-			end : genExternalPregWeight
-			else begin : genInpDSPWeight
-				for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin
-					assign b_in_i[i][j][8*k +: 8] = 
-// synthesis translate_off					
-						zero ? '1 : 
-// synthesis translate_on					
-						//PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] };
-						PAD_BITS_WEIGHT == 0 ? w[SIMD*i+3*j+k] : { {PAD_BITS_WEIGHT{w[SIMD*i+3*j+k][WEIGHT_WIDTH-1]}}, w[SIMD*i+3*j+k] };
-				end : genBin
-				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
-					assign b_in_i[i][j][8*k +: 8] = 8'b0;
-				end : genBinZero
-			end : genInpDSPWeight
-		end : genWeightSIMD
-	end : genWeightPE
-
-//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\
-	for (genvar i=0; i<PE; i++) begin : genDSPPE
-		for (genvar j=0; j<CHAINLEN; j++) begin : genDSPChain
-			localparam int TOTAL_PREGS = j/SEGLEN;
-			localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1; // A/B input register depth (AREG/BREG on the primitive, Areg/Breg in the behavioral model); original note: "1 : 0"
-			localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1;	// P is registered at the end of every segment and at the end of the chain
-			localparam bit FIRST = j == 0;
-			localparam bit LAST = j == CHAINLEN-1;
-			uwire [57:0] pp;
-
-			if (LAST) begin : genPOUT
-				assign p[i] = pp[ACCU_WIDTH-1:0];	// only the last DSP of each chain drives the PE output
-			end
-
-			// Note: Since the product B * AD is computed,
-			//       rst can be only applied to AD and zero only to B
-			//       with the same effect as zeroing both.
-			if(BEHAVIORAL) begin : genBehav
-				// Stage #1: Input A/B
-				logic signed [33:0] Areg [INTERNAL_PREGS];
-				always_ff @(posedge clk) begin
-					if (rst)	Areg <= '{ default : 0};
-					else if (en) begin
-						Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] };	// 27 lane bits padded to the 34-bit A operand; the upper 7 bits are left as x
-						if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0];
-					end
-				end
-				logic signed [23:0] Breg [INTERNAL_PREGS];
-				always_ff @(posedge clk) begin
-					if (rst)	Breg <= '{ default : 0};
-					else if (en) begin
-						Breg[0] <= b_in_i[i][j];
-						if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0];
-					end
-				end
-
-				// Stage #2: Multiply-Accumulate
-				logic signed [57:0] Mreg;
-				logic InmodeZero = 0;
-				always_ff @(posedge clk) begin
-					if (rst)		InmodeZero <= 0;
-					else if (en)	InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero );	// same selection as the INMODE zero bit of the primitive below
-				end
-				always_ff @(posedge clk) begin
-					if (rst)	Mreg <= 0;
-					else if (en) begin
-						automatic logic signed [57:0] m = 0;
-						for (int k = 0; k < 3; k++) begin	// sum of the three signed 9-bit x 8-bit lane products
-							m = m + (InmodeZero ? 0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8]));
-						end
-						Mreg <= m;
-					end
-				end
-
-				// Stage #3: Accumulate
-				logic signed [57:0] Preg;
-				logic Opmode = 0;
-				if (FIRST && !LAST) begin : genFirst
-					if (PREG) begin : genPregBehav
-						always_ff @(posedge clk) begin
-							if (rst)		Preg <= 0;
-							else if (en)	Preg <= Mreg;
-						end
-					end
-					else	assign Preg = Mreg;
-				end
-				else if (FIRST && LAST) begin : genSingle
-					always_ff @(posedge clk) begin
-						if (rst)		Opmode <= 0;
-						else if (en)	Opmode <= L[1];
-					end
-					always_ff @(posedge clk) begin
-						if (rst) 		Preg <= 0;
-						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg;	// Opmode set: restart the accumulation instead of adding to Preg
-					end
-				end
-				else if (!FIRST && LAST) begin : genLast
-					always_ff @(posedge clk) begin
-						if (rst)		Opmode <= 0;
-						else if (en)	Opmode <= L[1];
-					end
-					always_ff @(posedge clk) begin
-						if (rst) 		Preg <= 0;
-						else if (en)	Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1];	// accumulate own product plus the cascade from the previous DSP
-					end
-				end
-				else begin : genMid
-					if (PREG) begin : genPregBehav
-						always_ff @(posedge clk) begin
-							if (rst)		Preg <= 0;
-							else if (en)	Preg <= Mreg + pcout[i][j-1];
-						end
-					end
-					else	assign Preg = Mreg + pcout[i][j-1];
-				end
-				assign pp = Preg;
-				assign pcout[i][j] = Preg;
-			end : genBehav
-`ifndef VERILATOR
-			else begin: genDSP
-				DSP58 #(
-					// Feature Control Attributes: Data Path Selection
-					.AMULTSEL("A"),                     // Selects A input to multiplier (A, AD)
-					.A_INPUT("DIRECT"),                 // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
-					.BMULTSEL("B"),                     // Selects B input to multiplier (AD, B)
-					.B_INPUT("DIRECT"),                 // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
-					.DSP_MODE("INT8"),                  // Configures DSP to a particular mode of operation. Set to INT24 for
-														// legacy mode.
-					.PREADDINSEL("A"),                  // Selects input to pre-adder (A, B)
-					.RND(58'h000000000000000),          // Rounding Constant
-					.USE_MULT("MULTIPLY"),              // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
-					.USE_SIMD("ONE58"),                 // SIMD selection (FOUR12, ONE58, TWO24)
-					.USE_WIDEXOR("FALSE"),              // Use the Wide XOR function (FALSE, TRUE)
-					.XORSIMD("XOR24_34_58_116"),        // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
-					// Pattern Detector Attributes: Pattern Detection Configuration
-					.AUTORESET_PATDET("NO_RESET"),      // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
-					.AUTORESET_PRIORITY("RESET"),       // Priority of AUTORESET vs. CEP (CEP, RESET).
-					.MASK(58'h0ffffffffffffff),         // 58-bit mask value for pattern detect (1=ignore)
-					.PATTERN(58'h000000000000000),      // 58-bit pattern match for pattern detect
-					.SEL_MASK("MASK"),                  // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
-					.SEL_PATTERN("PATTERN"),            // Select pattern value (C, PATTERN)
-					.USE_PATTERN_DETECT("NO_PATDET"),   // Enable pattern detect (NO_PATDET, PATDET)
-					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
-					.IS_ALUMODE_INVERTED(4'b0000),      // Optional inversion for ALUMODE
-					.IS_CARRYIN_INVERTED(1'b0),         // Optional inversion for CARRYIN
-					.IS_CLK_INVERTED(1'b0),             // Optional inversion for CLK
-					.IS_INMODE_INVERTED(5'b00000),      // Optional inversion for INMODE
-					.IS_NEGATE_INVERTED(3'b000),        // Optional inversion for NEGATE
-					.IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0
-										FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN
-										2'b01, // Y : M
-										2'b01  // X: M
-					}), // Optional inversion for OPMODE
-					.IS_RSTALLCARRYIN_INVERTED(1'b0),   // Optional inversion for RSTALLCARRYIN
-					.IS_RSTALUMODE_INVERTED(1'b0),      // Optional inversion for RSTALUMODE
-					.IS_RSTA_INVERTED(1'b0),            // Optional inversion for RSTA
-					.IS_RSTB_INVERTED(1'b0),            // Optional inversion for RSTB
-					.IS_RSTCTRL_INVERTED(1'b0),         // Optional inversion for RSTCTRL
-					.IS_RSTC_INVERTED(1'b0),            // Optional inversion for RSTC
-					.IS_RSTD_INVERTED(1'b0),            // Optional inversion for RSTD
-					.IS_RSTINMODE_INVERTED(1'b0),       // Optional inversion for RSTINMODE
-					.IS_RSTM_INVERTED(1'b0),            // Optional inversion for RSTM
-					.IS_RSTP_INVERTED(1'b0),            // Optional inversion for RSTP
-					// Register Control Attributes: Pipeline Register Configuration
-					.ACASCREG(INTERNAL_PREGS),          // Number of pipeline stages between A/ACIN and ACOUT (0-2)
-					.ADREG(0),                          // Pipeline stages for pre-adder (0-1)
-					.ALUMODEREG(0),                     // Pipeline stages for ALUMODE (0-1)
-					.AREG(INTERNAL_PREGS),              // Pipeline stages for A (0-2)
-					.BCASCREG(INTERNAL_PREGS),          // Number of pipeline stages between B/BCIN and BCOUT (0-2)
-					.BREG(INTERNAL_PREGS),              // Pipeline stages for B (0-2)
-					.CARRYINREG(0),                     // Pipeline stages for CARRYIN (0-1)
-					.CARRYINSELREG(0),                  // Pipeline stages for CARRYINSEL (0-1)
-					.CREG(0),                           // Pipeline stages for C (0-1)
-					.DREG(0),                           // Pipeline stages for D (0-1)
-					.INMODEREG(1),                      // Pipeline stages for INMODE (0-1)
-					.MREG(1),                           // Multiplier pipeline stages (0-1)
-					.OPMODEREG(1),                      // Pipeline stages for OPMODE (0-1)
-					.PREG(PREG),                        // Number of pipeline stages for P (0-1)
-					.RESET_MODE("SYNC")                 // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
-				)
-				DSP58_inst (
-					// Cascade outputs: Cascade Ports
-					.ACOUT(),                           // 34-bit output: A port cascade
-					.BCOUT(),                           // 24-bit output: B cascade
-					.CARRYCASCOUT(),                    // 1-bit output: Cascade carry
-					.MULTSIGNOUT(),                     // 1-bit output: Multiplier sign cascade
-					.PCOUT(pcout[i][j]),                // 58-bit output: Cascade output
-					// Control outputs: Control Inputs/Status Bits
-					.OVERFLOW(),                        // 1-bit output: Overflow in add/acc
-					.PATTERNBDETECT(),                  // 1-bit output: Pattern bar detect
-					.PATTERNDETECT(),                   // 1-bit output: Pattern detect
-					.UNDERFLOW(),                       // 1-bit output: Underflow in add/acc
-					// Data outputs: Data Ports
-					.CARRYOUT(),                        // 4-bit output: Carry
-					.P(pp),                             // 58-bit output: Primary data
-					.XOROUT(),                          // 8-bit output: XOR data
-					// Cascade inputs: Cascade Ports
-					.ACIN('x),                          // 34-bit input: A cascade data
-					.BCIN('x),                          // 24-bit input: B cascade
-					.CARRYCASCIN('x),                   // 1-bit input: Cascade carry
-					.MULTSIGNIN('x),                    // 1-bit input: Multiplier sign cascade
-					.PCIN(FIRST ? 'x : pcout[i][j-1]),  // 58-bit input: P cascade
-					// Control inputs: Control Inputs/Status Bits
-					.ALUMODE(4'h0),                     // 4-bit input: ALU control
-					.CARRYINSEL('0),                    // 3-bit input: Carry select
-					.CLK(clk),                          // 1-bit input: Clock
-					.INMODE({
-							INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
-							2'b00,
-							TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
-							INTERNAL_PREGS==2 ? 1'b0 : 1'b1
-					}),                                 // 5-bit input: INMODE control
-					.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
-					.OPMODE({
-							LAST ? {1'b0, L[1]} : 2'b00,
-							7'b000_0000
-					}), // 9-bit input: Operation mode
-					// Data inputs: Data Ports
-					.A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }),            // 34-bit input: A data
-					.B(b_in_i[i][j]),                   // 24-bit input: B data
-					.C('x),                             // 58-bit input: C data
-					.CARRYIN('0),                       // 1-bit input: Carry-in
-					.D('x),                             // 27-bit input: D data
-					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
-					.ASYNC_RST('0),                     // 1-bit input: Asynchronous reset for all registers.
-					.CEA1(en),                          // 1-bit input: Clock enable for 1st stage AREG
-					.CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
-					.CEAD('0),                          // 1-bit input: Clock enable for ADREG
-					.CEALUMODE('0),                     // 1-bit input: Clock enable for ALUMODE
-					.CEB1(en),                          // 1-bit input: Clock enable for 1st stage BREG
-					.CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
-					.CEC('0),                           // 1-bit input: Clock enable for CREG
-					.CECARRYIN('0),                     // 1-bit input: Clock enable for CARRYINREG
-					.CECTRL(en),                        // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
-					.CED('0),                           // 1-bit input: Clock enable for DREG
-					.CEINMODE(en),                      // 1-bit input: Clock enable for INMODEREG
-					.CEM(en),                           // 1-bit input: Clock enable for MREG
-					.CEP(PREG && en),                   // 1-bit input: Clock enable for PREG
-					.RSTA(rst),                         // 1-bit input: Reset for AREG
-					.RSTALLCARRYIN('0),                 // 1-bit input: Reset for CARRYINREG
-					.RSTALUMODE('0),                    // 1-bit input: Reset for ALUMODEREG
-					.RSTB(rst),                         // 1-bit input: Reset for BREG
-					.RSTC('0),                          // 1-bit input: Reset for CREG
-					.RSTCTRL(rst),                      // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
-					.RSTD('0),                          // 1-bit input: Reset for DREG and ADREG
-					.RSTINMODE(rst),                    // 1-bit input: Reset for INMODE register
-					.RSTM(rst),                         // 1-bit input: Reset for MREG
-					.RSTP(PREG && rst)                  // 1-bit input: Reset for PREG
-				);
-			end : genDSP
-`endif
-		end : genDSPChain
-	end : genDSPPE
-
-endmodule : mvu_vvu_8sx9_dsp58
diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
deleted file mode 100644
index 0168f20563..0000000000
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ /dev/null
@@ -1,234 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-Stream interface wrapper.
- * @details
- *	 The following compute cores are supported:
- *   - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, 
- *     (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP,
- *     [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP,
- *     'unconstrained' LUT-based MVU and VVU.
- *  Folding hints:
- *	 - PE scaling should divide MH.
- *   - SIMD scaling should divide MW.
- *	 - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to
- *	   impact critical paths more than PE scaling. PE scaling implies a
- *	   bigger fanout on the input activations.
- *	 - Full unfolding along MH (PE=MH) results in no replay buffer instantiated
- *****************************************************************************/
-
-module mvu_vvu_axi #(	// Stream wrapper: replays activations, feeds the selected compute core and registers its output
-	bit IS_MVU,
-	parameter COMPUTE_CORE,	// string selecting the core: "mvu_vvu_8sx9_dsp58", "mvu_4sx4u" or "mvu_8sx8u_dsp48"
-	int unsigned MW,
-	int unsigned MH,
-	int unsigned PE,
-	int unsigned SIMD,
-	int unsigned ACTIVATION_WIDTH,
-	int unsigned WEIGHT_WIDTH,
-	int unsigned ACCU_WIDTH,
-	bit SIGNED_ACTIVATIONS = 0,
-	int unsigned SEGMENTLEN = 0,
-	bit FORCE_BEHAVIORAL = 0,
-	bit M_REG_LUT = 1,	// NOTE(review): not referenced anywhere inside this module
-
-	// Safely deducible parameters
-	localparam int unsigned  WEIGHT_STREAM_WIDTH	= PE * SIMD * WEIGHT_WIDTH,
-	localparam int unsigned  WEIGHT_STREAM_WIDTH_BA	= (WEIGHT_STREAM_WIDTH + 7) / 8 * 8,	// rounded up to a multiple of 8 bits
-	localparam int unsigned  INPUT_STREAM_WIDTH	= SIMD * ACTIVATION_WIDTH,
-	localparam int unsigned  INPUT_STREAM_WIDTH_BA	= (INPUT_STREAM_WIDTH + 7) / 8 * 8,	// rounded up to a multiple of 8 bits
-	localparam int unsigned  OUTPUT_STREAM_WIDTH	= PE * ACCU_WIDTH,
-	localparam int unsigned  OUTPUT_STREAM_WIDTH_BA	= (OUTPUT_STREAM_WIDTH + 7) / 8 * 8,	// rounded up to a multiple of 8 bits
-	localparam int unsigned  SF = MW / SIMD,	// passed to the replay buffer as LEN
-	localparam int unsigned  NF = MH / PE	// passed to the replay buffer as REP
-)
-(
-	// Global Control
-	input	logic  ap_clk,
-	input	logic  ap_rst_n,
-
-	// Weight Stream
-	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
-	input	logic  s_axis_weights_tvalid,
-	output	logic  s_axis_weights_tready,
-
-	// Input Stream
-	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
-	input	logic  s_axis_input_tvalid,
-	output	logic  s_axis_input_tready,
-
-	// Output Stream
-	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
-	output	logic  m_axis_output_tvalid,
-	input	logic  m_axis_output_tready
-);
-
-//-------------------- Parameter sanity checks --------------------\\
-	initial begin
-		if (MW % SIMD != 0) begin
-			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
-			$finish;
-		end
-		if (MH % PE != 0) begin
-			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
-			$finish;
-		end
-		if (WEIGHT_WIDTH > 8) begin
-			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
-			$finish;
-		end
-		if (ACTIVATION_WIDTH > 8) begin	// wider than 8 bits is accepted only for signed 9-bit activations on mvu_vvu_8sx9_dsp58
-			if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin
-				$error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH);	// NOTE(review): wording does not match the condition (any width > 8 outside the DSP58 exception fails)
-				$finish;
-			end
-		end
-		if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin
-			if (SEGMENTLEN == 0) begin
-				$warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
-			end
-			if (SEGMENTLEN > (SIMD+2)/3) begin
-				$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
-				$finish;
-			end
-		end
-	end
-
-	uwire clk = ap_clk;
-	uwire rst = !ap_rst_n;	// internal reset is active-high
-
-	//- Replay to Accommodate Neuron Fold -----------------------------------
-	typedef logic [PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t;	// NOTE(review): PE*SIMD elements, while the input stream and mvu_a_t below hold only SIMD elements
-	uwire mvu_flatin_t amvau;
-	uwire alast;
-	uwire afin;	// NOTE(review): driven by the replay buffer but not used further in this module
-	uwire avld;
-	uwire ardy;
-
-	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvu_flatin_t))) activation_replay (
-	.clk, .rst,
-	.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)),
-	.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
-	);
-
-	//- Unflatten inputs into structured matrices ---------------------------
-	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH    -1:0]  mvu_w_t;
-	typedef logic         [SIMD-1:0][ACTIVATION_WIDTH-1:0]  mvu_a_t;
-
-	uwire  mvu_w_t  mvu_w = s_axis_weights_tdata;
-	uwire  mvu_a_t  mvu_a = amvau;
-
-	//- Flow Control Bracket around Compute Core ----------------------------
-	uwire en;
-	uwire istb = avld && s_axis_weights_tvalid;	// input strobe: activation and weight are both valid
-	assign ardy = en && s_axis_weights_tvalid;	// an activation is consumed only together with a valid weight
-	assign s_axis_weights_tready = en && avld;	// a weight is consumed only together with a valid activation
-
-	//- Instantiate compute core ----------------------------
-	typedef logic [PE-1:0][ACCU_WIDTH-1:0]  dsp_p_t;
-	uwire dsp_vld;
-	uwire dsp_p_t  dsp_p;
-
-	uwire dsp_clk = ap_clk;
-	uwire dsp_en = en;
-	uwire dsp_last = alast && avld;
-	uwire dsp_zero = !istb;	// without a complete input pair the core is told to zero this partial product
-	uwire mvu_w_t dsp_w = mvu_w;
-	uwire mvu_a_t dsp_a = mvu_a;
-	uwire ovld = dsp_vld;
-	uwire dsp_p_t  odat = dsp_p;
-
-	case(COMPUTE_CORE)
-	"mvu_vvu_8sx9_dsp58":
-		mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
-		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk(dsp_clk), .rst, .en(dsp_en),
-			.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
-			.vld(dsp_vld), .p(dsp_p)
-		);
-	"mvu_4sx4u":
-		mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk(dsp_clk), .rst, .en(dsp_en),
-			.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
-			.vld(dsp_vld), .p(dsp_p)
-		);
-	"mvu_8sx8u_dsp48":
-		mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
-			.clk(dsp_clk), .rst, .en(dsp_en),
-			.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
-			.vld(dsp_vld), .p(dsp_p)
-		);
-	default: initial begin
-		$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
-		$finish;
-	end
-	endcase
-
-//-------------------- Output register slice --------------------\\
-	// Make `en` computation independent from external inputs.
-	// Drive all outputs from registers.
-	struct packed {
-		logic rdy;
-		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	}  A = '{ rdy: 1, default: 'x };	// side-step register used when encountering backpressure
-	struct packed {
-		logic vld;
-		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	}  B = '{ vld: 0, default: 'x };	// ultimate output register
-
-	assign	en = A.rdy;	// the compute core is enabled exactly while A.rdy is set
-	uwire  b_load = !B.vld || m_axis_output_tready;	// B may take new data when it is empty or being read
-
-	always_ff @(posedge clk) begin
-		if(rst) begin
-			A <= '{ rdy: 1, default: 'x };
-			B <= '{ vld: 0, default: 'x };
-		end
-		else begin
-			if(A.rdy)  A.dat <= odat;	// keep capturing the core output while A is not holding parked data
-			A.rdy <= (A.rdy && !ovld) || b_load;	// cleared when a valid result arrives while B cannot load; set again once B loads
-
-			if(b_load) begin
-				B <= '{
-					vld: ovld || !A.rdy,
-					dat: A.rdy? odat : A.dat	// data parked in A goes out before the current core output
-				};
-			end
-		end
-	end
-	assign	m_axis_output_tvalid = B.vld;
-	// Why would we need a sign extension here potentially creating a higher signal load into the next FIFO?
-	// These extra bits should never be used. Why not 'x them out?
-	assign	m_axis_output_tdata  = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat};
-
-
-endmodule : mvu_vvu_axi
diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
deleted file mode 100644
index 01deb23840..0000000000
--- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
+++ /dev/null
@@ -1,92 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Verilog AXI-lite wrapper for MVU & VVU.
- *****************************************************************************/
-
-module $MODULE_NAME_AXI_WRAPPER$ #(
-	parameter	IS_MVU = $IS_MVU$,
-	parameter	COMPUTE_CORE = "$COMPUTE_CORE$",
-	parameter	MW = $MW$,
-	parameter	MH = $MH$,
-	parameter	PE = $PE$,
-	parameter	SIMD = $SIMD$,
-	parameter	ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
-	parameter	WEIGHT_WIDTH = $WEIGHT_WIDTH$,
-	parameter	ACCU_WIDTH = $ACCU_WIDTH$,
-	parameter	SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
-	parameter	SEGMENTLEN = $SEGMENTLEN$,
-	parameter	FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$,
-
-	// Safely deducible parameters
-	parameter	WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
-	parameter 	INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8,
-	parameter 	OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8
-)(
-	// Global Control
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
-	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
-	input	ap_clk,
-	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
-	input	ap_rst_n,
-
-	// Weight Stream
-	input	[WEIGHT_STREAM_WIDTH_BA-1:0]  weights_V_TDATA,
-	input   weights_V_TVALID,
-	output  weights_V_TREADY,
-	// Input Stream
-	input	[INPUT_STREAM_WIDTH_BA-1:0]  in0_V_TDATA,
-	input	in0_V_TVALID,
-	output	in0_V_TREADY,
-	// Output Stream
-	output	[OUTPUT_STREAM_WIDTH_BA-1:0]  out_V_TDATA,
-	output	out_V_TVALID,
-	input	out_V_TREADY
-);
-
-mvu_vvu_axi #(
-	.IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD),
-	.ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
-	.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
-	) inst (
-	.ap_clk(ap_clk),
-	.ap_rst_n(ap_rst_n),
-	.s_axis_weights_tdata(weights_V_TDATA),
-	.s_axis_weights_tvalid(weights_V_TVALID),
-	.s_axis_weights_tready(weights_V_TREADY),
-	.s_axis_input_tdata(in0_V_TDATA),
-	.s_axis_input_tvalid(in0_V_TVALID),
-	.s_axis_input_tready(in0_V_TREADY),
-	.m_axis_output_tdata(out_V_TDATA),
-	.m_axis_output_tvalid(out_V_TVALID),
-	.m_axis_output_tready(out_V_TREADY)
-);
-
-endmodule // $MODULE_NAME_AXI_WRAPPER$
diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
deleted file mode 100644
index 3e2766f63d..0000000000
--- a/finn-rtllib/mvu/replay_buffer.sv
+++ /dev/null
@@ -1,181 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Replay buffer for counted sequences on an AXI-lite stream.
- * @author	Thomas B. Preußer <thomas.preusser@amd.com>
- *****************************************************************************/
-
-module replay_buffer #(
-	int unsigned  LEN,	// Sequence length
-	int unsigned  REP,	// Sequence replay count
-	int unsigned  W 	// Data width
-)(
-	input	logic  clk,
-	input	logic  rst,
-
-	input	logic [W-1:0]  idat,
-	input	logic  ivld,
-	output	logic  irdy,
-
-	output	logic [W-1:0]  odat,
-	output	logic  olast,
-	output	logic  ofin,
-	output	logic  ovld,
-	input	logic  ordy
-);
-
-	if(LEN == 0)  initial begin
-		$error("%m: Illegal zero sequence LEN.");
-		$finish;
-	end
-	if(REP == 0) initial begin
-		$error("%m: Illegal zero REP count.");
-		$finish;
-	end
-
-	// Track position in Sequence
-	uwire  last_item;
-	uwire  shift;
-	if(LEN == 1)  assign  last_item = 1;
-	else begin
-		typedef logic [$clog2(LEN)-1:0]  count_t;
-		count_t  Count = 0;
-		logic    Last  = 0;
-		always_ff @(posedge clk) begin
-			if(rst) begin
-				Count <= 0;
-				Last  <= 0;
-			end
-			else if(shift) begin
-				Count <= Count + (Last? 2**$clog2(LEN)-LEN+1 : 1);
-				Last  <= (((LEN-2) & ~Count) == 0) && ((LEN&1) || !Last);
-			end
-		end
-		assign	last_item = Last;
-	end
-
-	if(REP == 1) begin
-		assign	shift = ivld && ordy;
-
-		assign	irdy  = ordy;
-		assign	odat  = idat;
-		assign	olast = last_item;
-		assign	ofin  = last_item;
-		assign	ovld  = ivld;
-	end
-	else begin
-
-		// Track Repetitions
-		uwire  last_rep;
-		if(1) begin : blkRep
-			typedef logic [$clog2(REP)-1:0]  rep_t;
-			rep_t  RepCnt = 0;
-			logic  RepLst = 0;
-			always_ff @(posedge clk) begin
-				if(rst) begin
-					RepCnt <= 0;
-					RepLst <= 0;
-				end
-				else if(last_item && shift) begin
-					RepCnt <= RepCnt + (RepLst? 2**$clog2(REP)-REP+1 : 1);
-					RepLst <= (((REP-2) & ~RepCnt) == 0) && ((REP&1) || !RepLst);
-				end
-			end
-			assign	last_rep = RepLst;
-		end : blkRep
-
-		localparam int unsigned  AWIDTH = LEN < 2? 1 : $clog2(LEN);
-		typedef logic [AWIDTH  :0]  ptr_t;	// pointers with additional generational MSB
-		typedef logic [W     -1:0]  data_t;
-
-		// Output Registers
-		data_t  ODat;
-		logic   OVld =  0;
-		logic   OLst = 'x;
-		logic   OFin = 'x;
-		assign	odat  = ODat;
-		assign	olast = OLst;
-		assign	ofin  = OFin;
-		assign	ovld  = OVld;
-
-		// Buffer Memory Management
-		data_t  Mem[2**AWIDTH];
-		ptr_t  WP = 0;	// Write Pointer
-		ptr_t  RP = 0;	// Read Pointer
-		ptr_t  FP = 0;	// Free Pointer
-
-		// Operational Guards
-		//	Occupancy:    WP-FP
-		//	  WP-FP < 2**AWIDTH -> writing allowed
-		//		- increments WP
-		//	Availability: WP-RP
-		//	  WP-RP > 0         -> reading allowed
-		//		- increments RP, last in sequence rewinds to FP for non-final repetition
-		//		- increments FP in last repetition
-		assign	irdy = !((WP-FP) >> AWIDTH);
-
-		uwire  wr = irdy && ivld;
-		uwire  rd = !OVld || ordy;
-		always_ff @(posedge clk) begin
-			if(wr)  Mem[WP[AWIDTH-1:0]] <= idat;
-			if(rd)  ODat <= Mem[RP[AWIDTH-1:0]];
-		end
-
-		uwire  vld = (RP != WP);
-		assign	shift = rd && vld;
-		always_ff @(posedge clk) begin
-			if(rst) begin
-				WP <= 0;
-				RP <= 0;
-				FP <= 0;
-
-				OVld <=  0;
-				OLst <= 'x;
-				OFin <= 'x;
-			end
-			else begin
-				if(wr)  WP <= WP + 1;
-				if(rd) begin
-					if(vld) begin
-						automatic logic  rewind = last_item && !last_rep;
-						RP <= RP + (rewind? 2**(AWIDTH+1)-LEN+1 : 1);
-						FP <= FP + last_rep;
-					end
-
-					OVld <= vld;
-					OLst <= last_item;
-					OFin <= last_rep && last_item;
-				end
-			end
-		end
-
-	end
-
-endmodule : replay_buffer
diff --git a/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv
deleted file mode 100644
index c8bfe5370a..0000000000
--- a/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv
+++ /dev/null
@@ -1,165 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Testbench for MVU core compute kernel.
- *****************************************************************************/
-
-module mvu_8sx9_tb();
-
-//-------------------- Simulation parameters --------------------\\
-	// Matrix & parallelism config
-	localparam int unsigned MH = 256;
-	localparam int unsigned PE = 16;
-	localparam int unsigned MW = 600;
-	localparam int unsigned SIMD = 60;
-	localparam int unsigned SEGMENTLEN = 4;
-	// Bit-width config  
-	localparam int unsigned ACTIVATION_WIDTH = 8;
-	localparam int unsigned WEIGHT_WIDTH = 4;
-	localparam bit SIGNED_ACTIVATIONS = 1;
-	// Simulation constants
-	localparam int unsigned NF = MH/PE;
-	localparam int unsigned SF = MW/SIMD;
-	localparam int unsigned NUM_OF_DSP = SIMD/3;
-
-	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-	typedef activation_t activation_vector_t[SF];
-
-	function activation_vector_t init_ACTIVATIONS;
-		automatic activation_vector_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_ACTIVATIONS
-
-	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-	typedef weight_t weight_matrix_t[NF][SF];
-
-	function weight_matrix_t init_WEIGHTS;
-		automatic weight_matrix_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_WEIGHTS;
-
-	typedef logic signed [PE-1:0][57:0] output_t;
-	typedef output_t output_vector_t [NF];
-
-	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
-		automatic output_vector_t res = '{default: 0};
-		for (int j = 0; j<MH; j++) begin
-			for (int i = 0; i<MW; i++) begin
-				res[j/PE][j%PE] = $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]);
-			end
-		end  
-		return res;
-	endfunction : check_output;
-
-	logic clk = 0;
-	always #5ns clk = !clk;
-
-	logic rst;
-	initial begin
-		rst = 1;
-		repeat(16) @(posedge clk);
-		rst <= 0;
-	end
-
-	logic last;
-	logic zero;
-	logic vld;
-	activation_t a;
-	weight_t w;
-	output_t p;
-	// Reference signals
-	activation_vector_t ACTIVATIONS; //   [SF-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0]
-	weight_matrix_t WEIGHTS; //           [NF-1:0][SF-1:0][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
-	output_vector_t GOLDEN_OUTPUT; //     [NF-1:0][PE-1:0][57:0]
-	// Counter for number of outputs (NF dimension) that are produced
-	int NF_CNT = 0;
-
-	initial begin
-		ACTIVATIONS = init_ACTIVATIONS();
-		WEIGHTS = init_WEIGHTS();
-		GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
-		last = 0;
-		zero = 0;
-		a = 'x;
-		w = 'x;
-
-		@(posedge clk iff !rst);
-
-		for (int j=0; j<NF; j++) begin
-			for (int i=0; i<SF; i++) begin
-				last <= (i==SF-1) ? 1 : 0;
-				a <= ACTIVATIONS[i];
-				w <= WEIGHTS[j][i];
-				@(posedge clk iff en);
-			end
-		end
-
-		last <= 0;
-		zero <= 1;  
-
-		// Continue until all NF outputs are produced & compared
-		@(posedge clk && (NF_CNT==NF));
-
-		$finish;
-	end
-
-	logic en = 0;
-	always_ff @(posedge clk) begin
-		en <= ($urandom()%7 > 1) && !rst;
-	end
-
-	// Compare computed output against golden output when vld flag is raised by DUT
-	always_ff @(posedge clk iff (vld && en)) begin
-		foreach(p[i]) begin
-			assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-			else begin 
-				$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-				$stop;
-			end  
-		end
-		NF_CNT += 1;
-	end
-
-	// Instantiate DUT
-	mvu_8sx9 #(
-		.PE(PE),
-		.SIMD(SIMD),
-		.WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-		.SEGMENTLEN(SEGMENTLEN)
-	)
-	dut (
-		.clk, .rst, .en, .last, .zero, .a, .w, .vld, .p
-	);
-
-endmodule : mvu_8sx9_tb
diff --git a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
deleted file mode 100644
index b46fc588c9..0000000000
--- a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv
+++ /dev/null
@@ -1,241 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2022, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Testbench for MVU AXI-lite interface wrapper.
- *****************************************************************************/
-
-module mvu_vvu_axi_tb();
-
-//-------------------- Simulation parameters --------------------\\
-	// Matrix & parallelism config
-	localparam bit IS_MVU = 0;
-	localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58";
-	localparam int unsigned MW = 36;
-	localparam int unsigned MH = 1;
-	localparam int unsigned SIMD = 3;
-	localparam int unsigned PE = 4;
-	localparam int unsigned SEGMENTLEN = 1.0;
-	localparam bit FORCE_BEHAVIORAL = 1;
-	localparam bit M_REG_LUT = 1;
-	// Bit-width config
-	localparam int unsigned ACTIVATION_WIDTH = 8;
-	localparam int unsigned WEIGHT_WIDTH = 6;
-	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
-	localparam bit SIGNED_ACTIVATIONS = 1;
-	// Simulation constants
-	localparam int unsigned NF = IS_MVU ? MH/PE : 1;
-	localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE);
-	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
-	localparam int unsigned ACTIVATION_WIDTH_BA = ((IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH+7)/8*8;
-	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
-	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - (IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH;
-	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
-
-	// Generate clk and reset signal
-	logic clk = 0;
-	always #5ns clk = !clk;
-
-	logic ap_rst_n = 0;
-	initial begin
-		repeat(16) @(posedge clk);
-		ap_rst_n <= 1;
-	end
-
-	uwire ap_clk = clk;
-
-	// Generate activations
-	typedef logic [(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
-	typedef activation_t activation_vector_t[SF];
-
-	function activation_vector_t init_ACTIVATIONS;
-		automatic activation_vector_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_ACTIVATIONS
-
-	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
-
-	struct {
-		activation_t dat;
-		logic vld;
-		logic rdy;
-	} activations;
-
-	initial begin
-		activations.vld = 0;
-		activations.dat = 'X;
-		@(posedge clk iff ap_rst_n);
-
-		for (int i=0; i<SF; i++) begin
-			activations.dat <= ACTIVATIONS[i];
-			do begin
-				activations.vld <= $urandom()%7 >= 0;
-				@(posedge clk);
-			end while (!(activations.vld === 1 && activations.rdy === 1));
-		end
-
-		activations.vld <= 0;
-		activations.dat <= 'x;
-	end
-
-	// Generate weights
-	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
-	typedef weight_t weight_matrix_t[NF][SF];
-
-	function weight_matrix_t init_WEIGHTS;
-		automatic weight_matrix_t res;
-		std::randomize(res);
-		return res;
-	endfunction : init_WEIGHTS;
-
-	weight_matrix_t WEIGHTS = init_WEIGHTS();
-
-	struct {
-		weight_t dat;
-		logic vld;
-		logic rdy;
-	} weights;
-
-	initial begin
-		weights.vld = 0;
-		weights.dat = 'X;
-		@(posedge clk iff ap_rst_n);
-
-		weights.vld <= 1;
-		for (int i=0; i<NF; i++) begin
-			for (int j=0; j<SF; j++) begin
-				weights.dat <= WEIGHTS[i][j];
-				@(posedge clk iff weights.rdy);
-			end
-		end
-
-		weights.vld <= 0;
-		weights.dat <= 'x;
-	end
-
-	// Function to compute golden output
-	// a: [SF][(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0]
-	// a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0]
-	// a: [SF][PE*SIMD-1:0][ACTIVATION_WIDTH-1:0]
-	// w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0]
-	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
-	typedef output_t output_vector_t [NF];
-
-	struct {
-		output_t dat;
-		logic vld;
-		logic rdy;
-	} outputs;
-
-	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
-		automatic output_vector_t res = '{default: 0};
-		// for (int j = 0; j<MH; j++) begin
-		// 	for (int i = 0; i<MW; i++) begin
-		// 		if (SIGNED_ACTIVATIONS)
-		// 			res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed(a[i/SIMD][i%SIMD]) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
-		// 									   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed(a[i/SIMD/PE][i % (SIMD*PE)]) : $signed(a[i/SIMD/PE][(i)%(SIMD*PE)]) ) * $signed(w[0][i/SIMD/PE][i/PE][i%SIMD]);
-		// 		else
-		// 			res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : 
-		// 									   $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[i/SIMD/PE][i % (SIMD*PE)]}) : $signed({1'b0, a[i/SIMD/PE][i%(SIMD*PE)]}) ) * $signed(w[0][i/SIMD][0][i%SIMD]);
-		// 	end
-		// end
-		// The input stream will have the channels interleaved for VVU when PE>1
-		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
-		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
-		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
-		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i)
-		for (int i = 0; i < NF; i++) begin
-			for (int j = 0; j < SF; j++) begin
-				for (int k = 0; k < PE; k++) begin
-					for (int l = 0; l < SIMD; l++) begin
-						if (SIGNED_ACTIVATIONS)
-							res[i][k] = IS_MVU ? $signed(res[i][k]) + $signed(a[j][l]) * $signed(w[i][j][k][l]) :
-												 $signed(res[i][k]) + $signed(a[j][k + l*PE]) * $signed(w[i][j][k][l]);
-						else
-							res[i][k] = IS_MVU ? $signed(res[i][k]) + $signed({1'b0, a[j][l]}) * $signed(w[i][j][k][l]) :
-												 $signed(res[i][k]) + $signed({1'b0, a[j][k + l*PE]}) * $signed(w[i][j][k][l]);
-					end
-				end
-			end
-		end
-		return res;
-	endfunction : check_output;
-
-	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
-
-	int unsigned NF_CNT = 0;
-	initial begin
-		outputs.rdy = 0;
-		while (NF_CNT < NF) begin
-			// Loop until both rdy & vld are asserted
-			do begin
-				outputs.rdy <= $urandom()%7 >= 0;
-				@(posedge clk iff ap_rst_n);
-			end while (!(outputs.rdy === 1 && outputs.vld === 1));
-
-			// Compare produced outputs against golden outputs
-			foreach(outputs.dat[i]) begin
-				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-				else begin
-					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
-					$stop;
-				end
-			end
-
-			NF_CNT += 1;
-		end
-
-		$finish;
-	end
-
-	// Instantiate DUT
-	mvu_vvu_axi #(
-		.IS_MVU(IS_MVU),
-		.COMPUTE_CORE(COMPUTE_CORE),
-		.MW(MW),
-		.MH(MH),
-		.PE(PE),
-		.SIMD(SIMD),
-		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
-		.WEIGHT_WIDTH(WEIGHT_WIDTH),
-		.ACCU_WIDTH(ACCU_WIDTH),
-		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
-		.SEGMENTLEN(SEGMENTLEN),
-		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL),
-		.M_REG_LUT(M_REG_LUT)
-	)
-	dut (
-		.ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
-		.s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
-		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
-		.m_axis_output_tready(outputs.rdy)
-	);
-
-endmodule : mvu_vvu_axi_tb
diff --git a/finn-rtllib/mvu/tb/replay_buffer_tb.sv b/finn-rtllib/mvu/tb/replay_buffer_tb.sv
deleted file mode 100644
index 5581354e0e..0000000000
--- a/finn-rtllib/mvu/tb/replay_buffer_tb.sv
+++ /dev/null
@@ -1,130 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2023, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
- *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * @brief	Testbench for replay_buffer module.
- * @author	Thomas B. Preußer <thomas.preusser@amd.com>
- *****************************************************************************/
-
-module replay_buffer_tb;
-
-	// Global Control
-	logic  clk = 0;
-	always #5ns clk = !clk;
-	uwire  rst = 0;
-
-	// DUT Geometries
-	localparam int unsigned  DIMS[3] = '{ 7, 8, 10 };
-	localparam int unsigned  W = 8;
-	typedef logic [W-1:0]  data_t;
-
-	bit [2**$size(DIMS)-1:0]  done = 0;
-	always_comb begin
-		if(&done) begin
-			$display("Test completed.");
-			$finish;
-		end
-	end
-
-	// Parallel DUT Instantiations
-	for(genvar  r = 0; r < $size(DIMS); r++) begin
-		for(genvar  l = 0; l < $size(DIMS); l++) begin
-			localparam int unsigned  REP = DIMS[r];
-			localparam int unsigned  LEN = DIMS[l];
-
-			data_t  idat;
-			logic  ivld;
-			uwire  irdy;
-
-			uwire data_t  odat;
-			uwire  olast;
-			uwire  ofin;
-			uwire  ovld;
-			logic  ordy;
-
-			replay_buffer #(.LEN(LEN), .REP(REP), .W(W)) dut (
-				.clk, .rst,
-				.idat, .ivld, .irdy,
-				.odat, .olast, .ofin, .ovld, .ordy
-			);
-
-			// Input Feed: 0, 1, ..., 10*LEN-1
-			initial begin
-				idat = 'x;
-				ivld =  0;
-				@(posedge clk iff !rst);
-
-				for(int unsigned  i = 0; i < 10*LEN; i++) begin
-					idat <= i;
-					ivld <= 1;
-					@(posedge clk iff irdy);
-					idat <= 'x;
-					ivld <=  0;
-					while($urandom()%(REP-1) != 0) @(posedge clk);
-				end
-			end
-
-			// Output Check
-			initial begin
-				automatic int unsigned  base = 0;
-
-				ordy = 0;
-				@(posedge clk iff !rst);
-
-				for(int unsigned  k = 0; k < 10; k++) begin
-					for(int unsigned  j = 0; j < REP; j++) begin
-						for(int unsigned  i = 0; i < LEN; i++) begin
-							ordy <= 1;
-							@(posedge clk iff ovld);
-							assert(odat == base+i) else begin
-								$error("#%0d.%0d: Data mismatch: %0d instead of %0d.", r, l, odat, base+i);
-								$stop;
-							end
-							assert(olast == (i == LEN-1)) else begin
-								$error("#%0d.%0d: Last mismatch.", r, l);
-								$stop;
-							end
-							assert(ofin == ((i == LEN-1) && (j == REP-1))) else begin
-								$error("#%0d.%0d: Fin mismatch.", r, l);
-								$stop;
-							end
-
-							ordy <= 0;
-							while($urandom()%13 == 0) @(posedge clk);
-						end
-					end
-					base += LEN;
-				end
-
-				done[$size(DIMS)*r + l] <= 1;
-			end
-		end
-	end
-
-endmodule : replay_buffer_tb

From 472ce110b262d6ab5a6937a6ca82c9ab1bf69e45 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 26 Jan 2024 12:46:06 +0000
Subject: [PATCH 104/112] [test mvau]: modified to support new custom-ops

---
 tests/fpgadataflow/test_fpgadataflow_mvau.py | 120 +++++++++++++++++--
 1 file changed, 113 insertions(+), 7 deletions(-)

diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py
index b80ef76a19..bd283855e3 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py
@@ -52,6 +52,9 @@
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from qonnx.transformation.general import ApplyConfig, GiveUniqueNodeNames, GiveReadableTensorNames
+from qonnx.transformation.infer_shapes import InferShapes
+from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
 
 
 def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=None):
@@ -135,6 +138,87 @@ def prepare_inputs(input_tensor, idt, wdt):
         return {"inp": input_tensor}
 
 
+# activation: None or DataType
+@pytest.mark.parametrize("act", [None, DataType["BIPOLAR"], DataType["INT4"]])
+# weight datatype
+@pytest.mark.parametrize("wdt", [DataType["BIPOLAR"], DataType["INT4"]])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT4"]])
+# neuron folding, -1 is maximum possible
+@pytest.mark.parametrize("nf", [-1, 2, 1])
+# synapse folding, -1 is maximum possible
+@pytest.mark.parametrize("sf", [-1, 2, 1])
+# HLS matrix width (input features)
+@pytest.mark.parametrize("mw", [16])
+# HLS matrix height (output features)
+@pytest.mark.parametrize("mh", [16])
+@pytest.mark.fpgadataflow
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_fclayer_hwop(idt, wdt, act, nf, sf, mw, mh):
+    """Execute the single-layer model as built and compare with a NumPy reference.
+
+    No transformation is applied here before calling ``oxe.execute_onnx``.
+    """
+    if nf == -1:
+        nf = mh
+    if sf == -1:
+        sf = mw
+    pe = mh // nf
+    simd = mw // sf
+    assert mh % pe == 0
+    assert mw % sf == 0
+    # generate weights
+    W = gen_finn_dt_tensor(wdt, (mw, mh))
+    # generate input data
+    x = gen_finn_dt_tensor(idt, (1, mw))
+    if act is None:
+        # no activation, produce accumulators
+        T = None
+        tdt = None
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            odt = DataType["UINT32"]
+        else:
+            odt = DataType["INT32"]
+    else:
+        odt = act
+        # generate thresholds for activation
+        (acc_min, acc_max) = calculate_signed_dot_prod_range(idt, wdt, mw)
+        n_steps = act.get_num_possible_values() - 1
+        T = np.random.randint(acc_min, acc_max - 1, (mh, n_steps)).astype(np.float32)
+        # provide non-decreasing thresholds
+        T = np.sort(T, axis=1)
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            tdt = DataType["UINT32"]
+            # bias thresholds to be positive
+            T = np.ceil((T + mw) / 2)
+            assert (T >= 0).all()
+        else:
+            tdt = DataType["INT32"]
+    model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
+    # prepare input data
+    input_dict = prepare_inputs(x, idt, wdt)
+    if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+        # convert inputs to binary and use xnorpopcountmatmul
+        y = xp.xnorpopcountmatmul((x + 1) / 2, (W + 1) / 2)
+    else:
+        y = np.matmul(x, W)
+    if T is not None:
+        if act == DataType["BIPOLAR"]:
+            # binary to bipolar: scale 2, bias -1 (i.e. 2 * y - 1)
+            y = multithreshold(y, T, 2, -1)
+        else:
+            # signed offset: bias act.min() (i.e. y += act.min())
+            y = multithreshold(y, T, 1, act.min())
+    oshape = model.get_tensor_shape("outp")
+    y_expected = y.reshape(oshape)
+    # execute model
+    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+    y_produced = y_produced.reshape(y_expected.shape)
+
+    assert (y_produced == y_expected).all(), "cppsim hw-op failed"
+
+
 # mem_mode: const or decoupled
 @pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"])
 # activation: None or DataType
@@ -154,7 +238,9 @@ def prepare_inputs(input_tensor, idt, wdt):
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
+def test_fpgadataflow_fclayer_hlsop_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
+    if idt == DataType["BIPOLAR"] and wdt != DataType["BIPOLAR"] or idt != DataType["BIPOLAR"] and wdt == DataType["BIPOLAR"]:
+        pytest.skip("Bipolar activations/weights only supported in MVU if both operands are bipolar")
     if nf == -1:
         nf = mh
     if sf == -1:
@@ -195,6 +281,8 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
         # lookup op_type in registry of CustomOps
         inst = getCustomOp(node)
         inst.set_nodeattr("mem_mode", mem_mode)
+        inst.set_nodeattr("preferred_impl_style", "hls")
+    model = model.transform(SpecializeLayers())
     model = model.transform(SetExecMode("cppsim"))
     model = model.transform(PrepareCppSim())
     model = model.transform(CompileCppSim())
@@ -220,7 +308,7 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
 
     y_produced = y_produced.reshape(y_expected.shape)
 
-    assert (y_produced == y_expected).all(), "cppsim failed"
+    assert (y_produced == y_expected).all(), "cppsim hls-op failed"
 
 
 # mem_mode: const or decoupled
@@ -239,10 +327,14 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
 @pytest.mark.parametrize("mw", [16])
 # HLS matrix height (output features)
 @pytest.mark.parametrize("mh", [16])
+# Backend
+@pytest.mark.parametrize("backend", ["rtl", "hls"])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
+def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh, backend):
+    if backend == "rtl" and act is not None:
+        pytest.skip("RTL MVU doesn't support embedded thresholding functionality.")
     if nf == -1:
         nf = mh
     if sf == -1:
@@ -283,6 +375,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
         # lookup op_type in registry of CustomOps
         inst = getCustomOp(node)
         inst.set_nodeattr("mem_mode", mem_mode)
+        inst.set_nodeattr("preferred_impl_style", backend)
 
     # prepare input data
     input_dict = prepare_inputs(x, idt, wdt)
@@ -303,6 +396,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
     y_expected = y.reshape(oshape)
     # TODO split up into several dependent tests -- need to check how this
     # works for parametrized tests...
+    model = model.transform(SpecializeLayers())
     model = model.transform(SetExecMode("rtlsim"))
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(PrepareIP("xc7z020clg400-1", 5))
@@ -312,7 +406,10 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
     assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed"
 
     hls_synt_res_est = model.analysis(hls_synth_res_estimation)
-    assert "MatrixVectorActivation_0" in hls_synt_res_est
+    if backend == "hls":
+        assert "MatrixVectorActivation_hls_0" in hls_synt_res_est
+    else:
+        assert "MatrixVectorActivation_rtl_0" in hls_synt_res_est
 
     node = model.get_nodes_by_op_type("MatrixVectorActivation")[0]
     inst = getCustomOp(node)
@@ -339,10 +436,12 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
 @pytest.mark.parametrize("mw", [128])
 # HLS matrix height (output features)
 @pytest.mark.parametrize("mh", [128])
+# Backend
+@pytest.mark.parametrize("backend", ["rtl", "hls"])
 @pytest.mark.fpgadataflow
 @pytest.mark.vivado
 def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim(
-    mem_mode, idt, wdt, act, nf, sf, mw, mh
+    mem_mode, idt, wdt, act, nf, sf, mw, mh, backend
 ):
     if nf == -1:
         nf = mh
@@ -404,6 +503,7 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim(
     y_expected = y.reshape(oshape)
     # TODO split up into several dependent tests -- need to check how this
     # works for parametrized tests...
+    model = model.transform(SpecializeLayers())
     model = model.transform(SetExecMode("rtlsim"))
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(PrepareIP("xc7z020clg400-1", 5))
@@ -413,7 +513,10 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim(
     assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed"
 
     hls_synt_res_est = model.analysis(hls_synth_res_estimation)
-    assert "MatrixVectorActivation_0" in hls_synt_res_est
+    if backend == "hls":
+        assert "MatrixVectorActivation_hls_0" in hls_synt_res_est
+    else:
+        assert "MatrixVectorActivation_rtl_0" in hls_synt_res_est
 
     node = model.get_nodes_by_op_type("MatrixVectorActivation")[0]
     inst = getCustomOp(node)
@@ -440,9 +543,11 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim(
 @pytest.mark.parametrize("mw", [32])
 # HLS matrix height (output features)
 @pytest.mark.parametrize("mh", [32])
+# Backend
+@pytest.mark.parametrize("backend", ["rtl", "hls"])
 @pytest.mark.fpgadataflow
 @pytest.mark.vivado
-def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
+def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh, backend):
     if nf == -1:
         nf = mh
     if sf == -1:
@@ -469,6 +574,7 @@ def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh
         inst.set_nodeattr("mem_mode", mem_mode)
     total_fold = nf * sf
     exp_total_cycles = total_fold + 10
+    model = model.transform(SpecializeLayers())
     model = model.transform(SetExecMode("rtlsim"))
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(PrepareIP("xc7z020clg400-1", 5))

From 31083d52da1abfe61a732199ddb70e5bf9e9f4b6 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 26 Jan 2024 19:47:17 +0000
Subject: [PATCH 105/112] removed rtl refactoring steps

---
 src/finn/builder/build_dataflow_config.py |  3 +--
 src/finn/builder/build_dataflow_steps.py  | 11 -----------
 2 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index 073bc9e12b..e4fed05731 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -121,7 +121,6 @@ class VerificationStepType(str, Enum):
     "step_apply_folding_config",
     "step_minimize_bit_width",
     "step_generate_estimate_reports",
-    "step_specialize_to_rtl",
     "step_hls_codegen",
     "step_hls_ipgen",
     "step_set_fifo_depths",
@@ -234,7 +233,7 @@ class DataflowBuildConfig:
     #: activations in FINN) will be implemented as stand-alone HLS layers,
     #: instead of being part of MatrixVectorActivation layer. This gives larger
     #: flexibility, and makes it possible to have runtime-writable thresholds.
-    standalone_thresholds: Optional[bool] = True
+    standalone_thresholds: Optional[bool] = False
 
     #: (Optional) Whether optimizations that minimize the bit width of the
     #: weights and accumulator will be applied. Because this optimization relies
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 2629efef11..11107ccb64 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -53,7 +53,6 @@
 from shutil import copy
 
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
-import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
 import finn.transformation.streamline.absorb as absorb
 from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -473,15 +472,6 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig
     return model
 
 
-def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig):
-    """Convert layers implemented in HLS to an equivalent specialized RTL
-    implementation if possible."""
-    specialize_to_rtl_transforms = [to_rtl.InferRTLMatrixVectorActivation()]
-    for trn in specialize_to_rtl_transforms:
-        model = model.transform(trn)
-    return model
-
-
 def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig):
     """Tighten the weight and accumulator bit widths for each layer."""
     if cfg.minimize_bit_width:
@@ -844,7 +834,6 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig):
     "step_apply_folding_config": step_apply_folding_config,
     "step_minimize_bit_width": step_minimize_bit_width,
     "step_generate_estimate_reports": step_generate_estimate_reports,
-    "step_specialize_to_rtl": step_specialize_to_rtl,
     "step_hls_codegen": step_hls_codegen,
     "step_hls_ipgen": step_hls_ipgen,
     "step_set_fifo_depths": step_set_fifo_depths,

From 0032743e605651ce4a705297115c4cabf104b45d Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 26 Jan 2024 19:48:16 +0000
Subject: [PATCH 106/112] removed old rtl custom-op

---
 .../matrixvectoractivation_rtl.py             | 1086 -----------------
 1 file changed, 1086 deletions(-)
 delete mode 100644 src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py

diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
deleted file mode 100644
index fcab06658c..0000000000
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py
+++ /dev/null
@@ -1,1086 +0,0 @@
-# Copyright (c) 2020, Xilinx
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import math
-import numpy as np
-import os
-import textwrap
-import warnings
-from qonnx.core.datatype import DataType
-from qonnx.util.basic import (
-    calculate_matvec_accumulator_range,
-    interleave_matrix_outer_dim_from_partitions,
-    roundup_to_integer_multiple,
-)
-
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
-from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
-from finn.util.data_packing import (
-    npy_to_rtlsim_input,
-    pack_innermost_dim_as_hex_string,
-    rtlsim_output_to_npy,
-)
-
-try:
-    from pyverilator import PyVerilator
-except ModuleNotFoundError:
-    PyVerilator = None
-
-
-# ONNX i/o tensor shape assumptions for MatrixVectorActivation:
-# input 0 is the input tensor, shape (.., i_size) = (..., MW)
-# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH)
-# (optional) input 2 is the thresholds tensor, shape (o_size, n_thres)
-# output 0 is the output tensor, shape (.., o_size) = (..., MH)
-# the ... here can be any shape (representing groups of vectors)
-
-
-class MatrixVectorActivation_rtl(HLSCustomOp):
-    """Class that corresponds to finn-rtl Matrix Vector Unit."""
-
-    def __init__(self, onnx_node, **kwargs):
-        super().__init__(onnx_node, **kwargs)
-
-    def get_nodeattr_types(self):
-        my_attrs = {
-            "PE": ("i", True, 0),
-            "SIMD": ("i", True, 0),
-            "MW": ("i", True, 0),
-            "MH": ("i", True, 0),
-            "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}),
-            # FINN DataTypes for inputs, weights, outputs
-            "inputDataType": ("s", True, ""),
-            "weightDataType": ("s", True, ""),
-            "outputDataType": ("s", True, ""),
-            # FINN DataType for accumulator -- auto-computed and updated
-            "accDataType": ("s", False, "INT32"),
-            # number of input vectors, examples:
-            # [1] is a single vector (like a FC layer with batch=1)
-            # [4] is four vectors (like a FC layer with batch=4)
-            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
-            "numInputVectors": ("ints", False, [1]),
-            # memory mode for the FC weights
-            # const -- embedded weights, default, long compile/synth times
-            # decoupled -- streaming weights with weight streamer packaged inside IP
-            # external -- streaming weights with external streamer
-            "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}),
-            # FPGA resource type for memories in decoupled mode
-            # auto -- let Vivado decide
-            # block -- use BRAM
-            # distributed -- use LUTRAM
-            # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1
-            # see also https://www.xilinx.com/support/answers/38070.html
-            "ram_style": (
-                "s",
-                False,
-                "auto",
-                {"auto", "block", "distributed", "ultra"},
-            ),
-            # (mem_mode = decoupled only) whether weights will be writable through
-            # an AXI-lite interface during runtime
-            # 1 for enabled, 0 for disabled.
-            # see finn-rtllib/memstream/doc/README for more about the memory
-            # address map used for writable weights
-            # IMPORTANT: After using AXI lite to either read or write the weights,
-            # always "flush" the accelerator by first passing a dummy input
-            # vector through the accelerator. This will get rid of any old
-            # weight data from the weight FIFOs.
-            "runtime_writeable_weights": ("i", False, 0, {0, 1}),
-            # attribute to save top module name - not user configurable
-            "gen_top_module": ("s", False, ""),
-        }
-        my_attrs.update(super().get_nodeattr_types())
-        return my_attrs
-
-    def calc_wmem(self):
-        """Calculates and returns WMEM."""
-        mw = self.get_nodeattr("MW")
-        mh = self.get_nodeattr("MH")
-        pe = self.get_nodeattr("PE")
-        simd = self.get_nodeattr("SIMD")
-        assert mh % pe == 0, "Requirement MH divisable by PE is violated."
-        assert mw % simd == 0, "Requirement MW divisable by SIMD is violated."
-        wmem = mw * mh // (pe * simd)
-        return wmem
-
-    def calc_tmem(self):
-        """Calculates and returns TMEM."""
-        return 0
-
-    def make_shape_compatible_op(self, model):
-        oshape = self.get_normal_output_shape()
-        return super().make_const_shape_op(oshape)
-
-    def infer_node_datatype(self, model):
-        node = self.onnx_node
-        idt = model.get_tensor_datatype(node.input[0])
-        if idt != self.get_input_datatype():
-            warn_str = "inputDataType changing for %s: %s -> %s " % (
-                node.name,
-                str(self.get_input_datatype()),
-                str(idt),
-            )
-            warnings.warn(warn_str)
-        self.set_nodeattr("inputDataType", idt.name)
-        # set output datatype from property
-        odt = self.get_output_datatype()
-        model.set_tensor_datatype(node.output[0], odt)
-
-    def verify_node(self):
-        info_messages = []
-        # verify that "backend" is set to "fpgadataflow"
-        backend_value = self.get_nodeattr("backend")
-        if backend_value == "fpgadataflow":
-            info_messages.append("Attribute backend is set correctly")
-        else:
-            info_messages.append('Attribute backend should be set to "fpgadataflow"')
-
-        # verify that all necessary attributes exist
-        # TODO collect automatically from get_nodeattr_types
-        try:
-            self.get_nodeattr("executable_path")
-            self.get_nodeattr("resType")
-            self.get_nodeattr("MW")
-            self.get_nodeattr("MH")
-            self.get_nodeattr("SIMD")
-            self.get_nodeattr("PE")
-            self.get_nodeattr("inputDataType")
-            self.get_nodeattr("weightDataType")
-            self.get_nodeattr("outputDataType")
-            info_messages.append("All necessary attributes exist")
-        except Exception:
-            info_messages.append(
-                """The required MatrixVectorActivation attributes do not exist."""
-            )
-
-        num_of_inputs = len(self.onnx_node.input)
-        if num_of_inputs != 2:
-            info_messages.append(
-                "RTL-based MatrixVectorActivation expects two inputs "
-                "(weights and activation), but got {} inputs.".format(
-                    len(self.onnx_node.input)
-                )
-            )
-
-        mem_mode = self.get_nodeattr("mem_mode")
-
-        if mem_mode not in ["decoupled", "external"]:
-            info_messages.append(
-                "RTL-based MVU only supports decoupled or external weights."
-            )
-
-        if self.get_nodeattr("resType") == "lut":
-            info_message.append(
-                "RTL-based MVU only supports DSP-based implementation"
-            )
-
-        return info_messages
-
-    def uram_estimation(self):
-        P = self.get_nodeattr("PE")
-        Q = self.get_nodeattr("SIMD")
-        wdt = self.get_weight_datatype()
-        W = wdt.bitwidth()
-        D_in = self.get_nodeattr("MW")
-        D_out = self.get_nodeattr("MH")
-        omega = (D_in * D_out) / (Q * P)
-        mem_width = Q * W * P
-        mmode = self.get_nodeattr("mem_mode")
-        mstyle = self.get_nodeattr("ram_style")
-        if (
-            (mmode == "decoupled" and mstyle != "ultra")
-            or (mmode == "external")
-        ):
-            return 0
-        width_multiplier = math.ceil(mem_width / 72)
-        depth_multiplier = math.ceil(omega / 4096)
-        return width_multiplier * depth_multiplier
-
-    def bram_estimation(self):
-        """Calculates resource estimation for BRAM based on:
-        - FINN-R: An End-to-End Deep-Learning Framework for Fast
-        Exploration of Quantized Neural Networks
-        - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
-        Y. Umuroglu, M. Leeser and K. Vissers
-        - 12. Sep 2018
-        """
-        # TODO add in/out FIFO contributions
-        P = self.get_nodeattr("PE")
-        Q = self.get_nodeattr("SIMD")
-        wdt = self.get_weight_datatype()
-        W = wdt.bitwidth()
-        D_in = self.get_nodeattr("MW")
-        D_out = self.get_nodeattr("MH")
-        omega = (D_in * D_out) / (Q * P)
-        mem_width = Q * W * P
-        mmode = self.get_nodeattr("mem_mode")
-        mstyle = self.get_nodeattr("ram_style")
-        if (
-            (mmode == "decoupled" and mstyle in ["distributed", "ultra"])
-            or (mmode == "external")
-        ):
-            return 0
-        # assuming SDP mode RAMB18s (see UG573 Table 1-10)
-        # assuming decoupled (RTL) memory
-        if mem_width == 1:
-            return math.ceil(omega / 16384)
-        elif mem_width == 2:
-            return math.ceil(omega / 8192)
-        elif mem_width <= 4:
-            return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4))
-        elif mem_width <= 9:
-            return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 9))
-        elif mem_width <= 18 or omega > 512:
-            return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 18))
-        else:
-            return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36))
-
-    def bram_efficiency_estimation(self):
-        wdt = self.get_weight_datatype()
-        W = wdt.bitwidth()
-        D_in = self.get_nodeattr("MW")
-        D_out = self.get_nodeattr("MH")
-        bram16_est = self.bram_estimation()
-        if bram16_est == 0:
-            return 1
-        wbits = W * D_in * D_out
-        bram16_est_capacity = bram16_est * 36 * 512
-        return wbits / bram16_est_capacity
-
-    def uram_efficiency_estimation(self):
-        """Function for URAM efficiency estimation: actual parameter storage
-        needed divided by the allocated URAM storage (from estimation)"""
-        wdt = self.get_weight_datatype()
-        W = wdt.bitwidth()
-        D_in = self.get_nodeattr("MW")
-        D_out = self.get_nodeattr("MH")
-        uram_est = self.uram_estimation()
-        if uram_est == 0:
-            return 1
-        wbits = W * D_in * D_out
-        uram_est_capacity = uram_est * 72 * 4096
-        return wbits / uram_est_capacity
-
-# TODO: fix lut estimations 
-    def lut_estimation(self):
-        """Calculates resource estimations for LUTs based on:
-        - FINN-R: An End-to-End Deep-Learning Framework for Fast
-        Exploration of Quantized Neural Networks
-        - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
-        Y. Umuroglu, M. Leeser and K. Vissers
-        - 12. Sep 2018
-        """
-        # TODO add in/out FIFO contributions
-        P = self.get_nodeattr("PE")
-        Q = self.get_nodeattr("SIMD")
-        MW = self.get_nodeattr("MW")
-        wdt = self.get_weight_datatype()
-        W = wdt.bitwidth()
-        # determine tdt with input and weight data types
-        idt = self.get_input_datatype()
-        A = idt.bitwidth()
-        # parameters from experiments in paper mentioned above
-        c0 = 300
-        c1 = 1.1
-        c2 = 0
-        mmode = self.get_nodeattr("mem_mode")
-        mstyle = self.get_nodeattr("ram_style")
-        if (mmode == "decoupled" and mstyle == "distributed") or (
-            mmode == "const" and self.calc_wmem() <= 128
-        ):
-            c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64)
-
-        # multiplication
-        res_type = self.get_nodeattr("resType")
-        if res_type == "dsp":
-            mult_luts = 0
-        else:
-            mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A)
-        # adder tree
-        addertree_luts = (W + A) * (2 * Q - 1)
-        # accumulator
-        acc_bits = W + A + np.ceil(math.log(MW, 2))
-        acc_luts = acc_bits
-
-        return int(c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2)
-
-# TODO: fix DSP estimations --> depends on fpga_part
-    def dsp_estimation(self):
-        # multiplication
-        # mvu_8sx9 (DSP58): ceil(SIMD/3)
-        # mvu_4sx4u (DSP48/DSP58): ceil(PE/4)
-        # mvu_8sx8u (DSP48): ceil(PE/2)
-        # mvu_lut: 0
-        P = self.get_nodeattr("PE")
-        res_type = self.get_nodeattr("resType")
-        Q = self.get_nodeattr("SIMD")
-        wdt = self.get_weight_datatype()
-        W = wdt.bitwidth()
-        idt = self.get_input_datatype()
-        A = idt.bitwidth()
-        if res_type == "dsp":
-            mult_dsp = P * Q * np.ceil((W + A) / 48)  # TODO: more accurate modelling
-        else:
-            mult_dsp = 0
-        return int(mult_dsp)
-
-# TODO: fix exp_cycles estimations --> depends on fpga_part and clk
-    def get_exp_cycles(self):
-        # mvu_8sx9 (DSP58):
-        # 2 (replay_buffer) + ceil(chainlen/seglen) + 2 (MREG, PREG) + 2 (output reg slice)
-        # + MW/SIMD * MH/PE
-        # mvu_4sx4u (DSP48/DSP58) / mvu_8sx8u (DSP48): 
-        # 3 (IN_REG, MREG, PREG) + 2 (replay_buffer) + 2 (output reg slice) + 1 (adder tree SIMD) + 1 (output lane)
-        # + MW/SIMD * MH/PE
-        # mvu_lut:
-        # 2 (replay_buffer) + 1 OR 2 (no MREG OR MREG) + 2 (output reg slice) 
-        # + MW/SIMD * MH/PE
-        pe = self.get_nodeattr("PE")
-        simd = self.get_nodeattr("SIMD")
-        num_inp_vec = self.get_nodeattr("numInputVectors")
-        mh = self.get_nodeattr("MH")
-        mw = self.get_nodeattr("MW")
-        # since mmv != 1 is not supported yet, we set mmv for now to 1
-        mmv = 1     
-        exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv
-        return int(exp_cycles)
-
-    def get_input_datatype(self, ind=0):
-        """Returns FINN DataType of input."""
-        # when performing FIFO insertion on an FC layer with ext weights, the ind
-        # parameter can be > 0 (referring to the weights) so handle that here
-        if ind == 0:
-            return DataType[self.get_nodeattr("inputDataType")]
-        elif ind == 1:
-            return DataType[self.get_nodeattr("weightDataType")]
-        else:
-            raise Exception("Undefined input ind for this layer type")
-
-    def get_weight_datatype(self):
-        """Returns FINN DataType of weights."""
-        return DataType[self.get_nodeattr("weightDataType")]
-
-    def get_output_datatype(self, ind=0):
-        """Returns FINN DataType of output."""
-        return DataType[self.get_nodeattr("outputDataType")]
-
-    def get_instream_width(self, ind=0):
-        i_bits = self.get_input_datatype().bitwidth()
-        assert (
-            i_bits <= 9
-        ), "RTL-based MVAU only supports activations with bit-width up to 9-bits"
-        in_width = i_bits * self.get_nodeattr("SIMD")
-        return in_width
-
-    def get_outstream_width(self, ind=0):
-        o_bits = self.get_output_datatype().bitwidth()
-        out_width = o_bits * self.get_nodeattr("PE")
-        return out_width
-
-    def get_weightstream_width(self):
-        """Returns weight stream width. Used only in decoupled mode."""
-        if (
-            self.get_nodeattr("mem_mode") == "decoupled"
-            or self.get_nodeattr("mem_mode") == "external"
-        ):
-            pe = self.get_nodeattr("PE")
-            simd = self.get_nodeattr("SIMD")
-            wp = self.get_weight_datatype().bitwidth()
-            assert (
-                wp <= 8
-            ), "RTL-based MVAU only supports weights with bit-width up to 8-bits"
-            w_width = pe * simd * wp
-            return w_width
-        else:
-            return 0
-
-    def get_weightstream_width_padded(self):
-        """Returns weight stream width padded to a multiple of 8. This is required
-        by the AXI Stream spec. Used in decoupled mode."""
-        weight_width = self.get_weightstream_width()
-        return roundup_to_integer_multiple(weight_width, 8)
-
-    def get_ap_int_max_w(self):
-        # base class impl (max of inp/out stream widths)
-        max_of_io = super().get_ap_int_max_w()
-        # decoupled mode weight stream
-        weightstream = self.get_weightstream_width()
-        # single PE weight entry
-        weight_bits = self.get_weight_datatype().bitwidth()
-        simd = self.get_nodeattr("SIMD")
-        single_pe_w = simd * weight_bits
-        return max([weightstream, max_of_io, single_pe_w])
-
-    def get_folded_input_shape(self, ind=0):
-        mw = self.get_nodeattr("MW")
-        mh = self.get_nodeattr("MH")
-        simd = self.get_nodeattr("SIMD")
-        pe = self.get_nodeattr("PE")
-        sf = mw // simd
-        nf = mh // pe
-        vecs = list(self.get_nodeattr("numInputVectors"))
-
-        if ind == 0:
-            # calculate shape of input 0
-            folded_input_shape = tuple(vecs + [sf, simd])
-        elif ind == 1 and self.get_nodeattr("mem_mode") == "external":
-            # calculate shape of input 1 (weights)
-            folded_input_shape = tuple(vecs + [sf * nf, simd * pe])
-        else:
-            raise Exception("Undefined input shape for requested input")
-
-        return folded_input_shape
-
-    def get_folded_output_shape(self, ind=0):
-        mh = self.get_nodeattr("MH")
-        pe = self.get_nodeattr("PE")
-        nf = mh // pe
-        vecs = list(self.get_nodeattr("numInputVectors"))
-        folded_output_shape = tuple(vecs + [nf, pe])
-        return folded_output_shape
-
-    def get_normal_input_shape(self, ind=0):
-        mw = self.get_nodeattr("MW")
-        vecs = list(self.get_nodeattr("numInputVectors"))
-        normal_input_shape = tuple(vecs + [mw])
-        return normal_input_shape
-
-    def get_normal_output_shape(self, ind=0):
-        mh = self.get_nodeattr("MH")
-        vecs = list(self.get_nodeattr("numInputVectors"))
-        normal_output_shape = tuple(vecs + [mh])
-        return normal_output_shape
-
-    def get_number_output_values(self):
-        nf = np.prod(self.get_folded_output_shape()[:-1])
-        return nf
-
-    def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
-        """Convert the original numpy weight matrix orig_weight_matrix into
-        a form suitable for passing to the hlslib call:
-        * ensure MH % PE == 0 and MW % SIMD == 0
-        * for bipolar {-1,+1} weights, convert to binary {0, 1}
-        * interleave rows between PEs
-        * reshape into (1, PE, WMEM, SIMD) and return
-        """
-        mw = self.get_nodeattr("MW")
-        mh = self.get_nodeattr("MH")
-        pe = self.get_nodeattr("PE")
-        simd = self.get_nodeattr("SIMD")
-        wmem = self.calc_wmem()
-        assert orig_weight_matrix.shape == (
-            mw,
-            mh,
-        ), """Weights matrix doesn't
-        have expected shape (mw, mh)"""
-        assert mw % simd == 0, "Requirement MH divisable by SIMD is violated."
-        assert mh % pe == 0, "Requirement MH divisable by PE is violated."
-        # start by transposing the original weight matrix, since ONNX and
-        # finn-hlslib use different assumptions
-        # ONNX uses (in_features, out_features) and matmul(x, W)
-        # finn-hlslib uses (out_features, in_features) and matmul(W, x)
-        ret = orig_weight_matrix.T
-        # interleave rows between PEs and reshape
-        # distribute rows between PEs
-        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
-        # create SIMD as innermost dimension and add a dummy outer dim
-        ret = ret.reshape(1, pe, wmem, simd)
-        # reverse the SIMD dimension
-        ret = np.flip(ret, axis=-1)
-        return ret
-
-    def minimize_accumulator_width(self, model):
-        weights = model.get_initializer(self.onnx_node.input[1])
-        idt = self.get_input_datatype()
-        # calculate minimum and maximum values of accumulator
-        (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt)
-        if acc_min < 0:
-            if abs(acc_min) > acc_max:
-                adt = DataType.get_smallest_possible(acc_min)
-            else:
-                adt = DataType.get_smallest_possible(-acc_max - 1)
-        else:
-            adt = DataType.get_smallest_possible(acc_max)
-        # Note: we are interested in simply the width of the output dot product.
-        # Padding the actual output stream to a multiple of 8-bits is done in
-        # the RTL component
-        self.set_nodeattr("accDataType", adt.name)
-        # for no-activation nodes, output dt = acc dt
-        self.set_nodeattr("outputDataType", adt.name)
-        return DataType[self.get_nodeattr("accDataType")]
-
-    def make_weight_file(self, weights, weight_file_mode, weight_file_name):
-        """Produce a file containing given weights in appropriate format for this
-        layer. This file can be used for either synthesis or run-time reconfig
-        of weights.
-
-        Arguments:
-        * weights : numpy array with weights to be put into the file
-        * weight_file_mode : one of {hls_header, decoupled_verilog_dat,
-          decoupled_runtime}
-        * weight_file_name : filename for the weight file to be generated
-        """
-        # convert weights into hlslib-compatible format
-        weight_tensor = self.get_hls_compatible_weight_tensor(weights)
-        export_wdt = self.get_weight_datatype()
-        if "decoupled" in weight_file_mode:
-            # create a weight stream for various flavors of decoupled mode:
-            # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
-            weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
-            # reverse SIMD flip for saving weights in .npy
-            weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1)
-            # PE flip for saving weights in .dat
-            weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2)
-            # reshape weight tensor (simd_flipped and pe_flipped) to desired shape
-            pe = self.get_nodeattr("PE")
-            simd = self.get_nodeattr("SIMD")
-            # simd_flipped
-            weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape(
-                1, -1, pe * simd
-            )
-            weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy()
-            # flipped
-            weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(
-                1, -1, pe * simd
-            )
-            weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy()
-            if weight_file_mode == "decoupled_npy":
-                # save weight stream into npy for cppsim
-                np.save(weight_file_name, weight_tensor_simd_flipped)
-            elif weight_file_mode == "decoupled_verilog_dat":
-                # convert weight values into hexstring
-                weight_width = self.get_weightstream_width()
-                # pad to nearest 4 bits to get hex strings
-                weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
-                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
-                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
-                )
-                # add zeroes to pad out file to 1024 entries
-                weight_stream = weight_tensor_pe_flipped.flatten()
-                weight_stream = weight_stream.copy()
-                with open(weight_file_name, "w") as f:
-                    for val in weight_stream:
-                        f.write(val + "\n")
-            elif weight_file_mode == "decoupled_runtime":
-                # memstream axi-lite interface will map each mem line to
-                # one or multiple 32-bit words
-                weight_width = self.get_weightstream_width()
-                words_per_memwidth = 2 ** math.ceil(math.log2(weight_width / 32))
-                if words_per_memwidth < 1:
-                    words_per_memwidth = 1
-                weight_width_padded = words_per_memwidth * 32
-                # first, pack and ensure padding to 32 bits
-                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
-                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
-                )
-                weight_stream = weight_tensor_pe_flipped.flatten()
-                weight_stream = weight_stream.copy()
-                with open(weight_file_name, "w") as f:
-                    for val in weight_stream:
-                        # split into groups of 8 hex digits (= 32 bits)
-                        words_32b = textwrap.wrap(val, 8)
-                        words_32b.reverse()
-                        for word_32b in words_32b:
-                            f.write(word_32b + "\n")
-            else:
-                raise Exception("Unknown/unsupported weight_file_mode")
-
-        else:
-            raise Exception("Unknown/unsupported weight_file_mode")
-
-    def generate_params(self, model, path):
-        mem_mode = self.get_nodeattr("mem_mode")
-        code_gen_dir = path
-        # weights, if not external
-        weights = model.get_initializer(self.onnx_node.input[1])
-        if mem_mode == "decoupled" or mem_mode == "external":
-            weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
-            # save decoupled weights for cppsim
-            self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
-            if mem_mode == "decoupled":
-                # also save weights as Verilog .dat file
-                # This file will be ignored when synthesizing UltraScale memory.
-                weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir)
-                self.make_weight_file(
-                    weights, "decoupled_verilog_dat", weight_filename_rtl
-                )
-        else:
-            raise Exception(
-                """Please set mem_mode to "const", "decoupled", or "external",
-                currently no other parameter value is supported!"""
-            )
-
-    def execute_node(self, context, graph):
-        mode = self.get_nodeattr("exec_mode")
-        mem_mode = self.get_nodeattr("mem_mode")
-        node = self.onnx_node
-
-        if mode == "cppsim":
-            raise Exception(
-                "cppsim not possible for RTL MVAU, please set exec_mode to rtlsim"
-            )
-        elif mode == "rtlsim":
-            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        else:
-            raise Exception(
-                """Invalid value for attribute exec_mode! Is currently set to: {}
-            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
-                    mode
-                )
-            )
-
-        # create a npy file fore each input of the node (in_ind is input index)
-        in_ind = 0
-        for inputs in node.input:
-            # it is assumed that the first input of the node is the data input
-            # the second input are the weights
-            if in_ind == 0:
-                assert (
-                    str(context[inputs].dtype) == "float32"
-                ), """Input datatype is
-                not float32 as expected."""
-                expected_inp_shape = self.get_folded_input_shape()
-                reshaped_input = context[inputs].reshape(expected_inp_shape)
-                export_idt = self.get_input_datatype()
-                # make copy before saving the array
-                reshaped_input = reshaped_input.copy()
-                np.save(
-                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
-                    reshaped_input,
-                )
-            elif in_ind > 2:
-                raise Exception("Unexpected input found for MatrixVectorActivation_rtl")
-            in_ind += 1
-
-        if mode == "rtlsim":
-            sim = self.get_rtlsim()
-            nbits = self.get_instream_width()
-            inp = npy_to_rtlsim_input(
-                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
-            )
-            super().reset_rtlsim(sim)
-            super().toggle_clk(sim)
-            if mem_mode in ["external", "decoupled"]:
-                wnbits = self.get_weightstream_width()
-                export_wdt = self.get_weight_datatype()
-                wei = npy_to_rtlsim_input(
-                    "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits
-                )
-                num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
-                io_dict = {
-                    "inputs": {"in0": inp, "weights": wei * num_w_reps},
-                    "outputs": {"out": []},
-                }
-                self.rtlsim_multi_io(sim, io_dict)
-                output = io_dict["outputs"]["out"]
-            else:
-                output = self.rtlsim(sim, inp)
-            odt = self.get_output_datatype()
-            target_bits = odt.bitwidth()
-            packed_bits = self.get_outstream_width()
-            out_npy_path = "{}/output.npy".format(code_gen_dir)
-            out_shape = self.get_folded_output_shape()
-            rtlsim_output_to_npy(
-                output, out_npy_path, odt, out_shape, packed_bits, target_bits
-            )
-            # load and reshape output
-            output = np.load(out_npy_path)
-            oshape = self.get_normal_output_shape()
-            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
-            context[node.output[0]] = output
-        else:
-            raise Exception(
-                """Invalid value for attribute exec_mode! Is currently set to: {}
-            has to be set to "rtlsim" """.format(
-                    mode
-                )
-            )
-
-    def code_generation_ipgen(self, model, fpgapart, clk):
-        """Normally: Generates C++ code and tcl script for IP generation.
-        Here: Generates (System-)Verilog code for IP generation."""
-        self.generate_hdl(model, fpgapart, clk)
-
-    def ipgen_singlenode_code(self):
-        """Normally: Builds the bash script for IP generation."""
-        pass
-
-    def code_generation_cppsim(self, model):
-        """Normally: Generates C++ code for simulation (cppsim)."""
-        pass
-
-    def compile_singlenode_code(self):
-        pass
-
-    def global_includes(self):
-        pass
-
-    def defines(self, var):
-        pass
-
-    def read_npy_data(self):
-        pass
-
-    def strm_decl(self):
-        pass
-
-    def docompute(self):
-        pass
-
-    def dataoutstrm(self):
-        pass
-
-    def save_as_npy(self):
-        pass
-
-    def blackboxfunction(self):
-        pass
-
-    def pragmas(self):
-        pass
-
-    def code_generation_ipi(self):
-        cmd = []
-        # add streamer if needed
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "decoupled":
-            runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
-            if self.get_nodeattr("ram_style") == "ultra":
-                assert (
-                    runtime_writable == 1
-                ), "Layer with URAM weights must have runtime_writeable_weights=1"
-            node_name = self.onnx_node.name
-            sname = self.hls_sname()
-            # create a hierarchy for this layer, with the same port names
-            clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
-            rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
-            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0]
-            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
-            cmd.append("create_bd_cell -type hier %s" % node_name)
-            cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
-            cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
-            cmd.append(
-                "create_bd_intf_pin -mode Master "
-                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
-                % (node_name, dout_name)
-            )
-            cmd.append(
-                "create_bd_intf_pin -mode Slave "
-                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
-            )
-            # instantiate the RTL block
-            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-            rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
-            sourcefiles = [
-                os.path.join(
-                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
-                ),
-                rtllib_dir + "mvu_vvu_axi.sv",
-                rtllib_dir + "replay_buffer.sv",
-                rtllib_dir + "mvu_4sx4u.sv",
-                rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
-                rtllib_dir + "mvu_8sx8u_dsp48.sv",
-            ]
-            for f in sourcefiles:
-                cmd.append("add_files -norecurse %s" % (f))
-            cmd.append(
-                "create_bd_cell -type hier -reference %s /%s/%s"
-                % (
-                    self.get_nodeattr("gen_top_module"),
-                    self.onnx_node.name,
-                    self.onnx_node.name,
-                )
-            )
-
-            # instantiate a streamer and connect it to the HLS IP
-            strm_vlnv = "amd.com:finn:memstream:1.0"
-            strm_inst = node_name + "_wstrm"
-            cmd.append(
-                "create_bd_cell -type ip -vlnv %s /%s/%s"
-                % (strm_vlnv, node_name, strm_inst)
-            )
-            cmd.append(
-                "set_property -dict [list "
-                "CONFIG.DEPTH {%d} "
-                "CONFIG.WIDTH {%d} "
-                "CONFIG.INIT_FILE {%s} "
-                "CONFIG.RAM_STYLE {%s} "
-                "] [get_bd_cells /%s/%s]"
-                % (
-                    self.calc_wmem(),
-                    self.get_weightstream_width_padded(),
-                    self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat",
-                    self.get_nodeattr("ram_style"),
-                    node_name,
-                    strm_inst,
-                )
-            )
-            cmd.append(
-                "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] "
-                "[get_bd_intf_pins %s/%s/weights_%s]"
-                % (node_name, strm_inst, node_name, node_name, sname)
-            )
-            cmd.append(
-                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]"
-                % (node_name, rst_name, node_name, strm_inst)
-            )
-            cmd.append(
-                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]"
-                % (node_name, clk_name, node_name, strm_inst)
-            )
-            cmd.append(
-                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
-                % (node_name, rst_name, node_name, node_name, rst_name)
-            )
-            cmd.append(
-                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
-                % (node_name, clk_name, node_name, node_name, clk_name)
-            )
-            cmd.append(
-                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
-                "[get_bd_intf_pins %s/%s/%s]"
-                % (node_name, din_name, node_name, node_name, din_name)
-            )
-            cmd.append(
-                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
-                "[get_bd_intf_pins %s/%s/%s]"
-                % (node_name, dout_name, node_name, node_name, dout_name)
-            )
-            if runtime_writable:
-                # expose axi lite interface for writeable weights
-                axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0]
-                cmd.append(
-                    "create_bd_intf_pin -mode Slave "
-                    "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s"
-                    % (node_name, axilite_name)
-                )
-                cmd.append(
-                    "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
-                    "[get_bd_intf_pins %s/%s/%s]"
-                    % (node_name, axilite_name, node_name, strm_inst, axilite_name)
-                )
-                # TODO calculate and pass in segment size here
-                cmd.append("assign_bd_address")
-            cmd.append("save_bd_design")
-        elif mem_mode == "external":
-            # instantiate the RTL block
-            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-            rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
-            sourcefiles = [
-                os.path.join(
-                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
-                ),
-                rtllib_dir + "mvu_vvu_axi.sv",
-                rtllib_dir + "replay_buffer.sv",
-                rtllib_dir + "mvu_4sx4u.sv",
-                rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
-                rtllib_dir + "mvu_8sx8u_dsp48.sv",
-            ]
-            for f in sourcefiles:
-                cmd.append("add_files -norecurse %s" % (f))
-            cmd.append(
-                "create_bd_cell -type module -reference %s %s"
-                % (
-                    self.get_nodeattr("gen_top_module"),
-                    self.onnx_node.name,
-                )
-            )
-        else:
-            raise Exception("Unrecognized mem_mode for MatrixVectorActivation")
-        return cmd
-
-    def get_verilog_top_module_intf_names(self):
-        intf_names = super().get_verilog_top_module_intf_names()
-        mem_mode = self.get_nodeattr("mem_mode")
-        sname = self.hls_sname()
-        if mem_mode == "external":
-            intf_names["s_axis"].append(
-                ("weights_" + sname, self.get_weightstream_width_padded())
-            )
-        if mem_mode == "decoupled":
-            # only expose axilite interface if attribute is set
-            runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
-            if runtime_writable:
-                intf_names["axilite"] = ["s_axilite"]
-        return intf_names
-
-    def get_op_and_param_counts(self):
-        in_features = self.get_nodeattr("MW")
-        out_features = self.get_nodeattr("MH")
-        weight_bits = self.get_weight_datatype().bitwidth()
-        inp_bits = self.get_input_datatype().bitwidth()
-        num_inp_vec = self.get_nodeattr("numInputVectors")
-        num_repetitions = int(np.prod(num_inp_vec))
-        mac_count = in_features * out_features * num_repetitions
-        # cannonicalize op type: highest bitwidth operand first s.t.
-        # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types
-        bw1 = min(inp_bits, weight_bits)
-        bw2 = max(inp_bits, weight_bits)
-        mac_op_type = "op_mac_%dbx%db" % (bw1, bw2)
-        weight_param_type = "param_weight_%db" % (weight_bits)
-        weight_count = in_features * out_features
-        ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
-        return ret_dict
-
-    def derive_characteristic_fxns(self, period):
-        n_inps = np.prod(self.get_folded_input_shape()[:-1])
-        io_dict = {
-            "inputs": {
-                "in0": [0 for i in range(n_inps)],
-            },
-            "outputs": {"out": []},
-        }
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode in ["decoupled", "external"]:
-            n_weight_inps = self.calc_wmem()
-            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
-            io_dict["inputs"]["weights"] = [
-                0 for i in range(num_w_reps * n_weight_inps)
-            ]
-        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
-
-    def _resolve_segment_len(self, clk):
-        # Insert pipeline registers in the DSP58 chain to meet target clock frequency
-        # ~0.741 ns seems the worst-case delay through first DSP
-        # ~0.605 ns seems to be (on average) delay for all subsequent DSPs
-        # clk >= (critical_path_dsps - 1) * 0.605 + 0.741
-        assert (clk > 0.741), "Infeasible clk target of {} ns has been set, consider lowering the targeted clock frequency!".format(clk)
-        critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1)
-        max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3)
-        dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len
-        return dsp_chain_len
-
-    def _resolve_impl_style(self, fpgapart):
-        # Based on target device and activation/weight-width, choose the
-        # supported RTL compute core
-        
-        assert self.get_nodeattr("resType") != "lut", "LUT-based RTL-MVU implementation currently not supported! Please change resType for {}".format(self.onnx_node.name)
-
-        act_width = self.get_input_datatype(0).bitwidth()
-        weight_width = self.get_input_datatype(1).bitwidth()
-        is_versal = (
-            fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
-            or fpgapart[0:5] == "xqrvc"
-        )
-        
-        if is_versal:
-            return "mvu_vvu_8sx9_dsp58"
-        else:
-            if act_width == 4 and weight_width == 4:
-                return "mvu_4sx4u"
-            else:
-                return "mvu_8sx8u_dsp48"
-
-    def generate_hdl(self, model, fpgapart, clk):
-        # Generate params as part of IP preparation
-        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        self.generate_params(model, code_gen_dir)
-
-        template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk)
-        # add general parameters to dictionary
-        code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [
-            self.get_verilog_top_module_name()
-        ]
-        # save top module name so we can refer to it after this node has been renamed
-        # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
-        self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
-
-        # apply code generation to template
-        with open(template_path, "r") as f:
-            template_wrapper = f.read()
-        for key in code_gen_dict:
-            # transform list into long string separated by '\n'
-            code_gen_line = "\n".join(code_gen_dict[key])
-            template_wrapper = template_wrapper.replace(key, code_gen_line)
-        with open(
-            os.path.join(
-                code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
-            ),
-            "w",
-        ) as f:
-            f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(0)))
-        with open(
-            os.path.join(
-                code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"
-            ),
-            "w",
-        ) as f:
-            f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(1)))
-
-        # set ipgen_path and ip_path so that HLS-Synth transformation
-        # and stich_ip transformation do not complain
-        self.set_nodeattr("ipgen_path", code_gen_dir)
-        self.set_nodeattr("ip_path", code_gen_dir)
-
-    def prepare_codegen_default(self, fpgapart, clk):
-        template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v"
-
-        code_gen_dict = {}
-        code_gen_dict["$IS_MVU$"] = [str(1)]
-        code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)]
-        code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))]
-        code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))]
-        code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))]
-        code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))]
-        code_gen_dict["$ACTIVATION_WIDTH$"] = [
-            str(self.get_input_datatype(0).bitwidth())
-        ]
-        code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())]
-        code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())]
-        code_gen_dict["$SIGNED_ACTIVATIONS$"] = (
-            [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)]
-        )
-        code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))]
-
-        return template_path, code_gen_dict
-
-    def prepare_rtlsim(self):
-        """Creates a Verilator emulation library for the RTL code generated
-        for this node, sets the rtlsim_so attribute to its path and returns
-        a PyVerilator wrapper around it."""
-
-        if PyVerilator is None:
-            raise ImportError("Installation of PyVerilator is required.")
-
-        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        # Path to (System-)Verilog files used by top-module & path to top-module
-        verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"]
-        verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"]
-
-        # build the Verilator emu library
-        sim = PyVerilator.build(
-            verilog_files,
-            build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
-            verilog_path=verilog_paths,
-            trace_depth=get_rtlsim_trace_depth(),
-            top_module_name=self.get_verilog_top_module_name(),
-        )
-        # save generated lib filename in attribute
-        self.set_nodeattr("rtlsim_so", sim.lib._name)
-
-        return sim

From 04ec5620ba5460b41d366e54f1a6ad099c551808 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 26 Jan 2024 19:49:15 +0000
Subject: [PATCH 107/112] removed old specialize_to_rtl transform

---
 .../fpgadataflow/specialize_to_rtl_layers.py  | 191 ------------------
 1 file changed, 191 deletions(-)
 delete mode 100644 src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py

diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
deleted file mode 100644
index 5061282695..0000000000
--- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# Copyright (c) 2023, AMD
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import numpy as np
-from qonnx.transformation.base import Transformation
-from qonnx.custom_op.registry import getCustomOp
-from qonnx.core.datatype import DataType
-from onnx import helper
-from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.transformation.infer_datatypes import InferDataTypes
-from qonnx.transformation.general import GiveUniqueNodeNames
-from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth
-
-class InferRTLMatrixVectorActivation(Transformation):
-    """Convert (HLS-based) MatrixVectorActivation layers to specialized RTL layers if supported."""
-
-    def __init__(self):
-        super().__init__()
-
-    def _is_rtl_variant_compatible(self, n):
-        no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1
-        act_width_in_range = (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8) or (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0)
-        weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8
-        folding_supported = (getCustomOp(n).get_nodeattr("MH") % getCustomOp(n).get_nodeattr("PE") == 0) and (getCustomOp(n).get_nodeattr("MW") % getCustomOp(n).get_nodeattr("SIMD") == 0)
-
-        if (no_activation and act_width_in_range and weight_width_in_range and folding_supported):
-            return True
-        else:
-            return False
-
-
-    def apply(self, model):
-        graph = model.graph
-        node_ind = 0
-        graph_modified = False
-        for n in graph.node:
-            node_ind += 1
-            if n.op_type == "MatrixVectorActivation":
-                preferred_in_rtl = getCustomOp(n).get_nodeattr("preferred_backend") == "rtl"
-                supported_in_rtl = self._is_rtl_variant_compatible(n)
-                if (preferred_in_rtl and supported_in_rtl):
-                    mvau_input = n.input[0]
-                    mvau_weight = n.input[1]
-                    mvau_output = n.output[0]
-                    inputDataType = getCustomOp(n).get_nodeattr("inputDataType")
-                    weightDataType = getCustomOp(n).get_nodeattr("weightDataType")
-                    outputDataType = getCustomOp(n).get_nodeattr("outputDataType")
-                    numInputVectors = getCustomOp(n).get_nodeattr("numInputVectors")
-                    mw = getCustomOp(n).get_nodeattr("MW")
-                    mh = getCustomOp(n).get_nodeattr("MH")
-                    simd = getCustomOp(n).get_nodeattr("SIMD")
-                    pe = getCustomOp(n).get_nodeattr("PE")
-                    mem_mode = getCustomOp(n).get_nodeattr("mem_mode")
-                    ram_style = getCustomOp(n).get_nodeattr("ram_style")
-                    resType = getCustomOp(n).get_nodeattr("resType")
-                    runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights")
-
-                    new_node = helper.make_node(
-                        "MatrixVectorActivation_rtl",
-                        [mvau_input, mvau_weight],
-                        [mvau_output],
-                        domain="finn.custom_op.fpgadataflow",
-                        backend="fpgadataflow",
-                        MW=mw,
-                        MH=mh,
-                        SIMD=simd,
-                        PE=pe,
-                        inputDataType=inputDataType,
-                        weightDataType=weightDataType,
-                        outputDataType=outputDataType,
-                        numInputVectors=numInputVectors,
-                        mem_mode=mem_mode,
-                        resType=resType,
-                        name=n.name + "_rtl",
-                        ram_style=ram_style,
-                        runtime_writeable_weights=runtime_writeable_weights
-                    )
-                    graph.node.insert(node_ind, new_node)
-                    # remove old node
-                    graph.node.remove(n)
-                    graph_modified=True
-        
-        if graph_modified:
-            model = model.transform(MinimizeAccumulatorWidth())
-            model = model.transform(InferShapes())
-            model = model.transform(InferDataTypes())
-            model = model.transform(GiveUniqueNodeNames())
-        
-        return (model, graph_modified)
-
-class InferRTLVectorVectorActivation(Transformation):
-    """Convert (HLS-based) VectorVectorActivation layers to specialized RTL layers is supported."""
-
-    def __init__(self):
-        super().__init__()
-
-    def _is_rtl_variant_compatible(self, n):
-        no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1
-        act_width_in_range = (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8) or (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0)
-        weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8
-        folding_supported = (getCustomOp(n).get_nodeattr("Channels") % getCustomOp(n).get_nodeattr("PE") == 0) and (np.prod(getCustomOp(n).get_nodeattr("Kernel")) % getCustomOp(n).get_nodeattr("SIMD") == 0)
-        
-        if (no_activation and act_width_in_range and weight_width_in_range and folding_supported):
-            return True
-        else:
-            return False
-    
-    def apply(self, model):
-        graph = model.graph
-        node_ind = 0
-        graph_modified = False
-        for n in graph.node:
-            node_ind += 1
-            if n.op_type == "VectorVectorActivation":
-                preferred_in_rtl = getCustomOp(n).get_nodeattr("preferred_backend") == "rtl"
-                supported_in_rtl = self._is_rtl_variant_compatible(n)
-                if (preferred_in_rtl and supported_in_rtl):
-                    vvau_input = n.input[0]
-                    vvau_weight = n.input[1]
-                    vvau_output = n.output[0]
-                    inputDataType = getCustomOp(n).get_nodeattr("inputDataType")
-                    weightDataType = getCustomOp(n).get_nodeattr("weightDataType")
-                    outputDataType = getCustomOp(n).get_nodeattr("outputDataType")
-                    pe = getCustomOp(n).get_nodeattr("PE")
-                    simd = getCustomOp(n).get_nodeattr("SIMD")
-                    dim = getCustomOp(n).get_nodeattr("Dim")
-                    channels = getCustomOp(n).get_nodeattr("Channels")
-                    kernel = getCustomOp(n).get_nodeattr("Kernel")
-                    resType = getCustomOp(n).get_nodeattr("resType")
-                    mem_mode = getCustomOp(n).get_nodeattr("mem_mode")
-                    runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights")
-                    ram_style = getCustomOp(n).get_nodeattr("ram_style")
-                    resType = getCustomOp(n).get_nodeattr("resType")                    
-
-                    new_node = helper.make_node(
-                        "VectorVectorActivation_rtl",
-                        [vvau_input, vvau_weight],
-                        [vvau_output],
-                        domain="finn.custom_op.fpgadataflow",
-                        backend="fpgadataflow",
-                        name=n.name + "_rtl",
-                        PE=pe,
-                        SIMD=simd,
-                        Dim=dim,
-                        Channels=channels,
-                        Kernel=kernel,
-                        resType=resType,
-                        inputDataType=inputDataType,
-                        weightDataType=weightDataType,
-                        outputDataType=outputDataType,
-                        mem_mode=mem_mode,
-                        runtime_writeable_weights=runtime_writeable_weights,
-                        ram_style=ram_style
-                    )
-                    graph.node.insert(node_ind, new_node)
-                    # remove old node
-                    graph.node.remove(n)
-                    graph_modified=True
-        
-        if graph_modified:
-            model = model.transform(MinimizeAccumulatorWidth())
-            model = model.transform(InferShapes())
-            model = model.transform(InferDataTypes())
-            model = model.transform(GiveUniqueNodeNames())
-        
-        return (model, graph_modified)
\ No newline at end of file

From de778cf73ba2dabbb056320ad4babdb24cacc0ad Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Fri, 26 Jan 2024 19:50:01 +0000
Subject: [PATCH 108/112] removed rtl custom-op test

---
 .../test_fpgadataflow_mvau_rtl.py             | 174 ------------------
 1 file changed, 174 deletions(-)
 delete mode 100644 tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py

diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
deleted file mode 100644
index 1e9de44fb2..0000000000
--- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py
+++ /dev/null
@@ -1,174 +0,0 @@
-# Copyright (C) 2022, Advanced Micro Devices, Inc.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import pytest
-
-import numpy as np
-import os
-import pickle
-from onnx import TensorProto, helper
-from qonnx.core.datatype import DataType
-from qonnx.core.modelwrapper import ModelWrapper
-from qonnx.custom_op.registry import getCustomOp
-from qonnx.transformation.general import ApplyConfig, GiveUniqueNodeNames, GiveReadableTensorNames
-from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
-
-import finn.core.onnx_exec as oxe
-import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
-import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl
-from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
-from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
-
-
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-
-build_dir = os.environ["FINN_BUILD_DIR"]
-
-
-def make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W):
-    matmul_node = helper.make_node("MatMul", ["ifm", "weights"], ["ofm"])
-    graph = helper.make_graph(nodes=[matmul_node], name="matmul_graph", inputs=[ifm], outputs=[ofm])
-
-    model = qonnx_make_model(graph, producer_name="fclayer-model")
-    model = ModelWrapper(model)
-
-    model.set_tensor_datatype("ifm", idt)
-    model.set_tensor_datatype("weights", wdt)
-    model.set_tensor_datatype(
-        "ofm", DataType["INT32"]
-    )  # At this step, the MatMul layer does not optimize the bit-width of the output datatype
-    model.set_initializer("weights", W)
-    # model.set_tensor_layout("ifm", DataLayout.NHWC)
-
-    return model
-
-
-def prepare_inputs(input_tensor):
-    return {"global_in": input_tensor}
-
-
-# @pytest.mark.parametrize("mh", [36])
-# @pytest.mark.parametrize("mw", [256])
-@pytest.mark.parametrize("mh", [9])
-@pytest.mark.parametrize("mw", [36])
-# @pytest.mark.parametrize("pe", [1, 4, 9, 36])
-# @pytest.mark.parametrize("simd", [1, 4, 16, 64, 256])
-@pytest.mark.parametrize("pe", [1, 3, 9])
-@pytest.mark.parametrize("simd", [1, 3, 6, 18, 36])
-@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
-@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT8"]])
-# @pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e"])
-@pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S"])
-@pytest.mark.parametrize("clk_ns", [1.66, 4])
-@pytest.mark.fpgadataflow
-@pytest.mark.slow
-@pytest.mark.vivado
-def test_fpgadataflow_mvau_rtl(
-    mh, mw, pe, simd, idt, wdt, part, clk_ns
-):
-    if part == "xcku3p-ffva676-1-e" and clk_ns != 1.66:
-        pytest.skip("Skip test for varying clk for devices other than Versal, since this variable doesn't change anything for this test")
-
-    # Create test input vector (produced by SWG)
-    ofm_shape = (5, 5)
-    ofm_h, ofm_w = ofm_shape
-    ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw])
-    ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh))
-    W = gen_finn_dt_tensor(wdt, (mw, mh))
-    model = make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W)
-    model = model.transform(GiveUniqueNodeNames())
-    model = model.transform(GiveReadableTensorNames())
-
-    model.save(build_dir + "/matmul.onnx")
-
-    # Create MatMul & obtain golden reference output
-    A = gen_finn_dt_tensor(model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in"))
-    input_dict = prepare_inputs(A)
-
-    # Execute ONNX model
-    output_matmul = oxe.execute_onnx(model, input_dict)["global_out"]
-
-    with open(build_dir + "/onnx_output.pkl", "wb") as f:
-        pickle.dump(output_matmul, f)
-
-    # Create MVAU (HLS)
-    model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled"))
-    model = model.transform(GiveUniqueNodeNames())
-
-    # Apply folding (i.e. specify to use DSPs)
-    folding_config = {
-        "Defaults": {},
-        "MatrixVectorActivation_0": {
-            "PE": pe,
-            "SIMD": simd,
-            "mem_mode": "decoupled",
-            "ram_style": "auto",
-            "resType": "dsp",
-            "preferred_backend" : "rtl"
-        },
-    }
-    model = model.transform(ApplyConfig(folding_config))
-    model.save(build_dir + "/mvau_hls.onnx")
-
-    # Apply convert-to-rtl step
-    model = model.transform(to_rtl.InferRTLMatrixVectorActivation())
-    model = model.transform(GiveUniqueNodeNames())
-    model.save(build_dir + "/mvau_rtl.onnx")
-
-    # Reset rtlsim_so and ip-related paths such that new Pyverilator SO and IP is generated
-    for n in model.graph.node:
-        getCustomOp(n).set_nodeattr("rtlsim_trace", build_dir + "/mvu_trace_rtl_nodebynode.vcd")
-    
-    model = model.transform(SetExecMode("rtlsim"))
-    model = model.transform(PrepareIP(part, clk_ns))
-    model = model.transform(HLSSynthIP())
-    model = model.transform(PrepareRTLSim())
-    output_mvau_rtl = oxe.execute_onnx(model, input_dict)["global_out"]
-
-    with open(build_dir + "/mvau_rtl_output.pkl", "wb") as f:
-        pickle.dump(output_mvau_rtl, f)
-
-    model.save(build_dir + "/mvau_rtl_sim.onnx")
-    assert (output_matmul == output_mvau_rtl).all(), "Output of ONNX model not matching output of node-by-node sim!"
-
-    model = model.transform(InsertAndSetFIFODepths(part, clk_ns))
-    model = model.transform(PrepareIP(part, clk_ns))
-    model = model.transform(HLSSynthIP())
-    model = model.transform(CreateStitchedIP(part, clk_ns))
-
-    os.environ["RTLSIM_TRACE_DEPTH"] = "3"
-    model.set_metadata_prop("rtlsim_so", "")
-    model.set_metadata_prop("exec_mode", "rtlsim")
-    model.set_metadata_prop("rtlsim_trace", build_dir + "/mvu_trace_rtl_stitch.vcd")
-    model.save(build_dir + "/stitched_ip.onnx")
-    output_mvau_rtl_stitch = oxe.execute_onnx(model, input_dict)["global_out"]
-
-    assert (output_matmul == output_mvau_rtl_stitch).all(), "Output of ONNX model not matching output of stitched-IP RTL model!"
\ No newline at end of file

From 911239c09fe0a262e745c9cb4d91478da2315d79 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 1 Feb 2024 14:34:15 +0000
Subject: [PATCH 109/112] [vvau hls]: add custom op to dict

---
 src/finn/custom_op/fpgadataflow/hls/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py
index 1f1448b9fc..ebb5ce98da 100644
--- a/src/finn/custom_op/fpgadataflow/hls/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py
@@ -51,6 +51,7 @@
 from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls
 from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls
 from finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls import MatrixVectorActivation_hls
+from finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls import VectorVectorActivation_hls
 
 custom_op = dict()
 
@@ -76,4 +77,5 @@
 custom_op["Thresholding_hls"] = Thresholding_hls
 custom_op["TLastMarker_hls"] = TLastMarker_hls
 custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls
-custom_op["MatrixVectorActivation_hls"] = MatrixVectorActivation_hls
\ No newline at end of file
+custom_op["MatrixVectorActivation_hls"] = MatrixVectorActivation_hls
+custom_op["VectorVectorActivation_hls"] = VectorVectorActivation_hls
\ No newline at end of file

From b1ee54098d221c73f8d6491826338b1847489038 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 1 Feb 2024 14:35:56 +0000
Subject: [PATCH 110/112] [vvu hw-op]: refactored hw custom-op VVAU

---
 .../fpgadataflow/vectorvectoractivation.py    | 1196 ++++++-----------
 1 file changed, 423 insertions(+), 773 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
index 891730ece3..2168474298 100644
--- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
@@ -38,17 +38,21 @@
     roundup_to_integer_multiple,
 )
 
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
     numpy_to_hls_code,
     pack_innermost_dim_as_hex_string,
     rtlsim_output_to_npy,
 )
+import onnx.numpy_helper as np_helper
+import qonnx.custom_op.general.xnorpopcount as xp
+from qonnx.custom_op.general.multithreshold import multithreshold
 
 
-class VectorVectorActivation(HLSCustomOp):
-    """Class that corresponds to finn-hlslib Vector_Vector_Activate_Batch function"""
+
+class VectorVectorActivation(HWCustomOp):
+    """Abstraction layer for HW implementation of VectorVectorActivation layers."""
 
     def __init__(self, onnx_node, **kwargs):
         super().__init__(onnx_node, **kwargs)
@@ -100,6 +104,10 @@ def get_nodeattr_types(self):
             # use xnor-popcount for binary weights/inputs, thus treating them
             # as bipolar
             "binaryXnorMode": ("i", False, 0, {0, 1}),
+            # Backend implementation for layer
+            # hls -- Vivado HLS
+            # rtl -- (System)Verilog
+            "preferred_impl_style": ("s", False, "hls", {"hls", "rtl"}),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -107,124 +115,55 @@ def get_nodeattr_types(self):
     def base_op_type(self):
         return "VectorVectorActivation"
 
-    def minimize_accumulator_width(self, model):
-        """Minimize the accumulator bit width according to the weight values,
-        input data types, and size of dot product"""
-        weights = model.get_initializer(self.onnx_node.input[1])
-        k_h, k_w = self.get_nodeattr("Kernel")
-        fm = self.get_nodeattr("Channels")
-        # put weights into the shape expected by calculate_matvec_accumulator_range
-        weights = weights.reshape(fm, k_h * k_w).transpose()
-        # since in the calculation the values of the weight matrix are used,
-        # for the bipolar case they need to be converted to bipolar
-        if self.get_nodeattr("binaryXnorMode"):
-            weights = 2 * weights - 1
-        if len(self.onnx_node.input) > 2:
-            thresholds = model.get_initializer(self.onnx_node.input[2])
-        else:
-            thresholds = None
-        idt = self.get_input_datatype()
-
-        (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt)
-        # if runtime-writeable weights, then the values of the weights can
-        # change and we need to use the worst-case values from the datatypes
-        if self.get_nodeattr("runtime_writeable_weights"):
-            wdt = self.get_weight_datatype()
-            lower_worst = wdt.min() * np.ones_like(weights)
-            lower_range = calculate_matvec_accumulator_range(lower_worst, idt)
-            upper_worst = wdt.max() * np.ones_like(weights)
-            upper_range = calculate_matvec_accumulator_range(upper_worst, idt)
-            acc_min = min(min(lower_range), min(upper_range))
-            acc_max = max(max(upper_range), max(upper_range))
-
-        # if the thresholds can be used to determine range, then adjust the range
-        # according to the known values of the thresholds
-        if thresholds is not None:
-            threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
-            # set threshold datatype (and accumulator datatype implicitly)
-            min_threshold = thresholds.min()
-            max_threshold = thresholds.max()
-            # clip threshold values
-            if max_threshold > acc_max or min_threshold < acc_min:
-                warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name)
-                thresholds = np.clip(thresholds, acc_min, acc_max)
-                model.set_initializer(self.onnx_node.input[2], thresholds)
-                threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
-                min_threshold = thresholds.min()
-                max_threshold = thresholds.max()
-            acc_min = min(min_threshold, acc_min)
-            acc_max = max(max_threshold, acc_max)
+    def _infer_sparse_weight_tensor(self, W_conv, k_h, k_w, channels):
+        W_sparse = np.zeros((channels, channels, k_h, k_w), dtype=np.float32)
+        for ch in range(channels):
+            W_sparse[ch][ch] = W_conv[ch][0]
+        W_conv = W_sparse.astype(np.float32)
+        W_matmul = W_conv.transpose(0, 2, 3, 1)
+        W_matmul = W_matmul.reshape(channels, channels * k_h * k_w)
+        W_matmul = W_matmul.T
+        return W_matmul
 
-        # if the acc_range is always greater than 0, then acc_max <= 2^P - 1
-        if acc_min >= 0:
-            acc_bit_width = np.log2(acc_max + 1)
-            acc_bit_width = math.ceil(acc_bit_width)
-            adt = DataType[f"UINT{acc_bit_width}"]
-        # if the acc_range is signed, then acc_min >= -2^{P-1} and acc_max <=
-        # 2^{P - 1} - 1, which means 2^{P - 1} >= max(-acc_min, 1 + acc_max)
+    def execute_node(self, context, graph):
+        node = self.onnx_node
+        in_act = context[node.input[0]]
+        (_, dim_h, dim_w, _) = in_act.shape
+        (k_h, k_w) = self.get_nodeattr("Kernel")
+        channels = self.get_nodeattr("Channels")
+        # Reorder the innermost axis from (channel, kernel) to (kernel, channel) order
+        in_act = in_act.reshape(1, dim_h, dim_w, channels, k_h*k_w)
+        in_act = in_act.transpose(0, 1, 2, 4, 3)
+        in_act = in_act.reshape(1, dim_h, dim_w, channels*k_h*k_w)
+        # Look up the weight initializer and expand it to a sparse matmul matrix
+        vvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0]
+        vvau_w = np_helper.to_array(vvau_w_init)
+        vvau_w_onnx = self._infer_sparse_weight_tensor(vvau_w, k_h, k_w, channels)
+
+        if self.get_nodeattr("inputDataType") == "BIPOLAR" and self.get_nodeattr("weightDataType") == "BIPOLAR":
+            result = np.matmul(in_act, vvau_w_onnx)
+            result = (result + k_h*k_w) / 2
         else:
-            _acc_max = max(-acc_min, 1 + acc_max)
-            acc_bit_width = np.log2(_acc_max) + 1
-            acc_bit_width = math.ceil(acc_bit_width)
-            adt = DataType[f"INT{acc_bit_width}"]
-
-        # if activation, assert that the thresholds can be expressed with adt
-        if thresholds is not None:
-            assert np.vectorize(adt.allowed)(
-                threshold_tensor
-            ).all(), "Thresholds in %s can't be expressed with type %s" % (
-                self.onnx_node.name,
-                str(adt),
-            )
-
-        # if no activation, output and accumulator datatypes are the same
-        if self.get_nodeattr("noActivation"):
-            # if this is the last node in the graph, then ensure the datatype is
-            # divisibly by 8 bits
-            if model.find_direct_successors(self.onnx_node) is None:
-                bw = roundup_to_integer_multiple(adt.bitwidth(), 8)
-                new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw))
-                adt = DataType[new_adt_name]
-            # for no-activation nodes, output dt = acc dt
-            self.set_nodeattr("outputDataType", adt.name)
-        self.set_nodeattr("accDataType", adt.name)
-
-        return DataType[self.get_nodeattr("accDataType")]
-
-    def minimize_weight_bit_width(self, model):
-        """Minimize the bit width based on the values of the weights"""
-        if not self.get_nodeattr("runtime_writeable_weights"):
-            weights = model.get_initializer(self.onnx_node.input[1])
-            w_min = weights.min()
-            w_max = weights.max()
-            if w_min < 0:
-                if abs(w_min) > w_max:
-                    wdt = DataType.get_smallest_possible(w_min)
-                else:
-                    wdt = DataType.get_smallest_possible(-w_max - 1)
-            else:
-                wdt = DataType.get_smallest_possible(w_max)
-            self.set_nodeattr("weightDataType", wdt.name)
-        return DataType[self.get_nodeattr("weightDataType")]
-
-    def calc_wmem(self):
-        """Calculates and returns WMEM."""
-        ch = self.get_nodeattr("Channels")
-        k_h, k_w = self.get_nodeattr("Kernel")
-        pe = self.get_nodeattr("PE")
-        simd = self.get_nodeattr("SIMD")
-        wmem = (k_h * k_w * ch // pe) // simd
-        return wmem
+            result = np.matmul(in_act, vvau_w_onnx) # result is in [N, H, W, C] format
 
-    def calc_tmem(self):
-        """Calculates and returns TMEM."""
-        if self.get_nodeattr("noActivation") == 1:
-            return 0
-        else:
-            ch = self.get_nodeattr("Channels")
-            pe = self.get_nodeattr("PE")
-            return ch // pe
+        if self.get_nodeattr("noActivation") == 0:
+            vvau_thr_init = [x for x in graph.initializer if x.name == node.input[2]][0]
+            vvau_thr = np_helper.to_array(vvau_thr_init)
+            odt_is_bipolar = self.get_nodeattr("outputDataType") == DataType["BIPOLAR"]
+            out_scale = 2 if odt_is_bipolar else 1
+            out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal")
+            # NHWC to NCHW for multithreshold node
+            result = result.transpose((0,3,1,2))
+            result = multithreshold(result, vvau_thr, out_scale, out_bias)
+            # NCHW to NHWC
+            result = result.transpose((0,2,3,1))
+        
+        # for i in range(self.get_nodeattr("Channels")):
+        context[node.output[0]] = result
 
+    def verify_node(self):
+        pass
+  
     def make_shape_compatible_op(self, model):
         oshape = self.get_normal_output_shape()
         return super().make_const_shape_op(oshape)
@@ -244,9 +183,6 @@ def infer_node_datatype(self, model):
         odt = self.get_output_datatype()
         model.set_tensor_datatype(node.output[0], odt)
 
-    def verify_node(self):
-        pass
-
     def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
@@ -269,12 +205,32 @@ def get_instream_width(self, ind=0):
         pe = self.get_nodeattr("PE")
         in_width = i_bits * simd * pe
         return in_width
+    
+    def get_weightstream_width(self):
+        """Returns weight stream width; 0 unless mem_mode is decoupled or external."""
+        if (
+            self.get_nodeattr("mem_mode") == "decoupled"
+            or self.get_nodeattr("mem_mode") == "external"
+        ):
+            simd = self.get_nodeattr("SIMD")
+            pe = self.get_nodeattr("PE")
+            wp = self.get_weight_datatype().bitwidth()
+            w_width = simd * pe * wp
+            return w_width
+        else:
+            return 0
 
     def get_outstream_width(self, ind=0):
         o_bits = self.get_output_datatype().bitwidth()
         out_width = o_bits * self.get_nodeattr("PE")
         return out_width
 
+    def get_weightstream_width_padded(self):
+        """Returns weight stream width padded to a multiple of 8. This is required
+        by the AXI Stream spec. Used in decoupled mode."""
+        weight_width = self.get_weightstream_width()
+        return roundup_to_integer_multiple(weight_width, 8)
+
     def get_folded_input_shape(self, ind=0):
         k_h, k_w = self.get_nodeattr("Kernel")
         dim_h, dim_w = self.get_nodeattr("Dim")
@@ -323,88 +279,302 @@ def get_number_output_values(self):
         nf = np.prod(self.get_folded_output_shape()[:-1])
         return nf
 
-    def get_exp_cycles(self):
-        pe = self.get_nodeattr("PE")
-        simd = self.get_nodeattr("SIMD")
+    def calc_wmem(self):
+        """Calculates and returns WMEM."""
         ch = self.get_nodeattr("Channels")
-        dim_h, dim_w = self.get_nodeattr("Dim")
         k_h, k_w = self.get_nodeattr("Kernel")
-        # currently FINN supports for vvau a batch size of 1
-        batch_size = 1
-        # since mmv != 1 is not supported yet, we set mmv for now to 1
-        mmv = 1
-        exp_cycles = ((ch * k_h * k_w) / pe / simd) * batch_size * (dim_h * dim_w) / mmv
-        return int(exp_cycles)
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        wmem = (k_h * k_w * ch // pe) // simd
+        return wmem
 
-    def get_template_param_values(self):
-        """Returns the template parameter values according to input, output and weight
-        data types."""
-        ret = dict()
-        inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
-        out_hls_str = self.get_output_datatype().get_hls_datatype_str()
-        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
-        # out_is_binary = self.get_output_datatype() == DataType["BINARY"]
-        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
-        bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
-        if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode):
-            raise Exception("True binary (non-bipolar) inputs not yet supported")
-        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
-        # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"]
-        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
-        # reinterpret inp/wt as bipolar if bin_xnor_mode is iset
-        inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
-        wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
-        # fill in TSrcI and TWeightI
-        # TODO check these with Giulio
-        # TODO handle non-bipolar binary inputs
-        if inp_is_bipolar and wt_is_bipolar:
-            ret["TSrcI"] = "Recast<XnorMul>"
-            ret["TWeightI"] = "Identity"
-        elif (not inp_is_bipolar) and wt_is_bipolar:
-            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
-            ret["TWeightI"] = "Recast<Binary>"
-        elif inp_is_bipolar and (not wt_is_bipolar):
-            ret["TSrcI"] = "Recast<Binary>"
-            ret["TWeightI"] = "Identity"
-        elif (not inp_is_bipolar) and (not wt_is_bipolar):
-            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
-            ret["TWeightI"] = "Identity"
+    def calc_tmem(self):
+        """Calculates and returns TMEM."""
+        if self.get_nodeattr("noActivation") == 1:
+            return 0
+        else:
+            ch = self.get_nodeattr("Channels")
+            pe = self.get_nodeattr("PE")
+            return ch // pe
 
-        # fill in TDstI
-        ret["TDstI"] = "Slice<%s>" % out_hls_str
+    def uram_estimation(self):
+        P = self.get_nodeattr("PE")
+        Q = self.get_nodeattr("SIMD")
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        omega = self.calc_wmem()
+        mem_width = Q * W * P
+        mmode = self.get_nodeattr("mem_mode")
+        mstyle = self.get_nodeattr("ram_style")
+        if (
+            (mmode == "decoupled" and mstyle != "ultra")
+            or (mmode == "const")
+            or (mmode == "external")
+        ):
+            return 0
+        width_multiplier = math.ceil(mem_width / 72)
+        depth_multiplier = math.ceil(omega / 4096)
+        return width_multiplier * depth_multiplier
 
-        return ret
+    def bram_estimation(self):
+        """Calculates resource estimation for BRAM"""
+        # TODO add in/out FIFO contributions
+        P = self.get_nodeattr("PE")
+        Q = self.get_nodeattr("SIMD")
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        omega = self.calc_wmem()
+        mem_width = Q * W * P
+        # assuming SDP mode RAMB18s (see UG573 Table 1-10)
+        # since this is HLS memory, not using the full width of a BRAM
+        # assuming memories up to 128 deep get implemented in LUTs
+        mmode = self.get_nodeattr("mem_mode")
+        mstyle = self.get_nodeattr("ram_style")
+        if (
+            (mmode == "decoupled" and mstyle in ["distributed", "ultra"])
+            or (mstyle == "auto" and self.calc_wmem() <= 128)
+            or (mmode == "const" and self.calc_wmem() <= 128)
+            or (mmode == "external")
+        ):
+            return 0
 
-    def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
-        pe = self.get_nodeattr("PE")
-        simd = self.get_nodeattr("SIMD")
-        ch = self.get_nodeattr("Channels")
-        k_h, k_w = self.get_nodeattr("Kernel")
-        wmem = self.calc_wmem()
-        assert orig_weight_matrix.shape == (
-            ch,
-            1,
-            k_h,
-            k_w,
-        ), """Weights matrix doesn't
-        have expected shape (channels, 1, kernel_size, kernel_size)"""
-        ret = orig_weight_matrix
-        if self.get_weight_datatype() == DataType["BIPOLAR"]:
-            # convert bipolar to binary
-            ret = (ret + 1) / 2
-        ret = ret.reshape(ch, k_h * k_w)
-        # distribute rows between PEs
-        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
-        ret = ret.reshape(1, pe, wmem, simd)
-        return ret
+        if mem_width == 1:
+            return math.ceil(omega / 16384)
+        elif mem_width == 2:
+            return math.ceil(omega / 8192)
+        elif mem_width <= 4:
+            return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4))
+        elif mem_width <= 9:
+            return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 8))
+        elif mem_width <= 18 or omega > 512:
+            return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 16))
+        else:
+            return (math.ceil(omega / 512)) * (math.ceil(mem_width / 32))
 
-    def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
-        """Convert the original numpy weight matrix orig_weight_matrix into
-        a form suitable for passing to the hlslib call:
-        * ensure MH % PE == 0
-        * for bipolar weights&inputs, ensure thresholds are positive
-        * interleave rows between PEs
-        * reshape into (PE, TMEM, n_thres_steps) and return
+    def bram_efficiency_estimation(self):
+        P = self.get_nodeattr("PE")
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        omega = self.calc_wmem()
+        bram16_est = self.bram_estimation()
+        if bram16_est == 0:
+            return 1
+        wbits = W * P * omega
+        bram16_est_capacity = bram16_est * 36 * 512
+        return wbits / bram16_est_capacity
+
+    def uram_efficiency_estimation(self):
+        """Function for URAM efficiency estimation: actual parameter storage
+        needed divided by the allocated URAM storage (from estimation)"""
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        D_in = int(np.prod(self.get_nodeattr("Kernel")))
+        D_out = self.get_nodeattr("Channels")
+        uram_est = self.uram_estimation()
+        if uram_est == 0:
+            return 1
+        wbits = W * D_in * D_out
+        uram_est_capacity = uram_est * 72 * 4096
+        return wbits / uram_est_capacity
+
+    def lut_estimation(self):
+        """Calculates resource estimations for LUTs based on:
+        - FINN-R: An End-to-End Deep-Learning Framework for Fast
+        Exploration of Quantized Neural Networks
+        - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
+        Y. Umuroglu, M. Leeser and K. Vissers
+        - 12. Sep 2018
+        """
+        # TODO add in/out FIFO contributions
+        P = self.get_nodeattr("PE")
+        Q = self.get_nodeattr("SIMD")
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        # determine tdt with input and weight data types
+        idt = self.get_input_datatype()
+        A = idt.bitwidth()
+        # parameters from experiments in paper mentioned above
+        c0 = 300
+        c1 = 1.1
+        c2 = 0
+        mmode = self.get_nodeattr("mem_mode")
+        mstyle = self.get_nodeattr("ram_style")
+        if (mmode == "decoupled" and mstyle == "distributed") or (
+            mmode == "const" and self.calc_wmem() <= 128
+        ):
+            c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64)
+
+        # multiplication
+        res_type = self.get_nodeattr("resType")
+        if res_type == "dsp":
+            mult_luts = 0
+        else:
+            mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A)
+        # adder tree
+        addertree_luts = (W + A) * (2 * Q - 1)
+        # accumulator
+        acc_datatype = self.get_accumulator_datatype()
+        acc_bits = acc_datatype.bitwidth()
+        k_h, k_w = self.get_nodeattr("Kernel")
+        # if accDataType is not set, then it will default to INT32, which would
+        # be a large overestimate in most (if not all) cases. In this scenario,
+        # we would use the minimum accumulator as determined by the data types
+        # bound, derived in https://arxiv.org/abs/2301.13376
+        alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed())
+        acc_bits = min(
+            acc_datatype.bitwidth(),
+            np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1),
+        )
+        acc_luts = acc_bits
+        # thresholds and threshold comparators
+        thr_luts = 0
+        comp_luts = 0
+        noact = self.get_nodeattr("noActivation")
+        # TODO - add 'ram_style_threshold' node attribute
+        if noact == 0:
+            odt = self.get_output_datatype()
+            B = odt.bitwidth()
+            thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64
+            comp_luts = (2**B - 1) * acc_bits
+
+        return int(
+            c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2
+        )
+
+    def dsp_estimation(self):
+        # multiplication
+        P = self.get_nodeattr("PE")
+        res_type = self.get_nodeattr("resType")
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        idt = self.get_input_datatype()
+        A = idt.bitwidth()
+        if res_type == "dsp":
+            mult_dsp = P * np.ceil((W + A) / 48)  # TODO: more accurate modelling
+        else:
+            mult_dsp = 0
+        return int(mult_dsp)
+
+    def get_exp_cycles(self):
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        ch = self.get_nodeattr("Channels")
+        dim_h, dim_w = self.get_nodeattr("Dim")
+        k_h, k_w = self.get_nodeattr("Kernel")
+        # currently FINN supports for vvau a batch size of 1
+        batch_size = 1
+        # since mmv != 1 is not supported yet, we set mmv for now to 1
+        mmv = 1
+        exp_cycles = ((ch * k_h * k_w) / pe / simd) * batch_size * (dim_h * dim_w) / mmv
+        return int(exp_cycles)
+
+    def minimize_accumulator_width(self, model):
+        """Minimize the accumulator bit width according to the weight values,
+        input data types, and size of dot product. Returns the new accDataType."""
+        weights = model.get_initializer(self.onnx_node.input[1])
+        k_h, k_w = self.get_nodeattr("Kernel")
+        fm = self.get_nodeattr("Channels")
+        # put weights into the shape expected by calculate_matvec_accumulator_range
+        weights = weights.reshape(fm, k_h * k_w).transpose()
+        # since in the calculation the values of the weight matrix are used,
+        # for the bipolar case they need to be converted to bipolar
+        if self.get_nodeattr("binaryXnorMode"):
+            weights = 2 * weights - 1
+        if len(self.onnx_node.input) > 2:
+            thresholds = model.get_initializer(self.onnx_node.input[2])
+        else:
+            thresholds = None
+        idt = self.get_input_datatype()
+
+        (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt)
+        # if runtime-writeable weights, then the values of the weights can
+        # change and we need to use the worst-case values from the datatypes
+        if self.get_nodeattr("runtime_writeable_weights"):
+            wdt = self.get_weight_datatype()
+            lower_worst = wdt.min() * np.ones_like(weights)
+            lower_range = calculate_matvec_accumulator_range(lower_worst, idt)
+            upper_worst = wdt.max() * np.ones_like(weights)
+            upper_range = calculate_matvec_accumulator_range(upper_worst, idt)
+            acc_min = min(min(lower_range), min(upper_range))
+            acc_max = max(max(lower_range), max(upper_range))
+
+        # if the thresholds can be used to determine range, then adjust the range
+        # according to the known values of the thresholds
+        if thresholds is not None:
+            threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+            # set threshold datatype (and accumulator datatype implicitly)
+            min_threshold = thresholds.min()
+            max_threshold = thresholds.max()
+            # clip threshold values
+            if max_threshold > acc_max or min_threshold < acc_min:
+                warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name)
+                thresholds = np.clip(thresholds, acc_min, acc_max)
+                model.set_initializer(self.onnx_node.input[2], thresholds)
+                threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+                min_threshold = thresholds.min()
+                max_threshold = thresholds.max()
+            acc_min = min(min_threshold, acc_min)
+            acc_max = max(max_threshold, acc_max)
+
+        # if the acc_range is always greater than 0, then acc_max <= 2^P - 1
+        if acc_min >= 0:
+            acc_bit_width = np.log2(acc_max + 1)
+            acc_bit_width = math.ceil(acc_bit_width)
+            adt = DataType[f"UINT{acc_bit_width}"]
+        # if the acc_range is signed, then acc_min >= -2^{P-1} and acc_max <=
+        # 2^{P - 1} - 1, which means 2^{P - 1} >= max(-acc_min, 1 + acc_max)
+        else:
+            _acc_max = max(-acc_min, 1 + acc_max)
+            acc_bit_width = np.log2(_acc_max) + 1
+            acc_bit_width = math.ceil(acc_bit_width)
+            adt = DataType[f"INT{acc_bit_width}"]
+
+        # if activation, assert that the thresholds can be expressed with adt
+        if thresholds is not None:
+            assert np.vectorize(adt.allowed)(
+                threshold_tensor
+            ).all(), "Thresholds in %s can't be expressed with type %s" % (
+                self.onnx_node.name,
+                str(adt),
+            )
+
+        # if no activation, output and accumulator datatypes are the same
+        if self.get_nodeattr("noActivation"):
+            # if this is the last node in the graph, then ensure the datatype is
+            # divisible by 8 bits
+            if model.find_direct_successors(self.onnx_node) is None:
+                bw = roundup_to_integer_multiple(adt.bitwidth(), 8)
+                new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw))
+                adt = DataType[new_adt_name]
+            # for no-activation nodes, output dt = acc dt
+            self.set_nodeattr("outputDataType", adt.name)
+        self.set_nodeattr("accDataType", adt.name)
+
+        return DataType[self.get_nodeattr("accDataType")]
+
+    def minimize_weight_bit_width(self, model):
+        """Minimize the bit width based on the values of the weights"""
+        if not self.get_nodeattr("runtime_writeable_weights"):
+            weights = model.get_initializer(self.onnx_node.input[1])
+            w_min = weights.min()
+            w_max = weights.max()
+            if w_min < 0:
+                if abs(w_min) > w_max:
+                    wdt = DataType.get_smallest_possible(w_min)
+                else:
+                    wdt = DataType.get_smallest_possible(-w_max - 1)
+            else:
+                wdt = DataType.get_smallest_possible(w_max)
+            self.set_nodeattr("weightDataType", wdt.name)
+        return DataType[self.get_nodeattr("weightDataType")]
+
+    def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
+        """Convert the original numpy threshold matrix orig_thres_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure MH % PE == 0
+        * for bipolar weights&inputs, ensure thresholds are positive
+        * interleave rows between PEs
+        * reshape into (PE, TMEM, n_thres_steps) and return
         """
         ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
@@ -449,6 +619,29 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
         rows between PEs is not as expected (n_thres_steps)"""
         return ret.reshape(1, pe, tmem, n_thres_steps)
 
+    def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        ch = self.get_nodeattr("Channels")
+        k_h, k_w = self.get_nodeattr("Kernel")
+        wmem = self.calc_wmem()
+        assert orig_weight_matrix.shape == (
+            ch,
+            1,
+            k_h,
+            k_w,
+        ), """Weights matrix doesn't
+        have expected shape (channels, 1, kernel_size, kernel_size)"""
+        ret = orig_weight_matrix
+        if self.get_weight_datatype() == DataType["BIPOLAR"]:
+            # convert bipolar to binary
+            ret = (ret + 1) / 2
+        ret = ret.reshape(ch, k_h * k_w)
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        ret = ret.reshape(1, pe, wmem, simd)
+        return ret
+
     def make_weight_file(self, weights, weight_file_mode, weight_file_name):
         """Produce a file containing given weights in appropriate format for this
         layer. This file can be used for either synthesis or run-time reconfig
@@ -626,384 +819,44 @@ def generate_params(self, model, path):
                 f_thresh.write(thresholds_hls_code)
                 f_thresh.close()
 
-    def execute_node(self, context, graph):
-        mode = self.get_nodeattr("exec_mode")
-        mem_mode = self.get_nodeattr("mem_mode")
-        node = self.onnx_node
-
-        # TODO ensure codegen dir exists
-        if mode == "cppsim":
-            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        elif mode == "rtlsim":
-            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        else:
-            raise Exception(
-                """Invalid value for attribute exec_mode! Is currently set to: {}
-            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
-                    mode
-                )
-            )
-
-        # create a npy file fore each input of the node (in_ind is input index)
-        in_ind = 0
-        for inputs in node.input:
-            # it is assumed that the first input of the node is the data input
-            # the second input are the weights
-            # the third input are the thresholds
-            if in_ind == 0:
-                assert (
-                    str(context[inputs].dtype) == "float32"
-                ), """Input datatype is
-                not float32 as expected."""
-                expected_inp_shape = self.get_folded_input_shape()
-                reshaped_input = context[inputs].reshape(expected_inp_shape)
-                if self.get_input_datatype() == DataType["BIPOLAR"]:
-                    # store bipolar activations as binary
-                    reshaped_input = (reshaped_input + 1) / 2
-                    export_idt = DataType["BINARY"]
-                else:
-                    export_idt = self.get_input_datatype()
-                # make copy before saving the array
-                reshaped_input = reshaped_input.copy()
-                np.save(
-                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
-                    reshaped_input,
-                )
-            elif in_ind > 2:
-                raise Exception("Unexpected input found for VectorVectorActivation")
-            in_ind += 1
-
-        if mode == "cppsim":
-            # execute the precompiled model
-            super().exec_precompiled_singlenode_model()
-            # load output npy file
-            super().npy_to_dynamic_output(context)
-            # reinterpret binary output as bipolar where needed
-            if self.get_output_datatype() == DataType["BIPOLAR"]:
-                out = context[node.output[0]]
-                out = 2 * out - 1
-                context[node.output[0]] = out
-            assert (
-                context[node.output[0]].shape == self.get_normal_output_shape()
-            ), "cppsim did not produce expected output shape"
-        elif mode == "rtlsim":
-            sim = self.get_rtlsim()
-            nbits = self.get_instream_width()
-            inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
-            super().reset_rtlsim(sim)
-            super().toggle_clk(sim)
-
-            if mem_mode == "external" or mem_mode == "decoupled":
-                wnbits = self.get_weightstream_width()
-                export_wdt = self.get_weight_datatype()
-                # we have converted bipolar weights to binary for export,
-                # so use it as such for weight generation
-                if self.get_weight_datatype() == DataType["BIPOLAR"]:
-                    export_wdt = DataType["BINARY"]
-                wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits)
-                dim_h, dim_w = self.get_nodeattr("Dim")
-                num_w_reps = dim_h * dim_w
-
-                io_dict = {
-                    "inputs": {"in0": inp, "weights": wei * num_w_reps},
-                    "outputs": {"out": []},
-                }
-                self.rtlsim_multi_io(sim, io_dict)
-                output = io_dict["outputs"]["out"]
-            else:
-                output = self.rtlsim(sim, inp)
-            odt = self.get_output_datatype()
-            target_bits = odt.bitwidth()
-            packed_bits = self.get_outstream_width()
-            out_npy_path = "{}/output.npy".format(code_gen_dir)
-            out_shape = self.get_folded_output_shape()
-            rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits)
-
-            # load and reshape output
-            output = np.load(out_npy_path)
-            oshape = self.get_normal_output_shape()
-            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
-            context[node.output[0]] = output
-        else:
-            raise Exception(
-                """Invalid value for attribute exec_mode! Is currently set to: {}
-            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
-                    mode
-                )
-            )
-
-    def global_includes(self):
-        self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"']
-        self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"']
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode not in ["const", "decoupled", "external"]:
-            raise Exception(
-                """Please set mem_mode to "const", "decoupled", or "external",
-                currently no other parameter value is supported!"""
-            )
-        if self.calc_tmem() != 0:
-            self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
-
-    def defines(self, var):
-        dim_h, dim_w = self.get_nodeattr("Dim")
-        numReps = 1 * dim_h * dim_w
+    def get_op_and_param_counts(self):
         k_h, k_w = self.get_nodeattr("Kernel")
-        innerProdDim = k_h * k_w
-        mem_mode = self.get_nodeattr("mem_mode")
-
-        self.code_gen_dict["$DEFINES$"] = [
-            """#define Channels1 {}\n #define InnerProdDim {}\n
-            #define SIMD1 {}\n #define PE1 {}\n #define numReps {}""".format(
-                self.get_nodeattr("Channels"),
-                innerProdDim,
-                self.get_nodeattr("SIMD"),
-                self.get_nodeattr("PE"),
-                numReps,
-            )
-        ]
-        if mem_mode == "decoupled" or mem_mode == "external":
-            wdt = self.get_weight_datatype()
-            self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth()))
-
-    def read_npy_data(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        dtype = self.get_input_datatype()
-        if dtype == DataType["BIPOLAR"]:
-            # use binary for bipolar storage
-            dtype = DataType["BINARY"]
-        elem_bits = dtype.bitwidth()
-        packed_bits = self.get_instream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
-        elem_hls_type = dtype.get_hls_datatype_str()
-        npy_type = "float"
-        npy_in = "%s/input_0.npy" % code_gen_dir
-        self.code_gen_dict["$READNPYDATA$"] = []
-        # note: the innermost dim is reversed for the input
-        self.code_gen_dict["$READNPYDATA$"].append(
-            'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);'
-            % (
-                packed_hls_type,
-                elem_hls_type,
-                elem_bits,
-                npy_type,
-                npy_in,
-                self.hls_sname(),
-            )
-        )
-
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "decoupled" or mem_mode == "external":
-            wdt = self.get_weight_datatype()
-            elem_bits = wdt.bitwidth()
-            packed_bits = self.get_weightstream_width()
-            packed_hls_type = "ap_uint<%d>" % packed_bits
-            elem_hls_type = wdt.get_hls_datatype_str()
-            npy_type = "float"
-            npy_in = "%s/weights.npy" % code_gen_dir
-
-            self.code_gen_dict["$READNPYDATA$"].append(
-                'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);'
-                % (
-                    packed_hls_type,
-                    elem_hls_type,
-                    elem_bits,
-                    npy_type,
-                    npy_in,
-                    self.hls_sname(),
-                )
-            )
-
-    def strm_decl(self):
-        mem_mode = self.get_nodeattr("mem_mode")
-        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> in0_{} ("in0_{}");'.format(
-                self.get_instream_width(), self.hls_sname(), self.hls_sname()
-            )
-        )
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
-                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
-            )
-        )
-        if mem_mode == "decoupled" or mem_mode == "external":
-            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-                'hls::stream<ap_uint<{}>> weights_{} ("weights_{}");'.format(
-                    self.get_weightstream_width(), self.hls_sname(), self.hls_sname()
-                )
-            )
+        fm = self.get_nodeattr("Channels")
+        dim_h, dim_w = self.get_nodeattr("Dim")
+        weight_bits = self.get_weight_datatype().bitwidth()
+        inp_bits = self.get_input_datatype().bitwidth()
+        num_repetitions = int(dim_h * dim_w)
+        mac_count = k_h * k_w * fm * num_repetitions
+        # canonicalize op type: lowest bitwidth operand first s.t.
+        # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types
+        bw1 = min(inp_bits, weight_bits)
+        bw2 = max(inp_bits, weight_bits)
+        mac_op_type = "op_mac_%dbx%db" % (bw1, bw2)
+        weight_param_type = "param_weight_%db" % (weight_bits)
+        weight_count = k_h * k_w * fm
+        ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
+        if self.get_nodeattr("noActivation") == 0:
+            tdt = DataType[self.get_nodeattr("accDataType")]
+            thres_bits = tdt.bitwidth()
+            thres_param_type = "param_threshold_%db" % (thres_bits)
+            thres_count = fm
+            ret_dict[thres_param_type] = thres_count
+        return ret_dict
 
-    def docompute(self):
-        mem_mode = self.get_nodeattr("mem_mode")
-        map_to_hls_mult_style = {
-            "auto": "ap_resource_dflt()",
-            "lut": "ap_resource_lut()",
-            "dsp": "ap_resource_dsp()",
+    def derive_characteristic_fxns(self, period):
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out": []},
         }
-        tmpl_args = self.get_template_param_values()
-        if self.calc_tmem() == 0:
-            odtype_hls_str = self.get_output_datatype().get_hls_datatype_str()
-            threshs = "PassThroughActivation<%s>()" % odtype_hls_str
-        else:
-            threshs = "threshs"
-
-        if mem_mode == "const":
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """Vector_Vector_Activate_Batch<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}>
-                (in0_{}, out_{}, weights, {}, numReps, {});""".format(
-                    tmpl_args["TSrcI"],
-                    tmpl_args["TDstI"],
-                    tmpl_args["TWeightI"],
-                    self.hls_sname(),
-                    self.hls_sname(),
-                    threshs,
-                    map_to_hls_mult_style[self.get_nodeattr("resType")],
-                )
-            ]
-        elif mem_mode == "decoupled" or mem_mode == "external":
-            wdt = self.get_weight_datatype()
-            if wdt == DataType["BIPOLAR"]:
-                export_wdt = DataType["BINARY"]
-            else:
-                export_wdt = wdt
-            wdtype_hls_str = export_wdt.get_hls_datatype_str()
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}, {}>
-                (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format(
-                    "Vector_Vector_Activate_Stream_Batch",
-                    tmpl_args["TSrcI"],
-                    tmpl_args["TDstI"],
-                    tmpl_args["TWeightI"],
-                    wdtype_hls_str,
-                    self.hls_sname(),
-                    self.hls_sname(),
-                    self.hls_sname(),
-                    threshs,
-                    map_to_hls_mult_style[self.get_nodeattr("resType")],
-                )
-            ]
-        else:
-            raise Exception(
-                """Please set mem_mode to "const", "decoupled", or "external",
-                currently no other parameter value is supported!"""
-            )
-
-    def dataoutstrm(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        dtype = self.get_output_datatype()
-        if dtype == DataType["BIPOLAR"]:
-            # use binary for bipolar storage
-            dtype = DataType["BINARY"]
-        elem_bits = dtype.bitwidth()
-        packed_bits = self.get_outstream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
-        elem_hls_type = dtype.get_hls_datatype_str()
-        npy_type = "float"
-        npy_out = "%s/output.npy" % code_gen_dir
-        shape = self.get_folded_output_shape()
-        shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
-
-        # note: the innermost dim is not reversed for the output
-        self.code_gen_dict["$DATAOUTSTREAM$"] = [
-            'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);'
-            % (
-                packed_hls_type,
-                elem_hls_type,
-                elem_bits,
-                npy_type,
-                self.hls_sname(),
-                shape_cpp_str,
-                npy_out,
-            )
-        ]
-
-    def save_as_npy(self):
-        self.code_gen_dict["$SAVEASCNPY$"] = []
-
-    def blackboxfunction(self):
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "const":
-            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-                """void {}(hls::stream<ap_uint<{}>> &in0_{},
-                hls::stream<ap_uint<{}>> &out_{}
-                )""".format(
-                    self.onnx_node.name,
-                    self.get_instream_width(),
-                    self.hls_sname(),
-                    self.get_outstream_width(),
-                    self.hls_sname(),
-                )
-            ]
-        elif mem_mode == "decoupled" or mem_mode == "external":
-            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-                """void {}(
-                    hls::stream<ap_uint<{}>> &in0_{},
-                    hls::stream<ap_uint<{}>> &weights_{},
-                    hls::stream<ap_uint<{}>> &out_{}
-                    )""".format(
-                    self.onnx_node.name,
-                    self.get_instream_width(),
-                    self.hls_sname(),
-                    self.get_weightstream_width(),
-                    self.hls_sname(),
-                    self.get_outstream_width(),
-                    self.hls_sname(),
-                )
-            ]
-        else:
-            raise Exception(
-                """Please set mem_mode to "const" or "decoupled", currently no other
-                    parameter value is supported!"""
-            )
-
-    def pragmas(self):
-        mem_mode = self.get_nodeattr("mem_mode")
-        self.code_gen_dict["$PRAGMAS$"] = [
-            "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
-        ]
-        self.code_gen_dict["$PRAGMAS$"].append(
-            "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
-        )
-        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
-
-        if mem_mode == "const":
-            self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"')
-            # the weight tensor is ap_uint<ch*prec> [PE][WMEM]
-            # partition for parallel access along the PE dimension (dim 1)
-            self.code_gen_dict["$PRAGMAS$"].append(
-                ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1")
-            )
-        elif mem_mode == "decoupled" or mem_mode == "external":
-            self.code_gen_dict["$PRAGMAS$"].append(
-                "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname()
-            )
-        else:
-            raise Exception(
-                """Please set mem_mode to "const", "decoupled", or external,
-                currently no other parameter value is supported!"""
-            )
-
-        if self.calc_tmem() != 0:
-            # TODO find a better way of checking for no pregenerated thresholds
-            self.code_gen_dict["$PRAGMAS$"].append(
-                ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1")
-            )
-            self.code_gen_dict["$PRAGMAS$"].append(
-                ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3")
-            )
-
-    def get_verilog_top_module_intf_names(self):
-        intf_names = super().get_verilog_top_module_intf_names()
         mem_mode = self.get_nodeattr("mem_mode")
-        sname = self.hls_sname()
-        if mem_mode == "external":
-            intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded()))
-        if mem_mode == "decoupled":
-            # only expose axilite interface if attribute is set
-            runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
-            if runtime_writable:
-                intf_names["axilite"] = ["s_axilite"]
-        return intf_names
+        if mem_mode in ["decoupled", "external"]:
+            n_weight_inps = self.calc_wmem()
+            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+            io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)]
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
 
     def code_generation_ipi(self):
         cmd = []
@@ -1111,207 +964,4 @@ def code_generation_ipi(self):
             return super().code_generation_ipi()
         else:
             raise Exception("Unrecognized mem_mode for VectorVectorActivation")
-        return cmd
-
-    def uram_estimation(self):
-        P = self.get_nodeattr("PE")
-        Q = self.get_nodeattr("SIMD")
-        wdt = self.get_weight_datatype()
-        W = wdt.bitwidth()
-        omega = self.calc_wmem()
-        mem_width = Q * W * P
-        mmode = self.get_nodeattr("mem_mode")
-        mstyle = self.get_nodeattr("ram_style")
-        if (
-            (mmode == "decoupled" and mstyle != "ultra")
-            or (mmode == "const")
-            or (mmode == "external")
-        ):
-            return 0
-        width_multiplier = math.ceil(mem_width / 72)
-        depth_multiplier = math.ceil(omega / 4096)
-        return width_multiplier * depth_multiplier
-
-    def bram_estimation(self):
-        """Calculates resource estimation for BRAM"""
-        # TODO add in/out FIFO contributions
-        P = self.get_nodeattr("PE")
-        Q = self.get_nodeattr("SIMD")
-        wdt = self.get_weight_datatype()
-        W = wdt.bitwidth()
-        omega = self.calc_wmem()
-        mem_width = Q * W * P
-        # assuming SDP mode RAMB18s (see UG573 Table 1-10)
-        # since this is HLS memory, not using the full width of a BRAM
-        # assuming memories up to 128 deep get implemented in LUTs
-        mmode = self.get_nodeattr("mem_mode")
-        mstyle = self.get_nodeattr("ram_style")
-        if (
-            (mmode == "decoupled" and mstyle in ["distributed", "ultra"])
-            or (mstyle == "auto" and self.calc_wmem() <= 128)
-            or (mmode == "const" and self.calc_wmem() <= 128)
-            or (mmode == "external")
-        ):
-            return 0
-
-        if mem_width == 1:
-            return math.ceil(omega / 16384)
-        elif mem_width == 2:
-            return math.ceil(omega / 8192)
-        elif mem_width <= 4:
-            return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4))
-        elif mem_width <= 9:
-            return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 8))
-        elif mem_width <= 18 or omega > 512:
-            return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 16))
-        else:
-            return (math.ceil(omega / 512)) * (math.ceil(mem_width / 32))
-
-    def bram_efficiency_estimation(self):
-        P = self.get_nodeattr("PE")
-        wdt = self.get_weight_datatype()
-        W = wdt.bitwidth()
-        omega = self.calc_wmem()
-        bram16_est = self.bram_estimation()
-        if bram16_est == 0:
-            return 1
-        wbits = W * P * omega
-        bram16_est_capacity = bram16_est * 36 * 512
-        return wbits / bram16_est_capacity
-
-    def lut_estimation(self):
-        """Calculates resource estimations for LUTs based on:
-        - FINN-R: An End-to-End Deep-Learning Framework for Fast
-        Exploration of Quantized Neural Networks
-        - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
-        Y. Umuroglu, M. Leeser and K. Vissers
-        - 12. Sep 2018
-        """
-        # TODO add in/out FIFO contributions
-        P = self.get_nodeattr("PE")
-        Q = self.get_nodeattr("SIMD")
-        wdt = self.get_weight_datatype()
-        W = wdt.bitwidth()
-        # determine tdt with input and weight data types
-        idt = self.get_input_datatype()
-        A = idt.bitwidth()
-        # parameters from experiments in paper mentioned above
-        c0 = 300
-        c1 = 1.1
-        c2 = 0
-        mmode = self.get_nodeattr("mem_mode")
-        mstyle = self.get_nodeattr("ram_style")
-        if (mmode == "decoupled" and mstyle == "distributed") or (
-            mmode == "const" and self.calc_wmem() <= 128
-        ):
-            c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64)
-
-        # multiplication
-        res_type = self.get_nodeattr("resType")
-        if res_type == "dsp":
-            mult_luts = 0
-        else:
-            mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A)
-        # adder tree
-        addertree_luts = (W + A) * (2 * Q - 1)
-        # accumulator
-        acc_datatype = self.get_accumulator_datatype()
-        acc_bits = acc_datatype.bitwidth()
-        k_h, k_w = self.get_nodeattr("Kernel")
-        # if accDataType is not set, then it will default to INT32, which would
-        # be a large overestimate in most (if not all) cases. In this scenario,
-        # we would use the minimum accumulator as determined by the data types
-        # bound, derived in https://arxiv.org/abs/2301.13376
-        alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed())
-        acc_bits = min(
-            acc_datatype.bitwidth(),
-            np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1),
-        )
-        acc_luts = acc_bits
-        # thresholds and threshold comparators
-        thr_luts = 0
-        comp_luts = 0
-        noact = self.get_nodeattr("noActivation")
-        # TODO - add 'ram_style_threshold' node attribute
-        if noact == 0:
-            odt = self.get_output_datatype()
-            B = odt.bitwidth()
-            thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64
-            comp_luts = (2**B - 1) * acc_bits
-
-        return int(
-            c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2
-        )
-
-    def dsp_estimation(self):
-        # multiplication
-        P = self.get_nodeattr("PE")
-        res_type = self.get_nodeattr("resType")
-        wdt = self.get_weight_datatype()
-        W = wdt.bitwidth()
-        idt = self.get_input_datatype()
-        A = idt.bitwidth()
-        if res_type == "dsp":
-            mult_dsp = P * np.ceil((W + A) / 48)  # TODO: more accurate modelling
-        else:
-            mult_dsp = 0
-        return int(mult_dsp)
-
-    def get_weightstream_width(self):
-        """Returns weight stream width. Used only in decoupled mode."""
-        if (
-            self.get_nodeattr("mem_mode") == "decoupled"
-            or self.get_nodeattr("mem_mode") == "external"
-        ):
-            simd = self.get_nodeattr("SIMD")
-            pe = self.get_nodeattr("PE")
-            wp = self.get_weight_datatype().bitwidth()
-            w_width = simd * pe * wp
-            return w_width
-        else:
-            return 0
-
-    def get_weightstream_width_padded(self):
-        """Returns weight stream width padded to a multiple of 8. This is required
-        by the AXI Stream spec. Used in decoupled mode."""
-        weight_width = self.get_weightstream_width()
-        return roundup_to_integer_multiple(weight_width, 8)
-
-    def get_op_and_param_counts(self):
-        k_h, k_w = self.get_nodeattr("Kernel")
-        fm = self.get_nodeattr("Channels")
-        dim_h, dim_w = self.get_nodeattr("Dim")
-        weight_bits = self.get_weight_datatype().bitwidth()
-        inp_bits = self.get_input_datatype().bitwidth()
-        num_repetitions = int(dim_h * dim_w)
-        mac_count = k_h * k_w * fm * num_repetitions
-        # cannonicalize op type: highest bitwidth operand first s.t.
-        # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types
-        bw1 = min(inp_bits, weight_bits)
-        bw2 = max(inp_bits, weight_bits)
-        mac_op_type = "op_mac_%dbx%db" % (bw1, bw2)
-        weight_param_type = "param_weight_%db" % (weight_bits)
-        weight_count = k_h * k_w * fm
-        ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
-        if self.get_nodeattr("noActivation") == 0:
-            tdt = DataType[self.get_nodeattr("accDataType")]
-            thres_bits = tdt.bitwidth()
-            thres_param_type = "param_threshold_%db" % (thres_bits)
-            thres_count = fm
-            ret_dict[thres_param_type] = thres_count
-        return ret_dict
-
-    def derive_characteristic_fxns(self, period):
-        n_inps = np.prod(self.get_folded_input_shape()[:-1])
-        io_dict = {
-            "inputs": {
-                "in0": [0 for i in range(n_inps)],
-            },
-            "outputs": {"out": []},
-        }
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode in ["decoupled", "external"]:
-            n_weight_inps = self.calc_wmem()
-            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
-            io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)]
-        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
+        return cmd
\ No newline at end of file

From bc44a4d487590c857652d3dfd4ab0a11962816d1 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 1 Feb 2024 14:36:31 +0000
Subject: [PATCH 111/112] [vvau hls-op]: refactored HLS custom-op VVAU

---
 .../hls/vectorvectoractivation_hls.py         | 372 ++++++++++++++++++
 1 file changed, 372 insertions(+)
 create mode 100644 src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py

diff --git a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py
new file mode 100644
index 0000000000..51de49f1c7
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py
@@ -0,0 +1,372 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import os
+import textwrap
+import warnings
+from qonnx.core.datatype import DataType
+from qonnx.util.basic import (
+    calculate_matvec_accumulator_range,
+    interleave_matrix_outer_dim_from_partitions,
+    roundup_to_integer_multiple,
+)
+
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    numpy_to_hls_code,
+    pack_innermost_dim_as_hex_string,
+    rtlsim_output_to_npy,
+)
+from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+
+class VectorVectorActivation_hls(VectorVectorActivation, HLSBackend):
+    """Corresponds to finn-hlslib Vector_Vector_Activate_Batch function"""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {}
+        my_attrs.update(VectorVectorActivation.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def get_template_param_values(self):
+        """Returns the template parameter values according to input, output and weight
+        data types."""
+        ret = dict()
+        inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
+        out_hls_str = self.get_output_datatype().get_hls_datatype_str()
+        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+        # out_is_binary = self.get_output_datatype() == DataType["BINARY"]
+        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+        bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+        if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode):
+            raise Exception("True binary (non-bipolar) inputs not yet supported")
+        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+        # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"]
+        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+        # reinterpret inp/wt as bipolar if bin_xnor_mode is set
+        inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+        wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
+        # fill in TSrcI and TWeightI
+        # TODO check these with Giulio
+        # TODO handle non-bipolar binary inputs
+        if inp_is_bipolar and wt_is_bipolar:
+            ret["TSrcI"] = "Recast<XnorMul>"
+            ret["TWeightI"] = "Identity"
+        elif (not inp_is_bipolar) and wt_is_bipolar:
+            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+            ret["TWeightI"] = "Recast<Binary>"
+        elif inp_is_bipolar and (not wt_is_bipolar):
+            ret["TSrcI"] = "Recast<Binary>"
+            ret["TWeightI"] = "Identity"
+        elif (not inp_is_bipolar) and (not wt_is_bipolar):
+            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+            ret["TWeightI"] = "Identity"
+
+        # fill in TDstI
+        ret["TDstI"] = "Slice<%s>" % out_hls_str
+
+        return ret
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"']
+        self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"']
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode not in ["const", "decoupled", "external"]:
+            raise Exception(
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
+            )
+        if self.calc_tmem() != 0:
+            self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
+
+    def defines(self, var):
+        dim_h, dim_w = self.get_nodeattr("Dim")
+        numReps = 1 * dim_h * dim_w
+        k_h, k_w = self.get_nodeattr("Kernel")
+        innerProdDim = k_h * k_w
+        mem_mode = self.get_nodeattr("mem_mode")
+
+        self.code_gen_dict["$DEFINES$"] = [
+            """#define Channels1 {}\n #define InnerProdDim {}\n
+            #define SIMD1 {}\n #define PE1 {}\n #define numReps {}""".format(
+                self.get_nodeattr("Channels"),
+                innerProdDim,
+                self.get_nodeattr("SIMD"),
+                self.get_nodeattr("PE"),
+                numReps,
+            )
+        ]
+        if mem_mode == "decoupled" or mem_mode == "external":
+            wdt = self.get_weight_datatype()
+            self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth()))
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        # note: the innermost dim is reversed for the input
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                npy_in,
+                self.hls_sname(),
+            )
+        )
+
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "decoupled" or mem_mode == "external":
+            wdt = self.get_weight_datatype()
+            elem_bits = wdt.bitwidth()
+            packed_bits = self.get_weightstream_width()
+            packed_hls_type = "ap_uint<%d>" % packed_bits
+            elem_hls_type = wdt.get_hls_datatype_str()
+            npy_type = "float"
+            npy_in = "%s/weights.npy" % code_gen_dir
+
+            self.code_gen_dict["$READNPYDATA$"].append(
+                'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);'
+                % (
+                    packed_hls_type,
+                    elem_hls_type,
+                    elem_bits,
+                    npy_type,
+                    npy_in,
+                    self.hls_sname(),
+                )
+            )
+
+    def strm_decl(self):
+        mem_mode = self.get_nodeattr("mem_mode")
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0_{} ("in0_{}");'.format(
+                self.get_instream_width(), self.hls_sname(), self.hls_sname()
+            )
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
+                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
+            )
+        )
+        if mem_mode == "decoupled" or mem_mode == "external":
+            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+                'hls::stream<ap_uint<{}>> weights_{} ("weights_{}");'.format(
+                    self.get_weightstream_width(), self.hls_sname(), self.hls_sname()
+                )
+            )
+
+    def docompute(self):
+        mem_mode = self.get_nodeattr("mem_mode")
+        map_to_hls_mult_style = {
+            "auto": "ap_resource_dflt()",
+            "lut": "ap_resource_lut()",
+            "dsp": "ap_resource_dsp()",
+        }
+        tmpl_args = self.get_template_param_values()
+        if self.calc_tmem() == 0:
+            odtype_hls_str = self.get_output_datatype().get_hls_datatype_str()
+            threshs = "PassThroughActivation<%s>()" % odtype_hls_str
+        else:
+            threshs = "threshs"
+
+        if mem_mode == "const":
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """Vector_Vector_Activate_Batch<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}>
+                (in0_{}, out_{}, weights, {}, numReps, {});""".format(
+                    tmpl_args["TSrcI"],
+                    tmpl_args["TDstI"],
+                    tmpl_args["TWeightI"],
+                    self.hls_sname(),
+                    self.hls_sname(),
+                    threshs,
+                    map_to_hls_mult_style[self.get_nodeattr("resType")],
+                )
+            ]
+        elif mem_mode == "decoupled" or mem_mode == "external":
+            wdt = self.get_weight_datatype()
+            if wdt == DataType["BIPOLAR"]:
+                export_wdt = DataType["BINARY"]
+            else:
+                export_wdt = wdt
+            wdtype_hls_str = export_wdt.get_hls_datatype_str()
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}, {}>
+                (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format(
+                    "Vector_Vector_Activate_Stream_Batch",
+                    tmpl_args["TSrcI"],
+                    tmpl_args["TDstI"],
+                    tmpl_args["TWeightI"],
+                    wdtype_hls_str,
+                    self.hls_sname(),
+                    self.hls_sname(),
+                    self.hls_sname(),
+                    threshs,
+                    map_to_hls_mult_style[self.get_nodeattr("resType")],
+                )
+            ]
+        else:
+            raise Exception(
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
+            )
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        shape = self.get_folded_output_shape()
+        shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
+
+        # note: the innermost dim is not reversed for the output
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                self.hls_sname(),
+                shape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "const":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(hls::stream<ap_uint<{}>> &in0_{},
+                hls::stream<ap_uint<{}>> &out_{}
+                )""".format(
+                    self.onnx_node.name,
+                    self.get_instream_width(),
+                    self.hls_sname(),
+                    self.get_outstream_width(),
+                    self.hls_sname(),
+                )
+            ]
+        elif mem_mode == "decoupled" or mem_mode == "external":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(
+                    hls::stream<ap_uint<{}>> &in0_{},
+                    hls::stream<ap_uint<{}>> &weights_{},
+                    hls::stream<ap_uint<{}>> &out_{}
+                    )""".format(
+                    self.onnx_node.name,
+                    self.get_instream_width(),
+                    self.hls_sname(),
+                    self.get_weightstream_width(),
+                    self.hls_sname(),
+                    self.get_outstream_width(),
+                    self.hls_sname(),
+                )
+            ]
+        else:
+            raise Exception(
+                """Please set mem_mode to "const" or "decoupled", currently no other
+                    parameter value is supported!"""
+            )
+
+    def pragmas(self):
+        mem_mode = self.get_nodeattr("mem_mode")
+        self.code_gen_dict["$PRAGMAS$"] = [
+            "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
+        ]
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
+        )
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
+
+        if mem_mode == "const":
+            self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"')
+            # the weight tensor is ap_uint<ch*prec> [PE][WMEM]
+            # partition for parallel access along the PE dimension (dim 1)
+            self.code_gen_dict["$PRAGMAS$"].append(
+                ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1")
+            )
+        elif mem_mode == "decoupled" or mem_mode == "external":
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname()
+            )
+        else:
+            raise Exception(
+                """Please set mem_mode to "const", "decoupled", or external,
+                currently no other parameter value is supported!"""
+            )
+
+        if self.calc_tmem() != 0:
+            # TODO find a better way of checking for no pregenerated thresholds
+            self.code_gen_dict["$PRAGMAS$"].append(
+                ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1")
+            )
+            self.code_gen_dict["$PRAGMAS$"].append(
+                ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3")
+            )
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        mem_mode = self.get_nodeattr("mem_mode")
+        sname = self.hls_sname()
+        if mem_mode == "external":
+            intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded()))
+        if mem_mode == "decoupled":
+            # only expose axilite interface if attribute is set
+            runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
+            if runtime_writable:
+                intf_names["axilite"] = ["s_axilite"]
+        return intf_names
\ No newline at end of file

From faabc0fe32392975e21f4be58cc6352db414f40f Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 1 Feb 2024 14:37:12 +0000
Subject: [PATCH 112/112] [convert-to-hw]: added transformations to infer
 binary-MVAU and VVAU

---
 .../fpgadataflow/convert_to_hw_layers.py      | 279 ++++++++++++++++++
 1 file changed, 279 insertions(+)

diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index eb6dd337f5..26cd0b74ad 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -1281,6 +1281,139 @@ def apply(self, model):
 
         return (model, graph_modified)
 
+class InferBinaryMatrixVectorActivation(Transformation):
+    """Convert XnorPopcountMatMul layers to MatrixVectorActivation layers
+    (binaryXnorMode=1). A MultiThreshold node consuming the matmul output is
+    absorbed into the new node; mem_mode is forwarded as a node attribute."""
+
+    def __init__(self, mem_mode="const"):
+        super().__init__()
+        self.mem_mode = mem_mode
+
+    def apply(self, model):
+        """Rewrite all XnorPopcountMatMul nodes; return (model, graph_modified)."""
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "XnorPopcountMatMul":
+                mm_input = n.input[0]
+                mm_weight = n.input[1]
+                mm_output = n.output[0]
+                mm_in_shape = model.get_tensor_shape(mm_input)
+                mm_out_shape = model.get_tensor_shape(mm_output)
+                assert model.get_tensor_datatype(mm_input) == DataType["BINARY"], (
+                    n.name
+                    + """: First
+                input for xnorpopcount is not set to FINN DataType BINARY."""
+                )
+                assert model.get_tensor_datatype(mm_weight) == DataType["BINARY"], (
+                    n.name
+                    + """: Second
+                input (weights) for xnorpopcount is not set to FINN DataType BINARY."""
+                )
+                idt = DataType["BINARY"]
+                wdt = DataType["BINARY"]
+                W = model.get_initializer(mm_weight)
+                # extract weight shape, note that ONNX and finn-hlslib
+                # make different assumptions about dim order here
+                # ONNX assumes W has (in, out) shape
+                # finn-hlslib assumes W has (out, in) shape
+                mh = int(W.shape[1])
+                mw = int(W.shape[0])
+                # create node with no parallelization first
+                pe = 1
+                simd = 1
+                wmem = mw * mh // (pe * simd)
+                assert mw * mh == wmem * pe * simd, (
+                    n.name
+                    + """: Requirement (MW * MH) divisible by
+                (WMEM * PE * SIMD) is violated."""
+                )
+                # see if we have any following thresholds
+                consumer = model.find_consumer(mm_output)
+                if consumer is not None and consumer.op_type == "MultiThreshold":
+                    # TODO ensure integer thresholds?
+                    # create MVTU (i.e. including activation)
+                    mt_output = consumer.output[0]
+                    mt_out_shape = model.get_tensor_shape(mt_output)
+                    mt_thres = consumer.input[1]
+                    T = model.get_initializer(mt_thres)
+                    assert T.shape[0] == 1 or T.shape[0] == mh, (
+                        consumer.name
+                        + """: First dimension of
+                    thresholds neither 1 nor MH."""
+                    )
+                    odt = model.get_tensor_datatype(mt_output)
+                    if odt.bitwidth() == 1:
+                        # covers both bipolar and binary
+                        actval = 0
+                    else:
+                        actval = odt.min()
+                    model.set_tensor_shape(mm_input, mm_in_shape)
+                    model.set_tensor_shape(mt_output, mt_out_shape)
+                    # create and insert new MatrixVectorActivation node
+                    new_node = helper.make_node(
+                        "MatrixVectorActivation",
+                        [mm_input, mm_weight, mt_thres],
+                        [mt_output],
+                        domain="finn.custom_op.fpgadataflow",
+                        backend="fpgadataflow",
+                        MW=mw,
+                        MH=mh,
+                        SIMD=simd,
+                        PE=pe,
+                        inputDataType=idt.name,
+                        weightDataType=wdt.name,
+                        outputDataType=odt.name,
+                        ActVal=actval,
+                        binaryXnorMode=1,
+                        noActivation=0,
+                        numInputVectors=list(mm_in_shape[:-1]),
+                        mem_mode=self.mem_mode,
+                        name=n.name,
+                    )
+                    graph.node.insert(node_ind, new_node)
+                    # remove old nodes
+                    graph.node.remove(n)
+                    graph.node.remove(consumer)
+                    graph_modified = True
+                else:
+                    # no activation, matmul only
+                    odt = model.get_tensor_datatype(mm_output)
+                    model.set_tensor_shape(mm_input, mm_in_shape)
+                    model.set_tensor_shape(mm_output, mm_out_shape)
+                    # create and insert new MatrixVectorActivation node
+                    new_node = helper.make_node(
+                        "MatrixVectorActivation",
+                        [mm_input, mm_weight],
+                        [mm_output],
+                        domain="finn.custom_op.fpgadataflow",
+                        backend="fpgadataflow",
+                        MW=mw,
+                        MH=mh,
+                        SIMD=simd,
+                        PE=pe,
+                        inputDataType=idt.name,
+                        weightDataType=wdt.name,
+                        outputDataType=odt.name,
+                        ActVal=0,
+                        binaryXnorMode=1,
+                        noActivation=1,
+                        numInputVectors=list(mm_in_shape[:-1]),
+                        mem_mode=self.mem_mode,
+                        name=n.name,
+                    )
+                    graph.node.insert(node_ind, new_node)
+                    # remove old node
+                    graph.node.remove(n)
+                    graph_modified = True
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
 class InferQuantizedMatrixVectorActivation(Transformation):
     """Convert MatMul layers with quantized inputs and weights to
     MatrixVectorActivation layers."""
@@ -1415,4 +1548,150 @@ def apply(self, model):
         if graph_modified:
             model = model.transform(InferShapes())
             model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+class InferVectorVectorActivation(Transformation):
+    """Convert sparsity-annotated MatMul layers with integer inputs and weights
+    to VectorVectorActivation layers. The weight's sparsity annotation must mark
+    the MatMul as part of a depthwise convolution (otherwise an Exception is
+    raised). An immediately following MultiThreshold layer is absorbed into
+    the VVAU. mem_mode is forwarded to every created node."""
+
+    def __init__(self, mem_mode="const"):
+        super().__init__()
+        self.mem_mode = mem_mode
+
+    def apply(self, model):
+        """Rewrite all matching MatMul nodes; return (model, graph_modified)."""
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is not None:
+                sparsity = model.get_tensor_sparsity(n.input[1])
+                try:
+                    k_h, k_w = sparsity["dw"]["kernel_shape"]
+                except KeyError:
+                    raise Exception(
+                        n.name
+                        + """: sparsity annotation doesn't indicate that MatMul
+                        belongs to a depthwise convolution."""
+                    )
+
+                mm_input = n.input[0]
+                mm_weight = n.input[1]
+                mm_output = n.output[0]
+                mm_in_shape = model.get_tensor_shape(mm_input)
+                mm_out_shape = model.get_tensor_shape(mm_output)
+                idt = model.get_tensor_datatype(mm_input)
+                wdt = model.get_tensor_datatype(mm_weight)
+                if idt.is_integer() and wdt.is_integer():
+                    W = model.get_initializer(mm_weight)
+                    # infer the dense weight tensor from the sparse weight matrix
+                    # using the kernel size (k_h, k_w) extracted above and the
+                    # number of channels: the weight matrix has a shape of
+                    # (k_h * k_w * Channels, Channels) and is converted back into
+                    # a weight tensor of shape (Channels, 1, k_h, k_w)
+                    channels = int(W.shape[1])
+                    # transpose to achieve a shape of (Channels, k_h * k_w * Channels)
+                    W = W.T
+                    # reshape to (Channels, k_h, k_w, Channels) to transpose afterwards
+                    # to (Channels, Channels, k_h, k_w)
+                    W = W.reshape(channels, k_h, k_w, channels)
+                    W = W.transpose(0, 3, 1, 2)
+                    # now we can extract the values using a for loop over the channels
+                    # and fill a zero numpy array in the correct shape
+                    w_tensor = np.zeros((channels, 1, k_h, k_w), dtype=np.float32)
+                    for ch in range(channels):
+                        w_tensor[ch][0] = W[ch][ch]
+                    model.set_initializer(mm_weight, w_tensor)
+                    model.set_tensor_shape(mm_weight, (channels, 1, k_h, k_w))
+                    # create node with pe=channels as default
+                    pe = channels
+                    # see if we have any following thresholds
+                    consumer = model.find_consumer(mm_output)
+                    if consumer is not None and consumer.op_type == "MultiThreshold":
+                        # create VVAU (i.e. including activation)
+                        mt_output = consumer.output[0]
+                        mt_out_shape = model.get_tensor_shape(mt_output)
+                        mt_thres = consumer.input[1]
+                        T = model.get_initializer(mt_thres)
+                        assert T.shape[0] == 1 or T.shape[0] == channels, (
+                            consumer.name
+                            + """: First dimension of
+                        thresholds neither 1 nor Channels."""
+                        )
+                        odt = model.get_tensor_datatype(mt_output)
+                        scale = getCustomOp(consumer).get_nodeattr("out_scale")
+                        assert scale == 1.0, (
+                            consumer.name + ": out_scale must be equal to 1.0 for HLS conversion."
+                        )
+                        actval = getCustomOp(consumer).get_nodeattr("out_bias")
+                        assert int(actval) == actval, (
+                            consumer.name + ": out_bias must be integer for HLS conversion."
+                        )
+                        actval = int(actval)
+                        assert (not odt.signed()) or (actval < 0), (
+                            consumer.name + ": Signed output requires actval < 0"
+                        )
+                        model.set_tensor_shape(mm_input, mm_in_shape)
+                        model.set_tensor_shape(mt_output, mt_out_shape)
+                        # create and insert new VectorVectorActivation node
+                        new_node = helper.make_node(
+                            "VectorVectorActivation",
+                            [mm_input, mm_weight, mt_thres],
+                            [mt_output],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            resType="lut",
+                            PE=pe,
+                            Dim=[mm_in_shape[1], mm_in_shape[2]],
+                            Channels=channels,
+                            Kernel=[k_h, k_w],
+                            inputDataType=idt.name,
+                            weightDataType=wdt.name,
+                            outputDataType=odt.name,
+                            ActVal=actval,
+                            noActivation=0,
+                            name="VectorVectorActivation_" + n.name,
+                            mem_mode=self.mem_mode,
+                        )
+                        graph.node.insert(node_ind, new_node)
+                        # remove old nodes
+                        graph.node.remove(n)
+                        graph.node.remove(consumer)
+                        graph_modified = True
+                    else:
+                        # no activation, matmul only
+                        odt = model.get_tensor_datatype(mm_output)
+                        model.set_tensor_shape(mm_input, mm_in_shape)
+                        model.set_tensor_shape(mm_output, mm_out_shape)
+                        # create and insert new VVAU node
+                        new_node = helper.make_node(
+                            "VectorVectorActivation",
+                            [mm_input, mm_weight],
+                            [mm_output],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            resType="lut",
+                            PE=pe,
+                            Dim=[mm_in_shape[1], mm_in_shape[2]],
+                            Channels=channels,
+                            Kernel=[k_h, k_w],
+                            inputDataType=idt.name,
+                            weightDataType=wdt.name,
+                            outputDataType=odt.name,
+                            ActVal=0,
+                            noActivation=1,
+                            name="VectorVectorActivation_" + n.name,
+                            mem_mode=self.mem_mode,
+                        )
+                        graph.node.insert(node_ind, new_node)
+                        # remove old node
+                        graph.node.remove(n)
+                        graph_modified = True
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
         return (model, graph_modified)
\ No newline at end of file