From be1503a0c78fd4c4d903b1ffbf61964659725bb6 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 3 Jan 2023 15:37:42 +0000 Subject: [PATCH 001/112] First changes to custom_op for RTL-based MVAU --- .../matrixvectoractivation_rtl.py | 1036 +++++++++++++++++ 1 file changed, 1036 insertions(+) create mode 100644 src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py new file mode 100644 index 0000000000..c8a0aa675b --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -0,0 +1,1036 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import math +import numpy as np +import os +import textwrap +import warnings +from qonnx.core.datatype import DataType +from qonnx.util.basic import ( + calculate_matvec_accumulator_range, + interleave_matrix_outer_dim_from_partitions, + roundup_to_integer_multiple, +) + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.data_packing import ( + npy_to_rtlsim_input, + numpy_to_hls_code, + pack_innermost_dim_as_hex_string, + rtlsim_output_to_npy, +) + +from . import templates + +# ONNX i/o tensor shape assumptions for MatrixVectorActivation: +# input 0 is the input tensor, shape (.., i_size) = (..., MW) +# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH) +# (optional) input 2 is the thresholds tensor, shape (o_size, n_thres) +# output 0 is the output tensor, shape (.., o_size) = (..., MH) +# the ... 
# here can be any shape (representing groups of vectors)


class MatrixVectorActivation_rtl(HLSCustomOp):
    """RTL-based matrix-vector activation unit (MVAU) custom op.

    Limitations enforced by the code in this class:
      * thresholding is not supported (noActivation must be 1)
      * parameter generation only supports mem_mode="decoupled"
      * execution is only possible with exec_mode="rtlsim"
      * HDL generation only supports ram_style="auto"
    """

    def __init__(self, onnx_node):
        super().__init__(onnx_node)
        self.decoupled_wrapper = templates.decoupled_wrapper

    def get_nodeattr_types(self):
        """Return the node attribute table of this op merged with the
        attributes declared by the base class."""
        my_attrs = {
            "PE": ("i", True, 0),
            "SIMD": ("i", True, 0),
            "MW": ("i", True, 0),
            "MH": ("i", True, 0),
            "resType": ("s", False, "lut", {"auto", "lut", "dsp"}),
            "ActVal": ("i", False, 0),
            # FINN DataTypes for inputs, weights, outputs
            "inputDataType": ("s", True, ""),
            "weightDataType": ("s", True, ""),
            "outputDataType": ("s", True, ""),
            # FINN DataType for accumulator -- auto-computed and updated
            "accDataType": ("s", False, "INT32"),
            # use xnor-popcount for binary weights/inputs, thus treating them
            # as bipolar
            "binaryXnorMode": ("i", False, 0, {0, 1}),
            # no-activation mode (produce accumulators)
            "noActivation": ("i", False, 0, {0, 1}),
            # number of input vectors, examples:
            # [1] is a single vector (like a FC layer with batch=1)
            # [4] is four vectors (like a FC layer with batch=4)
            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
            "numInputVectors": ("ints", False, [1]),
            # memory mode for the FC weights
            # const -- embedded weights, default, long compile/synth times
            # decoupled -- streaming weights with weight streamer packaged inside IP
            # external -- streaming weights with external streamer
            "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}),
            # FPGA resource type for memories in decoupled mode
            # auto -- let Vivado decide
            # block -- use BRAM
            # distributed -- use LUTRAM
            # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1
            # see also https://www.xilinx.com/support/answers/38070.html
            "ram_style": (
                "s",
                False,
                "auto",
                {"auto", "block", "distributed", "ultra"},
            ),
            # FPGA resource type for threshold memories (if noActivation is False)
            # auto -- let Vivado decide
            # block -- use BRAM
            # distributed -- use LUTRAM
            "ram_style_thresholds": (
                "s",
                False,
                "auto",
                {"auto", "block", "distributed"},
            ),
            # (mem_mode = decoupled only) whether weights will be writable through
            # an AXI-lite interface during runtime
            # 1 for enabled, 0 for disabled.
            # see finn-rtllib/memstream/doc/README for more about the memory
            # address map used for writable weights
            # IMPORTANT: After using AXI lite to either read or write the weights,
            # always "flush" the accelerator by first passing a dummy input
            # vector through the accelerator. This will get rid of any old
            # weight data from the weight FIFOs.
            "runtime_writeable_weights": ("i", False, 0, {0, 1}),
        }
        my_attrs.update(super().get_nodeattr_types())
        return my_attrs

    def calc_wmem(self):
        """Calculates and returns WMEM."""
        mw = self.get_nodeattr("MW")
        mh = self.get_nodeattr("MH")
        pe = self.get_nodeattr("PE")
        simd = self.get_nodeattr("SIMD")
        assert mh % pe == 0, "Requirement MH divisable by PE is violated."
        assert mw % simd == 0, "Requirement MW divisable by SIMD is violated."
        wmem = mw * mh // (pe * simd)
        return wmem

    def calc_tmem(self):
        """Calculates and returns TMEM.

        Always 0: the assertion rejects any node that is not in
        no-activation mode.
        """
        assert self.get_nodeattr("noActivation") == 1, (
            "RTL-based MVAU does not support thresholding currently, "
            "please infer a standalone Thresholding_Batch layer"
        )
        return 0

    def make_shape_compatible_op(self, model):
        """Return a constant-shape op matching the normal output shape."""
        oshape = self.get_normal_output_shape()
        return super().make_const_shape_op(oshape)

    def infer_node_datatype(self, model):
        """Sync inputDataType with the datatype of the input tensor in the
        model (warning on change) and annotate the output tensor with the
        outputDataType attribute."""
        node = self.onnx_node
        idt = model.get_tensor_datatype(node.input[0])
        if idt != self.get_input_datatype():
            warn_str = "inputDataType changing for %s: %s -> %s " % (
                node.name,
                str(self.get_input_datatype()),
                str(idt),
            )
            warnings.warn(warn_str)
        self.set_nodeattr("inputDataType", idt.name)
        # set output datatype from property
        odt = self.get_output_datatype()
        model.set_tensor_datatype(node.output[0], odt)

    def verify_node(self):
        """Run sanity checks on the node and return a list of human-readable
        info messages (one or more per check)."""
        info_messages = []
        # verify that "backend" is set to "fpgadataflow"
        backend_value = self.get_nodeattr("backend")
        if backend_value == "fpgadataflow":
            info_messages.append("Attribute backend is set correctly")
        else:
            info_messages.append('Attribute backend should be set to "fpgadataflow"')

        # verify that all necessary attributes exist
        # TODO collect automatically from get_nodeattr_types
        try:
            self.get_nodeattr("code_gen_dir_cppsim")
            self.get_nodeattr("executable_path")
            self.get_nodeattr("resType")
            self.get_nodeattr("MW")
            self.get_nodeattr("MH")
            self.get_nodeattr("SIMD")
            self.get_nodeattr("PE")
            self.get_nodeattr("inputDataType")
            self.get_nodeattr("weightDataType")
            self.get_nodeattr("outputDataType")
            info_messages.append("All necessary attributes exist")
        except Exception:
            info_messages.append(
                """The required MatrixVectorActivation attributes do not exist."""
            )

        # verify the number of inputs depending on noActivation value
        # check noActivation value to determine the number of inputs
        no_act = self.get_nodeattr("noActivation")

        if no_act == 1:
            if len(self.onnx_node.input) == 2:
                info_messages.append("The number of inputs is correct")
            else:
                info_messages.append(
                    """RTL-based MatrixVectorActivation needs in no
                            activation mode 2 inputs (data input and weights)"""
                )
        elif no_act == 0:
            info_messages.append(
                "RTL-based MVAU does not support thresholding currently, "
                "please infer a standalone Thresholding_Batch layer"
            )
        else:
            info_messages.append(
                """noActivation attribute contains {} should
                be 1 for RTL-based MatrixVectorActivation""".format(
                    no_act
                )
            )

        mem_mode = self.get_nodeattr("mem_mode")

        if mem_mode != "decoupled":
            info_messages.append(
                "RTL-based MVAU supports only decoupled weights currently"
            )

        return info_messages

    def uram_estimation(self):
        """Estimate the number of URAMs needed for the weight memory,
        modelling one URAM as 72 bits wide and 4096 entries deep."""
        P = self.get_nodeattr("PE")
        Q = self.get_nodeattr("SIMD")
        wdt = self.get_weight_datatype()
        W = wdt.bitwidth()
        D_in = self.get_nodeattr("MW")
        D_out = self.get_nodeattr("MH")
        omega = (D_in * D_out) / (Q * P)
        mem_width = Q * W * P
        mmode = self.get_nodeattr("mem_mode")
        mstyle = self.get_nodeattr("ram_style")
        if (
            (mmode == "decoupled" and mstyle != "ultra")
            or (mmode == "const" and self.calc_wmem() <= 128)
            or (mmode == "external")
        ):
            return 0
        width_multiplier = math.ceil(mem_width / 72)
        depth_multiplier = math.ceil(omega / 4096)
        return width_multiplier * depth_multiplier

    def bram_estimation(self):
        """Calculates resource estimation for BRAM based on:
        - FINN-R: An End-to-End Deep-Learning Framework for Fast
        Exploration of Quantized Neural Networks
        - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
        Y. Umuroglu, M. Leeser and K. Vissers
        - 12. Sep 2018
        """
        # TODO add in/out FIFO contributions
        P = self.get_nodeattr("PE")
        Q = self.get_nodeattr("SIMD")
        wdt = self.get_weight_datatype()
        W = wdt.bitwidth()
        D_in = self.get_nodeattr("MW")
        D_out = self.get_nodeattr("MH")
        omega = (D_in * D_out) / (Q * P)
        mem_width = Q * W * P
        mmode = self.get_nodeattr("mem_mode")
        mstyle = self.get_nodeattr("ram_style")
        if (
            (mmode == "decoupled" and mstyle in ["distributed", "ultra"])
            or (mmode == "const" and self.calc_wmem() <= 128)
            or (mmode == "external")
        ):
            return 0
        # assuming SDP mode RAMB18s (see UG573 Table 1-10)
        # assuming decoupled (RTL) memory, which is more efficient than const (HLS)
        if mem_width == 1:
            return math.ceil(omega / 16384)
        elif mem_width == 2:
            return math.ceil(omega / 8192)
        elif mem_width <= 4:
            return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4))
        elif mem_width <= 9:
            return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 9))
        elif mem_width <= 18 or omega > 512:
            return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 18))
        else:
            return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36))

    def bram_efficiency_estimation(self):
        """Function for BRAM efficiency estimation: actual parameter storage
        needed divided by the allocated BRAM storage (from estimation).
        Returns 1 when no BRAM is estimated."""
        wdt = self.get_weight_datatype()
        W = wdt.bitwidth()
        D_in = self.get_nodeattr("MW")
        D_out = self.get_nodeattr("MH")
        bram16_est = self.bram_estimation()
        if bram16_est == 0:
            return 1
        wbits = W * D_in * D_out
        bram16_est_capacity = bram16_est * 36 * 512
        return wbits / bram16_est_capacity

    def uram_efficiency_estimation(self):
        """Function for URAM efficiency estimation: actual parameter storage
        needed divided by the allocated URAM storage (from estimation)"""
        wdt = self.get_weight_datatype()
        W = wdt.bitwidth()
        D_in = self.get_nodeattr("MW")
        D_out = self.get_nodeattr("MH")
        uram_est = self.uram_estimation()
        if uram_est == 0:
            return 1
        wbits = W * D_in * D_out
        uram_est_capacity = uram_est * 72 * 4096
        return wbits / uram_est_capacity

    # TODO: FIX
    def lut_estimation(self):
        """Calculates resource estimations for LUTs based on:
        - FINN-R: An End-to-End Deep-Learning Framework for Fast
        Exploration of Quantized Neural Networks
        - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
        Y. Umuroglu, M. Leeser and K. Vissers
        - 12. Sep 2018
        """
        # TODO add in/out FIFO contributions
        P = self.get_nodeattr("PE")
        Q = self.get_nodeattr("SIMD")
        MW = self.get_nodeattr("MW")
        wdt = self.get_weight_datatype()
        W = wdt.bitwidth()
        # determine tdt with input and weight data types
        idt = self.get_input_datatype()
        A = idt.bitwidth()
        # parameters from experiments in paper mentioned above
        c0 = 300
        c1 = 1.1
        c2 = 0
        mmode = self.get_nodeattr("mem_mode")
        mstyle = self.get_nodeattr("ram_style")
        if (mmode == "decoupled" and mstyle == "distributed") or (
            mmode == "const" and self.calc_wmem() <= 128
        ):
            c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64)

        # multiplication
        res_type = self.get_nodeattr("resType")
        if res_type == "dsp":
            mult_luts = 0
        else:
            mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A)
        # adder tree
        addertree_luts = (W + A) * (2 * Q - 1)
        # accumulator
        acc_bits = W + A + np.ceil(math.log(MW, 2))
        acc_luts = acc_bits
        # thresholds and threshold comparators
        thr_luts = 0
        comp_luts = 0
        noact = self.get_nodeattr("noActivation")
        if noact == 0:
            odt = self.get_output_datatype()
            B = odt.bitwidth()
            thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64)
            comp_luts = (2**B - 1) * acc_bits

        return int(
            c0
            + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts))
            + c2
        )

    # TODO: FIX
    def dsp_estimation(self):
        """Estimate the number of DSPs used for multiplication; 0 unless
        resType is "dsp"."""
        # multiplication
        P = self.get_nodeattr("PE")
        res_type = self.get_nodeattr("resType")
        Q = self.get_nodeattr("SIMD")
        wdt = self.get_weight_datatype()
        W = wdt.bitwidth()
        idt = self.get_input_datatype()
        A = idt.bitwidth()
        if res_type == "dsp":
            mult_dsp = P * Q * np.ceil((W + A) / 48)  # TODO: more accurate modelling
        else:
            mult_dsp = 0
        return int(mult_dsp)

    # TODO: FIX
    def get_exp_cycles(self):
        """Return the expected cycle count:
        (MH / PE) * (MW / SIMD) * prod(numInputVectors)."""
        pe = self.get_nodeattr("PE")
        simd = self.get_nodeattr("SIMD")
        num_inp_vec = self.get_nodeattr("numInputVectors")
        mh = self.get_nodeattr("MH")
        mw = self.get_nodeattr("MW")
        # since mmv != 1 is not supported yet, we set mmv for now to 1
        mmv = 1
        exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv
        return int(exp_cycles)

    def get_input_datatype(self, ind=0):
        """Returns FINN DataType of input."""
        # when performing FIFO insertion on an FC layer with ext weights, the ind
        # parameter can be > 0 (referring to the weights) so handle that here
        if ind == 0:
            return DataType[self.get_nodeattr("inputDataType")]
        elif ind == 1:
            return DataType[self.get_nodeattr("weightDataType")]
        else:
            raise Exception("Undefined input ind for this layer type")

    def get_weight_datatype(self):
        """Returns FINN DataType of weights."""
        return DataType[self.get_nodeattr("weightDataType")]

    def get_output_datatype(self, ind=0):
        """Returns FINN DataType of output."""
        return DataType[self.get_nodeattr("outputDataType")]

    def get_instream_width(self, ind=0):
        """Return the input stream width in bits (input bitwidth * SIMD).
        Asserts that the input bitwidth does not exceed 9."""
        i_bits = self.get_input_datatype().bitwidth()
        assert (
            i_bits <= 9
        ), "RTL-based MVAU only supports activations with bit-width up to 9-bits"
        in_width = i_bits * self.get_nodeattr("SIMD")
        return in_width

    def get_outstream_width(self, ind=0):
        """Return the output stream width in bits (output bitwidth * PE)."""
        o_bits = self.get_output_datatype().bitwidth()
        out_width = o_bits * self.get_nodeattr("PE")
        return out_width

    def get_weightstream_width(self):
        """Returns weight stream width. Used only in decoupled mode."""
        if (
            self.get_nodeattr("mem_mode") == "decoupled"
            or self.get_nodeattr("mem_mode") == "external"
        ):
            pe = self.get_nodeattr("PE")
            simd = self.get_nodeattr("SIMD")
            wp = self.get_weight_datatype().bitwidth()
            w_width = pe * simd * wp
            assert (
                wp <= 8
            ), "RTL-based MVAU only supports weights with bit-width up to 8-bits"
            return w_width
        else:
            return 0

    def get_weightstream_width_padded(self):
        """Returns weight stream width padded to a multiple of 8. This is required
        by the AXI Stream spec. Used in decoupled mode."""
        weight_width = self.get_weightstream_width()
        return roundup_to_integer_multiple(weight_width, 8)

    def get_ap_int_max_w(self):
        """Return the maximum of the base-class i/o stream width, the weight
        stream width and the width of a single PE weight entry."""
        # base class impl (max of inp/out stream widths)
        max_of_io = super().get_ap_int_max_w()
        # decoupled mode weight stream
        weightstream = self.get_weightstream_width()
        # single PE weight entry
        weight_bits = self.get_weight_datatype().bitwidth()
        simd = self.get_nodeattr("SIMD")
        single_pe_w = simd * weight_bits
        return max([weightstream, max_of_io, single_pe_w])

    def get_folded_input_shape(self, ind=0):
        """Return the folded shape of input `ind`. Input 1 (weights) is only
        defined for mem_mode="external"; anything else raises."""
        mw = self.get_nodeattr("MW")
        mh = self.get_nodeattr("MH")
        simd = self.get_nodeattr("SIMD")
        pe = self.get_nodeattr("PE")
        sf = mw // simd
        nf = mh // pe
        vecs = list(self.get_nodeattr("numInputVectors"))

        if ind == 0:
            # calculate shape of input 0
            folded_input_shape = tuple(vecs + [sf, simd])
        elif ind == 1 and self.get_nodeattr("mem_mode") == "external":
            # calculate shape of input 1 (weights)
            folded_input_shape = tuple(vecs + [sf * nf, simd * pe])
        else:
            raise Exception("Undefined input shape for requested input")

        return folded_input_shape

    def get_folded_output_shape(self, ind=0):
        """Return the folded output shape (*numInputVectors, MH // PE, PE)."""
        mh = self.get_nodeattr("MH")
        pe = self.get_nodeattr("PE")
        nf = mh // pe
        vecs = list(self.get_nodeattr("numInputVectors"))
        folded_output_shape = tuple(vecs + [nf, pe])
        return folded_output_shape

    def get_normal_input_shape(self, ind=0):
        """Return the unfolded input shape (*numInputVectors, MW)."""
        mw = self.get_nodeattr("MW")
        vecs = list(self.get_nodeattr("numInputVectors"))
        normal_input_shape = tuple(vecs + [mw])
        return normal_input_shape

    def get_normal_output_shape(self, ind=0):
        """Return the unfolded output shape (*numInputVectors, MH)."""
        mh = self.get_nodeattr("MH")
        vecs = list(self.get_nodeattr("numInputVectors"))
        normal_output_shape = tuple(vecs + [mh])
        return normal_output_shape

    def get_number_output_values(self):
        """Return the product of all folded output dims except the last."""
        nf = np.prod(self.get_folded_output_shape()[:-1])
        return nf

    def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
        """Convert the original numpy weight matrix orig_weight_matrix into
        a form suitable for passing to the hlslib call:
        * ensure MH % PE == 0 and MW % SIMD == 0
        * interleave rows between PEs
        * reshape into (1, PE, WMEM, SIMD), reverse the SIMD dimension
          and return
        """
        mw = self.get_nodeattr("MW")
        mh = self.get_nodeattr("MH")
        pe = self.get_nodeattr("PE")
        simd = self.get_nodeattr("SIMD")
        wmem = self.calc_wmem()
        assert orig_weight_matrix.shape == (
            mw,
            mh,
        ), """Weights matrix doesn't
        have expected shape (mw, mh)"""
        # message previously named MH although the check is on MW
        assert mw % simd == 0, "Requirement MW divisable by SIMD is violated."
        assert mh % pe == 0, "Requirement MH divisable by PE is violated."
        # start by transposing the original weight matrix, since ONNX and
        # finn-hlslib use different assumptions
        # ONNX uses (in_features, out_features) and matmul(x, W)
        # finn-hlslib uses (out_features, in_features) and matmul(W, x)
        ret = orig_weight_matrix.T
        # interleave rows between PEs and reshape
        # distribute rows between PEs
        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
        # create SIMD as innermost dimension and add a dummy outer dim
        ret = ret.reshape(1, pe, wmem, simd)
        # reverse the SIMD dimension
        ret = np.flip(ret, axis=-1)
        return ret

    def minimize_accumulator_width(self, model):
        """Compute the smallest accumulator datatype (rounded up to a
        multiple of 8 bits) from the weight initializer and input datatype,
        store it as accDataType and outputDataType, and return it."""
        weights = model.get_initializer(self.onnx_node.input[1])
        idt = self.get_input_datatype()
        # calculate minimum and maximum values of accumulator
        (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt)
        if acc_min < 0:
            if abs(acc_min) > acc_max:
                adt = DataType.get_smallest_possible(acc_min)
            else:
                adt = DataType.get_smallest_possible(-acc_max - 1)
        else:
            adt = DataType.get_smallest_possible(acc_max)
        # ensure a datatype divisible by 8-bits in case this is the last node
        bw = roundup_to_integer_multiple(adt.bitwidth(), 8)
        new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw))
        adt = DataType[new_adt_name]
        self.set_nodeattr("accDataType", adt.name)
        # for no-activation nodes, output dt = acc dt
        self.set_nodeattr("outputDataType", adt.name)
        return DataType[self.get_nodeattr("accDataType")]

    def make_weight_file(self, weights, weight_file_mode, weight_file_name):
        """Produce a file containing given weights in appropriate format for this
        layer. This file can be used for either synthesis or run-time reconfig
        of weights.

        Arguments:
        * weights : numpy array with weights to be put into the file
        * weight_file_mode : one of {decoupled_npy, decoupled_verilog_dat,
          decoupled_runtime}
        * weight_file_name : filename for the weight file to be generated

        Raises an Exception for any other weight_file_mode.
        """
        # convert weights into hlslib-compatible format
        weight_tensor = self.get_hls_compatible_weight_tensor(weights)
        export_wdt = self.get_weight_datatype()
        if "decoupled" not in weight_file_mode:
            raise Exception("Unknown/unsupported weight_file_mode")

        # create a weight stream for various flavors of decoupled mode:
        # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
        weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
        # reverse SIMD flip for saving weights in .npy
        weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1)
        # PE flip for saving weights in .dat
        weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2)
        # reshape weight tensor (simd_flipped and pe_flipped) to desired shape
        pe = self.get_nodeattr("PE")
        simd = self.get_nodeattr("SIMD")
        # simd_flipped
        weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape(
            1, -1, pe * simd
        )
        weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy()
        # flipped
        weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(1, -1, pe * simd)
        weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy()
        if weight_file_mode == "decoupled_npy":
            # generate_params() requests this mode and execute_node() loads the
            # resulting weights.npy, so the weight stream must be saved here
            np.save(weight_file_name, weight_tensor_simd_flipped)
        elif weight_file_mode == "decoupled_verilog_dat":
            # convert weight values into hexstring
            weight_width = self.get_weightstream_width()
            # pad to nearest 4 bits to get hex strings
            weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
            weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
                weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
            )
            weight_stream = weight_tensor_pe_flipped.flatten()
            weight_stream = weight_stream.copy()
            with open(weight_file_name, "w") as f:
                for val in weight_stream:
                    f.write(val + "\n")
        elif weight_file_mode == "decoupled_runtime":
            # memstream axi-lite interface will map each mem line to
            # one or multiple 32-bit words
            weight_width = self.get_weightstream_width()
            words_per_memwidth = 2 ** math.ceil(math.log2(weight_width / 32))
            if words_per_memwidth < 1:
                words_per_memwidth = 1
            weight_width_padded = words_per_memwidth * 32
            # first, pack and ensure padding to 32 bits
            weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
                weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
            )
            weight_stream = weight_tensor_pe_flipped.flatten()
            weight_stream = weight_stream.copy()
            with open(weight_file_name, "w") as f:
                for val in weight_stream:
                    # split into groups of 8 hex digits (= 32 bits)
                    words_32b = textwrap.wrap(val, 8)
                    words_32b.reverse()
                    for word_32b in words_32b:
                        f.write(word_32b + "\n")
        else:
            raise Exception("Unknown/unsupported weight_file_mode")

    def generate_params(self, model, path):
        """Write the weight files for this node into `path`:
        weights.npy, memblock_sim_0.dat and memblock_synth_0.dat.
        Only mem_mode="decoupled" is supported; other modes raise."""
        mem_mode = self.get_nodeattr("mem_mode")
        code_gen_dir = path
        # weights, if not external
        weights = model.get_initializer(self.onnx_node.input[1])
        if mem_mode == "decoupled":
            weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
            # save decoupled weights as .npy (loaded again in execute_node)
            self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
            # also save weights as Verilog .dat file
            # note that we provide two different .dat files, one for simulation
            # and one for synthesis. this is because URAM-based weights always
            # need zero weights for synthesis, otherwise they get inferred
            # as BRAM
            weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(
                code_gen_dir
            )
            weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir)
            # sim weights are always the true weights
            self.make_weight_file(
                weights, "decoupled_verilog_dat", weight_filename_rtl_sim
            )
            ram_style = self.get_nodeattr("ram_style")
            if ram_style == "ultra":
                # UltraRAM must have no memory initializer, or only zeroes
                # otherwise BRAM will be inferred instead of URAM
                # as a workaround we provide a zero-weight init here
                synth_weights = np.zeros_like(weights, dtype=np.float32)
            else:
                synth_weights = weights
            self.make_weight_file(
                synth_weights, "decoupled_verilog_dat", weight_filename_rtl_synth
            )
        else:
            raise Exception(
                """Please set mem_mode to "decoupled",
                currently no other parameter value is supported!"""
            )

    def execute_node(self, context, graph):
        """Execute this node via rtlsim and store the result in `context`
        under the node's output name.

        Raises an Exception for exec_mode "cppsim" (not possible for this
        op) and for any other exec_mode than "rtlsim".
        """
        mode = self.get_nodeattr("exec_mode")
        mem_mode = self.get_nodeattr("mem_mode")
        node = self.onnx_node

        # TODO ensure codegen dir exists
        if mode == "cppsim":
            raise Exception(
                "cppsim not possible for RTL MVAU, please set exec_mode to rtlsim"
            )
        elif mode == "rtlsim":
            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
        else:
            raise Exception(
                """Invalid value for attribute exec_mode! Is currently set to: {}
            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
                    mode
                )
            )

        # create a npy file for each input of the node (in_ind is input index)
        for in_ind, inputs in enumerate(node.input):
            # it is assumed that the first input of the node is the data input
            # the second input are the weights
            # the third input are the thresholds
            if in_ind == 0:
                assert (
                    str(context[inputs].dtype) == "float32"
                ), """Input datatype is
                not float32 as expected."""
                expected_inp_shape = self.get_folded_input_shape()
                reshaped_input = context[inputs].reshape(expected_inp_shape)
                export_idt = self.get_input_datatype()
                # make copy before saving the array
                reshaped_input = reshaped_input.copy()
                np.save(
                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
                    reshaped_input,
                )
            elif in_ind > 2:
                raise Exception("Unexpected input found for MatrixVectorActivation")

        # mode is guaranteed to be "rtlsim" here (validated above)
        sim = self.get_rtlsim()
        nbits = self.get_instream_width()
        inp = npy_to_rtlsim_input(
            "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
        )
        super().reset_rtlsim(sim)
        super().toggle_clk(sim)
        if mem_mode == "external" or mem_mode == "decoupled":
            wnbits = self.get_weightstream_width()
            export_wdt = self.get_weight_datatype()
            wei = npy_to_rtlsim_input(
                "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits
            )
            # the weight stream is replayed once per input vector
            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
            io_dict = {
                "inputs": {"in0": inp, "weights": wei * num_w_reps},
                "outputs": {"out": []},
            }
            self.rtlsim_multi_io(sim, io_dict)
            output = io_dict["outputs"]["out"]
        else:
            output = self.rtlsim(sim, inp)
        odt = self.get_output_datatype()
        target_bits = odt.bitwidth()
        packed_bits = self.get_outstream_width()
        out_npy_path = "{}/output.npy".format(code_gen_dir)
        out_shape = self.get_folded_output_shape()
        rtlsim_output_to_npy(
            output, out_npy_path, odt, out_shape, packed_bits, target_bits
        )
        # load and reshape output
        output = np.load(out_npy_path)
        oshape = self.get_normal_output_shape()
        output = np.asarray([output], dtype=np.float32).reshape(*oshape)
        context[node.output[0]] = output

    def code_generation_ipgen(self, model, fpgapart, clk):
        """Normally: Generates C++ code and tcl script for IP generation.
        Here: Generates (System-)Verilog code for IP generation."""
        self.generate_hdl()

    def ipgen_singlenode_code(self):
        """Normally: Builds the bash script for IP generation.
        Here: intentionally a no-op."""
        pass

    def code_generation_cppsim(self, model):
        """Normally: Generates C++ code for simulation (cppsim).
        Here: intentionally a no-op."""
        pass

    def compile_singlenode_code(self):
        """Intentionally a no-op for the RTL implementation."""
        pass

    def global_includes(self):
        """Intentionally a no-op for the RTL implementation."""
        pass

    def defines(self, var):
        """Intentionally a no-op for the RTL implementation."""
        pass

    def read_npy_data(self):
        """Intentionally a no-op for the RTL implementation."""
        pass

    def strm_decl(self):
        """Intentionally a no-op for the RTL implementation."""
        pass

    def docompute(self):
        """Intentionally a no-op for the RTL implementation."""
        pass

    def dataoutstrm(self):
        """Intentionally a no-op for the RTL implementation."""
        pass

    def save_as_npy(self):
        """Intentionally a no-op for the RTL implementation."""
        pass

    def blackboxfunction(self):
        """Intentionally a no-op for the RTL implementation."""
        pass

    def pragmas(self):
        """Intentionally a no-op for the RTL implementation."""
        pass

    def code_generation_ipi(self):
        """Return the list of tcl commands that instantiate this node in a
        block design. In decoupled mode a hierarchy containing the IP and a
        memstream weight streamer is created; const/external modes defer to
        the base class implementation."""
        cmd = []
        # add streamer if needed
        mem_mode = self.get_nodeattr("mem_mode")
        if mem_mode == "decoupled":
            runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
            if self.get_nodeattr("ram_style") == "ultra":
                assert (
                    runtime_writable == 1
                ), "Layer with URAM weights must have runtime_writeable_weights=1"
            node_name = self.onnx_node.name
            sname = self.hls_sname()
            # create a hierarchy for this layer, with the same port names
            intf_names = self.get_verilog_top_module_intf_names()
            clk_name = intf_names["clk"][0]
            rst_name = intf_names["rst"][0]
            dout_name = intf_names["m_axis"][0][0]
            din_name = intf_names["s_axis"][0][0]
            cmd.append("create_bd_cell -type hier %s" % node_name)
            cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
            cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
            cmd.append(
                "create_bd_intf_pin -mode Master "
                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
                % (node_name, dout_name)
            )
            cmd.append(
                "create_bd_intf_pin -mode Slave "
                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
            )
            # instantiate the hls ip
            cmd.append(
                "create_bd_cell -type ip -vlnv %s /%s/%s"
                % (self.get_nodeattr("ip_vlnv"), node_name, node_name)
            )
            # instantiate a streamer and connect it to the HLS IP
            strm_vlnv = "xilinx.com:user:memstream:1.0"
            strm_inst = node_name + "_wstrm"
            cmd.append(
                "create_bd_cell -type ip -vlnv %s /%s/%s"
                % (strm_vlnv, node_name, strm_inst)
            )
            cmd.append(
                "set_property -dict [list "
                "CONFIG.NSTREAMS {1} "
                "CONFIG.MEM_DEPTH {%d} "
                "CONFIG.MEM_WIDTH {%d} "
                "CONFIG.MEM_INIT {%s} "
                "CONFIG.RAM_STYLE {%s} "
                "CONFIG.STRM0_DEPTH {%d} "
                "CONFIG.STRM0_WIDTH {%d} "
                "CONFIG.STRM0_OFFSET {0} "
                "] [get_bd_cells /%s/%s]"
                % (
                    self.calc_wmem(),
                    self.get_weightstream_width_padded(),
                    self.get_nodeattr("code_gen_dir_ipgen") + "/",
                    self.get_nodeattr("ram_style"),
                    self.calc_wmem(),
                    self.get_weightstream_width_padded(),
                    node_name,
                    strm_inst,
                )
            )
            cmd.append(
                "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] "
                "[get_bd_intf_pins %s/%s/weights_%s]"
                % (node_name, strm_inst, node_name, node_name, sname)
            )
            cmd.append(
                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]"
                % (node_name, rst_name, node_name, strm_inst)
            )
            cmd.append(
                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]"
                % (node_name, clk_name, node_name, strm_inst)
            )
            cmd.append(
                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
                % (node_name, rst_name, node_name, node_name, rst_name)
            )
            cmd.append(
                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
                % (node_name, clk_name, node_name, node_name, clk_name)
            )
            cmd.append(
                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
                "[get_bd_intf_pins %s/%s/%s]"
                % (node_name, din_name, node_name, node_name, din_name)
            )
            cmd.append(
                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
                "[get_bd_intf_pins %s/%s/%s]"
                % (node_name, dout_name, node_name, node_name, dout_name)
            )
            if runtime_writable:
                # expose axi lite interface for writeable weights
                axilite_name = intf_names["axilite"][0]
                cmd.append(
                    "create_bd_intf_pin -mode Slave "
                    "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s"
                    % (node_name, axilite_name)
                )
                cmd.append(
                    "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
                    "[get_bd_intf_pins %s/%s/%s]"
                    % (node_name, axilite_name, node_name, strm_inst, axilite_name)
                )
                # TODO calculate and pass in segment size here
                cmd.append("assign_bd_address")
            cmd.append("save_bd_design")
        elif mem_mode == "const" or mem_mode == "external":
            # base class impl sufficient for const/external modes
            return super().code_generation_ipi()
        else:
            raise Exception("Unrecognized mem_mode for MatrixVectorActivation")
        return cmd

    def get_verilog_top_module_intf_names(self):
        """Return the base-class interface name dict, extended with the
        weight stream (external mode) or the AXI-lite interface (decoupled
        mode with runtime-writable weights)."""
        intf_names = super().get_verilog_top_module_intf_names()
        mem_mode = self.get_nodeattr("mem_mode")
        sname = self.hls_sname()
        if mem_mode == "external":
            intf_names["s_axis"].append(
                ("weights_" + sname, self.get_weightstream_width_padded())
            )
        if mem_mode == "decoupled":
            # only expose axilite interface if attribute is set
            runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
            if runtime_writable:
                intf_names["axilite"] = ["s_axilite"]
        return intf_names

    def get_op_and_param_counts(self):
        """Return a dict with the MAC op count and the weight (and, when
        noActivation is 0, threshold) parameter counts, keyed by type name."""
        in_features = self.get_nodeattr("MW")
        out_features = self.get_nodeattr("MH")
        weight_bits = self.get_weight_datatype().bitwidth()
        inp_bits = self.get_input_datatype().bitwidth()
        num_inp_vec = self.get_nodeattr("numInputVectors")
        num_repetitions = int(np.prod(num_inp_vec))
        mac_count = in_features * out_features * num_repetitions
        # canonicalize op type: lowest bitwidth operand first s.t.
        # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types
        bw1 = min(inp_bits, weight_bits)
        bw2 = max(inp_bits, weight_bits)
        mac_op_type = "op_mac_%dbx%db" % (bw1, bw2)
        weight_param_type = "param_weight_%db" % (weight_bits)
        weight_count = in_features * out_features
        ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
        if self.get_nodeattr("noActivation") == 0:
            tdt = DataType[self.get_nodeattr("accDataType")]
            thres_bits = tdt.bitwidth()
            thres_param_type = "param_threshold_%db" % (thres_bits)
            thres_count = out_features
            ret_dict[thres_param_type] = thres_count
        return ret_dict

    def derive_characteristic_fxns(self, period):
        """Build an all-zero rtlsim i/o dict (including a weight stream for
        decoupled/external modes) and pass it to the base class
        implementation."""
        n_inps = np.prod(self.get_folded_input_shape()[:-1])
        io_dict = {
            "inputs": {
                "in0": [0 for i in range(n_inps)],
            },
            "outputs": {"out": []},
        }
        mem_mode = self.get_nodeattr("mem_mode")
        if mem_mode in ["decoupled", "external"]:
            n_weight_inps = self.calc_wmem()
            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
            io_dict["inputs"]["weights"] = [
                0 for i in range(num_w_reps * n_weight_inps)
            ]
        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)

    def generate_hdl(self):
        """Fill in the HDL template and write <gen_top_module>_impl.sv into
        code_gen_dir_ipgen, then point ipgen_path/ip_path at that directory.

        Raises an Exception if ram_style is anything other than "auto".
        """
        # TODO: add distinction between (PE=MH or PE=1) and where MH dimension
        # is folded
        template_path, code_gen_dict = self.prepare_codegen_default()

        # add general parameters to dictionary
        code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()]
        # save top module name so we can refer to it after this node has been renamed
        # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
        self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
        # TODO: currently only ram_style=auto is supported
        ram_style = self.get_nodeattr("ram_style")
        if ram_style != "auto":
            raise Exception("Unrecognized ram_style for MatrixVectorActivation")

        # apply code generation to templates
        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
        with open(template_path, "r") as f:
            template = f.read()
        for key, lines in code_gen_dict.items():
            # transform list into long string separated by '\n'
            template = template.replace(key, "\n".join(lines))
        # TODO: no wrapper template is available yet, so only the
        # implementation file is generated here (no *_wrapper.v is written)
        with open(
            os.path.join(
                code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv"
            ),
            "w",
        ) as f:
            f.write(template)

        # set ipgen_path and ip_path so that HLS-Synth transformation
        # and stich_ip transformation do not complain
        self.set_nodeattr("ipgen_path", code_gen_dir)
        self.set_nodeattr("ip_path", code_gen_dir)

    def prepare_codegen_default(self):
        """Return (template_path, code_gen_dict) for HDL generation.

        Every dictionary value is a list of strings, because generate_hdl
        joins each value with newlines before substituting it into the
        template. Requires the FINN_ROOT environment variable.
        """
        # TODO: Differentiate between PE folding and fully unrolled along MH dimension
        template_path = (
            os.environ["FINN_ROOT"] + "/finn-rtllib/mvau/dsp58_mvau_template.vhdl"
        )
        code_gen_dict = {}

        code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))]
        code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))]
        code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))]
        code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))]
        code_gen_dict["$ACTIVATION_WIDTH$"] = [
            str(self.get_input_datatype(0).bitwidth())
        ]
        code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())]
        code_gen_dict["$ACCU_WIDTH_BA$"] = [str(self.get_output_datatype().bitwidth())]

        return template_path, code_gen_dict
Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 6 Apr 2023 12:34:01 +0100 Subject: [PATCH 002/112] [rtl custom op]: initial implementation of mvu_8sx9 --- finn-rtllib/mvu/mvu_8sx9.sv | 284 ++++++++++++++++++++++++++++++++++++ 1 file changed, 284 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_8sx9.sv diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv new file mode 100644 index 0000000000..c992990d9f --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -0,0 +1,284 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). 
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP58. + *****************************************************************************/ + +module mvu_8sx9 #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + int unsigned SEGMENTLEN = 0 // Default to 0 (which implies a single segment) + ) + ( + input logic clk, + input logic rst, + input logic en, + input logic last, + input logic zero, + input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, + input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, + output logic vld, + output logic [PE-1:0][57:0] p + ); + +//-------------------- Declare global signals --------------------\\ +localparam int unsigned CHAINLEN = (SIMD+2)/3; +localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length +uwire [26:0] a_in_i [CHAINLEN]; +uwire [23:0] b_in_i [PE][CHAINLEN]; +uwire [57:0] pcout [PE][CHAINLEN]; + +//-------------------- Shift register for opmode select signal --------------------\\ +localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) +logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). 
Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) + +always_ff @(posedge clk) begin + if(rst) L <= '{default: 0}; + else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last }; +end +assign vld = L[0]; + +//-------------------- Shift register for ZERO flag --------------------\\ +logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) + +if (MAX_PIPELINE_STAGES > 1) begin : genZreg + always_ff @(posedge clk) begin + if (rst) Z <= '{default: 0}; + else if(en) begin + Z[0] <= zero; + if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2]; + end + end +end; + +//-------------------- Buffer for input activations --------------------\\ +localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; +typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t; + +for (genvar i=0; i1 ? TOTAL_PREGS-1 : 0; + + if (EXTERNAL_PREGS > 0) begin : genExternalPregAct + a_buffer_t A [0:EXTERNAL_PREGS-1]; + always_ff @(posedge clk) begin + if (rst) A <= '{default: 0}; + else if(en) begin + A[EXTERNAL_PREGS-1] <= a[3*i +: 3]; + if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; + end + end + assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]} + : { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ; + end : genExternalPregAct + else begin : genInpDSPAct + assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? 
{ {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]} + : { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ; + end : genInpDSPAct + +end : genActSIMD + +//-------------------- Buffer for weights --------------------\\ +localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH; +typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t; + +for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0; + + if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight + b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1]; + always_ff @(posedge clk) begin + if (rst) B <= '{default: 0}; + else if (en) begin + B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3]; + if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1]; + end + end + assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] }; + end : genExternalPregWeight + else begin : genInpDSPWeight + assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] }; + end : genInpDSPWeight + end : genWeightSIMD + +end : genWeightPE + +//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\ +for (genvar j=0; j0 ? 
2 : 1; // 1 : 0 + localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1; + localparam bit FIRST = i == 0; + localparam bit LAST = i == CHAINLEN-1; + uwire [57:0] pp; + + if (LAST) begin : genPOUT + assign p[j] = pp; + end + + DSP58 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("A"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for + // legacy mode. + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND(58'h000000000000000), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
+ .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK + .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE + .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE + .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 + FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN + 2'b01, // Y : M + 2'b01 // X: M + }), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA + .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC + .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM + .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(0), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) + .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between 
B/BCIN and BCOUT (0-2) + .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(1), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(PREG), // Number of pipeline stages for P (0-1) + .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). + ) + DSP58_inst ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(pcout[j][i]), // 58-bit output: Cascade output + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN(FIRST ? 'x : pcout[j][i-1]), // 58-bit input: P cascade + // Control inputs: Control Inputs/Status Bits + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .CLK(clk), // 1-bit input: Clock + .INMODE({ + INTERNAL_PREGS==2 ? 1'b0 : 1'b1, + 2'b00, + TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, + INTERNAL_PREGS==2 ? 
1'b0 : 1'b1 + }), // 5-bit input: INMODE control + .NEGATE('0), // 3-bit input: Negates the input of the multiplier + .OPMODE({ + LAST ? {1'b0, L[1]} : 2'b00, + 7'b000_0000 + }), // 9-bit input: Operation mode + // Data inputs: Data Ports + .A({ 7'bx, a_in_i[i] }), // 34-bit input: A data + .B(b_in_i[j][i]), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D('x), // 27-bit input: D data + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. + .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG + .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD('0), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(PREG && en), // 1-bit input: Clock enable for PREG + .RSTA(rst), // 1-bit input: Reset for AREG + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTB(rst), // 1-bit input: Reset for BREG + .RSTC('0), // 1-bit input: Reset for CREG + .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTD('0), // 1-bit input: Reset for DREG and ADREG + .RSTINMODE(rst), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(PREG && rst) // 1-bit input: Reset for PREG + ); + end : genDSPChain +end : genDSPPE + +endmodule From 
a94fc3bb0759ecd4b9af212d1629236894a1b520 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 6 Apr 2023 12:34:22 +0100 Subject: [PATCH 003/112] [rtl custom op]: testbench for mvu_8sx9 --- finn-rtllib/mvu/mvu_8sx9_tb.sv | 165 +++++++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_8sx9_tb.sv diff --git a/finn-rtllib/mvu/mvu_8sx9_tb.sv b/finn-rtllib/mvu/mvu_8sx9_tb.sv new file mode 100644 index 0000000000..ea3ecbbd70 --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx9_tb.sv @@ -0,0 +1,165 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). 
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU core compute kernel. + *****************************************************************************/ + +module mvu_8sx9_tb(); + + //-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MH = 256; + localparam int unsigned PE = 16; + localparam int unsigned MW = 600; + localparam int unsigned SIMD = 60; + localparam int unsigned SEGMENTLEN = 4; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + typedef logic signed [PE-1:0][57:0] output_t; + typedef output_t output_vector_t [NF]; + + function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); + automatic output_vector_t res = '{default: 0}; + for (int j = 0; j 1) && !rst; + end + + // Compare computed output against golden output when vld flag is raised by DUT + always_ff @(posedge clk iff (vld && en)) begin + foreach(p[i]) begin + assert ($signed(p[i]) == 
$signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + NF_CNT += 1; + end + + // Instantiate DUT + mvu_8sx9 #( + .PE(PE), + .SIMD(SIMD), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .SEGMENTLEN(SEGMENTLEN) + ) + dut ( + .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p + ); + +endmodule From 98f9accb40bed3445215e15d30398e09948e0b9f Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 6 Apr 2023 12:35:30 +0100 Subject: [PATCH 004/112] [rtl custom op]: initial implementation of flow control component for mvu_8sx9 --- finn-rtllib/mvu/mvu_8sx9_axi.sv | 179 ++++++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi.sv diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv new file mode 100644 index 0000000000..8765c50a26 --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx9_axi.sv @@ -0,0 +1,179 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) AXI-lite interface wrapper. 
+ *****************************************************************************/

// Stream wrapper around the mvu_8sx9 compute core.
//
// Activations arriving on s_axis_input are passed through a replay_buffer
// (LEN = SF, REP = NF) and consumed in lock-step with words from
// s_axis_weights. The low ACCU_WIDTH bits of each of the PE 58-bit core
// results are forwarded through two output register stages (A, B) to
// m_axis_output.
//
// Parameters:
//   MW, MH            matrix width / height; must be multiples of SIMD / PE
//   PE, SIMD          parallelism of the core
//   ACTIVATION_WIDTH  activation bits (checked below: <= 9, <= 8 if unsigned)
//   WEIGHT_WIDTH      weight bits (checked below: <= 8)
//   ACCU_WIDTH        bits kept from each 58-bit accumulator lane
//   SEGMENTLEN        forwarded to the core; 0 defaults to the chain length
//   RAM_STYLE         forwarded to the replay buffer
module mvu_8sx9_axi #(
	int unsigned MW,
	int unsigned MH,
	int unsigned PE,
	int unsigned SIMD,
	int unsigned ACTIVATION_WIDTH,
	int unsigned WEIGHT_WIDTH,
	int unsigned ACCU_WIDTH,
	bit SIGNED_ACTIVATIONS = 0,
	int unsigned SEGMENTLEN = 0,
	parameter RAM_STYLE = "auto",

	// Stream widths; the *_BA variants are rounded up to whole bytes
	localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
	localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
	localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
	localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
	localparam int unsigned SF = MW/SIMD,
	localparam int unsigned NF = MH/PE,
	localparam int unsigned OUTPUT_LANES = PE,
	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
)
(
	// Global Control
	input	logic  ap_clk,
	input	logic  ap_rst_n,

	// Weight Stream
	input	logic [WEIGHT_STREAM_WIDTH_BA-1:0]  s_axis_weights_tdata,
	input	logic  s_axis_weights_tvalid,
	output	logic  s_axis_weights_tready,

	// Input Stream
	input	logic [INPUT_STREAM_WIDTH_BA-1:0]  s_axis_input_tdata,
	input	logic  s_axis_input_tvalid,
	output	logic  s_axis_input_tready,

	// Output Stream
	output	logic [OUTPUT_STREAM_WIDTH_BA-1:0]  m_axis_output_tdata,
	output	logic  m_axis_output_tvalid,
	input	logic  m_axis_output_tready
);

//-------------------- Parameter sanity checks --------------------\\
	initial begin
		if (MW % SIMD != 0) begin
			$error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
			$finish;
		end
		if (MH % PE != 0) begin
			$error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
			$finish;
		end
		if (ACTIVATION_WIDTH > 9) begin
			$error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
			$finish;
		end
		if (WEIGHT_WIDTH > 8) begin
			$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
			$finish;
		end
		if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin
			$error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
			$finish;
		end
		if (SEGMENTLEN == 0) begin
			$warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
		end
		if (SEGMENTLEN > (SIMD+2)/3) begin
			$error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
			$finish;
		end
	end

	// Internal clock and active-high reset derived from the ap_* ports
	uwire clk = ap_clk;
	uwire rst = !ap_rst_n;

	typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;

	uwire mvauin_t amvau;
	uwire alast;
	uwire afin;
	uwire avld;
	uwire ardy;

	// The cast to mvauin_t drops the byte-alignment padding of the input stream
	replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay (
		.clk, .rst,
		.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
	);

	//-------------------- Input control --------------------\\
	// An activation word and a weight word are only consumed together:
	// each side is acknowledged only while the other side is valid and the
	// core is enabled.
	uwire en;
	uwire istb = avld && s_axis_weights_tvalid;
	assign ardy = en && s_axis_weights_tvalid;
	assign s_axis_weights_tready = en && avld;

	//-------------------- Core MVU --------------------\\
	uwire ovld;
	uwire [PE-1:0][57:0] odat;
	typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
	// `zero` is asserted whenever no activation/weight pair is available
	mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core (
		.clk, .rst, .en,
		.last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
		.vld(ovld), .p(odat)
	);

	//-------------------- Output register slice --------------------\\
	// First stage: captures the core result, truncated to ACCU_WIDTH bits per lane
	struct {
		logic vld;
		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
	} A = '{ vld: 0, default: 'x};

	// The core is stalled while stage A is occupied and the core presents a new result
	assign en = !A.vld || !ovld;

	uwire b_load;
	always_ff @(posedge clk) begin
		if(rst) A <= '{ vld: 0, default: 'x };
		else if(!A.vld || b_load) begin
			A.vld <= ovld && en;
			for(int unsigned i = 0; i < PE; i++) begin
				// CR-1148862:
				// A.dat[i] <= odat[i];
				automatic logic [57:0] v = odat[i];
				A.dat[i] <= v[ACCU_WIDTH-1:0];
			end
		end
	end

	// Second stage: drives the output stream
	struct {
		logic vld;
		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
	} B = '{ vld: 0, default: 'x};

	// Stage B may load when it is empty or its content is being accepted downstream
	assign b_load = !B.vld || m_axis_output_tready;
	always_ff @(posedge clk) begin
		// Reset must clear the valid flag explicitly. The former
		// '{ default: 'x } also set B.vld to X, leaving m_axis_output_tvalid
		// undefined after reset.
		if(rst) B <= '{ vld: 0, default: 'x };
		else begin
			if(b_load) B <= '{ vld: A.vld, dat: A.dat};
		end
	end

	assign m_axis_output_tvalid = B.vld;
	assign m_axis_output_tdata = B.dat;

endmodule
\ No newline at end of file From 96925a929877ce084466438128678250b09784a9 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 6 Apr 2023 12:36:00 +0100 Subject: [PATCH 005/112] [rtl custom op]: implementation of replay buffer for mvu --- finn-rtllib/mvu/replay_buffer.sv | 109 +++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 finn-rtllib/mvu/replay_buffer.sv diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv new file mode 100644 index 0000000000..685ac03137 --- /dev/null +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -0,0 +1,109 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3.
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Replay buffer for counted sequences on an AXI-lite stream. + * @author Thomas B. 
Preußer + *****************************************************************************/

// Passes an input sequence of LEN words through once and then replays it
// from an internal buffer until it has been output REP times in total.
// With REP == 1 the module is a pure pass-through (no buffer is generated).
module replay_buffer #(
	int unsigned  LEN,	// Sequence length
	int unsigned  REP,	// Sequence replay count
	int unsigned  W,	// Data width
	parameter  RAM_STYLE = "auto"	// ram style for buffer {block, distributed, ultra, auto}
)(
	input	logic  clk,
	input	logic  rst,

	// Input stream; only accepted during the first repetition
	input	logic [W-1:0]  idat,
	input	logic  ivld,
	output	logic  irdy,

	// Output stream; olast marks the end of each sequence pass,
	// ofin the end of the last pass
	output	logic [W-1:0]  odat,
	output	logic  olast,
	output	logic  ofin,
	output	logic  ovld,
	input	logic  ordy
);

	// Combined counter: the low $clog2(LEN) bits index within the sequence,
	// the bits above them count repetitions.
	typedef logic [$clog2(REP)+$clog2(LEN)-1:0]  count_t;
	count_t  Count = 0;
	// True once every bit that is set in LEN-1 is also set in the low counter field
	uwire  done_len = ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0;
	uwire  done_rep;
	uwire  done_all = done_len && done_rep;

	uwire  shift;
	// Counter restarts on reset or after the final word of the final repetition
	uwire  clr = rst || (done_all && shift);
	always_ff @(posedge clk) begin
		if(clr)         Count <= 0;
		// At the end of a pass (REP > 1) the increment is 2**$clog2(LEN)-LEN+1
		// instead of 1; presumably this skips the unused codes of the low
		// field so that it wraps to zero and carries into the repetition
		// field — NOTE(review): confirm.
		else if(shift)  Count <= Count + ((REP > 1) && done_len? 2**$clog2(LEN)-LEN+1 : 1);
	end

	typedef logic [W-1:0]  data_t;
	uwire data_t  rdat;
	uwire  first_rep;
	if(REP == 1) begin
		// Single pass: nothing to replay, buffer output is unused
		assign	done_rep  = 1;
		assign	first_rep = 1;
		assign	rdat = 'x;
	end
	else begin
		// True once every bit that is set in REP-1 is also set in the upper counter field
		assign	done_rep = ((REP-1) & ~Count[$left(Count):$clog2(LEN)]) == 0;

		// Set after clr and cleared by the shift that completes a sequence pass
		logic  FirstRep = 1;
		always_ff @(posedge clk) begin
			if(clr)         FirstRep <= 1;
			else if(shift)  FirstRep <= FirstRep && !done_len;
		end
		assign	first_rep = FirstRep;

		(* RAM_STYLE = RAM_STYLE *)
		data_t  Buf[LEN];
		if(LEN == 1) begin : genTrivial
			// Single-word sequence: capture the word during the first repetition only
			always_ff @(posedge clk) begin
				if(shift && FirstRep)  Buf[0] <= idat;
			end
		end : genTrivial
		else begin : genShift
			// Every output word is fed back in at index 0 while the older
			// entries move towards index LEN-1, which is the read position.
			always_ff @(posedge clk) begin
				if(shift)  Buf <= { odat, Buf[0:LEN-2] };
			end
		end : genShift

		assign	rdat = Buf[LEN-1];
	end

	// Input is consumed only during the first repetition; replayed words are always valid
	assign	irdy  = ordy && first_rep;
	assign	odat  = first_rep? idat : rdat;
	assign	olast = done_len;
	assign	ofin  = done_all;
	assign	ovld  = first_rep? ivld : 1;
	assign	shift = ovld && ordy;

endmodule : replay_buffer
\ No newline at end of file From a3d11567468899bbcf33c83b509c26f908a807a3 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 6 Apr 2023 12:37:16 +0100 Subject: [PATCH 006/112] [rtl custom op]: testbench for mvu_8sx9_axi (including axi_wrapper & compute kernel) --- finn-rtllib/mvu/mvu_8sx9_axi_tb.sv | 208 +++++++++++++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv new file mode 100644 index 0000000000..ea97e0708c --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv @@ -0,0 +1,208 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU AXI-lite interface wrapper. + *****************************************************************************/ + +module mvu_8sx9_axi_tb(); + + //-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MW = 600; + localparam int unsigned MH = 256; + localparam int unsigned SIMD = 60; + localparam int unsigned PE = 16; + localparam int unsigned SEGMENTLEN = 4; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n = 0; + initial begin + repeat(16) @(posedge clk); + ap_rst_n <= 1; + end + + uwire 
ap_clk = clk; + + // Generate activations + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } activations; + + initial begin + activations.vld = 0; + activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain + @(posedge clk iff ap_rst_n); + + for (int i=0; i 1; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i= 1; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_8sx9_axi #( + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + +endmodule From 2aea664b2260a4ea759909d0a3168b5f62b114a2 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 6 Apr 2023 12:37:55 +0100 Subject: [PATCH 007/112] [rtl custom op]: initial implementation of verilog wrapper for mvu_8sx9_axi --- finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 90 ++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v new file mode 100644 index 0000000000..ff3779d211 --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v @@ -0,0 +1,90 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Verilog AXI-lite wrapper for MVU. 
+ *****************************************************************************/ + +module $MODULE_NAME_AXI_WRAPPER$ #( + parameter MW = $MW$, + parameter MH = $MH$, + parameter PE = $PE$, + parameter SIMD = $SIMD$, + parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, + parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, + parameter ACCU_WIDTH = $ACCU_WIDTH$, + parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, + parameter SEGMENTLEN = $SEGMENTLEN$, + parameter RAM_STYLE = $IBUF_RAM_STYLE$, + + // Safely deducible parameters + parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, + parameter OUTPUT_LANES = PE, + parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 +)( + // Global Control + input logic ap_clk, + input logic ap_rst_n, + + // Weight Stream + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, + output logic s_axis_weights_tready, + + // Input Stream + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, + output logic s_axis_input_tready, + + // Output Stream + output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, + output logic m_axis_output_tvalid, + input logic m_axis_output_tready +); + +mvu_8sx9_axi #( + .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE) + ) inst ( + .ap_clk(ap_clk), + .ap_rst_n(ap_rst_n), + .s_axis_weights_tdata(s_axis_weights_tdata), + .s_axis_weights_tvalid(s_axis_weights_tvalid), + .s_axis_weights_tready(s_axis_weights_tready), + .s_axis_input_tdata(s_axis_input_tdata), + .s_axis_input_tvalid(s_axis_input_tvalid), + .s_axis_input_tready(s_axis_input_tready), + .m_axis_output_tdata(m_axis_output_tdata), + .m_axis_output_tvalid(m_axis_output_tvalid), + 
.m_axis_output_tready(m_axis_output_tready) +) + +endmodule : mvau_8sx9_axi_wrapper \ No newline at end of file From 8b57849bb47c3119b177e78dcbaa48954f69b811 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 11 Apr 2023 15:50:24 +0100 Subject: [PATCH 008/112] [rtl mvu]: fix tab indentation --- finn-rtllib/mvu/mvu_8sx9.sv | 424 ++++++++++++------------- finn-rtllib/mvu/mvu_8sx9_axi.sv | 32 +- finn-rtllib/mvu/mvu_8sx9_axi_tb.sv | 342 ++++++++++---------- finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 26 +- finn-rtllib/mvu/mvu_8sx9_tb.sv | 258 +++++++-------- 5 files changed, 541 insertions(+), 541 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index c992990d9f..d082d4fb2e 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -52,233 +52,233 @@ module mvu_8sx9 #( ); //-------------------- Declare global signals --------------------\\ -localparam int unsigned CHAINLEN = (SIMD+2)/3; -localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length -uwire [26:0] a_in_i [CHAINLEN]; -uwire [23:0] b_in_i [PE][CHAINLEN]; -uwire [57:0] pcout [PE][CHAINLEN]; + localparam int unsigned CHAINLEN = (SIMD+2)/3; + localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length + uwire [26:0] a_in_i [CHAINLEN]; + uwire [23:0] b_in_i [PE][CHAINLEN]; + uwire [57:0] pcout [PE][CHAINLEN]; //-------------------- Shift register for opmode select signal --------------------\\ -localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) -logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). 
Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) + localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) + logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) -always_ff @(posedge clk) begin - if(rst) L <= '{default: 0}; - else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last }; -end -assign vld = L[0]; + always_ff @(posedge clk) begin + if(rst) L <= '{default: 0}; + else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last }; + end + assign vld = L[0]; //-------------------- Shift register for ZERO flag --------------------\\ -logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) + logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) -if (MAX_PIPELINE_STAGES > 1) begin : genZreg - always_ff @(posedge clk) begin - if (rst) Z <= '{default: 0}; - else if(en) begin - Z[0] <= zero; - if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2]; - end - end -end; + if (MAX_PIPELINE_STAGES > 1) begin : genZreg + always_ff @(posedge clk) begin + if (rst) Z <= '{default: 0}; + else if(en) begin + Z[0] <= zero; + if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2]; + end + end + end; //-------------------- Buffer for input activations --------------------\\ -localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; -typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t; + localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; + typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t; -for (genvar i=0; i1 ? 
TOTAL_PREGS-1 : 0; - - if (EXTERNAL_PREGS > 0) begin : genExternalPregAct - a_buffer_t A [0:EXTERNAL_PREGS-1]; - always_ff @(posedge clk) begin - if (rst) A <= '{default: 0}; - else if(en) begin - A[EXTERNAL_PREGS-1] <= a[3*i +: 3]; - if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; - end - end - assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]} - : { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ; - end : genExternalPregAct - else begin : genInpDSPAct - assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]} - : { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ; - end : genInpDSPAct + for (genvar i=0; i1 ? TOTAL_PREGS-1 : 0; -end : genActSIMD + if (EXTERNAL_PREGS > 0) begin : genExternalPregAct + a_buffer_t A [0:EXTERNAL_PREGS-1]; + always_ff @(posedge clk) begin + if (rst) A <= '{default: 0}; + else if(en) begin + A[EXTERNAL_PREGS-1] <= a[3*i +: 3]; + if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; + end + end + assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]} + : { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ; + end : genExternalPregAct + else begin : genInpDSPAct + assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? 
{ {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]} + : { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ; + end : genInpDSPAct + + end : genActSIMD //-------------------- Buffer for weights --------------------\\ -localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH; -typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t; + localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH; + typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t; -for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0; - - if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight - b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1]; - always_ff @(posedge clk) begin - if (rst) B <= '{default: 0}; - else if (en) begin - B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3]; - if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1]; - end - end - assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] }; - end : genExternalPregWeight - else begin : genInpDSPWeight - assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] }; - end : genInpDSPWeight - end : genWeightSIMD + for (genvar j=0; j1 ? 
TOTAL_PREGS-1 : 0; -end : genWeightPE + if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight + b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1]; + always_ff @(posedge clk) begin + if (rst) B <= '{default: 0}; + else if (en) begin + B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3]; + if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1]; + end + end + assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] }; + end : genExternalPregWeight + else begin : genInpDSPWeight + assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] }; + end : genInpDSPWeight + end : genWeightSIMD + + end : genWeightPE //-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\ -for (genvar j=0; j0 ? 2 : 1; // 1 : 0 - localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1; - localparam bit FIRST = i == 0; - localparam bit LAST = i == CHAINLEN-1; - uwire [57:0] pp; - - if (LAST) begin : genPOUT - assign p[j] = pp; - end - - DSP58 #( - // Feature Control Attributes: Data Path Selection - .AMULTSEL("A"), // Selects A input to multiplier (A, AD) - .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) - .BMULTSEL("B"), // Selects B input to multiplier (AD, B) - .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) - .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for - // legacy mode. 
- .PREADDINSEL("A"), // Selects input to pre-adder (A, B) - .RND(58'h000000000000000), // Rounding Constant - .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) - .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) - .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) - .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) - // Pattern Detector Attributes: Pattern Detection Configuration - .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH - .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). - .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) - .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect - .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 - .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) - .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) - // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins - .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE - .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN - .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK - .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE - .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE - .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 - FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 
0 : PCIN - 2'b01, // Y : M - 2'b01 // X: M - }), // Optional inversion for OPMODE - .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN - .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE - .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA - .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB - .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A - .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC - .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD - .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE - .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM - .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP - // Register Control Attributes: Pipeline Register Configuration - .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) - .ADREG(0), // Pipeline stages for pre-adder (0-1) - .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) - .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) - .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between B/BCIN and BCOUT (0-2) - .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) - .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) - .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) - .CREG(0), // Pipeline stages for C (0-1) - .DREG(0), // Pipeline stages for D (0-1) - .INMODEREG(1), // Pipeline stages for INMODE (0-1) - .MREG(1), // Multiplier pipeline stages (0-1) - .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) - .PREG(PREG), // Number of pipeline stages for P (0-1) - .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). 
- ) - DSP58_inst ( - // Cascade outputs: Cascade Ports - .ACOUT(), // 34-bit output: A port cascade - .BCOUT(), // 24-bit output: B cascade - .CARRYCASCOUT(), // 1-bit output: Cascade carry - .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade - .PCOUT(pcout[j][i]), // 58-bit output: Cascade output - // Control outputs: Control Inputs/Status Bits - .OVERFLOW(), // 1-bit output: Overflow in add/acc - .PATTERNBDETECT(), // 1-bit output: Pattern bar detect - .PATTERNDETECT(), // 1-bit output: Pattern detect - .UNDERFLOW(), // 1-bit output: Underflow in add/acc - // Data outputs: Data Ports - .CARRYOUT(), // 4-bit output: Carry - .P(pp), // 58-bit output: Primary data - .XOROUT(), // 8-bit output: XOR data - // Cascade inputs: Cascade Ports - .ACIN('x), // 34-bit input: A cascade data - .BCIN('x), // 24-bit input: B cascade - .CARRYCASCIN('x), // 1-bit input: Cascade carry - .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade - .PCIN(FIRST ? 'x : pcout[j][i-1]), // 58-bit input: P cascade - // Control inputs: Control Inputs/Status Bits - .ALUMODE(4'h0), // 4-bit input: ALU control - .CARRYINSEL('0), // 3-bit input: Carry select - .CLK(clk), // 1-bit input: Clock - .INMODE({ - INTERNAL_PREGS==2 ? 1'b0 : 1'b1, - 2'b00, - TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, - INTERNAL_PREGS==2 ? 1'b0 : 1'b1 - }), // 5-bit input: INMODE control - .NEGATE('0), // 3-bit input: Negates the input of the multiplier - .OPMODE({ - LAST ? {1'b0, L[1]} : 2'b00, - 7'b000_0000 - }), // 9-bit input: Operation mode - // Data inputs: Data Ports - .A({ 7'bx, a_in_i[i] }), // 34-bit input: A data - .B(b_in_i[j][i]), // 24-bit input: B data - .C('x), // 58-bit input: C data - .CARRYIN('0), // 1-bit input: Carry-in - .D('x), // 27-bit input: D data - // Reset/Clock Enable inputs: Reset/Clock Enable Inputs - .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. - .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG - .CEA2(INTERNAL_PREGS==2 ? 
en : '0), // 1-bit input: Clock enable for 2nd stage AREG - .CEAD('0), // 1-bit input: Clock enable for ADREG - .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE - .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG - .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG - .CEC('0), // 1-bit input: Clock enable for CREG - .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG - .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG - .CED('0), // 1-bit input: Clock enable for DREG - .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG - .CEM(en), // 1-bit input: Clock enable for MREG - .CEP(PREG && en), // 1-bit input: Clock enable for PREG - .RSTA(rst), // 1-bit input: Reset for AREG - .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG - .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG - .RSTB(rst), // 1-bit input: Reset for BREG - .RSTC('0), // 1-bit input: Reset for CREG - .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG - .RSTD('0), // 1-bit input: Reset for DREG and ADREG - .RSTINMODE(rst), // 1-bit input: Reset for INMODE register - .RSTM(rst), // 1-bit input: Reset for MREG - .RSTP(PREG && rst) // 1-bit input: Reset for PREG - ); - end : genDSPChain -end : genDSPPE + for (genvar j=0; j0 ? 2 : 1; // 1 : 0 + localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1; + localparam bit FIRST = i == 0; + localparam bit LAST = i == CHAINLEN-1; + uwire [57:0] pp; + + if (LAST) begin : genPOUT + assign p[j] = pp; + end + + DSP58 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("A"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. 
Set to INT24 for + // legacy mode. + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND(58'h000000000000000), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). + .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK + .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE + .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE + .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 + FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 
0 : PCIN + 2'b01, // Y : M + 2'b01 // X: M + }), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA + .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC + .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM + .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(0), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) + .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(1), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(PREG), // Number of pipeline stages for P (0-1) + .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). 
+ ) + DSP58_inst ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(pcout[j][i]), // 58-bit output: Cascade output + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN(FIRST ? 'x : pcout[j][i-1]), // 58-bit input: P cascade + // Control inputs: Control Inputs/Status Bits + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .CLK(clk), // 1-bit input: Clock + .INMODE({ + INTERNAL_PREGS==2 ? 1'b0 : 1'b1, + 2'b00, + TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, + INTERNAL_PREGS==2 ? 1'b0 : 1'b1 + }), // 5-bit input: INMODE control + .NEGATE('0), // 3-bit input: Negates the input of the multiplier + .OPMODE({ + LAST ? {1'b0, L[1]} : 2'b00, + 7'b000_0000 + }), // 9-bit input: Operation mode + // Data inputs: Data Ports + .A({ 7'bx, a_in_i[i] }), // 34-bit input: A data + .B(b_in_i[j][i]), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D('x), // 27-bit input: D data + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. + .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG + .CEA2(INTERNAL_PREGS==2 ? 
en : '0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD('0), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(PREG && en), // 1-bit input: Clock enable for PREG + .RSTA(rst), // 1-bit input: Reset for AREG + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTB(rst), // 1-bit input: Reset for BREG + .RSTC('0), // 1-bit input: Reset for CREG + .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTD('0), // 1-bit input: Reset for DREG and ADREG + .RSTINMODE(rst), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(PREG && rst) // 1-bit input: Reset for PREG + ); + end : genDSPChain + end : genDSPPE endmodule diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv index 8765c50a26..6c7eaeaeca 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi.sv +++ b/finn-rtllib/mvu/mvu_8sx9_axi.sv @@ -41,36 +41,36 @@ module mvu_8sx9_axi #( int unsigned ACCU_WIDTH, bit SIGNED_ACTIVATIONS = 0, int unsigned SEGMENTLEN = 0, - parameter RAM_STYLE = "auto", + parameter RAM_STYLE = "auto", localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, - localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH, + 
localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH, localparam int unsigned SF = MW/SIMD, - localparam int unsigned NF = MH/PE, + localparam int unsigned NF = MH/PE, localparam int unsigned OUTPUT_LANES = PE, localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 ) ( // Global Control - input logic ap_clk, - input logic ap_rst_n, + input logic ap_clk, + input logic ap_rst_n, // Weight Stream - input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - input logic s_axis_weights_tvalid, + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, output logic s_axis_weights_tready, // Input Stream - input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input logic s_axis_input_tvalid, + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, output logic s_axis_input_tready, // Output Stream output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, output logic m_axis_output_tvalid, - input logic m_axis_output_tready + input logic m_axis_output_tready ); //-------------------- Parameter sanity checks --------------------\\ @@ -121,13 +121,13 @@ module mvu_8sx9_axi #( .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) ); - //-------------------- Input control --------------------\\ +//-------------------- Input control --------------------\\ uwire en; uwire istb = avld && s_axis_weights_tvalid; assign ardy = en && s_axis_weights_tvalid; assign s_axis_weights_tready = en && avld; - //-------------------- Core MVU --------------------\\ +//-------------------- Core MVU --------------------\\ uwire ovld; uwire [PE-1:0][57:0] odat; typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; @@ -138,7 +138,7 @@ module mvu_8sx9_axi #( .vld(ovld), .p(odat) ); - //-------------------- Output register slice --------------------\\ 
+//-------------------- Output register slice --------------------\\ struct { logic vld; logic [PE-1:0][ACCU_WIDTH-1:0] dat; @@ -148,7 +148,7 @@ module mvu_8sx9_axi #( uwire b_load; always_ff @(posedge clk) begin - if(rst) A <= '{ vld: 0, default: 'x }; + if(rst) A <= '{ vld: 0, default: 'x }; else if(!A.vld || b_load) begin A.vld <= ovld && en; for(int unsigned i = 0; i < PE; i++) begin @@ -169,7 +169,7 @@ module mvu_8sx9_axi #( always_ff @(posedge clk) begin if(rst) B <= '{ default: 'x }; else begin - if(b_load) B <= '{ vld: A.vld, dat: A.dat}; + if(b_load) B <= '{ vld: A.vld, dat: A.dat}; end end diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv index ea97e0708c..70ffa096ef 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv +++ b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv @@ -33,176 +33,176 @@ module mvu_8sx9_axi_tb(); - //-------------------- Simulation parameters --------------------\\ - // Matrix & parallelism config - localparam int unsigned MW = 600; - localparam int unsigned MH = 256; - localparam int unsigned SIMD = 60; - localparam int unsigned PE = 16; - localparam int unsigned SEGMENTLEN = 4; - // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int unsigned WEIGHT_WIDTH = 4; - localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); - localparam bit SIGNED_ACTIVATIONS = 1; - // Simulation constants - localparam int unsigned NF = MH/PE; - localparam int unsigned SF = MW/SIMD; - localparam int unsigned NUM_OF_DSP = SIMD/3; - localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; - localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; - localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; - localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; - - // Generate clk and reset signal - logic 
clk = 0; - always #5ns clk = !clk; - - logic ap_rst_n = 0; - initial begin - repeat(16) @(posedge clk); - ap_rst_n <= 1; - end - - uwire ap_clk = clk; - - // Generate activations - typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[SF]; - - function activation_vector_t init_ACTIVATIONS; - automatic activation_vector_t res; - std::randomize(res); - return res; - endfunction : init_ACTIVATIONS - - activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); - - struct { - activation_t dat; - logic vld; - logic rdy; - } activations; - - initial begin - activations.vld = 0; - activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain - @(posedge clk iff ap_rst_n); - - for (int i=0; i 1; - @(posedge clk); - end while (!(activations.vld === 1 && activations.rdy === 1)); - end - - activations.vld <= 0; - activations.dat <= 'x; - end - - // Generate weights - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; - - function weight_matrix_t init_WEIGHTS; - automatic weight_matrix_t res; - std::randomize(res); - return res; - endfunction : init_WEIGHTS; - - weight_matrix_t WEIGHTS = init_WEIGHTS(); - - struct { - weight_t dat; - logic vld; - logic rdy; - } weights; - - initial begin - weights.vld = 0; - weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain - @(posedge clk iff ap_rst_n); - - weights.vld <= 1; - for (int i=0; i= 1; - @(posedge clk iff ap_rst_n); - end while (!(outputs.rdy === 1 && outputs.vld === 1)); - - // Compare produced outputs against golden outputs - foreach(outputs.dat[i]) begin - assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin - $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - $stop; - end - end - - NF_CNT += 1; - end - - $finish; - end - - // Instantiate DUT - mvu_8sx9_axi #( - .MW(MW), - .MH(MH), - .PE(PE), - .SIMD(SIMD), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN) - ) - dut ( - .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), - .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), - .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), - .m_axis_output_tready(outputs.rdy) - ); +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MW = 600; + localparam int unsigned MH = 256; + localparam int unsigned SIMD = 60; + localparam int unsigned PE = 16; + localparam int unsigned SEGMENTLEN = 4; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned 
ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n = 0; + initial begin + repeat(16) @(posedge clk); + ap_rst_n <= 1; + end + + uwire ap_clk = clk; + + // Generate activations + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } activations; + + initial begin + activations.vld = 0; + activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain + @(posedge clk iff ap_rst_n); + + for (int i=0; i 1; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i= 1; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == 
$signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_8sx9_axi #( + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); endmodule diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v index ff3779d211..2456eb3a47 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v @@ -33,7 +33,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter MW = $MW$, - parameter MH = $MH$, + parameter MH = $MH$, parameter PE = $PE$, parameter SIMD = $SIMD$, parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, @@ -44,29 +44,29 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter RAM_STYLE = $IBUF_RAM_STYLE$, // Safely deducible parameters - parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, - parameter OUTPUT_LANES = PE, - parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 + parameter WEIGHT_STREAM_WIDTH_BA = 
(PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, + parameter OUTPUT_LANES = PE, + parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 )( - // Global Control - input logic ap_clk, - input logic ap_rst_n, + // Global Control + input logic ap_clk, + input logic ap_rst_n, // Weight Stream - input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - input logic s_axis_weights_tvalid, + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, output logic s_axis_weights_tready, // Input Stream - input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input logic s_axis_input_tvalid, + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, output logic s_axis_input_tready, // Output Stream output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, output logic m_axis_output_tvalid, - input logic m_axis_output_tready + input logic m_axis_output_tready ); mvu_8sx9_axi #( diff --git a/finn-rtllib/mvu/mvu_8sx9_tb.sv b/finn-rtllib/mvu/mvu_8sx9_tb.sv index ea3ecbbd70..adf6a8f9c2 100644 --- a/finn-rtllib/mvu/mvu_8sx9_tb.sv +++ b/finn-rtllib/mvu/mvu_8sx9_tb.sv @@ -33,133 +33,133 @@ module mvu_8sx9_tb(); - //-------------------- Simulation parameters --------------------\\ - // Matrix & parallelism config - localparam int unsigned MH = 256; - localparam int unsigned PE = 16; - localparam int unsigned MW = 600; - localparam int unsigned SIMD = 60; - localparam int unsigned SEGMENTLEN = 4; - // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int unsigned WEIGHT_WIDTH = 4; - localparam bit SIGNED_ACTIVATIONS = 1; - // Simulation constants - localparam int unsigned NF = MH/PE; - localparam int unsigned SF = MW/SIMD; - localparam int unsigned NUM_OF_DSP = SIMD/3; - - typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[SF]; - - function 
activation_vector_t init_ACTIVATIONS; - automatic activation_vector_t res; - std::randomize(res); - return res; - endfunction : init_ACTIVATIONS - - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; - - function weight_matrix_t init_WEIGHTS; - automatic weight_matrix_t res; - std::randomize(res); - return res; - endfunction : init_WEIGHTS; - - typedef logic signed [PE-1:0][57:0] output_t; - typedef output_t output_vector_t [NF]; - - function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); - automatic output_vector_t res = '{default: 0}; - for (int j = 0; j 1) && !rst; - end - - // Compare computed output against golden output when vld flag is raised by DUT - always_ff @(posedge clk iff (vld && en)) begin - foreach(p[i]) begin - assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin - $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - $stop; - end - end - NF_CNT += 1; - end - - // Instantiate DUT - mvu_8sx9 #( - .PE(PE), - .SIMD(SIMD), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .SEGMENTLEN(SEGMENTLEN) - ) - dut ( - .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p - ); - +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MH = 256; + localparam int unsigned PE = 16; + localparam int unsigned MW = 600; + localparam int unsigned SIMD = 60; + localparam int unsigned SEGMENTLEN = 4; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + typedef logic signed [PE-1:0][57:0] output_t; + typedef output_t output_vector_t [NF]; + + function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); + automatic output_vector_t res = '{default: 0}; + for (int j = 0; j 1) && !rst; + end + + // Compare computed output against golden output when vld flag is raised by DUT + always_ff @(posedge clk iff (vld && en)) begin + foreach(p[i]) begin + assert ($signed(p[i]) == 
$signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + NF_CNT += 1; + end + + // Instantiate DUT + mvu_8sx9 #( + .PE(PE), + .SIMD(SIMD), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .SEGMENTLEN(SEGMENTLEN) + ) + dut ( + .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p + ); + endmodule From 5e61f42afd991233153ee8b7fe0fb6e9e8ac562d Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 12 Apr 2023 08:54:45 +0100 Subject: [PATCH 009/112] [rtl custom op]: fix to indentation --- finn-rtllib/mvu/mvu_8sx9_axi.sv | 54 ++++++++++++++++----------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv index 6c7eaeaeca..5f215927d8 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi.sv +++ b/finn-rtllib/mvu/mvu_8sx9_axi.sv @@ -32,25 +32,25 @@ *****************************************************************************/ module mvu_8sx9_axi #( - int unsigned MW, - int unsigned MH, - int unsigned PE, - int unsigned SIMD, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - int unsigned ACCU_WIDTH, - bit SIGNED_ACTIVATIONS = 0, - int unsigned SEGMENTLEN = 0, + int unsigned MW, + int unsigned MH, + int unsigned PE, + int unsigned SIMD, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + int unsigned SEGMENTLEN = 0, parameter RAM_STYLE = "auto", - localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = 
(PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH, - localparam int unsigned SF = MW/SIMD, + localparam int unsigned SF = MW/SIMD, localparam int unsigned NF = MH/PE, - localparam int unsigned OUTPUT_LANES = PE, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 + localparam int unsigned OUTPUT_LANES = PE, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 ) ( // Global Control @@ -76,31 +76,31 @@ module mvu_8sx9_axi #( //-------------------- Parameter sanity checks --------------------\\ initial begin if (MW % SIMD != 0) begin - $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); - $finish; + $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); + $finish; end if (MH % PE != 0) begin - $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); - $finish; + $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); + $finish; end if (ACTIVATION_WIDTH > 9) begin - $error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH); - $finish; + $error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH); + $finish; end if (WEIGHT_WIDTH > 8) begin - $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); - $finish; + $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); + $finish; end if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin - $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH); - $finish; + $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH); + $finish; end if (SEGMENTLEN == 0) begin - $warning("Segment length of %0d defaults to chain 
length", SEGMENTLEN); + $warning("Segment length of %0d defaults to chain length", SEGMENTLEN); end if (SEGMENTLEN > (SIMD+2)/3) begin - $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); - $finish; + $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + $finish; end end From cbee193d746763044a870bdf1af248bbe8d31156 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 12 Apr 2023 14:33:13 +0100 Subject: [PATCH 010/112] [rtl custom-op]: minor changes for compiler integration --- finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v index 2456eb3a47..502a72d3f2 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v @@ -41,7 +41,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter ACCU_WIDTH = $ACCU_WIDTH$, parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, parameter SEGMENTLEN = $SEGMENTLEN$, - parameter RAM_STYLE = $IBUF_RAM_STYLE$, + parameter RAM_STYLE = "$IBUF_RAM_STYLE$", // Safely deducible parameters parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, @@ -85,6 +85,6 @@ mvu_8sx9_axi #( .m_axis_output_tdata(m_axis_output_tdata), .m_axis_output_tvalid(m_axis_output_tvalid), .m_axis_output_tready(m_axis_output_tready) -) +); -endmodule : mvau_8sx9_axi_wrapper \ No newline at end of file +endmodule : $MODULE_NAME_AXI_WRAPPER$ \ No newline at end of file From ba5e77bde008fff2a445d6ef469072dd67f67f42 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 12 Apr 2023 23:26:05 +0100 Subject: [PATCH 011/112] [rtl custom op]: moved testbenches to separate directory --- finn-rtllib/mvu/tb/mvu_8sx9_tb.sv | 165 +++++++++++++++++++++++ finn-rtllib/mvu/tb/mvu_axi_tb.sv | 213 ++++++++++++++++++++++++++++++ 2 files changed, 378 insertions(+) create mode 100644 finn-rtllib/mvu/tb/mvu_8sx9_tb.sv create mode 100644 
finn-rtllib/mvu/tb/mvu_axi_tb.sv diff --git a/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv new file mode 100644 index 0000000000..c8bfe5370a --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv @@ -0,0 +1,165 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU core compute kernel. 
+ *****************************************************************************/ + +module mvu_8sx9_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MH = 256; + localparam int unsigned PE = 16; + localparam int unsigned MW = 600; + localparam int unsigned SIMD = 60; + localparam int unsigned SEGMENTLEN = 4; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + typedef logic signed [PE-1:0][57:0] output_t; + typedef output_t output_vector_t [NF]; + + function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); + automatic output_vector_t res = '{default: 0}; + for (int j = 0; j 1) && !rst; + end + + // Compare computed output against golden output when vld flag is raised by DUT + always_ff @(posedge clk iff (vld && en)) begin + foreach(p[i]) begin + assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + NF_CNT += 1; + end + + // Instantiate DUT + mvu_8sx9 #( + .PE(PE), + .SIMD(SIMD), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .SEGMENTLEN(SEGMENTLEN) + ) + dut ( + .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p + ); + +endmodule : mvu_8sx9_tb diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv new file mode 100644 index 0000000000..08a349da84 --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -0,0 +1,213 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU AXI-lite interface wrapper. + *****************************************************************************/ + +module mvu_axi_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MW = 90; + localparam int unsigned MH = 16; + localparam int unsigned SIMD = 9; + localparam int unsigned PE = 4; + localparam int unsigned SEGMENTLEN = 1; + localparam string MVU_IMPL_STYLE = "mvu_8sx9"; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n = 0; + initial begin + repeat(16) @(posedge 
clk); + ap_rst_n <= 1; + end + + uwire ap_clk = clk; + + // Generate activations + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } activations; + + initial begin + activations.vld = 0; + activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain + @(posedge clk iff ap_rst_n); + + for (int i=0; i 1; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i= 1; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_axi #( + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), + .MVU_IMPL_STYLE(MVU_IMPL_STYLE) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + +endmodule : mvu_axi_tb From 69310b4e6d2ee4bf2e60b236582656fd7f364a6d Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 12 Apr 2023 23:27:50 +0100 Subject: [PATCH 012/112] [rtl custom op]: fixed output width to ACCU_WIDTH --- finn-rtllib/mvu/mvu_8sx9.sv | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index d082d4fb2e..5af27ab0ce 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -36,19 +36,25 @@ module mvu_8sx9 #( int unsigned SIMD, int unsigned ACTIVATION_WIDTH, int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, bit SIGNED_ACTIVATIONS = 0, int unsigned SEGMENTLEN = 0 // Default to 0 (which implies a single segment) ) ( - input logic clk, + // Global Control + input logic clk, input logic rst, input logic en, + + // Input input logic last, - input logic zero, - input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, - input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, - output logic vld, - output logic [PE-1:0][57:0] p + input logic zero, // ignore 
current inputs and force this partial product to zero + input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights + input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // activations + + // Ouput + output logic vld, + output logic [PE-1:0][ACCU_WIDTH-1:0] p ); //-------------------- Declare global signals --------------------\\ @@ -146,7 +152,7 @@ module mvu_8sx9 #( uwire [57:0] pp; if (LAST) begin : genPOUT - assign p[j] = pp; + assign p[j] = pp[ACCU_WIDTH-1:0]; end DSP58 #( @@ -281,4 +287,4 @@ module mvu_8sx9 #( end : genDSPChain end : genDSPPE -endmodule +endmodule : mvu_8sx9 From cfcff0040c85a76d7c5a16b2bf1b6b966b62e87d Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 12 Apr 2023 23:29:06 +0100 Subject: [PATCH 013/112] [rtl custom op]: renamed file and added generic to switch between compute kernels --- finn-rtllib/mvu/mvu_axi.sv | 194 +++++++++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_axi.sv diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv new file mode 100644 index 0000000000..5d8700738f --- /dev/null +++ b/finn-rtllib/mvu/mvu_axi.sv @@ -0,0 +1,194 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) AXI-lite interface wrapper. 
+ *****************************************************************************/ + +module mvu_axi #( + int unsigned MW, + int unsigned MH, + int unsigned PE, + int unsigned SIMD, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + int unsigned SEGMENTLEN = 0, + parameter RAM_STYLE = "auto", + parameter MVU_IMPL_STYLE, + + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, + localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH, + localparam int unsigned SF = MW/SIMD, + localparam int unsigned NF = MH/PE, + localparam int unsigned OUTPUT_LANES = PE, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 +) +( + // Global Control + input logic ap_clk, + input logic ap_rst_n, + + // Weight Stream + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, + output logic s_axis_weights_tready, + + // Input Stream + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, + output logic s_axis_input_tready, + + // Output Stream + output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, + output logic m_axis_output_tvalid, + input logic m_axis_output_tready +); + +//-------------------- Parameter sanity checks --------------------\\ + initial begin + if (MW % SIMD != 0) begin + $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); + $finish; + end + if (MH % PE != 0) begin + $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); + $finish; + end + if (ACTIVATION_WIDTH > 9) begin + $error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH); + $finish; + end + if (WEIGHT_WIDTH > 8) begin + $error("Weight width of %0d-bits exceeds maximum 
of 8-bits", WEIGHT_WIDTH); + $finish; + end + if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin + $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH); + $finish; + end + if (SEGMENTLEN == 0) begin + $warning("Segment length of %0d defaults to chain length", SEGMENTLEN); + end + if (SEGMENTLEN > (SIMD+2)/3) begin + $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + $finish; + end + end + + uwire clk = ap_clk; + uwire rst = !ap_rst_n; + + typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t; + + uwire mvauin_t amvau; + uwire alast; + uwire afin; + uwire avld; + uwire ardy; + + replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay ( + .clk, .rst, + .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), + .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) + ); + +//-------------------- Input control --------------------\\ + uwire en; + uwire istb = avld && s_axis_weights_tvalid; + assign ardy = en && s_axis_weights_tvalid; + assign s_axis_weights_tready = en && avld; + +//-------------------- Core MVU --------------------\\ + uwire ovld; + uwire [PE-1:0][ACCU_WIDTH-1:0] odat; + typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; + + if (MVU_IMPL_STYLE == "mvu_8sx9") begin : genMVU8sx9 + mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core ( + .clk, .rst, .en, + .last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .vld(ovld), .p(odat) + ); + end + else if (MVU_IMPL_STYLE == "mvu_4sx4u") begin : genMVU4sx4u + mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(0)) core ( + .clk, .rst, .en, + .last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + 
.vld(ovld), .p(odat) + ); + end + //else begin + // $error("Unrecognized MVU_IMPL_STYLE!"); + // $finish; + //end + +//-------------------- Output register slice --------------------\\ + struct { + logic vld; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } A = '{ vld: 0, default: 'x}; + + assign en = !A.vld || !ovld; + + uwire b_load; + always_ff @(posedge clk) begin + if(rst) A <= '{ vld: 0, default: 'x }; + else if(!A.vld || b_load) begin + A.vld <= ovld && en; + for(int unsigned i = 0; i < PE; i++) begin + // CR-1148862: + // A.dat[i] <= odat[i]; + automatic logic [ACCU_WIDTH-1:0] v = odat[i]; + A.dat[i] <= v[ACCU_WIDTH-1:0]; + end + end + end + + struct { + logic vld; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } B = '{ vld: 0, default: 'x}; + + assign b_load = !B.vld || m_axis_output_tready; + always_ff @(posedge clk) begin + if(rst) B <= '{ default: 'x }; + else begin + if(b_load) B <= '{ vld: A.vld, dat: A.dat}; + end + end + + assign m_axis_output_tvalid = B.vld; + assign m_axis_output_tdata = B.dat; + +endmodule : mvu_axi \ No newline at end of file From 72b519691369b9ebc31983a6723485860837e37b Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 12 Apr 2023 23:29:45 +0100 Subject: [PATCH 014/112] [rtl custom op]: renamed file and added generic to switch between compute kernels --- finn-rtllib/mvu/mvu_axi_wrapper.v | 90 +++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_axi_wrapper.v diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v new file mode 100644 index 0000000000..323d2711e4 --- /dev/null +++ b/finn-rtllib/mvu/mvu_axi_wrapper.v @@ -0,0 +1,90 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Verilog AXI-lite wrapper for MVU. 
+ *****************************************************************************/ + +module $MODULE_NAME_AXI_WRAPPER$ #( + parameter MW = $MW$, + parameter MH = $MH$, + parameter PE = $PE$, + parameter SIMD = $SIMD$, + parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, + parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, + parameter ACCU_WIDTH = $ACCU_WIDTH$, + parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, + parameter SEGMENTLEN = $SEGMENTLEN$, + parameter RAM_STYLE = "$IBUF_RAM_STYLE$", + + // Safely deducible parameters + parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, + parameter OUTPUT_LANES = PE, + parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 +)( + // Global Control + input logic ap_clk, + input logic ap_rst_n, + + // Weight Stream + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, + output logic s_axis_weights_tready, + + // Input Stream + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, + output logic s_axis_input_tready, + + // Output Stream + output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, + output logic m_axis_output_tvalid, + input logic m_axis_output_tready +); + +mvu_axi #( + .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE) + ) inst ( + .ap_clk(ap_clk), + .ap_rst_n(ap_rst_n), + .s_axis_weights_tdata(s_axis_weights_tdata), + .s_axis_weights_tvalid(s_axis_weights_tvalid), + .s_axis_weights_tready(s_axis_weights_tready), + .s_axis_input_tdata(s_axis_input_tdata), + .s_axis_input_tvalid(s_axis_input_tvalid), + .s_axis_input_tready(s_axis_input_tready), + .m_axis_output_tdata(m_axis_output_tdata), + .m_axis_output_tvalid(m_axis_output_tvalid), + 
.m_axis_output_tready(m_axis_output_tready) +); + +endmodule : $MODULE_NAME_AXI_WRAPPER$ \ No newline at end of file From c068bb65c6a4b877876c5b1278e7b2663b81d8e1 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:15:16 +0100 Subject: [PATCH 015/112] [rtl mvu]: added behavioral model DSP58 --- finn-rtllib/mvu/mvu_8sx9.sv | 343 ++++++++++++++++++++++-------------- 1 file changed, 212 insertions(+), 131 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index 5af27ab0ce..2d1da26efb 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -38,7 +38,8 @@ module mvu_8sx9 #( int unsigned WEIGHT_WIDTH, int unsigned ACCU_WIDTH, bit SIGNED_ACTIVATIONS = 0, - int unsigned SEGMENTLEN = 0 // Default to 0 (which implies a single segment) + int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) + bit FORCE_BEHAVIORAL = 0 ) ( // Global Control @@ -70,7 +71,10 @@ module mvu_8sx9 #( always_ff @(posedge clk) begin if(rst) L <= '{default: 0}; - else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last }; + else if(en) begin + L[1+MAX_PIPELINE_STAGES] <= last; + L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES]; + end end assign vld = L[0]; @@ -155,135 +159,212 @@ module mvu_8sx9 #( assign p[j] = pp[ACCU_WIDTH-1:0]; end - DSP58 #( - // Feature Control Attributes: Data Path Selection - .AMULTSEL("A"), // Selects A input to multiplier (A, AD) - .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) - .BMULTSEL("B"), // Selects B input to multiplier (AD, B) - .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) - .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for - // legacy mode. 
- .PREADDINSEL("A"), // Selects input to pre-adder (A, B) - .RND(58'h000000000000000), // Rounding Constant - .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) - .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) - .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) - .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) - // Pattern Detector Attributes: Pattern Detection Configuration - .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH - .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). - .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) - .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect - .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 - .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) - .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) - // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins - .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE - .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN - .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK - .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE - .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE - .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 - FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 
0 : PCIN - 2'b01, // Y : M - 2'b01 // X: M - }), // Optional inversion for OPMODE - .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN - .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE - .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA - .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB - .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A - .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC - .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD - .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE - .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM - .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP - // Register Control Attributes: Pipeline Register Configuration - .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) - .ADREG(0), // Pipeline stages for pre-adder (0-1) - .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) - .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) - .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between B/BCIN and BCOUT (0-2) - .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) - .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) - .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) - .CREG(0), // Pipeline stages for C (0-1) - .DREG(0), // Pipeline stages for D (0-1) - .INMODEREG(1), // Pipeline stages for INMODE (0-1) - .MREG(1), // Multiplier pipeline stages (0-1) - .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) - .PREG(PREG), // Number of pipeline stages for P (0-1) - .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). 
- ) - DSP58_inst ( - // Cascade outputs: Cascade Ports - .ACOUT(), // 34-bit output: A port cascade - .BCOUT(), // 24-bit output: B cascade - .CARRYCASCOUT(), // 1-bit output: Cascade carry - .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade - .PCOUT(pcout[j][i]), // 58-bit output: Cascade output - // Control outputs: Control Inputs/Status Bits - .OVERFLOW(), // 1-bit output: Overflow in add/acc - .PATTERNBDETECT(), // 1-bit output: Pattern bar detect - .PATTERNDETECT(), // 1-bit output: Pattern detect - .UNDERFLOW(), // 1-bit output: Underflow in add/acc - // Data outputs: Data Ports - .CARRYOUT(), // 4-bit output: Carry - .P(pp), // 58-bit output: Primary data - .XOROUT(), // 8-bit output: XOR data - // Cascade inputs: Cascade Ports - .ACIN('x), // 34-bit input: A cascade data - .BCIN('x), // 24-bit input: B cascade - .CARRYCASCIN('x), // 1-bit input: Cascade carry - .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade - .PCIN(FIRST ? 'x : pcout[j][i-1]), // 58-bit input: P cascade - // Control inputs: Control Inputs/Status Bits - .ALUMODE(4'h0), // 4-bit input: ALU control - .CARRYINSEL('0), // 3-bit input: Carry select - .CLK(clk), // 1-bit input: Clock - .INMODE({ - INTERNAL_PREGS==2 ? 1'b0 : 1'b1, - 2'b00, - TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, - INTERNAL_PREGS==2 ? 1'b0 : 1'b1 - }), // 5-bit input: INMODE control - .NEGATE('0), // 3-bit input: Negates the input of the multiplier - .OPMODE({ - LAST ? {1'b0, L[1]} : 2'b00, - 7'b000_0000 - }), // 9-bit input: Operation mode - // Data inputs: Data Ports - .A({ 7'bx, a_in_i[i] }), // 34-bit input: A data - .B(b_in_i[j][i]), // 24-bit input: B data - .C('x), // 58-bit input: C data - .CARRYIN('0), // 1-bit input: Carry-in - .D('x), // 27-bit input: D data - // Reset/Clock Enable inputs: Reset/Clock Enable Inputs - .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. - .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG - .CEA2(INTERNAL_PREGS==2 ? 
en : '0), // 1-bit input: Clock enable for 2nd stage AREG - .CEAD('0), // 1-bit input: Clock enable for ADREG - .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE - .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG - .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG - .CEC('0), // 1-bit input: Clock enable for CREG - .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG - .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG - .CED('0), // 1-bit input: Clock enable for DREG - .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG - .CEM(en), // 1-bit input: Clock enable for MREG - .CEP(PREG && en), // 1-bit input: Clock enable for PREG - .RSTA(rst), // 1-bit input: Reset for AREG - .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG - .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG - .RSTB(rst), // 1-bit input: Reset for BREG - .RSTC('0), // 1-bit input: Reset for CREG - .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG - .RSTD('0), // 1-bit input: Reset for DREG and ADREG - .RSTINMODE(rst), // 1-bit input: Reset for INMODE register - .RSTM(rst), // 1-bit input: Reset for MREG - .RSTP(PREG && rst) // 1-bit input: Reset for PREG - ); + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. 
+ if (FORCE_BEHAVIORAL) begin : genBehav + // Stage #1: Input A/B + logic signed [33:0] Areg [INTERNAL_PREGS]; + always_ff @(posedge clk) begin + if (rst) Areg <= '{ default : 0}; + else if (en) begin + Areg[0] <= { 7'bx, a_in_i[i] }; + if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0]; + end + end + logic signed [23:0] Breg [INTERNAL_PREGS]; + always_ff @(posedge clk) begin + if (rst) Breg <= '{ default : 0}; + else if (en) begin + Breg[0] <= b_in_i[j][i]; + if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0]; + end + end + + // Stage #2: Multiply-Accumulate + logic signed [57:0] Mreg; + logic InmodeZero = 0; + always_ff @(posedge clk) begin + if (rst) InmodeZero <= 0; + else if (en) InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero ); + end + always_ff @(posedge clk) begin + if (rst) Mreg <= 0; + else if (en) begin + automatic logic signed [57:0] m = 0; + for (int k = 0; k < 3; k++) begin + m = m + (InmodeZero ? 0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8])); + end + Mreg <= m; + end + end + + // Stage #3: Accumulate + logic signed [57:0] Preg; + logic Opmode = 0; + if (FIRST && !LAST) begin : genFirst + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg; + end + end + else assign Preg = Mreg; + end + else if (LAST) begin : genLast + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 
0 : Preg) + Mreg + pcout[j][i-1]; + end + end + else begin : genMid + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg + pcout[j][i-1]; + end + end + else assign Preg = Mreg + pcout[j][i-1]; + end + assign pp = Preg; + assign pcout[j][i] = pp; + end : genBehav + + else begin: genDSP + DSP58 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("A"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for + // legacy mode. + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND(58'h000000000000000), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
+ .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK + .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE + .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE + .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 + FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN + 2'b01, // Y : M + 2'b01 // X: M + }), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA + .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC + .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM + .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(0), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) + .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between 
B/BCIN and BCOUT (0-2) + .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(1), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(PREG), // Number of pipeline stages for P (0-1) + .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). + ) + DSP58_inst ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(pcout[j][i]), // 58-bit output: Cascade output + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN(FIRST ? 'x : pcout[j][i-1]), // 58-bit input: P cascade + // Control inputs: Control Inputs/Status Bits + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .CLK(clk), // 1-bit input: Clock + .INMODE({ + INTERNAL_PREGS==2 ? 1'b0 : 1'b1, + 2'b00, + TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, + INTERNAL_PREGS==2 ? 
1'b0 : 1'b1 + }), // 5-bit input: INMODE control + .NEGATE('0), // 3-bit input: Negates the input of the multiplier + .OPMODE({ + LAST ? {1'b0, L[1]} : 2'b00, + 7'b000_0000 + }), // 9-bit input: Operation mode + // Data inputs: Data Ports + .A({ 7'bx, a_in_i[i] }), // 34-bit input: A data + .B(b_in_i[j][i]), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D('x), // 27-bit input: D data + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. + .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG + .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD('0), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(PREG && en), // 1-bit input: Clock enable for PREG + .RSTA(rst), // 1-bit input: Reset for AREG + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTB(rst), // 1-bit input: Reset for BREG + .RSTC('0), // 1-bit input: Reset for CREG + .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTD('0), // 1-bit input: Reset for DREG and ADREG + .RSTINMODE(rst), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(PREG && rst) // 1-bit input: Reset for PREG + ); + end : genDSP end : genDSPChain end : genDSPPE From 
18f94e7ab03a3034083680faa91a80359858589e Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:18:58 +0100 Subject: [PATCH 016/112] [rtl mvu]: extended flow control wrapper with additional compute core and other minor changes --- finn-rtllib/mvu/mvu_axi.sv | 51 +++++++++++++++++++------------ finn-rtllib/mvu/mvu_axi_wrapper.v | 48 ++++++++++++++--------------- 2 files changed, 54 insertions(+), 45 deletions(-) diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv index 5d8700738f..e4a919ba88 100644 --- a/finn-rtllib/mvu/mvu_axi.sv +++ b/finn-rtllib/mvu/mvu_axi.sv @@ -41,8 +41,8 @@ module mvu_axi #( int unsigned ACCU_WIDTH, bit SIGNED_ACTIVATIONS = 0, int unsigned SEGMENTLEN = 0, - parameter RAM_STYLE = "auto", - parameter MVU_IMPL_STYLE, + bit FORCE_BEHAVIORAL = 0, + string MVU_IMPL_STYLE, localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, @@ -96,12 +96,14 @@ module mvu_axi #( $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH); $finish; end - if (SEGMENTLEN == 0) begin - $warning("Segment length of %0d defaults to chain length", SEGMENTLEN); - end - if (SEGMENTLEN > (SIMD+2)/3) begin - $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); - $finish; + if (MVU_IMPL_STYLE == "mvu_8sx9") begin + if (SEGMENTLEN == 0) begin + $warning("Segment length of %0d defaults to chain length", SEGMENTLEN); + end + if (SEGMENTLEN > (SIMD+2)/3) begin + $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + $finish; + end end end @@ -116,7 +118,7 @@ module mvu_axi #( uwire avld; uwire ardy; - replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay ( + replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay ( .clk, .rst, .ivld(s_axis_input_tvalid), 
.irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) @@ -133,28 +135,37 @@ module mvu_axi #( uwire [PE-1:0][ACCU_WIDTH-1:0] odat; typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - if (MVU_IMPL_STYLE == "mvu_8sx9") begin : genMVU8sx9 + if (MVU_IMPL_STYLE == "mvu_8sx9_dsp58") begin : genMVU8sx9 mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core ( + .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), .vld(ovld), .p(odat) ); end else if (MVU_IMPL_STYLE == "mvu_4sx4u") begin : genMVU4sx4u - mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(0)) core ( + mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), .vld(ovld), .p(odat) ); end - //else begin - // $error("Unrecognized MVU_IMPL_STYLE!"); - // $finish; - //end + else if (MVU_IMPL_STYLE == "mvu_8sx8u_dsp48") begin : genMVU8sx8u + mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk, .rst, .en, + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .vld(ovld), .p(odat) + ); + end + else initial begin + $error("Unrecognized MVU_IMPL_STYLE!"); + $finish; + end 
//-------------------- Output register slice --------------------\\ - struct { + struct packed { logic vld; logic [PE-1:0][ACCU_WIDTH-1:0] dat; } A = '{ vld: 0, default: 'x}; @@ -175,7 +186,7 @@ module mvu_axi #( end end - struct { + struct packed { logic vld; logic [PE-1:0][ACCU_WIDTH-1:0] dat; } B = '{ vld: 0, default: 'x}; diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v index 323d2711e4..b79ba6bbd1 100644 --- a/finn-rtllib/mvu/mvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_axi_wrapper.v @@ -41,7 +41,8 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter ACCU_WIDTH = $ACCU_WIDTH$, parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, parameter SEGMENTLEN = $SEGMENTLEN$, - parameter RAM_STYLE = "$IBUF_RAM_STYLE$", + parameter MVU_IMPL_STYLE = "$MVU_IMPL_STYLE$", + parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, // Safely deducible parameters parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, @@ -50,41 +51,38 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 )( // Global Control - input logic ap_clk, - input logic ap_rst_n, - + input ap_clk, + input ap_rst_n, // Weight Stream - input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - input logic s_axis_weights_tvalid, - output logic s_axis_weights_tready, - + input [WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, + input weights_V_TVALID, + output weights_V_TREADY, // Input Stream - input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input logic s_axis_input_tvalid, - output logic s_axis_input_tready, - + input [INPUT_STREAM_WIDTH_BA-1:0] in0_V_TDATA, + input in0_V_TVALID, + output in0_V_TREADY, // Output Stream - output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, - output logic m_axis_output_tvalid, - input logic m_axis_output_tready + output [OUTPUT_STREAM_WIDTH_BA-1:0] out_V_TDATA, + output out_V_TVALID, + input out_V_TREADY ); mvu_axi #( .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), 
.ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE) + .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), .MVU_IMPL_STYLE(MVU_IMPL_STYLE) ) inst ( .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), - .s_axis_weights_tdata(s_axis_weights_tdata), - .s_axis_weights_tvalid(s_axis_weights_tvalid), - .s_axis_weights_tready(s_axis_weights_tready), - .s_axis_input_tdata(s_axis_input_tdata), - .s_axis_input_tvalid(s_axis_input_tvalid), - .s_axis_input_tready(s_axis_input_tready), - .m_axis_output_tdata(m_axis_output_tdata), - .m_axis_output_tvalid(m_axis_output_tvalid), - .m_axis_output_tready(m_axis_output_tready) + .s_axis_weights_tdata(weights_V_TDATA), + .s_axis_weights_tvalid(weights_V_TVALID), + .s_axis_weights_tready(weights_V_TREADY), + .s_axis_input_tdata(in0_V_TDATA), + .s_axis_input_tvalid(in0_V_TVALID), + .s_axis_input_tready(in0_V_TREADY), + .m_axis_output_tdata(out_V_TDATA), + .m_axis_output_tvalid(out_V_TVALID), + .m_axis_output_tready(out_V_TREADY) ); endmodule : $MODULE_NAME_AXI_WRAPPER$ \ No newline at end of file From 6d4a0a764e0e6ded16d7034e0d69f5408c76ca75 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:22:51 +0100 Subject: [PATCH 017/112] [rtl mvu]: fix to done_len flag when SIMD dimension fully unrolled and PyVerilator-related syntax change --- finn-rtllib/mvu/replay_buffer.sv | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv index 685ac03137..89bbbdb88f 100644 --- a/finn-rtllib/mvu/replay_buffer.sv +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -35,8 +35,7 @@ module replay_buffer #( int unsigned LEN, // Sequence length int unsigned REP, // Sequence replay count - int unsigned W, // Data width - parameter RAM_STYLE = "auto" // ram style for buffer {block, distributed, ultra, auto} + int unsigned W // Data width )( 
input logic clk, input logic rst, @@ -54,7 +53,7 @@ module replay_buffer #( typedef logic [$clog2(REP)+$clog2(LEN)-1:0] count_t; count_t Count = 0; - uwire done_len = ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0; + uwire done_len = LEN == 1 ? 1 : ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0; uwire done_rep; uwire done_all = done_len && done_rep; @@ -83,7 +82,6 @@ module replay_buffer #( end assign first_rep = FirstRep; - (* RAM_STYLE = RAM_STYLE *) data_t Buf[LEN]; if(LEN == 1) begin : genTrivial always_ff @(posedge clk) begin @@ -92,7 +90,10 @@ module replay_buffer #( end : genTrivial else begin : genShift always_ff @(posedge clk) begin - if(shift) Buf <= { odat, Buf[0:LEN-2] }; + if(shift) begin + Buf[0] <= odat; + Buf[1:LEN-1] <= Buf[0:LEN-2]; + end end end : genShift From 90c547d54756aed2aa101862fb6f55c05149173c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:23:22 +0100 Subject: [PATCH 018/112] [rtl mvu tb]: updated testbench --- finn-rtllib/mvu/tb/mvu_axi_tb.sv | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv index 08a349da84..ef5fa7d682 100644 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -35,17 +35,18 @@ module mvu_axi_tb(); //-------------------- Simulation parameters --------------------\\ // Matrix & parallelism config - localparam int unsigned MW = 90; - localparam int unsigned MH = 16; - localparam int unsigned SIMD = 9; - localparam int unsigned PE = 4; - localparam int unsigned SEGMENTLEN = 1; - localparam string MVU_IMPL_STYLE = "mvu_8sx9"; + localparam int unsigned MW = 50; + localparam int unsigned MH = 8; + localparam int unsigned SIMD = 10; + localparam int unsigned PE = 2; + localparam int unsigned SEGMENTLEN = 2; + localparam string MVU_IMPL_STYLE = "mvu_8sx8u_dsp48"; + localparam bit FORCE_BEHAVIORAL = 1; // Bit-width config localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int 
unsigned WEIGHT_WIDTH = 4; + localparam int unsigned WEIGHT_WIDTH = 8; localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); - localparam bit SIGNED_ACTIVATIONS = 1; + localparam bit SIGNED_ACTIVATIONS = 0; // Simulation constants localparam int unsigned NF = MH/PE; localparam int unsigned SF = MW/SIMD; @@ -94,7 +95,7 @@ module mvu_axi_tb(); for (int i=0; i 1; + activations.vld = $urandom()%7 >= 1; @(posedge clk); end while (!(activations.vld === 1 && activations.rdy === 1)); end @@ -201,6 +202,7 @@ module mvu_axi_tb(); .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), .MVU_IMPL_STYLE(MVU_IMPL_STYLE) ) dut ( From 0c37f1f7bed1143833649accceb59bd6821bed3c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:25:10 +0100 Subject: [PATCH 019/112] [builder]: added specialize_to_rtl step and changed standalone threshold layers to be by default true --- src/finn/builder/build_dataflow_config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 4c3e4ff899..24940489df 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -121,6 +121,7 @@ class VerificationStepType(str, Enum): "step_apply_folding_config", "step_minimize_bit_width", "step_generate_estimate_reports", + "step_specialize_to_rtl", "step_hls_codegen", "step_hls_ipgen", "step_set_fifo_depths", @@ -233,7 +234,7 @@ class DataflowBuildConfig: #: activations in FINN) will be implemented as stand-alone HLS layers, #: instead of being part of MatrixVectorActivation layer. This gives larger #: flexibility, and makes it possible to have runtime-writable thresholds. 
- standalone_thresholds: Optional[bool] = False + standalone_thresholds: Optional[bool] = True #: (Optional) Whether optimizations that minimize the bit width of the #: weights and accumulator will be applied. Because this optimization relies From 5ccb016a640dbed6818a9f1f3ef46136ce949c0d Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:26:03 +0100 Subject: [PATCH 020/112] [builder]: added specialize_to_rtl step --- src/finn/builder/build_dataflow_steps.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index e43a29d632..3e4d047a51 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -123,6 +123,7 @@ ) from finn.util.pyverilator import verilator_fifosim from finn.util.test import execute_parent +import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl def verify_step( @@ -483,6 +484,16 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig return model +def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig): + """Convert layers implemented in HLS to an equivalent specialized RTL implementation if possible.""" + specialize_to_rtl_transforms = [ + to_rtl.InferRTLMatrixVectorActivation() + ] + for trn in specialize_to_rtl_transforms: + model = model.transform(trn) + return model + + def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig): """Tighten the weight and accumulator bit widths for each layer.""" if cfg.minimize_bit_width: @@ -855,6 +866,7 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig): "step_apply_folding_config": step_apply_folding_config, "step_minimize_bit_width": step_minimize_bit_width, "step_generate_estimate_reports": step_generate_estimate_reports, + "step_specialize_to_rtl": step_specialize_to_rtl, "step_hls_codegen": step_hls_codegen, "step_hls_ipgen": step_hls_ipgen, 
"step_set_fifo_depths": step_set_fifo_depths, From f099f4bbfd01b628a89c6099f637a4a85a8158ca Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:26:44 +0100 Subject: [PATCH 021/112] [custom op]: added custom op MatrixVectorActivation_rtl --- src/finn/custom_op/fpgadataflow/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 56d4230a3a..19c0ddd999 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -49,6 +49,7 @@ from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch from finn.custom_op.fpgadataflow.lookup import Lookup from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation +from finn.custom_op.fpgadataflow.matrixvectoractivation_rtl import MatrixVectorActivation_rtl from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch from finn.custom_op.fpgadataflow.streamingdataflowpartition import ( StreamingDataflowPartition, @@ -70,6 +71,7 @@ custom_op["DownSampler"] = DownSampler custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch custom_op["MatrixVectorActivation"] = MatrixVectorActivation +custom_op["MatrixVectorActivation_rtl"] = MatrixVectorActivation_rtl custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl From 9a3b0fdc54f8c7c1b541c8cfdaaf6e96315da092 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:28:34 +0100 Subject: [PATCH 022/112] [custom op]: added additional attribute to enable conversion to RTL (custom-op) --- src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py 
b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index aa987384dd..e54abb0c3f 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -70,7 +70,7 @@ def get_nodeattr_types(self): "SIMD": ("i", True, 0), "MW": ("i", True, 0), "MH": ("i", True, 0), - "resType": ("s", False, "lut", {"auto", "lut", "dsp"}), + "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}), "ActVal": ("i", False, 0), # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), @@ -125,6 +125,8 @@ def get_nodeattr_types(self): # vector through the accelerator. This will get rid of any old # weight data from the weight FIFOs. "runtime_writeable_weights": ("i", False, 0, {0, 1}), + # Flag to specify whether RTL-based or HLS-based implementation is preferred + "impl": ("s", False, "rtl", {"hls", "rtl"}) } my_attrs.update(super().get_nodeattr_types()) return my_attrs From 38aa930baa1296a7099f9df22e3d0d000c8d5a05 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:30:15 +0100 Subject: [PATCH 023/112] [custom op]: modified ip-stitching and code generation --- .../matrixvectoractivation_rtl.py | 231 ++++++++++-------- 1 file changed, 127 insertions(+), 104 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index c8a0aa675b..6b1c2f3be7 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import math +from shutil import copy import numpy as np import os import textwrap @@ -45,6 +46,12 @@ pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None from . import templates @@ -60,8 +67,8 @@ class MatrixVectorActivation_rtl(HLSCustomOp): """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch function.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) self.decoupled_wrapper = templates.decoupled_wrapper def get_nodeattr_types(self): @@ -78,11 +85,6 @@ def get_nodeattr_types(self): "outputDataType": ("s", True, ""), # FINN DataType for accumulator -- auto-computed and updated "accDataType": ("s", False, "INT32"), - # use xnor-popcount for binary weights/inputs, thus treating them - # as bipolar - "binaryXnorMode": ("i", False, 0, {0, 1}), - # no-activation mode (produce accumulators) - "noActivation": ("i", False, 0, {0, 1}), # number of input vectors, examples: # [1] is a single vector (like a FC layer with batch=1) # [4] is four vectors (like a FC layer with batch=4) @@ -105,16 +107,6 @@ def get_nodeattr_types(self): "auto", {"auto", "block", "distributed", "ultra"}, ), - # FPGA resource type for threshold memories (if noActivation is False) - # auto -- let Vivado decide - # block -- use BRAM - # distributed -- use LUTRAM - "ram_style_thresholds": ( - "s", - False, - "auto", - {"auto", "block", "distributed"}, - ), # (mem_mode = decoupled only) whether weights will be writable through # an AXI-lite interface during runtime # 1 for enabled, 0 for disabled. @@ -125,6 +117,8 @@ def get_nodeattr_types(self): # vector through the accelerator. This will get rid of any old # weight data from the weight FIFOs. 
"runtime_writeable_weights": ("i", False, 0, {0, 1}), + # attribute to save top module name - not user configurable + "gen_top_module": ("s", False, ""), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -142,7 +136,6 @@ def calc_wmem(self): def calc_tmem(self): """Calculates and returns TMEM.""" - assert self.get_nodeattr("noActivation")==1, "RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer" return 0 def make_shape_compatible_op(self, model): @@ -192,27 +185,9 @@ def verify_node(self): """The required MatrixVectorActivation attributes do not exist.""" ) - # verify the number of inputs depending on noActivation value - # check noActivation value to determine the number of inputs - no_act = self.get_nodeattr("noActivation") - - if no_act == 1: - if len(self.onnx_node.input) == 2: - info_messages.append("The number of inputs is correct") - else: - info_messages.append( - """RTL-based MatrixVectorActivation needs in no - activation mode 2 inputs (data input and weights)""" - ) - elif no_act == 0: - info_messages.append("RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer") - else: - info_messages.append( - """noActivation attribute contains {} should - be 1 for RTL-based MatrixVectorActivation""".format( - no_act - ) - ) + num_of_inputs = len(self.onnx_node.input) + if num_of_inputs!=2: + info_messages.append("RTL-based MatrixVectorActivation expects two inputs (weights and activation), but got {} inputs.".format(len(self.onnx_node.input))) mem_mode = self.get_nodeattr("mem_mode") @@ -221,6 +196,7 @@ def verify_node(self): return info_messages +# TODO: Add in replay_buffer estimation def uram_estimation(self): P = self.get_nodeattr("PE") Q = self.get_nodeattr("SIMD") @@ -242,6 +218,7 @@ def uram_estimation(self): depth_multiplier = math.ceil(omega / 4096) return width_multiplier * depth_multiplier +# TODO: Add in replay_buffer estimation 
def bram_estimation(self): """Calculates resource estimation for BRAM based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -268,7 +245,7 @@ def bram_estimation(self): ): return 0 # assuming SDP mode RAMB18s (see UG573 Table 1-10) - # assuming decoupled (RTL) memory, which is more efficient than const (HLS) + # assuming decoupled (RTL) memory if mem_width == 1: return math.ceil(omega / 16384) elif mem_width == 2: @@ -282,6 +259,7 @@ def bram_estimation(self): else: return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36)) +# TODO: Add in replay_buffer estimation def bram_efficiency_estimation(self): wdt = self.get_weight_datatype() W = wdt.bitwidth() @@ -294,6 +272,7 @@ def bram_efficiency_estimation(self): bram16_est_capacity = bram16_est * 36 * 512 return wbits / bram16_est_capacity +# TODO: Add in replay_buffer estimation def uram_efficiency_estimation(self): """Function for URAM efficiency estimation: actual parameter storage needed divided by the allocated URAM storage (from estimation)""" @@ -308,7 +287,7 @@ def uram_efficiency_estimation(self): uram_est_capacity = uram_est * 72 * 4096 return wbits / uram_est_capacity -#TODO: FIX +#TODO: FIX: worst case estimates since segmentlen is not known at this point? 
def lut_estimation(self): """Calculates resource estimations for LUTs based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -348,23 +327,14 @@ def lut_estimation(self): # accumulator acc_bits = W + A + np.ceil(math.log(MW, 2)) acc_luts = acc_bits - # thresholds and threshold comparators - thr_luts = 0 - comp_luts = 0 - noact = self.get_nodeattr("noActivation") - if noact == 0: - odt = self.get_output_datatype() - B = odt.bitwidth() - thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64) - comp_luts = (2**B - 1) * acc_bits return int( c0 - + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2 ) -#TODO: FIX +#TODO: FIX: worst case estimates since segmentlen is not known at this point? def dsp_estimation(self): # multiplication P = self.get_nodeattr("PE") @@ -380,7 +350,7 @@ def dsp_estimation(self): mult_dsp = 0 return int(mult_dsp) -#TODO: FIX +#TODO: FIX: worst case estimates since segmentlen is not known at this point def get_exp_cycles(self): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") @@ -389,6 +359,7 @@ def get_exp_cycles(self): mw = self.get_nodeattr("MW") # since mmv != 1 is not supported yet, we set mmv for now to 1 mmv = 1 + # Actual exp_cycles is probably slightly larger (say 3 cycles (DSP A/B, M, P - reg) + additional pipeline buffer cycles. 
Most probably <10) exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv return int(exp_cycles) @@ -413,7 +384,7 @@ def get_output_datatype(self, ind=0): def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() - assert i_bits<=9, "RTL-based MVAU only supports activations with bit-width up to 9-bits" + assert (i_bits<=9), "RTL-based MVAU only supports activations with bit-width up to 9-bits" in_width = i_bits * self.get_nodeattr("SIMD") return in_width @@ -431,8 +402,8 @@ def get_weightstream_width(self): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") wp = self.get_weight_datatype().bitwidth() + assert (wp <= 8), "RTL-based MVAU only supports weights with bit-width up to 8-bits" w_width = pe * simd * wp - assert wp<=8, "RTL-based MVAU only supports weights with bit-width up to 8-bits" return w_width else: return 0 @@ -544,10 +515,8 @@ def minimize_accumulator_width(self, model): adt = DataType.get_smallest_possible(-acc_max - 1) else: adt = DataType.get_smallest_possible(acc_max) - # ensure a datatype divisible by 8-bits in case this is the last node - bw = roundup_to_integer_multiple(adt.bitwidth(), 8) - new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw)) - adt = DataType[new_adt_name] + # Note: we are interested in simply the width of the output dot product. 
+ # Padding the actual output stream to a multiple of 8-bits is done in the RTL component self.set_nodeattr("accDataType", adt.name) # for no-activation nodes, output dt = acc dt self.set_nodeattr("outputDataType", adt.name) @@ -588,7 +557,10 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): 1, -1, pe * simd ) weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy() - if weight_file_mode == "decoupled_verilog_dat": + if weight_file_mode == "decoupled_npy": + # save weight stream into npy for cppsim + np.save(weight_file_name, weight_tensor_simd_flipped) + elif weight_file_mode == "decoupled_verilog_dat": # convert weight values into hexstring weight_width = self.get_weightstream_width() # pad to nearest 4 bits to get hex strings @@ -638,7 +610,7 @@ def generate_params(self, model, path): weight_filename_sim = "{}/weights.npy".format(code_gen_dir) # save decoupled weights for cppsim self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) - # also save weights as Verilog .dat file + # Also save weights as Verilog .dat file # note that we provide two different .dat files, one for synth # and one for synthesis. 
this is because URAM-based weights always # need zero weights for synthesis, otherwise they get inferred @@ -693,7 +665,6 @@ def execute_node(self, context, graph): for inputs in node.input: # it is assumed that the first input of the node is the data input # the second input are the weights - # the third input are the thresholds if in_ind == 0: assert ( str(context[inputs].dtype) == "float32" @@ -709,7 +680,7 @@ def execute_node(self, context, graph): reshaped_input, ) elif in_ind > 2: - raise Exception("Unexpected input found for MatrixVectorActivation") + raise Exception("Unexpected input found for MatrixVectorActivation_rtl") in_ind += 1 if mode == "rtlsim": @@ -759,7 +730,7 @@ def execute_node(self, context, graph): def code_generation_ipgen(self, model, fpgapart, clk): """Normally: Generates C++ code and tcl script for IP generation. Here: Generates (System-)Verilog code for IP generation.""" - self.generate_hdl() + self.generate_hdl(model, fpgapart, clk) def ipgen_singlenode_code(self): """Normally: Builds the bash script for IP generation.""" @@ -828,11 +799,21 @@ def code_generation_ipi(self): "create_bd_intf_pin -mode Slave " "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) ) - # instantiate the hls ip - cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (self.get_nodeattr("ip_vlnv"), node_name, node_name) - ) + # instantiate the RTL block + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + rtllib_dir + "mvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_8sx9.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv" + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append("create_bd_cell -type hier -reference %s /%s/%s" % (self.get_nodeattr("gen_top_module"), self.onnx_node.name, 
self.onnx_node.name)) + # instantiate a streamer and connect it to the HLS IP strm_vlnv = "xilinx.com:user:memstream:1.0" strm_inst = node_name + "_wstrm" @@ -947,12 +928,6 @@ def get_op_and_param_counts(self): weight_param_type = "param_weight_%db" % (weight_bits) weight_count = in_features * out_features ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} - if self.get_nodeattr("noActivation") == 0: - tdt = DataType[self.get_nodeattr("accDataType")] - thres_bits = tdt.bitwidth() - thres_param_type = "param_threshold_%db" % (thres_bits) - thres_count = out_features - ret_dict[thres_param_type] = thres_count return ret_dict def derive_characteristic_fxns(self, period): @@ -972,65 +947,113 @@ def derive_characteristic_fxns(self, period): ] super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) - def generate_hdl(self): -#TODO: add distinction between (PE=MH or PE=1) and where MH dimension is folded - template_path, code_gen_dict = self.prepare_codegen_default() +# TODO: characterize max_clk and implement this function in look-up style + def _resolve_segment_len(self, clk): + # Insert pipeline registers in the DSP chain to meet target clock frequency + segmentlen = 0 + return segmentlen + + def _resolve_impl_style(self, fpgapart): + # Based on target device and activation/weight-width, choose the supported RTL module + act_width = self.get_input_datatype(0).bitwidth() + weight_width = self.get_input_datatype(1).bitwidth() + is_versal = fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpgapart[0:5] == "xqrvc" + if (act_width == 4 and weight_width == 4): + return "mvu_4sx4u" + else: + if (is_versal): + return "mvu_8sx9_dsp58" + else: + return "mvu_8sx8u_dsp48" + + def generate_hdl(self, model, fpgapart, clk): + # Generate params as part of IP preparation + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + self.generate_params(model, code_gen_dir) + template_path, code_gen_dict = 
self.prepare_codegen_default(fpgapart, clk) # add general parameters to dictionary - code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()] + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] # save top module name so we can refer to it after this node has been renamed # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) -#TODO: currently only ram_style=auto is supported + ram_style = self.get_nodeattr("ram_style") - if ram_style == "auto": - continue - else: - raise Exception("Unrecognized ram_style for MatrixVectorActivation") + assert (ram_style=="auto"), "Unrecognized ram_style for MatrixVectorActivation_rtl" - # apply code generation to templates - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # apply code generation to template with open(template_path, "r") as f: - template = f.read() + template_wrapper = f.read() for key in code_gen_dict: # transform list into long string separated by '\n' code_gen_line = "\n".join(code_gen_dict[key]) - template = template.replace(key, code_gen_line) template_wrapper = template_wrapper.replace(key, code_gen_line) with open( os.path.join( - code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv" + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" ), "w", ) as f: - f.write(template) + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(0))) with open( os.path.join( - code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v" ), "w", ) as f: - f.write(template_wrapper) + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(1))) # set ipgen_path and ip_path so that HLS-Synth transformation # and stich_ip transformation do not complain self.set_nodeattr("ipgen_path", code_gen_dir) - self.set_nodeattr("ip_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) - def 
prepare_codegen_default(self): - # TODO: Differentiate between PE folding and fully unrolled along MH dimension + def prepare_codegen_default(self, fpgapart, clk): template_path = ( - os.environ["FINN_ROOT"] + "/finn-rtllib/mvau/dsp58_mvau_template.vhdl" + os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v" ) + code_gen_dict = {} - - code_gen_dict["$PE$"] = self.get_nodeattr("PE") - code_gen_dict["$SIMD$"] = self.get_nodeattr("SIMD") - code_gen_dict["$MW$"] = self.get_nodeattr("MW") - code_gen_dict["$MH$"] = self.get_nodeattr("MH") - code_gen_dict["$ACTIVATION_WIDTH$"] = self.get_input_datatype(0).bitwidth() - code_gen_dict["$WEIGHT_WIDTH$"] = self.get_input_datatype(1).bitwidth() - code_gen_dict["$ACCU_WIDTH_BA$"] = self.get_output_datatype().bitwidth() + code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))] + code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))] + code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] + code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))] + code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())] + code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())] + code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())] + code_gen_dict["$SIGNED_ACTIVATIONS$"] = [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] + code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] + code_gen_dict["$MVU_IMPL_STYLE$"] = [self._resolve_impl_style(fpgapart)] return template_path, code_gen_dict + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # Path to (System-)Verilog files used by top-module & path to top-module + verilog_paths = [ + 
code_gen_dir, + os.environ["FINN_ROOT"] + "/finn-rtllib/mvu" + ] + verilog_files = [ + self.get_nodeattr("gen_top_module") + "_wrapper_sim.v" + ] + + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name() + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + + return sim \ No newline at end of file From 4e44934c3001174e52c62caf5d320104a308e611 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:31:35 +0100 Subject: [PATCH 024/112] [tests]: initial version of unit test for RTL custom op and specialize_to_rtl transformation for MVU --- .../test_fpgadataflow_mvau_rtl.py | 172 ++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py new file mode 100644 index 0000000000..20a249bd08 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py @@ -0,0 +1,172 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest +import os + +import numpy as np +from onnx import TensorProto, helper +from qonnx.util.basic import ( + qonnx_make_model, + gen_finn_dt_tensor +) +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.core.datatype import DataType +from qonnx.transformation.general import GiveUniqueNodeNames +import finn.core.onnx_exec as oxe +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from qonnx.transformation.general import ApplyConfig +import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl +#import qonnx.core.data_layout as DataLayout + +build_dir = os.environ["FINN_BUILD_DIR"] + +def make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt): + (ofm_h, ofm_w) = ofm_shape + ofm = helper.make_tensor_value_info( + "ofm", + TensorProto.FLOAT, + (1, ofm_h, ofm_w, mh) + ) + + matmul_node = helper.make_node( + "MatMul", + ["ifm", "weights"], + 
["ofm"] + ) + graph = helper.make_graph( + nodes=[matmul_node], + name="matmul_graph", + inputs=[ifm], + outputs=[ofm] + ) + + model = qonnx_make_model(graph, producer_name="fclayer-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("ifm", idt) + model.set_tensor_datatype("weights", wdt) + model.set_tensor_datatype("ofm", DataType["INT32"]) # At this step, the MatMul layer does not optimize the bit-width of the output datatype + model.set_initializer("weights", W) + + # model.set_tensor_layout("ifm", DataLayout.NHWC) + + return model + +def prepare_inputs(input_tensor): + return {"inp": input_tensor} + +@pytest.mark.parametrize("mh", [16]) +@pytest.mark.parametrize("mw", [90]) +#@pytest.mark.parametrize("pe", [1, 2, 4, 8, 16]) +@pytest.mark.parametrize("pe", [16]) +#@pytest.mark.parametrize("simd", [1, 30, 90]) +@pytest.mark.parametrize("simd", [90]) +@pytest.mark.parametrize("idt", [DataType["INT8"]]) +@pytest.mark.parametrize("wdt", [DataType["UINT4"]]) +#@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"]) +@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"]) +@pytest.mark.parametrize("segmentlen", [1]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen): + # Create test input vector (produced by SWG) + ofm_shape = (5, 5) + ofm_h, ofm_w = ofm_shape + ifm = helper.make_tensor_value_info( + "ifm", + TensorProto.FLOAT, + [1, ofm_h, ofm_w, mw] + ) + weights = helper.make_tensor_value_info( + "weights", + TensorProto.FLOAT, + [mw, mh] + ) + W = gen_finn_dt_tensor(wdt, (mw, mh)) + model = make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt) + model = model.transform(GiveUniqueNodeNames()) + + model.save(build_dir+"/matmul.onnx") + + # Create MatMul & obtain golden reference output + A = gen_finn_dt_tensor(model.get_tensor_datatype("ifm"), model.get_tensor_shape("ifm")) + input_dict = 
prepare_inputs(A) + + ## Execute ONNX model + output_matmul = oxe.execute_onnx(model, input_dict) + + # Create MVAU (HLS) + model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")) + model = model.transform(GiveUniqueNodeNames()) + + # Apply folding (i.e. specify to use DSPs) + folding_config = { + "Defaults": {}, + "MatrixVectorActivation_0": { + "PE" : pe, + "SIMD" : simd, + "mem_mode" : "decoupled", + "ram_style" : "auto", + "resType" : "dsp", + "impl" : "rtl" + } + } + model = model.transform(ApplyConfig(folding_config)) + model.save(build_dir+"/mvau_hls.onnx") + + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareIP(part, 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + output_mvau_hls = oxe.execute_onnx(model, input_dict)["ofm"] + + # Apply convert-to-rtl step + model = model.transform(to_rtl.InferRTLMatrixVectorActivation()) + model = model.transform(GiveUniqueNodeNames()) + model.save(build_dir+"/mvau_rtl.onnx") + + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareIP("xcvm1802-vsvd1760-2MP-e-S", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + output_mvau_rtl = oxe.execute_onnx(model, input_dict)["ofm"] + + model.save(build_dir+"/mvau_rtl_sim.onnx") + + assert (output_mvau_hls == output_mvau_rtl).all() + assert (output_mvau_hls.size > 0) + + +# python setup.py test --addopts "-k test_fpgadataflow_mvau_rtl" +# python setup.py test --addopts "-k test_fpgadataflow_fclayer_rtlsim" \ No newline at end of file From cc361d9fd4ea082e04d7a1a6bc3932406b0a4f14 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:32:52 +0100 Subject: [PATCH 025/112] [rtl mvu]: specialized compute core for 4-bit weights and activations for DSP48/DSP58 --- finn-rtllib/mvu/mvu_4sx4u.sv | 359 +++++++++++++++++++++++++++++++++++ 1 file changed, 359 insertions(+) create mode 100644 
finn-rtllib/mvu/mvu_4sx4u.sv diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv new file mode 100644 index 0000000000..5993154355 --- /dev/null +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -0,0 +1,359 @@ +module mvu_4sx4u #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + bit FORCE_BEHAVIORAL = 0 +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][3:0] w, // signed weights + input logic [SIMD-1:0][3:0] a, // unsigned activations + + // Ouput + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + logic [1:5] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if(en) L <= { last, L[1:4] }; + end + assign vld = L[5]; + + // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism + localparam int unsigned D[4:0] = '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; // Lane offsets + + localparam int unsigned PIPE_COUNT = (PE+3)/4; + for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes + + localparam int unsigned PE_BEG = 4*c; + localparam int unsigned PE_END = PE < 4*(c+1)? 
PE : 4*(c+1); + + uwire [57:0] p3[SIMD]; + uwire signed [ 1:0] h3[SIMD][3]; + for(genvar s = 0; s < SIMD; s++) begin : genSIMD + + // Input Lane Assembly + uwire [23:0] bb = a[s]; + logic [33:0] aa; + logic [26:0] dd; + logic [ 1:0] xx[3:1]; + if(1) begin : blkVectorize + uwire [3:0] ww[PE_END - PE_BEG]; + for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin + assign ww[pe] = w[PE_BEG + pe][s]; + if(pe) begin +// assign xx[pe] = zero? 0 : ww[pe] * a[s]; + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[pe][1]), + .O5(xx[pe][0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end + end + always_comb begin + dd = '0; + aa = '0; + for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin + dd[D[pe]+:3] = ww[pe]; + aa[D[pe]+ 3] = ww[pe][3]; + end + end + end : blkVectorize + + uwire [57:0] pp; + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. + if (FORCE_BEHAVIORAL) begin : genBehav + // Stage #1: Input Refine + logic signed [23:0] B1 = 0; + always_ff @(posedge clk) begin + if(zero) B1 <= 0; + else if(en) B1 <= bb; + end + + logic signed [26:0] AD1 = 0; + always_ff @(posedge clk) begin + if(rst) AD1 <= 0; + else if(en) AD1 <= dd - aa; + end + + // Stage #2: Multiply + logic signed [50:0] M2 = 0; + always_ff @(posedge clk) begin + if(rst) M2 <= 0; + else if(en) M2 <= +// synthesis translate off + (B1 === '0) || (AD1 === '0)? 0 : +// synthesis translate on + B1 * AD1; + end + + // Stage #3: Accumulate + logic signed [57:0] P3 = 0; + always_ff @(posedge clk) begin + if(rst) P3 <= 0; + else if(en) P3 <= M2 + (L[3]? 
0 : P3); + end + + assign pp = P3; + end : genBehav + else begin : genDSP + DSP48E2 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND('0), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE48"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_48_96"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
+ .MASK('1), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_OPMODE_INVERTED(9'b00_010_01_01), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + .IS_RSTB_INVERTED('0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(1), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(0), // Pipeline stages for A (0-2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(1), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(0), // Pipeline 
stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(1) // Number of pipeline stages for P (0-1) + ) dsp ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(), // 58-bit output: Cascade output + + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN('x), // 58-bit input: P cascade + + // Control inputs: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .INMODE(5'b01100), // 5-bit input: INMODE control + .OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }), // 9-bit input: Operation mode + + // Data inputs: Data Ports + .A(aa), // 34-bit input: A data + .B(bb), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D(dd), // 27-bit input: D data + + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1('0), // 1-bit input: 
Clock enable for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(en), // 1-bit input: Clock enable for PREG + .RSTA('0), // 1-bit input: Reset for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + end : genDSP + + // External Canary Pipeline + logic [1:0] X1[3:1] = '{ default: 0 }; + logic [1:0] X2[3:1] = '{ default: 0 }; + logic [1:0] X3[3:1] = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) begin + X1 <= '{ default: 0 }; + X2 <= '{ default: 0 }; + X3 <= '{ default: 0 }; + end + else if(en) begin + X1 <= xx; + X2 <= X1; + foreach(X3[i]) begin + X3[i] <= X2[i] + (L[3]? 
2'h0 : pp[D[i]+:2]); + end + end + end + + // Derive actual cross-lane overflows + for(genvar i = 0; i < 3; i++) begin + assign h3[s][i] = pp[D[i+1]+:2] - X3[i+1]; + end + assign p3[s] = pp; + + end : genSIMD + + // Stage #4: Cross-SIMD Reduction + + // Count leaves reachable from each node + localparam leave_load_t LEAVE_LOAD = init_leave_loads(); + + uwire signed [ACCU_WIDTH -1:0] up4; + uwire signed [ACCU_WIDTH -8:0] hi4[3]; + uwire [$clog2(SIMD)+7:0] lo4[3]; + for(genvar i = 0; i < 4; i++) begin + localparam int unsigned LO_WIDTH = D[i+1] - D[i]; + localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; + + // Conclusive high part accumulation + if(i < 3) begin : genHi + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) Hi4 <= (L[4]? 
0 : Hi4) + tree[0]; + end + assign hi4[i] = Hi4; + end : genHi + + // Conclusive low part accumulation + if(1) begin : blkLo + // Adder Tree across all SIMD low contributions + localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); + uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + logic [ROOT_WIDTH-1:0] Lo4 = 0; + always_ff @(posedge clk) begin + if(rst) Lo4 <= 0; + else if(en) Lo4 <= tree[0]; + end + + if(i == 3) assign up4 = Lo4; + else assign lo4[i] = Lo4; + end : blkLo + + end + + // Stage #5: Resolve lane totals + logic signed [3:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) Res5 <= '{ default: 0 }; + else if(en) begin + Res5[3] <= up4 - hi4[2]; + Res5[2] <= $signed({ hi4[2], {(D[3] - D[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1]; + Res5[1] <= $signed({ hi4[1], {(D[2] - D[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0]; + Res5[0] <= $signed({ hi4[0], {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] }); + end + end + + // Output + for(genvar pe = PE_BEG; pe < PE_END; pe++) begin + assign p[pe] = Res5[pe - PE_BEG]; + end + + end : genPipes + +endmodule : mvu_4sx4u \ No newline at end of file From 8eefb535c3da6482f95465df05b8d3e1c610be21 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:33:31 +0100 Subject: [PATCH 026/112] [rtl mvu]: specialized compute core for > 4-bit weights and activations for DSP48 --- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 358 +++++++++++++++++++++++++++++ 1 file changed, 358 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv 
b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv new file mode 100644 index 0000000000..e06a92c8fa --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -0,0 +1,358 @@ +module mvu_8sx8u_dsp48 #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + bit FORCE_BEHAVIORAL = 0, + + localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights + input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // unsigned activations + + // Ouput + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + logic [1:5] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if(en) L <= { last, L[1:4] }; + end + assign vld = L[5]; + + // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism + localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets + + localparam int unsigned PIPE_COUNT = (PE+1)/2; + for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes + + localparam int unsigned PE_BEG = 2*c; + localparam int unsigned PE_END = PE < 2*(c+1)? 
PE : 2*(c+1); + + uwire [57:0] p3[SIMD]; + uwire signed [ 1:0] h3[SIMD]; + for(genvar s = 0; s < SIMD; s++) begin : genSIMD + + // Input Lane Assembly + uwire [23:0] bb = a[s]; + logic [33:0] aa; + logic [26:0] dd; + logic [ 1:0] xx; + if(1) begin : blkVectorize + uwire [WEIGHT_WIDTH-1:0] ww[PE_END - PE_BEG]; + for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin + assign ww[pe] = w[PE_BEG + pe][s]; + if(pe) begin +// assign xx[pe] = zero? 0 : ww[pe] * a[s]; + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[1]), + .O5(xx[0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end + end + always_comb begin + dd = '0; + aa = '0; + for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin + dd[D[pe] +: WEIGHT_WIDTH-1] = ww[pe]; + aa[D[pe] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; + end + end + end : blkVectorize + + uwire [57:0] pp; + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. + if (FORCE_BEHAVIORAL) begin : genBehav + // Stage #1: Input Refine + logic signed [23:0] B1 = 0; + always_ff @(posedge clk) begin + if(zero) B1 <= 0; + else if(en) B1 <= bb; + end + + logic signed [26:0] AD1 = 0; + always_ff @(posedge clk) begin + if(rst) AD1 <= 0; + else if(en) AD1 <= dd - aa; + end + + // Stage #2: Multiply + logic signed [50:0] M2 = 0; + always_ff @(posedge clk) begin + if(rst) M2 <= 0; + else if(en) M2 <= +// synthesis translate off + (B1 === '0) || (AD1 === '0)? 0 : +// synthesis translate on + B1 * AD1; + end + + // Stage #3: Accumulate + logic signed [57:0] P3 = 0; + always_ff @(posedge clk) begin + if(rst) P3 <= 0; + else if(en) P3 <= M2 + (L[3]? 
0 : P3); + end + + assign pp = P3; + end : genBehav + else begin : genDSP + DSP48E2 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND('0), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE48"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_48_96"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
+ .MASK('1), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_OPMODE_INVERTED(9'b00_010_01_01), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + .IS_RSTB_INVERTED('0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(1), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(0), // Pipeline stages for A (0-2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(1), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(0), // Pipeline 
stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(1) // Number of pipeline stages for P (0-1) + ) dsp ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(), // 58-bit output: Cascade output + + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN('x), // 58-bit input: P cascade + + // Control inputs: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .INMODE(5'b01100), // 5-bit input: INMODE control + .OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }), // 9-bit input: Operation mode + + // Data inputs: Data Ports + .A(aa), // 34-bit input: A data + .B(bb), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D(dd), // 27-bit input: D data + + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1('0), // 1-bit input: 
Clock enable for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(en), // 1-bit input: Clock enable for PREG + .RSTA('0), // 1-bit input: Reset for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + end : genDSP + + // External Canary Pipeline + logic [1:0] X1 = '{ default: 0 }; + logic [1:0] X2 = '{ default: 0 }; + logic [1:0] X3 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) begin + X1 <= '{ default: 0 }; + X2 <= '{ default: 0 }; + X3 <= '{ default: 0 }; + end + else if(en) begin + X1 <= xx; + X2 <= X1; + X3 <= X2 + (L[3]? 
2'h0 : pp[D[1]+:2]); + end + end + + // Derive actual cross-lane overflows + assign h3[s] = pp[D[1]+:2] - X3; + + assign p3[s] = pp; + + end : genSIMD + + // Stage #4: Cross-SIMD Reduction + + // Count leaves reachable from each node + localparam leave_load_t LEAVE_LOAD = init_leave_loads(); + + uwire signed [ACCU_WIDTH -1:0] up4; + uwire signed [ACCU_WIDTH -SINGLE_PROD_WIDTH:0] hi4; + uwire [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0] lo4; + for(genvar i = 0; i < 2; i++) begin + localparam int unsigned LO_WIDTH = D[i+1] - D[i]; + localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; + + // Conclusive high part accumulation + if(i == 0) begin : genHi + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) Hi4 <= (L[4]? 
0 : Hi4) + tree[0]; + end + assign hi4 = Hi4; + end : genHi + + // Conclusive low part accumulation + if(1) begin : blkLo + // Adder Tree across all SIMD low contributions + localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); + uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + logic [ROOT_WIDTH-1:0] Lo4 = 0; + always_ff @(posedge clk) begin + if(rst) Lo4 <= 0; + else if(en) Lo4 <= tree[0]; + end + + if(i == 1) assign up4 = Lo4; + else assign lo4 = Lo4; + end : blkLo + + end + + // Stage #5: Resolve lane totals + logic signed [1:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) Res5 <= '{ default: 0 }; + else if(en) begin + Res5[1] <= up4 - hi4; + Res5[0] <= $signed({ hi4, {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4 }); + end + end + + // Output + for(genvar pe = PE_BEG; pe < PE_END; pe++) begin + assign p[pe] = Res5[pe - PE_BEG]; + end + + end : genPipes + +endmodule : mvu_8sx8u_dsp48 \ No newline at end of file From e7109e75161774280b24e5884f6c9b9c17a07f7b Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:34:23 +0100 Subject: [PATCH 027/112] [fpgadataflow transform]: initial specialize_to_rtl_layers-transform for MVU --- .../fpgadataflow/specialize_to_rtl_layers.py | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py new file mode 100644 index 0000000000..7d677ec216 --- /dev/null +++ 
b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py @@ -0,0 +1,105 @@ +# Copyright (c) 2023, AMD +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from qonnx.transformation.base import Transformation +from qonnx.custom_op.registry import getCustomOp +from qonnx.core.datatype import DataType +from onnx import helper +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.infer_datatypes import InferDataTypes +from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth + +class InferRTLMatrixVectorActivation(Transformation): + """Convert (HLS-based) MatrixVectorActivation layers to specialized RTL layers if supported.""" + + def __init__(self): + super().__init__() + + def _is_rtl_variant_compatible(self, n): + no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 + act_width_in_range = (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8) or (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0) + weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 + folding_supported = (getCustomOp(n).get_nodeattr("MH") % getCustomOp(n).get_nodeattr("PE") == 0) and (getCustomOp(n).get_nodeattr("MW") % getCustomOp(n).get_nodeattr("SIMD") == 0) + + if (no_activation and act_width_in_range and weight_width_in_range and folding_supported): + return True + else: + return False + + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "MatrixVectorActivation": + preferred_in_rtl = getCustomOp(n).get_nodeattr("impl") == "rtl" and getCustomOp(n).get_nodeattr("resType") == "dsp" + supported_in_rtl = self._is_rtl_variant_compatible(n) + if (preferred_in_rtl and supported_in_rtl): + mvau_input = n.input[0] + mvau_weight = n.input[1] + mvau_output = n.output[0] + inputDataType = getCustomOp(n).get_nodeattr("inputDataType") + weightDataType = getCustomOp(n).get_nodeattr("weightDataType") + outputDataType = 
getCustomOp(n).get_nodeattr("outputDataType") + numInputVectors = getCustomOp(n).get_nodeattr("numInputVectors") + mw = getCustomOp(n).get_nodeattr("MW") + mh = getCustomOp(n).get_nodeattr("MH") + simd = getCustomOp(n).get_nodeattr("SIMD") + pe = getCustomOp(n).get_nodeattr("PE") + mem_mode = getCustomOp(n).get_nodeattr("mem_mode") + + new_node = helper.make_node( + "MatrixVectorActivation_rtl", + [mvau_input, mvau_weight], + [mvau_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + inputDataType=inputDataType, + weightDataType=weightDataType, + outputDataType=outputDataType, + numInputVectors=numInputVectors, + mem_mode=mem_mode, + name=n.name + "_rtl", + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(n) + graph_modified=True + + if graph_modified: + model = model.transform(MinimizeAccumulatorWidth()) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + + return (model, graph_modified) \ No newline at end of file From 5a868d19e5955abdb894bf1e8b93d2d1f6f8410d Mon Sep 17 00:00:00 2001 From: Yaman Umuroglu Date: Tue, 9 May 2023 09:41:15 +0200 Subject: [PATCH 028/112] [rtl mvu] fixes for latest memstream + linting --- .../matrixvectoractivation_rtl.py | 136 ++++++++++-------- 1 file changed, 77 insertions(+), 59 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index 6b1c2f3be7..8fd261d395 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -27,7 +27,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import math -from shutil import copy import numpy as np import os import textwrap @@ -40,20 +39,18 @@ ) from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import ( npy_to_rtlsim_input, - numpy_to_hls_code, pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir try: from pyverilator import PyVerilator except ModuleNotFoundError: PyVerilator = None -from . import templates # ONNX i/o tensor shape assumptions for MatrixVectorActivation: # input 0 is the input tensor, shape (.., i_size) = (..., MW) @@ -69,7 +66,6 @@ class MatrixVectorActivation_rtl(HLSCustomOp): def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) - self.decoupled_wrapper = templates.decoupled_wrapper def get_nodeattr_types(self): my_attrs = { @@ -186,17 +182,24 @@ def verify_node(self): ) num_of_inputs = len(self.onnx_node.input) - if num_of_inputs!=2: - info_messages.append("RTL-based MatrixVectorActivation expects two inputs (weights and activation), but got {} inputs.".format(len(self.onnx_node.input))) + if num_of_inputs != 2: + info_messages.append( + "RTL-based MatrixVectorActivation expects two inputs " + "(weights and activation), but got {} inputs.".format( + len(self.onnx_node.input) + ) + ) mem_mode = self.get_nodeattr("mem_mode") if mem_mode != "decoupled": - info_messages.append("RTL-based MVAU supports only decoupled weights currently") + info_messages.append( + "RTL-based MVAU supports only decoupled weights currently" + ) return info_messages -# TODO: Add in replay_buffer estimation + # TODO: Add in replay_buffer estimation def uram_estimation(self): P = self.get_nodeattr("PE") Q = self.get_nodeattr("SIMD") @@ -218,7 +221,7 @@ def uram_estimation(self): depth_multiplier = math.ceil(omega / 4096) return width_multiplier * depth_multiplier -# TODO: Add in replay_buffer estimation + # 
TODO: Add in replay_buffer estimation def bram_estimation(self): """Calculates resource estimation for BRAM based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -259,7 +262,7 @@ def bram_estimation(self): else: return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36)) -# TODO: Add in replay_buffer estimation + # TODO: Add in replay_buffer estimation def bram_efficiency_estimation(self): wdt = self.get_weight_datatype() W = wdt.bitwidth() @@ -272,7 +275,7 @@ def bram_efficiency_estimation(self): bram16_est_capacity = bram16_est * 36 * 512 return wbits / bram16_est_capacity -# TODO: Add in replay_buffer estimation + # TODO: Add in replay_buffer estimation def uram_efficiency_estimation(self): """Function for URAM efficiency estimation: actual parameter storage needed divided by the allocated URAM storage (from estimation)""" @@ -287,7 +290,7 @@ def uram_efficiency_estimation(self): uram_est_capacity = uram_est * 72 * 4096 return wbits / uram_est_capacity -#TODO: FIX: worst case estimates since segmentlen is not known at this point? + # TODO: FIX: worst case estimates since segmentlen is not known at this point? def lut_estimation(self): """Calculates resource estimations for LUTs based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -328,13 +331,9 @@ def lut_estimation(self): acc_bits = W + A + np.ceil(math.log(MW, 2)) acc_luts = acc_bits - return int( - c0 - + c1 * (P * (mult_luts + addertree_luts + acc_luts)) - + c2 - ) + return int(c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2) -#TODO: FIX: worst case estimates since segmentlen is not known at this point? + # TODO: FIX: worst case estimates since segmentlen is not known at this point? 
def dsp_estimation(self): # multiplication P = self.get_nodeattr("PE") @@ -350,7 +349,7 @@ def dsp_estimation(self): mult_dsp = 0 return int(mult_dsp) -#TODO: FIX: worst case estimates since segmentlen is not known at this point + # TODO: FIX: worst case estimates since segmentlen is not known at this point def get_exp_cycles(self): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") @@ -359,7 +358,9 @@ def get_exp_cycles(self): mw = self.get_nodeattr("MW") # since mmv != 1 is not supported yet, we set mmv for now to 1 mmv = 1 - # Actual exp_cycles is probably slightly larger (say 3 cycles (DSP A/B, M, P - reg) + additional pipeline buffer cycles. Most probably <10) + # Actual exp_cycles is probably slightly larger (say 3 cycles + # (DSP A/B, M, P - reg) + additional pipeline buffer cycles. + # Most probably <10) exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv return int(exp_cycles) @@ -384,7 +385,9 @@ def get_output_datatype(self, ind=0): def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() - assert (i_bits<=9), "RTL-based MVAU only supports activations with bit-width up to 9-bits" + assert ( + i_bits <= 9 + ), "RTL-based MVAU only supports activations with bit-width up to 9-bits" in_width = i_bits * self.get_nodeattr("SIMD") return in_width @@ -402,7 +405,9 @@ def get_weightstream_width(self): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") wp = self.get_weight_datatype().bitwidth() - assert (wp <= 8), "RTL-based MVAU only supports weights with bit-width up to 8-bits" + assert ( + wp <= 8 + ), "RTL-based MVAU only supports weights with bit-width up to 8-bits" w_width = pe * simd * wp return w_width else: @@ -516,7 +521,8 @@ def minimize_accumulator_width(self, model): else: adt = DataType.get_smallest_possible(acc_max) # Note: we are interested in simply the width of the output dot product. 
- # Padding the actual output stream to a multiple of 8-bits is done in the RTL component + # Padding the actual output stream to a multiple of 8-bits is done in + # the RTL component self.set_nodeattr("accDataType", adt.name) # for no-activation nodes, output dt = acc dt self.set_nodeattr("outputDataType", adt.name) @@ -615,9 +621,7 @@ def generate_params(self, model, path): # and one for synthesis. this is because URAM-based weights always # need zero weights for synthesis, otherwise they get inferred # as BRAM - weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format( - code_gen_dir - ) + weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir) weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir) # sim weights are always the true weights self.make_weight_file( @@ -734,11 +738,11 @@ def code_generation_ipgen(self, model, fpgapart, clk): def ipgen_singlenode_code(self): """Normally: Builds the bash script for IP generation.""" - pass + pass def code_generation_cppsim(self, model): """Normally: Generates C++ code for simulation (cppsim).""" - pass + pass def compile_singlenode_code(self): pass @@ -803,19 +807,28 @@ def code_generation_ipi(self): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") sourcefiles = [ - os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), rtllib_dir + "mvu_axi.sv", rtllib_dir + "replay_buffer.sv", rtllib_dir + "mvu_4sx4u.sv", rtllib_dir + "mvu_8sx9.sv", - rtllib_dir + "mvu_8sx8u_dsp48.sv" + rtllib_dir + "mvu_8sx8u_dsp48.sv", ] for f in sourcefiles: cmd.append("add_files -norecurse %s" % (f)) - cmd.append("create_bd_cell -type hier -reference %s /%s/%s" % (self.get_nodeattr("gen_top_module"), self.onnx_node.name, self.onnx_node.name)) + cmd.append( + "create_bd_cell -type hier -reference %s /%s/%s" + % ( + 
self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + self.onnx_node.name, + ) + ) # instantiate a streamer and connect it to the HLS IP - strm_vlnv = "xilinx.com:user:memstream:1.0" + strm_vlnv = "amd.com:FINN:memstream:1.0" strm_inst = node_name + "_wstrm" cmd.append( "create_bd_cell -type ip -vlnv %s /%s/%s" @@ -849,11 +862,11 @@ def code_generation_ipi(self): % (node_name, strm_inst, node_name, node_name, sname) ) cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]" + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]" % (node_name, rst_name, node_name, strm_inst) ) cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]" + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]" % (node_name, clk_name, node_name, strm_inst) ) cmd.append( @@ -947,21 +960,25 @@ def derive_characteristic_fxns(self, period): ] super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) -# TODO: characterize max_clk and implement this function in look-up style + # TODO: characterize max_clk and implement this function in look-up style def _resolve_segment_len(self, clk): # Insert pipeline registers in the DSP chain to meet target clock frequency segmentlen = 0 return segmentlen def _resolve_impl_style(self, fpgapart): - # Based on target device and activation/weight-width, choose the supported RTL module + # Based on target device and activation/weight-width, choose the + # supported RTL module act_width = self.get_input_datatype(0).bitwidth() weight_width = self.get_input_datatype(1).bitwidth() - is_versal = fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpgapart[0:5] == "xqrvc" - if (act_width == 4 and weight_width == 4): + is_versal = ( + fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] + or fpgapart[0:5] == "xqrvc" + ) + if act_width == 4 and weight_width == 4: return "mvu_4sx4u" else: - if (is_versal): + if is_versal: return "mvu_8sx9_dsp58" else: 
return "mvu_8sx8u_dsp48" @@ -973,13 +990,17 @@ def generate_hdl(self, model, fpgapart, clk): template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk) # add general parameters to dictionary - code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [ + self.get_verilog_top_module_name() + ] # save top module name so we can refer to it after this node has been renamed # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) ram_style = self.get_nodeattr("ram_style") - assert (ram_style=="auto"), "Unrecognized ram_style for MatrixVectorActivation_rtl" + assert ( + ram_style == "auto" + ), "Unrecognized ram_style for MatrixVectorActivation_rtl" # apply code generation to template with open(template_path, "r") as f: @@ -1009,19 +1030,21 @@ def generate_hdl(self, model, fpgapart, clk): self.set_nodeattr("ip_path", code_gen_dir) def prepare_codegen_default(self, fpgapart, clk): - template_path = ( - os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v" - ) - + template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v" + code_gen_dict = {} code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))] code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))] code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))] - code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())] + code_gen_dict["$ACTIVATION_WIDTH$"] = [ + str(self.get_input_datatype(0).bitwidth()) + ] code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())] code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())] - code_gen_dict["$SIGNED_ACTIVATIONS$"] = [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] + code_gen_dict["$SIGNED_ACTIVATIONS$"] = ( + [str(1)] if (self.get_input_datatype(0).min() 
< 0) else [str(0)] + ) code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] code_gen_dict["$MVU_IMPL_STYLE$"] = [self._resolve_impl_style(fpgapart)] @@ -1035,15 +1058,10 @@ def prepare_rtlsim(self): if PyVerilator is None: raise ImportError("Installation of PyVerilator is required.") - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") # Path to (System-)Verilog files used by top-module & path to top-module - verilog_paths = [ - code_gen_dir, - os.environ["FINN_ROOT"] + "/finn-rtllib/mvu" - ] - verilog_files = [ - self.get_nodeattr("gen_top_module") + "_wrapper_sim.v" - ] + verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"] + verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"] # build the Verilator emu library sim = PyVerilator.build( @@ -1051,9 +1069,9 @@ def prepare_rtlsim(self): build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), verilog_path=verilog_paths, trace_depth=get_rtlsim_trace_depth(), - top_module_name=self.get_verilog_top_module_name() + top_module_name=self.get_verilog_top_module_name(), ) # save generated lib filename in attribute self.set_nodeattr("rtlsim_so", sim.lib._name) - - return sim \ No newline at end of file + + return sim From 4a9cfa1c7a17497578faad3f76c25b80c116ba58 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 11 May 2023 10:56:07 +0100 Subject: [PATCH 029/112] [rtl custom_op]: add support for external weights --- .../matrixvectoractivation_rtl.py | 67 ++++++++++--------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index 8fd261d395..162b5e2e16 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -192,9 +192,9 @@ def verify_node(self): mem_mode = 
self.get_nodeattr("mem_mode") - if mem_mode != "decoupled": + if mem_mode not in ["decoupled", "external"]: info_messages.append( - "RTL-based MVAU supports only decoupled weights currently" + "RTL-based MVAU supports only decoupled or external weights." ) return info_messages @@ -612,35 +612,20 @@ def generate_params(self, model, path): code_gen_dir = path # weights, if not external weights = model.get_initializer(self.onnx_node.input[1]) - if mem_mode == "decoupled": + if mem_mode in ["decoupled", "external"]: weight_filename_sim = "{}/weights.npy".format(code_gen_dir) # save decoupled weights for cppsim self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) - # Also save weights as Verilog .dat file - # note that we provide two different .dat files, one for synth - # and one for synthesis. this is because URAM-based weights always - # need zero weights for synthesis, otherwise they get inferred - # as BRAM - weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir) - weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir) - # sim weights are always the true weights - self.make_weight_file( - weights, "decoupled_verilog_dat", weight_filename_rtl_sim - ) - ram_style = self.get_nodeattr("ram_style") - if ram_style == "ultra": - # UltraRAM must have no memory initializer, or only zeroes - # otherwise BRAM will be inferred instead of URAM - # as a workaround we provide a zero-weight init here - synth_weights = np.zeros_like(weights, dtype=np.float32) - else: - synth_weights = weights - self.make_weight_file( - synth_weights, "decoupled_verilog_dat", weight_filename_rtl_synth - ) + if mem_mode == "decoupled": + # also save weights as Verilog .dat file + # This file will be ignored when synthesizing UltraScale memory. 
+ weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) + self.make_weight_file( + weights, "decoupled_verilog_dat", weight_filename_rtl + ) else: raise Exception( - """Please set mem_mode to "decoupled", + """Please set mem_mode to "const", "decoupled", or "external", currently no other parameter value is supported!""" ) @@ -695,7 +680,7 @@ def execute_node(self, context, graph): ) super().reset_rtlsim(sim) super().toggle_clk(sim) - if mem_mode == "external" or mem_mode == "decoupled": + if mem_mode in ["external", "decoupled"]: wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() wei = npy_to_rtlsim_input( @@ -903,9 +888,31 @@ def code_generation_ipi(self): # TODO calculate and pass in segment size here cmd.append("assign_bd_address") cmd.append("save_bd_design") - elif mem_mode == "const" or mem_mode == "external": - # base class impl sufficient for const/external modes - return super().code_generation_ipi() + elif mem_mode == "external": + # instantiate the RTL block + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + rtllib_dir + "mvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_8sx9.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append( + "create_bd_cell -type module -reference %s %s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + ) + ) + cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/in0_V]" % (self.onnx_node.name)) + cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/out_V]" % (self.onnx_node.name)) else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") return cmd From 8a9ac1af4d6c62e7c9557ab41992b84cf2c37ae1 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Thu, 11 May 2023 11:04:28 +0100 Subject: [PATCH 030/112] Specify clock and reset associations of bus interfaces. --- finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 4 +++- finn-rtllib/mvu/mvu_axi_wrapper.v | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v index 502a72d3f2..fb3c62a15a 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v @@ -49,8 +49,10 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter OUTPUT_LANES = PE, parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 )( - // Global Control + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *) input logic ap_clk, + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *) input logic ap_rst_n, // Weight Stream diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v index b79ba6bbd1..d8acaefcc7 100644 --- a/finn-rtllib/mvu/mvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_axi_wrapper.v @@ -50,8 +50,10 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter OUTPUT_LANES = PE, parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 )( - // Global Control + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *) input ap_clk, + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *) input ap_rst_n, // Weight Stream input [WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, From d9b90793bd54a5e112531c737fa7c60a51b21d34 Mon Sep 17 00:00:00 2001 From: Yaman Umuroglu Date: Mon, 15 May 2023 10:16:48 +0200 Subject: [PATCH 031/112] [rtlmvu] More fixes for memstream and param gen --- .../fpgadataflow/matrixvectoractivation_rtl.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git 
a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index 162b5e2e16..1791327e78 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -612,7 +612,11 @@ def generate_params(self, model, path): code_gen_dir = path # weights, if not external weights = model.get_initializer(self.onnx_node.input[1]) +<<<<<<< HEAD if mem_mode in ["decoupled", "external"]: +======= + if mem_mode == "decoupled" or mem_mode == "external": +>>>>>>> 72fe4c5b ([rtlmvu] More fixes for memstream and param gen) weight_filename_sim = "{}/weights.npy".format(code_gen_dir) # save decoupled weights for cppsim self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) @@ -821,22 +825,16 @@ def code_generation_ipi(self): ) cmd.append( "set_property -dict [list " - "CONFIG.NSTREAMS {1} " - "CONFIG.MEM_DEPTH {%d} " - "CONFIG.MEM_WIDTH {%d} " - "CONFIG.MEM_INIT {%s} " + "CONFIG.DEPTH {%d} " + "CONFIG.WIDTH {%d} " + "CONFIG.INIT_FILE {%s} " "CONFIG.RAM_STYLE {%s} " - "CONFIG.STRM0_DEPTH {%d} " - "CONFIG.STRM0_WIDTH {%d} " - "CONFIG.STRM0_OFFSET {0} " "] [get_bd_cells /%s/%s]" % ( self.calc_wmem(), self.get_weightstream_width_padded(), - self.get_nodeattr("code_gen_dir_ipgen") + "/", + self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat", self.get_nodeattr("ram_style"), - self.calc_wmem(), - self.get_weightstream_width_padded(), node_name, strm_inst, ) From a5f2a83897e33acb4b3e2231d9bfa534e56bb6b2 Mon Sep 17 00:00:00 2001 From: Yaman Umuroglu Date: Thu, 11 May 2023 23:49:10 +0200 Subject: [PATCH 032/112] [Build] apply config to only FIFO nodes in step_set_fifo_depths --- src/finn/builder/build_dataflow_steps.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 65ab2b0b93..d4af757491 100644 --- 
a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -53,6 +53,7 @@ from shutil import copy import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl import finn.transformation.streamline.absorb as absorb from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -123,7 +124,6 @@ ) from finn.util.pyverilator import verilator_fifosim from finn.util.test import execute_parent -import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl def verify_step( @@ -486,14 +486,13 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig): - """Convert layers implemented in HLS to an equivalent specialized RTL implementation if possible.""" - specialize_to_rtl_transforms = [ - to_rtl.InferRTLMatrixVectorActivation() - ] + """Convert layers implemented in HLS to an equivalent specialized RTL + implementation if possible.""" + specialize_to_rtl_transforms = [to_rtl.InferRTLMatrixVectorActivation()] for trn in specialize_to_rtl_transforms: model = model.transform(trn) return model - + def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig): """Tighten the weight and accumulator bit widths for each layer.""" @@ -594,7 +593,12 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) if cfg.folding_config_file is not None: - model = model.transform(ApplyConfig(cfg.folding_config_file)) + model = model.transform( + ApplyConfig( + cfg.folding_config_file, + node_filter=lambda x: x.op_type == "StreamingFIFO", + ) + ) # extract the final configuration and save it as json hw_attrs = [ From 
08cbdc59a95ed6281c3234c5e8b0b9d7327a2988 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 24 May 2023 07:58:41 +0100 Subject: [PATCH 033/112] Revised control interface attributes. --- finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 29 +++++++++++++------------- finn-rtllib/mvu/mvu_axi_wrapper.v | 8 ++++--- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v index fb3c62a15a..e15f77fbae 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v @@ -50,25 +50,26 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 )( // Global Control - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *) - input logic ap_clk, - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *) - input logic ap_rst_n, + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, // Weight Stream - input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - input logic s_axis_weights_tvalid, - output logic s_axis_weights_tready, + input [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input s_axis_weights_tvalid, + output s_axis_weights_tready, // Input Stream - input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input logic s_axis_input_tvalid, - output logic s_axis_input_tready, + input [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input s_axis_input_tvalid, + output s_axis_input_tready, // Output Stream - output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, - output logic m_axis_output_tvalid, - input logic m_axis_output_tready + output [OUTPUT_STREAM_WIDTH_BA-1:0] 
m_axis_output_tdata, + output m_axis_output_tvalid, + input m_axis_output_tready ); mvu_8sx9_axi #( @@ -89,4 +90,4 @@ mvu_8sx9_axi #( .m_axis_output_tready(m_axis_output_tready) ); -endmodule : $MODULE_NAME_AXI_WRAPPER$ \ No newline at end of file +endmodule : $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v index d8acaefcc7..239c5bbacd 100644 --- a/finn-rtllib/mvu/mvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_axi_wrapper.v @@ -51,10 +51,12 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 )( // Global Control - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *) + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) input ap_clk, - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *) + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) input ap_rst_n, + // Weight Stream input [WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, input weights_V_TVALID, @@ -87,4 +89,4 @@ mvu_axi #( .m_axis_output_tready(out_V_TREADY) ); -endmodule : $MODULE_NAME_AXI_WRAPPER$ \ No newline at end of file +endmodule : $MODULE_NAME_AXI_WRAPPER$ From d058cc2a5c1ed71a2c2ea12034cfa921818381ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 24 May 2023 09:16:50 +0100 Subject: [PATCH 034/112] Mask device primitives from Verilator in favor of using behavioral code. 
--- finn-rtllib/mvu/mvu_4sx4u.sv | 38 ++++++++++++++++++++---------- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 38 ++++++++++++++++++++---------- finn-rtllib/mvu/mvu_8sx9.sv | 29 ++++++++++++++--------- 3 files changed, 68 insertions(+), 37 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 5993154355..21594e46ac 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -19,6 +19,12 @@ module mvu_4sx4u #( output logic vld, output logic signed [PE-1:0][ACCU_WIDTH-1:0] p ); + // Verilator always to use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; typedef int unsigned leave_load_t[2*SIMD-1]; function leave_load_t init_leave_loads(); @@ -59,17 +65,21 @@ module mvu_4sx4u #( for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin assign ww[pe] = w[PE_BEG + pe][s]; if(pe) begin -// assign xx[pe] = zero? 0 : ww[pe] * a[s]; - LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( - .O6(xx[pe][1]), - .O5(xx[pe][0]), - .I5(1'b1), - .I4(zero), - .I3(ww[pe][1]), - .I2(a[s][1]), - .I1(ww[pe][0]), - .I0(a[s][0]) - ); + if(BEHAVIORAL) assign xx[pe] = zero? 0 : ww[pe] * a[s]; +`ifndef VERILATOR + else begin + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[pe][1]), + .O5(xx[pe][0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end +`endif end end always_comb begin @@ -87,7 +97,7 @@ module mvu_4sx4u #( // Note: Since the product B * AD is computed, // rst can be only applied to AD and zero only to B // with the same effect as zeroing both. 
- if (FORCE_BEHAVIORAL) begin : genBehav + if (BEHAVIORAL) begin : genBehav // Stage #1: Input Refine logic signed [23:0] B1 = 0; always_ff @(posedge clk) begin @@ -121,6 +131,7 @@ module mvu_4sx4u #( assign pp = P3; end : genBehav +`ifndef VERILATOR else begin : genDSP DSP48E2 #( // Feature Control Attributes: Data Path Selection @@ -252,6 +263,7 @@ module mvu_4sx4u #( .RSTP(rst) // 1-bit input: Reset for PREG ); end : genDSP +`endif // External Canary Pipeline logic [1:0] X1[3:1] = '{ default: 0 }; @@ -356,4 +368,4 @@ module mvu_4sx4u #( end : genPipes -endmodule : mvu_4sx4u \ No newline at end of file +endmodule : mvu_4sx4u diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index e06a92c8fa..09db360b77 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -23,6 +23,12 @@ module mvu_8sx8u_dsp48 #( output logic vld, output logic signed [PE-1:0][ACCU_WIDTH-1:0] p ); + // Verilator always to use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; typedef int unsigned leave_load_t[2*SIMD-1]; function leave_load_t init_leave_loads(); @@ -63,17 +69,21 @@ module mvu_8sx8u_dsp48 #( for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin assign ww[pe] = w[PE_BEG + pe][s]; if(pe) begin -// assign xx[pe] = zero? 0 : ww[pe] * a[s]; - LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( - .O6(xx[1]), - .O5(xx[0]), - .I5(1'b1), - .I4(zero), - .I3(ww[pe][1]), - .I2(a[s][1]), - .I1(ww[pe][0]), - .I0(a[s][0]) - ); + if(BEHAVIORAL) assign xx[pe] = zero? 
0 : ww[pe] * a[s]; +`ifndef VERILATOR + else begin + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[1]), + .O5(xx[0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end +`endif end end always_comb begin @@ -91,7 +101,7 @@ module mvu_8sx8u_dsp48 #( // Note: Since the product B * AD is computed, // rst can be only applied to AD and zero only to B // with the same effect as zeroing both. - if (FORCE_BEHAVIORAL) begin : genBehav + if(BEHAVIORAL) begin : genBehav // Stage #1: Input Refine logic signed [23:0] B1 = 0; always_ff @(posedge clk) begin @@ -125,6 +135,7 @@ module mvu_8sx8u_dsp48 #( assign pp = P3; end : genBehav +`ifndef VERILATOR else begin : genDSP DSP48E2 #( // Feature Control Attributes: Data Path Selection @@ -256,6 +267,7 @@ module mvu_8sx8u_dsp48 #( .RSTP(rst) // 1-bit input: Reset for PREG ); end : genDSP +`endif // External Canary Pipeline logic [1:0] X1 = '{ default: 0 }; @@ -355,4 +367,4 @@ module mvu_8sx8u_dsp48 #( end : genPipes -endmodule : mvu_8sx8u_dsp48 \ No newline at end of file +endmodule : mvu_8sx8u_dsp48 diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index 2d1da26efb..f8e2ab3985 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -52,11 +52,17 @@ module mvu_8sx9 #( input logic zero, // ignore current inputs and force this partial product to zero input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // activations - + // Ouput output logic vld, output logic [PE-1:0][ACCU_WIDTH-1:0] p ); + // Verilator always to use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; //-------------------- Declare global signals --------------------\\ localparam int unsigned CHAINLEN = (SIMD+2)/3; @@ -75,7 +81,7 @@ module mvu_8sx9 #( L[1+MAX_PIPELINE_STAGES] <= last; L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES]; end - end + end 
assign vld = L[0]; //-------------------- Shift register for ZERO flag --------------------\\ @@ -87,7 +93,7 @@ module mvu_8sx9 #( else if(en) begin Z[0] <= zero; if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2]; - end + end end end; @@ -157,12 +163,12 @@ module mvu_8sx9 #( if (LAST) begin : genPOUT assign p[j] = pp[ACCU_WIDTH-1:0]; - end + end // Note: Since the product B * AD is computed, // rst can be only applied to AD and zero only to B // with the same effect as zeroing both. - if (FORCE_BEHAVIORAL) begin : genBehav + if(BEHAVIORAL) begin : genBehav // Stage #1: Input A/B logic signed [33:0] Areg [INTERNAL_PREGS]; always_ff @(posedge clk) begin @@ -233,7 +239,7 @@ module mvu_8sx9 #( assign pp = Preg; assign pcout[j][i] = pp; end : genBehav - +`ifndef VERILATOR else begin: genDSP DSP58 #( // Feature Control Attributes: Data Path Selection @@ -263,8 +269,8 @@ module mvu_8sx9 #( .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE - .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 - FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN + .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 + FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN 2'b01, // Y : M 2'b01 // X: M }), // Optional inversion for OPMODE @@ -325,7 +331,7 @@ module mvu_8sx9 #( INTERNAL_PREGS==2 ? 1'b0 : 1'b1, 2'b00, TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, - INTERNAL_PREGS==2 ? 1'b0 : 1'b1 + INTERNAL_PREGS==2 ? 
1'b0 : 1'b1 }), // 5-bit input: INMODE control .NEGATE('0), // 3-bit input: Negates the input of the multiplier .OPMODE({ @@ -365,7 +371,8 @@ module mvu_8sx9 #( .RSTP(PREG && rst) // 1-bit input: Reset for PREG ); end : genDSP - end : genDSPChain +`endif + end : genDSPChain end : genDSPPE - + endmodule : mvu_8sx9 From a66f38f2d06901fd27cf874701572268ea4793d6 Mon Sep 17 00:00:00 2001 From: Yaman Umuroglu Date: Thu, 11 May 2023 23:48:36 +0200 Subject: [PATCH 035/112] [Deps] update qonnx --- fetch-repos.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fetch-repos.sh b/fetch-repos.sh index e039ca9144..f1cf8754f2 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -27,7 +27,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -QONNX_COMMIT="20a34289cf2297d2b2bbbe75d6ac152ece86e3b4" +QONNX_COMMIT="bc36fd56bf1e4abfcf98cd76a001cad13d57baac" FINN_EXP_COMMIT="0aa7e1c44b20cf085b6fe42cff360f0a832afd2c" BREVITAS_COMMIT="c65f9c13dc124971f14739349531bbcda5c2a4aa" PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f" From 8f9bd04b3311e56da4684a58d4de868d61f342ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 24 May 2023 12:44:53 +0100 Subject: [PATCH 036/112] Adding folding hints. Impl selection by case statement. --- finn-rtllib/mvu/mvu_axi.sv | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv index e4a919ba88..a181f54ac5 100644 --- a/finn-rtllib/mvu/mvu_axi.sv +++ b/finn-rtllib/mvu/mvu_axi.sv @@ -29,6 +29,14 @@ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * @brief Matrix Vector Unit (MVU) AXI-lite interface wrapper. + * @details + * Folding hints: + * - 4-bit MVU: PE scaling should aim at a full multiple of 4. + * - 8-bit MVU - DSP48: PE scaling should aim at a full multiple of 2. 
+ * - 8-bit MVU - DSP58: SIMD scaling should aim at a full multiple of 3. + * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to + * impact critical paths more than PE scaling. PE scaling implies a + * bigger fanout on the input activations. *****************************************************************************/ module mvu_axi #( @@ -134,8 +142,9 @@ module mvu_axi #( uwire ovld; uwire [PE-1:0][ACCU_WIDTH-1:0] odat; typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - - if (MVU_IMPL_STYLE == "mvu_8sx9_dsp58") begin : genMVU8sx9 + + case(MVU_IMPL_STYLE) + "mvu_8sx9_dsp58": mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( @@ -143,26 +152,27 @@ module mvu_axi #( .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), .vld(ovld), .p(odat) ); - end - else if (MVU_IMPL_STYLE == "mvu_4sx4u") begin : genMVU4sx4u + + "mvu_4sx4u": mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), .vld(ovld), .p(odat) ); - end - else if (MVU_IMPL_STYLE == "mvu_8sx8u_dsp48") begin : genMVU8sx8u + + "mvu_8sx8u_dsp48": mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), .vld(ovld), .p(odat) ); - end - else initial begin - $error("Unrecognized MVU_IMPL_STYLE!"); + + default: initial begin + $error("Unrecognized MVU_IMPL_STYLE '%s'", MVU_IMPL_STYLE); $finish; end + endcase //-------------------- Output register slice --------------------\\ struct packed { @@ 
-185,7 +195,7 @@ module mvu_axi #( end end end - + struct packed { logic vld; logic [PE-1:0][ACCU_WIDTH-1:0] dat; @@ -196,10 +206,10 @@ module mvu_axi #( if(rst) B <= '{ default: 'x }; else begin if(b_load) B <= '{ vld: A.vld, dat: A.dat}; - end + end end assign m_axis_output_tvalid = B.vld; assign m_axis_output_tdata = B.dat; -endmodule : mvu_axi \ No newline at end of file +endmodule : mvu_axi From 9de5ed6f7b459f37bb127f0cd105e6f927d25611 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 24 May 2023 13:52:40 +0100 Subject: [PATCH 037/112] Fixed behavioral sideband prediction. --- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 09db360b77..bd1f813af6 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -69,7 +69,7 @@ module mvu_8sx8u_dsp48 #( for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin assign ww[pe] = w[PE_BEG + pe][s]; if(pe) begin - if(BEHAVIORAL) assign xx[pe] = zero? 0 : ww[pe] * a[s]; + if(BEHAVIORAL) assign xx = zero? 0 : ww[pe] * a[s]; `ifndef VERILATOR else begin LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( From 239759a6a4b8cb008aa9b80d52d15f53f77e5965 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 24 May 2023 15:49:19 +0100 Subject: [PATCH 038/112] [rtl mvu]: extension to allow selecting PE values that are not multiples of 4 --- finn-rtllib/mvu/mvu_4sx4u.sv | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 21594e46ac..111d651cf5 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -50,6 +50,7 @@ module mvu_4sx4u #( localparam int unsigned PE_BEG = 4*c; localparam int unsigned PE_END = PE < 4*(c+1)? 
PE : 4*(c+1); + localparam int unsigned PE_REM = 4*(c+1) - PE_END; uwire [57:0] p3[SIMD]; uwire signed [ 1:0] h3[SIMD][3]; @@ -65,12 +66,12 @@ module mvu_4sx4u #( for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin assign ww[pe] = w[PE_BEG + pe][s]; if(pe) begin - if(BEHAVIORAL) assign xx[pe] = zero? 0 : ww[pe] * a[s]; + if(BEHAVIORAL) assign xx[pe + PE_REM] = zero? 0 : ww[pe] * a[s]; `ifndef VERILATOR else begin LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( - .O6(xx[pe][1]), - .O5(xx[pe][0]), + .O6(xx[pe + PE_REM][1]), + .O5(xx[pe + PE_REM][0]), .I5(1'b1), .I4(zero), .I3(ww[pe][1]), @@ -86,8 +87,8 @@ module mvu_4sx4u #( dd = '0; aa = '0; for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin - dd[D[pe]+:3] = ww[pe]; - aa[D[pe]+ 3] = ww[pe][3]; + dd[D[pe + PE_REM]+:3] = ww[pe]; + aa[D[pe + PE_REM]+ 3] = ww[pe][3]; end end end : blkVectorize @@ -305,7 +306,7 @@ module mvu_4sx4u #( localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; // Conclusive high part accumulation - if(i < 3) begin : genHi + if(i >= PE_REM && i < 3) begin : genHi // Adder Tree across all SIMD high contributions, each from [-1:1] uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; @@ -323,9 +324,12 @@ module mvu_4sx4u #( end assign hi4[i] = Hi4; end : genHi + else begin : genHiZero + assign hi4[i] = '0; + end : genHiZero // Conclusive low part accumulation - if(1) begin : blkLo + if(i >= PE_REM) begin : blkLo // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; @@ -346,6 +350,9 @@ module mvu_4sx4u #( if(i == 3) assign up4 = Lo4; else assign lo4[i] = Lo4; end : blkLo + else begin : blkLoZero + assign lo4[i] = '0; + end : blkLoZero end @@ -363,7 +370,7 @@ module mvu_4sx4u #( // Output for(genvar pe = PE_BEG; pe < PE_END; pe++) begin - assign p[pe] = Res5[pe - PE_BEG]; + assign p[pe] = Res5[pe - PE_BEG + PE_REM]; end end 
: genPipes From 8d3247ccf7657aeb534147a5dd9511fa397d4eb2 Mon Sep 17 00:00:00 2001 From: Yaman Umuroglu Date: Wed, 24 May 2023 15:56:07 +0200 Subject: [PATCH 039/112] [rtlmvu] Avoid unintentional verilator metacomments --- finn-rtllib/mvu/mvu_4sx4u.sv | 2 +- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +- finn-rtllib/mvu/mvu_8sx9.sv | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 21594e46ac..9f101e8c29 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -19,7 +19,7 @@ module mvu_4sx4u #( output logic vld, output logic signed [PE-1:0][ACCU_WIDTH-1:0] p ); - // Verilator always to use behavioral code + // for verilator always use behavioral code localparam bit BEHAVIORAL = `ifdef VERILATOR 1 || diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index bd1f813af6..6b54e91b6a 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -23,7 +23,7 @@ module mvu_8sx8u_dsp48 #( output logic vld, output logic signed [PE-1:0][ACCU_WIDTH-1:0] p ); - // Verilator always to use behavioral code + // for verilator always use behavioral code localparam bit BEHAVIORAL = `ifdef VERILATOR 1 || diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index f8e2ab3985..a601066cfd 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -57,7 +57,7 @@ module mvu_8sx9 #( output logic vld, output logic [PE-1:0][ACCU_WIDTH-1:0] p ); - // Verilator always to use behavioral code + // for verilator always use behavioral code localparam bit BEHAVIORAL = `ifdef VERILATOR 1 || From c8663505dcd2c2eeb3ddad05d361f82be32040eb Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 24 May 2023 17:14:23 +0100 Subject: [PATCH 040/112] [rtl mvu]: extension to allow selecting PE values that are not multiples of 2 --- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 57 +++++++++++++++++------------- 1 file 
changed, 32 insertions(+), 25 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 6b54e91b6a..5cc3fa4c49 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -54,6 +54,7 @@ module mvu_8sx8u_dsp48 #( localparam int unsigned PE_BEG = 2*c; localparam int unsigned PE_END = PE < 2*(c+1)? PE : 2*(c+1); + localparam int unsigned PE_RES = 2*(c+1) - PE_END; uwire [57:0] p3[SIMD]; uwire signed [ 1:0] h3[SIMD]; @@ -90,8 +91,8 @@ module mvu_8sx8u_dsp48 #( dd = '0; aa = '0; for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin - dd[D[pe] +: WEIGHT_WIDTH-1] = ww[pe]; - aa[D[pe] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; + dd[D[pe + PE_RES] +: WEIGHT_WIDTH-1] = ww[pe]; + aa[D[pe + PE_RES] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; end end end : blkVectorize @@ -301,32 +302,35 @@ module mvu_8sx8u_dsp48 #( uwire signed [ACCU_WIDTH -1:0] up4; uwire signed [ACCU_WIDTH -SINGLE_PROD_WIDTH:0] hi4; uwire [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0] lo4; - for(genvar i = 0; i < 2; i++) begin - localparam int unsigned LO_WIDTH = D[i+1] - D[i]; - localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; - // Conclusive high part accumulation - if(i == 0) begin : genHi - // Adder Tree across all SIMD high contributions, each from [-1:1] - uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; - for(genvar n = 0; n < SIMD-1; n++) begin - // Sum truncated to actual maximum bit width at this node - uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = tree[2*n+1] + tree[2*n+2]; - assign tree[n] = s; - end + // Conclusive high part accumulation + if(PE_RES == 0) begin : genHi + localparam int unsigned HI_WIDTH = ACCU_WIDTH - D[1]; + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; + for(genvar n = 0; n < SIMD-1; n++) begin + 
// Sum truncated to actual maximum bit width at this node + uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end - // High Sideband Accumulation - logic signed [HI_WIDTH-1:0] Hi4 = 0; - always_ff @(posedge clk) begin - if(rst) Hi4 <= 0; - else if(en) Hi4 <= (L[4]? 0 : Hi4) + tree[0]; - end - assign hi4 = Hi4; - end : genHi + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) Hi4 <= (L[4]? 0 : Hi4) + tree[0]; + end + assign hi4 = Hi4; + end : genHi + else begin : genHiZero + assign hi4 = '0; + end : genHiZero + for(genvar i = 0; i < 2; i++) begin + localparam int unsigned LO_WIDTH = D[i+1] - D[i]; // Conclusive low part accumulation - if(1) begin : blkLo + if(i >= PE_RES) begin : blkLo // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; @@ -347,6 +351,9 @@ module mvu_8sx8u_dsp48 #( if(i == 1) assign up4 = Lo4; else assign lo4 = Lo4; end : blkLo + else begin : blkLoZero + assign lo4 = '0; + end : blkLoZero end @@ -362,7 +369,7 @@ module mvu_8sx8u_dsp48 #( // Output for(genvar pe = PE_BEG; pe < PE_END; pe++) begin - assign p[pe] = Res5[pe - PE_BEG]; + assign p[pe] = Res5[pe - PE_BEG + PE_RES]; end end : genPipes From fd1e038c643c05199b38320f8815f430e538d936 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 24 May 2023 17:21:56 +0100 Subject: [PATCH 041/112] [rtl mvu axi]: updated comments on folding hints --- finn-rtllib/mvu/mvu_axi.sv | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv index a181f54ac5..cef55949ed 100644 --- a/finn-rtllib/mvu/mvu_axi.sv +++ b/finn-rtllib/mvu/mvu_axi.sv @@ -31,12 +31,13 @@ * @brief Matrix Vector Unit (MVU) AXI-lite interface wrapper. 
* @details * Folding hints: - * - 4-bit MVU: PE scaling should aim at a full multiple of 4. - * - 8-bit MVU - DSP48: PE scaling should aim at a full multiple of 2. - * - 8-bit MVU - DSP58: SIMD scaling should aim at a full multiple of 3. + * - 4-bit MVU: PE scaling should divide MH. + * - 8-bit MVU - DSP48: PE scaling should divide MH. + * - 8-bit MVU - DSP58: SIMD scaling should aim at a full multiple of 3 and divide MW. * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to * impact critical paths more than PE scaling. PE scaling implies a * bigger fanout on the input activations. + * - Full unfolding along MH (PE=MH) results in no replay buffer instantiated *****************************************************************************/ module mvu_axi #( From f60d4c6fa105bd29689b93aafd880ec92c32358c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Jun 2023 11:48:26 +0100 Subject: [PATCH 042/112] [rtl custom op]: minor fixes to codegen --- .../fpgadataflow/matrixvectoractivation_rtl.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index 1791327e78..9f8130806b 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -612,11 +612,7 @@ def generate_params(self, model, path): code_gen_dir = path # weights, if not external weights = model.get_initializer(self.onnx_node.input[1]) -<<<<<<< HEAD - if mem_mode in ["decoupled", "external"]: -======= if mem_mode == "decoupled" or mem_mode == "external": ->>>>>>> 72fe4c5b ([rtlmvu] More fixes for memstream and param gen) weight_filename_sim = "{}/weights.npy".format(code_gen_dir) # save decoupled weights for cppsim self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) @@ -909,8 +905,6 @@ def code_generation_ipi(self): self.onnx_node.name, ) ) 
- cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/in0_V]" % (self.onnx_node.name)) - cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/out_V]" % (self.onnx_node.name)) else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") return cmd @@ -968,8 +962,7 @@ def derive_characteristic_fxns(self, period): # TODO: characterize max_clk and implement this function in look-up style def _resolve_segment_len(self, clk): # Insert pipeline registers in the DSP chain to meet target clock frequency - segmentlen = 0 - return segmentlen + return 4 # default to 4 for now def _resolve_impl_style(self, fpgapart): # Based on target device and activation/weight-width, choose the @@ -1002,11 +995,6 @@ def generate_hdl(self, model, fpgapart, clk): # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) - ram_style = self.get_nodeattr("ram_style") - assert ( - ram_style == "auto" - ), "Unrecognized ram_style for MatrixVectorActivation_rtl" - # apply code generation to template with open(template_path, "r") as f: template_wrapper = f.read() From a1ad304a42bf89b36d6507cf9f749a7a1a7d130a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Jun 2023 11:48:58 +0100 Subject: [PATCH 043/112] [specialize-to-rtl]: add ram_style and rt_writeable_weights support --- .../transformation/fpgadataflow/specialize_to_rtl_layers.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py index 7d677ec216..23b6e59abe 100644 --- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py @@ -74,6 +74,8 @@ def apply(self, model): simd = getCustomOp(n).get_nodeattr("SIMD") pe = getCustomOp(n).get_nodeattr("PE") mem_mode = 
getCustomOp(n).get_nodeattr("mem_mode") + ram_style = getCustomOp(n).get_nodeattr("ram_style") + runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights") new_node = helper.make_node( "MatrixVectorActivation_rtl", @@ -91,6 +93,8 @@ def apply(self, model): numInputVectors=numInputVectors, mem_mode=mem_mode, name=n.name + "_rtl", + ram_style=ram_style, + runtime_writeable_weights=runtime_writeable_weights ) graph.node.insert(node_ind, new_node) # remove old node From 2cbb68fe016ff7ea292ffa071741b352222d1a4c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Jun 2023 11:50:05 +0100 Subject: [PATCH 044/112] [rtllib]: change string type to parameter type due to Vivado error --- finn-rtllib/mvu/mvu_axi.sv | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv index cef55949ed..46167af95b 100644 --- a/finn-rtllib/mvu/mvu_axi.sv +++ b/finn-rtllib/mvu/mvu_axi.sv @@ -51,7 +51,7 @@ module mvu_axi #( bit SIGNED_ACTIVATIONS = 0, int unsigned SEGMENTLEN = 0, bit FORCE_BEHAVIORAL = 0, - string MVU_IMPL_STYLE, + parameter MVU_IMPL_STYLE, // string type causes error in Vivado localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, @@ -163,12 +163,11 @@ module mvu_axi #( "mvu_8sx8u_dsp48": mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), .vld(ovld), .p(odat) ); - default: initial begin $error("Unrecognized MVU_IMPL_STYLE '%s'", MVU_IMPL_STYLE); $finish; From 92eb0edba2d059b8b170ed7e6d8ac7a224c9208c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Jun 2023 11:51:40 +0100 Subject: [PATCH 
045/112] [rtllib]: renamed variable for consistency --- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 5cc3fa4c49..3cd9cef560 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -54,7 +54,7 @@ module mvu_8sx8u_dsp48 #( localparam int unsigned PE_BEG = 2*c; localparam int unsigned PE_END = PE < 2*(c+1)? PE : 2*(c+1); - localparam int unsigned PE_RES = 2*(c+1) - PE_END; + localparam int unsigned PE_REM = 2*(c+1) - PE_END; uwire [57:0] p3[SIMD]; uwire signed [ 1:0] h3[SIMD]; @@ -91,8 +91,8 @@ module mvu_8sx8u_dsp48 #( dd = '0; aa = '0; for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin - dd[D[pe + PE_RES] +: WEIGHT_WIDTH-1] = ww[pe]; - aa[D[pe + PE_RES] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; + dd[D[pe + PE_REM] +: WEIGHT_WIDTH-1] = ww[pe]; + aa[D[pe + PE_REM] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; end end end : blkVectorize @@ -304,7 +304,7 @@ module mvu_8sx8u_dsp48 #( uwire [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0] lo4; // Conclusive high part accumulation - if(PE_RES == 0) begin : genHi + if(PE_REM == 0) begin : genHi localparam int unsigned HI_WIDTH = ACCU_WIDTH - D[1]; // Adder Tree across all SIMD high contributions, each from [-1:1] uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; @@ -330,7 +330,7 @@ module mvu_8sx8u_dsp48 #( for(genvar i = 0; i < 2; i++) begin localparam int unsigned LO_WIDTH = D[i+1] - D[i]; // Conclusive low part accumulation - if(i >= PE_RES) begin : blkLo + if(i >= PE_REM) begin : blkLo // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; @@ -369,7 +369,7 @@ module mvu_8sx8u_dsp48 #( // Output for(genvar pe = PE_BEG; pe < PE_END; pe++) begin - assign p[pe] = Res5[pe - PE_BEG + PE_RES]; + assign p[pe] = Res5[pe - PE_BEG + PE_REM]; end end : 
genPipes From 471a221b975e549e462e7ff9488c65ad182fe278 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Fri, 2 Jun 2023 12:39:14 +0100 Subject: [PATCH 046/112] Fix improper blocking assignment & linting. --- finn-rtllib/mvu/tb/mvu_axi_tb.sv | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv index ef5fa7d682..b89b58f55b 100644 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -42,12 +42,12 @@ module mvu_axi_tb(); localparam int unsigned SEGMENTLEN = 2; localparam string MVU_IMPL_STYLE = "mvu_8sx8u_dsp48"; localparam bit FORCE_BEHAVIORAL = 1; - // Bit-width config + // Bit-width config localparam int unsigned ACTIVATION_WIDTH = 8; localparam int unsigned WEIGHT_WIDTH = 8; localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); localparam bit SIGNED_ACTIVATIONS = 0; - // Simulation constants + // Simulation constants localparam int unsigned NF = MH/PE; localparam int unsigned SF = MW/SIMD; localparam int unsigned NUM_OF_DSP = SIMD/3; @@ -57,7 +57,7 @@ module mvu_axi_tb(); localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; - // Generate clk and reset signal + // Generate clk and reset signal logic clk = 0; always #5ns clk = !clk; @@ -69,7 +69,7 @@ module mvu_axi_tb(); uwire ap_clk = clk; - // Generate activations + // Generate activations typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; typedef activation_t activation_vector_t[SF]; @@ -94,8 +94,8 @@ module mvu_axi_tb(); for (int i=0; i= 1; + do begin + activations.vld <= $urandom()%7 >= 1; @(posedge clk); end while (!(activations.vld === 1 && activations.rdy === 1)); end @@ -104,9 +104,9 @@ module mvu_axi_tb(); activations.dat <= 'x; end - // Generate weights + // Generate weights typedef 
logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; + typedef weight_t weight_matrix_t[NF][SF]; function weight_matrix_t init_WEIGHTS; automatic weight_matrix_t res; @@ -139,7 +139,7 @@ module mvu_axi_tb(); weights.dat <= 'x; end - // Function to compute golden output + // Function to compute golden output // a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0] // w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t; @@ -155,12 +155,12 @@ module mvu_axi_tb(); automatic output_vector_t res = '{default: 0}; for (int j = 0; j>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin + else begin $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); $stop; - end + end end - + NF_CNT += 1; end - $finish; + $finish; end // Instantiate DUT @@ -211,5 +211,5 @@ module mvu_axi_tb(); .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), .m_axis_output_tready(outputs.rdy) ); - + endmodule : mvu_axi_tb From 5c5dc09c98d4e1a07a7e4cae17ca358b197a57c8 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Jun 2023 13:35:04 +0100 Subject: [PATCH 047/112] [test rtl mvu]: modified/extended test cases --- tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py index 20a249bd08..3db7a718f5 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py @@ -86,13 +86,12 @@ def prepare_inputs(input_tensor): return {"inp": input_tensor} @pytest.mark.parametrize("mh", [16]) -@pytest.mark.parametrize("mw", [90]) 
-#@pytest.mark.parametrize("pe", [1, 2, 4, 8, 16]) -@pytest.mark.parametrize("pe", [16]) +@pytest.mark.parametrize("mw", [32]) +@pytest.mark.parametrize("pe", [1, 4, 16]) #@pytest.mark.parametrize("simd", [1, 30, 90]) -@pytest.mark.parametrize("simd", [90]) -@pytest.mark.parametrize("idt", [DataType["INT8"]]) -@pytest.mark.parametrize("wdt", [DataType["UINT4"]]) +@pytest.mark.parametrize("simd", [1, 4, 32]) +@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) +@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]]) #@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"]) @pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"]) @pytest.mark.parametrize("segmentlen", [1]) @@ -166,7 +165,3 @@ def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen): assert (output_mvau_hls == output_mvau_rtl).all() assert (output_mvau_hls.size > 0) - - -# python setup.py test --addopts "-k test_fpgadataflow_mvau_rtl" -# python setup.py test --addopts "-k test_fpgadataflow_fclayer_rtlsim" \ No newline at end of file From b4eb9b69a8a6920fdb3141752395e672f78479e3 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 30 Jun 2023 15:36:17 +0100 Subject: [PATCH 048/112] [rtl mvu]: updated DSP58 >4-bit variant to lift SIMD%3==0 restriction --- finn-rtllib/mvu/mvu_8sx9.sv | 103 +++++++++++++++++++++++------------- 1 file changed, 65 insertions(+), 38 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index a601066cfd..439fbc44f9 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -92,77 +92,95 @@ module mvu_8sx9 #( if (rst) Z <= '{default: 0}; else if(en) begin Z[0] <= zero; - if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2]; + if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3]; end end end; //-------------------- Buffer for input activations --------------------\\ 
localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; - typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t; for (genvar i=0; i1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3; if (EXTERNAL_PREGS > 0) begin : genExternalPregAct - a_buffer_t A [0:EXTERNAL_PREGS-1]; + logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0}; always_ff @(posedge clk) begin if (rst) A <= '{default: 0}; else if(en) begin - A[EXTERNAL_PREGS-1] <= a[3*i +: 3]; + A[EXTERNAL_PREGS-1] <= a[3*i +: LANES_OCCUPIED]; if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; end end - assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]} - : { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ; + for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3; if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight - b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1]; + logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0}; always_ff @(posedge clk) begin if (rst) B <= '{default: 0}; else if (en) begin - B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3]; - if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1]; + B[i][EXTERNAL_PREGS-1] <= w[i][3*j +: LANES_OCCUPIED]; + if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; end end - assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] }; + for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin + assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? 
B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] }; + end : genBin + for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero + assign b_in_i[i][j][8*k +: 8] = 8'b0; + end : genBinZero end : genExternalPregWeight else begin : genInpDSPWeight - assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] }; + for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin + assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; + end : genBin + for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero + assign b_in_i[i][j][8*k +: 8] = 8'b0; + end : genBinZero end : genInpDSPWeight end : genWeightSIMD - end : genWeightPE //-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\ - for (genvar j=0; j0 ? 2 : 1; // 1 : 0 - localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1; - localparam bit FIRST = i == 0; - localparam bit LAST = i == CHAINLEN-1; - uwire [57:0] pp; + localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1; + localparam bit FIRST = j == 0; + localparam bit LAST = j == CHAINLEN-1; if (LAST) begin : genPOUT - assign p[j] = pp[ACCU_WIDTH-1:0]; + assign p[i] = pcout[i][j][ACCU_WIDTH-1:0]; end // Note: Since the product B * AD is computed, @@ -174,7 +192,7 @@ module mvu_8sx9 #( always_ff @(posedge clk) begin if (rst) Areg <= '{ default : 0}; else if (en) begin - Areg[0] <= { 7'bx, a_in_i[i] }; + Areg[0] <= { 7'bx, a_in_i[j] }; if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0]; end end @@ -182,7 +200,7 @@ module mvu_8sx9 #( always_ff @(posedge clk) begin if (rst) Breg <= '{ default : 0}; else if (en) begin - Breg[0] <= b_in_i[j][i]; + Breg[0] <= b_in_i[i][j]; if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0]; end end @@ -217,27 +235,36 @@ module mvu_8sx9 #( end else assign Preg = Mreg; end - else if 
(LAST) begin : genLast + else if (FIRST && LAST) begin : genSingle + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg; + end + end + else if (!FIRST && LAST) begin : genLast always_ff @(posedge clk) begin if (rst) Opmode <= 0; else if (en) Opmode <= L[1]; end always_ff @(posedge clk) begin if (rst) Preg <= 0; - else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[j][i-1]; + else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1]; end end else begin : genMid if (PREG) begin : genPregBehav always_ff @(posedge clk) begin if (rst) Preg <= 0; - else if (en) Preg <= Mreg + pcout[j][i-1]; + else if (en) Preg <= Mreg + pcout[i][j-1]; end end - else assign Preg = Mreg + pcout[j][i-1]; + else assign Preg = Mreg + pcout[i][j-1]; end - assign pp = Preg; - assign pcout[j][i] = pp; + assign pcout[i][j] = Preg; end : genBehav `ifndef VERILATOR else begin: genDSP @@ -307,7 +334,7 @@ module mvu_8sx9 #( .BCOUT(), // 24-bit output: B cascade .CARRYCASCOUT(), // 1-bit output: Cascade carry .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade - .PCOUT(pcout[j][i]), // 58-bit output: Cascade output + .PCOUT(pcout[i][j]), // 58-bit output: Cascade output // Control outputs: Control Inputs/Status Bits .OVERFLOW(), // 1-bit output: Overflow in add/acc .PATTERNBDETECT(), // 1-bit output: Pattern bar detect @@ -322,7 +349,7 @@ module mvu_8sx9 #( .BCIN('x), // 24-bit input: B cascade .CARRYCASCIN('x), // 1-bit input: Cascade carry .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade - .PCIN(FIRST ? 'x : pcout[j][i-1]), // 58-bit input: P cascade + .PCIN(FIRST ? 
'x : pcout[i][j-1]), // 58-bit input: P cascade // Control inputs: Control Inputs/Status Bits .ALUMODE(4'h0), // 4-bit input: ALU control .CARRYINSEL('0), // 3-bit input: Carry select @@ -339,8 +366,8 @@ module mvu_8sx9 #( 7'b000_0000 }), // 9-bit input: Operation mode // Data inputs: Data Ports - .A({ 7'bx, a_in_i[i] }), // 34-bit input: A data - .B(b_in_i[j][i]), // 24-bit input: B data + .A({ 7'bx, a_in_i[j] }), // 34-bit input: A data + .B(b_in_i[i][j]), // 24-bit input: B data .C('x), // 58-bit input: C data .CARRYIN('0), // 1-bit input: Carry-in .D('x), // 27-bit input: D data From ad63673cda849ecf0df993bc83d00e676998ab03 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 30 Jun 2023 15:45:26 +0100 Subject: [PATCH 049/112] [rtl mvu]: bug fix for SIMD=1 init_leave_loads --- finn-rtllib/mvu/mvu_4sx4u.sv | 2 +- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 4674576d23..ac95b5f8a9 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -296,7 +296,7 @@ module mvu_4sx4u #( // Stage #4: Cross-SIMD Reduction // Count leaves reachable from each node - localparam leave_load_t LEAVE_LOAD = init_leave_loads(); + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop uwire signed [ACCU_WIDTH -1:0] up4; uwire signed [ACCU_WIDTH -8:0] hi4[3]; diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 3cd9cef560..416c12c1cc 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -297,7 +297,7 @@ module mvu_8sx8u_dsp48 #( // Stage #4: Cross-SIMD Reduction // Count leaves reachable from each node - localparam leave_load_t LEAVE_LOAD = init_leave_loads(); + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? 
init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop uwire signed [ACCU_WIDTH -1:0] up4; uwire signed [ACCU_WIDTH -SINGLE_PROD_WIDTH:0] hi4; From 79e8a5ef208f7bcdeafa231a5a3dff74177008c9 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 13 Jul 2023 18:34:05 +0100 Subject: [PATCH 050/112] [mvu rtl]: restrict index i to be less than 3 (within bounds of hi4) --- finn-rtllib/mvu/mvu_4sx4u.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index ac95b5f8a9..88985312c9 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -324,7 +324,7 @@ module mvu_4sx4u #( end assign hi4[i] = Hi4; end : genHi - else begin : genHiZero + else if (i < 3) begin : genHiZero assign hi4[i] = '0; end : genHiZero From e3493c30529949a77a3f384fd75c030c551cd2cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Fri, 2 Jun 2023 12:47:53 +0100 Subject: [PATCH 051/112] Rewrite replay_buffer for input elasticity. --- finn-rtllib/mvu/replay_buffer.sv | 153 ++++++++++++++++++------- finn-rtllib/mvu/tb/replay_buffer_tb.sv | 130 +++++++++++++++++++++ 2 files changed, 242 insertions(+), 41 deletions(-) create mode 100644 finn-rtllib/mvu/tb/replay_buffer_tb.sv diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv index 89bbbdb88f..3dfe72d6c6 100644 --- a/finn-rtllib/mvu/replay_buffer.sv +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -51,60 +51,131 @@ module replay_buffer #( input logic ordy ); - typedef logic [$clog2(REP)+$clog2(LEN)-1:0] count_t; - count_t Count = 0; - uwire done_len = LEN == 1 ? 1 : ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0; - uwire done_rep; - uwire done_all = done_len && done_rep; + if(LEN == 0) initial begin + $error("%m: Illegal zero sequence LEN."); + $finish; + end + if(REP == 0) initial begin + $error("%m: Illegal zero REP count."); + $finish; + end + // Track position in Sequence + uwire last_item; uwire shift; - uwire clr = rst || (done_all && shift); - always_ff @(posedge clk) begin - if(clr) Count <= 0; - else if(shift) Count <= Count + ((REP > 1) && done_len? 2**$clog2(LEN)-LEN+1 : 1); + if(LEN == 1) assign last_item = 1; + else begin + typedef logic [$clog2(LEN)-1:0] count_t; + count_t Count = 0; + logic Last = 0; + always_ff @(posedge clk) begin + if(rst) begin + Count <= 0; + Last <= 0; + end + else if(shift) begin + Count <= Count + (Last? 2**$clog2(LEN)-LEN+1 : 1); + Last <= (((LEN-2) & ~Count) == 0) && ((LEN&1) || !Last); + end + end + assign last_item = Last; end - typedef logic [W-1:0] data_t; - uwire data_t rdat; - uwire first_rep; if(REP == 1) begin - assign done_rep = 1; - assign first_rep = 1; - assign rdat = 'x; + assign shift = ivld && ordy; + + assign irdy = ordy; + assign odat = idat; + assign olast = last_item; + assign ofin = last_item; + assign ovld = ivld; end else begin - assign done_rep = ((REP-1) & ~Count[$left(Count):$clog2(LEN)]) == 0; - logic FirstRep = 1; + // Track Repetitions + uwire last_rep; + if(1) begin : blkRep + typedef logic [$clog2(REP)-1:0] rep_t; + rep_t RepCnt = 0; + logic RepLst = 0; + always_ff @(posedge clk) begin + if(rst) begin + RepCnt <= 0; + RepLst <= 0; + end + else if(last_item && shift) begin + RepCnt <= RepCnt + (RepLst? 
2**$clog2(REP)-REP+1 : 1); + RepLst <= (((REP-2) & ~RepCnt) == 0) && ((REP&1) || !RepLst); + end + end + assign last_rep = RepLst; + end : blkRep + + localparam int unsigned AWIDTH = $clog2(LEN); + typedef logic [AWIDTH :0] ptr_t; // pointers with additional generational MSB + typedef logic [W -1:0] data_t; + + // Output Registers + data_t ODat; + logic OVld = 0; + logic OLst = 'x; + logic OFin = 'x; + assign odat = ODat; + assign olast = OLst; + assign ofin = OFin; + assign ovld = OVld; + + // Buffer Memory Management + data_t Mem[2**AWIDTH]; + ptr_t WP = 0; // Write Pointer + ptr_t RP = 0; // Read Pointer + ptr_t FP = 0; // Free Pointer + + // Operational Guards + // Occupancy: WP-FP + // WP-FP < 2**AWIDTH -> writing allowed + // - increments WP + // Availability: WP-RP + // WP-RP > 0 -> reading allowed + // - increments RP, last in sequence rewinds to FP for non-final repetition + // - increments FP in last repetition + assign irdy = !((WP-FP) >> AWIDTH); + + uwire wr = irdy && ivld; + uwire rd = !OVld || ordy; always_ff @(posedge clk) begin - if(clr) FirstRep <= 1; - else if(shift) FirstRep <= FirstRep && !done_len; + if(wr) Mem[WP[AWIDTH-1:0]] <= idat; + if(rd) ODat <= Mem[RP[AWIDTH-1:0]]; end - assign first_rep = FirstRep; - data_t Buf[LEN]; - if(LEN == 1) begin : genTrivial - always_ff @(posedge clk) begin - if(shift && FirstRep) Buf[0] <= idat; + uwire vld = (RP != WP); + assign shift = rd && vld; + always_ff @(posedge clk) begin + if(rst) begin + WP <= 0; + RP <= 0; + FP <= 0; + + OVld <= 0; + OLst <= 'x; + OFin <= 'x; end - end : genTrivial - else begin : genShift - always_ff @(posedge clk) begin - if(shift) begin - Buf[0] <= odat; - Buf[1:LEN-1] <= Buf[0:LEN-2]; + else begin + if(wr) WP <= WP + 1; + if(rd) begin + if(vld) begin + automatic logic rewind = last_item && !last_rep; + RP <= RP + (rewind? 
2**(AWIDTH+1)-LEN+1 : 1); + FP <= FP + last_rep; + end + + OVld <= vld; + OLst <= last_item; + OFin <= last_rep && last_item; end end - end : genShift + end - assign rdat = Buf[LEN-1]; end - assign irdy = ordy && first_rep; - assign odat = first_rep? idat : rdat; - assign olast = done_len; - assign ofin = done_all; - assign ovld = first_rep? ivld : 1; - assign shift = ovld && ordy; - -endmodule : replay_buffer \ No newline at end of file +endmodule : replay_buffer diff --git a/finn-rtllib/mvu/tb/replay_buffer_tb.sv b/finn-rtllib/mvu/tb/replay_buffer_tb.sv new file mode 100644 index 0000000000..5581354e0e --- /dev/null +++ b/finn-rtllib/mvu/tb/replay_buffer_tb.sv @@ -0,0 +1,130 @@ +/****************************************************************************** + * Copyright (C) 2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for replay_buffer module. + * @author Thomas B. Preußer + *****************************************************************************/ + +module replay_buffer_tb; + + // Global Control + logic clk = 0; + always #5ns clk = !clk; + uwire rst = 0; + + // DUT Geometries + localparam int unsigned DIMS[3] = '{ 7, 8, 10 }; + localparam int unsigned W = 8; + typedef logic [W-1:0] data_t; + + bit [2**$size(DIMS)-1:0] done = 0; + always_comb begin + if(&done) begin + $display("Test completed."); + $finish; + end + end + + // Parallel DUT Instantiations + for(genvar r = 0; r < $size(DIMS); r++) begin + for(genvar l = 0; l < $size(DIMS); l++) begin + localparam int unsigned REP = DIMS[r]; + localparam int unsigned LEN = DIMS[l]; + + data_t idat; + logic ivld; + uwire irdy; + + uwire data_t odat; + uwire olast; + uwire ofin; + uwire ovld; + logic ordy; + + replay_buffer #(.LEN(LEN), .REP(REP), .W(W)) dut ( + .clk, .rst, + .idat, .ivld, .irdy, + .odat, .olast, .ofin, .ovld, .ordy + ); + + // Input Feed: 0, 1, ..., 10*LEN-1 + initial begin + idat = 'x; + ivld = 0; + @(posedge clk iff !rst); + + for(int unsigned i = 0; i < 10*LEN; i++) begin + idat <= i; + ivld <= 1; + @(posedge clk iff irdy); + idat <= 'x; + ivld <= 0; + while($urandom()%(REP-1) != 0) @(posedge clk); + end + end + + // Output Check + initial begin + automatic int unsigned base = 0; + + ordy = 0; + @(posedge clk iff !rst); + + for(int unsigned k = 0; k < 
10; k++) begin + for(int unsigned j = 0; j < REP; j++) begin + for(int unsigned i = 0; i < LEN; i++) begin + ordy <= 1; + @(posedge clk iff ovld); + assert(odat == base+i) else begin + $error("#%0d.%0d: Data mismatch: %0d instead of %0d.", r, l, odat, base+i); + $stop; + end + assert(olast == (i == LEN-1)) else begin + $error("#%0d.%0d: Last mismatch.", r, l); + $stop; + end + assert(ofin == ((i == LEN-1) && (j == REP-1))) else begin + $error("#%0d.%0d: Fin mismatch.", r, l); + $stop; + end + + ordy <= 0; + while($urandom()%13 == 0) @(posedge clk); + end + end + base += LEN; + end + + done[$size(DIMS)*r + l] <= 1; + end + end + end + +endmodule : replay_buffer_tb From 2efba6854267873c84d58f6d8fe6b64f649eaa99 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 5 Sep 2023 13:53:01 +0100 Subject: [PATCH 052/112] [to-rtl]: Infer unique node names after transformation is applied --- .../transformation/fpgadataflow/specialize_to_rtl_layers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py index 23b6e59abe..47ed5ce863 100644 --- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py @@ -32,6 +32,7 @@ from onnx import helper from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.general import GiveUniqueNodeNames from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth class InferRTLMatrixVectorActivation(Transformation): @@ -105,5 +106,6 @@ def apply(self, model): model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) + model = model.transform(GiveUniqueNodeNames()) return (model, graph_modified) \ No newline at end of file From 
114ea1bfed2dd2f14196f98aea97d6cac9d1d57e Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 18 Sep 2023 14:56:07 +0100 Subject: [PATCH 053/112] [mvu rtl]: add synthesis directive to handle 'X in simulation --- finn-rtllib/mvu/mvu_8sx9.sv | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index 439fbc44f9..34aa856b1b 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -110,13 +110,17 @@ module mvu_8sx9 #( always_ff @(posedge clk) begin if (rst) A <= '{default: 0}; else if(en) begin - A[EXTERNAL_PREGS-1] <= a[3*i +: LANES_OCCUPIED]; + A[EXTERNAL_PREGS-1] <= +// synthesis translate_off + zero ? '1 : +// synthesis translate_on + a[3*i +: LANES_OCCUPIED]; if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; end end for (genvar j=0; j 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; end end @@ -161,7 +173,11 @@ module mvu_8sx9 #( end : genExternalPregWeight else begin : genInpDSPWeight for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin - assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; + assign b_in_i[i][j][8*k +: 8] = +// synthesis translate_off + zero ? '1 : +// synthesis translate_on + PAD_BITS_WEIGHT == 0 ? 
w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; end : genBin for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero assign b_in_i[i][j][8*k +: 8] = 8'b0; @@ -178,9 +194,10 @@ module mvu_8sx9 #( localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1; localparam bit FIRST = j == 0; localparam bit LAST = j == CHAINLEN-1; + uwire [57:0] pp; if (LAST) begin : genPOUT - assign p[i] = pcout[i][j][ACCU_WIDTH-1:0]; + assign p[i] = pp[ACCU_WIDTH-1:0]; end // Note: Since the product B * AD is computed, @@ -264,6 +281,7 @@ module mvu_8sx9 #( end else assign Preg = Mreg + pcout[i][j-1]; end + assign pp = Preg; assign pcout[i][j] = Preg; end : genBehav `ifndef VERILATOR From 79fafdb25a8707f740a0a7e21aa4f55ef7101882 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 18 Sep 2023 15:06:36 +0100 Subject: [PATCH 054/112] [replay buffer rtl]: minor fix to when LEN=1 (= AWIDTH=0) --- finn-rtllib/mvu/replay_buffer.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv index 3dfe72d6c6..942f1823ca 100644 --- a/finn-rtllib/mvu/replay_buffer.sv +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -144,8 +144,8 @@ module replay_buffer #( uwire wr = irdy && ivld; uwire rd = !OVld || ordy; always_ff @(posedge clk) begin - if(wr) Mem[WP[AWIDTH-1:0]] <= idat; - if(rd) ODat <= Mem[RP[AWIDTH-1:0]]; + if(wr) Mem[WP[AWIDTH:0]] <= idat; + if(rd) ODat <= Mem[RP[AWIDTH:0]]; end uwire vld = (RP != WP); From 619d9db0d5872d1afd72b1d1df841e1f87a9f33a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 18 Sep 2023 15:09:45 +0100 Subject: [PATCH 055/112] [mvu lut]: LUT-based MVU compute core --- finn-rtllib/mvu/mvu_lut.sv | 102 +++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_lut.sv diff --git a/finn-rtllib/mvu/mvu_lut.sv b/finn-rtllib/mvu/mvu_lut.sv new file mode 100644 index 0000000000..b100a589e8 --- /dev/null +++ 
b/finn-rtllib/mvu/mvu_lut.sv @@ -0,0 +1,102 @@ +module mvu_lut #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + bit SIGNED_ACTIVATIONS, + bit M_REG = 1, + + localparam unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights + input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // (un)signed activations + + // Ouput + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + uwire last_i; + generate if (M_REG) begin + logic [0:1] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if (en) L <= {last, L[0]}; + end + assign last_i = L[1]; + end + else begin + logic L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if (en) L <= last; + end + assign last_i = L; + end + endgenerate + + // For each PE generate + for (genvar i = 0; i < PE; i++) begin : genPE + // Stage #1: SIMD multipliers in parallel + uwire [MULT_WIDTH-1 : 0] m1 [SIMD]; + for (genvar j = 0; j < SIMD; j++) begin : genSIMD + if (M_REG) begin : genMreg + logic [MULT_WIDTH-1 : 0] M [SIMD]; + always_ff @(posedge clk) begin + if(rst) M[j] = '{ default : 0 }; + else if (en) M[j] = zero ? 0 : + SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) : + $signed({1'b0, a[j]}) * $signed(w[i][j]); + // (SIGNED_ACTIVATIONS ? 
$signed(a[j]) : a[j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication + end + assign m1[j] = M[j]; + end : genMreg + else begin : genNoMreg + assign m1[j] = zero ? 0 : + SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) : + $signed({1'b0, a[j]}) * $signed(w[i][j]); + end : genNoMreg + end : genSIMD + + // Stage #2: Adder tree to reduce SIMD products + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 }; + localparam int unsigned ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1)); + uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = $signed(m1[s]); + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1)); + uwire signed [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + // Stage #3: Buffer output + logic [ACCU_WIDTH-1:0] P2 [PE]; + always_ff @(posedge clk) begin + if(rst) P2[i] = '{ default : 0}; + else if (en) P2[i] = (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]); + end + + assign vld = last_i; + assign p[i] = P2[i]; + end : genPE + +endmodule : mvu_lut From 090f2ac4adf4b0523b23b27fce05f7422269d72a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 19 Sep 2023 12:23:55 +0100 Subject: [PATCH 056/112] [custom op]: add preferred_backend attribute --- src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 73d39ce642..4f24d71ccc 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -123,7 +123,7 @@ def get_nodeattr_types(self): # weight data from the weight FIFOs. 
"runtime_writeable_weights": ("i", False, 0, {0, 1}), # Flag to specify whether RTL-based or HLS-based implementation is preferred - "impl": ("s", False, "rtl", {"hls", "rtl"}) + "preferred_backend": ("s", False, "rtl", {"hls", "rtl"}) } my_attrs.update(super().get_nodeattr_types()) return my_attrs From ac5e82d9944f5b7475eb13546affd1bc03d57f4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Thu, 21 Sep 2023 13:03:27 +0100 Subject: [PATCH 057/112] Ensure a minimum of two buffer slots even for length-1 sequences. --- finn-rtllib/mvu/replay_buffer.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv index 942f1823ca..d4342f705c 100644 --- a/finn-rtllib/mvu/replay_buffer.sv +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -111,7 +111,7 @@ module replay_buffer #( assign last_rep = RepLst; end : blkRep - localparam int unsigned AWIDTH = $clog2(LEN); + localparam int unsigned AWIDTH = LEN < 2? 1 : $clog2(LEN); typedef logic [AWIDTH :0] ptr_t; // pointers with additional generational MSB typedef logic [W -1:0] data_t; From 85156935163fc803d453db5ce2c1c5163808bc9f Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 15:07:12 +0100 Subject: [PATCH 058/112] [rtl mvu wrapper]: support for vvu layer and rename --- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 92 +++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v new file mode 100644 index 0000000000..6dbf82cb7b --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -0,0 +1,92 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Verilog AXI-lite wrapper for MVU. 
+ *****************************************************************************/ + +module $MODULE_NAME_AXI_WRAPPER$ #( + parameter IS_MVU = "$IS_MVU$", + parameter COMPUTE_CORE = "$COMPUTE_CORE$", + parameter MW = $MW$, + parameter MH = $MH$, + parameter PE = $PE$, + parameter SIMD = $SIMD$, + parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, + parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, + parameter ACCU_WIDTH = $ACCU_WIDTH$, + parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, + parameter SEGMENTLEN = $SEGMENTLEN$, + parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, + + // Safely deducible parameters + parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, + parameter OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 +)( + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + + // Weight Stream + input [WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, + input weights_V_TVALID, + output weights_V_TREADY, + // Input Stream + input [INPUT_STREAM_WIDTH_BA-1:0] in0_V_TDATA, + input in0_V_TVALID, + output in0_V_TREADY, + // Output Stream + output [OUTPUT_STREAM_WIDTH_BA-1:0] out_V_TDATA, + output out_V_TVALID, + input out_V_TREADY +); + +mvu_vvu_axi #( + .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) + ) inst ( + .ap_clk(ap_clk), + .ap_rst_n(ap_rst_n), + .s_axis_weights_tdata(weights_V_TDATA), + .s_axis_weights_tvalid(weights_V_TVALID), + .s_axis_weights_tready(weights_V_TREADY), + .s_axis_input_tdata(in0_V_TDATA), 
+ .s_axis_input_tvalid(in0_V_TVALID), + .s_axis_input_tready(in0_V_TREADY), + .m_axis_output_tdata(out_V_TDATA), + .m_axis_output_tvalid(out_V_TVALID), + .m_axis_output_tready(out_V_TREADY) +); + +endmodule // $MODULE_NAME_AXI_WRAPPER$ From cf28d780041fec1effdf743e62390eebc5c81f98 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:24:18 +0100 Subject: [PATCH 059/112] [mvu vvu tb]: modified testbench to also support testing VVU on DSP58 --- finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv | 222 +++++++++++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv diff --git a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv new file mode 100644 index 0000000000..82c2e8e7b0 --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv @@ -0,0 +1,222 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU AXI-lite interface wrapper. + *****************************************************************************/ + +module mvu_vvu_axi_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam bit IS_MVU = 1; + localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58"; + localparam int unsigned MW = 1500; + localparam int unsigned MH = 256; + localparam int unsigned SIMD = 60; + localparam int unsigned PE = 16; + localparam int unsigned SEGMENTLEN = 2.0; + localparam bit FORCE_BEHAVIORAL = 1; + localparam bit M_REG_LUT = 1; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 4; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = 21; // == ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW) + localparam bit SIGNED_ACTIVATIONS = 0; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = ((IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - (IS_MVU ? 
1 : PE)*SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n = 0; + initial begin + repeat(16) @(posedge clk); + ap_rst_n <= 1; + end + + uwire ap_clk = clk; + + // Generate activations + typedef logic [(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[(IS_MVU ? 1 : NF)*SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } activations; + + initial begin + activations.vld = 0; + activations.dat = 'X; + @(posedge clk iff ap_rst_n); + + for (int j=0; j<(IS_MVU ? 1 : NF); j++) begin + for (int i=0; i= 0; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = 'X; + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i 1 ? $signed(a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]) : $signed(a[j/PE*SF+i/SIMD][i%SIMD]) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]); + else + res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : + $signed(res[j/PE][j%PE]) + ( PE > 1 ? 
$signed({1'b0, a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]}) : $signed({1'b0, a[j/PE+SF+i/SIMD][i%SIMD]}) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]); + end + end + return res; + endfunction : check_output; + + output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS); + + int unsigned NF_CNT = 0; + initial begin + outputs.rdy = 0; + while (NF_CNT < NF) begin + // Loop until both rdy & vld are asserted + do begin + outputs.rdy <= $urandom()%7 >= 0; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_vvu_axi #( + .IS_MVU(IS_MVU), + .COMPUTE_CORE(COMPUTE_CORE), + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), + .M_REG_LUT(M_REG_LUT) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + +endmodule : mvu_vvu_axi_tb From 2617c391e1d2c9b19fb881acb6012fc56df35eae Mon Sep 17 
00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:25:22 +0100 Subject: [PATCH 060/112] [axi wrapper]: minor modification to comment description --- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index 6dbf82cb7b..788e49a71b 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -28,7 +28,7 @@ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * @brief Verilog AXI-lite wrapper for MVU. + * @brief Verilog AXI-lite wrapper for MVU & VVU. *****************************************************************************/ module $MODULE_NAME_AXI_WRAPPER$ #( From 8ca5fe73c003aec3e7998d83e233102c012dd531 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:34:12 +0100 Subject: [PATCH 061/112] [mvu axi]: add support for VVU on DSP58 --- finn-rtllib/mvu/mvu_axi.sv | 105 ++++++++++++++++++++++++------------- 1 file changed, 69 insertions(+), 36 deletions(-) diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv index 46167af95b..07ad32e6c8 100644 --- a/finn-rtllib/mvu/mvu_axi.sv +++ b/finn-rtllib/mvu/mvu_axi.sv @@ -28,19 +28,25 @@ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * @brief Matrix Vector Unit (MVU) AXI-lite interface wrapper. + * @brief Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper. * @details + * The following compute cores are supported: + * - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, + * (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP, + * [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP, + * 'unconstrained' LUT-based MVU and VVU. * Folding hints: - * - 4-bit MVU: PE scaling should divide MH. - * - 8-bit MVU - DSP48: PE scaling should divide MH. 
- * - 8-bit MVU - DSP58: SIMD scaling should aim at a full multiple of 3 and divide MW. + * - PE scaling should divide MH. + * - SIMD scaling should divide MW. * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to * impact critical paths more than PE scaling. PE scaling implies a * bigger fanout on the input activations. * - Full unfolding along MH (PE=MH) results in no replay buffer instantiated *****************************************************************************/ -module mvu_axi #( +module mvu_vvu_axi #( + bit IS_MVU, // string type causes error in Vivado + parameter COMPUTE_CORE, int unsigned MW, int unsigned MH, int unsigned PE, @@ -51,16 +57,16 @@ module mvu_axi #( bit SIGNED_ACTIVATIONS = 0, int unsigned SEGMENTLEN = 0, bit FORCE_BEHAVIORAL = 0, - parameter MVU_IMPL_STYLE, // string type causes error in Vivado + bit M_REG_LUT = 1, + // Safely deducible parameters localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH, localparam int unsigned SF = MW/SIMD, localparam int unsigned NF = MH/PE, - localparam int unsigned OUTPUT_LANES = PE, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 ) ( // Global Control @@ -93,27 +99,31 @@ module mvu_axi #( $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); $finish; end - if (ACTIVATION_WIDTH > 9) begin - $error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH); - $finish; - end if (WEIGHT_WIDTH > 8) begin $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); $finish; end - if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin - $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH); - $finish; + if (ACTIVATION_WIDTH > 8) begin + if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin + $error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH); + $finish; + end end - if (MVU_IMPL_STYLE == "mvu_8sx9") begin + if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin if (SEGMENTLEN == 0) begin - $warning("Segment length of %0d defaults to chain length", SEGMENTLEN); + $warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3); end if (SEGMENTLEN > (SIMD+2)/3) begin $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); $finish; end end + if (!IS_MVU) begin + if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin + $error("VVU only supported on DSP58 or LUT-based implementation"); + $finish; + end + end end uwire clk = ap_clk; @@ -127,10 +137,10 @@ module mvu_axi #( uwire avld; uwire ardy; - replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay ( - .clk, .rst, - 
.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), - .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) + replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay ( + .clk, .rst, + .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), + .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) ); //-------------------- Input control --------------------\\ @@ -139,37 +149,60 @@ module mvu_axi #( assign ardy = en && s_axis_weights_tvalid; assign s_axis_weights_tready = en && avld; -//-------------------- Core MVU --------------------\\ +//-------------------- Core MVU/VVU --------------------\\ uwire ovld; uwire [PE-1:0][ACCU_WIDTH-1:0] odat; typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - - case(MVU_IMPL_STYLE) - "mvu_8sx9_dsp58": - mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + uwire mvauin_t amvau_i; + + if (IS_MVU) begin : genMVUInput + assign amvau_i = amvau; + end : genMVUInput + else begin : genVVUInput + // The input stream will have the channels interleaved for VVU when PE>1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH; + for (genvar i=0; i 1) ? 
+ amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] + : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; + end : genRewire + end : genVVUInput + + case(COMPUTE_CORE) + "mvu_vvu_8sx9_dsp58": + mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) ); - "mvu_4sx4u": mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) ); - "mvu_8sx8u_dsp48": mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), + .vld(ovld), .p(odat) + ); + "mvu_vvu_lut": + mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( + .clk, .rst, .en, + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) ); default: initial begin - $error("Unrecognized MVU_IMPL_STYLE '%s'", MVU_IMPL_STYLE); + $error("Unrecognized COMPUTE_CORE 
'%s'", COMPUTE_CORE); $finish; end endcase @@ -203,7 +236,7 @@ module mvu_axi #( assign b_load = !B.vld || m_axis_output_tready; always_ff @(posedge clk) begin - if(rst) B <= '{ default: 'x }; + if(rst) B <= '{ vld: 0, default: 'x }; else begin if(b_load) B <= '{ vld: A.vld, dat: A.dat}; end @@ -212,4 +245,4 @@ module mvu_axi #( assign m_axis_output_tvalid = B.vld; assign m_axis_output_tdata = B.dat; -endmodule : mvu_axi +endmodule : mvu_vvu_axi From 32d6338c626b26d2e48cdb21cde438d557cc9bcd Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:34:36 +0100 Subject: [PATCH 062/112] [mvu vvu axi]: renamed file for consistency purposes --- finn-rtllib/mvu/mvu_vvu_axi.sv | 248 +++++++++++++++++++++++++++++++++ 1 file changed, 248 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_vvu_axi.sv diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv new file mode 100644 index 0000000000..07ad32e6c8 --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -0,0 +1,248 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper. + * @details + * The following compute cores are supported: + * - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, + * (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP, + * [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP, + * 'unconstrained' LUT-based MVU and VVU. + * Folding hints: + * - PE scaling should divide MH. + * - SIMD scaling should divide MW. + * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to + * impact critical paths more than PE scaling. PE scaling implies a + * bigger fanout on the input activations. 
+ * - Full unfolding along MH (PE=MH) results in no replay buffer instantiated + *****************************************************************************/ + +module mvu_vvu_axi #( + bit IS_MVU, // string type causes error in Vivado + parameter COMPUTE_CORE, + int unsigned MW, + int unsigned MH, + int unsigned PE, + int unsigned SIMD, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + int unsigned SEGMENTLEN = 0, + bit FORCE_BEHAVIORAL = 0, + bit M_REG_LUT = 1, + + // Safely deducible parameters + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, + localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned SF = MW/SIMD, + localparam int unsigned NF = MH/PE, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 +) +( + // Global Control + input logic ap_clk, + input logic ap_rst_n, + + // Weight Stream + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, + output logic s_axis_weights_tready, + + // Input Stream + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, + output logic s_axis_input_tready, + + // Output Stream + output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, + output logic m_axis_output_tvalid, + input logic m_axis_output_tready +); + +//-------------------- Parameter sanity checks --------------------\\ + initial begin + if (MW % SIMD != 0) begin + $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); + $finish; + end + if (MH % PE != 0) begin + $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); + $finish; + end + if (WEIGHT_WIDTH > 8) begin + 
$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); + $finish; + end + if (ACTIVATION_WIDTH > 8) begin + if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin + $error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH); + $finish; + end + end + if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin + if (SEGMENTLEN == 0) begin + $warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + end + if (SEGMENTLEN > (SIMD+2)/3) begin + $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + $finish; + end + end + if (!IS_MVU) begin + if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin + $error("VVU only supported on DSP58 or LUT-based implementation"); + $finish; + end + end + end + + uwire clk = ap_clk; + uwire rst = !ap_rst_n; + + typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t; + + uwire mvauin_t amvau; + uwire alast; + uwire afin; + uwire avld; + uwire ardy; + + replay_buffer #(.LEN(SF), .REP(IS_MVU ? 
NF : 1), .W($bits(mvauin_t))) activation_replay ( + .clk, .rst, + .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), + .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) + ); + +//-------------------- Input control --------------------\\ + uwire en; + uwire istb = avld && s_axis_weights_tvalid; + assign ardy = en && s_axis_weights_tvalid; + assign s_axis_weights_tready = en && avld; + +//-------------------- Core MVU/VVU --------------------\\ + uwire ovld; + uwire [PE-1:0][ACCU_WIDTH-1:0] odat; + typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; + uwire mvauin_t amvau_i; + + if (IS_MVU) begin : genMVUInput + assign amvau_i = amvau; + end : genMVUInput + else begin : genVVUInput + // The input stream will have the channels interleaved for VVU when PE>1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH; + for (genvar i=0; i 1) ? 
+ amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] + : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; + end : genRewire + end : genVVUInput + + case(COMPUTE_CORE) + "mvu_vvu_8sx9_dsp58": + mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk, .rst, .en, + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), + .vld(ovld), .p(odat) + ); + "mvu_4sx4u": + mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk, .rst, .en, + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), + .vld(ovld), .p(odat) + ); + "mvu_8sx8u_dsp48": + mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk, .rst, .en, + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), + .vld(ovld), .p(odat) + ); + "mvu_vvu_lut": + mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( + .clk, .rst, .en, + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), + .vld(ovld), .p(odat) + ); + default: initial begin + $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); + $finish; + end + endcase + +//-------------------- Output register slice --------------------\\ + struct packed { + logic vld; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } A = '{ vld: 0, default: 'x}; + + assign en = !A.vld || !ovld; + + uwire b_load; + always_ff @(posedge clk) begin + 
if(rst) A <= '{ vld: 0, default: 'x }; + else if(!A.vld || b_load) begin + A.vld <= ovld && en; + for(int unsigned i = 0; i < PE; i++) begin + // CR-1148862: + // A.dat[i] <= odat[i]; + automatic logic [ACCU_WIDTH-1:0] v = odat[i]; + A.dat[i] <= v[ACCU_WIDTH-1:0]; + end + end + end + + struct packed { + logic vld; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } B = '{ vld: 0, default: 'x}; + + assign b_load = !B.vld || m_axis_output_tready; + always_ff @(posedge clk) begin + if(rst) B <= '{ vld: 0, default: 'x }; + else begin + if(b_load) B <= '{ vld: A.vld, dat: A.dat}; + end + end + + assign m_axis_output_tvalid = B.vld; + assign m_axis_output_tdata = B.dat; + +endmodule : mvu_vvu_axi From 031406d73fa36a02638a94affd6a0bef36956c3c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:39:22 +0100 Subject: [PATCH 063/112] [mvu 8sx9]: added support for VVU on DSP58, resolved PyVerilator-caused error and added synthesis directive to handle 'X in input data --- finn-rtllib/mvu/mvu_8sx9.sv | 100 +++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 48 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index 34aa856b1b..52a93739d6 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -31,7 +31,8 @@ * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP58. *****************************************************************************/ -module mvu_8sx9 #( +module mvu_vvu_8sx9 #( + parameter IS_MVU, int unsigned PE, int unsigned SIMD, int unsigned ACTIVATION_WIDTH, @@ -39,7 +40,9 @@ module mvu_8sx9 #( int unsigned ACCU_WIDTH, bit SIGNED_ACTIVATIONS = 0, int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) - bit FORCE_BEHAVIORAL = 0 + bit FORCE_BEHAVIORAL = 0, + + int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 
1 : PE) * SIMD ) ( // Global Control @@ -51,7 +54,7 @@ module mvu_8sx9 #( input logic last, input logic zero, // ignore current inputs and force this partial product to zero input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights - input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // activations + input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations // Ouput output logic vld, @@ -67,9 +70,10 @@ module mvu_8sx9 #( //-------------------- Declare global signals --------------------\\ localparam int unsigned CHAINLEN = (SIMD+2)/3; localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length - uwire [26:0] a_in_i [CHAINLEN]; + localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE; + uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN]; uwire [23:0] b_in_i [PE][CHAINLEN]; - uwire [57:0] pcout [PE][CHAINLEN]; + uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator //-------------------- Shift register for opmode select signal --------------------\\ localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) @@ -99,48 +103,48 @@ module mvu_8sx9 #( //-------------------- Buffer for input activations --------------------\\ localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; + for (genvar k=0; k1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3; - for (genvar i=0; i1 ? TOTAL_PREGS-1 : 0; - localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? 
SIMD - 3*i : 3; - - if (EXTERNAL_PREGS > 0) begin : genExternalPregAct - logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0}; - always_ff @(posedge clk) begin - if (rst) A <= '{default: 0}; - else if(en) begin - A[EXTERNAL_PREGS-1] <= -// synthesis translate_off - zero ? '1 : -// synthesis translate_on - a[3*i +: LANES_OCCUPIED]; - if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; + if (EXTERNAL_PREGS > 0) begin : genExternalPregAct + logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0}; + always_ff @(posedge clk) begin + if (rst) A <= '{default: 0}; + else if(en) begin + A[EXTERNAL_PREGS-1] <= + // synthesis translate_off + zero ? '1 : + // synthesis translate_on + a[SIMD*k + 3*i +: LANES_OCCUPIED]; + if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; + end end - end - for (genvar j=0; j Date: Thu, 21 Sep 2023 16:39:52 +0100 Subject: [PATCH 064/112] [mvu vvu 8sx9]: renamed compute core for consistency --- finn-rtllib/mvu/mvu_vvu_8sx9.sv | 427 ++++++++++++++++++++++++++++++++ 1 file changed, 427 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_vvu_8sx9.sv diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9.sv b/finn-rtllib/mvu/mvu_vvu_8sx9.sv new file mode 100644 index 0000000000..52a93739d6 --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_8sx9.sv @@ -0,0 +1,427 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP58. + *****************************************************************************/ + +module mvu_vvu_8sx9 #( + parameter IS_MVU, + int unsigned PE, + int unsigned SIMD, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) + bit FORCE_BEHAVIORAL = 0, + + int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 
1 : PE) * SIMD + ) + ( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights + input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations + + // Ouput + output logic vld, + output logic [PE-1:0][ACCU_WIDTH-1:0] p + ); + // for verilator always use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; + +//-------------------- Declare global signals --------------------\\ + localparam int unsigned CHAINLEN = (SIMD+2)/3; + localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length + localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE; + uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN]; + uwire [23:0] b_in_i [PE][CHAINLEN]; + uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator + +//-------------------- Shift register for opmode select signal --------------------\\ + localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) + logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). 
Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) + + always_ff @(posedge clk) begin + if(rst) L <= '{default: 0}; + else if(en) begin + L[1+MAX_PIPELINE_STAGES] <= last; + L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES]; + end + end + assign vld = L[0]; + +//-------------------- Shift register for ZERO flag --------------------\\ + logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) + + if (MAX_PIPELINE_STAGES > 1) begin : genZreg + always_ff @(posedge clk) begin + if (rst) Z <= '{default: 0}; + else if(en) begin + Z[0] <= zero; + if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3]; + end + end + end; + +//-------------------- Buffer for input activations --------------------\\ + localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; + for (genvar k=0; k1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3; + + if (EXTERNAL_PREGS > 0) begin : genExternalPregAct + logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0}; + always_ff @(posedge clk) begin + if (rst) A <= '{default: 0}; + else if(en) begin + A[EXTERNAL_PREGS-1] <= + // synthesis translate_off + zero ? '1 : + // synthesis translate_on + a[SIMD*k + 3*i +: LANES_OCCUPIED]; + if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; + end + end + for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3; + + if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight + logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0}; + always_ff @(posedge clk) begin + if (rst) B <= '{default: 0}; + else if (en) begin + B[i][EXTERNAL_PREGS-1] <= +// synthesis translate_off + zero ? 
'1 : +// synthesis translate_on + w[i][3*j +: LANES_OCCUPIED]; + if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; + end + end + for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin + assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] }; + end : genBin + for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero + assign b_in_i[i][j][8*k +: 8] = 8'b0; + end : genBinZero + end : genExternalPregWeight + else begin : genInpDSPWeight + for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin + assign b_in_i[i][j][8*k +: 8] = +// synthesis translate_off + zero ? '1 : +// synthesis translate_on + PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; + end : genBin + for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero + assign b_in_i[i][j][8*k +: 8] = 8'b0; + end : genBinZero + end : genInpDSPWeight + end : genWeightSIMD + end : genWeightPE + +//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\ + for (genvar i=0; i0 ? 2 : 1; // 1 : 0 + localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1; + localparam bit FIRST = j == 0; + localparam bit LAST = j == CHAINLEN-1; + uwire [57:0] pp; + + if (LAST) begin : genPOUT + assign p[i] = pp[ACCU_WIDTH-1:0]; + end + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. + if(BEHAVIORAL) begin : genBehav + // Stage #1: Input A/B + logic signed [33:0] Areg [INTERNAL_PREGS]; + always_ff @(posedge clk) begin + if (rst) Areg <= '{ default : 0}; + else if (en) begin + Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 
0 : CHAINLEN*i) + j] }; + if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0]; + end + end + logic signed [23:0] Breg [INTERNAL_PREGS]; + always_ff @(posedge clk) begin + if (rst) Breg <= '{ default : 0}; + else if (en) begin + Breg[0] <= b_in_i[i][j]; + if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0]; + end + end + + // Stage #2: Multiply-Accumulate + logic signed [57:0] Mreg; + logic InmodeZero = 0; + always_ff @(posedge clk) begin + if (rst) InmodeZero <= 0; + else if (en) InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero ); + end + always_ff @(posedge clk) begin + if (rst) Mreg <= 0; + else if (en) begin + automatic logic signed [57:0] m = 0; + for (int k = 0; k < 3; k++) begin + m = m + (InmodeZero ? 0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8])); + end + Mreg <= m; + end + end + + // Stage #3: Accumulate + logic signed [57:0] Preg; + logic Opmode = 0; + if (FIRST && !LAST) begin : genFirst + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg; + end + end + else assign Preg = Mreg; + end + else if (FIRST && LAST) begin : genSingle + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg; + end + end + else if (!FIRST && LAST) begin : genLast + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 
0 : Preg) + Mreg + pcout[i][j-1]; + end + end + else begin : genMid + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg + pcout[i][j-1]; + end + end + else assign Preg = Mreg + pcout[i][j-1]; + end + assign pp = Preg; + assign pcout[i][j] = Preg; + end : genBehav +`ifndef VERILATOR + else begin: genDSP + DSP58 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("A"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for + // legacy mode. + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND(58'h000000000000000), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
+ .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK + .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE + .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE + .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 + FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN + 2'b01, // Y : M + 2'b01 // X: M + }), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA + .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC + .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM + .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(0), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) + .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between 
B/BCIN and BCOUT (0-2) + .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(1), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(PREG), // Number of pipeline stages for P (0-1) + .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). + ) + DSP58_inst ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(pcout[i][j]), // 58-bit output: Cascade output + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN(FIRST ? 'x : pcout[i][j-1]), // 58-bit input: P cascade + // Control inputs: Control Inputs/Status Bits + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .CLK(clk), // 1-bit input: Clock + .INMODE({ + INTERNAL_PREGS==2 ? 1'b0 : 1'b1, + 2'b00, + TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, + INTERNAL_PREGS==2 ? 
1'b0 : 1'b1 + }), // 5-bit input: INMODE control + .NEGATE('0), // 3-bit input: Negates the input of the multiplier + .OPMODE({ + LAST ? {1'b0, L[1]} : 2'b00, + 7'b000_0000 + }), // 9-bit input: Operation mode + // Data inputs: Data Ports + .A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }), // 34-bit input: A data + .B(b_in_i[i][j]), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D('x), // 27-bit input: D data + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. + .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG + .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD('0), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(PREG && en), // 1-bit input: Clock enable for PREG + .RSTA(rst), // 1-bit input: Reset for AREG + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTB(rst), // 1-bit input: Reset for BREG + .RSTC('0), // 1-bit input: Reset for CREG + .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTD('0), // 1-bit input: Reset for DREG and ADREG + .RSTINMODE(rst), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(PREG && rst) // 1-bit input: Reset for PREG + ); + end : genDSP +`endif + end : genDSPChain + end : 
genDSPPE + +endmodule : mvu_vvu_8sx9 From adb58694be36bd0fa2e8558f760d1642f14a2a38 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:58:20 +0100 Subject: [PATCH 065/112] [axi wrapper]: changed parameter to localparam --- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index 788e49a71b..270fe7351f 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -46,9 +46,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, // Safely deducible parameters - parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - parameter INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, - parameter OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 + localparam WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + localparam INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 
1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, + localparam OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 )( // Global Control (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) From f54d438f78fe4ce78c84fdd7bcbc514048bd2fe0 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:59:32 +0100 Subject: [PATCH 066/112] [axi]: added support for LUT-based VVU --- finn-rtllib/mvu/mvu_vvu_axi.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 07ad32e6c8..ff677fc244 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -195,8 +195,8 @@ module mvu_vvu_axi #( .vld(ovld), .p(odat) ); "mvu_vvu_lut": - mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( + mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( .clk, .rst, .en, .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) From a4e2ac7146afeab4271344785f638c88cf78da73 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 17:00:07 +0100 Subject: [PATCH 067/112] [mvu vvu 8sx9]: minor change to list of generics --- finn-rtllib/mvu/mvu_vvu_8sx9.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9.sv b/finn-rtllib/mvu/mvu_vvu_8sx9.sv index 52a93739d6..2aa9d71b6c 100644 --- a/finn-rtllib/mvu/mvu_vvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_vvu_8sx9.sv @@ -32,7 +32,7 @@ *****************************************************************************/ module mvu_vvu_8sx9 #( - parameter IS_MVU, + bit IS_MVU, int unsigned PE, int unsigned 
SIMD, int unsigned ACTIVATION_WIDTH, @@ -42,7 +42,7 @@ module mvu_vvu_8sx9 #( int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) bit FORCE_BEHAVIORAL = 0, - int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD + localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD ) ( // Global Control From 40ad0b46c03b10b47ec4d72dd04a4ad96149fa89 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 17:00:51 +0100 Subject: [PATCH 068/112] [mvu lut]: added support for VVU --- finn-rtllib/mvu/mvu_lut.sv | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/finn-rtllib/mvu/mvu_lut.sv b/finn-rtllib/mvu/mvu_lut.sv index b100a589e8..c100910d75 100644 --- a/finn-rtllib/mvu/mvu_lut.sv +++ b/finn-rtllib/mvu/mvu_lut.sv @@ -1,13 +1,15 @@ -module mvu_lut #( - int unsigned PE, - int unsigned SIMD, +module mvu_vvu_lut #( + bit IS_MVU, + int unsigned PE, + int unsigned SIMD, int unsigned ACCU_WIDTH, int unsigned ACTIVATION_WIDTH, int unsigned WEIGHT_WIDTH, bit SIGNED_ACTIVATIONS, bit M_REG = 1, - localparam unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH + localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH, + localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD )( // Global Control input logic clk, @@ -17,8 +19,8 @@ module mvu_lut #( // Input input logic last, input logic zero, // ignore current inputs and force this partial product to zero - input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights - input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // (un)signed activations + input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights + input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // (un)signed activations // Ouput output logic vld, @@ -63,16 +65,16 @@ module mvu_lut #( always_ff @(posedge clk) begin if(rst) M[j] = '{ default : 0 }; else if (en) M[j] = zero ? 0 : - SIGNED_ACTIVATIONS ? 
$signed(a[j]) * $signed(w[i][j]) : - $signed({1'b0, a[j]}) * $signed(w[i][j]); - // (SIGNED_ACTIVATIONS ? $signed(a[j]) : a[j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication + SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) : + $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); + // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication end assign m1[j] = M[j]; end : genMreg else begin : genNoMreg assign m1[j] = zero ? 0 : - SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) : - $signed({1'b0, a[j]}) * $signed(w[i][j]); + SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) : + $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); end : genNoMreg end : genSIMD @@ -99,4 +101,4 @@ module mvu_lut #( assign p[i] = P2[i]; end : genPE -endmodule : mvu_lut +endmodule : mvu_vvu_lut From 30fcb5b734f86d0032549a4efe29d96b13ee5451 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 17:01:10 +0100 Subject: [PATCH 069/112] [mvu vvu lut]: renamed file for consistency --- finn-rtllib/mvu/mvu_vvu_lut.sv | 104 +++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_vvu_lut.sv diff --git a/finn-rtllib/mvu/mvu_vvu_lut.sv b/finn-rtllib/mvu/mvu_vvu_lut.sv new file mode 100644 index 0000000000..c100910d75 --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_lut.sv @@ -0,0 +1,104 @@ +module mvu_vvu_lut #( + bit IS_MVU, + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + bit SIGNED_ACTIVATIONS, + bit M_REG = 1, + + localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH, + localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 
1 : PE) * SIMD +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights + input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // (un)signed activations + + // Ouput + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + uwire last_i; + generate if (M_REG) begin + logic [0:1] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if (en) L <= {last, L[0]}; + end + assign last_i = L[1]; + end + else begin + logic L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if (en) L <= last; + end + assign last_i = L; + end + endgenerate + + // For each PE generate + for (genvar i = 0; i < PE; i++) begin : genPE + // Stage #1: SIMD multipliers in parallel + uwire [MULT_WIDTH-1 : 0] m1 [SIMD]; + for (genvar j = 0; j < SIMD; j++) begin : genSIMD + if (M_REG) begin : genMreg + logic [MULT_WIDTH-1 : 0] M [SIMD]; + always_ff @(posedge clk) begin + if(rst) M[j] = '{ default : 0 }; + else if (en) M[j] = zero ? 0 : + SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) : + $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); + // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication + end + assign m1[j] = M[j]; + end : genMreg + else begin : genNoMreg + assign m1[j] = zero ? 0 : + SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 
0 : SIMD*i) + j]) * $signed(w[i][j]) : + $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); + end : genNoMreg + end : genSIMD + + // Stage #2: Adder tree to reduce SIMD products + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 }; + localparam int unsigned ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1)); + uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = $signed(m1[s]); + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1)); + uwire signed [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + // Stage #3: Buffer output + logic [ACCU_WIDTH-1:0] P2 [PE]; + always_ff @(posedge clk) begin + if(rst) P2[i] = '{ default : 0}; + else if (en) P2[i] = (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]); + end + + assign vld = last_i; + assign p[i] = P2[i]; + end : genPE + +endmodule : mvu_vvu_lut From cb434386fa8bf6f63964dd889c8025c3e9616a6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Thu, 21 Sep 2023 15:58:34 +0100 Subject: [PATCH 070/112] Revert to proper address truncation without generation bit. 
--- finn-rtllib/mvu/replay_buffer.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv index d4342f705c..3e2766f63d 100644 --- a/finn-rtllib/mvu/replay_buffer.sv +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -144,8 +144,8 @@ module replay_buffer #( uwire wr = irdy && ivld; uwire rd = !OVld || ordy; always_ff @(posedge clk) begin - if(wr) Mem[WP[AWIDTH:0]] <= idat; - if(rd) ODat <= Mem[RP[AWIDTH:0]]; + if(wr) Mem[WP[AWIDTH-1:0]] <= idat; + if(rd) ODat <= Mem[RP[AWIDTH-1:0]]; end uwire vld = (RP != WP); From b4b69f3fa7caae4be9357abf596aff4a66561228 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 17:04:05 +0100 Subject: [PATCH 071/112] remove deletd/renamed files --- finn-rtllib/mvu/mvu_8sx9.sv | 427 ------------------------- finn-rtllib/mvu/mvu_8sx9_axi.sv | 179 ----------- finn-rtllib/mvu/mvu_8sx9_axi_tb.sv | 208 ------------ finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 93 ------ finn-rtllib/mvu/mvu_8sx9_tb.sv | 165 ---------- finn-rtllib/mvu/mvu_axi.sv | 248 -------------- finn-rtllib/mvu/mvu_axi_wrapper.v | 92 ------ finn-rtllib/mvu/mvu_lut.sv | 104 ------ finn-rtllib/mvu/tb/mvu_axi_tb.sv | 215 ------------- 9 files changed, 1731 deletions(-) delete mode 100644 finn-rtllib/mvu/mvu_8sx9.sv delete mode 100644 finn-rtllib/mvu/mvu_8sx9_axi.sv delete mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv delete mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v delete mode 100644 finn-rtllib/mvu/mvu_8sx9_tb.sv delete mode 100644 finn-rtllib/mvu/mvu_axi.sv delete mode 100644 finn-rtllib/mvu/mvu_axi_wrapper.v delete mode 100644 finn-rtllib/mvu/mvu_lut.sv delete mode 100644 finn-rtllib/mvu/tb/mvu_axi_tb.sv diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv deleted file mode 100644 index 52a93739d6..0000000000 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ /dev/null @@ -1,427 +0,0 @@ -/****************************************************************************** - * 
Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP58. 
- *****************************************************************************/ - -module mvu_vvu_8sx9 #( - parameter IS_MVU, - int unsigned PE, - int unsigned SIMD, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - int unsigned ACCU_WIDTH, - bit SIGNED_ACTIVATIONS = 0, - int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) - bit FORCE_BEHAVIORAL = 0, - - int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD - ) - ( - // Global Control - input logic clk, - input logic rst, - input logic en, - - // Input - input logic last, - input logic zero, // ignore current inputs and force this partial product to zero - input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights - input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations - - // Ouput - output logic vld, - output logic [PE-1:0][ACCU_WIDTH-1:0] p - ); - // for verilator always use behavioral code - localparam bit BEHAVIORAL = -`ifdef VERILATOR - 1 || -`endif - FORCE_BEHAVIORAL; - -//-------------------- Declare global signals --------------------\\ - localparam int unsigned CHAINLEN = (SIMD+2)/3; - localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length - localparam int unsigned PE_ACTIVATION = IS_MVU ? 
1 : PE; - uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN]; - uwire [23:0] b_in_i [PE][CHAINLEN]; - uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator - -//-------------------- Shift register for opmode select signal --------------------\\ - localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) - logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) - - always_ff @(posedge clk) begin - if(rst) L <= '{default: 0}; - else if(en) begin - L[1+MAX_PIPELINE_STAGES] <= last; - L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES]; - end - end - assign vld = L[0]; - -//-------------------- Shift register for ZERO flag --------------------\\ - logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) - - if (MAX_PIPELINE_STAGES > 1) begin : genZreg - always_ff @(posedge clk) begin - if (rst) Z <= '{default: 0}; - else if(en) begin - Z[0] <= zero; - if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3]; - end - end - end; - -//-------------------- Buffer for input activations --------------------\\ - localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; - for (genvar k=0; k1 ? TOTAL_PREGS-1 : 0; - localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? 
SIMD - 3*i : 3; - - if (EXTERNAL_PREGS > 0) begin : genExternalPregAct - logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0}; - always_ff @(posedge clk) begin - if (rst) A <= '{default: 0}; - else if(en) begin - A[EXTERNAL_PREGS-1] <= - // synthesis translate_off - zero ? '1 : - // synthesis translate_on - a[SIMD*k + 3*i +: LANES_OCCUPIED]; - if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; - end - end - for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0; - localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3; - - if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight - logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0}; - always_ff @(posedge clk) begin - if (rst) B <= '{default: 0}; - else if (en) begin - B[i][EXTERNAL_PREGS-1] <= -// synthesis translate_off - zero ? '1 : -// synthesis translate_on - w[i][3*j +: LANES_OCCUPIED]; - if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; - end - end - for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin - assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] }; - end : genBin - for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero - assign b_in_i[i][j][8*k +: 8] = 8'b0; - end : genBinZero - end : genExternalPregWeight - else begin : genInpDSPWeight - for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin - assign b_in_i[i][j][8*k +: 8] = -// synthesis translate_off - zero ? '1 : -// synthesis translate_on - PAD_BITS_WEIGHT == 0 ? 
w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; - end : genBin - for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero - assign b_in_i[i][j][8*k +: 8] = 8'b0; - end : genBinZero - end : genInpDSPWeight - end : genWeightSIMD - end : genWeightPE - -//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\ - for (genvar i=0; i0 ? 2 : 1; // 1 : 0 - localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1; - localparam bit FIRST = j == 0; - localparam bit LAST = j == CHAINLEN-1; - uwire [57:0] pp; - - if (LAST) begin : genPOUT - assign p[i] = pp[ACCU_WIDTH-1:0]; - end - - // Note: Since the product B * AD is computed, - // rst can be only applied to AD and zero only to B - // with the same effect as zeroing both. - if(BEHAVIORAL) begin : genBehav - // Stage #1: Input A/B - logic signed [33:0] Areg [INTERNAL_PREGS]; - always_ff @(posedge clk) begin - if (rst) Areg <= '{ default : 0}; - else if (en) begin - Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }; - if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0]; - end - end - logic signed [23:0] Breg [INTERNAL_PREGS]; - always_ff @(posedge clk) begin - if (rst) Breg <= '{ default : 0}; - else if (en) begin - Breg[0] <= b_in_i[i][j]; - if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0]; - end - end - - // Stage #2: Multiply-Accumulate - logic signed [57:0] Mreg; - logic InmodeZero = 0; - always_ff @(posedge clk) begin - if (rst) InmodeZero <= 0; - else if (en) InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero ); - end - always_ff @(posedge clk) begin - if (rst) Mreg <= 0; - else if (en) begin - automatic logic signed [57:0] m = 0; - for (int k = 0; k < 3; k++) begin - m = m + (InmodeZero ? 
0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8])); - end - Mreg <= m; - end - end - - // Stage #3: Accumulate - logic signed [57:0] Preg; - logic Opmode = 0; - if (FIRST && !LAST) begin : genFirst - if (PREG) begin : genPregBehav - always_ff @(posedge clk) begin - if (rst) Preg <= 0; - else if (en) Preg <= Mreg; - end - end - else assign Preg = Mreg; - end - else if (FIRST && LAST) begin : genSingle - always_ff @(posedge clk) begin - if (rst) Opmode <= 0; - else if (en) Opmode <= L[1]; - end - always_ff @(posedge clk) begin - if (rst) Preg <= 0; - else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg; - end - end - else if (!FIRST && LAST) begin : genLast - always_ff @(posedge clk) begin - if (rst) Opmode <= 0; - else if (en) Opmode <= L[1]; - end - always_ff @(posedge clk) begin - if (rst) Preg <= 0; - else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1]; - end - end - else begin : genMid - if (PREG) begin : genPregBehav - always_ff @(posedge clk) begin - if (rst) Preg <= 0; - else if (en) Preg <= Mreg + pcout[i][j-1]; - end - end - else assign Preg = Mreg + pcout[i][j-1]; - end - assign pp = Preg; - assign pcout[i][j] = Preg; - end : genBehav -`ifndef VERILATOR - else begin: genDSP - DSP58 #( - // Feature Control Attributes: Data Path Selection - .AMULTSEL("A"), // Selects A input to multiplier (A, AD) - .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) - .BMULTSEL("B"), // Selects B input to multiplier (AD, B) - .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) - .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for - // legacy mode. 
- .PREADDINSEL("A"), // Selects input to pre-adder (A, B) - .RND(58'h000000000000000), // Rounding Constant - .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) - .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) - .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) - .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) - // Pattern Detector Attributes: Pattern Detection Configuration - .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH - .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). - .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) - .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect - .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 - .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) - .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) - // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins - .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE - .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN - .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK - .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE - .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE - .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 - FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 
0 : PCIN - 2'b01, // Y : M - 2'b01 // X: M - }), // Optional inversion for OPMODE - .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN - .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE - .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA - .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB - .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A - .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC - .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD - .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE - .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM - .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP - // Register Control Attributes: Pipeline Register Configuration - .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) - .ADREG(0), // Pipeline stages for pre-adder (0-1) - .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) - .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) - .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between B/BCIN and BCOUT (0-2) - .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) - .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) - .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) - .CREG(0), // Pipeline stages for C (0-1) - .DREG(0), // Pipeline stages for D (0-1) - .INMODEREG(1), // Pipeline stages for INMODE (0-1) - .MREG(1), // Multiplier pipeline stages (0-1) - .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) - .PREG(PREG), // Number of pipeline stages for P (0-1) - .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). 
- ) - DSP58_inst ( - // Cascade outputs: Cascade Ports - .ACOUT(), // 34-bit output: A port cascade - .BCOUT(), // 24-bit output: B cascade - .CARRYCASCOUT(), // 1-bit output: Cascade carry - .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade - .PCOUT(pcout[i][j]), // 58-bit output: Cascade output - // Control outputs: Control Inputs/Status Bits - .OVERFLOW(), // 1-bit output: Overflow in add/acc - .PATTERNBDETECT(), // 1-bit output: Pattern bar detect - .PATTERNDETECT(), // 1-bit output: Pattern detect - .UNDERFLOW(), // 1-bit output: Underflow in add/acc - // Data outputs: Data Ports - .CARRYOUT(), // 4-bit output: Carry - .P(pp), // 58-bit output: Primary data - .XOROUT(), // 8-bit output: XOR data - // Cascade inputs: Cascade Ports - .ACIN('x), // 34-bit input: A cascade data - .BCIN('x), // 24-bit input: B cascade - .CARRYCASCIN('x), // 1-bit input: Cascade carry - .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade - .PCIN(FIRST ? 'x : pcout[i][j-1]), // 58-bit input: P cascade - // Control inputs: Control Inputs/Status Bits - .ALUMODE(4'h0), // 4-bit input: ALU control - .CARRYINSEL('0), // 3-bit input: Carry select - .CLK(clk), // 1-bit input: Clock - .INMODE({ - INTERNAL_PREGS==2 ? 1'b0 : 1'b1, - 2'b00, - TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, - INTERNAL_PREGS==2 ? 1'b0 : 1'b1 - }), // 5-bit input: INMODE control - .NEGATE('0), // 3-bit input: Negates the input of the multiplier - .OPMODE({ - LAST ? {1'b0, L[1]} : 2'b00, - 7'b000_0000 - }), // 9-bit input: Operation mode - // Data inputs: Data Ports - .A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }), // 34-bit input: A data - .B(b_in_i[i][j]), // 24-bit input: B data - .C('x), // 58-bit input: C data - .CARRYIN('0), // 1-bit input: Carry-in - .D('x), // 27-bit input: D data - // Reset/Clock Enable inputs: Reset/Clock Enable Inputs - .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. - .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG - .CEA2(INTERNAL_PREGS==2 ? 
en : '0), // 1-bit input: Clock enable for 2nd stage AREG - .CEAD('0), // 1-bit input: Clock enable for ADREG - .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE - .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG - .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG - .CEC('0), // 1-bit input: Clock enable for CREG - .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG - .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG - .CED('0), // 1-bit input: Clock enable for DREG - .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG - .CEM(en), // 1-bit input: Clock enable for MREG - .CEP(PREG && en), // 1-bit input: Clock enable for PREG - .RSTA(rst), // 1-bit input: Reset for AREG - .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG - .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG - .RSTB(rst), // 1-bit input: Reset for BREG - .RSTC('0), // 1-bit input: Reset for CREG - .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG - .RSTD('0), // 1-bit input: Reset for DREG and ADREG - .RSTINMODE(rst), // 1-bit input: Reset for INMODE register - .RSTM(rst), // 1-bit input: Reset for MREG - .RSTP(PREG && rst) // 1-bit input: Reset for PREG - ); - end : genDSP -`endif - end : genDSPChain - end : genDSPPE - -endmodule : mvu_vvu_8sx9 diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv deleted file mode 100644 index 5f215927d8..0000000000 --- a/finn-rtllib/mvu/mvu_8sx9_axi.sv +++ /dev/null @@ -1,179 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Matrix Vector Unit (MVU) AXI-lite interface wrapper. 
- *****************************************************************************/ - -module mvu_8sx9_axi #( - int unsigned MW, - int unsigned MH, - int unsigned PE, - int unsigned SIMD, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - int unsigned ACCU_WIDTH, - bit SIGNED_ACTIVATIONS = 0, - int unsigned SEGMENTLEN = 0, - parameter RAM_STYLE = "auto", - - localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, - localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH, - localparam int unsigned SF = MW/SIMD, - localparam int unsigned NF = MH/PE, - localparam int unsigned OUTPUT_LANES = PE, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 -) -( - // Global Control - input logic ap_clk, - input logic ap_rst_n, - - // Weight Stream - input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - input logic s_axis_weights_tvalid, - output logic s_axis_weights_tready, - - // Input Stream - input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input logic s_axis_input_tvalid, - output logic s_axis_input_tready, - - // Output Stream - output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, - output logic m_axis_output_tvalid, - input logic m_axis_output_tready -); - -//-------------------- Parameter sanity checks --------------------\\ - initial begin - if (MW % SIMD != 0) begin - $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); - $finish; - end - if (MH % PE != 0) begin - $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); - $finish; - end - if (ACTIVATION_WIDTH > 9) begin - $error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH); - $finish; - end - if (WEIGHT_WIDTH > 8) begin - $error("Weight width of %0d-bits exceeds maximum of 8-bits", 
WEIGHT_WIDTH); - $finish; - end - if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin - $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH); - $finish; - end - if (SEGMENTLEN == 0) begin - $warning("Segment length of %0d defaults to chain length", SEGMENTLEN); - end - if (SEGMENTLEN > (SIMD+2)/3) begin - $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); - $finish; - end - end - - uwire clk = ap_clk; - uwire rst = !ap_rst_n; - - typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t; - - uwire mvauin_t amvau; - uwire alast; - uwire afin; - uwire avld; - uwire ardy; - - replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay ( - .clk, .rst, - .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), - .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) - ); - -//-------------------- Input control --------------------\\ - uwire en; - uwire istb = avld && s_axis_weights_tvalid; - assign ardy = en && s_axis_weights_tvalid; - assign s_axis_weights_tready = en && avld; - -//-------------------- Core MVU --------------------\\ - uwire ovld; - uwire [PE-1:0][57:0] odat; - typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core ( - .clk, .rst, .en, - .last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), - .vld(ovld), .p(odat) - ); - -//-------------------- Output register slice --------------------\\ - struct { - logic vld; - logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } A = '{ vld: 0, default: 'x}; - - assign en = !A.vld || !ovld; - - uwire b_load; - always_ff @(posedge clk) begin - if(rst) A <= '{ vld: 0, default: 'x }; - else if(!A.vld || b_load) begin - A.vld <= ovld && en; - for(int unsigned i = 
0; i < PE; i++) begin - // CR-1148862: - // A.dat[i] <= odat[i]; - automatic logic [57:0] v = odat[i]; - A.dat[i] <= v[ACCU_WIDTH-1:0]; - end - end - end - - struct { - logic vld; - logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } B = '{ vld: 0, default: 'x}; - - assign b_load = !B.vld || m_axis_output_tready; - always_ff @(posedge clk) begin - if(rst) B <= '{ default: 'x }; - else begin - if(b_load) B <= '{ vld: A.vld, dat: A.dat}; - end - end - - assign m_axis_output_tvalid = B.vld; - assign m_axis_output_tdata = B.dat; - -endmodule \ No newline at end of file diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv deleted file mode 100644 index 70ffa096ef..0000000000 --- a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv +++ /dev/null @@ -1,208 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Testbench for MVU AXI-lite interface wrapper. - *****************************************************************************/ - -module mvu_8sx9_axi_tb(); - -//-------------------- Simulation parameters --------------------\\ - // Matrix & parallelism config - localparam int unsigned MW = 600; - localparam int unsigned MH = 256; - localparam int unsigned SIMD = 60; - localparam int unsigned PE = 16; - localparam int unsigned SEGMENTLEN = 4; - // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int unsigned WEIGHT_WIDTH = 4; - localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); - localparam bit SIGNED_ACTIVATIONS = 1; - // Simulation constants - localparam int unsigned NF = MH/PE; - localparam int unsigned SF = MW/SIMD; - localparam int unsigned NUM_OF_DSP = SIMD/3; - localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; - localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; - localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; - localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; - - // Generate clk and reset signal - logic clk = 0; - always #5ns clk = !clk; - - logic ap_rst_n = 0; - initial begin - repeat(16) @(posedge clk); - ap_rst_n <= 1; - end - - uwire 
ap_clk = clk; - - // Generate activations - typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[SF]; - - function activation_vector_t init_ACTIVATIONS; - automatic activation_vector_t res; - std::randomize(res); - return res; - endfunction : init_ACTIVATIONS - - activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); - - struct { - activation_t dat; - logic vld; - logic rdy; - } activations; - - initial begin - activations.vld = 0; - activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain - @(posedge clk iff ap_rst_n); - - for (int i=0; i 1; - @(posedge clk); - end while (!(activations.vld === 1 && activations.rdy === 1)); - end - - activations.vld <= 0; - activations.dat <= 'x; - end - - // Generate weights - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; - - function weight_matrix_t init_WEIGHTS; - automatic weight_matrix_t res; - std::randomize(res); - return res; - endfunction : init_WEIGHTS; - - weight_matrix_t WEIGHTS = init_WEIGHTS(); - - struct { - weight_t dat; - logic vld; - logic rdy; - } weights; - - initial begin - weights.vld = 0; - weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain - @(posedge clk iff ap_rst_n); - - weights.vld <= 1; - for (int i=0; i= 1; - @(posedge clk iff ap_rst_n); - end while (!(outputs.rdy === 1 && outputs.vld === 1)); - - // Compare produced outputs against golden outputs - foreach(outputs.dat[i]) begin - assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin - $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - $stop; - end - end - - NF_CNT += 1; - end - - $finish; - end - - // Instantiate DUT - mvu_8sx9_axi #( - .MW(MW), - .MH(MH), - .PE(PE), - .SIMD(SIMD), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN) - ) - dut ( - .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), - .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), - .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), - .m_axis_output_tready(outputs.rdy) - ); - -endmodule diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v deleted file mode 100644 index e15f77fbae..0000000000 --- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v +++ /dev/null @@ -1,93 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Verilog AXI-lite wrapper for MVU. - *****************************************************************************/ - -module $MODULE_NAME_AXI_WRAPPER$ #( - parameter MW = $MW$, - parameter MH = $MH$, - parameter PE = $PE$, - parameter SIMD = $SIMD$, - parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, - parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, - parameter ACCU_WIDTH = $ACCU_WIDTH$, - parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, - parameter SEGMENTLEN = $SEGMENTLEN$, - parameter RAM_STYLE = "$IBUF_RAM_STYLE$", - - // Safely deducible parameters - parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, - parameter OUTPUT_LANES = PE, - parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 -)( - // Global Control - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output, ASSOCIATED_RESET ap_rst_n" *) - (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) - input ap_clk, - (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) - input ap_rst_n, - - // Weight Stream - input [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - 
input s_axis_weights_tvalid, - output s_axis_weights_tready, - - // Input Stream - input [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input s_axis_input_tvalid, - output s_axis_input_tready, - - // Output Stream - output [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, - output m_axis_output_tvalid, - input m_axis_output_tready -); - -mvu_8sx9_axi #( - .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE) - ) inst ( - .ap_clk(ap_clk), - .ap_rst_n(ap_rst_n), - .s_axis_weights_tdata(s_axis_weights_tdata), - .s_axis_weights_tvalid(s_axis_weights_tvalid), - .s_axis_weights_tready(s_axis_weights_tready), - .s_axis_input_tdata(s_axis_input_tdata), - .s_axis_input_tvalid(s_axis_input_tvalid), - .s_axis_input_tready(s_axis_input_tready), - .m_axis_output_tdata(m_axis_output_tdata), - .m_axis_output_tvalid(m_axis_output_tvalid), - .m_axis_output_tready(m_axis_output_tready) -); - -endmodule : $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/mvu/mvu_8sx9_tb.sv b/finn-rtllib/mvu/mvu_8sx9_tb.sv deleted file mode 100644 index adf6a8f9c2..0000000000 --- a/finn-rtllib/mvu/mvu_8sx9_tb.sv +++ /dev/null @@ -1,165 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Testbench for MVU core compute kernel. 
- *****************************************************************************/ - -module mvu_8sx9_tb(); - -//-------------------- Simulation parameters --------------------\\ - // Matrix & parallelism config - localparam int unsigned MH = 256; - localparam int unsigned PE = 16; - localparam int unsigned MW = 600; - localparam int unsigned SIMD = 60; - localparam int unsigned SEGMENTLEN = 4; - // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int unsigned WEIGHT_WIDTH = 4; - localparam bit SIGNED_ACTIVATIONS = 1; - // Simulation constants - localparam int unsigned NF = MH/PE; - localparam int unsigned SF = MW/SIMD; - localparam int unsigned NUM_OF_DSP = SIMD/3; - - typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[SF]; - - function activation_vector_t init_ACTIVATIONS; - automatic activation_vector_t res; - std::randomize(res); - return res; - endfunction : init_ACTIVATIONS - - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; - - function weight_matrix_t init_WEIGHTS; - automatic weight_matrix_t res; - std::randomize(res); - return res; - endfunction : init_WEIGHTS; - - typedef logic signed [PE-1:0][57:0] output_t; - typedef output_t output_vector_t [NF]; - - function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); - automatic output_vector_t res = '{default: 0}; - for (int j = 0; j 1) && !rst; - end - - // Compare computed output against golden output when vld flag is raised by DUT - always_ff @(posedge clk iff (vld && en)) begin - foreach(p[i]) begin - assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin - $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - $stop; - end - end - NF_CNT += 1; - end - - // Instantiate DUT - mvu_8sx9 #( - .PE(PE), - .SIMD(SIMD), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .SEGMENTLEN(SEGMENTLEN) - ) - dut ( - .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p - ); - -endmodule diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv deleted file mode 100644 index 07ad32e6c8..0000000000 --- a/finn-rtllib/mvu/mvu_axi.sv +++ /dev/null @@ -1,248 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper. - * @details - * The following compute cores are supported: - * - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, - * (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP, - * [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP, - * 'unconstrained' LUT-based MVU and VVU. - * Folding hints: - * - PE scaling should divide MH. - * - SIMD scaling should divide MW. - * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to - * impact critical paths more than PE scaling. PE scaling implies a - * bigger fanout on the input activations. - * - Full unfolding along MH (PE=MH) results in no replay buffer instantiated - *****************************************************************************/ - -module mvu_vvu_axi #( - bit IS_MVU, // string type causes error in Vivado - parameter COMPUTE_CORE, - int unsigned MW, - int unsigned MH, - int unsigned PE, - int unsigned SIMD, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - int unsigned ACCU_WIDTH, - bit SIGNED_ACTIVATIONS = 0, - int unsigned SEGMENTLEN = 0, - bit FORCE_BEHAVIORAL = 0, - bit M_REG_LUT = 1, - - // Safely deducible parameters - localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, - localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, - localparam int unsigned SF = MW/SIMD, - localparam int unsigned NF = MH/PE, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 -) -( - // Global Control - input logic ap_clk, - input logic ap_rst_n, - - // Weight Stream - input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - input logic s_axis_weights_tvalid, - output logic s_axis_weights_tready, - - // Input Stream - input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input logic s_axis_input_tvalid, - output logic s_axis_input_tready, - - // Output Stream - output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, - output logic m_axis_output_tvalid, - input logic m_axis_output_tready -); - -//-------------------- Parameter sanity checks --------------------\\ - initial begin - if (MW % SIMD != 0) begin - $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); - $finish; - end - if (MH % PE != 0) begin - $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); - $finish; - end - if (WEIGHT_WIDTH > 8) begin - $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); - $finish; - end - if (ACTIVATION_WIDTH > 8) begin - if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin - $error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH); - $finish; - end - end - if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin - if (SEGMENTLEN == 0) begin - $warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3); - end - if (SEGMENTLEN > (SIMD+2)/3) begin - $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); - $finish; - end - end - if (!IS_MVU) begin - if 
(COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin - $error("VVU only supported on DSP58 or LUT-based implementation"); - $finish; - end - end - end - - uwire clk = ap_clk; - uwire rst = !ap_rst_n; - - typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t; - - uwire mvauin_t amvau; - uwire alast; - uwire afin; - uwire avld; - uwire ardy; - - replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay ( - .clk, .rst, - .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), - .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) - ); - -//-------------------- Input control --------------------\\ - uwire en; - uwire istb = avld && s_axis_weights_tvalid; - assign ardy = en && s_axis_weights_tvalid; - assign s_axis_weights_tready = en && avld; - -//-------------------- Core MVU/VVU --------------------\\ - uwire ovld; - uwire [PE-1:0][ACCU_WIDTH-1:0] odat; - typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - uwire mvauin_t amvau_i; - - if (IS_MVU) begin : genMVUInput - assign amvau_i = amvau; - end : genMVUInput - else begin : genVVUInput - // The input stream will have the channels interleaved for VVU when PE>1 - // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] - // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: - // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to - // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) - localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH; - for (genvar i=0; i 1) ? 
- amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] - : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; - end : genRewire - end : genVVUInput - - case(COMPUTE_CORE) - "mvu_vvu_8sx9_dsp58": - mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) - ); - "mvu_4sx4u": - mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) - ); - "mvu_8sx8u_dsp48": - mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) - ); - "mvu_vvu_lut": - mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) - ); - default: initial begin - $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); - $finish; - end - endcase - -//-------------------- Output register slice --------------------\\ - struct packed { - logic vld; - logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } A = '{ vld: 0, default: 'x}; - - assign en = !A.vld || !ovld; - - uwire b_load; - always_ff @(posedge clk) begin - 
if(rst) A <= '{ vld: 0, default: 'x }; - else if(!A.vld || b_load) begin - A.vld <= ovld && en; - for(int unsigned i = 0; i < PE; i++) begin - // CR-1148862: - // A.dat[i] <= odat[i]; - automatic logic [ACCU_WIDTH-1:0] v = odat[i]; - A.dat[i] <= v[ACCU_WIDTH-1:0]; - end - end - end - - struct packed { - logic vld; - logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } B = '{ vld: 0, default: 'x}; - - assign b_load = !B.vld || m_axis_output_tready; - always_ff @(posedge clk) begin - if(rst) B <= '{ vld: 0, default: 'x }; - else begin - if(b_load) B <= '{ vld: A.vld, dat: A.dat}; - end - end - - assign m_axis_output_tvalid = B.vld; - assign m_axis_output_tdata = B.dat; - -endmodule : mvu_vvu_axi diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v deleted file mode 100644 index 239c5bbacd..0000000000 --- a/finn-rtllib/mvu/mvu_axi_wrapper.v +++ /dev/null @@ -1,92 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Verilog AXI-lite wrapper for MVU. - *****************************************************************************/ - -module $MODULE_NAME_AXI_WRAPPER$ #( - parameter MW = $MW$, - parameter MH = $MH$, - parameter PE = $PE$, - parameter SIMD = $SIMD$, - parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, - parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, - parameter ACCU_WIDTH = $ACCU_WIDTH$, - parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, - parameter SEGMENTLEN = $SEGMENTLEN$, - parameter MVU_IMPL_STYLE = "$MVU_IMPL_STYLE$", - parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, - - // Safely deducible parameters - parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, - parameter OUTPUT_LANES = PE, - parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 -)( - // Global Control - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) - (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) - input ap_clk, - (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) - input ap_rst_n, - - // Weight Stream - input 
[WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, - input weights_V_TVALID, - output weights_V_TREADY, - // Input Stream - input [INPUT_STREAM_WIDTH_BA-1:0] in0_V_TDATA, - input in0_V_TVALID, - output in0_V_TREADY, - // Output Stream - output [OUTPUT_STREAM_WIDTH_BA-1:0] out_V_TDATA, - output out_V_TVALID, - input out_V_TREADY -); - -mvu_axi #( - .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), .MVU_IMPL_STYLE(MVU_IMPL_STYLE) - ) inst ( - .ap_clk(ap_clk), - .ap_rst_n(ap_rst_n), - .s_axis_weights_tdata(weights_V_TDATA), - .s_axis_weights_tvalid(weights_V_TVALID), - .s_axis_weights_tready(weights_V_TREADY), - .s_axis_input_tdata(in0_V_TDATA), - .s_axis_input_tvalid(in0_V_TVALID), - .s_axis_input_tready(in0_V_TREADY), - .m_axis_output_tdata(out_V_TDATA), - .m_axis_output_tvalid(out_V_TVALID), - .m_axis_output_tready(out_V_TREADY) -); - -endmodule : $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/mvu/mvu_lut.sv b/finn-rtllib/mvu/mvu_lut.sv deleted file mode 100644 index c100910d75..0000000000 --- a/finn-rtllib/mvu/mvu_lut.sv +++ /dev/null @@ -1,104 +0,0 @@ -module mvu_vvu_lut #( - bit IS_MVU, - int unsigned PE, - int unsigned SIMD, - int unsigned ACCU_WIDTH, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - bit SIGNED_ACTIVATIONS, - bit M_REG = 1, - - localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH, - localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 
1 : PE) * SIMD -)( - // Global Control - input logic clk, - input logic rst, - input logic en, - - // Input - input logic last, - input logic zero, // ignore current inputs and force this partial product to zero - input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights - input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // (un)signed activations - - // Ouput - output logic vld, - output logic signed [PE-1:0][ACCU_WIDTH-1:0] p -); - - typedef int unsigned leave_load_t[2*SIMD-1]; - function leave_load_t init_leave_loads(); - automatic leave_load_t res; - for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; - for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; - return res; - endfunction : init_leave_loads - - // Pipeline for last indicator flag - uwire last_i; - generate if (M_REG) begin - logic [0:1] L = '0; - always_ff @(posedge clk) begin - if(rst) L <= '0; - else if (en) L <= {last, L[0]}; - end - assign last_i = L[1]; - end - else begin - logic L = '0; - always_ff @(posedge clk) begin - if(rst) L <= '0; - else if (en) L <= last; - end - assign last_i = L; - end - endgenerate - - // For each PE generate - for (genvar i = 0; i < PE; i++) begin : genPE - // Stage #1: SIMD multipliers in parallel - uwire [MULT_WIDTH-1 : 0] m1 [SIMD]; - for (genvar j = 0; j < SIMD; j++) begin : genSIMD - if (M_REG) begin : genMreg - logic [MULT_WIDTH-1 : 0] M [SIMD]; - always_ff @(posedge clk) begin - if(rst) M[j] = '{ default : 0 }; - else if (en) M[j] = zero ? 0 : - SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) : - $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); - // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication - end - assign m1[j] = M[j]; - end : genMreg - else begin : genNoMreg - assign m1[j] = zero ? 0 : - SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 
0 : SIMD*i) + j]) * $signed(w[i][j]) : - $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); - end : genNoMreg - end : genSIMD - - // Stage #2: Adder tree to reduce SIMD products - localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 }; - localparam int unsigned ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1)); - uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = $signed(m1[s]); - for(genvar n = 0; n < SIMD-1; n++) begin - // Sum truncated to actual maximum bit width at this node - localparam int unsigned NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1)); - uwire signed [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; - assign tree[n] = s; - end - - // Stage #3: Buffer output - logic [ACCU_WIDTH-1:0] P2 [PE]; - always_ff @(posedge clk) begin - if(rst) P2[i] = '{ default : 0}; - else if (en) P2[i] = (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]); - end - - assign vld = last_i; - assign p[i] = P2[i]; - end : genPE - -endmodule : mvu_vvu_lut diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv deleted file mode 100644 index b89b58f55b..0000000000 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ /dev/null @@ -1,215 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Testbench for MVU AXI-lite interface wrapper. 
- *****************************************************************************/ - -module mvu_axi_tb(); - -//-------------------- Simulation parameters --------------------\\ - // Matrix & parallelism config - localparam int unsigned MW = 50; - localparam int unsigned MH = 8; - localparam int unsigned SIMD = 10; - localparam int unsigned PE = 2; - localparam int unsigned SEGMENTLEN = 2; - localparam string MVU_IMPL_STYLE = "mvu_8sx8u_dsp48"; - localparam bit FORCE_BEHAVIORAL = 1; - // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int unsigned WEIGHT_WIDTH = 8; - localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); - localparam bit SIGNED_ACTIVATIONS = 0; - // Simulation constants - localparam int unsigned NF = MH/PE; - localparam int unsigned SF = MW/SIMD; - localparam int unsigned NUM_OF_DSP = SIMD/3; - localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; - localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; - localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; - localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; - - // Generate clk and reset signal - logic clk = 0; - always #5ns clk = !clk; - - logic ap_rst_n = 0; - initial begin - repeat(16) @(posedge clk); - ap_rst_n <= 1; - end - - uwire ap_clk = clk; - - // Generate activations - typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[SF]; - - function activation_vector_t init_ACTIVATIONS; - automatic activation_vector_t res; - std::randomize(res); - return res; - endfunction : init_ACTIVATIONS - - activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); - - struct { - activation_t dat; - logic vld; - logic rdy; - } activations; - - initial begin - activations.vld = 0; - activations.dat = '1; // Since ('X 
AND 0) will result in 'X in simulation, which would be propagated through the DSP chain - @(posedge clk iff ap_rst_n); - - for (int i=0; i= 1; - @(posedge clk); - end while (!(activations.vld === 1 && activations.rdy === 1)); - end - - activations.vld <= 0; - activations.dat <= 'x; - end - - // Generate weights - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; - - function weight_matrix_t init_WEIGHTS; - automatic weight_matrix_t res; - std::randomize(res); - return res; - endfunction : init_WEIGHTS; - - weight_matrix_t WEIGHTS = init_WEIGHTS(); - - struct { - weight_t dat; - logic vld; - logic rdy; - } weights; - - initial begin - weights.vld = 0; - weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain - @(posedge clk iff ap_rst_n); - - weights.vld <= 1; - for (int i=0; i= 1; - @(posedge clk iff ap_rst_n); - end while (!(outputs.rdy === 1 && outputs.vld === 1)); - - // Compare produced outputs against golden outputs - foreach(outputs.dat[i]) begin - assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin - $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - $stop; - end - end - - NF_CNT += 1; - end - - $finish; - end - - // Instantiate DUT - mvu_axi #( - .MW(MW), - .MH(MH), - .PE(PE), - .SIMD(SIMD), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), - .MVU_IMPL_STYLE(MVU_IMPL_STYLE) - ) - dut ( - .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), - .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), - .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), - .m_axis_output_tready(outputs.rdy) - ); - -endmodule : mvu_axi_tb From 14c5fa902820396e3489a244dc4d705fd1ebe532 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 17:12:47 +0100 Subject: [PATCH 072/112] [mvu vvu 8sx9]: renamed for consistency --- finn-rtllib/mvu/{mvu_vvu_8sx9.sv => mvu_vvu_8sx9_dsp58.sv} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename finn-rtllib/mvu/{mvu_vvu_8sx9.sv => mvu_vvu_8sx9_dsp58.sv} (99%) diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv similarity index 99% rename from finn-rtllib/mvu/mvu_vvu_8sx9.sv rename to finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv index 2aa9d71b6c..6ae117e3ab 100644 --- a/finn-rtllib/mvu/mvu_vvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv @@ -31,7 +31,7 @@ * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP58. 
*****************************************************************************/ -module mvu_vvu_8sx9 #( +module mvu_vvu_8sx9_dsp58 #( bit IS_MVU, int unsigned PE, int unsigned SIMD, @@ -424,4 +424,4 @@ module mvu_vvu_8sx9 #( end : genDSPChain end : genDSPPE -endmodule : mvu_vvu_8sx9 +endmodule : mvu_vvu_8sx9_dsp58 From 3a3758826512fd3d5ed0bcdd23358d5fd5b724cd Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 17:13:25 +0100 Subject: [PATCH 073/112] [mvu vvu axi]: changes for renamed module --- finn-rtllib/mvu/mvu_vvu_axi.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index ff677fc244..416480da79 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -174,7 +174,7 @@ module mvu_vvu_axi #( case(COMPUTE_CORE) "mvu_vvu_8sx9_dsp58": - mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, From afe36baa134b947718db34d140c8d6500b91cb2a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 25 Sep 2023 13:44:17 +0100 Subject: [PATCH 074/112] [mvu vvu wrapper]: convert localparam to param --- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index 270fe7351f..9c65dbc06e 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -46,9 +46,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, // Safely deducible parameters - localparam WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - 
localparam INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, - localparam OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 + parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, + parameter OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 )( // Global Control (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) From e4f2f9e0e4f1cb0bae2bf7e439c57356b3670620 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 25 Sep 2023 13:45:48 +0100 Subject: [PATCH 075/112] [mvau-rtl custom-op]: bugfix to instantiate memstreamer, modified renamed files and axi wrapper template fill-out --- .../matrixvectoractivation_rtl.py | 92 ++++++++++--------- 1 file changed, 51 insertions(+), 41 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index 9f8130806b..c7fb855884 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -61,8 +61,7 @@ class MatrixVectorActivation_rtl(HLSCustomOp): - """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch - function.""" + """Class that corresponds to finn-rtl Matrix Vector Unit.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -73,8 +72,7 @@ def get_nodeattr_types(self): "SIMD": ("i", True, 0), "MW": ("i", True, 0), "MH": ("i", True, 0), - "resType": ("s", False, "lut", {"auto", "lut", "dsp"}), - "ActVal": ("i", False, 0), + "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}), # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), "weightDataType": ("s", True, ""), @@ -165,7 +163,6 @@ def verify_node(self): # verify that all necessary attributes exist # TODO collect automatically 
from get_nodeattr_types try: - self.get_nodeattr("code_gen_dir_cppsim") self.get_nodeattr("executable_path") self.get_nodeattr("resType") self.get_nodeattr("MW") @@ -199,7 +196,6 @@ def verify_node(self): return info_messages - # TODO: Add in replay_buffer estimation def uram_estimation(self): P = self.get_nodeattr("PE") Q = self.get_nodeattr("SIMD") @@ -213,7 +209,6 @@ def uram_estimation(self): mstyle = self.get_nodeattr("ram_style") if ( (mmode == "decoupled" and mstyle != "ultra") - or (mmode == "const" and self.calc_wmem() <= 128) or (mmode == "external") ): return 0 @@ -221,7 +216,6 @@ def uram_estimation(self): depth_multiplier = math.ceil(omega / 4096) return width_multiplier * depth_multiplier - # TODO: Add in replay_buffer estimation def bram_estimation(self): """Calculates resource estimation for BRAM based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -243,7 +237,6 @@ def bram_estimation(self): mstyle = self.get_nodeattr("ram_style") if ( (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) - or (mmode == "const" and self.calc_wmem() <= 128) or (mmode == "external") ): return 0 @@ -262,7 +255,6 @@ def bram_estimation(self): else: return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36)) - # TODO: Add in replay_buffer estimation def bram_efficiency_estimation(self): wdt = self.get_weight_datatype() W = wdt.bitwidth() @@ -275,7 +267,6 @@ def bram_efficiency_estimation(self): bram16_est_capacity = bram16_est * 36 * 512 return wbits / bram16_est_capacity - # TODO: Add in replay_buffer estimation def uram_efficiency_estimation(self): """Function for URAM efficiency estimation: actual parameter storage needed divided by the allocated URAM storage (from estimation)""" @@ -290,7 +281,7 @@ def uram_efficiency_estimation(self): uram_est_capacity = uram_est * 72 * 4096 return wbits / uram_est_capacity - # TODO: FIX: worst case estimates since segmentlen is not known at this point? 
+# TODO: fix lut estimations def lut_estimation(self): """Calculates resource estimations for LUTs based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -333,9 +324,13 @@ def lut_estimation(self): return int(c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2) - # TODO: FIX: worst case estimates since segmentlen is not known at this point? +# TODO: fix DSP estimations --> depends on fpga_part def dsp_estimation(self): # multiplication + # mvu_8sx9 (DSP58): ceil(SIMD/3) + # mvu_4sx4u (DSP48/DSP58): ceil(PE/4) + # mvu_8sx8u (DSP48): ceil(PE/2) + # mvu_lut: 0 P = self.get_nodeattr("PE") res_type = self.get_nodeattr("resType") Q = self.get_nodeattr("SIMD") @@ -349,18 +344,24 @@ def dsp_estimation(self): mult_dsp = 0 return int(mult_dsp) - # TODO: FIX: worst case estimates since segmentlen is not known at this point +# TODO: fix exp_cycles estimations --> depends on fpga_part and clk def get_exp_cycles(self): + # mvu_8sx9 (DSP58): + # 2 (replay_buffer) + ceil(chainlen/seglen) + 2 (MREG, PREG) + 2 (output reg slice) + # + MW/SIMD * MH/PE + # mvu_4sx4u (DSP48/DSP58) / mvu_8sx8u (DSP48): + # 3 (IN_REG, MREG, PREG) + 2 (replay_buffer) + 2 (output reg slice) + 1 (adder tree SIMD) + 1 (output lane) + # + MW/SIMD * MH/PE + # mvu_lut: + # 2 (replay_buffer) + 1 OR 2 (no MREG OR MREG) + 2 (output reg slice) + # + MW/SIMD * MH/PE pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") num_inp_vec = self.get_nodeattr("numInputVectors") mh = self.get_nodeattr("MH") mw = self.get_nodeattr("MW") # since mmv != 1 is not supported yet, we set mmv for now to 1 - mmv = 1 - # Actual exp_cycles is probably slightly larger (say 3 cycles - # (DSP A/B, M, P - reg) + additional pipeline buffer cycles. - # Most probably <10) + mmv = 1 exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv return int(exp_cycles) @@ -711,7 +712,7 @@ def execute_node(self, context, graph): else: raise Exception( """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( + has to be set to "rtlsim" """.format( mode ) ) @@ -795,11 +796,12 @@ def code_generation_ipi(self): os.path.join( code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" ), - rtllib_dir + "mvu_axi.sv", + rtllib_dir + "mvu_vvu_axi.sv", rtllib_dir + "replay_buffer.sv", rtllib_dir + "mvu_4sx4u.sv", - rtllib_dir + "mvu_8sx9.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", rtllib_dir + "mvu_8sx8u_dsp48.sv", + rtllib_dir + "mvu_vvu_lut.sv", ] for f in sourcefiles: cmd.append("add_files -norecurse %s" % (f)) @@ -813,7 +815,7 @@ def code_generation_ipi(self): ) # instantiate a streamer and connect it to the HLS IP - strm_vlnv = "amd.com:FINN:memstream:1.0" + strm_vlnv = "amd.com:finn:memstream:1.0" strm_inst = node_name + "_wstrm" cmd.append( "create_bd_cell -type ip -vlnv %s /%s/%s" @@ -890,11 +892,12 @@ def code_generation_ipi(self): os.path.join( code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" ), - rtllib_dir + "mvu_axi.sv", + rtllib_dir + "mvu_vvu_axi.sv", rtllib_dir + "replay_buffer.sv", rtllib_dir + "mvu_4sx4u.sv", - rtllib_dir + "mvu_8sx9.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", rtllib_dir + "mvu_8sx8u_dsp48.sv", + rtllib_dir + "mvu_vvu_lut.sv", ] for f in sourcefiles: cmd.append("add_files -norecurse %s" % (f)) @@ -959,27 +962,32 @@ def derive_characteristic_fxns(self, period): ] super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) - # TODO: characterize max_clk and implement this function in look-up style def _resolve_segment_len(self, clk): - # Insert pipeline registers in the DSP chain to meet target clock frequency - return 4 # default to 4 for now + # Insert pipeline registers in the DSP58 chain to meet target clock frequency + # 0.741 ns seems the worst-case delay through first DSP + # 0.605 ns seems to be (on average) delay for all subsequent DSPs + dsp_chain_len = np.floor((clk - 0.741) / 0.605) + return 
max(1, dsp_chain_len) def _resolve_impl_style(self, fpgapart): # Based on target device and activation/weight-width, choose the - # supported RTL module - act_width = self.get_input_datatype(0).bitwidth() - weight_width = self.get_input_datatype(1).bitwidth() - is_versal = ( - fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] - or fpgapart[0:5] == "xqrvc" - ) - if act_width == 4 and weight_width == 4: - return "mvu_4sx4u" + # supported RTL compute core + if self.get_nodeattr("resType") == "lut": + return "mvu_vvu_lut" else: - if is_versal: - return "mvu_8sx9_dsp58" + act_width = self.get_input_datatype(0).bitwidth() + weight_width = self.get_input_datatype(1).bitwidth() + is_versal = ( + fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] + or fpgapart[0:5] == "xqrvc" + ) + if act_width == 4 and weight_width == 4: + return "mvu_4sx4u" else: - return "mvu_8sx8u_dsp48" + if is_versal: + return "mvu_vvu_8sx9_dsp58" + else: + return "mvu_8sx8u_dsp48" def generate_hdl(self, model, fpgapart, clk): # Generate params as part of IP preparation @@ -1023,9 +1031,11 @@ def generate_hdl(self, model, fpgapart, clk): self.set_nodeattr("ip_path", code_gen_dir) def prepare_codegen_default(self, fpgapart, clk): - template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v" + template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v" code_gen_dict = {} + code_gen_dict["$IS_MVU$"] = [str(1)] + code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)] code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))] code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))] code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] @@ -1039,7 +1049,7 @@ def prepare_codegen_default(self, fpgapart, clk): [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] ) code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] - code_gen_dict["$MVU_IMPL_STYLE$"] = [self._resolve_impl_style(fpgapart)] + 
code_gen_dict["$FORCE_BEHAVIORAL$"] = [str(0)] return template_path, code_gen_dict From b49b79a0a669caad9355e59e1ee877ca59b65d27 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 25 Sep 2023 13:47:50 +0100 Subject: [PATCH 076/112] [specialize to rtl]: fix to changed attribute name and added support for converting HLS-based VVU custom-ops to RTL-based custom-ops --- .../fpgadataflow/specialize_to_rtl_layers.py | 82 ++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py index 47ed5ce863..5061282695 100644 --- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py @@ -26,6 +26,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import numpy as np from qonnx.transformation.base import Transformation from qonnx.custom_op.registry import getCustomOp from qonnx.core.datatype import DataType @@ -60,7 +61,7 @@ def apply(self, model): for n in graph.node: node_ind += 1 if n.op_type == "MatrixVectorActivation": - preferred_in_rtl = getCustomOp(n).get_nodeattr("impl") == "rtl" and getCustomOp(n).get_nodeattr("resType") == "dsp" + preferred_in_rtl = getCustomOp(n).get_nodeattr("preferred_backend") == "rtl" supported_in_rtl = self._is_rtl_variant_compatible(n) if (preferred_in_rtl and supported_in_rtl): mvau_input = n.input[0] @@ -76,6 +77,7 @@ def apply(self, model): pe = getCustomOp(n).get_nodeattr("PE") mem_mode = getCustomOp(n).get_nodeattr("mem_mode") ram_style = getCustomOp(n).get_nodeattr("ram_style") + resType = getCustomOp(n).get_nodeattr("resType") runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights") new_node = helper.make_node( @@ -93,6 +95,7 @@ def apply(self, model): 
outputDataType=outputDataType, numInputVectors=numInputVectors, mem_mode=mem_mode, + resType=resType, name=n.name + "_rtl", ram_style=ram_style, runtime_writeable_weights=runtime_writeable_weights @@ -108,4 +111,81 @@ def apply(self, model): model = model.transform(InferDataTypes()) model = model.transform(GiveUniqueNodeNames()) + return (model, graph_modified) + +class InferRTLVectorVectorActivation(Transformation): + """Convert (HLS-based) VectorVectorActivation layers to specialized RTL layers is supported.""" + + def __init__(self): + super().__init__() + + def _is_rtl_variant_compatible(self, n): + no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 + act_width_in_range = (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8) or (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0) + weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 + folding_supported = (getCustomOp(n).get_nodeattr("Channels") % getCustomOp(n).get_nodeattr("PE") == 0) and (np.prod(getCustomOp(n).get_nodeattr("Kernel")) % getCustomOp(n).get_nodeattr("SIMD") == 0) + + if (no_activation and act_width_in_range and weight_width_in_range and folding_supported): + return True + else: + return False + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "VectorVectorActivation": + preferred_in_rtl = getCustomOp(n).get_nodeattr("preferred_backend") == "rtl" + supported_in_rtl = self._is_rtl_variant_compatible(n) + if (preferred_in_rtl and supported_in_rtl): + vvau_input = n.input[0] + vvau_weight = n.input[1] + vvau_output = n.output[0] + inputDataType = getCustomOp(n).get_nodeattr("inputDataType") + weightDataType = getCustomOp(n).get_nodeattr("weightDataType") + outputDataType = getCustomOp(n).get_nodeattr("outputDataType") + pe = 
getCustomOp(n).get_nodeattr("PE") + simd = getCustomOp(n).get_nodeattr("SIMD") + dim = getCustomOp(n).get_nodeattr("Dim") + channels = getCustomOp(n).get_nodeattr("Channels") + kernel = getCustomOp(n).get_nodeattr("Kernel") + resType = getCustomOp(n).get_nodeattr("resType") + mem_mode = getCustomOp(n).get_nodeattr("mem_mode") + runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights") + ram_style = getCustomOp(n).get_nodeattr("ram_style") + resType = getCustomOp(n).get_nodeattr("resType") + + new_node = helper.make_node( + "VectorVectorActivation_rtl", + [vvau_input, vvau_weight], + [vvau_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + name=n.name + "_rtl", + PE=pe, + SIMD=simd, + Dim=dim, + Channels=channels, + Kernel=kernel, + resType=resType, + inputDataType=inputDataType, + weightDataType=weightDataType, + outputDataType=outputDataType, + mem_mode=mem_mode, + runtime_writeable_weights=runtime_writeable_weights, + ram_style=ram_style + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(n) + graph_modified=True + + if graph_modified: + model = model.transform(MinimizeAccumulatorWidth()) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(GiveUniqueNodeNames()) + return (model, graph_modified) \ No newline at end of file From 9bdba031df228a2afbe99b8ea2fb576b678bba86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Tue, 19 Sep 2023 15:27:28 +0100 Subject: [PATCH 077/112] Adding core for DSP48 backport. 
--- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 416c12c1cc..07c44cf89a 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -4,7 +4,9 @@ module mvu_8sx8u_dsp48 #( int unsigned ACCU_WIDTH, int unsigned ACTIVATION_WIDTH, int unsigned WEIGHT_WIDTH, - bit FORCE_BEHAVIORAL = 0, + + bit SIGNED_ACTIVATIONS = 0, + bit FORCE_BEHAVIORAL = 0, localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH )( @@ -16,8 +18,8 @@ module mvu_8sx8u_dsp48 #( // Input input logic last, input logic zero, // ignore current inputs and force this partial product to zero - input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights - input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // unsigned activations + input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] w, // signed weights + input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS) // Ouput output logic vld, @@ -47,7 +49,7 @@ module mvu_8sx8u_dsp48 #( assign vld = L[5]; // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism - localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets + localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets localparam int unsigned PIPE_COUNT = (PE+1)/2; for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes @@ -61,7 +63,7 @@ module mvu_8sx8u_dsp48 #( for(genvar s = 0; s < SIMD; s++) begin : genSIMD // Input Lane Assembly - uwire [23:0] bb = a[s]; + uwire [23:0] bb = { {(24-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; logic [33:0] aa; logic [26:0] dd; logic [ 1:0] xx; From 2cf1ef70306339b1409ed61d8e18eda243bf56ad Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 25 Sep 2023 
14:48:34 +0100 Subject: [PATCH 078/112] [mvu rtl core]: added support for signed activations for DSP48-based MVUs --- finn-rtllib/mvu/mvu_4sx4u.sv | 3 ++- finn-rtllib/mvu/mvu_vvu_axi.sv | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 88985312c9..706347d700 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -2,6 +2,7 @@ module mvu_4sx4u #( int unsigned PE, int unsigned SIMD, int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, bit FORCE_BEHAVIORAL = 0 )( // Global Control @@ -57,7 +58,7 @@ module mvu_4sx4u #( for(genvar s = 0; s < SIMD; s++) begin : genSIMD // Input Lane Assembly - uwire [23:0] bb = a[s]; + uwire [23:0] bb = { {(20){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] }; logic [33:0] aa; logic [26:0] dd; logic [ 1:0] xx[3:1]; diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 416480da79..da7e00cc55 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -182,14 +182,14 @@ module mvu_vvu_axi #( .vld(ovld), .p(odat) ); "mvu_4sx4u": - mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) ); "mvu_8sx8u_dsp48": mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) From ab8d4a8e075ac9b3ccf78d2a08907d5dcc116fdb Mon Sep 17 00:00:00 2001 
From: mmrahorovic Date: Mon, 25 Sep 2023 16:17:38 +0100 Subject: [PATCH 079/112] [rtl mvu custom-op]: add upper bound to SEGMENTLEN equal to number of DSP58s chained together --- src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index c7fb855884..d0a638475a 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -966,7 +966,9 @@ def _resolve_segment_len(self, clk): # Insert pipeline registers in the DSP58 chain to meet target clock frequency # 0.741 ns seems the worst-case delay through first DSP # 0.605 ns seems to be (on average) delay for all subsequent DSPs - dsp_chain_len = np.floor((clk - 0.741) / 0.605) + critical_path_dsps = np.floor((clk - 0.741) / 0.605) + max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3) + dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len return max(1, dsp_chain_len) def _resolve_impl_style(self, fpgapart): From 5a429fcbe14ca6177082fab472549407f47f97d6 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 13 Oct 2023 23:29:39 +0100 Subject: [PATCH 080/112] [mvu_vvu dsp58]: change weight input to 2D instead of 3D array --- finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv index 6ae117e3ab..53cf71fd5f 100644 --- a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv +++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv @@ -42,7 +42,8 @@ module mvu_vvu_8sx9_dsp58 #( int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) bit FORCE_BEHAVIORAL = 0, - localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 
1 : PE) * SIMD + localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD, + localparam int unsigned WEIGHT_ELEMENTS = PE*SIMD ) ( // Global Control @@ -53,7 +54,7 @@ module mvu_vvu_8sx9_dsp58 #( // Input input logic last, input logic zero, // ignore current inputs and force this partial product to zero - input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights + input logic [WEIGHT_ELEMENTS-1:0][WEIGHT_WIDTH-1:0] w, // weights input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations // Ouput @@ -164,7 +165,8 @@ module mvu_vvu_8sx9_dsp58 #( // synthesis translate_off zero ? '1 : // synthesis translate_on - w[i][3*j +: LANES_OCCUPIED]; + //w[i][3*j +: LANES_OCCUPIED]; + w[SIMD*i+3*j +: LANES_OCCUPIED]; if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; end end @@ -181,7 +183,8 @@ module mvu_vvu_8sx9_dsp58 #( // synthesis translate_off zero ? '1 : // synthesis translate_on - PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; + //PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; + PAD_BITS_WEIGHT == 0 ? 
w[SIMD*i+3*j+k] : { {PAD_BITS_WEIGHT{w[SIMD*i+3*j+k][WEIGHT_WIDTH-1]}}, w[SIMD*i+3*j+k] }; end : genBin for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero assign b_in_i[i][j][8*k +: 8] = 8'b0; From a4a18bb08cef96bb52c02096d54b573b421bcd12 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 13 Oct 2023 23:30:55 +0100 Subject: [PATCH 081/112] [mvu_vvu axi]: re-wire weights appropriately for VVU DSP58 --- finn-rtllib/mvu/mvu_vvu_axi.sv | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index da7e00cc55..f0f75c633a 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -45,7 +45,7 @@ *****************************************************************************/ module mvu_vvu_axi #( - bit IS_MVU, // string type causes error in Vivado + bit IS_MVU, parameter COMPUTE_CORE, int unsigned MW, int unsigned MH, @@ -64,8 +64,8 @@ module mvu_vvu_axi #( localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, - localparam int unsigned SF = MW/SIMD, - localparam int unsigned NF = MH/PE, + localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE), + localparam int unsigned NF = IS_MVU ? 
MH/PE : 1, localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 ) ( @@ -91,11 +91,11 @@ module mvu_vvu_axi #( //-------------------- Parameter sanity checks --------------------\\ initial begin - if (MW % SIMD != 0) begin + if ((MW % SIMD != 0 && IS_MVU) || (MW % (SIMD*PE) != 0 && !IS_MVU)) begin $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); $finish; end - if (MH % PE != 0) begin + if (MH % PE != 0 && IS_MVU) begin $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); $finish; end @@ -137,7 +137,7 @@ module mvu_vvu_axi #( uwire avld; uwire ardy; - replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay ( + replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay ( .clk, .rst, .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) @@ -154,9 +154,11 @@ module mvu_vvu_axi #( uwire [PE-1:0][ACCU_WIDTH-1:0] odat; typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; uwire mvauin_t amvau_i; + uwire mvauin_weight_t wmvau_i; if (IS_MVU) begin : genMVUInput assign amvau_i = amvau; + assign wmvau_i = s_axis_weights_tdata; end : genMVUInput else begin : genVVUInput // The input stream will have the channels interleaved for VVU when PE>1 @@ -164,11 +166,14 @@ module mvu_vvu_axi #( // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) - localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH; + localparam int num_of_elements = PE*SIMD; for (genvar i=0; i 1) ? 
amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; + assign wmvau_i[i*WEIGHT_WIDTH +: WEIGHT_WIDTH] = (PE > 1) ? + s_axis_weights_tdata[( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD + 1) * WEIGHT_WIDTH : ( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD ) * WEIGHT_WIDTH] + : s_axis_weights_tdata[i*WEIGHT_WIDTH +: WEIGHT_WIDTH]; end : genRewire end : genVVUInput @@ -178,7 +183,7 @@ module mvu_vvu_axi #( .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), + .last(alast && avld), .zero(!istb), .w(wmvau_i), .a(amvau_i), .vld(ovld), .p(odat) ); "mvu_4sx4u": From cc0737bcd00cdd6df6e3d4ff38215ac5d9eb42e6 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 13 Oct 2023 23:31:35 +0100 Subject: [PATCH 082/112] [mvu_vvu axi wrapper]: fix to IS_MVU parameter --- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index 9c65dbc06e..01deb23840 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -32,7 +32,7 @@ *****************************************************************************/ module $MODULE_NAME_AXI_WRAPPER$ #( - parameter IS_MVU = "$IS_MVU$", + parameter IS_MVU = $IS_MVU$, parameter COMPUTE_CORE = "$COMPUTE_CORE$", parameter MW = $MW$, parameter MH = $MH$, From c0eff0b819828a5e1d1ef80815f63be0042ce742 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 13 Oct 2023 23:32:47 +0100 Subject: [PATCH 083/112] [mvu_vvu tb]: WIP -- changes to self-checker and shape of input data --- finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv | 79 +++++++++++++++++----------- 1 file changed, 49 
insertions(+), 30 deletions(-) diff --git a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv index 82c2e8e7b0..b46fc588c9 100644 --- a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv @@ -35,23 +35,23 @@ module mvu_vvu_axi_tb(); //-------------------- Simulation parameters --------------------\\ // Matrix & parallelism config - localparam bit IS_MVU = 1; + localparam bit IS_MVU = 0; localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58"; - localparam int unsigned MW = 1500; - localparam int unsigned MH = 256; - localparam int unsigned SIMD = 60; - localparam int unsigned PE = 16; - localparam int unsigned SEGMENTLEN = 2.0; + localparam int unsigned MW = 36; + localparam int unsigned MH = 1; + localparam int unsigned SIMD = 3; + localparam int unsigned PE = 4; + localparam int unsigned SEGMENTLEN = 1.0; localparam bit FORCE_BEHAVIORAL = 1; localparam bit M_REG_LUT = 1; // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 4; - localparam int unsigned WEIGHT_WIDTH = 4; - localparam int unsigned ACCU_WIDTH = 21; // == ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW) - localparam bit SIGNED_ACTIVATIONS = 0; + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 6; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 1; // Simulation constants - localparam int unsigned NF = MH/PE; - localparam int unsigned SF = MW/SIMD; + localparam int unsigned NF = IS_MVU ? MH/PE : 1; + localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE); localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; localparam int unsigned ACTIVATION_WIDTH_BA = ((IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH+7)/8*8; localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; @@ -72,7 +72,7 @@ module mvu_vvu_axi_tb(); // Generate activations typedef logic [(IS_MVU ? 
1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[(IS_MVU ? 1 : NF)*SF]; + typedef activation_t activation_vector_t[SF]; function activation_vector_t init_ACTIVATIONS; automatic activation_vector_t res; @@ -93,14 +93,12 @@ module mvu_vvu_axi_tb(); activations.dat = 'X; @(posedge clk iff ap_rst_n); - for (int j=0; j<(IS_MVU ? 1 : NF); j++) begin - for (int i=0; i= 0; - @(posedge clk); - end while (!(activations.vld === 1 && activations.rdy === 1)); - end + for (int i=0; i= 0; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); end activations.vld <= 0; @@ -143,7 +141,9 @@ module mvu_vvu_axi_tb(); end // Function to compute golden output - // a: [(IS_MVU?1:NF)*SF][SIMD-1:0][ACTIVATION_WIDTH-1:0] + // a: [SF][(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] + // a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0] + // a: [SF][PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] // w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t; typedef output_t output_vector_t [NF]; @@ -156,14 +156,33 @@ module mvu_vvu_axi_tb(); function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); automatic output_vector_t res = '{default: 0}; - for (int j = 0; j 1 ? $signed(a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]) : $signed(a[j/PE*SF+i/SIMD][i%SIMD]) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]); - else - res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : - $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]}) : $signed({1'b0, a[j/PE+SF+i/SIMD][i%SIMD]}) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]); + // for (int j = 0; j 1 ? $signed(a[i/SIMD/PE][i % (SIMD*PE)]) : $signed(a[i/SIMD/PE][(i)%(SIMD*PE)]) ) * $signed(w[0][i/SIMD/PE][i/PE][i%SIMD]); + // else + // res[j/PE][j%PE] = IS_MVU ? 
$signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : + // $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[i/SIMD/PE][i % (SIMD*PE)]}) : $signed({1'b0, a[i/SIMD/PE][i%(SIMD*PE)]}) ) * $signed(w[0][i/SIMD][0][i%SIMD]); + // end + // end + // The input stream will have the channels interleaved for VVU when PE>1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + for (int i = 0; i < NF; i++) begin + for (int j = 0; j < SF; j++) begin + for (int k = 0; k < PE; k++) begin + for (int l = 0; l < SIMD; l++) begin + if (SIGNED_ACTIVATIONS) + res[i][k] = IS_MVU ? $signed(res[i][k]) + $signed(a[j][l]) * $signed(w[i][j][k][l]) : + $signed(res[i][k]) + $signed(a[j][k + l*PE]) * $signed(w[i][j][k][l]); + else + res[i][k] = IS_MVU ? $signed(res[i][k]) + $signed({1'b0, a[j][l]}) * $signed(w[i][j][k][l]) : + $signed(res[i][k]) + $signed({1'b0, a[j][k + l*PE]}) * $signed(w[i][j][k][l]); + end + end end end return res; From cf7f4946dc44f264de665e8a23893bd858277796 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 1 Nov 2023 15:20:07 +0000 Subject: [PATCH 084/112] [mvu vvu axi]: minor bugfixes to enable VVU --- finn-rtllib/mvu/mvu_vvu_axi.sv | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index f0f75c633a..ddedec1e8a 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -64,7 +64,7 @@ module mvu_vvu_axi #( localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, - localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE), + localparam int unsigned SF = MW/SIMD, localparam int unsigned NF = IS_MVU ? MH/PE : 1, localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 ) @@ -91,11 +91,11 @@ module mvu_vvu_axi #( //-------------------- Parameter sanity checks --------------------\\ initial begin - if ((MW % SIMD != 0 && IS_MVU) || (MW % (SIMD*PE) != 0 && !IS_MVU)) begin + if (MW % SIMD != 0) begin $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); $finish; end - if (MH % PE != 0 && IS_MVU) begin + if (MH % PE != 0) begin $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); $finish; end @@ -152,13 +152,10 @@ module mvu_vvu_axi #( //-------------------- Core MVU/VVU --------------------\\ uwire ovld; uwire [PE-1:0][ACCU_WIDTH-1:0] odat; - typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; uwire mvauin_t amvau_i; - uwire mvauin_weight_t wmvau_i; if (IS_MVU) begin : genMVUInput assign amvau_i = amvau; - assign wmvau_i = s_axis_weights_tdata; end : genMVUInput else begin : genVVUInput // The input stream will have the channels interleaved for VVU when PE>1 @@ -169,11 +166,8 @@ module mvu_vvu_axi #( localparam int num_of_elements = PE*SIMD; for (genvar i=0; i 1) ? - amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] + amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH -1: (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; - assign wmvau_i[i*WEIGHT_WIDTH +: WEIGHT_WIDTH] = (PE > 1) ? 
- s_axis_weights_tdata[( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD + 1) * WEIGHT_WIDTH : ( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD ) * WEIGHT_WIDTH] - : s_axis_weights_tdata[i*WEIGHT_WIDTH +: WEIGHT_WIDTH]; end : genRewire end : genVVUInput @@ -183,7 +177,7 @@ module mvu_vvu_axi #( .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(wmvau_i), .a(amvau_i), + .last(alast && avld), .zero(!istb), .w(s_axis_weights_tdata), .a(amvau_i), .vld(ovld), .p(odat) ); "mvu_4sx4u": From 5ffc221eaa07828001e423551ad05f8207178656 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 20 Nov 2023 14:35:45 +0000 Subject: [PATCH 085/112] [mvu vvu axi]: minor fix -- define mvauin_weight_t --- finn-rtllib/mvu/mvu_vvu_axi.sv | 1 + 1 file changed, 1 insertion(+) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index ddedec1e8a..8eb92a93e6 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -153,6 +153,7 @@ module mvu_vvu_axi #( uwire ovld; uwire [PE-1:0][ACCU_WIDTH-1:0] odat; uwire mvauin_t amvau_i; + typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; if (IS_MVU) begin : genMVUInput assign amvau_i = amvau; From 40d652ccb817295e5668ed765f8e348346584465 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 29 Nov 2023 14:02:33 +0000 Subject: [PATCH 086/112] [rtl mvu op]: minor fix to chain length estimation and enabled behavioral mode for rtl sim --- .../custom_op/fpgadataflow/matrixvectoractivation_rtl.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index d0a638475a..da560d73fd 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ 
b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -966,10 +966,12 @@ def _resolve_segment_len(self, clk): # Insert pipeline registers in the DSP58 chain to meet target clock frequency # 0.741 ns seems the worst-case delay through first DSP # 0.605 ns seems to be (on average) delay for all subsequent DSPs - critical_path_dsps = np.floor((clk - 0.741) / 0.605) + # clk >= (critical_path_dsps - 1) * 0.605 + 0.741 + assert (clk > 0.741), "Infeasible clk target of {} ns has been set, consider lowering the targeted clock frequency!".format(clk) + critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1) max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3) dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len - return max(1, dsp_chain_len) + return dsp_chain_len def _resolve_impl_style(self, fpgapart): # Based on target device and activation/weight-width, choose the @@ -1051,7 +1053,6 @@ def prepare_codegen_default(self, fpgapart, clk): [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] ) code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] - code_gen_dict["$FORCE_BEHAVIORAL$"] = [str(0)] return template_path, code_gen_dict From 6e98bac42f225e7ed8629e0cb67211e78db61d15 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 13 Dec 2023 09:36:25 +0000 Subject: [PATCH 087/112] [rtlsim]: use pyverilator util functions --- src/finn/custom_op/fpgadataflow/hlscustomop.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index 4fed8ed4b5..01b94c20ca 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -31,7 +31,7 @@ import subprocess import warnings from abc import abstractmethod -from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io +from pyverilator.util.axi_utils import 
_read_signal, reset_rtlsim, rtlsim_multi_io, toggle_clk from qonnx.core.datatype import DataType from qonnx.custom_op.base import CustomOp from qonnx.util.basic import roundup_to_integer_multiple @@ -491,15 +491,11 @@ def exec_precompiled_singlenode_model(self): def reset_rtlsim(self, sim): """Sets reset input in pyverilator to zero, toggles the clock and set it back to one""" - sim.io.ap_rst_n = 0 - sim.io.ap_clk = 1 - sim.io.ap_clk = 0 - sim.io.ap_rst_n = 1 + reset_rtlsim(sim) def toggle_clk(self, sim): """Toggles the clock input in pyverilator once.""" - sim.io.ap_clk = 1 - sim.io.ap_clk = 0 + toggle_clk(sim) def hls_sname(self): """Get the naming convention used by Vitis HLS for stream signals From 5dd74ad1dede3bf2a0405de8c803a4adfb2e65d3 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 8 Dec 2023 17:12:42 +0000 Subject: [PATCH 088/112] [mvu vvu axi]: sign extend output tdata (byte-aligned) --- finn-rtllib/mvu/mvu_vvu_axi.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 8eb92a93e6..699662bd72 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -243,6 +243,6 @@ module mvu_vvu_axi #( end assign m_axis_output_tvalid = B.vld; - assign m_axis_output_tdata = B.dat; + assign m_axis_output_tdata = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat}; endmodule : mvu_vvu_axi From b20410bfd968c27395537b60bba11849b599a33a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 Jan 2024 14:55:56 +0000 Subject: [PATCH 089/112] [mvu core]: dsp48 convert unpacked array to packed array to work around limitation on max array indices in Verilator --- finn-rtllib/mvu/mvu_4sx4u.sv | 4 ++-- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 706347d700..7a2af35742 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ 
b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -309,7 +309,7 @@ module mvu_4sx4u #( // Conclusive high part accumulation if(i >= PE_REM && i < 3) begin : genHi // Adder Tree across all SIMD high contributions, each from [-1:1] - uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; + uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node @@ -333,7 +333,7 @@ module mvu_4sx4u #( if(i >= PE_REM) begin : blkLo // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); - uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; + uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 07c44cf89a..1e6855f779 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -335,7 +335,7 @@ module mvu_8sx8u_dsp48 #( if(i >= PE_REM) begin : blkLo // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); - uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; + uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node From 1c2cc0c2c1d98d7cde569f65eb20873a10e1f12f Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 Jan 2024 14:57:19 +0000 Subject: [PATCH 090/112] [mvu axi]: update list of deduced parameters --- finn-rtllib/mvu/mvu_vvu_axi.sv | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 
699662bd72..dd357c94bb 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -60,13 +60,14 @@ module mvu_vvu_axi #( bit M_REG_LUT = 1, // Safely deducible parameters - localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, - localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, - localparam int unsigned SF = MW/SIMD, - localparam int unsigned NF = IS_MVU ? MH/PE : 1, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 + localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7) / 8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7) / 8 * 8, + localparam int unsigned OUTPUT_STREAM_WIDTH = PE * ACCU_WIDTH, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7) / 8 * 8, + localparam int unsigned SF = MW / SIMD, + localparam int unsigned NF = IS_MVU ? 
MH / PE : 1 ) ( // Global Control From eeb3cea623865a13d8da78acb5a9c7fc621caf0e Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 Jan 2024 14:58:02 +0000 Subject: [PATCH 091/112] [mvu custom-op]: remove lut-based implementation and update compute core selection --- .../matrixvectoractivation_rtl.py | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index da560d73fd..fcab06658c 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -191,7 +191,12 @@ def verify_node(self): if mem_mode not in ["decoupled", "external"]: info_messages.append( - "RTL-based MVAU supports only decoupled or external weights." + "RTL-based MVU only supports decoupled or external weights." + ) + + if self.get_nodeattr("resType") == "lut": + info_message.append( + "RTL-based MVU only supports DSP-based implementation" ) return info_messages @@ -635,7 +640,6 @@ def execute_node(self, context, graph): mem_mode = self.get_nodeattr("mem_mode") node = self.onnx_node - # TODO ensure codegen dir exists if mode == "cppsim": raise Exception( "cppsim not possible for RTL MVAU, please set exec_mode to rtlsim" @@ -801,7 +805,6 @@ def code_generation_ipi(self): rtllib_dir + "mvu_4sx4u.sv", rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", rtllib_dir + "mvu_8sx8u_dsp48.sv", - rtllib_dir + "mvu_vvu_lut.sv", ] for f in sourcefiles: cmd.append("add_files -norecurse %s" % (f)) @@ -897,7 +900,6 @@ def code_generation_ipi(self): rtllib_dir + "mvu_4sx4u.sv", rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", rtllib_dir + "mvu_8sx8u_dsp48.sv", - rtllib_dir + "mvu_vvu_lut.sv", ] for f in sourcefiles: cmd.append("add_files -norecurse %s" % (f)) @@ -964,8 +966,8 @@ def derive_characteristic_fxns(self, period): def _resolve_segment_len(self, clk): # Insert pipeline registers in the DSP58 
chain to meet target clock frequency - # 0.741 ns seems the worst-case delay through first DSP - # 0.605 ns seems to be (on average) delay for all subsequent DSPs + # ~0.741 ns seems the worst-case delay through first DSP + # ~0.605 ns seems to be (on average) delay for all subsequent DSPs # clk >= (critical_path_dsps - 1) * 0.605 + 0.741 assert (clk > 0.741), "Infeasible clk target of {} ns has been set, consider lowering the targeted clock frequency!".format(clk) critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1) @@ -976,22 +978,23 @@ def _resolve_segment_len(self, clk): def _resolve_impl_style(self, fpgapart): # Based on target device and activation/weight-width, choose the # supported RTL compute core - if self.get_nodeattr("resType") == "lut": - return "mvu_vvu_lut" + + assert self.get_nodeattr("resType") != "lut", "LUT-based RTL-MVU implementation currently not supported! Please change resType for {}".format(self.onnx_node.name) + + act_width = self.get_input_datatype(0).bitwidth() + weight_width = self.get_input_datatype(1).bitwidth() + is_versal = ( + fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] + or fpgapart[0:5] == "xqrvc" + ) + + if is_versal: + return "mvu_vvu_8sx9_dsp58" else: - act_width = self.get_input_datatype(0).bitwidth() - weight_width = self.get_input_datatype(1).bitwidth() - is_versal = ( - fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] - or fpgapart[0:5] == "xqrvc" - ) if act_width == 4 and weight_width == 4: return "mvu_4sx4u" else: - if is_versal: - return "mvu_vvu_8sx9_dsp58" - else: - return "mvu_8sx8u_dsp48" + return "mvu_8sx8u_dsp48" def generate_hdl(self, model, fpgapart, clk): # Generate params as part of IP preparation From 0813d1463a219384b4666fad2db93a4f7dee1a0f Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 Jan 2024 14:59:30 +0000 Subject: [PATCH 092/112] [mvu axi]: remove LUT-based compute core --- finn-rtllib/mvu/mvu_vvu_axi.sv | 11 +--- finn-rtllib/mvu/mvu_vvu_lut.sv | 104 
--------------------------------- 2 files changed, 2 insertions(+), 113 deletions(-) delete mode 100644 finn-rtllib/mvu/mvu_vvu_lut.sv diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index dd357c94bb..a3b051c9a1 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -120,8 +120,8 @@ module mvu_vvu_axi #( end end if (!IS_MVU) begin - if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin - $error("VVU only supported on DSP58 or LUT-based implementation"); + if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58") begin + $error("VVU only supported on DSP58"); $finish; end end @@ -195,13 +195,6 @@ module mvu_vvu_axi #( .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) ); - "mvu_vvu_lut": - mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) - ); default: initial begin $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); $finish; diff --git a/finn-rtllib/mvu/mvu_vvu_lut.sv b/finn-rtllib/mvu/mvu_vvu_lut.sv deleted file mode 100644 index c100910d75..0000000000 --- a/finn-rtllib/mvu/mvu_vvu_lut.sv +++ /dev/null @@ -1,104 +0,0 @@ -module mvu_vvu_lut #( - bit IS_MVU, - int unsigned PE, - int unsigned SIMD, - int unsigned ACCU_WIDTH, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - bit SIGNED_ACTIVATIONS, - bit M_REG = 1, - - localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH, - localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 
1 : PE) * SIMD -)( - // Global Control - input logic clk, - input logic rst, - input logic en, - - // Input - input logic last, - input logic zero, // ignore current inputs and force this partial product to zero - input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights - input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // (un)signed activations - - // Ouput - output logic vld, - output logic signed [PE-1:0][ACCU_WIDTH-1:0] p -); - - typedef int unsigned leave_load_t[2*SIMD-1]; - function leave_load_t init_leave_loads(); - automatic leave_load_t res; - for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; - for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; - return res; - endfunction : init_leave_loads - - // Pipeline for last indicator flag - uwire last_i; - generate if (M_REG) begin - logic [0:1] L = '0; - always_ff @(posedge clk) begin - if(rst) L <= '0; - else if (en) L <= {last, L[0]}; - end - assign last_i = L[1]; - end - else begin - logic L = '0; - always_ff @(posedge clk) begin - if(rst) L <= '0; - else if (en) L <= last; - end - assign last_i = L; - end - endgenerate - - // For each PE generate - for (genvar i = 0; i < PE; i++) begin : genPE - // Stage #1: SIMD multipliers in parallel - uwire [MULT_WIDTH-1 : 0] m1 [SIMD]; - for (genvar j = 0; j < SIMD; j++) begin : genSIMD - if (M_REG) begin : genMreg - logic [MULT_WIDTH-1 : 0] M [SIMD]; - always_ff @(posedge clk) begin - if(rst) M[j] = '{ default : 0 }; - else if (en) M[j] = zero ? 0 : - SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) : - $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); - // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication - end - assign m1[j] = M[j]; - end : genMreg - else begin : genNoMreg - assign m1[j] = zero ? 0 : - SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 
0 : SIMD*i) + j]) * $signed(w[i][j]) : - $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); - end : genNoMreg - end : genSIMD - - // Stage #2: Adder tree to reduce SIMD products - localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 }; - localparam int unsigned ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1)); - uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = $signed(m1[s]); - for(genvar n = 0; n < SIMD-1; n++) begin - // Sum truncated to actual maximum bit width at this node - localparam int unsigned NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1)); - uwire signed [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; - assign tree[n] = s; - end - - // Stage #3: Buffer output - logic [ACCU_WIDTH-1:0] P2 [PE]; - always_ff @(posedge clk) begin - if(rst) P2[i] = '{ default : 0}; - else if (en) P2[i] = (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]); - end - - assign vld = last_i; - assign p[i] = P2[i]; - end : genPE - -endmodule : mvu_vvu_lut From 4892d6614b734a08315062b86ec6d5e1f1af0dc1 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 11 Jan 2024 12:02:38 +0000 Subject: [PATCH 093/112] [hls custom-op]: enable reset in sim --- src/finn/custom_op/fpgadataflow/hlscustomop.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index 01b94c20ca..bc59c69192 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -600,6 +600,7 @@ def rtlsim_multi_io(self, sim, io_dict): trace_file=trace_file, sname=sname, liveness_threshold=pyverilate_get_liveness_threshold_cycles(), + do_reset=True, ) self.set_nodeattr("cycles_rtlsim", total_cycle_count) From 44f6e0f3e70eea06408b94a31e555f0f6b9ea358 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 11 Jan 2024 12:21:00 +0000 Subject: [PATCH 094/112] [test mvu rtl]: updated test flow 
(DSP58 only) --- .../test_fpgadataflow_mvau_rtl.py | 167 +++++++++--------- 1 file changed, 87 insertions(+), 80 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py index 3db7a718f5..1e9de44fb2 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py @@ -27,141 +27,148 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import pytest -import os import numpy as np +import os +import pickle from onnx import TensorProto, helper -from qonnx.util.basic import ( - qonnx_make_model, - gen_finn_dt_tensor -) -from qonnx.core.modelwrapper import ModelWrapper from qonnx.core.datatype import DataType -from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import ApplyConfig, GiveUniqueNodeNames, GiveReadableTensorNames +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model + import finn.core.onnx_exec as oxe import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths + + from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from qonnx.transformation.general import ApplyConfig -import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl -#import qonnx.core.data_layout as DataLayout +from 
finn.transformation.fpgadataflow.set_exec_mode import SetExecMode build_dir = os.environ["FINN_BUILD_DIR"] -def make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt): - (ofm_h, ofm_w) = ofm_shape - ofm = helper.make_tensor_value_info( - "ofm", - TensorProto.FLOAT, - (1, ofm_h, ofm_w, mh) - ) - - matmul_node = helper.make_node( - "MatMul", - ["ifm", "weights"], - ["ofm"] - ) - graph = helper.make_graph( - nodes=[matmul_node], - name="matmul_graph", - inputs=[ifm], - outputs=[ofm] - ) + +def make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W): + matmul_node = helper.make_node("MatMul", ["ifm", "weights"], ["ofm"]) + graph = helper.make_graph(nodes=[matmul_node], name="matmul_graph", inputs=[ifm], outputs=[ofm]) model = qonnx_make_model(graph, producer_name="fclayer-model") model = ModelWrapper(model) model.set_tensor_datatype("ifm", idt) model.set_tensor_datatype("weights", wdt) - model.set_tensor_datatype("ofm", DataType["INT32"]) # At this step, the MatMul layer does not optimize the bit-width of the output datatype + model.set_tensor_datatype( + "ofm", DataType["INT32"] + ) # At this step, the MatMul layer does not optimize the bit-width of the output datatype model.set_initializer("weights", W) - # model.set_tensor_layout("ifm", DataLayout.NHWC) return model + def prepare_inputs(input_tensor): - return {"inp": input_tensor} + return {"global_in": input_tensor} + -@pytest.mark.parametrize("mh", [16]) -@pytest.mark.parametrize("mw", [32]) -@pytest.mark.parametrize("pe", [1, 4, 16]) -#@pytest.mark.parametrize("simd", [1, 30, 90]) -@pytest.mark.parametrize("simd", [1, 4, 32]) +# @pytest.mark.parametrize("mh", [36]) +# @pytest.mark.parametrize("mw", [256]) +@pytest.mark.parametrize("mh", [9]) +@pytest.mark.parametrize("mw", [36]) +# @pytest.mark.parametrize("pe", [1, 4, 9, 36]) +# @pytest.mark.parametrize("simd", [1, 4, 16, 64, 256]) +@pytest.mark.parametrize("pe", [1, 3, 9]) +@pytest.mark.parametrize("simd", [1, 3, 6, 18, 36]) 
@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) -@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]]) -#@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"]) -@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"]) -@pytest.mark.parametrize("segmentlen", [1]) +@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT8"]]) +# @pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e"]) +@pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S"]) +@pytest.mark.parametrize("clk_ns", [1.66, 4]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen): +def test_fpgadataflow_mvau_rtl( + mh, mw, pe, simd, idt, wdt, part, clk_ns +): + if part == "xcku3p-ffva676-1-e" and clk_ns != 1.66: + pytest.skip("Skip test for varying clk for devices other than Versal, since this variable doesn't change anything for this test") + # Create test input vector (produced by SWG) ofm_shape = (5, 5) ofm_h, ofm_w = ofm_shape - ifm = helper.make_tensor_value_info( - "ifm", - TensorProto.FLOAT, - [1, ofm_h, ofm_w, mw] - ) - weights = helper.make_tensor_value_info( - "weights", - TensorProto.FLOAT, - [mw, mh] - ) + ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw]) + ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh)) W = gen_finn_dt_tensor(wdt, (mw, mh)) - model = make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt) + model = make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W) model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) - model.save(build_dir+"/matmul.onnx") + model.save(build_dir + "/matmul.onnx") # Create MatMul & obtain golden reference output - A = gen_finn_dt_tensor(model.get_tensor_datatype("ifm"), model.get_tensor_shape("ifm")) + A = 
gen_finn_dt_tensor(model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in")) input_dict = prepare_inputs(A) - ## Execute ONNX model - output_matmul = oxe.execute_onnx(model, input_dict) + # Execute ONNX model + output_matmul = oxe.execute_onnx(model, input_dict)["global_out"] + + with open(build_dir + "/onnx_output.pkl", "wb") as f: + pickle.dump(output_matmul, f) # Create MVAU (HLS) model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")) model = model.transform(GiveUniqueNodeNames()) - + # Apply folding (i.e. specify to use DSPs) folding_config = { "Defaults": {}, "MatrixVectorActivation_0": { - "PE" : pe, - "SIMD" : simd, - "mem_mode" : "decoupled", - "ram_style" : "auto", - "resType" : "dsp", - "impl" : "rtl" - } + "PE": pe, + "SIMD": simd, + "mem_mode": "decoupled", + "ram_style": "auto", + "resType": "dsp", + "preferred_backend" : "rtl" + }, } model = model.transform(ApplyConfig(folding_config)) - model.save(build_dir+"/mvau_hls.onnx") - - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(PrepareIP(part, 5)) - model = model.transform(HLSSynthIP()) - model = model.transform(PrepareRTLSim()) - output_mvau_hls = oxe.execute_onnx(model, input_dict)["ofm"] + model.save(build_dir + "/mvau_hls.onnx") # Apply convert-to-rtl step model = model.transform(to_rtl.InferRTLMatrixVectorActivation()) model = model.transform(GiveUniqueNodeNames()) - model.save(build_dir+"/mvau_rtl.onnx") + model.save(build_dir + "/mvau_rtl.onnx") + # Reset rtlsim_so and ip-related paths such that new Pyverilator SO and IP is generated + for n in model.graph.node: + getCustomOp(n).set_nodeattr("rtlsim_trace", build_dir + "/mvu_trace_rtl_nodebynode.vcd") + model = model.transform(SetExecMode("rtlsim")) - model = model.transform(PrepareIP("xcvm1802-vsvd1760-2MP-e-S", 5)) + model = model.transform(PrepareIP(part, clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) - output_mvau_rtl = 
oxe.execute_onnx(model, input_dict)["ofm"] + output_mvau_rtl = oxe.execute_onnx(model, input_dict)["global_out"] + + with open(build_dir + "/mvau_rtl_output.pkl", "wb") as f: + pickle.dump(output_mvau_rtl, f) + + model.save(build_dir + "/mvau_rtl_sim.onnx") + assert (output_matmul == output_mvau_rtl).all(), "Output of ONNX model not matching output of node-by-node sim!" + + model = model.transform(InsertAndSetFIFODepths(part, clk_ns)) + model = model.transform(PrepareIP(part, clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP(part, clk_ns)) - model.save(build_dir+"/mvau_rtl_sim.onnx") + os.environ["RTLSIM_TRACE_DEPTH"] = "3" + model.set_metadata_prop("rtlsim_so", "") + model.set_metadata_prop("exec_mode", "rtlsim") + model.set_metadata_prop("rtlsim_trace", build_dir + "/mvu_trace_rtl_stitch.vcd") + model.save(build_dir + "/stitched_ip.onnx") + output_mvau_rtl_stitch = oxe.execute_onnx(model, input_dict)["global_out"] - assert (output_mvau_hls == output_mvau_rtl).all() - assert (output_mvau_hls.size > 0) + assert (output_matmul == output_mvau_rtl_stitch).all(), "Output of ONNX model not matching output of stitched-IP RTL model!" 
\ No newline at end of file From 9b2ccebba2c3689d6a1e55b6df027f461244d216 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 11 Jan 2024 14:43:46 +0000 Subject: [PATCH 095/112] [mvu vvu axi]: reworked flow control and backpressure handling by tpreusser --- finn-rtllib/mvu/mvu_vvu_axi.sv | 130 ++++++++++++++++----------------- 1 file changed, 61 insertions(+), 69 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index a3b051c9a1..0168f20563 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -62,12 +62,12 @@ module mvu_vvu_axi #( // Safely deducible parameters localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7) / 8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH = SIMD * ACTIVATION_WIDTH, localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7) / 8 * 8, localparam int unsigned OUTPUT_STREAM_WIDTH = PE * ACCU_WIDTH, localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7) / 8 * 8, localparam int unsigned SF = MW / SIMD, - localparam int unsigned NF = IS_MVU ? 
MH / PE : 1 + localparam int unsigned NF = MH / PE ) ( // Global Control @@ -119,81 +119,73 @@ module mvu_vvu_axi #( $finish; end end - if (!IS_MVU) begin - if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58") begin - $error("VVU only supported on DSP58"); - $finish; - end - end end uwire clk = ap_clk; uwire rst = !ap_rst_n; - typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t; - - uwire mvauin_t amvau; + //- Replay to Accommodate Neuron Fold ----------------------------------- + typedef logic [PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t; + uwire mvu_flatin_t amvau; uwire alast; uwire afin; uwire avld; uwire ardy; - replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay ( + replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvu_flatin_t))) activation_replay ( .clk, .rst, - .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), + .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)), .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) ); -//-------------------- Input control --------------------\\ + //- Unflatten inputs into structured matrices --------------------------- + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] mvu_w_t; + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_a_t; + + uwire mvu_w_t mvu_w = s_axis_weights_tdata; + uwire mvu_a_t mvu_a = amvau; + + //- Flow Control Bracket around Compute Core ---------------------------- uwire en; uwire istb = avld && s_axis_weights_tvalid; assign ardy = en && s_axis_weights_tvalid; assign s_axis_weights_tready = en && avld; -//-------------------- Core MVU/VVU --------------------\\ - uwire ovld; - uwire [PE-1:0][ACCU_WIDTH-1:0] odat; - uwire mvauin_t amvau_i; - typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - - if (IS_MVU) begin : genMVUInput - assign amvau_i = amvau; - end : genMVUInput - else begin : genVVUInput - // The input stream will have the channels interleaved for VVU when PE>1 - // 
Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] - // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: - // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to - // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) - localparam int num_of_elements = PE*SIMD; - for (genvar i=0; i 1) ? - amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH -1: (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] - : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; - end : genRewire - end : genVVUInput + //- Instantiate compute core ---------------------------- + typedef logic [PE-1:0][ACCU_WIDTH-1:0] dsp_p_t; + uwire dsp_vld; + uwire dsp_p_t dsp_p; + + uwire dsp_clk = ap_clk; + uwire dsp_en = en; + uwire dsp_last = alast && avld; + uwire dsp_zero = !istb; + uwire mvu_w_t dsp_w = mvu_w; + uwire mvu_a_t dsp_a = mvu_a; + uwire ovld = dsp_vld; + uwire dsp_p_t odat = dsp_p; case(COMPUTE_CORE) "mvu_vvu_8sx9_dsp58": mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(s_axis_weights_tdata), .a(amvau_i), - .vld(ovld), .p(odat) + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) ); "mvu_4sx4u": mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) ); 
"mvu_8sx8u_dsp48": mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) ); default: initial begin $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); @@ -202,41 +194,41 @@ module mvu_vvu_axi #( endcase //-------------------- Output register slice --------------------\\ + // Make `en`computation independent from external inputs. + // Drive all outputs from registers. struct packed { - logic vld; + logic rdy; logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } A = '{ vld: 0, default: 'x}; - - assign en = !A.vld || !ovld; - - uwire b_load; - always_ff @(posedge clk) begin - if(rst) A <= '{ vld: 0, default: 'x }; - else if(!A.vld || b_load) begin - A.vld <= ovld && en; - for(int unsigned i = 0; i < PE; i++) begin - // CR-1148862: - // A.dat[i] <= odat[i]; - automatic logic [ACCU_WIDTH-1:0] v = odat[i]; - A.dat[i] <= v[ACCU_WIDTH-1:0]; - end - end - end - + } A = '{ rdy: 1, default: 'x }; // side-step register used when encountering backpressure struct packed { logic vld; logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } B = '{ vld: 0, default: 'x}; + } B = '{ vld: 0, default: 'x }; // ultimate output register + + assign en = A.rdy; + uwire b_load = !B.vld || m_axis_output_tready; - assign b_load = !B.vld || m_axis_output_tready; always_ff @(posedge clk) begin - if(rst) B <= '{ vld: 0, default: 'x }; + if(rst) begin + A <= '{ rdy: 1, default: 'x }; + B <= '{ vld: 0, default: 'x }; + end else begin - if(b_load) B <= '{ vld: A.vld, dat: A.dat}; + if(A.rdy) A.dat <= odat; + A.rdy <= (A.rdy && !ovld) || b_load; + + if(b_load) begin + B <= '{ + vld: ovld || !A.rdy, + dat: A.rdy? 
odat : A.dat + }; + end end end - assign m_axis_output_tvalid = B.vld; + // Why would we need a sign extension here potentially creating a higher signal load into the next FIFO? + // These extra bits should never be used. Why not 'x them out? assign m_axis_output_tdata = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat}; + endmodule : mvu_vvu_axi From 4ab65960c6e6acff1cbf9974704b17ab4e5446a5 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 11 Jan 2024 15:07:05 +0000 Subject: [PATCH 096/112] [hlsbackend]: update limit HLS axi streams (8k-1) --- src/finn/custom_op/fpgadataflow/hlsbackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py index 403b992a05..1b37cf138b 100644 --- a/src/finn/custom_op/fpgadataflow/hlsbackend.py +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -415,5 +415,5 @@ def get_ap_int_max_w(self): instream = self.get_instream_width() outstream = self.get_outstream_width() ret = max([instream, outstream]) - assert ret <= 32768, "AP_INT_MAX_W=%d is larger than allowed maximum of 32768" % ret + assert ret <= 8191, "AP_INT_MAX_W=%d is larger than allowed maximum of 8191" % ret return ret From 72ccc83afd36aa4cfb88b8cc5cee5af75a01db69 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 26 Jan 2024 11:11:05 +0000 Subject: [PATCH 097/112] [mvau hls]: refactored MVAU_hls custom_op --- .../hls/matrixvectoractivation_hls.py | 522 ++++++++++++++++++ 1 file changed, 522 insertions(+) create mode 100644 src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py new file mode 100644 index 0000000000..2ad9fefc07 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -0,0 +1,522 @@ +# Copyright (c) 2020, Xilinx +# All rights 
reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import math +import numpy as np +import os +import textwrap +import warnings +from qonnx.core.datatype import DataType +from qonnx.util.basic import ( + calculate_matvec_accumulator_range, + interleave_matrix_outer_dim_from_partitions, + roundup_to_integer_multiple, +) + +from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.util.data_packing import ( + npy_to_rtlsim_input, + numpy_to_hls_code, + pack_innermost_dim_as_hex_string, + rtlsim_output_to_npy, +) + +# ONNX i/o tensor shape assumptions for MatrixVectorActivation: +# input 0 is the input tensor, shape (.., i_size) = (..., MW) +# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH) +# (optional) input 2 is the thresholds tensor, shape (o_size, n_thres) +# output 0 is the output tensor, shape (.., o_size) = (..., MH) +# the ... here can be any shape (representing groups of vectors) + + +class MatrixVectorActivation_hls(MatrixVectorActivation, HLSBackend): + """Corresponds to finn-hlslib MatrixVectorActivation_Batch function.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(MatrixVectorActivation.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def get_template_param_values(self): + """Returns the template parameter values according to input, output and weight + data types.""" + ret = dict() + inp_hls_str = self.get_input_datatype().get_hls_datatype_str() + out_hls_str = self.get_output_datatype().get_hls_datatype_str() + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + # out_is_binary = self.get_output_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + if (inp_is_binary or wt_is_binary) and (not 
bin_xnor_mode): + raise Exception("True binary (non-bipolar) inputs not yet supported") + inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] + wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) + # fill in TSrcI and TWeightI + # TODO check these with Giulio + # TODO handle non-bipolar binary inputs + if inp_is_bipolar and wt_is_bipolar: + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and wt_is_bipolar: + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Recast" + elif inp_is_bipolar and (not wt_is_bipolar): + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and (not wt_is_bipolar): + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Identity" + + # fill in TDstI + ret["TDstI"] = "Slice<%s>" % out_hls_str + + return ret + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) + if mem_mode == "decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] + self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] + + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode not in ["const", "decoupled", "external"]: + raise Exception( + """Please set mem_mode to "const", "decoupled", or 
"external", + currently no other parameter value is supported!""" + ) + self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"'] + if self.calc_tmem() != 0: + # TODO find a better way of checking for no pregenerated thresholds + self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] + + def defines(self, var): + # Only ipgen mode: Make sure that SIMD parameter satisfies minimum requirements. + if var == "ipgen": + SIMD = self.get_nodeattr("SIMD") + MW = self.get_nodeattr("MW") + condition = SIMD >= (MW / 1024) + msg = ( + f"HLS synthesis of MatrixVectorActivation requires: " + f"SIMD >= MW / 1024. This is not fulfilled with: SIMD={SIMD} " + f"and MW={MW} for node: {self.onnx_node.name}." + ) + assert condition, msg + mem_mode = self.get_nodeattr("mem_mode") + numInputVectors = list(self.get_nodeattr("numInputVectors")) + numReps = np.prod(numInputVectors) + self.code_gen_dict["$DEFINES$"] = [ + """#define MW1 {}\n #define MH1 {}\n + #define SIMD1 {}\n #define PE1 {}\n #define WMEM1 {}\n + #define TMEM1 {}\n #define numReps {}""".format( + self.get_nodeattr("MW"), + self.get_nodeattr("MH"), + self.get_nodeattr("SIMD"), + self.get_nodeattr("PE"), + self.calc_wmem(), + self.calc_tmem(), + numReps, + ) + ] + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + # note: the innermost dim is reversed for the input + self.code_gen_dict["$READNPYDATA$"].append( 
+ 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + elem_bits = wdt.bitwidth() + packed_bits = self.get_weightstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = wdt.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/weights.npy" % code_gen_dir + + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + if mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> weights_{} ("weights_{}");'.format( + self.get_weightstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + mem_mode = self.get_nodeattr("mem_mode") + map_to_hls_mult_style = { + "auto": "ap_resource_dflt()", + "lut": "ap_resource_lut()", + "dsp": "ap_resource_dsp()", + } + tmpl_args = self.get_template_param_values() + if self.calc_tmem() == 0: + odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() + threshs = "PassThroughActivation<%s>()" % odtype_hls_str + else: + threshs = "threshs" + if mem_mode == "const": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Matrix_Vector_Activate_Batch + (in0_{}, out_{}, 
weights, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + if wdt == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + else: + export_wdt = wdt + wdtype_hls_str = export_wdt.get_hls_datatype_str() + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Matrix_Vector_Activate_Stream_Batch + (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + wdtype_hls_str, + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + shape = self.get_folded_output_shape() + shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") + + # note: the innermost dim is not reversed for the output + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + shape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + 
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}( + hls::stream> &in0_{}, + hls::stream> &weights_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_weightstream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + + else: + raise Exception( + """Please set mem_mode to "const" or "decoupled", currently no other + parameter value is supported!""" + ) + + def pragmas(self): + mem_mode = self.get_nodeattr("mem_mode") + ram_style_thresholds = self.get_nodeattr("ram_style_thresholds") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + if mem_mode == "const": + self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') + # the weight tensor is ap_uint [PE][WMEM] + # partition for parallel access along the PE dimension (dim 1) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") + ) + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() + ) + + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or external, + currently no other parameter value is supported!""" + ) + + # the threshold tensor is acc_type [PE][TMEM][N_THRES] + # partition for parallel access along PE and N_THRES + # dimensions (dims 1 and 3) 
+ if self.calc_tmem() != 0: + # TODO find a better way of checking for no pregenerated thresholds + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") + ) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") + ) + # add resource pragma for thresholds if set + if ram_style_thresholds == "distributed": + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM") + ) + elif ram_style_thresholds == "block": + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM") + ) + elif ram_style_thresholds == "auto": + # no pragma needed + pass + else: + raise Exception("Unrecognized ram_style_thresholds value:" + ram_style_thresholds) + + def get_ap_int_max_w(self): + # base class impl (max of inp/out stream widths) + max_of_io = super().get_ap_int_max_w() + # decoupled mode weight stream + weightstream = self.get_weightstream_width() + # single PE weight entry + weight_bits = self.get_weight_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + single_pe_w = simd * weight_bits + return max([weightstream, max_of_io, single_pe_w]) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") + node = self.onnx_node + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + export_idt = self.get_input_datatype() + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for MatrixVectorActivation") + in_ind += 1 + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + # reinterpret binary output as bipolar where needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == self.get_normal_output_shape() + ), "cppsim did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + self.reset_rtlsim(sim) + self.toggle_clk(sim) + if mem_mode in ["external", "decoupled"]: + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + wei = npy_to_rtlsim_input( + "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits + ) + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + 
} + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to "rtlsim" """.format( + mode + ) + ) \ No newline at end of file From 7a9b82babacdbf3730e602630e03ce614f88e965 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 26 Jan 2024 11:45:12 +0000 Subject: [PATCH 098/112] [refactor]: call to base_op_type method instead of custom_op type --- src/finn/analysis/fpgadataflow/res_estimation.py | 2 +- src/finn/transformation/fpgadataflow/create_stitched_ip.py | 3 ++- src/finn/transformation/fpgadataflow/floorplan.py | 2 +- src/finn/transformation/fpgadataflow/insert_dwc.py | 2 +- src/finn/transformation/fpgadataflow/insert_iodma.py | 2 +- src/finn/transformation/fpgadataflow/insert_tlastmarker.py | 4 ++-- src/finn/transformation/fpgadataflow/make_pynq_driver.py | 2 +- src/finn/transformation/fpgadataflow/make_zynq_proj.py | 2 +- src/finn/transformation/fpgadataflow/set_fifo_depths.py | 6 +++--- src/finn/transformation/fpgadataflow/set_folding.py | 2 +- 10 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py index be4cf417bc..a7f220daa9 100644 --- a/src/finn/analysis/fpgadataflow/res_estimation.py +++ b/src/finn/analysis/fpgadataflow/res_estimation.py @@ -60,8 +60,8 @@ def 
res_estimation_complete(model): res_dict = {} for node in model.graph.node: if is_fpgadataflow_node(node) is True: - op_type = node.op_type inst = registry.getCustomOp(node) + op_type = inst.base_op_type() if op_type == "MatrixVectorActivation" or op_type == "VectorVectorActivation": orig_restype = inst.get_nodeattr("resType") res_dict[node.name] = [] diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 1a182c7f4f..81c5848d57 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -48,12 +48,13 @@ def is_external_input(model, node, i): # True only if input is unconnected and has no initializer # Only esception is second input of FC layers when mem_mode is external node_inst = getCustomOp(node) + op_type = node_inst.base_op_type() producer = model.find_producer(node.input[i]) if producer is None: if model.get_initializer(node.input[i]) is None: return True else: - if node.op_type == "MatrixVectorActivation": + if op_type == "MatrixVectorActivation": if node_inst.get_nodeattr("mem_mode") == "external": return True return False diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py index ceb2bdb5c9..56e644f2b8 100644 --- a/src/finn/transformation/fpgadataflow/floorplan.py +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -150,7 +150,7 @@ def apply(self, model): continue elif not ( - node.op_type == "MatrixVectorActivation" + node_inst.base_op_type() == "MatrixVectorActivation" and node_inst.get_nodeattr("mem_mode") is not None and node_inst.get_nodeattr("mem_mode") == "external" ): diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index 81cee8dae4..d0029cb630 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ 
b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -88,7 +88,7 @@ def apply(self, model): # - if FC and external mem, it could be connected to input 1 # - if concat, could be connected to any input if ( - consumer.op_type == "MatrixVectorActivation" + n1.base_op_type() == "MatrixVectorActivation" and n1.get_nodeattr("mem_mode") == "external" ) or (consumer.op_type == "StreamingConcat"): # get input idx diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py index 93e3226b2a..fd546459fa 100644 --- a/src/finn/transformation/fpgadataflow/insert_iodma.py +++ b/src/finn/transformation/fpgadataflow/insert_iodma.py @@ -199,7 +199,7 @@ def apply(self, model): # attached IODMA fc_extw_nodes = list( filter( - lambda x: x.op_type in ["MatrixVectorActivation", "VectorVectorActivation"] + lambda x: getCustomOp(x).base_op_type() in ["MatrixVectorActivation", "VectorVectorActivation"] and getCustomOp(x).get_nodeattr("mem_mode") == "external" and model.find_producer(x.input[1]) is None, all_nodes, diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py index 157df46d71..ab5142e4d8 100644 --- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py +++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py @@ -103,7 +103,7 @@ def apply(self, model): # the input is in the list of graph inputs because it has an # initializer (TODO: fix this with a clean-up transform) if ( - first_node.op_type == "MatrixVectorActivation" + getCustomOp(first_node).base_op_type() == "MatrixVectorActivation" and get_by_name(first_node.attribute, "mem_mode").s.decode("UTF-8") != "external" ): @@ -117,7 +117,7 @@ def apply(self, model): num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1]) inp_idx = list(first_node.input).index(graph_in_name) if inp_idx > 0: - if first_node.op_type == "MatrixVectorActivation" and inp_idx == 1: + if 
getCustomOp(first_node).base_op_type() == "MatrixVectorActivation" and inp_idx == 1: stream_width = int(custom_op.get_weightstream_width()) elif first_node.op_type == "AddStreams_Batch" and inp_idx == 1: stream_width = int(custom_op.get_instream_width()) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index d5c2d8f2b5..e66236bf39 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -282,7 +282,7 @@ def apply(self, model): dataflow_model = ModelWrapper(dataflow_model_filename) rt_layer_ind = 0 for node in dataflow_model.graph.node: - if node.op_type in ["MatrixVectorActivation", "Thresholding_Batch"]: + if getCustomOp(node).base_op_type() == "MatrixVectorActivation" or node.op_type == "Thresholding_Batch": node_inst = getCustomOp(node) is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights") if is_rt_weights == 1: diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 989eb62a88..193e6e8b42 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -62,7 +62,7 @@ def collect_ip_dirs(model, ipstitch_path): ), """The directory that should contain the generated ip blocks doesn't exist.""" ip_dirs += [ip_dir_value] - if node.op_type in ["MatrixVectorActivation", "Thresholding_Batch"]: + if getCustomOp(node).base_op_type() == "MatrixVectorActivation" or node.op_type == "Thresholding_Batch": if node_inst.get_nodeattr("mem_mode") == "decoupled": need_memstreamer = True ip_dirs += [ipstitch_path + "/ip"] diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index da6099ab9a..8db8e4c549 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ 
b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -173,7 +173,7 @@ def apply(self, model): continue if fifo_cons is None: continue - if fifo_cons.op_type != "MatrixVectorActivation": + if getCustomOp(fifo_cons).base_op_type() != "MatrixVectorActivation": continue op_inst = getCustomOp(node) depth = op_inst.get_nodeattr("depth") @@ -280,7 +280,7 @@ def apply(self, model): node.set_nodeattr("inFIFODepths", ifd) node.set_nodeattr("outFIFODepths", ofd) - if node.onnx_node.op_type in extw_optypes: + if node.base_op_type() in extw_optypes: mmode = node.get_nodeattr("mem_mode") if mmode == "external": modified_fc_nodes.append(node.onnx_node.name) @@ -421,7 +421,7 @@ def apply(self, model): # (removed setting of node FIFO size attributes to 0 here) # for every extw node we changed from external to decoupled, # change back and reset implementation - if node.op_type in extw_optypes: + if getCustomOp(node).base_op_type() in extw_optypes: if node.name in modified_fc_nodes: node_inst = getCustomOp(node) node_inst.set_nodeattr("mem_mode", "external") diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 4045a28e16..7b65023abc 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -125,7 +125,7 @@ def apply(self, model): continue op_type = node.op_type node_inst = getCustomOp(node) - if op_type == "MatrixVectorActivation": + if node_inst.base_op_type() == "MatrixVectorActivation": max_simd = node_inst.get_nodeattr("MW") max_pe = node_inst.get_nodeattr("MH") node_inst.set_nodeattr("PE", 1) From 4556d2d8973f39279ca248a38e383e90ff042c08 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 26 Jan 2024 11:46:01 +0000 Subject: [PATCH 099/112] [hls custom-op]: add mvau_hls --- src/finn/custom_op/fpgadataflow/hls/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py
b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 3e31c9785e..8aebcdf54f 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -49,6 +49,7 @@ from finn.custom_op.fpgadataflow.hls.streamingmaxpool_hls import StreamingMaxPool_hls from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls +from finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls import MatrixVectorActivation_hls custom_op = dict() @@ -73,3 +74,4 @@ custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls custom_op["TLastMarker_hls"] = TLastMarker_hls custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls +custom_op["MatrixVectorActivation_hls"] = MatrixVectorActivation_hls \ No newline at end of file From 0b2fc98e7c15dac6f359ba0a3b76a61d562c18a5 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 26 Jan 2024 11:46:17 +0000 Subject: [PATCH 100/112] [hw custom-op]: refactor MVAU --- .../fpgadataflow/matrixvectoractivation.py | 822 ++++++------------ 1 file changed, 274 insertions(+), 548 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 4f24d71ccc..fd5751ef7d 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -31,20 +31,32 @@ import os import textwrap import warnings +from onnx import TensorProto, helper from qonnx.core.datatype import DataType +import qonnx.custom_op.general.xnorpopcount as xp +from qonnx.custom_op.general.multithreshold import multithreshold +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp from qonnx.util.basic import ( calculate_matvec_accumulator_range, interleave_matrix_outer_dim_from_partitions, roundup_to_integer_multiple, + qonnx_make_model ) -from 
finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) +import qonnx.core.data_layout as DataLayout +import finn.core.onnx_exec as oxe +from qonnx.transformation.infer_shapes import InferShapes +import onnx.numpy_helper as np_helper +from qonnx.transformation.general import GiveUniqueNodeNames + # ONNX i/o tensor shape assumptions for MatrixVectorActivation: # input 0 is the input tensor, shape (.., i_size) = (..., MW) @@ -54,9 +66,8 @@ # the ... here can be any shape (representing groups of vectors) -class MatrixVectorActivation(HLSCustomOp): - """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch - function.""" +class MatrixVectorActivation(HWCustomOp): + """Abstraction layer for HW implementation of MatrixVectorActivation layers.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -122,12 +133,14 @@ def get_nodeattr_types(self): # vector through the accelerator. This will get rid of any old # weight data from the weight FIFOs. 
"runtime_writeable_weights": ("i", False, 0, {0, 1}), - # Flag to specify whether RTL-based or HLS-based implementation is preferred - "preferred_backend": ("s", False, "rtl", {"hls", "rtl"}) - } + "preferred_impl_style" : ("s", False, "hls", {"hls", "rtl"}), + } my_attrs.update(super().get_nodeattr_types()) return my_attrs + def base_op_type(self): + return "MatrixVectorActivation" + def calc_wmem(self): """Calculates and returns WMEM.""" mw = self.get_nodeattr("MW") @@ -167,6 +180,61 @@ def infer_node_datatype(self, model): odt = self.get_output_datatype() model.set_tensor_datatype(node.output[0], odt) + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + # when performing FIFO insertion on an FC layer with ext weights, the ind + # parameter can be > 0 (referring to the weights) so handle that here + if ind == 0: + return DataType[self.get_nodeattr("inputDataType")] + elif ind == 1: + return DataType[self.get_nodeattr("weightDataType")] + else: + raise Exception("Undefined input ind for this layer type") + + def get_weight_datatype(self): + """Returns FINN DataType of weights.""" + return DataType[self.get_nodeattr("weightDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_instream_width(self, ind=0): + i_bits = self.get_input_datatype().bitwidth() + assert ( + i_bits <= 9 + ), "RTL-based MVAU only supports activations with bit-width up to 9-bits" + in_width = i_bits * self.get_nodeattr("SIMD") + return in_width + + def get_weightstream_width(self): + """Returns weight stream width. 
Used only in decoupled mode.""" + if ( + self.get_nodeattr("mem_mode") == "decoupled" + or self.get_nodeattr("mem_mode") == "external" + ): + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + wp = self.get_weight_datatype().bitwidth() + assert ( + wp <= 8 + ), "RTL-based MVAU only supports weights with bit-width up to 8-bits" + w_width = pe * simd * wp + return w_width + else: + return 0 + + def get_outstream_width(self, ind=0): + o_bits = self.get_output_datatype().bitwidth() + out_width = o_bits * self.get_nodeattr("PE") + return out_width + + def get_weightstream_width_padded(self): + """Returns weight stream width padded to a multiple of 8. This is required + by the AXI Stream spec. Used in decoupled mode.""" + weight_width = self.get_weightstream_width() + return roundup_to_integer_multiple(weight_width, 8) + def verify_node(self): info_messages = [] # verify that "backend" is set to "fpgadataflow" @@ -387,6 +455,25 @@ def dsp_estimation(self): else: mult_dsp = 0 return int(mult_dsp) +# # TODO: fix DSP estimations --> depends on fpga_part +# def dsp_estimation(self): +# # multiplication +# # mvu_8sx9 (DSP58): ceil(SIMD/3) +# # mvu_4sx4u (DSP48/DSP58): ceil(PE/4) +# # mvu_8sx8u (DSP48): ceil(PE/2) +# # mvu_lut: 0 +# P = self.get_nodeattr("PE") +# res_type = self.get_nodeattr("resType") +# Q = self.get_nodeattr("SIMD") +# wdt = self.get_weight_datatype() +# W = wdt.bitwidth() +# idt = self.get_input_datatype() +# A = idt.bitwidth() +# if res_type == "dsp": +# mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling +# else: +# mult_dsp = 0 +# return int(mult_dsp) def get_exp_cycles(self): pe = self.get_nodeattr("PE") @@ -399,6 +486,27 @@ def get_exp_cycles(self): exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv return int(exp_cycles) +# # TODO: fix exp_cycles estimations --> depends on fpga_part and clk +# def get_exp_cycles(self): +# # mvu_8sx9 (DSP58): +# # 2 (replay_buffer) + ceil(chainlen/seglen) + 2 (MREG, 
PREG) + 2 (output reg slice) +# # + MW/SIMD * MH/PE +# # mvu_4sx4u (DSP48/DSP58) / mvu_8sx8u (DSP48): +# # 3 (IN_REG, MREG, PREG) + 2 (replay_buffer) + 2 (output reg slice) + 1 (adder tree SIMD) + 1 (output lane) +# # + MW/SIMD * MH/PE +# # mvu_lut: +# # 2 (replay_buffer) + 1 OR 2 (no MREG OR MREG) + 2 (output reg slice) +# # + MW/SIMD * MH/PE +# pe = self.get_nodeattr("PE") +# simd = self.get_nodeattr("SIMD") +# num_inp_vec = self.get_nodeattr("numInputVectors") +# mh = self.get_nodeattr("MH") +# mw = self.get_nodeattr("MW") +# # since mmv != 1 is not supported yet, we set mmv for now to 1 +# mmv = 1 +# exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv +# return int(exp_cycles) + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" # when performing FIFO insertion on an FC layer with ext weights, the ind @@ -452,17 +560,6 @@ def get_weightstream_width_padded(self): weight_width = self.get_weightstream_width() return roundup_to_integer_multiple(weight_width, 8) - def get_ap_int_max_w(self): - # base class impl (max of inp/out stream widths) - max_of_io = super().get_ap_int_max_w() - # decoupled mode weight stream - weightstream = self.get_weightstream_width() - # single PE weight entry - weight_bits = self.get_weight_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - single_pe_w = simd * weight_bits - return max([weightstream, max_of_io, single_pe_w]) - def get_folded_input_shape(self, ind=0): mw = self.get_nodeattr("MW") mh = self.get_nodeattr("MH") @@ -507,82 +604,6 @@ def get_number_output_values(self): nf = np.prod(self.get_folded_output_shape()[:-1]) return nf - def get_template_param_values(self): - """Returns the template parameter values according to input, output and weight - data types.""" - ret = dict() - inp_hls_str = self.get_input_datatype().get_hls_datatype_str() - out_hls_str = self.get_output_datatype().get_hls_datatype_str() - inp_is_binary = self.get_input_datatype() == DataType["BINARY"] - # 
out_is_binary = self.get_output_datatype() == DataType["BINARY"] - wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] - bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 - if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): - raise Exception("True binary (non-bipolar) inputs not yet supported") - inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] - # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] - wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] - # reinterpret inp/wt as bipolar if bin_xnor_mode is iset - inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) - wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) - # fill in TSrcI and TWeightI - # TODO check these with Giulio - # TODO handle non-bipolar binary inputs - if inp_is_bipolar and wt_is_bipolar: - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and wt_is_bipolar: - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Recast" - elif inp_is_bipolar and (not wt_is_bipolar): - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and (not wt_is_bipolar): - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Identity" - - # fill in TDstI - ret["TDstI"] = "Slice<%s>" % out_hls_str - - return ret - - def get_hls_compatible_weight_tensor(self, orig_weight_matrix): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure MH % PE == 0 and MW % SIMD == 0 - * for bipolar {-1,+1} weights, convert to binary {0, 1} - * interleave rows between PEs - * reshape into (1, PE, WMEM, SIMD) and return - """ - mw = self.get_nodeattr("MW") - mh = self.get_nodeattr("MH") - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - wmem = self.calc_wmem() - assert orig_weight_matrix.shape == ( - mw, - mh, - ), """Weights matrix doesn't - have expected shape (mw, 
mh)""" - assert mw % simd == 0, "Requirement MH divisable by SIMD is violated." - assert mh % pe == 0, "Requirement MH divisable by PE is violated." - # start by transposing the original weight matrix, since ONNX and - # finn-hlslib use different assumptions - # ONNX uses (in_features, out_features) and matmul(x, W) - # finn-hlslib uses (out_features, in_features) and matmul(W, x) - ret = orig_weight_matrix.T - if self.get_weight_datatype() == DataType["BIPOLAR"]: - # convert bipolar to binary - ret = (ret + 1) / 2 - # interleave rows between PEs and reshape - # distribute rows between PEs - ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - # create SIMD as innermost dimension and add a dummy outer dim - ret = ret.reshape(1, pe, wmem, simd) - # reverse the SIMD dimension - ret = np.flip(ret, axis=-1) - return ret - def minimize_accumulator_width(self, model): """Minimize the accumulator bit width according to the weight values, input data types, and size of dot product""" @@ -730,6 +751,43 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): rows between PEs is not as expected (n_thres_steps)""" return ret.reshape(1, pe, tmem, n_thres_steps) + def get_hls_compatible_weight_tensor(self, orig_weight_matrix): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure MH % PE == 0 and MW % SIMD == 0 + * for bipolar {-1,+1} weights, convert to binary {0, 1} + * interleave rows between PEs + * reshape into (1, PE, WMEM, SIMD) and return + """ + mw = self.get_nodeattr("MW") + mh = self.get_nodeattr("MH") + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + wmem = self.calc_wmem() + assert orig_weight_matrix.shape == ( + mw, + mh, + ), """Weights matrix doesn't + have expected shape (mw, mh)""" + assert mw % simd == 0, "Requirement MH divisable by SIMD is violated." + assert mh % pe == 0, "Requirement MH divisable by PE is violated." 
+ # start by transposing the original weight matrix, since ONNX and + # finn-hlslib use different assumptions + # ONNX uses (in_features, out_features) and matmul(x, W) + # finn-hlslib uses (out_features, in_features) and matmul(W, x) + ret = orig_weight_matrix.T + if self.get_weight_datatype() == DataType["BIPOLAR"]: + # convert bipolar to binary + ret = (ret + 1) / 2 + # interleave rows between PEs and reshape + # distribute rows between PEs + ret = interleave_matrix_outer_dim_from_partitions(ret, pe) + # create SIMD as innermost dimension and add a dummy outer dim + ret = ret.reshape(1, pe, wmem, simd) + # reverse the SIMD dimension + ret = np.flip(ret, axis=-1) + return ret + def make_weight_file(self, weights, weight_file_mode, weight_file_name): """Produce a file containing given weights in appropriate format for this layer. This file can be used for either synthesis or run-time reconfig @@ -907,402 +965,68 @@ def generate_params(self, model, path): f_thresh.write(thresholds_hls_code) f_thresh.close() - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - mem_mode = self.get_nodeattr("mem_mode") - node = self.onnx_node - - # TODO ensure codegen dir exists - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - # create a npy file fore each input of the node (in_ind is input index) - in_ind = 0 - for inputs in node.input: - # it is assumed that the first input of the node is the data input - # the second input are the weights - # the third input are the thresholds - if in_ind == 0: - assert ( - str(context[inputs].dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = context[inputs].reshape(expected_inp_shape) - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - elif in_ind > 2: - raise Exception("Unexpected input found for MatrixVectorActivation") - in_ind += 1 - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - # reinterpret binary output as bipolar where needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == self.get_normal_output_shape() - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - if mem_mode == "external" or mem_mode == "decoupled": - wnbits = self.get_weightstream_width() - export_wdt = self.get_weight_datatype() - # we have 
converted bipolar weights to binary for export, - # so use it as such for weight generation - if self.get_weight_datatype() == DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict = { - "inputs": {"in0": inp, "weights": wei * num_w_reps}, - "outputs": {"out": []}, - } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] - else: - output = self.rtlsim(sim, inp) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) - - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] - self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] - - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode not in ["const", "decoupled", "external"]: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"'] - if self.calc_tmem() != 0: - # TODO find a better way of checking for no pregenerated thresholds - self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] - - def defines(self, var): - # Only ipgen mode: Make sure that SIMD parameter satisfies minimum requirements. 
- if var == "ipgen": - SIMD = self.get_nodeattr("SIMD") - MW = self.get_nodeattr("MW") - condition = SIMD >= (MW / 1024) - msg = ( - f"HLS synthesis of MatrixVectorActivation requires: " - f"SIMD >= MW / 1024. This is not fulfilled with: SIMD={SIMD} " - f"and MW={MW} for node: {self.onnx_node.name}." - ) - assert condition, msg - mem_mode = self.get_nodeattr("mem_mode") - numInputVectors = list(self.get_nodeattr("numInputVectors")) - numReps = np.prod(numInputVectors) - self.code_gen_dict["$DEFINES$"] = [ - """#define MW1 {}\n #define MH1 {}\n - #define SIMD1 {}\n #define PE1 {}\n #define WMEM1 {}\n - #define TMEM1 {}\n #define numReps {}""".format( - self.get_nodeattr("MW"), - self.get_nodeattr("MH"), - self.get_nodeattr("SIMD"), - self.get_nodeattr("PE"), - self.calc_wmem(), - self.calc_tmem(), - numReps, - ) - ] - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - # note: the innermost dim is reversed for the input - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - elem_bits = wdt.bitwidth() - packed_bits = self.get_weightstream_width() - packed_hls_type = 
"ap_uint<%d>" % packed_bits - elem_hls_type = wdt.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/weights.npy" % code_gen_dir - - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - mem_mode = self.get_nodeattr("mem_mode") - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - if mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> weights_{} ("weights_{}");'.format( - self.get_weightstream_width(), self.hls_sname(), self.hls_sname() - ) - ) + def get_op_and_param_counts(self): + in_features = self.get_nodeattr("MW") + out_features = self.get_nodeattr("MH") + weight_bits = self.get_weight_datatype().bitwidth() + inp_bits = self.get_input_datatype().bitwidth() + num_inp_vec = self.get_nodeattr("numInputVectors") + num_repetitions = int(np.prod(num_inp_vec)) + mac_count = in_features * out_features * num_repetitions + # cannonicalize op type: highest bitwidth operand first s.t. + # e.g. 
mac_8bx4b and mac_4bx8b don't appear as two different op types + bw1 = min(inp_bits, weight_bits) + bw2 = max(inp_bits, weight_bits) + mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) + weight_param_type = "param_weight_%db" % (weight_bits) + weight_count = in_features * out_features + ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} + if self.get_nodeattr("noActivation") == 0: + tdt = DataType[self.get_nodeattr("accDataType")] + thres_bits = tdt.bitwidth() + thres_param_type = "param_threshold_%db" % (thres_bits) + thres_count = out_features + ret_dict[thres_param_type] = thres_count + return ret_dict - def docompute(self): - mem_mode = self.get_nodeattr("mem_mode") - map_to_hls_mult_style = { - "auto": "ap_resource_dflt()", - "lut": "ap_resource_lut()", - "dsp": "ap_resource_dsp()", + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, } - tmpl_args = self.get_template_param_values() - if self.calc_tmem() == 0: - odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() - threshs = "PassThroughActivation<%s>()" % odtype_hls_str - else: - threshs = "threshs" - if mem_mode == "const": - self.code_gen_dict["$DOCOMPUTE$"] = [ - """Matrix_Vector_Activate_Batch<MW1, MH1, SIMD1, PE1, 1, {}, {}, {}> - (in0_{}, out_{}, weights, {}, numReps, {});""".format( - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - tmpl_args["TWeightI"], - self.hls_sname(), - self.hls_sname(), - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - if wdt == DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - else: - export_wdt = wdt - wdtype_hls_str = export_wdt.get_hls_datatype_str() - self.code_gen_dict["$DOCOMPUTE$"] = [ - """Matrix_Vector_Activate_Stream_Batch<MW1, MH1, SIMD1, PE1, {}, {}, {}, {} > - (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format( - tmpl_args["TSrcI"], -
tmpl_args["TDstI"], - tmpl_args["TWeightI"], - wdtype_hls_str, - self.hls_sname(), - self.hls_sname(), - self.hls_sname(), - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], - ) - ] - - else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - shape = self.get_folded_output_shape() - shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") - - # note: the innermost dim is not reversed for the output - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - shape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream<ap_uint<{}>> &in0_{}, - hls::stream<ap_uint<{}>> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}( - hls::stream<ap_uint<{}>> &in0_{}, - hls::stream<ap_uint<{}>> &weights_{}, - hls::stream<ap_uint<{}>> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_weightstream_width(), - self.hls_sname(), - self.get_outstream_width(), -
self.hls_sname(), - ) - ] - - else: - raise Exception( - """Please set mem_mode to "const" or "decoupled", currently no other - parameter value is supported!""" - ) - - def pragmas(self): mem_mode = self.get_nodeattr("mem_mode") - ram_style_thresholds = self.get_nodeattr("ram_style_thresholds") - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - if mem_mode == "const": - self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') - # the weight tensor is ap_uint [PE][WMEM] - # partition for parallel access along the PE dimension (dim 1) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") - ) - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() - ) + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + def execute_node(self, context, graph): + node = self.onnx_node + in_act = context[node.input[0]] + mvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0] + mvau_w = np_helper.to_array(mvau_w_init) + # Matrix multiplication + if self.get_nodeattr("binaryXnorMode"): + # Note: activation/weights are expected to be binary (by design coming from the transformation inferring this operation mode) + result = xp.xnorpopcountmatmul(in_act, mvau_w) + elif (self.get_nodeattr("inputDataType") == "BIPOLAR" and self.get_nodeattr("weightDataType") == "BIPOLAR"): + result = 
xp.xnorpopcountmatmul((in_act+1)/2, (mvau_w+1)/2) else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or external, - currently no other parameter value is supported!""" - ) - - # the threshold tensor is acc_type [PE][TMEM][N_THRES] - # partition for parallel access along PE and N_THRES - # dimensions (dims 1 and 3) - if self.calc_tmem() != 0: - # TODO find a better way of checking for no pregenerated thresholds - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") - ) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") - ) - # add resource pragma for thresholds if set - if ram_style_thresholds == "distributed": - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM") - ) - elif ram_style_thresholds == "block": - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM") - ) - elif ram_style_thresholds == "auto": - # no pragma needed - pass - else: - raise Exception("Unrecognized ram_style_thresholds value:" + ram_style_thresholds) + result = np.matmul(in_act, mvau_w) + # Thresholding if noActivation==0 + if self.get_nodeattr("noActivation") == 0: + mvau_thr_init = [x for x in graph.initializer if x.name == node.input[2]][0] + mvau_thr = np_helper.to_array(mvau_thr_init) + odt_is_bipolar = self.get_nodeattr("outputDataType") == DataType["BIPOLAR"] + out_scale = 2 if odt_is_bipolar else 1 + out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal") + result = multithreshold(result, mvau_thr, out_scale, out_bias) + + context[node.output[0]] = result def code_generation_ipi(self): cmd = [] @@ -1326,22 +1050,51 @@ def code_generation_ipi(self): cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) cmd.append( "create_bd_intf_pin -mode Master " - "-vlnv 
xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" + % (node_name, dout_name) ) cmd.append( "create_bd_intf_pin -mode Slave " "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) ) - # instantiate the hls ip - cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (self.get_nodeattr("ip_vlnv"), node_name, node_name) - ) + is_rtl_op = self.__class__.__name__ == "MatrixVectorActivation_rtl" + if is_rtl_op: + # instantiate the RTL block + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + rtllib_dir + "mvu_vvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append( + "create_bd_cell -type hier -reference %s /%s/%s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + self.onnx_node.name, + ) + ) + else: + # instantiate the hls ip + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (self.get_nodeattr("ip_vlnv"), node_name, node_name) + ) + # instantiate a streamer and connect it to the HLS IP strm_vlnv = "amd.com:finn:memstream:1.0" strm_inst = node_name + "_wstrm" cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" % (strm_vlnv, node_name, strm_inst) + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (strm_vlnv, node_name, strm_inst) ) cmd.append( "set_property -dict [list " @@ -1395,7 +1148,8 @@ def code_generation_ipi(self): axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0] cmd.append( "create_bd_intf_pin -mode Slave " - "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" % (node_name, axilite_name) + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" + % (node_name, 
axilite_name) ) cmd.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " @@ -1406,60 +1160,32 @@ def code_generation_ipi(self): cmd.append("assign_bd_address") cmd.append("save_bd_design") elif mem_mode == "const" or mem_mode == "external": - # base class impl sufficient for const/external modes - return super().code_generation_ipi() + if is_rtl_op and mem_mode == "external": + # instantiate the RTL block + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + rtllib_dir + "mvu_vvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append( + "create_bd_cell -type module -reference %s %s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + ) + ) + else: + # base class impl sufficient for const/external modes + return super().code_generation_ipi() else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") - return cmd - - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() - mem_mode = self.get_nodeattr("mem_mode") - sname = self.hls_sname() - if mem_mode == "external": - intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) - if mem_mode == "decoupled": - # only expose axilite interface if attribute is set - runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 - if runtime_writable: - intf_names["axilite"] = ["s_axilite"] - return intf_names - - def get_op_and_param_counts(self): - in_features = self.get_nodeattr("MW") - out_features = self.get_nodeattr("MH") - weight_bits = self.get_weight_datatype().bitwidth() - inp_bits = self.get_input_datatype().bitwidth() - 
num_inp_vec = self.get_nodeattr("numInputVectors") - num_repetitions = int(np.prod(num_inp_vec)) - mac_count = in_features * out_features * num_repetitions - # cannonicalize op type: highest bitwidth operand first s.t. - # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types - bw1 = min(inp_bits, weight_bits) - bw2 = max(inp_bits, weight_bits) - mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) - weight_param_type = "param_weight_%db" % (weight_bits) - weight_count = in_features * out_features - ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} - if self.get_nodeattr("noActivation") == 0: - tdt = DataType[self.get_nodeattr("accDataType")] - thres_bits = tdt.bitwidth() - thres_param_type = "param_threshold_%db" % (thres_bits) - thres_count = out_features - ret_dict[thres_param_type] = thres_count - return ret_dict - - def derive_characteristic_fxns(self, period): - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - }, - "outputs": {"out": []}, - } - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode in ["decoupled", "external"]: - n_weight_inps = self.calc_wmem() - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + return cmd \ No newline at end of file From 1a40e6a5ac5670a04d74ab893b82dab59e0538f9 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 26 Jan 2024 11:46:51 +0000 Subject: [PATCH 101/112] [VVAU hw custom-op]: add base_op_type method --- src/finn/custom_op/fpgadataflow/vectorvectoractivation.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index bd5bb75f1d..891730ece3 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ 
b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -104,6 +104,9 @@ def get_nodeattr_types(self): my_attrs.update(super().get_nodeattr_types()) return my_attrs + def base_op_type(self): + return "VectorVectorActivation" + def minimize_accumulator_width(self, model): """Minimize the accumulator bit width according to the weight values, input data types, and size of dot product""" From 5e1ed9be433f8ced81de8dbfd1bb52f5a505a6b4 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 26 Jan 2024 11:47:27 +0000 Subject: [PATCH 102/112] [transform]: add transformation to infer MVAU hw custom-op --- .../fpgadataflow/convert_to_hw_layers.py | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 2b8433e59c..4ea7f9298a 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1196,3 +1196,139 @@ def apply(self, model): graph_modified = True return (model, graph_modified) + +class InferQuantizedMatrixVectorActivation(Transformation): + """Convert MatMul layers with quantized inputs and weights to + MatrixVectorActivation layers.""" + + def __init__(self, mem_mode="const"): + super().__init__() + self.mem_mode = mem_mode + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is None: + mm_input = n.input[0] + mm_weight = n.input[1] + mm_output = n.output[0] + mm_in_shape = model.get_tensor_shape(mm_input) + mm_out_shape = model.get_tensor_shape(mm_output) + idt = model.get_tensor_datatype(mm_input) + wdt = model.get_tensor_datatype(mm_weight) + if idt.is_integer() and wdt.is_integer(): + mm_output = n.output[0] + W = model.get_initializer(mm_weight) + # extract weight shape, note that ONNX and 
finn-hlslib + # make different assumptions about dim order here + # ONNX assumes W has (in, out) shape + # finn-hlslib assumes W has (out, in) shape + mh = int(W.shape[1]) + mw = int(W.shape[0]) + # create node with no parallelization first + pe = 1 + simd = 1 + wmem = mw * mh // (pe * simd) + assert mw * mh == wmem * pe * simd, ( + n.name + + """: Requirement (MW * MH) divisible by + (WMEM * PE * SIMD) is violated.""" + ) + # see if we have any following thresholds + consumer = model.find_consumer(mm_output) + if consumer is not None and consumer.op_type == "MultiThreshold": + # TODO ensure integer thresholds? + # create MVTU (i.e. including activation) + mt_output = consumer.output[0] + mt_out_shape = model.get_tensor_shape(mt_output) + mt_thres = consumer.input[1] + T = model.get_initializer(mt_thres) + assert T.shape[0] == 1 or T.shape[0] == mh, ( + consumer.name + + """: First dimension of + thresholds neither 1 nor MH.""" + ) + odt = model.get_tensor_datatype(mt_output) + scale = getCustomOp(consumer).get_nodeattr("out_scale") + actval = getCustomOp(consumer).get_nodeattr("out_bias") + assert int(actval) == actval, ( + consumer.name + ": out_bias must be integer for HLS conversion." + ) + actval = int(actval) + odt_is_bipolar = odt == DataType["BIPOLAR"] + bipolar_ok = odt_is_bipolar and (scale == 2.0) and (actval == -1) + assert scale == 1.0 or bipolar_ok, ( + consumer.name + ": out_scale=1 or bipolar output needed for conversion." 
+ ) + assert (not odt.signed()) or (actval < 0), ( + consumer.name + ": Signed output requres actval < 0" + ) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mt_output, mt_out_shape) + if bipolar_ok: + # remove bias for bipolar, since + # binary->bipolar is achieved by reinterpretation + actval = 0 + # create and insert new MatrixVectorActivation node + new_node = helper.make_node( + "MatrixVectorActivation", + [mm_input, mm_weight, mt_thres], + [mt_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=actval, + binaryXnorMode=0, + noActivation=0, + numInputVectors=list(mm_in_shape[:-1]), + mem_mode=self.mem_mode, + name="MatrixVectorActivation_" + n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old nodes + graph.node.remove(n) + graph.node.remove(consumer) + graph_modified = True + else: + # no activation, matmul only + odt = model.get_tensor_datatype(mm_output) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mm_output, mm_out_shape) + # create and insert new MatrixVectorActivation node + new_node = helper.make_node( + "MatrixVectorActivation", + [mm_input, mm_weight], + [mm_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=0, + binaryXnorMode=0, + noActivation=1, + numInputVectors=list(mm_in_shape[:-1]), + mem_mode=self.mem_mode, + name="MatrixVectorActivation_" + n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(n) + graph_modified = True + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) \ No newline at end of file From 63c73c2970609ccc999df1bb122501e94f606ebb Mon Sep 17 
00:00:00 2001 From: mmrahorovic Date: Fri, 26 Jan 2024 12:22:52 +0000 Subject: [PATCH 103/112] removed mvu rtl code to clean up PR --- finn-rtllib/mvu/mvu_4sx4u.sv | 379 ---------------------- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 379 ---------------------- finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv | 430 ------------------------- finn-rtllib/mvu/mvu_vvu_axi.sv | 234 -------------- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 92 ------ finn-rtllib/mvu/replay_buffer.sv | 181 ----------- finn-rtllib/mvu/tb/mvu_8sx9_tb.sv | 165 ---------- finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv | 241 -------------- finn-rtllib/mvu/tb/replay_buffer_tb.sv | 130 -------- 9 files changed, 2231 deletions(-) delete mode 100644 finn-rtllib/mvu/mvu_4sx4u.sv delete mode 100644 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv delete mode 100644 finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv delete mode 100644 finn-rtllib/mvu/mvu_vvu_axi.sv delete mode 100644 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v delete mode 100644 finn-rtllib/mvu/replay_buffer.sv delete mode 100644 finn-rtllib/mvu/tb/mvu_8sx9_tb.sv delete mode 100644 finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv delete mode 100644 finn-rtllib/mvu/tb/replay_buffer_tb.sv diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv deleted file mode 100644 index 7a2af35742..0000000000 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ /dev/null @@ -1,379 +0,0 @@ -module mvu_4sx4u #( - int unsigned PE, - int unsigned SIMD, - int unsigned ACCU_WIDTH, - bit SIGNED_ACTIVATIONS = 0, - bit FORCE_BEHAVIORAL = 0 -)( - // Global Control - input logic clk, - input logic rst, - input logic en, - - // Input - input logic last, - input logic zero, // ignore current inputs and force this partial product to zero - input logic signed [PE-1:0][SIMD-1:0][3:0] w, // signed weights - input logic [SIMD-1:0][3:0] a, // unsigned activations - - // Ouput - output logic vld, - output logic signed [PE-1:0][ACCU_WIDTH-1:0] p -); - // for verilator always use behavioral code - localparam bit BEHAVIORAL = -`ifdef 
VERILATOR - 1 || -`endif - FORCE_BEHAVIORAL; - - typedef int unsigned leave_load_t[2*SIMD-1]; - function leave_load_t init_leave_loads(); - automatic leave_load_t res; - for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; - for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; - return res; - endfunction : init_leave_loads - - // Pipeline for last indicator flag - logic [1:5] L = '0; - always_ff @(posedge clk) begin - if(rst) L <= '0; - else if(en) L <= { last, L[1:4] }; - end - assign vld = L[5]; - - // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism - localparam int unsigned D[4:0] = '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; // Lane offsets - - localparam int unsigned PIPE_COUNT = (PE+3)/4; - for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes - - localparam int unsigned PE_BEG = 4*c; - localparam int unsigned PE_END = PE < 4*(c+1)? PE : 4*(c+1); - localparam int unsigned PE_REM = 4*(c+1) - PE_END; - - uwire [57:0] p3[SIMD]; - uwire signed [ 1:0] h3[SIMD][3]; - for(genvar s = 0; s < SIMD; s++) begin : genSIMD - - // Input Lane Assembly - uwire [23:0] bb = { {(20){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] }; - logic [33:0] aa; - logic [26:0] dd; - logic [ 1:0] xx[3:1]; - if(1) begin : blkVectorize - uwire [3:0] ww[PE_END - PE_BEG]; - for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin - assign ww[pe] = w[PE_BEG + pe][s]; - if(pe) begin - if(BEHAVIORAL) assign xx[pe + PE_REM] = zero? 
0 : ww[pe] * a[s]; -`ifndef VERILATOR - else begin - LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( - .O6(xx[pe + PE_REM][1]), - .O5(xx[pe + PE_REM][0]), - .I5(1'b1), - .I4(zero), - .I3(ww[pe][1]), - .I2(a[s][1]), - .I1(ww[pe][0]), - .I0(a[s][0]) - ); - end -`endif - end - end - always_comb begin - dd = '0; - aa = '0; - for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin - dd[D[pe + PE_REM]+:3] = ww[pe]; - aa[D[pe + PE_REM]+ 3] = ww[pe][3]; - end - end - end : blkVectorize - - uwire [57:0] pp; - - // Note: Since the product B * AD is computed, - // rst can be only applied to AD and zero only to B - // with the same effect as zeroing both. - if (BEHAVIORAL) begin : genBehav - // Stage #1: Input Refine - logic signed [23:0] B1 = 0; - always_ff @(posedge clk) begin - if(zero) B1 <= 0; - else if(en) B1 <= bb; - end - - logic signed [26:0] AD1 = 0; - always_ff @(posedge clk) begin - if(rst) AD1 <= 0; - else if(en) AD1 <= dd - aa; - end - - // Stage #2: Multiply - logic signed [50:0] M2 = 0; - always_ff @(posedge clk) begin - if(rst) M2 <= 0; - else if(en) M2 <= -// synthesis translate off - (B1 === '0) || (AD1 === '0)? 0 : -// synthesis translate on - B1 * AD1; - end - - // Stage #3: Accumulate - logic signed [57:0] P3 = 0; - always_ff @(posedge clk) begin - if(rst) P3 <= 0; - else if(en) P3 <= M2 + (L[3]? 
0 : P3); - end - - assign pp = P3; - end : genBehav -`ifndef VERILATOR - else begin : genDSP - DSP48E2 #( - // Feature Control Attributes: Data Path Selection - .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) - .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) - .BMULTSEL("B"), // Selects B input to multiplier (AD, B) - .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) - .PREADDINSEL("A"), // Selects input to pre-adder (A, B) - .RND('0), // Rounding Constant - .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) - .USE_SIMD("ONE48"), // SIMD selection (FOUR12, ONE58, TWO24) - .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) - .XORSIMD("XOR24_48_96"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) - - // Pattern Detector Attributes: Pattern Detection Configuration - .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH - .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
- .MASK('1), // 58-bit mask value for pattern detect (1=ignore) - .PATTERN('0), // 58-bit pattern match for pattern detect - .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 - .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) - .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) - - // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins - .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE - .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN - .IS_CLK_INVERTED('0), // Optional inversion for CLK - .IS_INMODE_INVERTED('0), // Optional inversion for INMODE - .IS_OPMODE_INVERTED(9'b00_010_01_01), // Optional inversion for OPMODE - .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN - .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE - .IS_RSTA_INVERTED('0), // Optional inversion for RSTA - .IS_RSTB_INVERTED('0), // Optional inversion for RSTB - .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A - .IS_RSTC_INVERTED('0), // Optional inversion for RSTC - .IS_RSTD_INVERTED('0), // Optional inversion for RSTD - .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE - .IS_RSTM_INVERTED('0), // Optional inversion for RSTM - .IS_RSTP_INVERTED('0), // Optional inversion for RSTP - - // Register Control Attributes: Pipeline Register Configuration - .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) - .ADREG(1), // Pipeline stages for pre-adder (0-1) - .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) - .AREG(0), // Pipeline stages for A (0-2) - .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2) - .BREG(1), // Pipeline stages for B (0-2) - .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) - .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) - .CREG(0), // Pipeline stages for C (0-1) - .DREG(0), // Pipeline stages for D (0-1) - .INMODEREG(0), // Pipeline 
stages for INMODE (0-1) - .MREG(1), // Multiplier pipeline stages (0-1) - .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) - .PREG(1) // Number of pipeline stages for P (0-1) - ) dsp ( - // Cascade outputs: Cascade Ports - .ACOUT(), // 34-bit output: A port cascade - .BCOUT(), // 24-bit output: B cascade - .CARRYCASCOUT(), // 1-bit output: Cascade carry - .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade - .PCOUT(), // 58-bit output: Cascade output - - // Control outputs: Control Inputs/Status Bits - .OVERFLOW(), // 1-bit output: Overflow in add/acc - .PATTERNBDETECT(), // 1-bit output: Pattern bar detect - .PATTERNDETECT(), // 1-bit output: Pattern detect - .UNDERFLOW(), // 1-bit output: Underflow in add/acc - - // Data outputs: Data Ports - .CARRYOUT(), // 4-bit output: Carry - .P(pp), // 58-bit output: Primary data - .XOROUT(), // 8-bit output: XOR data - - // Cascade inputs: Cascade Ports - .ACIN('x), // 34-bit input: A cascade data - .BCIN('x), // 24-bit input: B cascade - .CARRYCASCIN('x), // 1-bit input: Cascade carry - .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade - .PCIN('x), // 58-bit input: P cascade - - // Control inputs: Control Inputs/Status Bits - .CLK(clk), // 1-bit input: Clock - .ALUMODE(4'h0), // 4-bit input: ALU control - .CARRYINSEL('0), // 3-bit input: Carry select - .INMODE(5'b01100), // 5-bit input: INMODE control - .OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }), // 9-bit input: Operation mode - - // Data inputs: Data Ports - .A(aa), // 34-bit input: A data - .B(bb), // 24-bit input: B data - .C('x), // 58-bit input: C data - .CARRYIN('0), // 1-bit input: Carry-in - .D(dd), // 27-bit input: D data - - // Reset/Clock Enable inputs: Reset/Clock Enable Inputs - .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG - .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG - .CEAD(en), // 1-bit input: Clock enable for ADREG - .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE - .CEB1('0), // 1-bit input: 
Clock enable for 1st stage BREG - .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG - .CEC('0), // 1-bit input: Clock enable for CREG - .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG - .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG - .CED('0), // 1-bit input: Clock enable for DREG - .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG - .CEM(en), // 1-bit input: Clock enable for MREG - .CEP(en), // 1-bit input: Clock enable for PREG - .RSTA('0), // 1-bit input: Reset for AREG - .RSTB( // 1-bit input: Reset for BREG -// synthesis translate_off - rst || -// synthesis translate_on - zero - ), - .RSTC('0), // 1-bit input: Reset for CREG - .RSTD( // 1-bit input: Reset for DREG and ADREG -// synthesis translate_off - zero || -// synthesis translate_on - rst - ), - .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG - .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG - .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG - .RSTINMODE('0), // 1-bit input: Reset for INMODE register - .RSTM(rst), // 1-bit input: Reset for MREG - .RSTP(rst) // 1-bit input: Reset for PREG - ); - end : genDSP -`endif - - // External Canary Pipeline - logic [1:0] X1[3:1] = '{ default: 0 }; - logic [1:0] X2[3:1] = '{ default: 0 }; - logic [1:0] X3[3:1] = '{ default: 0 }; - always_ff @(posedge clk) begin - if(rst) begin - X1 <= '{ default: 0 }; - X2 <= '{ default: 0 }; - X3 <= '{ default: 0 }; - end - else if(en) begin - X1 <= xx; - X2 <= X1; - foreach(X3[i]) begin - X3[i] <= X2[i] + (L[3]? 2'h0 : pp[D[i]+:2]); - end - end - end - - // Derive actual cross-lane overflows - for(genvar i = 0; i < 3; i++) begin - assign h3[s][i] = pp[D[i+1]+:2] - X3[i+1]; - end - assign p3[s] = pp; - - end : genSIMD - - // Stage #4: Cross-SIMD Reduction - - // Count leaves reachable from each node - localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? 
init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop - - uwire signed [ACCU_WIDTH -1:0] up4; - uwire signed [ACCU_WIDTH -8:0] hi4[3]; - uwire [$clog2(SIMD)+7:0] lo4[3]; - for(genvar i = 0; i < 4; i++) begin - localparam int unsigned LO_WIDTH = D[i+1] - D[i]; - localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; - - // Conclusive high part accumulation - if(i >= PE_REM && i < 3) begin : genHi - // Adder Tree across all SIMD high contributions, each from [-1:1] - uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; - for(genvar n = 0; n < SIMD-1; n++) begin - // Sum truncated to actual maximum bit width at this node - uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = tree[2*n+1] + tree[2*n+2]; - assign tree[n] = s; - end - - // High Sideband Accumulation - logic signed [HI_WIDTH-1:0] Hi4 = 0; - always_ff @(posedge clk) begin - if(rst) Hi4 <= 0; - else if(en) Hi4 <= (L[4]? 
0 : Hi4) + tree[0]; - end - assign hi4[i] = Hi4; - end : genHi - else if (i < 3) begin : genHiZero - assign hi4[i] = '0; - end : genHiZero - - // Conclusive low part accumulation - if(i >= PE_REM) begin : blkLo - // Adder Tree across all SIMD low contributions - localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); - uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; - for(genvar n = 0; n < SIMD-1; n++) begin - // Sum truncated to actual maximum bit width at this node - localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); - uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; - assign tree[n] = s; - end - - logic [ROOT_WIDTH-1:0] Lo4 = 0; - always_ff @(posedge clk) begin - if(rst) Lo4 <= 0; - else if(en) Lo4 <= tree[0]; - end - - if(i == 3) assign up4 = Lo4; - else assign lo4[i] = Lo4; - end : blkLo - else begin : blkLoZero - assign lo4[i] = '0; - end : blkLoZero - - end - - // Stage #5: Resolve lane totals - logic signed [3:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 }; - always_ff @(posedge clk) begin - if(rst) Res5 <= '{ default: 0 }; - else if(en) begin - Res5[3] <= up4 - hi4[2]; - Res5[2] <= $signed({ hi4[2], {(D[3] - D[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1]; - Res5[1] <= $signed({ hi4[1], {(D[2] - D[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0]; - Res5[0] <= $signed({ hi4[0], {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] }); - end - end - - // Output - for(genvar pe = PE_BEG; pe < PE_END; pe++) begin - assign p[pe] = Res5[pe - PE_BEG + PE_REM]; - end - - end : genPipes - -endmodule : mvu_4sx4u diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv deleted file mode 100644 index 1e6855f779..0000000000 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ /dev/null @@ -1,379 +0,0 @@ -module mvu_8sx8u_dsp48 #( - int unsigned PE, - int unsigned SIMD, - int unsigned ACCU_WIDTH, - int unsigned ACTIVATION_WIDTH, - 
int unsigned WEIGHT_WIDTH, - - bit SIGNED_ACTIVATIONS = 0, - bit FORCE_BEHAVIORAL = 0, - - localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH -)( - // Global Control - input logic clk, - input logic rst, - input logic en, - - // Input - input logic last, - input logic zero, // ignore current inputs and force this partial product to zero - input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] w, // signed weights - input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS) - - // Ouput - output logic vld, - output logic signed [PE-1:0][ACCU_WIDTH-1:0] p -); - // for verilator always use behavioral code - localparam bit BEHAVIORAL = -`ifdef VERILATOR - 1 || -`endif - FORCE_BEHAVIORAL; - - typedef int unsigned leave_load_t[2*SIMD-1]; - function leave_load_t init_leave_loads(); - automatic leave_load_t res; - for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; - for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; - return res; - endfunction : init_leave_loads - - // Pipeline for last indicator flag - logic [1:5] L = '0; - always_ff @(posedge clk) begin - if(rst) L <= '0; - else if(en) L <= { last, L[1:4] }; - end - assign vld = L[5]; - - // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism - localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets - - localparam int unsigned PIPE_COUNT = (PE+1)/2; - for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes - - localparam int unsigned PE_BEG = 2*c; - localparam int unsigned PE_END = PE < 2*(c+1)? 
PE : 2*(c+1); - localparam int unsigned PE_REM = 2*(c+1) - PE_END; - - uwire [57:0] p3[SIMD]; - uwire signed [ 1:0] h3[SIMD]; - for(genvar s = 0; s < SIMD; s++) begin : genSIMD - - // Input Lane Assembly - uwire [23:0] bb = { {(24-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; - logic [33:0] aa; - logic [26:0] dd; - logic [ 1:0] xx; - if(1) begin : blkVectorize - uwire [WEIGHT_WIDTH-1:0] ww[PE_END - PE_BEG]; - for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin - assign ww[pe] = w[PE_BEG + pe][s]; - if(pe) begin - if(BEHAVIORAL) assign xx = zero? 0 : ww[pe] * a[s]; -`ifndef VERILATOR - else begin - LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( - .O6(xx[1]), - .O5(xx[0]), - .I5(1'b1), - .I4(zero), - .I3(ww[pe][1]), - .I2(a[s][1]), - .I1(ww[pe][0]), - .I0(a[s][0]) - ); - end -`endif - end - end - always_comb begin - dd = '0; - aa = '0; - for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin - dd[D[pe + PE_REM] +: WEIGHT_WIDTH-1] = ww[pe]; - aa[D[pe + PE_REM] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; - end - end - end : blkVectorize - - uwire [57:0] pp; - - // Note: Since the product B * AD is computed, - // rst can be only applied to AD and zero only to B - // with the same effect as zeroing both. - if(BEHAVIORAL) begin : genBehav - // Stage #1: Input Refine - logic signed [23:0] B1 = 0; - always_ff @(posedge clk) begin - if(zero) B1 <= 0; - else if(en) B1 <= bb; - end - - logic signed [26:0] AD1 = 0; - always_ff @(posedge clk) begin - if(rst) AD1 <= 0; - else if(en) AD1 <= dd - aa; - end - - // Stage #2: Multiply - logic signed [50:0] M2 = 0; - always_ff @(posedge clk) begin - if(rst) M2 <= 0; - else if(en) M2 <= -// synthesis translate off - (B1 === '0) || (AD1 === '0)? 0 : -// synthesis translate on - B1 * AD1; - end - - // Stage #3: Accumulate - logic signed [57:0] P3 = 0; - always_ff @(posedge clk) begin - if(rst) P3 <= 0; - else if(en) P3 <= M2 + (L[3]? 
0 : P3); - end - - assign pp = P3; - end : genBehav -`ifndef VERILATOR - else begin : genDSP - DSP48E2 #( - // Feature Control Attributes: Data Path Selection - .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) - .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) - .BMULTSEL("B"), // Selects B input to multiplier (AD, B) - .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) - .PREADDINSEL("A"), // Selects input to pre-adder (A, B) - .RND('0), // Rounding Constant - .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) - .USE_SIMD("ONE48"), // SIMD selection (FOUR12, ONE58, TWO24) - .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) - .XORSIMD("XOR24_48_96"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) - - // Pattern Detector Attributes: Pattern Detection Configuration - .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH - .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
- .MASK('1), // 58-bit mask value for pattern detect (1=ignore) - .PATTERN('0), // 58-bit pattern match for pattern detect - .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 - .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) - .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) - - // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins - .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE - .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN - .IS_CLK_INVERTED('0), // Optional inversion for CLK - .IS_INMODE_INVERTED('0), // Optional inversion for INMODE - .IS_OPMODE_INVERTED(9'b00_010_01_01), // Optional inversion for OPMODE - .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN - .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE - .IS_RSTA_INVERTED('0), // Optional inversion for RSTA - .IS_RSTB_INVERTED('0), // Optional inversion for RSTB - .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A - .IS_RSTC_INVERTED('0), // Optional inversion for RSTC - .IS_RSTD_INVERTED('0), // Optional inversion for RSTD - .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE - .IS_RSTM_INVERTED('0), // Optional inversion for RSTM - .IS_RSTP_INVERTED('0), // Optional inversion for RSTP - - // Register Control Attributes: Pipeline Register Configuration - .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) - .ADREG(1), // Pipeline stages for pre-adder (0-1) - .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) - .AREG(0), // Pipeline stages for A (0-2) - .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2) - .BREG(1), // Pipeline stages for B (0-2) - .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) - .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) - .CREG(0), // Pipeline stages for C (0-1) - .DREG(0), // Pipeline stages for D (0-1) - .INMODEREG(0), // Pipeline 
stages for INMODE (0-1) - .MREG(1), // Multiplier pipeline stages (0-1) - .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) - .PREG(1) // Number of pipeline stages for P (0-1) - ) dsp ( - // Cascade outputs: Cascade Ports - .ACOUT(), // 34-bit output: A port cascade - .BCOUT(), // 24-bit output: B cascade - .CARRYCASCOUT(), // 1-bit output: Cascade carry - .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade - .PCOUT(), // 58-bit output: Cascade output - - // Control outputs: Control Inputs/Status Bits - .OVERFLOW(), // 1-bit output: Overflow in add/acc - .PATTERNBDETECT(), // 1-bit output: Pattern bar detect - .PATTERNDETECT(), // 1-bit output: Pattern detect - .UNDERFLOW(), // 1-bit output: Underflow in add/acc - - // Data outputs: Data Ports - .CARRYOUT(), // 4-bit output: Carry - .P(pp), // 58-bit output: Primary data - .XOROUT(), // 8-bit output: XOR data - - // Cascade inputs: Cascade Ports - .ACIN('x), // 34-bit input: A cascade data - .BCIN('x), // 24-bit input: B cascade - .CARRYCASCIN('x), // 1-bit input: Cascade carry - .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade - .PCIN('x), // 58-bit input: P cascade - - // Control inputs: Control Inputs/Status Bits - .CLK(clk), // 1-bit input: Clock - .ALUMODE(4'h0), // 4-bit input: ALU control - .CARRYINSEL('0), // 3-bit input: Carry select - .INMODE(5'b01100), // 5-bit input: INMODE control - .OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }), // 9-bit input: Operation mode - - // Data inputs: Data Ports - .A(aa), // 34-bit input: A data - .B(bb), // 24-bit input: B data - .C('x), // 58-bit input: C data - .CARRYIN('0), // 1-bit input: Carry-in - .D(dd), // 27-bit input: D data - - // Reset/Clock Enable inputs: Reset/Clock Enable Inputs - .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG - .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG - .CEAD(en), // 1-bit input: Clock enable for ADREG - .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE - .CEB1('0), // 1-bit input: 
Clock enable for 1st stage BREG - .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG - .CEC('0), // 1-bit input: Clock enable for CREG - .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG - .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG - .CED('0), // 1-bit input: Clock enable for DREG - .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG - .CEM(en), // 1-bit input: Clock enable for MREG - .CEP(en), // 1-bit input: Clock enable for PREG - .RSTA('0), // 1-bit input: Reset for AREG - .RSTB( // 1-bit input: Reset for BREG -// synthesis translate_off - rst || -// synthesis translate_on - zero - ), - .RSTC('0), // 1-bit input: Reset for CREG - .RSTD( // 1-bit input: Reset for DREG and ADREG -// synthesis translate_off - zero || -// synthesis translate_on - rst - ), - .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG - .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG - .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG - .RSTINMODE('0), // 1-bit input: Reset for INMODE register - .RSTM(rst), // 1-bit input: Reset for MREG - .RSTP(rst) // 1-bit input: Reset for PREG - ); - end : genDSP -`endif - - // External Canary Pipeline - logic [1:0] X1 = '{ default: 0 }; - logic [1:0] X2 = '{ default: 0 }; - logic [1:0] X3 = '{ default: 0 }; - always_ff @(posedge clk) begin - if(rst) begin - X1 <= '{ default: 0 }; - X2 <= '{ default: 0 }; - X3 <= '{ default: 0 }; - end - else if(en) begin - X1 <= xx; - X2 <= X1; - X3 <= X2 + (L[3]? 2'h0 : pp[D[1]+:2]); - end - end - - // Derive actual cross-lane overflows - assign h3[s] = pp[D[1]+:2] - X3; - - assign p3[s] = pp; - - end : genSIMD - - // Stage #4: Cross-SIMD Reduction - - // Count leaves reachable from each node - localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? 
init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop - - uwire signed [ACCU_WIDTH -1:0] up4; - uwire signed [ACCU_WIDTH -SINGLE_PROD_WIDTH:0] hi4; - uwire [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0] lo4; - - // Conclusive high part accumulation - if(PE_REM == 0) begin : genHi - localparam int unsigned HI_WIDTH = ACCU_WIDTH - D[1]; - // Adder Tree across all SIMD high contributions, each from [-1:1] - uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; - for(genvar n = 0; n < SIMD-1; n++) begin - // Sum truncated to actual maximum bit width at this node - uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = tree[2*n+1] + tree[2*n+2]; - assign tree[n] = s; - end - - // High Sideband Accumulation - logic signed [HI_WIDTH-1:0] Hi4 = 0; - always_ff @(posedge clk) begin - if(rst) Hi4 <= 0; - else if(en) Hi4 <= (L[4]? 0 : Hi4) + tree[0]; - end - assign hi4 = Hi4; - end : genHi - else begin : genHiZero - assign hi4 = '0; - end : genHiZero - - for(genvar i = 0; i < 2; i++) begin - localparam int unsigned LO_WIDTH = D[i+1] - D[i]; - // Conclusive low part accumulation - if(i >= PE_REM) begin : blkLo - // Adder Tree across all SIMD low contributions - localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); - uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; - for(genvar n = 0; n < SIMD-1; n++) begin - // Sum truncated to actual maximum bit width at this node - localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); - uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; - assign tree[n] = s; - end - - logic [ROOT_WIDTH-1:0] Lo4 = 0; - always_ff @(posedge clk) begin - if(rst) Lo4 <= 0; - else if(en) Lo4 <= tree[0]; - end - - if(i == 1) assign up4 = Lo4; - else assign lo4 = Lo4; - end : blkLo - else begin : blkLoZero - assign lo4 = '0; 
- end : blkLoZero - - end - - // Stage #5: Resolve lane totals - logic signed [1:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 }; - always_ff @(posedge clk) begin - if(rst) Res5 <= '{ default: 0 }; - else if(en) begin - Res5[1] <= up4 - hi4; - Res5[0] <= $signed({ hi4, {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4 }); - end - end - - // Output - for(genvar pe = PE_BEG; pe < PE_END; pe++) begin - assign p[pe] = Res5[pe - PE_BEG + PE_REM]; - end - - end : genPipes - -endmodule : mvu_8sx8u_dsp48 diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv deleted file mode 100644 index 53cf71fd5f..0000000000 --- a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv +++ /dev/null @@ -1,430 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP58. - *****************************************************************************/ - -module mvu_vvu_8sx9_dsp58 #( - bit IS_MVU, - int unsigned PE, - int unsigned SIMD, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - int unsigned ACCU_WIDTH, - bit SIGNED_ACTIVATIONS = 0, - int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) - bit FORCE_BEHAVIORAL = 0, - - localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD, - localparam int unsigned WEIGHT_ELEMENTS = PE*SIMD - ) - ( - // Global Control - input logic clk, - input logic rst, - input logic en, - - // Input - input logic last, - input logic zero, // ignore current inputs and force this partial product to zero - input logic [WEIGHT_ELEMENTS-1:0][WEIGHT_WIDTH-1:0] w, // weights - input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations - - // Ouput - output logic vld, - output logic [PE-1:0][ACCU_WIDTH-1:0] p - ); - // for verilator always use behavioral code - localparam bit BEHAVIORAL = -`ifdef VERILATOR - 1 || -`endif - FORCE_BEHAVIORAL; - -//-------------------- Declare global signals --------------------\\ - localparam int unsigned CHAINLEN = (SIMD+2)/3; - localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? 
CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length - localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE; - uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN]; - uwire [23:0] b_in_i [PE][CHAINLEN]; - uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator - -//-------------------- Shift register for opmode select signal --------------------\\ - localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) - logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) - - always_ff @(posedge clk) begin - if(rst) L <= '{default: 0}; - else if(en) begin - L[1+MAX_PIPELINE_STAGES] <= last; - L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES]; - end - end - assign vld = L[0]; - -//-------------------- Shift register for ZERO flag --------------------\\ - logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) - - if (MAX_PIPELINE_STAGES > 1) begin : genZreg - always_ff @(posedge clk) begin - if (rst) Z <= '{default: 0}; - else if(en) begin - Z[0] <= zero; - if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3]; - end - end - end; - -//-------------------- Buffer for input activations --------------------\\ - localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; - for (genvar k=0; k1 ? TOTAL_PREGS-1 : 0; - localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? 
SIMD - 3*i : 3; - - if (EXTERNAL_PREGS > 0) begin : genExternalPregAct - logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0}; - always_ff @(posedge clk) begin - if (rst) A <= '{default: 0}; - else if(en) begin - A[EXTERNAL_PREGS-1] <= - // synthesis translate_off - zero ? '1 : - // synthesis translate_on - a[SIMD*k + 3*i +: LANES_OCCUPIED]; - if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; - end - end - for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0; - localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3; - - if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight - logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0}; - always_ff @(posedge clk) begin - if (rst) B <= '{default: 0}; - else if (en) begin - B[i][EXTERNAL_PREGS-1] <= -// synthesis translate_off - zero ? '1 : -// synthesis translate_on - //w[i][3*j +: LANES_OCCUPIED]; - w[SIMD*i+3*j +: LANES_OCCUPIED]; - if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; - end - end - for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin - assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] }; - end : genBin - for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero - assign b_in_i[i][j][8*k +: 8] = 8'b0; - end : genBinZero - end : genExternalPregWeight - else begin : genInpDSPWeight - for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin - assign b_in_i[i][j][8*k +: 8] = -// synthesis translate_off - zero ? '1 : -// synthesis translate_on - //PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; - PAD_BITS_WEIGHT == 0 ? 
w[SIMD*i+3*j+k] : { {PAD_BITS_WEIGHT{w[SIMD*i+3*j+k][WEIGHT_WIDTH-1]}}, w[SIMD*i+3*j+k] }; - end : genBin - for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero - assign b_in_i[i][j][8*k +: 8] = 8'b0; - end : genBinZero - end : genInpDSPWeight - end : genWeightSIMD - end : genWeightPE - -//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\ - for (genvar i=0; i0 ? 2 : 1; // 1 : 0 - localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1; - localparam bit FIRST = j == 0; - localparam bit LAST = j == CHAINLEN-1; - uwire [57:0] pp; - - if (LAST) begin : genPOUT - assign p[i] = pp[ACCU_WIDTH-1:0]; - end - - // Note: Since the product B * AD is computed, - // rst can be only applied to AD and zero only to B - // with the same effect as zeroing both. - if(BEHAVIORAL) begin : genBehav - // Stage #1: Input A/B - logic signed [33:0] Areg [INTERNAL_PREGS]; - always_ff @(posedge clk) begin - if (rst) Areg <= '{ default : 0}; - else if (en) begin - Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }; - if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0]; - end - end - logic signed [23:0] Breg [INTERNAL_PREGS]; - always_ff @(posedge clk) begin - if (rst) Breg <= '{ default : 0}; - else if (en) begin - Breg[0] <= b_in_i[i][j]; - if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0]; - end - end - - // Stage #2: Multiply-Accumulate - logic signed [57:0] Mreg; - logic InmodeZero = 0; - always_ff @(posedge clk) begin - if (rst) InmodeZero <= 0; - else if (en) InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero ); - end - always_ff @(posedge clk) begin - if (rst) Mreg <= 0; - else if (en) begin - automatic logic signed [57:0] m = 0; - for (int k = 0; k < 3; k++) begin - m = m + (InmodeZero ? 
0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8])); - end - Mreg <= m; - end - end - - // Stage #3: Accumulate - logic signed [57:0] Preg; - logic Opmode = 0; - if (FIRST && !LAST) begin : genFirst - if (PREG) begin : genPregBehav - always_ff @(posedge clk) begin - if (rst) Preg <= 0; - else if (en) Preg <= Mreg; - end - end - else assign Preg = Mreg; - end - else if (FIRST && LAST) begin : genSingle - always_ff @(posedge clk) begin - if (rst) Opmode <= 0; - else if (en) Opmode <= L[1]; - end - always_ff @(posedge clk) begin - if (rst) Preg <= 0; - else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg; - end - end - else if (!FIRST && LAST) begin : genLast - always_ff @(posedge clk) begin - if (rst) Opmode <= 0; - else if (en) Opmode <= L[1]; - end - always_ff @(posedge clk) begin - if (rst) Preg <= 0; - else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1]; - end - end - else begin : genMid - if (PREG) begin : genPregBehav - always_ff @(posedge clk) begin - if (rst) Preg <= 0; - else if (en) Preg <= Mreg + pcout[i][j-1]; - end - end - else assign Preg = Mreg + pcout[i][j-1]; - end - assign pp = Preg; - assign pcout[i][j] = Preg; - end : genBehav -`ifndef VERILATOR - else begin: genDSP - DSP58 #( - // Feature Control Attributes: Data Path Selection - .AMULTSEL("A"), // Selects A input to multiplier (A, AD) - .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) - .BMULTSEL("B"), // Selects B input to multiplier (AD, B) - .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) - .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for - // legacy mode. 
- .PREADDINSEL("A"), // Selects input to pre-adder (A, B) - .RND(58'h000000000000000), // Rounding Constant - .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) - .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) - .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) - .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) - // Pattern Detector Attributes: Pattern Detection Configuration - .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH - .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). - .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) - .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect - .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 - .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) - .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) - // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins - .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE - .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN - .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK - .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE - .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE - .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 - FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 
0 : PCIN - 2'b01, // Y : M - 2'b01 // X: M - }), // Optional inversion for OPMODE - .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN - .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE - .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA - .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB - .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A - .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC - .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD - .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE - .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM - .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP - // Register Control Attributes: Pipeline Register Configuration - .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) - .ADREG(0), // Pipeline stages for pre-adder (0-1) - .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) - .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) - .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between B/BCIN and BCOUT (0-2) - .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) - .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) - .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) - .CREG(0), // Pipeline stages for C (0-1) - .DREG(0), // Pipeline stages for D (0-1) - .INMODEREG(1), // Pipeline stages for INMODE (0-1) - .MREG(1), // Multiplier pipeline stages (0-1) - .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) - .PREG(PREG), // Number of pipeline stages for P (0-1) - .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). 
- ) - DSP58_inst ( - // Cascade outputs: Cascade Ports - .ACOUT(), // 34-bit output: A port cascade - .BCOUT(), // 24-bit output: B cascade - .CARRYCASCOUT(), // 1-bit output: Cascade carry - .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade - .PCOUT(pcout[i][j]), // 58-bit output: Cascade output - // Control outputs: Control Inputs/Status Bits - .OVERFLOW(), // 1-bit output: Overflow in add/acc - .PATTERNBDETECT(), // 1-bit output: Pattern bar detect - .PATTERNDETECT(), // 1-bit output: Pattern detect - .UNDERFLOW(), // 1-bit output: Underflow in add/acc - // Data outputs: Data Ports - .CARRYOUT(), // 4-bit output: Carry - .P(pp), // 58-bit output: Primary data - .XOROUT(), // 8-bit output: XOR data - // Cascade inputs: Cascade Ports - .ACIN('x), // 34-bit input: A cascade data - .BCIN('x), // 24-bit input: B cascade - .CARRYCASCIN('x), // 1-bit input: Cascade carry - .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade - .PCIN(FIRST ? 'x : pcout[i][j-1]), // 58-bit input: P cascade - // Control inputs: Control Inputs/Status Bits - .ALUMODE(4'h0), // 4-bit input: ALU control - .CARRYINSEL('0), // 3-bit input: Carry select - .CLK(clk), // 1-bit input: Clock - .INMODE({ - INTERNAL_PREGS==2 ? 1'b0 : 1'b1, - 2'b00, - TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, - INTERNAL_PREGS==2 ? 1'b0 : 1'b1 - }), // 5-bit input: INMODE control - .NEGATE('0), // 3-bit input: Negates the input of the multiplier - .OPMODE({ - LAST ? {1'b0, L[1]} : 2'b00, - 7'b000_0000 - }), // 9-bit input: Operation mode - // Data inputs: Data Ports - .A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }), // 34-bit input: A data - .B(b_in_i[i][j]), // 24-bit input: B data - .C('x), // 58-bit input: C data - .CARRYIN('0), // 1-bit input: Carry-in - .D('x), // 27-bit input: D data - // Reset/Clock Enable inputs: Reset/Clock Enable Inputs - .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. - .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG - .CEA2(INTERNAL_PREGS==2 ? 
en : '0), // 1-bit input: Clock enable for 2nd stage AREG - .CEAD('0), // 1-bit input: Clock enable for ADREG - .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE - .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG - .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG - .CEC('0), // 1-bit input: Clock enable for CREG - .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG - .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG - .CED('0), // 1-bit input: Clock enable for DREG - .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG - .CEM(en), // 1-bit input: Clock enable for MREG - .CEP(PREG && en), // 1-bit input: Clock enable for PREG - .RSTA(rst), // 1-bit input: Reset for AREG - .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG - .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG - .RSTB(rst), // 1-bit input: Reset for BREG - .RSTC('0), // 1-bit input: Reset for CREG - .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG - .RSTD('0), // 1-bit input: Reset for DREG and ADREG - .RSTINMODE(rst), // 1-bit input: Reset for INMODE register - .RSTM(rst), // 1-bit input: Reset for MREG - .RSTP(PREG && rst) // 1-bit input: Reset for PREG - ); - end : genDSP -`endif - end : genDSPChain - end : genDSPPE - -endmodule : mvu_vvu_8sx9_dsp58 diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv deleted file mode 100644 index 0168f20563..0000000000 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ /dev/null @@ -1,234 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper. - * @details - * The following compute cores are supported: - * - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, - * (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP, - * [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP, - * 'unconstrained' LUT-based MVU and VVU. - * Folding hints: - * - PE scaling should divide MH. - * - SIMD scaling should divide MW. - * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to - * impact critical paths more than PE scaling. 
PE scaling implies a - * bigger fanout on the input activations. - * - Full unfolding along MH (PE=MH) results in no replay buffer instantiated - *****************************************************************************/ - -module mvu_vvu_axi #( - bit IS_MVU, - parameter COMPUTE_CORE, - int unsigned MW, - int unsigned MH, - int unsigned PE, - int unsigned SIMD, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - int unsigned ACCU_WIDTH, - bit SIGNED_ACTIVATIONS = 0, - int unsigned SEGMENTLEN = 0, - bit FORCE_BEHAVIORAL = 0, - bit M_REG_LUT = 1, - - // Safely deducible parameters - localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, - localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7) / 8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH = SIMD * ACTIVATION_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7) / 8 * 8, - localparam int unsigned OUTPUT_STREAM_WIDTH = PE * ACCU_WIDTH, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7) / 8 * 8, - localparam int unsigned SF = MW / SIMD, - localparam int unsigned NF = MH / PE -) -( - // Global Control - input logic ap_clk, - input logic ap_rst_n, - - // Weight Stream - input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - input logic s_axis_weights_tvalid, - output logic s_axis_weights_tready, - - // Input Stream - input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input logic s_axis_input_tvalid, - output logic s_axis_input_tready, - - // Output Stream - output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, - output logic m_axis_output_tvalid, - input logic m_axis_output_tready -); - -//-------------------- Parameter sanity checks --------------------\\ - initial begin - if (MW % SIMD != 0) begin - $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); - $finish; - end - if (MH % PE != 0) begin - $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, 
PE); - $finish; - end - if (WEIGHT_WIDTH > 8) begin - $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); - $finish; - end - if (ACTIVATION_WIDTH > 8) begin - if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin - $error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH); - $finish; - end - end - if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin - if (SEGMENTLEN == 0) begin - $warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3); - end - if (SEGMENTLEN > (SIMD+2)/3) begin - $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); - $finish; - end - end - end - - uwire clk = ap_clk; - uwire rst = !ap_rst_n; - - //- Replay to Accommodate Neuron Fold ----------------------------------- - typedef logic [PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t; - uwire mvu_flatin_t amvau; - uwire alast; - uwire afin; - uwire avld; - uwire ardy; - - replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvu_flatin_t))) activation_replay ( - .clk, .rst, - .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)), - .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) - ); - - //- Unflatten inputs into structured matrices --------------------------- - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] mvu_w_t; - typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_a_t; - - uwire mvu_w_t mvu_w = s_axis_weights_tdata; - uwire mvu_a_t mvu_a = amvau; - - //- Flow Control Bracket around Compute Core ---------------------------- - uwire en; - uwire istb = avld && s_axis_weights_tvalid; - assign ardy = en && s_axis_weights_tvalid; - assign s_axis_weights_tready = en && avld; - - //- Instantiate compute core ---------------------------- - typedef logic [PE-1:0][ACCU_WIDTH-1:0] dsp_p_t; - uwire dsp_vld; - uwire dsp_p_t dsp_p; - - uwire dsp_clk = ap_clk; - uwire 
dsp_en = en; - uwire dsp_last = alast && avld; - uwire dsp_zero = !istb; - uwire mvu_w_t dsp_w = mvu_w; - uwire mvu_a_t dsp_a = mvu_a; - uwire ovld = dsp_vld; - uwire dsp_p_t odat = dsp_p; - - case(COMPUTE_CORE) - "mvu_vvu_8sx9_dsp58": - mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk(dsp_clk), .rst, .en(dsp_en), - .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), - .vld(dsp_vld), .p(dsp_p) - ); - "mvu_4sx4u": - mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk(dsp_clk), .rst, .en(dsp_en), - .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), - .vld(dsp_vld), .p(dsp_p) - ); - "mvu_8sx8u_dsp48": - mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk(dsp_clk), .rst, .en(dsp_en), - .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), - .vld(dsp_vld), .p(dsp_p) - ); - default: initial begin - $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); - $finish; - end - endcase - -//-------------------- Output register slice --------------------\\ - // Make `en`computation independent from external inputs. - // Drive all outputs from registers. 
- struct packed { - logic rdy; - logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } A = '{ rdy: 1, default: 'x }; // side-step register used when encountering backpressure - struct packed { - logic vld; - logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } B = '{ vld: 0, default: 'x }; // ultimate output register - - assign en = A.rdy; - uwire b_load = !B.vld || m_axis_output_tready; - - always_ff @(posedge clk) begin - if(rst) begin - A <= '{ rdy: 1, default: 'x }; - B <= '{ vld: 0, default: 'x }; - end - else begin - if(A.rdy) A.dat <= odat; - A.rdy <= (A.rdy && !ovld) || b_load; - - if(b_load) begin - B <= '{ - vld: ovld || !A.rdy, - dat: A.rdy? odat : A.dat - }; - end - end - end - assign m_axis_output_tvalid = B.vld; - // Why would we need a sign extension here potentially creating a higher signal load into the next FIFO? - // These extra bits should never be used. Why not 'x them out? - assign m_axis_output_tdata = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat}; - - -endmodule : mvu_vvu_axi diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v deleted file mode 100644 index 01deb23840..0000000000 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ /dev/null @@ -1,92 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Verilog AXI-lite wrapper for MVU & VVU. - *****************************************************************************/ - -module $MODULE_NAME_AXI_WRAPPER$ #( - parameter IS_MVU = $IS_MVU$, - parameter COMPUTE_CORE = "$COMPUTE_CORE$", - parameter MW = $MW$, - parameter MH = $MH$, - parameter PE = $PE$, - parameter SIMD = $SIMD$, - parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, - parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, - parameter ACCU_WIDTH = $ACCU_WIDTH$, - parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, - parameter SEGMENTLEN = $SEGMENTLEN$, - parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, - - // Safely deducible parameters - parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - parameter INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 
1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, - parameter OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 -)( - // Global Control - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) - (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) - input ap_clk, - (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) - input ap_rst_n, - - // Weight Stream - input [WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, - input weights_V_TVALID, - output weights_V_TREADY, - // Input Stream - input [INPUT_STREAM_WIDTH_BA-1:0] in0_V_TDATA, - input in0_V_TVALID, - output in0_V_TREADY, - // Output Stream - output [OUTPUT_STREAM_WIDTH_BA-1:0] out_V_TDATA, - output out_V_TVALID, - input out_V_TREADY -); - -mvu_vvu_axi #( - .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) - ) inst ( - .ap_clk(ap_clk), - .ap_rst_n(ap_rst_n), - .s_axis_weights_tdata(weights_V_TDATA), - .s_axis_weights_tvalid(weights_V_TVALID), - .s_axis_weights_tready(weights_V_TREADY), - .s_axis_input_tdata(in0_V_TDATA), - .s_axis_input_tvalid(in0_V_TVALID), - .s_axis_input_tready(in0_V_TREADY), - .m_axis_output_tdata(out_V_TDATA), - .m_axis_output_tvalid(out_V_TVALID), - .m_axis_output_tready(out_V_TREADY) -); - -endmodule // $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv deleted file mode 100644 index 3e2766f63d..0000000000 --- a/finn-rtllib/mvu/replay_buffer.sv +++ /dev/null @@ -1,181 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Replay buffer for counted sequences on an AXI-lite stream. - * @author Thomas B. 
Preußer - *****************************************************************************/ - -module replay_buffer #( - int unsigned LEN, // Sequence length - int unsigned REP, // Sequence replay count - int unsigned W // Data width -)( - input logic clk, - input logic rst, - - input logic [W-1:0] idat, - input logic ivld, - output logic irdy, - - output logic [W-1:0] odat, - output logic olast, - output logic ofin, - output logic ovld, - input logic ordy -); - - if(LEN == 0) initial begin - $error("%m: Illegal zero sequence LEN."); - $finish; - end - if(REP == 0) initial begin - $error("%m: Illegal zero REP count."); - $finish; - end - - // Track position in Sequence - uwire last_item; - uwire shift; - if(LEN == 1) assign last_item = 1; - else begin - typedef logic [$clog2(LEN)-1:0] count_t; - count_t Count = 0; - logic Last = 0; - always_ff @(posedge clk) begin - if(rst) begin - Count <= 0; - Last <= 0; - end - else if(shift) begin - Count <= Count + (Last? 2**$clog2(LEN)-LEN+1 : 1); - Last <= (((LEN-2) & ~Count) == 0) && ((LEN&1) || !Last); - end - end - assign last_item = Last; - end - - if(REP == 1) begin - assign shift = ivld && ordy; - - assign irdy = ordy; - assign odat = idat; - assign olast = last_item; - assign ofin = last_item; - assign ovld = ivld; - end - else begin - - // Track Repetitions - uwire last_rep; - if(1) begin : blkRep - typedef logic [$clog2(REP)-1:0] rep_t; - rep_t RepCnt = 0; - logic RepLst = 0; - always_ff @(posedge clk) begin - if(rst) begin - RepCnt <= 0; - RepLst <= 0; - end - else if(last_item && shift) begin - RepCnt <= RepCnt + (RepLst? 2**$clog2(REP)-REP+1 : 1); - RepLst <= (((REP-2) & ~RepCnt) == 0) && ((REP&1) || !RepLst); - end - end - assign last_rep = RepLst; - end : blkRep - - localparam int unsigned AWIDTH = LEN < 2? 
1 : $clog2(LEN); - typedef logic [AWIDTH :0] ptr_t; // pointers with additional generational MSB - typedef logic [W -1:0] data_t; - - // Output Registers - data_t ODat; - logic OVld = 0; - logic OLst = 'x; - logic OFin = 'x; - assign odat = ODat; - assign olast = OLst; - assign ofin = OFin; - assign ovld = OVld; - - // Buffer Memory Management - data_t Mem[2**AWIDTH]; - ptr_t WP = 0; // Write Pointer - ptr_t RP = 0; // Read Pointer - ptr_t FP = 0; // Free Pointer - - // Operational Guards - // Occupancy: WP-FP - // WP-FP < 2**AWIDTH -> writing allowed - // - increments WP - // Availability: WP-RP - // WP-RP > 0 -> reading allowed - // - increments RP, last in sequence rewinds to FP for non-final repetition - // - increments FP in last repetition - assign irdy = !((WP-FP) >> AWIDTH); - - uwire wr = irdy && ivld; - uwire rd = !OVld || ordy; - always_ff @(posedge clk) begin - if(wr) Mem[WP[AWIDTH-1:0]] <= idat; - if(rd) ODat <= Mem[RP[AWIDTH-1:0]]; - end - - uwire vld = (RP != WP); - assign shift = rd && vld; - always_ff @(posedge clk) begin - if(rst) begin - WP <= 0; - RP <= 0; - FP <= 0; - - OVld <= 0; - OLst <= 'x; - OFin <= 'x; - end - else begin - if(wr) WP <= WP + 1; - if(rd) begin - if(vld) begin - automatic logic rewind = last_item && !last_rep; - RP <= RP + (rewind? 2**(AWIDTH+1)-LEN+1 : 1); - FP <= FP + last_rep; - end - - OVld <= vld; - OLst <= last_item; - OFin <= last_rep && last_item; - end - end - end - - end - -endmodule : replay_buffer diff --git a/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv deleted file mode 100644 index c8bfe5370a..0000000000 --- a/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv +++ /dev/null @@ -1,165 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Testbench for MVU core compute kernel. 
- *****************************************************************************/ - -module mvu_8sx9_tb(); - -//-------------------- Simulation parameters --------------------\\ - // Matrix & parallelism config - localparam int unsigned MH = 256; - localparam int unsigned PE = 16; - localparam int unsigned MW = 600; - localparam int unsigned SIMD = 60; - localparam int unsigned SEGMENTLEN = 4; - // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int unsigned WEIGHT_WIDTH = 4; - localparam bit SIGNED_ACTIVATIONS = 1; - // Simulation constants - localparam int unsigned NF = MH/PE; - localparam int unsigned SF = MW/SIMD; - localparam int unsigned NUM_OF_DSP = SIMD/3; - - typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[SF]; - - function activation_vector_t init_ACTIVATIONS; - automatic activation_vector_t res; - std::randomize(res); - return res; - endfunction : init_ACTIVATIONS - - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; - - function weight_matrix_t init_WEIGHTS; - automatic weight_matrix_t res; - std::randomize(res); - return res; - endfunction : init_WEIGHTS; - - typedef logic signed [PE-1:0][57:0] output_t; - typedef output_t output_vector_t [NF]; - - function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); - automatic output_vector_t res = '{default: 0}; - for (int j = 0; j 1) && !rst; - end - - // Compare computed output against golden output when vld flag is raised by DUT - always_ff @(posedge clk iff (vld && en)) begin - foreach(p[i]) begin - assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin - $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - $stop; - end - end - NF_CNT += 1; - end - - // Instantiate DUT - mvu_8sx9 #( - .PE(PE), - .SIMD(SIMD), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .SEGMENTLEN(SEGMENTLEN) - ) - dut ( - .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p - ); - -endmodule : mvu_8sx9_tb diff --git a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv deleted file mode 100644 index b46fc588c9..0000000000 --- a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv +++ /dev/null @@ -1,241 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Testbench for MVU AXI-lite interface wrapper. - *****************************************************************************/ - -module mvu_vvu_axi_tb(); - -//-------------------- Simulation parameters --------------------\\ - // Matrix & parallelism config - localparam bit IS_MVU = 0; - localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58"; - localparam int unsigned MW = 36; - localparam int unsigned MH = 1; - localparam int unsigned SIMD = 3; - localparam int unsigned PE = 4; - localparam int unsigned SEGMENTLEN = 1.0; - localparam bit FORCE_BEHAVIORAL = 1; - localparam bit M_REG_LUT = 1; - // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int unsigned WEIGHT_WIDTH = 6; - localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); - localparam bit SIGNED_ACTIVATIONS = 1; - // Simulation constants - localparam int unsigned NF = IS_MVU ? MH/PE : 1; - localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE); - localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; - localparam int unsigned ACTIVATION_WIDTH_BA = ((IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH+7)/8*8; - localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; - localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - (IS_MVU ? 
1 : PE)*SIMD*ACTIVATION_WIDTH; - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; - - // Generate clk and reset signal - logic clk = 0; - always #5ns clk = !clk; - - logic ap_rst_n = 0; - initial begin - repeat(16) @(posedge clk); - ap_rst_n <= 1; - end - - uwire ap_clk = clk; - - // Generate activations - typedef logic [(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[SF]; - - function activation_vector_t init_ACTIVATIONS; - automatic activation_vector_t res; - std::randomize(res); - return res; - endfunction : init_ACTIVATIONS - - activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); - - struct { - activation_t dat; - logic vld; - logic rdy; - } activations; - - initial begin - activations.vld = 0; - activations.dat = 'X; - @(posedge clk iff ap_rst_n); - - for (int i=0; i= 0; - @(posedge clk); - end while (!(activations.vld === 1 && activations.rdy === 1)); - end - - activations.vld <= 0; - activations.dat <= 'x; - end - - // Generate weights - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; - - function weight_matrix_t init_WEIGHTS; - automatic weight_matrix_t res; - std::randomize(res); - return res; - endfunction : init_WEIGHTS; - - weight_matrix_t WEIGHTS = init_WEIGHTS(); - - struct { - weight_t dat; - logic vld; - logic rdy; - } weights; - - initial begin - weights.vld = 0; - weights.dat = 'X; - @(posedge clk iff ap_rst_n); - - weights.vld <= 1; - for (int i=0; i 1 ? $signed(a[i/SIMD/PE][i % (SIMD*PE)]) : $signed(a[i/SIMD/PE][(i)%(SIMD*PE)]) ) * $signed(w[0][i/SIMD/PE][i/PE][i%SIMD]); - // else - // res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : - // $signed(res[j/PE][j%PE]) + ( PE > 1 ? 
$signed({1'b0, a[i/SIMD/PE][i % (SIMD*PE)]}) : $signed({1'b0, a[i/SIMD/PE][i%(SIMD*PE)]}) ) * $signed(w[0][i/SIMD][0][i%SIMD]); - // end - // end - // The input stream will have the channels interleaved for VVU when PE>1 - // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] - // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: - // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to - // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) - for (int i = 0; i < NF; i++) begin - for (int j = 0; j < SF; j++) begin - for (int k = 0; k < PE; k++) begin - for (int l = 0; l < SIMD; l++) begin - if (SIGNED_ACTIVATIONS) - res[i][k] = IS_MVU ? $signed(res[i][k]) + $signed(a[j][l]) * $signed(w[i][j][k][l]) : - $signed(res[i][k]) + $signed(a[j][k + l*PE]) * $signed(w[i][j][k][l]); - else - res[i][k] = IS_MVU ? $signed(res[i][k]) + $signed({1'b0, a[j][l]}) * $signed(w[i][j][k][l]) : - $signed(res[i][k]) + $signed({1'b0, a[j][k + l*PE]}) * $signed(w[i][j][k][l]); - end - end - end - end - return res; - endfunction : check_output; - - output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS); - - int unsigned NF_CNT = 0; - initial begin - outputs.rdy = 0; - while (NF_CNT < NF) begin - // Loop until both rdy & vld are asserted - do begin - outputs.rdy <= $urandom()%7 >= 0; - @(posedge clk iff ap_rst_n); - end while (!(outputs.rdy === 1 && outputs.vld === 1)); - - // Compare produced outputs against golden outputs - foreach(outputs.dat[i]) begin - assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin - $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - $stop; - end - end - - NF_CNT += 1; - end - - $finish; - end - - // Instantiate DUT - mvu_vvu_axi #( - .IS_MVU(IS_MVU), - .COMPUTE_CORE(COMPUTE_CORE), - .MW(MW), - .MH(MH), - .PE(PE), - .SIMD(SIMD), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), - .M_REG_LUT(M_REG_LUT) - ) - dut ( - .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), - .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), - .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), - .m_axis_output_tready(outputs.rdy) - ); - -endmodule : mvu_vvu_axi_tb diff --git a/finn-rtllib/mvu/tb/replay_buffer_tb.sv b/finn-rtllib/mvu/tb/replay_buffer_tb.sv deleted file mode 100644 index 5581354e0e..0000000000 --- a/finn-rtllib/mvu/tb/replay_buffer_tb.sv +++ /dev/null @@ -1,130 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2023, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Testbench for replay_buffer module. - * @author Thomas B. 
Preußer - *****************************************************************************/ - -module replay_buffer_tb; - - // Global Control - logic clk = 0; - always #5ns clk = !clk; - uwire rst = 0; - - // DUT Geometries - localparam int unsigned DIMS[3] = '{ 7, 8, 10 }; - localparam int unsigned W = 8; - typedef logic [W-1:0] data_t; - - bit [2**$size(DIMS)-1:0] done = 0; - always_comb begin - if(&done) begin - $display("Test completed."); - $finish; - end - end - - // Parallel DUT Instantiations - for(genvar r = 0; r < $size(DIMS); r++) begin - for(genvar l = 0; l < $size(DIMS); l++) begin - localparam int unsigned REP = DIMS[r]; - localparam int unsigned LEN = DIMS[l]; - - data_t idat; - logic ivld; - uwire irdy; - - uwire data_t odat; - uwire olast; - uwire ofin; - uwire ovld; - logic ordy; - - replay_buffer #(.LEN(LEN), .REP(REP), .W(W)) dut ( - .clk, .rst, - .idat, .ivld, .irdy, - .odat, .olast, .ofin, .ovld, .ordy - ); - - // Input Feed: 0, 1, ..., 10*LEN-1 - initial begin - idat = 'x; - ivld = 0; - @(posedge clk iff !rst); - - for(int unsigned i = 0; i < 10*LEN; i++) begin - idat <= i; - ivld <= 1; - @(posedge clk iff irdy); - idat <= 'x; - ivld <= 0; - while($urandom()%(REP-1) != 0) @(posedge clk); - end - end - - // Output Check - initial begin - automatic int unsigned base = 0; - - ordy = 0; - @(posedge clk iff !rst); - - for(int unsigned k = 0; k < 10; k++) begin - for(int unsigned j = 0; j < REP; j++) begin - for(int unsigned i = 0; i < LEN; i++) begin - ordy <= 1; - @(posedge clk iff ovld); - assert(odat == base+i) else begin - $error("#%0d.%0d: Data mismatch: %0d instead of %0d.", r, l, odat, base+i); - $stop; - end - assert(olast == (i == LEN-1)) else begin - $error("#%0d.%0d: Last mismatch.", r, l); - $stop; - end - assert(ofin == ((i == LEN-1) && (j == REP-1))) else begin - $error("#%0d.%0d: Fin mismatch.", r, l); - $stop; - end - - ordy <= 0; - while($urandom()%13 == 0) @(posedge clk); - end - end - base += LEN; - end - - done[$size(DIMS)*r + l] 
<= 1; - end - end - end - -endmodule : replay_buffer_tb From 472ce110b262d6ab5a6937a6ca82c9ab1bf69e45 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 26 Jan 2024 12:46:06 +0000 Subject: [PATCH 104/112] [test mvau]: modified to support new custom-ops --- tests/fpgadataflow/test_fpgadataflow_mvau.py | 120 +++++++++++++++++-- 1 file changed, 113 insertions(+), 7 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index b80ef76a19..bd283855e3 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -52,6 +52,9 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from qonnx.transformation.general import ApplyConfig, GiveUniqueNodeNames, GiveReadableTensorNames +from qonnx.transformation.infer_shapes import InferShapes +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=None): @@ -135,6 +138,87 @@ def prepare_inputs(input_tensor, idt, wdt): return {"inp": input_tensor} +# activation: None or DataType +@pytest.mark.parametrize("act", [None, DataType["BIPOLAR"], DataType["INT4"]]) +# weight datatype +@pytest.mark.parametrize("wdt", [DataType["BIPOLAR"], DataType["INT4"]]) +# input datatype +@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT4"]]) +# neuron folding, -1 is maximum possible +@pytest.mark.parametrize("nf", [-1, 2, 1]) +# synapse folding, -1 is maximum possible +@pytest.mark.parametrize("sf", [-1, 2, 1]) +# HLS matrix width (input features) +@pytest.mark.parametrize("mw", [16]) +# HLS matrix height (output features) +@pytest.mark.parametrize("mh", [16]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def 
test_fpgadataflow_fclayer_hwop(idt, wdt, act, nf, sf, mw, mh): + if nf == -1: + nf = mh + if sf == -1: + sf = mw + pe = mh // nf + simd = mw // sf + assert mh % pe == 0 + assert mw % sf == 0 + # generate weights + W = gen_finn_dt_tensor(wdt, (mw, mh)) + # generate input data + x = gen_finn_dt_tensor(idt, (1, mw)) + if act is None: + # no activation, produce accumulators + T = None + tdt = None + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + odt = DataType["UINT32"] + else: + odt = DataType["INT32"] + else: + odt = act + (min, max) = calculate_signed_dot_prod_range(idt, wdt, mw) + n_steps = act.get_num_possible_values() - 1 + T = np.random.randint(min, max - 1, (mh, n_steps)).astype(np.float32) + # provide non-decreasing thresholds + T = np.sort(T, axis=1) + # generate thresholds for activation + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + tdt = DataType["UINT32"] + # bias thresholds to be positive + T = np.ceil((T + mw) / 2) + assert (T >= 0).all() + else: + tdt = DataType["INT32"] + model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt) + # prepare input data + input_dict = prepare_inputs(x, idt, wdt) + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + # convert inputs to binary and use xnorpopcountmatmul + y = xp.xnorpopcountmatmul((x + 1) / 2, (W + 1) / 2) + else: + y = np.matmul(x, W) + if T is not None: + # y = multithreshold(y, T) + if act == DataType["BIPOLAR"]: + # binary to bipolar + # y = 2 * y - 1 + y = multithreshold(y, T, 2, -1) + else: + # signed offset + # y += act.min() + y = multithreshold(y, T, 1, act.min()) + oshape = model.get_tensor_shape("outp") + y_expected = y.reshape(oshape) + # execute model + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + + y_produced = y_produced.reshape(y_expected.shape) + + assert (y_produced == y_expected).all(), "cppsim hw-op failed" + + # mem_mode: const or decoupled @pytest.mark.parametrize("mem_mode", ["const", "decoupled", 
"external"]) # activation: None or DataType @@ -154,7 +238,9 @@ def prepare_inputs(input_tensor, idt, wdt): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): +def test_fpgadataflow_fclayer_hlsop_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): + if idt == DataType["BIPOLAR"] and wdt != DataType["BIPOLAR"] or idt != DataType["BIPOLAR"] and wdt == DataType["BIPOLAR"]: + pytest.skip("Bipolar activations/weights only supported in MVU if both operands are bipolar") if nf == -1: nf = mh if sf == -1: @@ -195,6 +281,8 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) + inst.set_nodeattr("preferred_impl_style", "hls") + model = model.transform(SpecializeLayers()) model = model.transform(SetExecMode("cppsim")) model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) @@ -220,7 +308,7 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): y_produced = y_produced.reshape(y_expected.shape) - assert (y_produced == y_expected).all(), "cppsim failed" + assert (y_produced == y_expected).all(), "cppsim hls-op failed" # mem_mode: const or decoupled @@ -239,10 +327,14 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @pytest.mark.parametrize("mw", [16]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [16]) +# Backend +@pytest.mark.parametrize("backend", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): +def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh, backend): + if backend == "rtl" and act is not None: + pytest.skip("RTL MVU doesn't support embedded thresholding functionality.") if nf == -1: nf = mh if sf == -1: @@ 
-283,6 +375,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) + inst.set_nodeattr("preferred_impl_style", backend) # prepare input data input_dict = prepare_inputs(x, idt, wdt) @@ -303,6 +396,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): y_expected = y.reshape(oshape) # TODO split up into several dependent tests -- need to check how this # works for parametrized tests... + model = model.transform(SpecializeLayers()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) @@ -312,7 +406,10 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "MatrixVectorActivation_0" in hls_synt_res_est + if backend == "hls": + assert "MatrixVectorActivation_hls_0" in hls_synt_res_est + else: + assert "MatrixVectorActivation_rtl_0" in hls_synt_res_est node = model.get_nodes_by_op_type("MatrixVectorActivation")[0] inst = getCustomOp(node) @@ -339,10 +436,12 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @pytest.mark.parametrize("mw", [128]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [128]) +# Backend +@pytest.mark.parametrize("backend", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( - mem_mode, idt, wdt, act, nf, sf, mw, mh + mem_mode, idt, wdt, act, nf, sf, mw, mh, backend ): if nf == -1: nf = mh @@ -404,6 +503,7 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( y_expected = y.reshape(oshape) # TODO split up into several dependent tests -- need to check how this # works for 
parametrized tests... + model = model.transform(SpecializeLayers()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) @@ -413,7 +513,10 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "MatrixVectorActivation_0" in hls_synt_res_est + if backend == "hls": + assert "MatrixVectorActivation_hls_0" in hls_synt_res_est + else: + assert "MatrixVectorActivation_rtl_0" in hls_synt_res_est node = model.get_nodes_by_op_type("MatrixVectorActivation")[0] inst = getCustomOp(node) @@ -440,9 +543,11 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( @pytest.mark.parametrize("mw", [32]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [32]) +# Backend +@pytest.mark.parametrize("backend", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): +def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh, backend): if nf == -1: nf = mh if sf == -1: @@ -469,6 +574,7 @@ def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh inst.set_nodeattr("mem_mode", mem_mode) total_fold = nf * sf exp_total_cycles = total_fold + 10 + model = model.transform(SpecializeLayers()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) From 31083d52da1abfe61a732199ddb70e5bf9e9f4b6 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 26 Jan 2024 19:47:17 +0000 Subject: [PATCH 105/112] removed rtl refactoring steps --- src/finn/builder/build_dataflow_config.py | 3 +-- src/finn/builder/build_dataflow_steps.py | 11 ----------- 2 files changed, 1 insertion(+), 13 deletions(-) 
diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 073bc9e12b..e4fed05731 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -121,7 +121,6 @@ class VerificationStepType(str, Enum): "step_apply_folding_config", "step_minimize_bit_width", "step_generate_estimate_reports", - "step_specialize_to_rtl", "step_hls_codegen", "step_hls_ipgen", "step_set_fifo_depths", @@ -234,7 +233,7 @@ class DataflowBuildConfig: #: activations in FINN) will be implemented as stand-alone HLS layers, #: instead of being part of MatrixVectorActivation layer. This gives larger #: flexibility, and makes it possible to have runtime-writable thresholds. - standalone_thresholds: Optional[bool] = True + standalone_thresholds: Optional[bool] = False #: (Optional) Whether optimizations that minimize the bit width of the #: weights and accumulator will be applied. Because this optimization relies diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 2629efef11..11107ccb64 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -53,7 +53,6 @@ from shutil import copy import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls -import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl import finn.transformation.streamline.absorb as absorb from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -473,15 +472,6 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig return model -def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig): - """Convert layers implemented in HLS to an equivalent specialized RTL - implementation if possible.""" - specialize_to_rtl_transforms = [to_rtl.InferRTLMatrixVectorActivation()] - for trn in 
specialize_to_rtl_transforms: - model = model.transform(trn) - return model - - def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig): """Tighten the weight and accumulator bit widths for each layer.""" if cfg.minimize_bit_width: @@ -844,7 +834,6 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig): "step_apply_folding_config": step_apply_folding_config, "step_minimize_bit_width": step_minimize_bit_width, "step_generate_estimate_reports": step_generate_estimate_reports, - "step_specialize_to_rtl": step_specialize_to_rtl, "step_hls_codegen": step_hls_codegen, "step_hls_ipgen": step_hls_ipgen, "step_set_fifo_depths": step_set_fifo_depths, From 0032743e605651ce4a705297115c4cabf104b45d Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 26 Jan 2024 19:48:16 +0000 Subject: [PATCH 106/112] removed old rtl custom-op --- .../matrixvectoractivation_rtl.py | 1086 ----------------- 1 file changed, 1086 deletions(-) delete mode 100644 src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py deleted file mode 100644 index fcab06658c..0000000000 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ /dev/null @@ -1,1086 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import math -import numpy as np -import os -import textwrap -import warnings -from qonnx.core.datatype import DataType -from qonnx.util.basic import ( - calculate_matvec_accumulator_range, - interleave_matrix_outer_dim_from_partitions, - roundup_to_integer_multiple, -) - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir -from finn.util.data_packing import ( - npy_to_rtlsim_input, - pack_innermost_dim_as_hex_string, - rtlsim_output_to_npy, -) - -try: - from pyverilator import PyVerilator -except ModuleNotFoundError: - PyVerilator = None - - -# ONNX i/o tensor shape assumptions for MatrixVectorActivation: -# input 0 is the input tensor, shape (.., i_size) = (..., MW) -# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH) -# (optional) input 2 is the thresholds tensor, shape (o_size, n_thres) -# output 0 is the output tensor, shape (.., o_size) = (..., MH) -# the ... 
here can be any shape (representing groups of vectors) - - -class MatrixVectorActivation_rtl(HLSCustomOp): - """Class that corresponds to finn-rtl Matrix Vector Unit.""" - - def __init__(self, onnx_node, **kwargs): - super().__init__(onnx_node, **kwargs) - - def get_nodeattr_types(self): - my_attrs = { - "PE": ("i", True, 0), - "SIMD": ("i", True, 0), - "MW": ("i", True, 0), - "MH": ("i", True, 0), - "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}), - # FINN DataTypes for inputs, weights, outputs - "inputDataType": ("s", True, ""), - "weightDataType": ("s", True, ""), - "outputDataType": ("s", True, ""), - # FINN DataType for accumulator -- auto-computed and updated - "accDataType": ("s", False, "INT32"), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - # memory mode for the FC weights - # const -- embedded weights, default, long compile/synth times - # decoupled -- streaming weights with weight streamer packaged inside IP - # external -- streaming weights with external streamer - "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}), - # FPGA resource type for memories in decoupled mode - # auto -- let Vivado decide - # block -- use BRAM - # distributed -- use LUTRAM - # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1 - # see also https://www.xilinx.com/support/answers/38070.html - "ram_style": ( - "s", - False, - "auto", - {"auto", "block", "distributed", "ultra"}, - ), - # (mem_mode = decoupled only) whether weights will be writable through - # an AXI-lite interface during runtime - # 1 for enabled, 0 for disabled. 
- # see finn-rtllib/memstream/doc/README for more about the memory - # address map used for writable weights - # IMPORTANT: After using AXI lite to either read or write the weights, - # always "flush" the accelerator by first passing a dummy input - # vector through the accelerator. This will get rid of any old - # weight data from the weight FIFOs. - "runtime_writeable_weights": ("i", False, 0, {0, 1}), - # attribute to save top module name - not user configurable - "gen_top_module": ("s", False, ""), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def calc_wmem(self): - """Calculates and returns WMEM.""" - mw = self.get_nodeattr("MW") - mh = self.get_nodeattr("MH") - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - assert mh % pe == 0, "Requirement MH divisable by PE is violated." - assert mw % simd == 0, "Requirement MW divisable by SIMD is violated." - wmem = mw * mh // (pe * simd) - return wmem - - def calc_tmem(self): - """Calculates and returns TMEM.""" - return 0 - - def make_shape_compatible_op(self, model): - oshape = self.get_normal_output_shape() - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - # set output datatype from property - odt = self.get_output_datatype() - model.set_tensor_datatype(node.output[0], odt) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify that all necessary 
attributes exist - # TODO collect automatically from get_nodeattr_types - try: - self.get_nodeattr("executable_path") - self.get_nodeattr("resType") - self.get_nodeattr("MW") - self.get_nodeattr("MH") - self.get_nodeattr("SIMD") - self.get_nodeattr("PE") - self.get_nodeattr("inputDataType") - self.get_nodeattr("weightDataType") - self.get_nodeattr("outputDataType") - info_messages.append("All necessary attributes exist") - except Exception: - info_messages.append( - """The required MatrixVectorActivation attributes do not exist.""" - ) - - num_of_inputs = len(self.onnx_node.input) - if num_of_inputs != 2: - info_messages.append( - "RTL-based MatrixVectorActivation expects two inputs " - "(weights and activation), but got {} inputs.".format( - len(self.onnx_node.input) - ) - ) - - mem_mode = self.get_nodeattr("mem_mode") - - if mem_mode not in ["decoupled", "external"]: - info_messages.append( - "RTL-based MVU only supports decoupled or external weights." - ) - - if self.get_nodeattr("resType") == "lut": - info_message.append( - "RTL-based MVU only supports DSP-based implementation" - ) - - return info_messages - - def uram_estimation(self): - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - D_in = self.get_nodeattr("MW") - D_out = self.get_nodeattr("MH") - omega = (D_in * D_out) / (Q * P) - mem_width = Q * W * P - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if ( - (mmode == "decoupled" and mstyle != "ultra") - or (mmode == "external") - ): - return 0 - width_multiplier = math.ceil(mem_width / 72) - depth_multiplier = math.ceil(omega / 4096) - return width_multiplier * depth_multiplier - - def bram_estimation(self): - """Calculates resource estimation for BRAM based on: - - FINN-R: An End-to-End Deep-Learning Framework for Fast - Exploration of Quantized Neural Networks - - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, - Y. Umuroglu, M. 
Leeser and K. Vissers - - 12. Sep 2018 - """ - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - D_in = self.get_nodeattr("MW") - D_out = self.get_nodeattr("MH") - omega = (D_in * D_out) / (Q * P) - mem_width = Q * W * P - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if ( - (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) - or (mmode == "external") - ): - return 0 - # assuming SDP mode RAMB18s (see UG573 Table 1-10) - # assuming decoupled (RTL) memory - if mem_width == 1: - return math.ceil(omega / 16384) - elif mem_width == 2: - return math.ceil(omega / 8192) - elif mem_width <= 4: - return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) - elif mem_width <= 9: - return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 9)) - elif mem_width <= 18 or omega > 512: - return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 18)) - else: - return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36)) - - def bram_efficiency_estimation(self): - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - D_in = self.get_nodeattr("MW") - D_out = self.get_nodeattr("MH") - bram16_est = self.bram_estimation() - if bram16_est == 0: - return 1 - wbits = W * D_in * D_out - bram16_est_capacity = bram16_est * 36 * 512 - return wbits / bram16_est_capacity - - def uram_efficiency_estimation(self): - """Function for URAM efficiency estimation: actual parameter storage - needed divided by the allocated URAM storage (from estimation)""" - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - D_in = self.get_nodeattr("MW") - D_out = self.get_nodeattr("MH") - uram_est = self.uram_estimation() - if uram_est == 0: - return 1 - wbits = W * D_in * D_out - uram_est_capacity = uram_est * 72 * 4096 - return wbits / uram_est_capacity - -# TODO: fix lut estimations - def lut_estimation(self): - """Calculates resource estimations for 
LUTs based on: - - FINN-R: An End-to-End Deep-Learning Framework for Fast - Exploration of Quantized Neural Networks - - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, - Y. Umuroglu, M. Leeser and K. Vissers - - 12. Sep 2018 - """ - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - MW = self.get_nodeattr("MW") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - # determine tdt with input and weight data types - idt = self.get_input_datatype() - A = idt.bitwidth() - # parameters from experiments in paper mentioned above - c0 = 300 - c1 = 1.1 - c2 = 0 - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if (mmode == "decoupled" and mstyle == "distributed") or ( - mmode == "const" and self.calc_wmem() <= 128 - ): - c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) - - # multiplication - res_type = self.get_nodeattr("resType") - if res_type == "dsp": - mult_luts = 0 - else: - mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) - # adder tree - addertree_luts = (W + A) * (2 * Q - 1) - # accumulator - acc_bits = W + A + np.ceil(math.log(MW, 2)) - acc_luts = acc_bits - - return int(c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2) - -# TODO: fix DSP estimations --> depends on fpga_part - def dsp_estimation(self): - # multiplication - # mvu_8sx9 (DSP58): ceil(SIMD/3) - # mvu_4sx4u (DSP48/DSP58): ceil(PE/4) - # mvu_8sx8u (DSP48): ceil(PE/2) - # mvu_lut: 0 - P = self.get_nodeattr("PE") - res_type = self.get_nodeattr("resType") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - idt = self.get_input_datatype() - A = idt.bitwidth() - if res_type == "dsp": - mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling - else: - mult_dsp = 0 - return int(mult_dsp) - -# TODO: fix exp_cycles estimations --> depends on fpga_part and clk - def get_exp_cycles(self): - # mvu_8sx9 (DSP58): - # 2 
(replay_buffer) + ceil(chainlen/seglen) + 2 (MREG, PREG) + 2 (output reg slice) - # + MW/SIMD * MH/PE - # mvu_4sx4u (DSP48/DSP58) / mvu_8sx8u (DSP48): - # 3 (IN_REG, MREG, PREG) + 2 (replay_buffer) + 2 (output reg slice) + 1 (adder tree SIMD) + 1 (output lane) - # + MW/SIMD * MH/PE - # mvu_lut: - # 2 (replay_buffer) + 1 OR 2 (no MREG OR MREG) + 2 (output reg slice) - # + MW/SIMD * MH/PE - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - num_inp_vec = self.get_nodeattr("numInputVectors") - mh = self.get_nodeattr("MH") - mw = self.get_nodeattr("MW") - # since mmv != 1 is not supported yet, we set mmv for now to 1 - mmv = 1 - exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv - return int(exp_cycles) - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - # when performing FIFO insertion on an FC layer with ext weights, the ind - # parameter can be > 0 (referring to the weights) so handle that here - if ind == 0: - return DataType[self.get_nodeattr("inputDataType")] - elif ind == 1: - return DataType[self.get_nodeattr("weightDataType")] - else: - raise Exception("Undefined input ind for this layer type") - - def get_weight_datatype(self): - """Returns FINN DataType of weights.""" - return DataType[self.get_nodeattr("weightDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("outputDataType")] - - def get_instream_width(self, ind=0): - i_bits = self.get_input_datatype().bitwidth() - assert ( - i_bits <= 9 - ), "RTL-based MVAU only supports activations with bit-width up to 9-bits" - in_width = i_bits * self.get_nodeattr("SIMD") - return in_width - - def get_outstream_width(self, ind=0): - o_bits = self.get_output_datatype().bitwidth() - out_width = o_bits * self.get_nodeattr("PE") - return out_width - - def get_weightstream_width(self): - """Returns weight stream width. 
Used only in decoupled mode.""" - if ( - self.get_nodeattr("mem_mode") == "decoupled" - or self.get_nodeattr("mem_mode") == "external" - ): - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - wp = self.get_weight_datatype().bitwidth() - assert ( - wp <= 8 - ), "RTL-based MVAU only supports weights with bit-width up to 8-bits" - w_width = pe * simd * wp - return w_width - else: - return 0 - - def get_weightstream_width_padded(self): - """Returns weight stream width padded to a multiple of 8. This is required - by the AXI Stream spec. Used in decoupled mode.""" - weight_width = self.get_weightstream_width() - return roundup_to_integer_multiple(weight_width, 8) - - def get_ap_int_max_w(self): - # base class impl (max of inp/out stream widths) - max_of_io = super().get_ap_int_max_w() - # decoupled mode weight stream - weightstream = self.get_weightstream_width() - # single PE weight entry - weight_bits = self.get_weight_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - single_pe_w = simd * weight_bits - return max([weightstream, max_of_io, single_pe_w]) - - def get_folded_input_shape(self, ind=0): - mw = self.get_nodeattr("MW") - mh = self.get_nodeattr("MH") - simd = self.get_nodeattr("SIMD") - pe = self.get_nodeattr("PE") - sf = mw // simd - nf = mh // pe - vecs = list(self.get_nodeattr("numInputVectors")) - - if ind == 0: - # calculate shape of input 0 - folded_input_shape = tuple(vecs + [sf, simd]) - elif ind == 1 and self.get_nodeattr("mem_mode") == "external": - # calculate shape of input 1 (weights) - folded_input_shape = tuple(vecs + [sf * nf, simd * pe]) - else: - raise Exception("Undefined input shape for requested input") - - return folded_input_shape - - def get_folded_output_shape(self, ind=0): - mh = self.get_nodeattr("MH") - pe = self.get_nodeattr("PE") - nf = mh // pe - vecs = list(self.get_nodeattr("numInputVectors")) - folded_output_shape = tuple(vecs + [nf, pe]) - return folded_output_shape - - def get_normal_input_shape(self, 
ind=0): - mw = self.get_nodeattr("MW") - vecs = list(self.get_nodeattr("numInputVectors")) - normal_input_shape = tuple(vecs + [mw]) - return normal_input_shape - - def get_normal_output_shape(self, ind=0): - mh = self.get_nodeattr("MH") - vecs = list(self.get_nodeattr("numInputVectors")) - normal_output_shape = tuple(vecs + [mh]) - return normal_output_shape - - def get_number_output_values(self): - nf = np.prod(self.get_folded_output_shape()[:-1]) - return nf - - def get_hls_compatible_weight_tensor(self, orig_weight_matrix): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure MH % PE == 0 and MW % SIMD == 0 - * for bipolar {-1,+1} weights, convert to binary {0, 1} - * interleave rows between PEs - * reshape into (1, PE, WMEM, SIMD) and return - """ - mw = self.get_nodeattr("MW") - mh = self.get_nodeattr("MH") - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - wmem = self.calc_wmem() - assert orig_weight_matrix.shape == ( - mw, - mh, - ), """Weights matrix doesn't - have expected shape (mw, mh)""" - assert mw % simd == 0, "Requirement MH divisable by SIMD is violated." - assert mh % pe == 0, "Requirement MH divisable by PE is violated." 
- # start by transposing the original weight matrix, since ONNX and - # finn-hlslib use different assumptions - # ONNX uses (in_features, out_features) and matmul(x, W) - # finn-hlslib uses (out_features, in_features) and matmul(W, x) - ret = orig_weight_matrix.T - # interleave rows between PEs and reshape - # distribute rows between PEs - ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - # create SIMD as innermost dimension and add a dummy outer dim - ret = ret.reshape(1, pe, wmem, simd) - # reverse the SIMD dimension - ret = np.flip(ret, axis=-1) - return ret - - def minimize_accumulator_width(self, model): - weights = model.get_initializer(self.onnx_node.input[1]) - idt = self.get_input_datatype() - # calculate minimum and maximum values of accumulator - (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt) - if acc_min < 0: - if abs(acc_min) > acc_max: - adt = DataType.get_smallest_possible(acc_min) - else: - adt = DataType.get_smallest_possible(-acc_max - 1) - else: - adt = DataType.get_smallest_possible(acc_max) - # Note: we are interested in simply the width of the output dot product. - # Padding the actual output stream to a multiple of 8-bits is done in - # the RTL component - self.set_nodeattr("accDataType", adt.name) - # for no-activation nodes, output dt = acc dt - self.set_nodeattr("outputDataType", adt.name) - return DataType[self.get_nodeattr("accDataType")] - - def make_weight_file(self, weights, weight_file_mode, weight_file_name): - """Produce a file containing given weights in appropriate format for this - layer. This file can be used for either synthesis or run-time reconfig - of weights. 
- - Arguments: - * weights : numpy array with weights to be put into the file - * weight_file_mode : one of {hls_header, decoupled_verilog_dat, - decoupled_runtime} - * weight_file_name : filename for the weight file to be generated - """ - # convert weights into hlslib-compatible format - weight_tensor = self.get_hls_compatible_weight_tensor(weights) - export_wdt = self.get_weight_datatype() - if "decoupled" in weight_file_mode: - # create a weight stream for various flavors of decoupled mode: - # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD) - weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3)) - # reverse SIMD flip for saving weights in .npy - weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1) - # PE flip for saving weights in .dat - weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2) - # reshape weight tensor (simd_flipped and pe_flipped) to desired shape - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - # simd_flipped - weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape( - 1, -1, pe * simd - ) - weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy() - # flipped - weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape( - 1, -1, pe * simd - ) - weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy() - if weight_file_mode == "decoupled_npy": - # save weight stream into npy for cppsim - np.save(weight_file_name, weight_tensor_simd_flipped) - elif weight_file_mode == "decoupled_verilog_dat": - # convert weight values into hexstring - weight_width = self.get_weightstream_width() - # pad to nearest 4 bits to get hex strings - weight_width_padded = roundup_to_integer_multiple(weight_width, 4) - weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( - weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" - ) - # add zeroes to pad out file to 1024 entries - weight_stream = weight_tensor_pe_flipped.flatten() - weight_stream = 
weight_stream.copy() - with open(weight_file_name, "w") as f: - for val in weight_stream: - f.write(val + "\n") - elif weight_file_mode == "decoupled_runtime": - # memstream axi-lite interface will map each mem line to - # one or multiple 32-bit words - weight_width = self.get_weightstream_width() - words_per_memwidth = 2 ** math.ceil(math.log2(weight_width / 32)) - if words_per_memwidth < 1: - words_per_memwidth = 1 - weight_width_padded = words_per_memwidth * 32 - # first, pack and ensure padding to 32 bits - weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( - weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" - ) - weight_stream = weight_tensor_pe_flipped.flatten() - weight_stream = weight_stream.copy() - with open(weight_file_name, "w") as f: - for val in weight_stream: - # split into groups of 8 hex digits (= 32 bits) - words_32b = textwrap.wrap(val, 8) - words_32b.reverse() - for word_32b in words_32b: - f.write(word_32b + "\n") - else: - raise Exception("Unknown/unsupported weight_file_mode") - - else: - raise Exception("Unknown/unsupported weight_file_mode") - - def generate_params(self, model, path): - mem_mode = self.get_nodeattr("mem_mode") - code_gen_dir = path - # weights, if not external - weights = model.get_initializer(self.onnx_node.input[1]) - if mem_mode == "decoupled" or mem_mode == "external": - weight_filename_sim = "{}/weights.npy".format(code_gen_dir) - # save decoupled weights for cppsim - self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) - if mem_mode == "decoupled": - # also save weights as Verilog .dat file - # This file will be ignored when synthesizing UltraScale memory. 
- weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) - self.make_weight_file( - weights, "decoupled_verilog_dat", weight_filename_rtl - ) - else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - mem_mode = self.get_nodeattr("mem_mode") - node = self.onnx_node - - if mode == "cppsim": - raise Exception( - "cppsim not possible for RTL MVAU, please set exec_mode to rtlsim" - ) - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - # create a npy file fore each input of the node (in_ind is input index) - in_ind = 0 - for inputs in node.input: - # it is assumed that the first input of the node is the data input - # the second input are the weights - if in_ind == 0: - assert ( - str(context[inputs].dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = context[inputs].reshape(expected_inp_shape) - export_idt = self.get_input_datatype() - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - elif in_ind > 2: - raise Exception("Unexpected input found for MatrixVectorActivation_rtl") - in_ind += 1 - - if mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - if mem_mode in ["external", "decoupled"]: - wnbits = self.get_weightstream_width() - export_wdt = self.get_weight_datatype() - wei = 
npy_to_rtlsim_input( - "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits - ) - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict = { - "inputs": {"in0": inp, "weights": wei * num_w_reps}, - "outputs": {"out": []}, - } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] - else: - output = self.rtlsim(sim, inp) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to "rtlsim" """.format( - mode - ) - ) - - def code_generation_ipgen(self, model, fpgapart, clk): - """Normally: Generates C++ code and tcl script for IP generation. 
- Here: Generates (System-)Verilog code for IP generation.""" - self.generate_hdl(model, fpgapart, clk) - - def ipgen_singlenode_code(self): - """Normally: Builds the bash script for IP generation.""" - pass - - def code_generation_cppsim(self, model): - """Normally: Generates C++ code for simulation (cppsim).""" - pass - - def compile_singlenode_code(self): - pass - - def global_includes(self): - pass - - def defines(self, var): - pass - - def read_npy_data(self): - pass - - def strm_decl(self): - pass - - def docompute(self): - pass - - def dataoutstrm(self): - pass - - def save_as_npy(self): - pass - - def blackboxfunction(self): - pass - - def pragmas(self): - pass - - def code_generation_ipi(self): - cmd = [] - # add streamer if needed - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": - runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 - if self.get_nodeattr("ram_style") == "ultra": - assert ( - runtime_writable == 1 - ), "Layer with URAM weights must have runtime_writeable_weights=1" - node_name = self.onnx_node.name - sname = self.hls_sname() - # create a hierarchy for this layer, with the same port names - clk_name = self.get_verilog_top_module_intf_names()["clk"][0] - rst_name = self.get_verilog_top_module_intf_names()["rst"][0] - dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] - din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] - cmd.append("create_bd_cell -type hier %s" % node_name) - cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) - cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) - cmd.append( - "create_bd_intf_pin -mode Master " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" - % (node_name, dout_name) - ) - cmd.append( - "create_bd_intf_pin -mode Slave " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) - ) - # instantiate the RTL block - code_gen_dir = 
self.get_nodeattr("code_gen_dir_ipgen") - rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") - sourcefiles = [ - os.path.join( - code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" - ), - rtllib_dir + "mvu_vvu_axi.sv", - rtllib_dir + "replay_buffer.sv", - rtllib_dir + "mvu_4sx4u.sv", - rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", - rtllib_dir + "mvu_8sx8u_dsp48.sv", - ] - for f in sourcefiles: - cmd.append("add_files -norecurse %s" % (f)) - cmd.append( - "create_bd_cell -type hier -reference %s /%s/%s" - % ( - self.get_nodeattr("gen_top_module"), - self.onnx_node.name, - self.onnx_node.name, - ) - ) - - # instantiate a streamer and connect it to the HLS IP - strm_vlnv = "amd.com:finn:memstream:1.0" - strm_inst = node_name + "_wstrm" - cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (strm_vlnv, node_name, strm_inst) - ) - cmd.append( - "set_property -dict [list " - "CONFIG.DEPTH {%d} " - "CONFIG.WIDTH {%d} " - "CONFIG.INIT_FILE {%s} " - "CONFIG.RAM_STYLE {%s} " - "] [get_bd_cells /%s/%s]" - % ( - self.calc_wmem(), - self.get_weightstream_width_padded(), - self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat", - self.get_nodeattr("ram_style"), - node_name, - strm_inst, - ) - ) - cmd.append( - "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] " - "[get_bd_intf_pins %s/%s/weights_%s]" - % (node_name, strm_inst, node_name, node_name, sname) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]" - % (node_name, rst_name, node_name, strm_inst) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]" - % (node_name, clk_name, node_name, strm_inst) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" - % (node_name, rst_name, node_name, node_name, rst_name) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" - % (node_name, clk_name, node_name, node_name, clk_name) - ) - cmd.append( - "connect_bd_intf_net 
[get_bd_intf_pins %s/%s] " - "[get_bd_intf_pins %s/%s/%s]" - % (node_name, din_name, node_name, node_name, din_name) - ) - cmd.append( - "connect_bd_intf_net [get_bd_intf_pins %s/%s] " - "[get_bd_intf_pins %s/%s/%s]" - % (node_name, dout_name, node_name, node_name, dout_name) - ) - if runtime_writable: - # expose axi lite interface for writeable weights - axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0] - cmd.append( - "create_bd_intf_pin -mode Slave " - "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" - % (node_name, axilite_name) - ) - cmd.append( - "connect_bd_intf_net [get_bd_intf_pins %s/%s] " - "[get_bd_intf_pins %s/%s/%s]" - % (node_name, axilite_name, node_name, strm_inst, axilite_name) - ) - # TODO calculate and pass in segment size here - cmd.append("assign_bd_address") - cmd.append("save_bd_design") - elif mem_mode == "external": - # instantiate the RTL block - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") - sourcefiles = [ - os.path.join( - code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" - ), - rtllib_dir + "mvu_vvu_axi.sv", - rtllib_dir + "replay_buffer.sv", - rtllib_dir + "mvu_4sx4u.sv", - rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", - rtllib_dir + "mvu_8sx8u_dsp48.sv", - ] - for f in sourcefiles: - cmd.append("add_files -norecurse %s" % (f)) - cmd.append( - "create_bd_cell -type module -reference %s %s" - % ( - self.get_nodeattr("gen_top_module"), - self.onnx_node.name, - ) - ) - else: - raise Exception("Unrecognized mem_mode for MatrixVectorActivation") - return cmd - - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() - mem_mode = self.get_nodeattr("mem_mode") - sname = self.hls_sname() - if mem_mode == "external": - intf_names["s_axis"].append( - ("weights_" + sname, self.get_weightstream_width_padded()) - ) - if mem_mode == "decoupled": - # only expose axilite interface if 
attribute is set - runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 - if runtime_writable: - intf_names["axilite"] = ["s_axilite"] - return intf_names - - def get_op_and_param_counts(self): - in_features = self.get_nodeattr("MW") - out_features = self.get_nodeattr("MH") - weight_bits = self.get_weight_datatype().bitwidth() - inp_bits = self.get_input_datatype().bitwidth() - num_inp_vec = self.get_nodeattr("numInputVectors") - num_repetitions = int(np.prod(num_inp_vec)) - mac_count = in_features * out_features * num_repetitions - # cannonicalize op type: highest bitwidth operand first s.t. - # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types - bw1 = min(inp_bits, weight_bits) - bw2 = max(inp_bits, weight_bits) - mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) - weight_param_type = "param_weight_%db" % (weight_bits) - weight_count = in_features * out_features - ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} - return ret_dict - - def derive_characteristic_fxns(self, period): - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - }, - "outputs": {"out": []}, - } - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode in ["decoupled", "external"]: - n_weight_inps = self.calc_wmem() - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict["inputs"]["weights"] = [ - 0 for i in range(num_w_reps * n_weight_inps) - ] - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) - - def _resolve_segment_len(self, clk): - # Insert pipeline registers in the DSP58 chain to meet target clock frequency - # ~0.741 ns seems the worst-case delay through first DSP - # ~0.605 ns seems to be (on average) delay for all subsequent DSPs - # clk >= (critical_path_dsps - 1) * 0.605 + 0.741 - assert (clk > 0.741), "Infeasible clk target of {} ns has been set, consider lowering the targeted clock frequency!".format(clk) - critical_path_dsps = 
np.floor((clk - 0.741) / 0.605 + 1) - max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3) - dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len - return dsp_chain_len - - def _resolve_impl_style(self, fpgapart): - # Based on target device and activation/weight-width, choose the - # supported RTL compute core - - assert self.get_nodeattr("resType") != "lut", "LUT-based RTL-MVU implementation currently not supported! Please change resType for {}".format(self.onnx_node.name) - - act_width = self.get_input_datatype(0).bitwidth() - weight_width = self.get_input_datatype(1).bitwidth() - is_versal = ( - fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] - or fpgapart[0:5] == "xqrvc" - ) - - if is_versal: - return "mvu_vvu_8sx9_dsp58" - else: - if act_width == 4 and weight_width == 4: - return "mvu_4sx4u" - else: - return "mvu_8sx8u_dsp48" - - def generate_hdl(self, model, fpgapart, clk): - # Generate params as part of IP preparation - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - self.generate_params(model, code_gen_dir) - - template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk) - # add general parameters to dictionary - code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [ - self.get_verilog_top_module_name() - ] - # save top module name so we can refer to it after this node has been renamed - # (e.g. 
by GiveUniqueNodeNames(prefix) during MakeZynqProject) - self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) - - # apply code generation to template - with open(template_path, "r") as f: - template_wrapper = f.read() - for key in code_gen_dict: - # transform list into long string separated by '\n' - code_gen_line = "\n".join(code_gen_dict[key]) - template_wrapper = template_wrapper.replace(key, code_gen_line) - with open( - os.path.join( - code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" - ), - "w", - ) as f: - f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(0))) - with open( - os.path.join( - code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v" - ), - "w", - ) as f: - f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(1))) - - # set ipgen_path and ip_path so that HLS-Synth transformation - # and stich_ip transformation do not complain - self.set_nodeattr("ipgen_path", code_gen_dir) - self.set_nodeattr("ip_path", code_gen_dir) - - def prepare_codegen_default(self, fpgapart, clk): - template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v" - - code_gen_dict = {} - code_gen_dict["$IS_MVU$"] = [str(1)] - code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)] - code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))] - code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))] - code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] - code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))] - code_gen_dict["$ACTIVATION_WIDTH$"] = [ - str(self.get_input_datatype(0).bitwidth()) - ] - code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())] - code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())] - code_gen_dict["$SIGNED_ACTIVATIONS$"] = ( - [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] - ) - code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] - - return template_path, code_gen_dict - - 
def prepare_rtlsim(self): - """Creates a Verilator emulation library for the RTL code generated - for this node, sets the rtlsim_so attribute to its path and returns - a PyVerilator wrapper around it.""" - - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") - - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - # Path to (System-)Verilog files used by top-module & path to top-module - verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"] - verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"] - - # build the Verilator emu library - sim = PyVerilator.build( - verilog_files, - build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), - verilog_path=verilog_paths, - trace_depth=get_rtlsim_trace_depth(), - top_module_name=self.get_verilog_top_module_name(), - ) - # save generated lib filename in attribute - self.set_nodeattr("rtlsim_so", sim.lib._name) - - return sim From 04ec5620ba5460b41d366e54f1a6ad099c551808 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 26 Jan 2024 19:49:15 +0000 Subject: [PATCH 107/112] removed old specialize_to_rtl transform --- .../fpgadataflow/specialize_to_rtl_layers.py | 191 ------------------ 1 file changed, 191 deletions(-) delete mode 100644 src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py deleted file mode 100644 index 5061282695..0000000000 --- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py +++ /dev/null @@ -1,191 +0,0 @@ -# Copyright (c) 2023, AMD -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. 
-# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import numpy as np -from qonnx.transformation.base import Transformation -from qonnx.custom_op.registry import getCustomOp -from qonnx.core.datatype import DataType -from onnx import helper -from qonnx.transformation.infer_shapes import InferShapes -from qonnx.transformation.infer_datatypes import InferDataTypes -from qonnx.transformation.general import GiveUniqueNodeNames -from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth - -class InferRTLMatrixVectorActivation(Transformation): - """Convert (HLS-based) MatrixVectorActivation layers to specialized RTL layers if supported.""" - - def __init__(self): - super().__init__() - - def _is_rtl_variant_compatible(self, n): - no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 - act_width_in_range = (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8) or (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0) - weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 - folding_supported = (getCustomOp(n).get_nodeattr("MH") % getCustomOp(n).get_nodeattr("PE") == 0) and (getCustomOp(n).get_nodeattr("MW") % getCustomOp(n).get_nodeattr("SIMD") == 0) - - if (no_activation and act_width_in_range and weight_width_in_range and folding_supported): - return True - else: - return False - - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for n in graph.node: - node_ind += 1 - if n.op_type == "MatrixVectorActivation": - preferred_in_rtl = getCustomOp(n).get_nodeattr("preferred_backend") == "rtl" - supported_in_rtl = self._is_rtl_variant_compatible(n) - if (preferred_in_rtl and supported_in_rtl): - mvau_input = n.input[0] - mvau_weight = n.input[1] - mvau_output = n.output[0] - inputDataType = getCustomOp(n).get_nodeattr("inputDataType") - weightDataType = 
getCustomOp(n).get_nodeattr("weightDataType") - outputDataType = getCustomOp(n).get_nodeattr("outputDataType") - numInputVectors = getCustomOp(n).get_nodeattr("numInputVectors") - mw = getCustomOp(n).get_nodeattr("MW") - mh = getCustomOp(n).get_nodeattr("MH") - simd = getCustomOp(n).get_nodeattr("SIMD") - pe = getCustomOp(n).get_nodeattr("PE") - mem_mode = getCustomOp(n).get_nodeattr("mem_mode") - ram_style = getCustomOp(n).get_nodeattr("ram_style") - resType = getCustomOp(n).get_nodeattr("resType") - runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights") - - new_node = helper.make_node( - "MatrixVectorActivation_rtl", - [mvau_input, mvau_weight], - [mvau_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - MW=mw, - MH=mh, - SIMD=simd, - PE=pe, - inputDataType=inputDataType, - weightDataType=weightDataType, - outputDataType=outputDataType, - numInputVectors=numInputVectors, - mem_mode=mem_mode, - resType=resType, - name=n.name + "_rtl", - ram_style=ram_style, - runtime_writeable_weights=runtime_writeable_weights - ) - graph.node.insert(node_ind, new_node) - # remove old node - graph.node.remove(n) - graph_modified=True - - if graph_modified: - model = model.transform(MinimizeAccumulatorWidth()) - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - model = model.transform(GiveUniqueNodeNames()) - - return (model, graph_modified) - -class InferRTLVectorVectorActivation(Transformation): - """Convert (HLS-based) VectorVectorActivation layers to specialized RTL layers is supported.""" - - def __init__(self): - super().__init__() - - def _is_rtl_variant_compatible(self, n): - no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 - act_width_in_range = (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8) or (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0) - 
weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 - folding_supported = (getCustomOp(n).get_nodeattr("Channels") % getCustomOp(n).get_nodeattr("PE") == 0) and (np.prod(getCustomOp(n).get_nodeattr("Kernel")) % getCustomOp(n).get_nodeattr("SIMD") == 0) - - if (no_activation and act_width_in_range and weight_width_in_range and folding_supported): - return True - else: - return False - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for n in graph.node: - node_ind += 1 - if n.op_type == "VectorVectorActivation": - preferred_in_rtl = getCustomOp(n).get_nodeattr("preferred_backend") == "rtl" - supported_in_rtl = self._is_rtl_variant_compatible(n) - if (preferred_in_rtl and supported_in_rtl): - vvau_input = n.input[0] - vvau_weight = n.input[1] - vvau_output = n.output[0] - inputDataType = getCustomOp(n).get_nodeattr("inputDataType") - weightDataType = getCustomOp(n).get_nodeattr("weightDataType") - outputDataType = getCustomOp(n).get_nodeattr("outputDataType") - pe = getCustomOp(n).get_nodeattr("PE") - simd = getCustomOp(n).get_nodeattr("SIMD") - dim = getCustomOp(n).get_nodeattr("Dim") - channels = getCustomOp(n).get_nodeattr("Channels") - kernel = getCustomOp(n).get_nodeattr("Kernel") - resType = getCustomOp(n).get_nodeattr("resType") - mem_mode = getCustomOp(n).get_nodeattr("mem_mode") - runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights") - ram_style = getCustomOp(n).get_nodeattr("ram_style") - resType = getCustomOp(n).get_nodeattr("resType") - - new_node = helper.make_node( - "VectorVectorActivation_rtl", - [vvau_input, vvau_weight], - [vvau_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - name=n.name + "_rtl", - PE=pe, - SIMD=simd, - Dim=dim, - Channels=channels, - Kernel=kernel, - resType=resType, - inputDataType=inputDataType, - weightDataType=weightDataType, - outputDataType=outputDataType, - mem_mode=mem_mode, - 
runtime_writeable_weights=runtime_writeable_weights, - ram_style=ram_style - ) - graph.node.insert(node_ind, new_node) - # remove old node - graph.node.remove(n) - graph_modified=True - - if graph_modified: - model = model.transform(MinimizeAccumulatorWidth()) - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - model = model.transform(GiveUniqueNodeNames()) - - return (model, graph_modified) \ No newline at end of file From de778cf73ba2dabbb056320ad4babdb24cacc0ad Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 26 Jan 2024 19:50:01 +0000 Subject: [PATCH 108/112] removed rtl custom-op test --- .../test_fpgadataflow_mvau_rtl.py | 174 ------------------ 1 file changed, 174 deletions(-) delete mode 100644 tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py deleted file mode 100644 index 1e9de44fb2..0000000000 --- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import pytest - -import numpy as np -import os -import pickle -from onnx import TensorProto, helper -from qonnx.core.datatype import DataType -from qonnx.core.modelwrapper import ModelWrapper -from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.general import ApplyConfig, GiveUniqueNodeNames, GiveReadableTensorNames -from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model - -import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls -import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl -from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP -from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths - - -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode - -build_dir = os.environ["FINN_BUILD_DIR"] - - -def make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W): - matmul_node = helper.make_node("MatMul", 
["ifm", "weights"], ["ofm"]) - graph = helper.make_graph(nodes=[matmul_node], name="matmul_graph", inputs=[ifm], outputs=[ofm]) - - model = qonnx_make_model(graph, producer_name="fclayer-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("ifm", idt) - model.set_tensor_datatype("weights", wdt) - model.set_tensor_datatype( - "ofm", DataType["INT32"] - ) # At this step, the MatMul layer does not optimize the bit-width of the output datatype - model.set_initializer("weights", W) - # model.set_tensor_layout("ifm", DataLayout.NHWC) - - return model - - -def prepare_inputs(input_tensor): - return {"global_in": input_tensor} - - -# @pytest.mark.parametrize("mh", [36]) -# @pytest.mark.parametrize("mw", [256]) -@pytest.mark.parametrize("mh", [9]) -@pytest.mark.parametrize("mw", [36]) -# @pytest.mark.parametrize("pe", [1, 4, 9, 36]) -# @pytest.mark.parametrize("simd", [1, 4, 16, 64, 256]) -@pytest.mark.parametrize("pe", [1, 3, 9]) -@pytest.mark.parametrize("simd", [1, 3, 6, 18, 36]) -@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) -@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT8"]]) -# @pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e"]) -@pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S"]) -@pytest.mark.parametrize("clk_ns", [1.66, 4]) -@pytest.mark.fpgadataflow -@pytest.mark.slow -@pytest.mark.vivado -def test_fpgadataflow_mvau_rtl( - mh, mw, pe, simd, idt, wdt, part, clk_ns -): - if part == "xcku3p-ffva676-1-e" and clk_ns != 1.66: - pytest.skip("Skip test for varying clk for devices other than Versal, since this variable doesn't change anything for this test") - - # Create test input vector (produced by SWG) - ofm_shape = (5, 5) - ofm_h, ofm_w = ofm_shape - ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw]) - ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh)) - W = gen_finn_dt_tensor(wdt, (mw, mh)) - 
model = make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(GiveReadableTensorNames()) - - model.save(build_dir + "/matmul.onnx") - - # Create MatMul & obtain golden reference output - A = gen_finn_dt_tensor(model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in")) - input_dict = prepare_inputs(A) - - # Execute ONNX model - output_matmul = oxe.execute_onnx(model, input_dict)["global_out"] - - with open(build_dir + "/onnx_output.pkl", "wb") as f: - pickle.dump(output_matmul, f) - - # Create MVAU (HLS) - model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")) - model = model.transform(GiveUniqueNodeNames()) - - # Apply folding (i.e. specify to use DSPs) - folding_config = { - "Defaults": {}, - "MatrixVectorActivation_0": { - "PE": pe, - "SIMD": simd, - "mem_mode": "decoupled", - "ram_style": "auto", - "resType": "dsp", - "preferred_backend" : "rtl" - }, - } - model = model.transform(ApplyConfig(folding_config)) - model.save(build_dir + "/mvau_hls.onnx") - - # Apply convert-to-rtl step - model = model.transform(to_rtl.InferRTLMatrixVectorActivation()) - model = model.transform(GiveUniqueNodeNames()) - model.save(build_dir + "/mvau_rtl.onnx") - - # Reset rtlsim_so and ip-related paths such that new Pyverilator SO and IP is generated - for n in model.graph.node: - getCustomOp(n).set_nodeattr("rtlsim_trace", build_dir + "/mvu_trace_rtl_nodebynode.vcd") - - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(PrepareIP(part, clk_ns)) - model = model.transform(HLSSynthIP()) - model = model.transform(PrepareRTLSim()) - output_mvau_rtl = oxe.execute_onnx(model, input_dict)["global_out"] - - with open(build_dir + "/mvau_rtl_output.pkl", "wb") as f: - pickle.dump(output_mvau_rtl, f) - - model.save(build_dir + "/mvau_rtl_sim.onnx") - assert (output_matmul == output_mvau_rtl).all(), "Output of ONNX model not matching output of 
node-by-node sim!" - - model = model.transform(InsertAndSetFIFODepths(part, clk_ns)) - model = model.transform(PrepareIP(part, clk_ns)) - model = model.transform(HLSSynthIP()) - model = model.transform(CreateStitchedIP(part, clk_ns)) - - os.environ["RTLSIM_TRACE_DEPTH"] = "3" - model.set_metadata_prop("rtlsim_so", "") - model.set_metadata_prop("exec_mode", "rtlsim") - model.set_metadata_prop("rtlsim_trace", build_dir + "/mvu_trace_rtl_stitch.vcd") - model.save(build_dir + "/stitched_ip.onnx") - output_mvau_rtl_stitch = oxe.execute_onnx(model, input_dict)["global_out"] - - assert (output_matmul == output_mvau_rtl_stitch).all(), "Output of ONNX model not matching output of stitched-IP RTL model!" \ No newline at end of file From 911239c09fe0a262e745c9cb4d91478da2315d79 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 1 Feb 2024 14:34:15 +0000 Subject: [PATCH 109/112] [vvau hls]: add custom op to dict --- src/finn/custom_op/fpgadataflow/hls/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 1f1448b9fc..ebb5ce98da 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -51,6 +51,7 @@ from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls from finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls import MatrixVectorActivation_hls +from finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls import VectorVectorActivation_hls custom_op = dict() @@ -76,4 +77,5 @@ custom_op["Thresholding_hls"] = Thresholding_hls custom_op["TLastMarker_hls"] = TLastMarker_hls custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls -custom_op["MatrixVectorActivation_hls"] = MatrixVectorActivation_hls \ No newline at end of file 
+custom_op["MatrixVectorActivation_hls"] = MatrixVectorActivation_hls +custom_op["VectorVectorActivation_hls"] = VectorVectorActivation_hls \ No newline at end of file From b1ee54098d221c73f8d6491826338b1847489038 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 1 Feb 2024 14:35:56 +0000 Subject: [PATCH 110/112] [vvu hw-op]: refactored hw custom-op VVAU --- .../fpgadataflow/vectorvectoractivation.py | 1196 ++++++----------- 1 file changed, 423 insertions(+), 773 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index 891730ece3..2168474298 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -38,17 +38,21 @@ roundup_to_integer_multiple, ) -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) +import onnx.numpy_helper as np_helper +import qonnx.custom_op.general.xnorpopcount as xp +from qonnx.custom_op.general.multithreshold import multithreshold -class VectorVectorActivation(HLSCustomOp): - """Class that corresponds to finn-hlslib Vector_Vector_Activate_Batch function""" + +class VectorVectorActivation(HWCustomOp): + """Abstraction layer for HW implementation of VectorVectorActivation layers.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -100,6 +104,10 @@ def get_nodeattr_types(self): # use xnor-popcount for binary weights/inputs, thus treating them # as bipolar "binaryXnorMode": ("i", False, 0, {0, 1}), + # Backend implementation for layer + # hls -- Vivado HLS + # rtl -- (System)Verilog + "preferred_impl_style": ("s", False, "hls", {"hls", "rtl"}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -107,124 +115,55 @@ def 
get_nodeattr_types(self): def base_op_type(self): return "VectorVectorActivation" - def minimize_accumulator_width(self, model): - """Minimize the accumulator bit width according to the weight values, - input data types, and size of dot product""" - weights = model.get_initializer(self.onnx_node.input[1]) - k_h, k_w = self.get_nodeattr("Kernel") - fm = self.get_nodeattr("Channels") - # put weights into the shape expected by calculate_matvec_accumulator_range - weights = weights.reshape(fm, k_h * k_w).transpose() - # since in the calculation the values of the weight matrix are used, - # for the bipolar case they need to be converted to bipolar - if self.get_nodeattr("binaryXnorMode"): - weights = 2 * weights - 1 - if len(self.onnx_node.input) > 2: - thresholds = model.get_initializer(self.onnx_node.input[2]) - else: - thresholds = None - idt = self.get_input_datatype() - - (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt) - # if runtime-writeable weights, then the values of the weights can - # change and we need to use the worst-case values from the datatypes - if self.get_nodeattr("runtime_writeable_weights"): - wdt = self.get_weight_datatype() - lower_worst = wdt.min() * np.ones_like(weights) - lower_range = calculate_matvec_accumulator_range(lower_worst, idt) - upper_worst = wdt.max() * np.ones_like(weights) - upper_range = calculate_matvec_accumulator_range(upper_worst, idt) - acc_min = min(min(lower_range), min(upper_range)) - acc_max = max(max(upper_range), max(upper_range)) - - # if the thresholds can be used to determine range, then adjust the range - # according to the known values of the thresholds - if thresholds is not None: - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) - # set threshold datatype (and accumulator datatype implicitly) - min_threshold = thresholds.min() - max_threshold = thresholds.max() - # clip threshold values - if max_threshold > acc_max or min_threshold < acc_min: - 
warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name) - thresholds = np.clip(thresholds, acc_min, acc_max) - model.set_initializer(self.onnx_node.input[2], thresholds) - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) - min_threshold = thresholds.min() - max_threshold = thresholds.max() - acc_min = min(min_threshold, acc_min) - acc_max = max(max_threshold, acc_max) + def _infer_sparse_weight_tensor(self, W_conv, k_h, k_w, channels): + W_sparse = np.zeros((channels, channels, k_h, k_w), dtype=np.float32) + for ch in range(channels): + W_sparse[ch][ch] = W_conv[ch][0] + W_conv = W_sparse.astype(np.float32) + W_matmul = W_conv.transpose(0, 2, 3, 1) + W_matmul = W_matmul.reshape(channels, channels * k_h * k_w) + W_matmul = W_matmul.T + return W_matmul - # if the acc_range is always greater than 0, then acc_max <= 2^P - 1 - if acc_min >= 0: - acc_bit_width = np.log2(acc_max + 1) - acc_bit_width = math.ceil(acc_bit_width) - adt = DataType[f"UINT{acc_bit_width}"] - # if the acc_range is signed, then acc_min >= -2^{P-1} and acc_max <= - # 2^{P - 1} - 1, which means 2^{P - 1} >= max(-acc_min, 1 + acc_max) + def execute_node(self, context, graph): + node = self.onnx_node + in_act = context[node.input[0]] + (_, dim_h, dim_w, _) = in_act.shape + (k_h, k_w) = self.get_nodeattr("Kernel") + channels = self.get_nodeattr("Channels") + # Reshape input activations in right format + in_act = in_act.reshape(1, dim_h, dim_w, channels, k_h*k_w) + in_act = in_act.transpose(0, 1, 2, 4, 3) + in_act = in_act.reshape(1, dim_h, dim_w, channels*k_h*k_w) + # Reshape + vvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0] + vvau_w = np_helper.to_array(vvau_w_init) + vvau_w_onnx = self._infer_sparse_weight_tensor(vvau_w, k_h, k_w, channels) + + if self.get_nodeattr("inputDataType") == "BIPOLAR" and self.get_nodeattr("weightDataType") == "BIPOLAR": + result = np.matmul(in_act, vvau_w_onnx) + result = (result + k_h*k_w) / 2 else: - 
_acc_max = max(-acc_min, 1 + acc_max) - acc_bit_width = np.log2(_acc_max) + 1 - acc_bit_width = math.ceil(acc_bit_width) - adt = DataType[f"INT{acc_bit_width}"] - - # if activation, assert that the thresholds can be expressed with adt - if thresholds is not None: - assert np.vectorize(adt.allowed)( - threshold_tensor - ).all(), "Thresholds in %s can't be expressed with type %s" % ( - self.onnx_node.name, - str(adt), - ) - - # if no activation, output and accumulator datatypes are the same - if self.get_nodeattr("noActivation"): - # if this is the last node in the graph, then ensure the datatype is - # divisibly by 8 bits - if model.find_direct_successors(self.onnx_node) is None: - bw = roundup_to_integer_multiple(adt.bitwidth(), 8) - new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw)) - adt = DataType[new_adt_name] - # for no-activation nodes, output dt = acc dt - self.set_nodeattr("outputDataType", adt.name) - self.set_nodeattr("accDataType", adt.name) - - return DataType[self.get_nodeattr("accDataType")] - - def minimize_weight_bit_width(self, model): - """Minimize the bit width based on the values of the weights""" - if not self.get_nodeattr("runtime_writeable_weights"): - weights = model.get_initializer(self.onnx_node.input[1]) - w_min = weights.min() - w_max = weights.max() - if w_min < 0: - if abs(w_min) > w_max: - wdt = DataType.get_smallest_possible(w_min) - else: - wdt = DataType.get_smallest_possible(-w_max - 1) - else: - wdt = DataType.get_smallest_possible(w_max) - self.set_nodeattr("weightDataType", wdt.name) - return DataType[self.get_nodeattr("weightDataType")] - - def calc_wmem(self): - """Calculates and returns WMEM.""" - ch = self.get_nodeattr("Channels") - k_h, k_w = self.get_nodeattr("Kernel") - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - wmem = (k_h * k_w * ch // pe) // simd - return wmem + result = np.matmul(in_act, vvau_w_onnx) # result is in [N, H, W, C] format - def calc_tmem(self): - """Calculates and 
returns TMEM.""" - if self.get_nodeattr("noActivation") == 1: - return 0 - else: - ch = self.get_nodeattr("Channels") - pe = self.get_nodeattr("PE") - return ch // pe + if self.get_nodeattr("noActivation") == 0: + vvau_thr_init = [x for x in graph.initializer if x.name == node.input[2]][0] + vvau_thr = np_helper.to_array(vvau_thr_init) + odt_is_bipolar = self.get_nodeattr("outputDataType") == DataType["BIPOLAR"] + out_scale = 2 if odt_is_bipolar else 1 + out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal") + # NHWC to NCHW for multithreshold node + result = result.transpose((0,3,1,2)) + result = multithreshold(result, vvau_thr, out_scale, out_bias) + # NCHW to NHWC + result = result.transpose((0,2,3,1)) + + # for i in range(self.get_nodeattr("Channels")): + context[node.output[0]] = result + def verify_node(self): + pass + def make_shape_compatible_op(self, model): oshape = self.get_normal_output_shape() return super().make_const_shape_op(oshape) @@ -244,9 +183,6 @@ def infer_node_datatype(self, model): odt = self.get_output_datatype() model.set_tensor_datatype(node.output[0], odt) - def verify_node(self): - pass - def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] @@ -269,12 +205,32 @@ def get_instream_width(self, ind=0): pe = self.get_nodeattr("PE") in_width = i_bits * simd * pe return in_width + + def get_weightstream_width(self): + """Returns weight stream width. 
Used only in decoupled mode.""" + if ( + self.get_nodeattr("mem_mode") == "decoupled" + or self.get_nodeattr("mem_mode") == "external" + ): + simd = self.get_nodeattr("SIMD") + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + w_width = simd * pe * wp + return w_width + else: + return 0 def get_outstream_width(self, ind=0): o_bits = self.get_output_datatype().bitwidth() out_width = o_bits * self.get_nodeattr("PE") return out_width + def get_weightstream_width_padded(self): + """Returns weight stream width padded to a multiple of 8. This is required + by the AXI Stream spec. Used in decoupled mode.""" + weight_width = self.get_weightstream_width() + return roundup_to_integer_multiple(weight_width, 8) + def get_folded_input_shape(self, ind=0): k_h, k_w = self.get_nodeattr("Kernel") dim_h, dim_w = self.get_nodeattr("Dim") @@ -323,88 +279,302 @@ def get_number_output_values(self): nf = np.prod(self.get_folded_output_shape()[:-1]) return nf - def get_exp_cycles(self): - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") + def calc_wmem(self): + """Calculates and returns WMEM.""" ch = self.get_nodeattr("Channels") - dim_h, dim_w = self.get_nodeattr("Dim") k_h, k_w = self.get_nodeattr("Kernel") - # currently FINN supports for vvau a batch size of 1 - batch_size = 1 - # since mmv != 1 is not supported yet, we set mmv for now to 1 - mmv = 1 - exp_cycles = ((ch * k_h * k_w) / pe / simd) * batch_size * (dim_h * dim_w) / mmv - return int(exp_cycles) + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + wmem = (k_h * k_w * ch // pe) // simd + return wmem - def get_template_param_values(self): - """Returns the template parameter values according to input, output and weight - data types.""" - ret = dict() - inp_hls_str = self.get_input_datatype().get_hls_datatype_str() - out_hls_str = self.get_output_datatype().get_hls_datatype_str() - inp_is_binary = self.get_input_datatype() == DataType["BINARY"] - # out_is_binary = 
self.get_output_datatype() == DataType["BINARY"] - wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] - bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 - if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): - raise Exception("True binary (non-bipolar) inputs not yet supported") - inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] - # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] - wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] - # reinterpret inp/wt as bipolar if bin_xnor_mode is iset - inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) - wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) - # fill in TSrcI and TWeightI - # TODO check these with Giulio - # TODO handle non-bipolar binary inputs - if inp_is_bipolar and wt_is_bipolar: - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and wt_is_bipolar: - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Recast" - elif inp_is_bipolar and (not wt_is_bipolar): - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and (not wt_is_bipolar): - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Identity" + def calc_tmem(self): + """Calculates and returns TMEM.""" + if self.get_nodeattr("noActivation") == 1: + return 0 + else: + ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + return ch // pe - # fill in TDstI - ret["TDstI"] = "Slice<%s>" % out_hls_str + def uram_estimation(self): + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle != "ultra") + or (mmode == "const") + or (mmode == "external") + ): + return 0 + width_multiplier = math.ceil(mem_width / 72) + 
depth_multiplier = math.ceil(omega / 4096) + return width_multiplier * depth_multiplier - return ret + def bram_estimation(self): + """Calculates resource estimation for BRAM""" + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + mem_width = Q * W * P + # assuming SDP mode RAMB18s (see UG573 Table 1-10) + # since this is HLS memory, not using the full width of a BRAM + # assuming memories up to 128 deep get implemented in LUTs + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) + or (mstyle == "auto" and self.calc_wmem() <= 128) + or (mmode == "const" and self.calc_wmem() <= 128) + or (mmode == "external") + ): + return 0 - def get_hls_compatible_weight_tensor(self, orig_weight_matrix): - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - ch = self.get_nodeattr("Channels") - k_h, k_w = self.get_nodeattr("Kernel") - wmem = self.calc_wmem() - assert orig_weight_matrix.shape == ( - ch, - 1, - k_h, - k_w, - ), """Weights matrix doesn't - have expected shape (channels, 1, kernel_size, kernel_size)""" - ret = orig_weight_matrix - if self.get_weight_datatype() == DataType["BIPOLAR"]: - # convert bipolar to binary - ret = (ret + 1) / 2 - ret = ret.reshape(ch, k_h * k_w) - # distribute rows between PEs - ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - ret = ret.reshape(1, pe, wmem, simd) - return ret + if mem_width == 1: + return math.ceil(omega / 16384) + elif mem_width == 2: + return math.ceil(omega / 8192) + elif mem_width <= 4: + return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) + elif mem_width <= 9: + return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 8)) + elif mem_width <= 18 or omega > 512: + return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 16)) + else: + return (math.ceil(omega 
/ 512)) * (math.ceil(mem_width / 32)) - def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure MH % PE == 0 - * for bipolar weights&inputs, ensure thresholds are positive - * interleave rows between PEs - * reshape into (PE, TMEM, n_thres_steps) and return + def bram_efficiency_estimation(self): + P = self.get_nodeattr("PE") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + bram16_est = self.bram_estimation() + if bram16_est == 0: + return 1 + wbits = W * P * omega + bram16_est_capacity = bram16_est * 36 * 512 + return wbits / bram16_est_capacity + + def uram_efficiency_estimation(self): + """Function for URAM efficiency estimation: actual parameter storage + needed divided by the allocated URAM storage (from estimation)""" + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = int(np.prod(self.get_nodeattr("Kernel"))) + D_out = self.get_nodeattr("Channels") + uram_est = self.uram_estimation() + if uram_est == 0: + return 1 + wbits = W * D_in * D_out + uram_est_capacity = uram_est * 72 * 4096 + return wbits / uram_est_capacity + + def lut_estimation(self): + """Calculates resource estimations for LUTs based on: + - FINN-R: An End-to-End Deep-Learning Framework for Fast + Exploration of Quantized Neural Networks + - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, + Y. Umuroglu, M. Leeser and K. Vissers + - 12. 
Sep 2018 + """ + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + # determine tdt with input and weight data types + idt = self.get_input_datatype() + A = idt.bitwidth() + # parameters from experiments in paper mentioned above + c0 = 300 + c1 = 1.1 + c2 = 0 + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "decoupled" and mstyle == "distributed") or ( + mmode == "const" and self.calc_wmem() <= 128 + ): + c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) + + # multiplication + res_type = self.get_nodeattr("resType") + if res_type == "dsp": + mult_luts = 0 + else: + mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) + # adder tree + addertree_luts = (W + A) * (2 * Q - 1) + # accumulator + acc_datatype = self.get_accumulator_datatype() + acc_bits = acc_datatype.bitwidth() + k_h, k_w = self.get_nodeattr("Kernel") + # if accDataType is not set, then it will default to INT32, which would + # be a large overestimate in most (if not all) cases. 
In this scenario, + # we would use the minimum accumulator as determined by the data types + # bound, derived in https://arxiv.org/abs/2301.13376 + alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed()) + acc_bits = min( + acc_datatype.bitwidth(), + np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), + ) + acc_luts = acc_bits + # thresholds and threshold comparators + thr_luts = 0 + comp_luts = 0 + noact = self.get_nodeattr("noActivation") + # TODO - add 'ram_style_threshold' node attribute + if noact == 0: + odt = self.get_output_datatype() + B = odt.bitwidth() + thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64 + comp_luts = (2**B - 1) * acc_bits + + return int( + c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 + ) + + def dsp_estimation(self): + # multiplication + P = self.get_nodeattr("PE") + res_type = self.get_nodeattr("resType") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + idt = self.get_input_datatype() + A = idt.bitwidth() + if res_type == "dsp": + mult_dsp = P * np.ceil((W + A) / 48) # TODO: more accurate modelling + else: + mult_dsp = 0 + return int(mult_dsp) + + def get_exp_cycles(self): + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + ch = self.get_nodeattr("Channels") + dim_h, dim_w = self.get_nodeattr("Dim") + k_h, k_w = self.get_nodeattr("Kernel") + # currently FINN supports for vvau a batch size of 1 + batch_size = 1 + # since mmv != 1 is not supported yet, we set mmv for now to 1 + mmv = 1 + exp_cycles = ((ch * k_h * k_w) / pe / simd) * batch_size * (dim_h * dim_w) / mmv + return int(exp_cycles) + + def minimize_accumulator_width(self, model): + """Minimize the accumulator bit width according to the weight values, + input data types, and size of dot product""" + weights = model.get_initializer(self.onnx_node.input[1]) + k_h, k_w = self.get_nodeattr("Kernel") + fm = self.get_nodeattr("Channels") + # put weights into the shape expected by 
calculate_matvec_accumulator_range + weights = weights.reshape(fm, k_h * k_w).transpose() + # since in the calculation the values of the weight matrix are used, + # for the bipolar case they need to be converted to bipolar + if self.get_nodeattr("binaryXnorMode"): + weights = 2 * weights - 1 + if len(self.onnx_node.input) > 2: + thresholds = model.get_initializer(self.onnx_node.input[2]) + else: + thresholds = None + idt = self.get_input_datatype() + + (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt) + # if runtime-writeable weights, then the values of the weights can + # change and we need to use the worst-case values from the datatypes + if self.get_nodeattr("runtime_writeable_weights"): + wdt = self.get_weight_datatype() + lower_worst = wdt.min() * np.ones_like(weights) + lower_range = calculate_matvec_accumulator_range(lower_worst, idt) + upper_worst = wdt.max() * np.ones_like(weights) + upper_range = calculate_matvec_accumulator_range(upper_worst, idt) + acc_min = min(min(lower_range), min(upper_range)) + acc_max = max(max(upper_range), max(upper_range)) + + # if the thresholds can be used to determine range, then adjust the range + # according to the known values of the thresholds + if thresholds is not None: + threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + # set threshold datatype (and accumulator datatype implicitly) + min_threshold = thresholds.min() + max_threshold = thresholds.max() + # clip threshold values + if max_threshold > acc_max or min_threshold < acc_min: + warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name) + thresholds = np.clip(thresholds, acc_min, acc_max) + model.set_initializer(self.onnx_node.input[2], thresholds) + threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + min_threshold = thresholds.min() + max_threshold = thresholds.max() + acc_min = min(min_threshold, acc_min) + acc_max = max(max_threshold, acc_max) + + # if the acc_range is always greater 
than 0, then acc_max <= 2^P - 1 + if acc_min >= 0: + acc_bit_width = np.log2(acc_max + 1) + acc_bit_width = math.ceil(acc_bit_width) + adt = DataType[f"UINT{acc_bit_width}"] + # if the acc_range is signed, then acc_min >= -2^{P-1} and acc_max <= + # 2^{P - 1} - 1, which means 2^{P - 1} >= max(-acc_min, 1 + acc_max) + else: + _acc_max = max(-acc_min, 1 + acc_max) + acc_bit_width = np.log2(_acc_max) + 1 + acc_bit_width = math.ceil(acc_bit_width) + adt = DataType[f"INT{acc_bit_width}"] + + # if activation, assert that the thresholds can be expressed with adt + if thresholds is not None: + assert np.vectorize(adt.allowed)( + threshold_tensor + ).all(), "Thresholds in %s can't be expressed with type %s" % ( + self.onnx_node.name, + str(adt), + ) + + # if no activation, output and accumulator datatypes are the same + if self.get_nodeattr("noActivation"): + # if this is the last node in the graph, then ensure the datatype is + # divisibly by 8 bits + if model.find_direct_successors(self.onnx_node) is None: + bw = roundup_to_integer_multiple(adt.bitwidth(), 8) + new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw)) + adt = DataType[new_adt_name] + # for no-activation nodes, output dt = acc dt + self.set_nodeattr("outputDataType", adt.name) + self.set_nodeattr("accDataType", adt.name) + + return DataType[self.get_nodeattr("accDataType")] + + def minimize_weight_bit_width(self, model): + """Minimize the bit width based on the values of the weights""" + if not self.get_nodeattr("runtime_writeable_weights"): + weights = model.get_initializer(self.onnx_node.input[1]) + w_min = weights.min() + w_max = weights.max() + if w_min < 0: + if abs(w_min) > w_max: + wdt = DataType.get_smallest_possible(w_min) + else: + wdt = DataType.get_smallest_possible(-w_max - 1) + else: + wdt = DataType.get_smallest_possible(w_max) + self.set_nodeattr("weightDataType", wdt.name) + return DataType[self.get_nodeattr("weightDataType")] + + def get_hls_compatible_threshold_tensor(self, 
orig_thres_matrix): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure MH % PE == 0 + * for bipolar weights&inputs, ensure thresholds are positive + * interleave rows between PEs + * reshape into (PE, TMEM, n_thres_steps) and return """ ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") @@ -449,6 +619,29 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): rows between PEs is not as expected (n_thres_steps)""" return ret.reshape(1, pe, tmem, n_thres_steps) + def get_hls_compatible_weight_tensor(self, orig_weight_matrix): + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + ch = self.get_nodeattr("Channels") + k_h, k_w = self.get_nodeattr("Kernel") + wmem = self.calc_wmem() + assert orig_weight_matrix.shape == ( + ch, + 1, + k_h, + k_w, + ), """Weights matrix doesn't + have expected shape (channels, 1, kernel_size, kernel_size)""" + ret = orig_weight_matrix + if self.get_weight_datatype() == DataType["BIPOLAR"]: + # convert bipolar to binary + ret = (ret + 1) / 2 + ret = ret.reshape(ch, k_h * k_w) + # distribute rows between PEs + ret = interleave_matrix_outer_dim_from_partitions(ret, pe) + ret = ret.reshape(1, pe, wmem, simd) + return ret + def make_weight_file(self, weights, weight_file_mode, weight_file_name): """Produce a file containing given weights in appropriate format for this layer. 
This file can be used for either synthesis or run-time reconfig @@ -626,384 +819,44 @@ def generate_params(self, model, path): f_thresh.write(thresholds_hls_code) f_thresh.close() - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - mem_mode = self.get_nodeattr("mem_mode") - node = self.onnx_node - - # TODO ensure codegen dir exists - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - # create a npy file fore each input of the node (in_ind is input index) - in_ind = 0 - for inputs in node.input: - # it is assumed that the first input of the node is the data input - # the second input are the weights - # the third input are the thresholds - if in_ind == 0: - assert ( - str(context[inputs].dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = context[inputs].reshape(expected_inp_shape) - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - elif in_ind > 2: - raise Exception("Unexpected input found for VectorVectorActivation") - in_ind += 1 - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - # reinterpret binary output as bipolar where needed - if self.get_output_datatype() == 
DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == self.get_normal_output_shape() - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - - if mem_mode == "external" or mem_mode == "decoupled": - wnbits = self.get_weightstream_width() - export_wdt = self.get_weight_datatype() - # we have converted bipolar weights to binary for export, - # so use it as such for weight generation - if self.get_weight_datatype() == DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) - dim_h, dim_w = self.get_nodeattr("Dim") - num_w_reps = dim_h * dim_w - - io_dict = { - "inputs": {"in0": inp, "weights": wei * num_w_reps}, - "outputs": {"out": []}, - } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] - else: - output = self.rtlsim(sim, inp) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) - - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] - self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode not in ["const", "decoupled", "external"]: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - if self.calc_tmem() != 0: - self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] - - def defines(self, var): - dim_h, dim_w = self.get_nodeattr("Dim") - numReps = 1 * dim_h * dim_w + def get_op_and_param_counts(self): k_h, k_w = self.get_nodeattr("Kernel") - innerProdDim = k_h * k_w - mem_mode = self.get_nodeattr("mem_mode") - - self.code_gen_dict["$DEFINES$"] = [ - """#define Channels1 {}\n #define InnerProdDim {}\n - #define SIMD1 {}\n #define PE1 {}\n #define numReps {}""".format( - self.get_nodeattr("Channels"), - innerProdDim, - self.get_nodeattr("SIMD"), - self.get_nodeattr("PE"), - numReps, - ) - ] - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - # note: the innermost dim is reversed for the input - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' - % ( - packed_hls_type, - 
elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - elem_bits = wdt.bitwidth() - packed_bits = self.get_weightstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = wdt.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/weights.npy" % code_gen_dir - - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - mem_mode = self.get_nodeattr("mem_mode") - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - if mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> weights_{} ("weights_{}");'.format( - self.get_weightstream_width(), self.hls_sname(), self.hls_sname() - ) - ) + fm = self.get_nodeattr("Channels") + dim_h, dim_w = self.get_nodeattr("Dim") + weight_bits = self.get_weight_datatype().bitwidth() + inp_bits = self.get_input_datatype().bitwidth() + num_repetitions = int(dim_h * dim_w) + mac_count = k_h * k_w * fm * num_repetitions + # cannonicalize op type: highest bitwidth operand first s.t. + # e.g. 
mac_8bx4b and mac_4bx8b don't appear as two different op types + bw1 = min(inp_bits, weight_bits) + bw2 = max(inp_bits, weight_bits) + mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) + weight_param_type = "param_weight_%db" % (weight_bits) + weight_count = k_h * k_w * fm + ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} + if self.get_nodeattr("noActivation") == 0: + tdt = DataType[self.get_nodeattr("accDataType")] + thres_bits = tdt.bitwidth() + thres_param_type = "param_threshold_%db" % (thres_bits) + thres_count = fm + ret_dict[thres_param_type] = thres_count + return ret_dict - def docompute(self): - mem_mode = self.get_nodeattr("mem_mode") - map_to_hls_mult_style = { - "auto": "ap_resource_dflt()", - "lut": "ap_resource_lut()", - "dsp": "ap_resource_dsp()", + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, } - tmpl_args = self.get_template_param_values() - if self.calc_tmem() == 0: - odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() - threshs = "PassThroughActivation<%s>()" % odtype_hls_str - else: - threshs = "threshs" - - if mem_mode == "const": - self.code_gen_dict["$DOCOMPUTE$"] = [ - """Vector_Vector_Activate_Batch - (in0_{}, out_{}, weights, {}, numReps, {});""".format( - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - tmpl_args["TWeightI"], - self.hls_sname(), - self.hls_sname(), - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - if wdt == DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - else: - export_wdt = wdt - wdtype_hls_str = export_wdt.get_hls_datatype_str() - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{} - (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format( - "Vector_Vector_Activate_Stream_Batch", - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - 
tmpl_args["TWeightI"], - wdtype_hls_str, - self.hls_sname(), - self.hls_sname(), - self.hls_sname(), - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], - ) - ] - else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - shape = self.get_folded_output_shape() - shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") - - # note: the innermost dim is not reversed for the output - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - shape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}( - hls::stream> &in0_{}, - hls::stream> &weights_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_weightstream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - else: - 
raise Exception( - """Please set mem_mode to "const" or "decoupled", currently no other - parameter value is supported!""" - ) - - def pragmas(self): - mem_mode = self.get_nodeattr("mem_mode") - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - if mem_mode == "const": - self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') - # the weight tensor is ap_uint [PE][WMEM] - # partition for parallel access along the PE dimension (dim 1) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") - ) - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() - ) - else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or external, - currently no other parameter value is supported!""" - ) - - if self.calc_tmem() != 0: - # TODO find a better way of checking for no pregenerated thresholds - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") - ) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") - ) - - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() mem_mode = self.get_nodeattr("mem_mode") - sname = self.hls_sname() - if mem_mode == "external": - intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) - if mem_mode == "decoupled": - # only expose axilite interface if attribute is set - runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 - if runtime_writable: - intf_names["axilite"] = 
["s_axilite"] - return intf_names + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) def code_generation_ipi(self): cmd = [] @@ -1111,207 +964,4 @@ def code_generation_ipi(self): return super().code_generation_ipi() else: raise Exception("Unrecognized mem_mode for VectorVectorActivation") - return cmd - - def uram_estimation(self): - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - omega = self.calc_wmem() - mem_width = Q * W * P - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if ( - (mmode == "decoupled" and mstyle != "ultra") - or (mmode == "const") - or (mmode == "external") - ): - return 0 - width_multiplier = math.ceil(mem_width / 72) - depth_multiplier = math.ceil(omega / 4096) - return width_multiplier * depth_multiplier - - def bram_estimation(self): - """Calculates resource estimation for BRAM""" - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - omega = self.calc_wmem() - mem_width = Q * W * P - # assuming SDP mode RAMB18s (see UG573 Table 1-10) - # since this is HLS memory, not using the full width of a BRAM - # assuming memories up to 128 deep get implemented in LUTs - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if ( - (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) - or (mstyle == "auto" and self.calc_wmem() <= 128) - or (mmode == "const" and self.calc_wmem() <= 128) - or (mmode == "external") - ): - return 0 - - if mem_width == 1: - return math.ceil(omega / 16384) - elif mem_width == 2: - return math.ceil(omega / 8192) - elif mem_width <= 4: - return 
(math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) - elif mem_width <= 9: - return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 8)) - elif mem_width <= 18 or omega > 512: - return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 16)) - else: - return (math.ceil(omega / 512)) * (math.ceil(mem_width / 32)) - - def bram_efficiency_estimation(self): - P = self.get_nodeattr("PE") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - omega = self.calc_wmem() - bram16_est = self.bram_estimation() - if bram16_est == 0: - return 1 - wbits = W * P * omega - bram16_est_capacity = bram16_est * 36 * 512 - return wbits / bram16_est_capacity - - def lut_estimation(self): - """Calculates resource estimations for LUTs based on: - - FINN-R: An End-to-End Deep-Learning Framework for Fast - Exploration of Quantized Neural Networks - - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, - Y. Umuroglu, M. Leeser and K. Vissers - - 12. Sep 2018 - """ - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - # determine tdt with input and weight data types - idt = self.get_input_datatype() - A = idt.bitwidth() - # parameters from experiments in paper mentioned above - c0 = 300 - c1 = 1.1 - c2 = 0 - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if (mmode == "decoupled" and mstyle == "distributed") or ( - mmode == "const" and self.calc_wmem() <= 128 - ): - c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) - - # multiplication - res_type = self.get_nodeattr("resType") - if res_type == "dsp": - mult_luts = 0 - else: - mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) - # adder tree - addertree_luts = (W + A) * (2 * Q - 1) - # accumulator - acc_datatype = self.get_accumulator_datatype() - acc_bits = acc_datatype.bitwidth() - k_h, k_w = self.get_nodeattr("Kernel") - # if accDataType is not set, then it will default to 
INT32, which would - # be a large overestimate in most (if not all) cases. In this scenario, - # we would use the minimum accumulator as determined by the data types - # bound, derived in https://arxiv.org/abs/2301.13376 - alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed()) - acc_bits = min( - acc_datatype.bitwidth(), - np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), - ) - acc_luts = acc_bits - # thresholds and threshold comparators - thr_luts = 0 - comp_luts = 0 - noact = self.get_nodeattr("noActivation") - # TODO - add 'ram_style_threshold' node attribute - if noact == 0: - odt = self.get_output_datatype() - B = odt.bitwidth() - thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64 - comp_luts = (2**B - 1) * acc_bits - - return int( - c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 - ) - - def dsp_estimation(self): - # multiplication - P = self.get_nodeattr("PE") - res_type = self.get_nodeattr("resType") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - idt = self.get_input_datatype() - A = idt.bitwidth() - if res_type == "dsp": - mult_dsp = P * np.ceil((W + A) / 48) # TODO: more accurate modelling - else: - mult_dsp = 0 - return int(mult_dsp) - - def get_weightstream_width(self): - """Returns weight stream width. Used only in decoupled mode.""" - if ( - self.get_nodeattr("mem_mode") == "decoupled" - or self.get_nodeattr("mem_mode") == "external" - ): - simd = self.get_nodeattr("SIMD") - pe = self.get_nodeattr("PE") - wp = self.get_weight_datatype().bitwidth() - w_width = simd * pe * wp - return w_width - else: - return 0 - - def get_weightstream_width_padded(self): - """Returns weight stream width padded to a multiple of 8. This is required - by the AXI Stream spec. 
Used in decoupled mode.""" - weight_width = self.get_weightstream_width() - return roundup_to_integer_multiple(weight_width, 8) - - def get_op_and_param_counts(self): - k_h, k_w = self.get_nodeattr("Kernel") - fm = self.get_nodeattr("Channels") - dim_h, dim_w = self.get_nodeattr("Dim") - weight_bits = self.get_weight_datatype().bitwidth() - inp_bits = self.get_input_datatype().bitwidth() - num_repetitions = int(dim_h * dim_w) - mac_count = k_h * k_w * fm * num_repetitions - # cannonicalize op type: highest bitwidth operand first s.t. - # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types - bw1 = min(inp_bits, weight_bits) - bw2 = max(inp_bits, weight_bits) - mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) - weight_param_type = "param_weight_%db" % (weight_bits) - weight_count = k_h * k_w * fm - ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} - if self.get_nodeattr("noActivation") == 0: - tdt = DataType[self.get_nodeattr("accDataType")] - thres_bits = tdt.bitwidth() - thres_param_type = "param_threshold_%db" % (thres_bits) - thres_count = fm - ret_dict[thres_param_type] = thres_count - return ret_dict - - def derive_characteristic_fxns(self, period): - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - }, - "outputs": {"out": []}, - } - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode in ["decoupled", "external"]: - n_weight_inps = self.calc_wmem() - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + return cmd \ No newline at end of file From bc44a4d487590c857652d3dfd4ab0a11962816d1 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 1 Feb 2024 14:36:31 +0000 Subject: [PATCH 111/112] [vvau hls-op]: refactored HLS custom-op VVAU --- .../hls/vectorvectoractivation_hls.py | 372 ++++++++++++++++++ 
1 file changed, 372 insertions(+) create mode 100644 src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py diff --git a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py new file mode 100644 index 0000000000..51de49f1c7 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py @@ -0,0 +1,372 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import math +import numpy as np +import os +import textwrap +import warnings +from qonnx.core.datatype import DataType +from qonnx.util.basic import ( + calculate_matvec_accumulator_range, + interleave_matrix_outer_dim_from_partitions, + roundup_to_integer_multiple, +) + +from finn.util.data_packing import ( + npy_to_rtlsim_input, + numpy_to_hls_code, + pack_innermost_dim_as_hex_string, + rtlsim_output_to_npy, +) +from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend + +class VectorVectorActivation_hls(VectorVectorActivation, HLSBackend): + """Corresponds to finn-hlslib Vector_Vector_Activate_Batch function""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(VectorVectorActivation.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def get_template_param_values(self): + """Returns the template parameter values according to input, output and weight + data types.""" + ret = dict() + inp_hls_str = self.get_input_datatype().get_hls_datatype_str() + out_hls_str = self.get_output_datatype().get_hls_datatype_str() + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + # out_is_binary = self.get_output_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): + raise Exception("True binary (non-bipolar) inputs not yet supported") + inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] + wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and 
bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) + # fill in TSrcI and TWeightI + # TODO check these with Giulio + # TODO handle non-bipolar binary inputs + if inp_is_bipolar and wt_is_bipolar: + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and wt_is_bipolar: + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Recast" + elif inp_is_bipolar and (not wt_is_bipolar): + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and (not wt_is_bipolar): + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Identity" + + # fill in TDstI + ret["TDstI"] = "Slice<%s>" % out_hls_str + + return ret + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] + self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode not in ["const", "decoupled", "external"]: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) + if self.calc_tmem() != 0: + self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] + + def defines(self, var): + dim_h, dim_w = self.get_nodeattr("Dim") + numReps = 1 * dim_h * dim_w + k_h, k_w = self.get_nodeattr("Kernel") + innerProdDim = k_h * k_w + mem_mode = self.get_nodeattr("mem_mode") + + self.code_gen_dict["$DEFINES$"] = [ + """#define Channels1 {}\n #define InnerProdDim {}\n + #define SIMD1 {}\n #define PE1 {}\n #define numReps {}""".format( + self.get_nodeattr("Channels"), + innerProdDim, + self.get_nodeattr("SIMD"), + self.get_nodeattr("PE"), + numReps, + ) + ] + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = 
self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + # note: the innermost dim is reversed for the input + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + elem_bits = wdt.bitwidth() + packed_bits = self.get_weightstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = wdt.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/weights.npy" % code_gen_dir + + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + if mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> weights_{} ("weights_{}");'.format( + self.get_weightstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + mem_mode = 
self.get_nodeattr("mem_mode") + map_to_hls_mult_style = { + "auto": "ap_resource_dflt()", + "lut": "ap_resource_lut()", + "dsp": "ap_resource_dsp()", + } + tmpl_args = self.get_template_param_values() + if self.calc_tmem() == 0: + odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() + threshs = "PassThroughActivation<%s>()" % odtype_hls_str + else: + threshs = "threshs" + + if mem_mode == "const": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Vector_Vector_Activate_Batch + (in0_{}, out_{}, weights, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + if wdt == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + else: + export_wdt = wdt + wdtype_hls_str = export_wdt.get_hls_datatype_str() + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{} + (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format( + "Vector_Vector_Activate_Stream_Batch", + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + wdtype_hls_str, + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + shape = self.get_folded_output_shape() + shape_cpp_str 
= str(shape).replace("(", "{").replace(")", "}") + + # note: the innermost dim is not reversed for the output + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + shape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}( + hls::stream> &in0_{}, + hls::stream> &weights_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_weightstream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + else: + raise Exception( + """Please set mem_mode to "const" or "decoupled", currently no other + parameter value is supported!""" + ) + + def pragmas(self): + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + if mem_mode == "const": + self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') + # the weight tensor is ap_uint [PE][WMEM] + # partition for parallel access along the PE dimension (dim 1) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") + ) + elif mem_mode == 
"decoupled" or mem_mode == "external": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() + ) + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or external, + currently no other parameter value is supported!""" + ) + + if self.calc_tmem() != 0: + # TODO find a better way of checking for no pregenerated thresholds + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") + ) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") + ) + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) + if mem_mode == "decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names \ No newline at end of file From faabc0fe32392975e21f4be58cc6352db414f40f Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 1 Feb 2024 14:37:12 +0000 Subject: [PATCH 112/112] [convert-to-hw]: added transformations to infer binary-MVAU and VVAU --- .../fpgadataflow/convert_to_hw_layers.py | 279 ++++++++++++++++++ 1 file changed, 279 insertions(+) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index eb6dd337f5..26cd0b74ad 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1281,6 +1281,139 @@ def apply(self, model): return (model, graph_modified) +class InferBinaryMatrixVectorActivation(Transformation): + """Convert 
XnorPopcountMatMul layers to + MatrixVectorActivation layers. Any immediately following MultiThreshold + layers will also be absorbed into the MVTU.""" + + def __init__(self, mem_mode="const"): + super().__init__() + self.mem_mode = mem_mode + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "XnorPopcountMatMul": + mm_input = n.input[0] + mm_weight = n.input[1] + mm_output = n.output[0] + mm_in_shape = model.get_tensor_shape(mm_input) + mm_out_shape = model.get_tensor_shape(mm_output) + assert model.get_tensor_datatype(mm_input) == DataType["BINARY"], ( + n.name + + """: First + input for xnorpopcount is not Wset to FINN DataType BINARY.""" + ) + assert model.get_tensor_datatype(mm_weight) == DataType["BINARY"], ( + n.name + + """: Second + input (weights) for xnorpopcount is not set to FINN DataType BINARY.""" + ) + idt = DataType["BINARY"] + wdt = DataType["BINARY"] + mm_output = n.output[0] + W = model.get_initializer(mm_weight) + # extract weight shape, note that ONNX and finn-hlslib + # make different assumptions about dim order here + # ONNX assumes W has (in, out) shape + # finn-hlslib assumes W has (out, in) shape + mh = int(W.shape[1]) + mw = int(W.shape[0]) + # create node with no parallelization first + pe = 1 + simd = 1 + wmem = mw * mh // (pe * simd) + assert mw * mh == wmem * pe * simd, ( + n.name + + """: Requirement (MW * MH) divisiable by + (WMEM * PE * SIMD) is violated.""" + ) + # see if we have any following thresholds + consumer = model.find_consumer(mm_output) + if consumer is not None and consumer.op_type == "MultiThreshold": + # TODO ensure integer thresholds? + # create MVTU (i.e. 
including activation) + mt_output = consumer.output[0] + mt_out_shape = model.get_tensor_shape(mt_output) + mt_thres = consumer.input[1] + T = model.get_initializer(mt_thres) + assert T.shape[0] == 1 or T.shape[0] == mh, ( + consumer.name + + """: First dimension of + thresholds neither 1 nor MH.""" + ) + odt = model.get_tensor_datatype(mt_output) + if odt.bitwidth() == 1: + # covers both bipolar and binary + actval = 0 + else: + actval = odt.min() + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mt_output, mt_out_shape) + # create and insert new MatrixVectorActivation node + new_node = helper.make_node( + "MatrixVectorActivation", + [mm_input, mm_weight, mt_thres], + [mt_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=actval, + binaryXnorMode=1, + noActivation=0, + numInputVectors=list(mm_in_shape[:-1]), + mem_mode=self.mem_mode, + name=n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old nodes + graph.node.remove(n) + graph.node.remove(consumer) + graph_modified = True + else: + # no activation, matmul only + odt = model.get_tensor_datatype(mm_output) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mm_output, mm_out_shape) + # create and insert new MatrixVectorActivation node + new_node = helper.make_node( + "MatrixVectorActivation", + [mm_input, mm_weight], + [mm_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=0, + binaryXnorMode=1, + noActivation=1, + numInputVectors=list(mm_in_shape[:-1]), + mem_mode=self.mem_mode, + name=n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(n) + graph_modified = True + if graph_modified: + model = 
model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + class InferQuantizedMatrixVectorActivation(Transformation): """Convert MatMul layers with quantized inputs and weights to MatrixVectorActivation layers.""" @@ -1415,4 +1548,150 @@ def apply(self, model): if graph_modified: model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) + return (model, graph_modified) + +class InferVectorVectorActivation(Transformation): + """Convert MatMul layers with quantized inputs and weights to + VectorVectorActivation layers, if the sparsity annotation + of the weight matrix indicates that the MatMul layer belongs to + a depthwise convolution. Any immediately following MultiThreshold + layers will also be absorbed into the VVAU.""" + + def __init__(self, mem_mode="const"): + super().__init__() + self.mem_mode = mem_mode + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is not None: + sparsity = model.get_tensor_sparsity(n.input[1]) + try: + k_h, k_w = sparsity["dw"]["kernel_shape"] + except KeyError: + raise Exception( + n.name + + """: sparsity annotation doesn't indicate that MatMul + belongs to a depthwise convolution.""" + ) + + mm_input = n.input[0] + mm_weight = n.input[1] + mm_output = n.output[0] + mm_in_shape = model.get_tensor_shape(mm_input) + mm_out_shape = model.get_tensor_shape(mm_output) + idt = model.get_tensor_datatype(mm_input) + wdt = model.get_tensor_datatype(mm_weight) + if idt.is_integer() and wdt.is_integer(): + mm_output = n.output[0] + W = model.get_initializer(mm_weight) + # infer dense weight tensor from sparse weight matrix + # kernel size (k_h, k_w) which was extracted above and the value of + # the channels is used. 
+ # the weight matrix has a shape of (k_h * k_w * Channels, Channels) + # we need to reverse the creation of the sparse weight matrix + # to achieve a weight tensor of shape (Channels, 1, k_h, k_w) + channels = int(W.shape[1]) + # transpose to achieve a shape of (k_h * k_w * Channels, Channels) + W = W.T + # reshape to (Channels, k_h, k_w, Channels) to transpose afterwards + # to (Channels, Channels, k_h, k_w) + W = W.reshape(channels, k_h, k_w, channels) + W = W.transpose(0, 3, 1, 2) + # now we can extract the values using a for loop over the channels + # and fill a zero numpy array in the correct shape + w_tensor = np.zeros((channels, 1, k_h, k_w), dtype=np.float32) + for ch in range(channels): + w_tensor[ch][0] = W[ch][ch] + model.set_initializer(mm_weight, w_tensor) + model.set_tensor_shape(mm_weight, (channels, 1, k_h, k_w)) + # create node with pe=channels as default + pe = channels + # see if we have any following thresholds + consumer = model.find_consumer(mm_output) + if consumer is not None and consumer.op_type == "MultiThreshold": + # create VVAU (i.e. including activation) + mt_output = consumer.output[0] + mt_out_shape = model.get_tensor_shape(mt_output) + mt_thres = consumer.input[1] + T = model.get_initializer(mt_thres) + assert T.shape[0] == 1 or T.shape[0] == channels, ( + consumer.name + + """: First dimension of + thresholds neither 1 nor Channels.""" + ) + odt = model.get_tensor_datatype(mt_output) + scale = getCustomOp(consumer).get_nodeattr("out_scale") + assert scale == 1.0, ( + consumer.name + ": out_scale must be equal to 1.0 for HLS conversion." + ) + actval = getCustomOp(consumer).get_nodeattr("out_bias") + assert int(actval) == actval, ( + consumer.name + ": out_bias must be integer for HLS conversion." 
+ ) + actval = int(actval) + assert (not odt.signed()) or (actval < 0), ( + consumer.name + ": Signed output requres actval < 0" + ) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mt_output, mt_out_shape) + # create and insert new VectorVectorActivation node + new_node = helper.make_node( + "VectorVectorActivation", + [mm_input, mm_weight, mt_thres], + [mt_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + resType="lut", + PE=pe, + Dim=[mm_in_shape[1], mm_in_shape[2]], + Channels=channels, + Kernel=[k_h, k_w], + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=actval, + noActivation=0, + name="VectorVectorActivation_" + n.name, + mem_mode=self.mem_mode, + ) + graph.node.insert(node_ind, new_node) + # remove old nodes + graph.node.remove(n) + graph.node.remove(consumer) + graph_modified = True + else: + # no activation, matmul only + odt = model.get_tensor_datatype(mm_output) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mm_output, mm_out_shape) + # create and insert new VVAU node + new_node = helper.make_node( + "VectorVectorActivation", + [mm_input, mm_weight], + [mm_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + resType="lut", + PE=pe, + Dim=[mm_in_shape[1], mm_in_shape[2]], + Channels=channels, + Kernel=[k_h, k_w], + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=0, + noActivation=1, + name="VectorVectorActivation_" + n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(n) + graph_modified = True + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) return (model, graph_modified) \ No newline at end of file