From 3002e6239903af9f9f9444ef3fbbb8935ba2bb92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Tue, 20 Sep 2022 11:08:59 +0100 Subject: [PATCH 001/235] HDL for new thresholding by binary search. --- finn-rtllib/thresholding/hdl/thresholding.sv | 153 ++++++++++++++ .../thresholding/hdl/thresholding_axi.sv | 198 ++++++++++++++++++ .../hdl/thresholding_axi_wrapper.v | 122 +++++++++++ 3 files changed, 473 insertions(+) create mode 100644 finn-rtllib/thresholding/hdl/thresholding.sv create mode 100644 finn-rtllib/thresholding/hdl/thresholding_axi.sv create mode 100644 finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv new file mode 100644 index 0000000000..93ccdc51c5 --- /dev/null +++ b/finn-rtllib/thresholding/hdl/thresholding.sv @@ -0,0 +1,153 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Pipelined thresholding by binary search. + * @author Thomas B. Preußer + * + * @description + * Produces the N-bit count of those among 2^N-1 thresholds that are not + * larger than the corresponding input: + * y = Σ(T_i <= x) + * The result is computed by binary search. The runtime-configurable + * thresholds must be written in ascending order: + * i < j => T_i < T_j + * The design supports channel folding allowing each input to be processed + * with respect to a selectable set of thresholds. The corresponding + * threshold configuration relies on a channel address prefix. Inputs are + * accompanied by a channel selector. + *****************************************************************************/ +module thresholding #( + int unsigned N, // output precision + int unsigned M, // input/threshold precision + int unsigned C, // number of channels + + localparam int unsigned C_BITS = C < 2? 
1 : $clog2(C) +)( + // Global Control + input logic clk, + input logic rst, + + // Threshold Configuration + input logic twe, + input logic [$clog2(C)+N-1:0] twa, + input logic [ M-1:0] twd, + + // Clock Enable for Stream Processing + input logic en, + + // Input Stream + input logic ivld, + input logic [C_BITS-1:0] icnl, // Ignored for C == 1 + input logic [M -1:0] idat, + + // Output Stream + output logic ovld, + output logic [C_BITS-1:0] ocnl, + output logic [N -1:0] odat +); + + // Pipeline Links & Feed + typedef struct packed { + logic vld; // Valid data identification + logic [C_BITS-1:0] cnl; // Channel + logic [M -1:0] val; // Original input value + logic [0:N-1] res; // Assembling result with valid prefix [0:stage] after stage #stage + } pipe_t; + uwire pipe_t pipe[0:N]; + assign pipe[0] = pipe_t'{ vld: ivld, cnl: icnl, val: idat, res: {N{1'bx}} }; // Feed original input + + // Stages: 0, 1, ..., N-1 + uwire [0:N-1] tws = (twa[N-1:0]+1) & ~twa[N-1:0]; // Write Select per stage by address suffix + for(genvar stage = 0; stage < N; stage++) begin : genStages + + // Threshold Memory + uwire [M-1:0] thresh; + if(1) begin : blkUpdate + + // Write control: local select from global address + uwire we = twe && tws[stage]; + if((C == 1) && (stage == 0)) begin + logic [M-1:0] Thresh = 'x; + always_ff @(posedge clk) begin + if(rst) Thresh <= 'x; + else if(we) Thresh <= twd; + end + assign thresh = Thresh; + end + else begin + logic [M-1:0] Threshs[C * 2**stage]; + uwire [$clog2(C)+stage-1:0] wa = twa[$left(twa):N-stage]; + uwire [$clog2(C)+stage-1:0] ra; + if(C > 1) assign ra[stage+:C_BITS] = pipe[stage].cnl; + if(stage) assign ra[stage-1:0] = pipe[stage].res[0:stage-1]; + + // Write + always_ff @(posedge clk) begin + if(we) Threshs[wa] <= twd; + end + + // Read + logic [M-1:0] RdReg; + always_ff @(posedge clk) begin + if(en) RdReg <= Threshs[ra]; + end + assign thresh = RdReg; + end + + end : blkUpdate + + // Pipeline regs simply copying the input + pipe_t State = '{ 
vld: 0, cnl: 'x, val: 'x, res: 'x }; + always_ff @(posedge clk) begin + if(rst) State <= '{ vld: 0, cnl: 'x, val: 'x, res: 'x }; + else if(en) State <= pipe[stage]; + end + + // Assemble pipeline data + logic [0:N-1] res; + always_comb begin + res = State.res; + res[stage] = thresh <= State.val; // Patch in next result bit + end + assign pipe[stage+1] = '{ + vld: State.vld, + cnl: State.cnl, + val: State.val, + res: res + }; + + end : genStages + + // Output + assign ovld = pipe[N].vld; + assign ocnl = pipe[N].cnl; + assign odat = pipe[N].res; + +endmodule : thresholding diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv new file mode 100644 index 0000000000..71e54c5ca0 --- /dev/null +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -0,0 +1,198 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief All-AXI interface adapter for thresholding module. + * @author Thomas B. Preußer + *****************************************************************************/ + +module thresholding_axi #( + int unsigned N, // output precision + int unsigned M, // input/threshold precision + int unsigned C // Channels +)( + //- Global Control ------------------ + input logic ap_clk, + input logic ap_rst_n, + + //- AXI Lite ------------------------ + // Writing + input logic s_axilite_AWVALID, + output logic s_axilite_AWREADY, + input logic [$clog2(C)+N-1:0] s_axilite_AWADDR, + + input logic s_axilite_WVALID, + output logic s_axilite_WREADY, + input logic [31:0] s_axilite_WDATA, + input logic [ 3:0] s_axilite_WSTRB, + + output logic s_axilite_BVALID, + input logic s_axilite_BREADY, + output logic [1:0] s_axilite_BRESP, + + // Reading + input logic s_axilite_ARVALID, + output logic s_axilite_ARREADY, + input logic [0:0] s_axilite_ARADDR, + + output logic s_axilite_RVALID, + input logic s_axilite_RREADY, + output logic [31:0] s_axilite_RDATA, + output logic [ 1:0] s_axilite_RRESP, + + //- AXI Stream - Input -------------- + output logic s_axis_tready, + input logic s_axis_tvalid, + input logic [((M+7)/8)*8-1:0] s_axis_tdata, + + //- AXI Stream - Output ------------- + input logic m_axis_tready, + output logic m_axis_tvalid, + output logic [((N+7)/8)*8-1:0] m_axis_tdata +); + //- Global Control 
------------------------------------------------------ + uwire clk = ap_clk; + uwire rst = !ap_rst_n; + + //- AXI Lite: Threshold Configuration ----------------------------------- + uwire twe; + uwire [$clog2(C)+N-1:0] twa; + uwire [ M-1:0] twd; + if(1) begin : blkAxiLite + logic WABusy = 0; + logic WDBusy = 0; + logic [$clog2(C)+N-1:0] Addr = 'x; + logic [ M-1:0] Data = 'x; + + assign twe = WABusy && WDBusy; + assign twa = Addr; + assign twd = Data; + + uwire clr_wr = rst || (twe && s_axilite_BREADY); + always_ff @(posedge clk) begin : blockName + if(clr_wr) begin + WABusy <= 0; + Addr <= 'x; + WDBusy <= 0; + Data <= 'x; + end + else begin + if(!WABusy) begin + WABusy <= s_axilite_AWVALID; + Addr <= s_axilite_AWADDR[$clog2(C)+N-1:0]; + end + if(!WDBusy) begin + WDBusy <= s_axilite_WVALID; + Data <= s_axilite_WDATA[M-1:0]; + end + end + end + assign s_axilite_AWREADY = !WABusy; + assign s_axilite_WREADY = !WDBusy; + assign s_axilite_BVALID = WABusy && WDBusy; + assign s_axilite_BRESP = '0; // OK + + // Answer all reads with '1 + logic RValid = 0; + uwire clr_rd = rst || (RValid && s_axilite_RREADY); + always_ff @(posedge clk) begin + if(clr_rd) RValid <= 0; + else if(!RValid) RValid <= s_axilite_ARVALID; + end + assign s_axilite_ARREADY = !RValid; + assign s_axilite_RVALID = RValid; + assign s_axilite_RDATA = '1; + assign s_axilite_RRESP = '0; // OK + + end : blkAxiLite + + //- IO-Sandwich with two-stage output buffer for containing a local enable + uwire en; + uwire [N-1:0] odat; + uwire ovld; + if(1) begin : blkOutputDecouple + typedef struct { + logic vld; + logic [N-1:0] dat; + } buf_t; + buf_t Buf[2] = '{ default: '{ vld: 0, dat: 'x } }; + always_ff @(posedge clk) begin + if(rst) Buf <= '{ default: '{ vld: 0, dat: 'x } }; + else begin + if(!Buf[1].vld || m_axis_tready) begin + Buf[1] <= '{ + vld: Buf[0].vld || ovld, + dat: Buf[0].vld? 
Buf[0].dat : odat + }; + end + Buf[0].vld <= Buf[1].vld && !m_axis_tready && (Buf[0].vld || ovld); + if(!Buf[0].vld) Buf[0].dat <= odat; + end + end + assign en = !Buf[0].vld; + + assign m_axis_tvalid = Buf[1].vld; + assign m_axis_tdata = Buf[1].dat; + + end : blkOutputDecouple + + localparam int unsigned C_BITS = C < 2? 1 : $clog2(C); + uwire ivld = s_axis_tvalid; + uwire [C_BITS-1:0] icnl; + uwire [M -1:0] idat = s_axis_tdata[M-1:0]; + assign s_axis_tready = en; + if(C == 1) assign icnl = 'x; + else begin + logic [C_BITS-1:0] Chnl = 0; + logic Last = 0; + uwire inc = ivld && en; + uwire clr = rst || (Last && inc); + always_ff @(posedge clk) begin + if(clr) begin + Chnl <= 0; + Last <= 0; + end + else if(inc) begin + Chnl <= Chnl + 1; + Last <= (~Chnl & (C-2)) == 0; + end + end + assign icnl = Chnl; + end + + // Core Thresholding Module + thresholding #(.N(N), .M(M), .C(C)) core ( + .clk, .rst, + .twe, .twa, .twd, + .en, + .ivld, .icnl, .idat, + .ovld, .ocnl(), .odat + ); + +endmodule : thresholding_axi diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v new file mode 100644 index 0000000000..bb6b17b32f --- /dev/null +++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v @@ -0,0 +1,122 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief IPI-compatible Verilog wrapper for thresholding_axi module. + * @author Thomas B. 
Preußer + *****************************************************************************/ + +module thresholding_axi_wrapper #( + parameter N, // output precision + parameter M, // input/threshold precision + parameter C, // Channels + parameter C_BITS //= $clog2(C) +)( + //- Global Control ------------------ + input ap_clk, + input ap_rst_n, + + //- AXI Lite ------------------------ + // Writing + input s_axilite_AWVALID, + output s_axilite_AWREADY, + input [C_BITS+N-1:0] s_axilite_AWADDR, + + input s_axilite_WVALID, + output s_axilite_WREADY, + input [31:0] s_axilite_WDATA, + input [ 3:0] s_axilite_WSTRB, + + output s_axilite_BVALID, + input s_axilite_BREADY, + output [1:0] s_axilite_BRESP, + + // Reading + input s_axilite_ARVALID, + output s_axilite_ARREADY, + input [0:0] s_axilite_ARADDR, + + output s_axilite_RVALID, + input s_axilite_RREADY, + output [31:0] s_axilite_RDATA, + output [ 1:0] s_axilite_RRESP, + + //- AXI Stream - Input -------------- + output s_axis_tready, + input s_axis_tvalid, + input [((M+7)/8)*8-1:0] s_axis_tdata, + + //- AXI Stream - Output ------------- + input m_axis_tready, + output m_axis_tvalid, + output [((N+7)/8)*8-1:0] m_axis_tdata +); + + thresholding_axi #(.N(N), .M(M), .C(C)) inst ( + //- Global Control ------------------ + .ap_clk(ap_clk), + .ap_rst_n(ap_rst_n), + + //- AXI Lite ------------------------ + // Writing + .s_axilite_AWVALID(s_axilite_AWVALID), + .s_axilite_AWREADY(s_axilite_AWREADY), + .s_axilite_AWADDR(s_axilite_AWADDR), + + .s_axilite_WVALID(s_axilite_WVALID), + .s_axilite_WREADY(s_axilite_WREADY), + .s_axilite_WDATA(s_axilite_WDATA), + .s_axilite_WSTRB(s_axilite_WSTRB), + + .s_axilite_BVALID(s_axilite_BVALID), + .s_axilite_BREADY(s_axilite_BREADY), + .s_axilite_BRESP(s_axilite_BRESP), + + // Reading + .s_axilite_ARVALID(s_axilite_ARVALID), + .s_axilite_ARREADY(s_axilite_ARREADY), + .s_axilite_ARADDR(s_axilite_ARADDR), + + .s_axilite_RVALID(s_axilite_RVALID), + .s_axilite_RREADY(s_axilite_RREADY), + 
.s_axilite_RDATA(s_axilite_RDATA), + .s_axilite_RRESP(s_axilite_RRESP), + + //- AXI Stream - Input -------------- + .s_axis_tready(s_axis_tready), + .s_axis_tvalid(s_axis_tvalid), + .s_axis_tdata(s_axis_tdata), + + //- AXI Stream - Output ------------- + .m_axis_tready(m_axis_tready), + .m_axis_tvalid(m_axis_tvalid), + .m_axis_tdata(m_axis_tdata) + ); + +endmodule : thresholding_axi_wrapper From 3c92c2fc460fb5e45fdb0dfcc0b92c572ae65ce7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Tue, 20 Sep 2022 13:33:01 +0100 Subject: [PATCH 002/235] IP core support files for thresholding module. --- finn-rtllib/thresholding/component.xml | 817 ++++++++++++++++++ .../xgui/thresholding_axi_wrapper_v1_0.tcl | 74 ++ 2 files changed, 891 insertions(+) create mode 100644 finn-rtllib/thresholding/component.xml create mode 100644 finn-rtllib/thresholding/xgui/thresholding_axi_wrapper_v1_0.tcl diff --git a/finn-rtllib/thresholding/component.xml b/finn-rtllib/thresholding/component.xml new file mode 100644 index 0000000000..0a56f93316 --- /dev/null +++ b/finn-rtllib/thresholding/component.xml @@ -0,0 +1,817 @@ + + + amd.com + user + thresholding_axi_wrapper + 1.0 + + + m_axis + + + + + + + TDATA + + + m_axis_tdata + + + + + TVALID + + + m_axis_tvalid + + + + + TREADY + + + m_axis_tready + + + + + + s_axis + + + + + + + TDATA + + + s_axis_tdata + + + + + TVALID + + + s_axis_tvalid + + + + + TREADY + + + s_axis_tready + + + + + + s_axilite + + + + + + + + + AWADDR + + + s_axilite_AWADDR + + + + + AWVALID + + + s_axilite_AWVALID + + + + + AWREADY + + + s_axilite_AWREADY + + + + + WDATA + + + s_axilite_WDATA + + + + + WSTRB + + + s_axilite_WSTRB + + + + + WVALID + + + s_axilite_WVALID + + + + + WREADY + + + s_axilite_WREADY + + + + + BRESP + + + s_axilite_BRESP + + + + + BVALID + + + s_axilite_BVALID + + + + + BREADY + + + s_axilite_BREADY + + + + + ARADDR + + + s_axilite_ARADDR + + + + + ARVALID + + + s_axilite_ARVALID + + + + + ARREADY + + + 
s_axilite_ARREADY + + + + + RDATA + + + s_axilite_RDATA + + + + + RRESP + + + s_axilite_RRESP + + + + + RVALID + + + s_axilite_RVALID + + + + + RREADY + + + s_axilite_RREADY + + + + + + ap_rst_n + + + + + + + RST + + + ap_rst_n + + + + + + POLARITY + ACTIVE_LOW + + + + + ap_clk + + + + + + + CLK + + + ap_clk + + + + + + ASSOCIATED_RESET + ap_rst_n + + + ASSOCIATED_BUSIF + m_axis:s_axis:s_axilite + + + + + + + s_axilite + s_axilite + + reg0 + reg0 + 0x0 + 4096 + 32 + register + + + + + + + xilinx_anylanguagesynthesis + Synthesis + :vivado.xilinx.com:synthesis + Verilog + thresholding_axi_wrapper + + xilinx_anylanguagesynthesis_view_fileset + + + + viewChecksum + 5cc8f7a9 + + + + + xilinx_xpgui + UI Layout + :vivado.xilinx.com:xgui.ui + + xilinx_xpgui_view_fileset + + + + viewChecksum + c456596c + + + + + + + ap_clk + + in + + + std_logic + xilinx_anylanguagesynthesis + + + + + + ap_rst_n + + in + + + std_logic + xilinx_anylanguagesynthesis + + + + + + s_axilite_AWVALID + + in + + + std_logic + xilinx_anylanguagesynthesis + + + + 0 + + + + + s_axilite_AWREADY + + out + + + std_logic + xilinx_anylanguagesynthesis + + + + + + s_axilite_AWADDR + + in + + 3 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + + + + 0 + + + + + s_axilite_WVALID + + in + + + std_logic + xilinx_anylanguagesynthesis + + + + 0 + + + + + s_axilite_WREADY + + out + + + std_logic + xilinx_anylanguagesynthesis + + + + + + s_axilite_WDATA + + in + + 31 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + + + + 0 + + + + + s_axilite_WSTRB + + in + + 3 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + + + + 1 + + + + + s_axilite_BVALID + + out + + + std_logic + xilinx_anylanguagesynthesis + + + + + + s_axilite_BREADY + + in + + + std_logic + xilinx_anylanguagesynthesis + + + + 0 + + + + + s_axilite_BRESP + + out + + 1 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + + + + + + s_axilite_ARVALID + + in + + + std_logic + xilinx_anylanguagesynthesis + + + + 0 + + + 
+ + s_axilite_ARREADY + + out + + + std_logic + xilinx_anylanguagesynthesis + + + + + + s_axilite_ARADDR + + in + + 0 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + + + + 0 + + + + + s_axilite_RVALID + + out + + + std_logic + xilinx_anylanguagesynthesis + + + + + + s_axilite_RREADY + + in + + + std_logic + xilinx_anylanguagesynthesis + + + + 0 + + + + + s_axilite_RDATA + + out + + 31 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + + + + + + s_axilite_RRESP + + out + + 1 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + + + + + + s_axis_tready + + out + + + std_logic + xilinx_anylanguagesynthesis + + + + + + s_axis_tvalid + + in + + + std_logic + xilinx_anylanguagesynthesis + + + + + + s_axis_tdata + + in + + 15 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + + + + 0 + + + + + m_axis_tready + + in + + + std_logic + xilinx_anylanguagesynthesis + + + + 1 + + + + + m_axis_tvalid + + out + + + std_logic + xilinx_anylanguagesynthesis + + + + + + m_axis_tdata + + out + + 7 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + + + + + + + + N + N + 4 + + + M + M + 16 + + + C + C + 1 + + + C_BITS + C Bits + 0 + + + + + + choice_list_74b5137e + ACTIVE_HIGH + ACTIVE_LOW + + + + + xilinx_anylanguagesynthesis_view_fileset + + hdl/thresholding.sv + systemVerilogSource + + + hdl/thresholding_axi.sv + systemVerilogSource + + + hdl/thresholding_axi_wrapper.v + verilogSource + CHECKSUM_2ec027ae + + + + xilinx_xpgui_view_fileset + + xgui/thresholding_axi_wrapper_v1_0.tcl + tclSource + CHECKSUM_c456596c + XGUI_VERSION_2 + + + + thresholding_axi_wrapper_v1_0 + + + N + N + 4 + + + M + M + 16 + + + C + C + 1 + + + C_BITS + C_BITS + 0 + + + Component_Name + thresholding_axi_wrapper_v1_0 + + + + + + virtex7 + qvirtex7 + versal + kintex7 + kintex7l + qkintex7 + qkintex7l + akintex7 + artix7 + artix7l + aartix7 + qartix7 + zynq + qzynq + azynq + spartan7 + aspartan7 + virtexu + zynquplus + virtexuplus + virtexuplusHBM + virtexuplus58g 
+ kintexuplus + artixuplus + kintexu + + + /UserIP + + thresholding_axi_wrapper_v1_0 + package_project + AMD + 2 + 2022-09-20T12:31:16Z + + + 2022.1 + + + + + + + + + diff --git a/finn-rtllib/thresholding/xgui/thresholding_axi_wrapper_v1_0.tcl b/finn-rtllib/thresholding/xgui/thresholding_axi_wrapper_v1_0.tcl new file mode 100644 index 0000000000..02c373e8f2 --- /dev/null +++ b/finn-rtllib/thresholding/xgui/thresholding_axi_wrapper_v1_0.tcl @@ -0,0 +1,74 @@ +# Definitional proc to organize widgets for parameters. +proc init_gui { IPINST } { + ipgui::add_param $IPINST -name "Component_Name" + #Adding Page + set Page_0 [ipgui::add_page $IPINST -name "Page 0"] + set C [ipgui::add_param $IPINST -name "C" -parent ${Page_0}] + set_property tooltip {Channel Count} ${C} + set C_BITS [ipgui::add_param $IPINST -name "C_BITS" -parent ${Page_0}] + set_property tooltip {Must be clog2(C)} ${C_BITS} + set M [ipgui::add_param $IPINST -name "M" -parent ${Page_0}] + set_property tooltip {Input Precision} ${M} + set N [ipgui::add_param $IPINST -name "N" -parent ${Page_0}] + set_property tooltip {Output Precision} ${N} + + +} + +proc update_PARAM_VALUE.C { PARAM_VALUE.C } { + # Procedure called to update C when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.C { PARAM_VALUE.C } { + # Procedure called to validate C + return true +} + +proc update_PARAM_VALUE.C_BITS { PARAM_VALUE.C_BITS } { + # Procedure called to update C_BITS when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.C_BITS { PARAM_VALUE.C_BITS } { + # Procedure called to validate C_BITS + return true +} + +proc update_PARAM_VALUE.M { PARAM_VALUE.M } { + # Procedure called to update M when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.M { PARAM_VALUE.M } { + # Procedure called to validate M + return true +} + +proc update_PARAM_VALUE.N { PARAM_VALUE.N } { + # Procedure called to update N when any of the 
dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.N { PARAM_VALUE.N } { + # Procedure called to validate N + return true +} + + +proc update_MODELPARAM_VALUE.N { MODELPARAM_VALUE.N PARAM_VALUE.N } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.N}] ${MODELPARAM_VALUE.N} +} + +proc update_MODELPARAM_VALUE.M { MODELPARAM_VALUE.M PARAM_VALUE.M } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.M}] ${MODELPARAM_VALUE.M} +} + +proc update_MODELPARAM_VALUE.C { MODELPARAM_VALUE.C PARAM_VALUE.C } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.C}] ${MODELPARAM_VALUE.C} +} + +proc update_MODELPARAM_VALUE.C_BITS { MODELPARAM_VALUE.C_BITS PARAM_VALUE.C_BITS } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.C_BITS}] ${MODELPARAM_VALUE.C_BITS} +} + From 09c6da9fc27c3897d3a9cb7423a3e21978f17c2c Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 21 Sep 2022 15:36:12 +0100 Subject: [PATCH 003/235] [thresholding] FINN-44: Add skeleton class for Threshold (the RTL version, no HLS support for this class required). 
The following functions have been removed when compared to the original Thresholding_Batch class: - get_weightstream_width_padded() needed for cppsim - get_ap_int_max_w() needed for cppsim - get_template_param_values() needed for cppsim - get_hls_compatible_threshold_tensor() needed for cppsim/hlslib - get_verilog_top_module_intf_names() already have TOP verilog module interface names I think - get_op_and_param_counts() not used anywhere - ipgen_extra_directives() needed for cppsim/hlslib Signed-off-by: Fionn O'Donohoe --- .../thresholding_binary_search.py | 159 ++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100755 src/finn/custom_op/fpgadataflow/thresholding_binary_search.py diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py new file mode 100755 index 0000000000..0e1916706b --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -0,0 +1,159 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp + +"""@package thresholding_binary_search +- ONNX i/o tensor shape assumptions for Thresholding: +- input 0 is the input tensor, shape (..., NumChannels) +- input 1 is the threshold tensor, shape (NumChannels, n_thres) +- output 0 is the output tensor, shape (..., NumChannels) - same as input +- the '...' here can be any shape (representing groups of vectors) + +This module creates an RTL IP, HLS is not supported. See 'thresholding_batch' +for a HLS equivalent. 
+""" + + +class Thresholding_Bin_Search(HLSCustomOp): + """Class that corresponds to finn-rtllib 'thresholding' function.""" + + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def get_nodeattr_types(self): + return {} + + def calc_tmem(self): + return 0 + + def make_shape_compatible_op(self, model): + return [] + + def infer_node_datatype(self, model): + return + + def verify_node(self): + return [] + + def bram_estimation(self): + return 0 + + def lut_estimation(self): + return 0 + + def get_input_datatype(self): + return None + + def get_output_datatype(self): + return None + + def get_weight_datatype(self): + return None + + def minimize_accumulator_width(self, model): + return None + + def get_instream_width(self): + return 0 + + def get_outstream_width(self): + return 0 + + def get_weightstream_width(self): + return 0 + + def get_folded_input_shape(self): + return tuple([] + []) + + def get_folded_output_shape(self): + return tuple([] + []) + + def get_normal_input_shape(self): + return tuple([] + []) + + def get_normal_output_shape(self): + return tuple([] + []) + + def get_number_output_values(self): + return 0 + + def get_exp_cycles(self): + return 0 + + def get_template_param_values(self): + return dict() + + def make_weight_file(self, weights, weight_file_mode, weight_file_name): + """Produce a file containing given weights (thresholds) in appropriate + format for this layer. This file can be used for either synthesis or + run-time reconfig of weights. 
+ + Arguments: + * weights : numpy array with weights to be put into the file + * weight_file_mode : one of {hls_header, decoupled_verilog_dat, + decoupled_runtime} + * weight_file_name : filename for the weight file to be generated + """ + return + + def generate_params(self, model, path): + return + + def execute_node(self, context, graph): + return + + def code_generation_ipi(self): + return [] + + def global_includes(self): + pass + + def defines(self, var): + pass + + def read_npy_data(self): + pass + + def strm_decl(self): + pass + + def docompute(self): + pass + + def dataoutstrm(self): + pass + + def save_as_npy(self): + pass + + def blackboxfunction(self): + pass + + def pragmas(self): + pass From 1dde2479f65de6cd8bce0be7091189c5b2d313c1 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 21 Sep 2022 15:52:58 +0100 Subject: [PATCH 004/235] [thresholding] FINN-44: Update custom_op's __init__ to pick up new Threshold_binary_search class Signed-off-by: Fionn O'Donohoe --- src/finn/custom_op/fpgadataflow/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index e5eb483a00..65fbd6e20c 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -58,6 +58,9 @@ from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch +from finn.custom_op.fpgadataflow.thresholding_binary_search import ( + Thresholding_Bin_Search, +) from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour_Batch from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation @@ -79,6 +82,7 @@ custom_op["Pool_Batch"] = Pool_Batch custom_op["FMPadding_Batch"] = 
FMPadding_Batch custom_op["Thresholding_Batch"] = Thresholding_Batch +custom_op["Thresholding_Binary_search"] = Thresholding_Bin_Search custom_op["AddStreams_Batch"] = AddStreams_Batch custom_op["LabelSelect_Batch"] = LabelSelect_Batch custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch From 95082d3ce1f518494910b5444da05722fa8db09c Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 21 Sep 2022 19:01:19 +0100 Subject: [PATCH 005/235] [thresholding] FINN-44: Add inital node attributes for Thresholding binary search class Signed-off-by: Fionn O'Donohoe --- .../thresholding_binary_search.py | 40 ++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 0e1916706b..97d8e0b281 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -47,7 +47,45 @@ def __init__(self, onnx_node): super().__init__(onnx_node) def get_nodeattr_types(self): - return {} + my_attrs = { + # parallelization; channels thresholded per cycle + "PE": ("i", True, 0), + # number of channels (each may have different thresholds) + "NumChannels": ("i", True, 0), + # number of steps in thresholding function. 
Used only in decoupled mode + "numSteps": ("i", True, 1), + # string defining memory type + "ram_style": ("s", False, "distributed", {"distributed", "block"}), + # FINN DataTypes for inputs, outputs + "inputDataType": ("s", True, ""), + "weightDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + # input and output FIFO depths + "inFIFODepth": ("i", False, 0), + "outFIFODepth": ("i", False, 0), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + # memory mode for the thresholds + # const -- embedded thresholds, default + # decoupled -- streaming thresholds with streamer packaged inside IP + "mem_mode": ("s", False, "const", {"const", "decoupled"}), + # (mem_mode = decoupled only) whether weights (thresholds) will be + # writable through an AXI-lite interface during runtime + # 1 for enabled, 0 for disabled. + # see finn-rtllib/memstream/doc/README for more about the memory + # address map used for writable weights + # IMPORTANT: After using AXI lite to either read or write the weights, + # always "flush" the accelerator by first passing a dummy input + # vector through the accelerator. This will get rid of any old + # weight data from the weight FIFOs. 
+ "runtime_writeable_weights": ("i", False, 0, {0, 1}), + "gen_top_module": ("s", False, ""), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs def calc_tmem(self): return 0 From 72832be6caeefdb895a911988ba5ee77d7d2813f Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 21 Sep 2022 19:02:30 +0100 Subject: [PATCH 006/235] [thresholding] FINN-44: Add calc_tmem() method Signed-off-by: Fionn O'Donohoe --- src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 97d8e0b281..6195a26afb 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -88,7 +88,9 @@ def get_nodeattr_types(self): return my_attrs def calc_tmem(self): - return 0 + num_channels = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + return num_channels // pe def make_shape_compatible_op(self, model): return [] From 0d4e3bea27fce23864729663411a80c6734ed402 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 21 Sep 2022 19:06:07 +0100 Subject: [PATCH 007/235] [thresholding] FINN-44: Add methods for retrieving inut/output/weight data types Signed-off-by: Fionn O'Donohoe --- .../custom_op/fpgadataflow/thresholding_binary_search.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 6195a26afb..50a3ce5b6b 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -26,6 +26,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+from qonnx.core.datatype import DataType + from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp """@package thresholding_binary_search @@ -108,13 +110,14 @@ def lut_estimation(self): return 0 def get_input_datatype(self): - return None + return DataType[self.get_nodeattr("inputDataType")] def get_output_datatype(self): - return None + return DataType[self.get_nodeattr("outputDataType")] def get_weight_datatype(self): - return None + """The term 'weights' and 'thresholds' are used interchangably in this class.""" + return DataType[self.get_nodeattr("weightDataType")] def minimize_accumulator_width(self, model): return None From 28568c6777d64adaa9d16f9bc58c3eda96fd7dbc Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 21 Sep 2022 19:09:24 +0100 Subject: [PATCH 008/235] [thresholding] FINN-44: Add methods for retrieving node input/output shapes Signed-off-by: Fionn O'Donohoe --- .../fpgadataflow/thresholding_binary_search.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 50a3ce5b6b..ee74f28485 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -132,16 +132,25 @@ def get_weightstream_width(self): return 0 def get_folded_input_shape(self): - return tuple([] + []) + fold = self.calc_tmem() + pe = self.get_nodeattr("PE") + vecs = list(self.get_nodeattr("numInputVectors")) + folded_input_shape = tuple(vecs + [fold, pe]) + return folded_input_shape def get_folded_output_shape(self): - return tuple([] + []) + # same shape as input + return self.get_folded_input_shape() def get_normal_input_shape(self): - return tuple([] + []) + num_channels = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + normal_input_shape = tuple(vecs + [num_channels]) + return 
normal_input_shape def get_normal_output_shape(self): - return tuple([] + []) + # same shape as input + return self.get_normal_input_shape() def get_number_output_values(self): return 0 From 280870d25864781b2ce3683a10824049d19f9bff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Mon, 24 Oct 2022 14:58:32 +0100 Subject: [PATCH 009/235] Thresholding over signed inputs. --- finn-rtllib/thresholding/hdl/thresholding.sv | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv index 93ccdc51c5..9deeac458c 100644 --- a/finn-rtllib/thresholding/hdl/thresholding.sv +++ b/finn-rtllib/thresholding/hdl/thresholding.sv @@ -64,8 +64,8 @@ module thresholding #( // Input Stream input logic ivld, - input logic [C_BITS-1:0] icnl, // Ignored for C == 1 - input logic [M -1:0] idat, + input logic [C_BITS-1:0] icnl, // Ignored for C == 1 + input logic signed [M -1:0] idat, // Output Stream output logic ovld, @@ -75,10 +75,10 @@ module thresholding #( // Pipeline Links & Feed typedef struct packed { - logic vld; // Valid data identification - logic [C_BITS-1:0] cnl; // Channel - logic [M -1:0] val; // Original input value - logic [0:N-1] res; // Assembling result with valid prefix [0:stage] after stage #stage + logic vld; // Valid data identification + logic [C_BITS-1:0] cnl; // Channel + logic signed [M -1:0] val; // Original input value + logic [0:N-1] res; // Assembling result with valid prefix [0:stage] after stage #stage } pipe_t; uwire pipe_t pipe[0:N]; assign pipe[0] = pipe_t'{ vld: ivld, cnl: icnl, val: idat, res: {N{1'bx}} }; // Feed original input @@ -88,13 +88,13 @@ module thresholding #( for(genvar stage = 0; stage < N; stage++) begin : genStages // Threshold Memory - uwire [M-1:0] thresh; + uwire signed [M-1:0] thresh; if(1) begin : blkUpdate // Write control: local select from global address uwire we = twe && tws[stage]; 
if((C == 1) && (stage == 0)) begin - logic [M-1:0] Thresh = 'x; + logic signed [M-1:0] Thresh = 'x; always_ff @(posedge clk) begin if(rst) Thresh <= 'x; else if(we) Thresh <= twd; @@ -102,7 +102,7 @@ module thresholding #( assign thresh = Thresh; end else begin - logic [M-1:0] Threshs[C * 2**stage]; + logic signed [M-1:0] Threshs[C * 2**stage]; uwire [$clog2(C)+stage-1:0] wa = twa[$left(twa):N-stage]; uwire [$clog2(C)+stage-1:0] ra; if(C > 1) assign ra[stage+:C_BITS] = pipe[stage].cnl; @@ -114,7 +114,7 @@ module thresholding #( end // Read - logic [M-1:0] RdReg; + logic signed [M-1:0] RdReg; always_ff @(posedge clk) begin if(en) RdReg <= Threshs[ra]; end From 2bf1a21e463297a885b1a7a40ab78fb2deeb2d52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Mon, 24 Oct 2022 15:38:22 +0100 Subject: [PATCH 010/235] Introduce an optional threshold output bias. --- finn-rtllib/thresholding/hdl/thresholding.sv | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv index 9deeac458c..cea93e40ab 100644 --- a/finn-rtllib/thresholding/hdl/thresholding.sv +++ b/finn-rtllib/thresholding/hdl/thresholding.sv @@ -48,7 +48,12 @@ module thresholding #( int unsigned M, // input/threshold precision int unsigned C, // number of channels - localparam int unsigned C_BITS = C < 2? 1 : $clog2(C) + int BIAS = 0, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) + + localparam int unsigned C_BITS = C < 2? 1 : $clog2(C), + localparam int unsigned O_BITS = BIAS <= 0? + /* unsigned */ $clog2(2**N-BIAS) : + /* signed */ 1+$clog2(BIAS >= 2**(N-1)? 
BIAS : 2**N-BIAS) )( // Global Control input logic clk, @@ -70,7 +75,7 @@ module thresholding #( // Output Stream output logic ovld, output logic [C_BITS-1:0] ocnl, - output logic [N -1:0] odat + output logic [O_BITS-1:0] odat ); // Pipeline Links & Feed @@ -148,6 +153,6 @@ module thresholding #( // Output assign ovld = pipe[N].vld; assign ocnl = pipe[N].cnl; - assign odat = pipe[N].res; + assign odat = pipe[N].res - BIAS; endmodule : thresholding From 4c7b5acd24cf88716fdfdc1dac8d8cc2c2ece44e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Tue, 25 Oct 2022 06:17:14 +0100 Subject: [PATCH 011/235] Exposing the thresholding bias through the AXI adapter. --- finn-rtllib/thresholding/hdl/thresholding_axi.sv | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv index 71e54c5ca0..a20952c33b 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -36,6 +36,12 @@ module thresholding_axi #( int unsigned N, // output precision int unsigned M, // input/threshold precision int unsigned C // Channels + + int BIAS = 0, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) + + localparam int unsigned O_BITS = BIAS <= 0? + /* unsigned */ $clog2(2**N-BIAS) : + /* signed */ 1+$clog2(BIAS >= 2**(N-1)? 
BIAS : 2**N-BIAS) )( //- Global Control ------------------ input logic ap_clk, @@ -74,7 +80,7 @@ module thresholding_axi #( //- AXI Stream - Output ------------- input logic m_axis_tready, output logic m_axis_tvalid, - output logic [((N+7)/8)*8-1:0] m_axis_tdata + output logic [((O_BITS+7)/8)*8-1:0] m_axis_tdata ); //- Global Control ------------------------------------------------------ uwire clk = ap_clk; @@ -134,12 +140,12 @@ module thresholding_axi #( //- IO-Sandwich with two-stage output buffer for containing a local enable uwire en; - uwire [N-1:0] odat; + uwire [O_BITS-1:0] odat; uwire ovld; if(1) begin : blkOutputDecouple typedef struct { logic vld; - logic [N-1:0] dat; + logic [O_BITS-1:0] dat; } buf_t; buf_t Buf[2] = '{ default: '{ vld: 0, dat: 'x } }; always_ff @(posedge clk) begin @@ -187,7 +193,7 @@ module thresholding_axi #( end // Core Thresholding Module - thresholding #(.N(N), .M(M), .C(C)) core ( + thresholding #(.N(N), .M(M), .C(C), .BIAS(BIAS)) core ( .clk, .rst, .twe, .twa, .twd, .en, From 7663d3f60c445ad595a193eb6b493b4f65b2f921 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Tue, 25 Oct 2022 11:55:19 +0100 Subject: [PATCH 012/235] Have thresholding wrapper pass on bias parameter and compute derived ones. 
--- .../thresholding/hdl/thresholding_axi_wrapper.v | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v index bb6b17b32f..b5c65e5879 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v +++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v @@ -36,7 +36,12 @@ module thresholding_axi_wrapper #( parameter N, // output precision parameter M, // input/threshold precision parameter C, // Channels - parameter C_BITS //= $clog2(C) + parameter BIAS = 0, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) + + localparam C_BITS = $clog2(C), + localparam O_BITS = BIAS <= 0? + /* unsigned */ $clog2(2**N-BIAS) : + /* signed */ 1+$clog2(BIAS >= 2**(N-1)? BIAS : 2**N-BIAS) )( //- Global Control ------------------ input ap_clk, @@ -75,10 +80,10 @@ module thresholding_axi_wrapper #( //- AXI Stream - Output ------------- input m_axis_tready, output m_axis_tvalid, - output [((N+7)/8)*8-1:0] m_axis_tdata + output [((O_BITS+7)/8)*8-1:0] m_axis_tdata ); - thresholding_axi #(.N(N), .M(M), .C(C)) inst ( + thresholding_axi #(.N(N), .M(M), .C(C), .BIAS(BIAS)) inst ( //- Global Control ------------------ .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), From 55e2eacd4b554456bb980f7518f9c79d7be3104d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Tue, 25 Oct 2022 15:53:11 +0100 Subject: [PATCH 013/235] Fix typo. 
--- finn-rtllib/thresholding/hdl/thresholding_axi.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv index a20952c33b..6b869ba303 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -35,7 +35,7 @@ module thresholding_axi #( int unsigned N, // output precision int unsigned M, // input/threshold precision - int unsigned C // Channels + int unsigned C, // Channels int BIAS = 0, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) From fa5d71aaf2b4ba3340aa8e07e23d90bf45bee32d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Tue, 25 Oct 2022 16:58:08 +0100 Subject: [PATCH 014/235] Abandon IPI support files. --- finn-rtllib/thresholding/component.xml | 817 ------------------ .../xgui/thresholding_axi_wrapper_v1_0.tcl | 74 -- 2 files changed, 891 deletions(-) delete mode 100644 finn-rtllib/thresholding/component.xml delete mode 100644 finn-rtllib/thresholding/xgui/thresholding_axi_wrapper_v1_0.tcl diff --git a/finn-rtllib/thresholding/component.xml b/finn-rtllib/thresholding/component.xml deleted file mode 100644 index 0a56f93316..0000000000 --- a/finn-rtllib/thresholding/component.xml +++ /dev/null @@ -1,817 +0,0 @@ - - - amd.com - user - thresholding_axi_wrapper - 1.0 - - - m_axis - - - - - - - TDATA - - - m_axis_tdata - - - - - TVALID - - - m_axis_tvalid - - - - - TREADY - - - m_axis_tready - - - - - - s_axis - - - - - - - TDATA - - - s_axis_tdata - - - - - TVALID - - - s_axis_tvalid - - - - - TREADY - - - s_axis_tready - - - - - - s_axilite - - - - - - - - - AWADDR - - - s_axilite_AWADDR - - - - - AWVALID - - - s_axilite_AWVALID - - - - - AWREADY - - - s_axilite_AWREADY - - - - - WDATA - - - s_axilite_WDATA - - - - - WSTRB - - - s_axilite_WSTRB - - - - - WVALID - - - s_axilite_WVALID - - - - - WREADY - - - s_axilite_WREADY - - - - - BRESP - - - 
s_axilite_BRESP - - - - - BVALID - - - s_axilite_BVALID - - - - - BREADY - - - s_axilite_BREADY - - - - - ARADDR - - - s_axilite_ARADDR - - - - - ARVALID - - - s_axilite_ARVALID - - - - - ARREADY - - - s_axilite_ARREADY - - - - - RDATA - - - s_axilite_RDATA - - - - - RRESP - - - s_axilite_RRESP - - - - - RVALID - - - s_axilite_RVALID - - - - - RREADY - - - s_axilite_RREADY - - - - - - ap_rst_n - - - - - - - RST - - - ap_rst_n - - - - - - POLARITY - ACTIVE_LOW - - - - - ap_clk - - - - - - - CLK - - - ap_clk - - - - - - ASSOCIATED_RESET - ap_rst_n - - - ASSOCIATED_BUSIF - m_axis:s_axis:s_axilite - - - - - - - s_axilite - s_axilite - - reg0 - reg0 - 0x0 - 4096 - 32 - register - - - - - - - xilinx_anylanguagesynthesis - Synthesis - :vivado.xilinx.com:synthesis - Verilog - thresholding_axi_wrapper - - xilinx_anylanguagesynthesis_view_fileset - - - - viewChecksum - 5cc8f7a9 - - - - - xilinx_xpgui - UI Layout - :vivado.xilinx.com:xgui.ui - - xilinx_xpgui_view_fileset - - - - viewChecksum - c456596c - - - - - - - ap_clk - - in - - - std_logic - xilinx_anylanguagesynthesis - - - - - - ap_rst_n - - in - - - std_logic - xilinx_anylanguagesynthesis - - - - - - s_axilite_AWVALID - - in - - - std_logic - xilinx_anylanguagesynthesis - - - - 0 - - - - - s_axilite_AWREADY - - out - - - std_logic - xilinx_anylanguagesynthesis - - - - - - s_axilite_AWADDR - - in - - 3 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - - - - 0 - - - - - s_axilite_WVALID - - in - - - std_logic - xilinx_anylanguagesynthesis - - - - 0 - - - - - s_axilite_WREADY - - out - - - std_logic - xilinx_anylanguagesynthesis - - - - - - s_axilite_WDATA - - in - - 31 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - - - - 0 - - - - - s_axilite_WSTRB - - in - - 3 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - - - - 1 - - - - - s_axilite_BVALID - - out - - - std_logic - xilinx_anylanguagesynthesis - - - - - - s_axilite_BREADY - - in - - - std_logic - xilinx_anylanguagesynthesis - - - 
- 0 - - - - - s_axilite_BRESP - - out - - 1 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - - - - - - s_axilite_ARVALID - - in - - - std_logic - xilinx_anylanguagesynthesis - - - - 0 - - - - - s_axilite_ARREADY - - out - - - std_logic - xilinx_anylanguagesynthesis - - - - - - s_axilite_ARADDR - - in - - 0 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - - - - 0 - - - - - s_axilite_RVALID - - out - - - std_logic - xilinx_anylanguagesynthesis - - - - - - s_axilite_RREADY - - in - - - std_logic - xilinx_anylanguagesynthesis - - - - 0 - - - - - s_axilite_RDATA - - out - - 31 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - - - - - - s_axilite_RRESP - - out - - 1 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - - - - - - s_axis_tready - - out - - - std_logic - xilinx_anylanguagesynthesis - - - - - - s_axis_tvalid - - in - - - std_logic - xilinx_anylanguagesynthesis - - - - - - s_axis_tdata - - in - - 15 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - - - - 0 - - - - - m_axis_tready - - in - - - std_logic - xilinx_anylanguagesynthesis - - - - 1 - - - - - m_axis_tvalid - - out - - - std_logic - xilinx_anylanguagesynthesis - - - - - - m_axis_tdata - - out - - 7 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - - - - - - - - N - N - 4 - - - M - M - 16 - - - C - C - 1 - - - C_BITS - C Bits - 0 - - - - - - choice_list_74b5137e - ACTIVE_HIGH - ACTIVE_LOW - - - - - xilinx_anylanguagesynthesis_view_fileset - - hdl/thresholding.sv - systemVerilogSource - - - hdl/thresholding_axi.sv - systemVerilogSource - - - hdl/thresholding_axi_wrapper.v - verilogSource - CHECKSUM_2ec027ae - - - - xilinx_xpgui_view_fileset - - xgui/thresholding_axi_wrapper_v1_0.tcl - tclSource - CHECKSUM_c456596c - XGUI_VERSION_2 - - - - thresholding_axi_wrapper_v1_0 - - - N - N - 4 - - - M - M - 16 - - - C - C - 1 - - - C_BITS - C_BITS - 0 - - - Component_Name - thresholding_axi_wrapper_v1_0 - - - - - - virtex7 - qvirtex7 - versal - 
kintex7 - kintex7l - qkintex7 - qkintex7l - akintex7 - artix7 - artix7l - aartix7 - qartix7 - zynq - qzynq - azynq - spartan7 - aspartan7 - virtexu - zynquplus - virtexuplus - virtexuplusHBM - virtexuplus58g - kintexuplus - artixuplus - kintexu - - - /UserIP - - thresholding_axi_wrapper_v1_0 - package_project - AMD - 2 - 2022-09-20T12:31:16Z - - - 2022.1 - - - - - - - - - diff --git a/finn-rtllib/thresholding/xgui/thresholding_axi_wrapper_v1_0.tcl b/finn-rtllib/thresholding/xgui/thresholding_axi_wrapper_v1_0.tcl deleted file mode 100644 index 02c373e8f2..0000000000 --- a/finn-rtllib/thresholding/xgui/thresholding_axi_wrapper_v1_0.tcl +++ /dev/null @@ -1,74 +0,0 @@ -# Definitional proc to organize widgets for parameters. -proc init_gui { IPINST } { - ipgui::add_param $IPINST -name "Component_Name" - #Adding Page - set Page_0 [ipgui::add_page $IPINST -name "Page 0"] - set C [ipgui::add_param $IPINST -name "C" -parent ${Page_0}] - set_property tooltip {Channel Count} ${C} - set C_BITS [ipgui::add_param $IPINST -name "C_BITS" -parent ${Page_0}] - set_property tooltip {Must be clog2(C)} ${C_BITS} - set M [ipgui::add_param $IPINST -name "M" -parent ${Page_0}] - set_property tooltip {Input Precision} ${M} - set N [ipgui::add_param $IPINST -name "N" -parent ${Page_0}] - set_property tooltip {Output Precision} ${N} - - -} - -proc update_PARAM_VALUE.C { PARAM_VALUE.C } { - # Procedure called to update C when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.C { PARAM_VALUE.C } { - # Procedure called to validate C - return true -} - -proc update_PARAM_VALUE.C_BITS { PARAM_VALUE.C_BITS } { - # Procedure called to update C_BITS when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.C_BITS { PARAM_VALUE.C_BITS } { - # Procedure called to validate C_BITS - return true -} - -proc update_PARAM_VALUE.M { PARAM_VALUE.M } { - # Procedure called to update M when any of the dependent parameters in the arguments 
change -} - -proc validate_PARAM_VALUE.M { PARAM_VALUE.M } { - # Procedure called to validate M - return true -} - -proc update_PARAM_VALUE.N { PARAM_VALUE.N } { - # Procedure called to update N when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.N { PARAM_VALUE.N } { - # Procedure called to validate N - return true -} - - -proc update_MODELPARAM_VALUE.N { MODELPARAM_VALUE.N PARAM_VALUE.N } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.N}] ${MODELPARAM_VALUE.N} -} - -proc update_MODELPARAM_VALUE.M { MODELPARAM_VALUE.M PARAM_VALUE.M } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.M}] ${MODELPARAM_VALUE.M} -} - -proc update_MODELPARAM_VALUE.C { MODELPARAM_VALUE.C PARAM_VALUE.C } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.C}] ${MODELPARAM_VALUE.C} -} - -proc update_MODELPARAM_VALUE.C_BITS { MODELPARAM_VALUE.C_BITS PARAM_VALUE.C_BITS } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.C_BITS}] ${MODELPARAM_VALUE.C_BITS} -} - From 174c0ffe1d0614dd14013de1b073469d79c9191e Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Tue, 15 Nov 2022 17:59:23 +0000 Subject: [PATCH 015/235] [thresholding] allow for positive and negative bias values Signed-off-by: Fionn O'Donohoe --- finn-rtllib/thresholding/hdl/thresholding.sv | 4 ++-- finn-rtllib/thresholding/hdl/thresholding_axi.sv | 2 +- finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv index 
cea93e40ab..a99c752e17 100644 --- a/finn-rtllib/thresholding/hdl/thresholding.sv +++ b/finn-rtllib/thresholding/hdl/thresholding.sv @@ -51,7 +51,7 @@ module thresholding #( int BIAS = 0, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) localparam int unsigned C_BITS = C < 2? 1 : $clog2(C), - localparam int unsigned O_BITS = BIAS <= 0? + localparam int unsigned O_BITS = BIAS > 0? /* unsigned */ $clog2(2**N-BIAS) : /* signed */ 1+$clog2(BIAS >= 2**(N-1)? BIAS : 2**N-BIAS) )( @@ -153,6 +153,6 @@ module thresholding #( // Output assign ovld = pipe[N].vld; assign ocnl = pipe[N].cnl; - assign odat = pipe[N].res - BIAS; + assign odat = pipe[N].res + BIAS; endmodule : thresholding diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv index 6b869ba303..795683da1d 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -39,7 +39,7 @@ module thresholding_axi #( int BIAS = 0, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) - localparam int unsigned O_BITS = BIAS <= 0? + localparam int unsigned O_BITS = BIAS > 0? /* unsigned */ $clog2(2**N-BIAS) : /* signed */ 1+$clog2(BIAS >= 2**(N-1)? BIAS : 2**N-BIAS) )( diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v index b5c65e5879..6bfc2f57a4 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v +++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v @@ -39,7 +39,7 @@ module thresholding_axi_wrapper #( parameter BIAS = 0, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) localparam C_BITS = $clog2(C), - localparam O_BITS = BIAS <= 0? + localparam O_BITS = BIAS > 0? /* unsigned */ $clog2(2**N-BIAS) : /* signed */ 1+$clog2(BIAS >= 2**(N-1)? 
BIAS : 2**N-BIAS) )( From 2ec20e5cab8c821d7dc6d652564e85eb1bc84b6b Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Tue, 15 Nov 2022 18:00:52 +0000 Subject: [PATCH 016/235] [thresholding] pass bias from top module to thresholding.sv core Signed-off-by: Fionn O'Donohoe --- finn-rtllib/thresholding/hdl/thresholding.sv | 2 +- finn-rtllib/thresholding/hdl/thresholding_axi.sv | 2 +- finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv index a99c752e17..f9763af96c 100644 --- a/finn-rtllib/thresholding/hdl/thresholding.sv +++ b/finn-rtllib/thresholding/hdl/thresholding.sv @@ -48,7 +48,7 @@ module thresholding #( int unsigned M, // input/threshold precision int unsigned C, // number of channels - int BIAS = 0, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) + int BIAS, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) localparam int unsigned C_BITS = C < 2? 1 : $clog2(C), localparam int unsigned O_BITS = BIAS > 0? diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv index 795683da1d..e4f3feac3f 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -37,7 +37,7 @@ module thresholding_axi #( int unsigned M, // input/threshold precision int unsigned C, // Channels - int BIAS = 0, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) + int BIAS, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) localparam int unsigned O_BITS = BIAS > 0? 
/* unsigned */ $clog2(2**N-BIAS) : diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v index 6bfc2f57a4..1b5921d8ba 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v +++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v @@ -36,7 +36,7 @@ module thresholding_axi_wrapper #( parameter N, // output precision parameter M, // input/threshold precision parameter C, // Channels - parameter BIAS = 0, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) + int BIAS = 0, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) localparam C_BITS = $clog2(C), localparam O_BITS = BIAS > 0? From 861614837dd187dc58ab24af0b5d0cd2050c76e6 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Tue, 15 Nov 2022 18:07:56 +0000 Subject: [PATCH 017/235] [thresholding] pass O_BITS from top module to thresholding.sv core Signed-off-by: Fionn O'Donohoe --- finn-rtllib/thresholding/hdl/thresholding.sv | 4 +--- finn-rtllib/thresholding/hdl/thresholding_axi.sv | 6 ++---- finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v | 4 ++-- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv index f9763af96c..04116e995c 100644 --- a/finn-rtllib/thresholding/hdl/thresholding.sv +++ b/finn-rtllib/thresholding/hdl/thresholding.sv @@ -51,9 +51,7 @@ module thresholding #( int BIAS, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) localparam int unsigned C_BITS = C < 2? 1 : $clog2(C), - localparam int unsigned O_BITS = BIAS > 0? - /* unsigned */ $clog2(2**N-BIAS) : - /* signed */ 1+$clog2(BIAS >= 2**(N-1)? 
BIAS : 2**N-BIAS) + int unsigned O_BITS )( // Global Control input logic clk, diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv index e4f3feac3f..a7eec445e0 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -39,9 +39,7 @@ module thresholding_axi #( int BIAS, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) - localparam int unsigned O_BITS = BIAS > 0? - /* unsigned */ $clog2(2**N-BIAS) : - /* signed */ 1+$clog2(BIAS >= 2**(N-1)? BIAS : 2**N-BIAS) + int unsigned O_BITS )( //- Global Control ------------------ input logic ap_clk, @@ -193,7 +191,7 @@ module thresholding_axi #( end // Core Thresholding Module - thresholding #(.N(N), .M(M), .C(C), .BIAS(BIAS)) core ( + thresholding #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS)) core ( .clk, .rst, .twe, .twa, .twd, .en, diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v index 1b5921d8ba..5c43a70445 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v +++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v @@ -39,7 +39,7 @@ module thresholding_axi_wrapper #( int BIAS = 0, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) localparam C_BITS = $clog2(C), - localparam O_BITS = BIAS > 0? + parameter O_BITS = BIAS > 0? /* unsigned */ $clog2(2**N-BIAS) : /* signed */ 1+$clog2(BIAS >= 2**(N-1)? 
BIAS : 2**N-BIAS) )( @@ -83,7 +83,7 @@ module thresholding_axi_wrapper #( output [((O_BITS+7)/8)*8-1:0] m_axis_tdata ); - thresholding_axi #(.N(N), .M(M), .C(C), .BIAS(BIAS)) inst ( + thresholding_axi #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS)) inst ( //- Global Control ------------------ .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), From 275abaddee9504360c1589565036611bab5955da Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Tue, 15 Nov 2022 18:10:12 +0000 Subject: [PATCH 018/235] [thresholding] pass C_BITS from top module to thresholding.sv core Signed-off-by: Fionn O'Donohoe --- finn-rtllib/thresholding/hdl/thresholding.sv | 2 +- finn-rtllib/thresholding/hdl/thresholding_axi.sv | 2 +- finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv index 04116e995c..70f94f1c22 100644 --- a/finn-rtllib/thresholding/hdl/thresholding.sv +++ b/finn-rtllib/thresholding/hdl/thresholding.sv @@ -50,7 +50,7 @@ module thresholding #( int BIAS, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) - localparam int unsigned C_BITS = C < 2? 
1 : $clog2(C), + int unsigned C_BITS, int unsigned O_BITS )( // Global Control diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv index a7eec445e0..fac69b33fc 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -191,7 +191,7 @@ module thresholding_axi #( end // Core Thresholding Module - thresholding #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS)) core ( + thresholding #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS), .C_BITS(C_BITS)) core ( .clk, .rst, .twe, .twa, .twd, .en, diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v index 5c43a70445..588f9e4852 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v +++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v @@ -38,7 +38,7 @@ module thresholding_axi_wrapper #( parameter C, // Channels int BIAS = 0, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) - localparam C_BITS = $clog2(C), + parameter C_BITS = C < 2 ? 1 : $clog2(C), parameter O_BITS = BIAS > 0? /* unsigned */ $clog2(2**N-BIAS) : /* signed */ 1+$clog2(BIAS >= 2**(N-1)? 
BIAS : 2**N-BIAS) From 8849c026b780c152dd51c0e007c5f72bdca4808c Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 16 Nov 2022 09:31:20 +0000 Subject: [PATCH 019/235] [thresholding] create & fill in RTL template values using FINN Signed-off-by: Fionn O'Donohoe --- finn-rtllib/thresholding/hdl/thresholding.sv | 16 +-- .../thresholding/hdl/thresholding_axi.sv | 6 +- .../hdl/thresholding_axi_wrapper.v | 14 +-- .../thresholding_binary_search.py | 99 +++++++++++++++++++ 4 files changed, 117 insertions(+), 18 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv index 70f94f1c22..25d6ff3112 100644 --- a/finn-rtllib/thresholding/hdl/thresholding.sv +++ b/finn-rtllib/thresholding/hdl/thresholding.sv @@ -43,7 +43,7 @@ * threshold configuration relies on a channel address prefix. Inputs are * accompanied by a channel selector. *****************************************************************************/ -module thresholding #( +module $MODULE_NAME$ #( int unsigned N, // output precision int unsigned M, // input/threshold precision int unsigned C, // number of channels @@ -68,7 +68,7 @@ module thresholding #( // Input Stream input logic ivld, input logic [C_BITS-1:0] icnl, // Ignored for C == 1 - input logic signed [M -1:0] idat, + input logic $SIGN$ [M -1:0] idat, // Output Stream output logic ovld, @@ -80,7 +80,7 @@ module thresholding #( typedef struct packed { logic vld; // Valid data identification logic [C_BITS-1:0] cnl; // Channel - logic signed [M -1:0] val; // Original input value + logic $SIGN$ [M -1:0] val; // Original input value logic [0:N-1] res; // Assembling result with valid prefix [0:stage] after stage #stage } pipe_t; uwire pipe_t pipe[0:N]; @@ -91,13 +91,13 @@ module thresholding #( for(genvar stage = 0; stage < N; stage++) begin : genStages // Threshold Memory - uwire signed [M-1:0] thresh; + uwire $SIGN$ [M-1:0] thresh; if(1) begin : blkUpdate // Write control: local select from 
global address uwire we = twe && tws[stage]; if((C == 1) && (stage == 0)) begin - logic signed [M-1:0] Thresh = 'x; + logic $SIGN$ [M-1:0] Thresh = 'x; always_ff @(posedge clk) begin if(rst) Thresh <= 'x; else if(we) Thresh <= twd; @@ -105,7 +105,7 @@ module thresholding #( assign thresh = Thresh; end else begin - logic signed [M-1:0] Threshs[C * 2**stage]; + logic $SIGN$ [M-1:0] Threshs[C * 2**stage]; uwire [$clog2(C)+stage-1:0] wa = twa[$left(twa):N-stage]; uwire [$clog2(C)+stage-1:0] ra; if(C > 1) assign ra[stage+:C_BITS] = pipe[stage].cnl; @@ -117,7 +117,7 @@ module thresholding #( end // Read - logic signed [M-1:0] RdReg; + logic $SIGN$ [M-1:0] RdReg; always_ff @(posedge clk) begin if(en) RdReg <= Threshs[ra]; end @@ -153,4 +153,4 @@ module thresholding #( assign ocnl = pipe[N].cnl; assign odat = pipe[N].res + BIAS; -endmodule : thresholding +endmodule : $MODULE_NAME$ diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv index fac69b33fc..97cdfd3e12 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -32,7 +32,7 @@ * @author Thomas B. 
Preußer *****************************************************************************/ -module thresholding_axi #( +module $MODULE_NAME_AXI$ #( int unsigned N, // output precision int unsigned M, // input/threshold precision int unsigned C, // Channels @@ -191,7 +191,7 @@ module thresholding_axi #( end // Core Thresholding Module - thresholding #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS), .C_BITS(C_BITS)) core ( + $MODULE_NAME$ #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS), .C_BITS(C_BITS)) core ( .clk, .rst, .twe, .twa, .twd, .en, @@ -199,4 +199,4 @@ module thresholding_axi #( .ovld, .ocnl(), .odat ); -endmodule : thresholding_axi +endmodule : $MODULE_NAME_AXI$ diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v index 588f9e4852..e3f8596bc8 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v +++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v @@ -32,11 +32,11 @@ * @author Thomas B. Preußer *****************************************************************************/ -module thresholding_axi_wrapper #( - parameter N, // output precision - parameter M, // input/threshold precision - parameter C, // Channels - int BIAS = 0, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) +module $MODULE_NAME_AXI_WRAPPER$ #( + parameter N = $N$, // output precision + parameter M = $M$, // input/threshold precision + parameter C = $C$, // Channels + int BIAS = $BIAS$, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) parameter C_BITS = C < 2 ? 1 : $clog2(C), parameter O_BITS = BIAS > 0? 
@@ -83,7 +83,7 @@ module thresholding_axi_wrapper #( output [((O_BITS+7)/8)*8-1:0] m_axis_tdata ); - thresholding_axi #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS)) inst ( + $MODULE_NAME_AXI$ #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS)) inst ( //- Global Control ------------------ .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), @@ -124,4 +124,4 @@ module thresholding_axi_wrapper #( .m_axis_tdata(m_axis_tdata) ); -endmodule : thresholding_axi_wrapper +endmodule : $MODULE_NAME_AXI_WRAPPER$ diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index ee74f28485..d546d52843 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -26,6 +26,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import os from qonnx.core.datatype import DataType from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp @@ -85,6 +86,7 @@ def get_nodeattr_types(self): # weight data from the weight FIFOs. 
"runtime_writeable_weights": ("i", False, 0, {0, 1}), "gen_top_module": ("s", False, ""), + "activation_bias": ("i", False, 0), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -174,6 +176,103 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): """ return + # Get the integer from the DataType and string-ify it + # This assumes that the data is in the form "INTx" or similar + def conv_datatype_to_str(self, data_type): + # Handle the case that an int is passed to the function + if isinstance(data_type, int): + return str(data_type) + return str(DataType[data_type].bitwidth()) + + def prepare_codegen_rtl_values(self): + """All dictionary values produced in this function are to replace + their key value(s) in the RTL template files""" + code_gen_dict = {} + + # Identify the module names + code_gen_dict["$MODULE_NAME$"] = [self.get_verilog_top_module_name()] + code_gen_dict["$MODULE_NAME_AXI$"] = [self.get_verilog_top_module_name() + "_axi"] + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name() + "_axi_wrapper"] + # Set the top module name - AXI wrapper + code_gen_dict["$TOP_MODULE$"] = code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] + + # Identify the module variables + output_data_type = self.get_nodeattr("outputDataType") # output precision + input_data_type = self.get_nodeattr("inputDataType") # input/threshold precision + num_channels = self.get_nodeattr("NumChannels") # number of channels + bias = self.get_nodeattr("activation_bias") # activation bias value + + code_gen_dict["$N$"] = [self.conv_datatype_to_str(output_data_type)] # output precision + code_gen_dict["$M$"] = [self.conv_datatype_to_str(input_data_type)] # input/threshold precision + code_gen_dict["$C$"] = [self.conv_datatype_to_str(num_channels)] # number of channels + code_gen_dict["$BIAS$"] = [self.conv_datatype_to_str(bias)] # activation bias value + + # Is the input datatype signed or unsigned? 
The thresholding core needs to know this + if self.get_input_datatype().min() < 0: + code_gen_dict["$SIGN$"] = ["signed"] + else: + code_gen_dict["$SIGN$"] = ["unsigned"] + + return code_gen_dict + + def get_rtl_file_list(self): + return ["thresholding.sv", + "thresholding_axi.sv", + "thresholding_axi_wrapper.v"] + + def get_rtl_file_paths(self): + rtl_root_dir = os.environ["FINN_ROOT"] + "/finn-rtllib/thresholding/hdl/" + rtl_file_list = self.get_rtl_file_list() + rtl_file_paths = [rtl_root_dir + file for file in rtl_file_list] + return rtl_file_paths + + def get_rtl_template_data(self, path): + with open(path, "r") as f: + template = f.read() + return template + + def fill_in_rtl_template_data(self, replace_dict, template_data): + template_data_cp = template_data + for key in replace_dict: + replacement_line = "\n".join(replace_dict[key]) + template_data_cp = template_data_cp.replace(key, replacement_line) + return template_data_cp + + def dump_rtl_data(self, dest_dir, filename, data): + with open(os.path.join(dest_dir, filename), "w") as f: + f.write(data) + return + + def generate_hdl(self): + # Generate a dictionary of values to put in RTL template + code_gen_dict = self.prepare_codegen_rtl_values() + + # Retrieve the destination directory for the final RTL files + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + for rtl_file_path in self.get_rtl_file_paths(): + # read in original RTL template file + template_data = self.get_rtl_template_data(rtl_file_path) + # apply code generation to templates + data = self.fill_in_rtl_template_data(code_gen_dict, template_data) + # dump filled-in template to destination directory for compilation + file_only_path = rtl_file_path.split('/')[-1] + self.dump_rtl_data(code_gen_dir, file_only_path, data) + + # Before we return - set the 'gen_top_module' attribute for use later by PyVerilator and IPI generation + self.set_nodeattr("gen_top_module", code_gen_dict["$TOP_MODULE$"][0]) + return + + def 
code_generation_ipgen(self, model, fpgapart, clk): + self.generate_hdl() + + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stitch_ip transformation do not complain + # i.e. during the HLSSynthIP() transformation + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + def generate_params(self, model, path): return From 84704edd5aa7e53351819238f96d4c63dfb45d07 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 16 Nov 2022 09:45:29 +0000 Subject: [PATCH 020/235] [thresholding] add method get_weightstream_width() Signed-off-by: Fionn O'Donohoe --- .../custom_op/fpgadataflow/thresholding_binary_search.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index d546d52843..54fa2def1e 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -131,7 +131,14 @@ def get_outstream_width(self): return 0 def get_weightstream_width(self): - return 0 + # Only 'decoupled' mode is supported + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode != "decoupled": raise Exception("Unrecognized memory mode for this node: {}".format(mem_mode)) + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + n_thres_steps = self.get_nodeattr("numSteps") + w_width = pe * wp * n_thres_steps + return w_width def get_folded_input_shape(self): fold = self.calc_tmem() From 9aa7ff3f8c1a0584afd8684e9280d77aada43105 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 16 Nov 2022 09:48:56 +0000 Subject: [PATCH 021/235] [thresholding] add method get_in/output_width() Signed-off-by: Fionn O'Donohoe --- .../custom_op/fpgadataflow/thresholding_binary_search.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff
--git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 54fa2def1e..a1b75b3de1 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -125,10 +125,12 @@ def minimize_accumulator_width(self, model): return None def get_instream_width(self): - return 0 + i_bits = self.get_input_datatype().bitwidth() + return i_bits * self.get_nodeattr("PE") def get_outstream_width(self): - return 0 + o_bits = self.get_output_datatype().bitwidth() + return o_bits * self.get_nodeattr("PE") def get_weightstream_width(self): # Only 'decoupled' mode is supported From 608b5da9222e2ede4792c487dc4d77fb5ef02e16 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 16 Nov 2022 09:51:10 +0000 Subject: [PATCH 022/235] [thresholding] add method body for code_generation_ipi() Signed-off-by: Fionn O'Donohoe --- .../thresholding_binary_search.py | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index a1b75b3de1..4ca651be76 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -289,7 +289,32 @@ def execute_node(self, context, graph): return def code_generation_ipi(self): - return [] + """Constructs and returns the TCL commands for node instantiation as an RTL block.""" + cmd = [] + rtl_file_list = self.get_rtl_file_list() + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + for rtl_file in rtl_file_list: + cmd.append("add_files -norecurse %s" + % ( + os.path.join( + code_gen_dir, rtl_file + ) + )) + + # Create an RTL block, not an IP core (-type ip) + cmd.append("create_bd_cell -type module -reference %s %s" + % (self.get_nodeattr("gen_top_module"), 
self.onnx_node.name)) + + # ERROR: [BD 41-237] Bus Interface property FREQ_HZ does not match between + # /Thresholding_Binary_Search_0/s_axis(100000000 and /StreamingFIFO_0/out_V(200000000.000000) + cmd.append("set_property -dict [list CONFIG.FREQ_HZ {200000000}] [get_bd_intf_pins Thresholding_Binary_Search_0/s_axis]") + + # ERROR: [BD 41-237] Bus Interface property FREQ_HZ does not match between + # /StreamingFIFO_1/in0_V(200000000.000000) and /Thresholding_Binary_Search_0/m_axis(100000000) + cmd.append("set_property -dict [list CONFIG.FREQ_HZ {200000000}] [get_bd_intf_pins Thresholding_Binary_Search_0/m_axis]") + + return cmd def global_includes(self): pass From ca6e7e745c4ad810ac824ee3b6ccd55bdb6f724d Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 16 Nov 2022 09:56:01 +0000 Subject: [PATCH 023/235] [thresholding] add method get_verilog_top_module_intf_names() Signed-off-by: Fionn O'Donohoe --- .../thresholding_binary_search.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 4ca651be76..5dac98ad66 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -316,6 +316,28 @@ def code_generation_ipi(self): return cmd + def get_verilog_top_module_intf_names(self): + """Return a dict of names of input and output interfaces. + The keys reflect the protocols each interface implements: + 'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'. + Values are lists of tuples (axis, aximm) or names (axilite): + 'axis' tuples correspond to the list of node inputs in order, + each tuple is (interface_name, interface_width_bits). + axilite always assumed to be 32 bits and is not tuple (name only). 
+ Each block must have at most one aximm and one axilite.""" + + intf_names = super().get_verilog_top_module_intf_names() + # Only 'decoupled' mode is supported - check before adding axilite interface + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode != "decoupled": raise Exception("Unrecognized memory mode for this node: {}".format(mem_mode)) + intf_names["axilite"] = ["s_axilite"] + intf_names["s_axis"] = [["s_axis"]] + intf_names["m_axis"] = [["m_axis"]] + + self.set_nodeattr("runtime_writeable_weights", 1) + + return intf_names + def global_includes(self): pass From 7266ee91af50a149d1d8310401e2a4134cdac18c Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 16 Nov 2022 10:41:14 +0000 Subject: [PATCH 024/235] [thresholding] retrieve axilite write sequence for runtime weight programming Signed-off-by: Fionn O'Donohoe --- .../thresholding_binary_search.py | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 5dac98ad66..07b675f0f3 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -338,6 +338,59 @@ def get_verilog_top_module_intf_names(self): return intf_names + def find_next_power_of_2(self, n): + # Negative values will loop infinitely below - return 0 + if n <= 0: + return 0 + # If '1' is requested, output will be '0' in the loop below, so avoid this earlier. + elif n == 1: + return 2 # i.e. 
2**1 + + # decrement 'n' (to handle cases when `n` itself is a power of 2) + n = n - 1 + + # loop until only one bit is left + while n & n - 1: + # unset rightmost bit + n = n & n - 1 + return n << 1 + + def twos_comp(self, val, bitwidth): + return (val + (1 << bitwidth)) % (1 << bitwidth) + + def prep_axilite_val(self, val): + return self.twos_comp(int(val), self.get_weight_datatype().bitwidth()) + + def get_dynamic_config(self, model, address_stride=1): + ## TODO - not sure this description is correct + """Returns a configuration dictionary containing axilite write commands + in order to program the thresholds into the RTL core during runtime. + The default address stride for the weights is 1 byte.""" + + thresholds = model.get_initializer(self.onnx_node.input[1]) + num_channels, num_weights_per_channel = thresholds.shape + + weight_addr_boundary = self.find_next_power_of_2(num_weights_per_channel) + # Make sure that the next power of 2 (output) is greater than the input + assert weight_addr_boundary >= num_weights_per_channel + + config = {} + channel_cntr = 0 + for channel in thresholds: + channel_start_addr = (channel_cntr * weight_addr_boundary * address_stride) + weight_cntr = 0 + addr = 0 + for weight in channel: + key_name = "{}_{}{}_{}{}".format("axilite", "ch", str(channel_cntr), "w", str(weight_cntr)) + config[key_name] = (channel_start_addr + addr, self.prep_axilite_val(weight)) + + weight_cntr += 1 + addr += address_stride + + channel_cntr += 1 + + return config + def global_includes(self): pass From f88bdbfeb4ade334740d29fa81f6a83174635ad2 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 16 Nov 2022 11:06:39 +0000 Subject: [PATCH 025/235] [thresholding] add methods for creating weight files for each simulation type Signed-off-by: Fionn O'Donohoe --- .../thresholding_binary_search.py | 130 +++++++++++++++++- 1 file changed, 128 insertions(+), 2 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py 
b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 07b675f0f3..6ed07287ab 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -27,9 +27,17 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os +import numpy as np from qonnx.core.datatype import DataType - +from qonnx.util.basic import ( + interleave_matrix_outer_dim_from_partitions, + roundup_to_integer_multiple, +) +from finn.util.data_packing import ( + pack_innermost_dim_as_hex_string, +) from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +import warnings """@package thresholding_binary_search - ONNX i/o tensor shape assumptions for Thresholding: @@ -172,6 +180,63 @@ def get_exp_cycles(self): def get_template_param_values(self): return dict() + def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure MH % PE == 0 + * for unsigned inputs, ensure thresholds are positive + * interleave rows between PEs + * reshape into (PE, TMEM, n_thres_steps) and return + """ + mh = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + tmem = mh // pe + assert mh % pe == 0, "Requirement NumChannels divisable by PE is violated." 
+ assert ( + orig_thres_matrix.ndim == 2 + ), """Threshold matrix dimension is + not as expected (2).""" + n_thres_steps = orig_thres_matrix.shape[1] + assert n_thres_steps == self.get_nodeattr( + "numSteps" + ), "Mismatch in threshold steps" + if not self.get_input_datatype().signed(): + # ensure all thresholds are nonnegative + assert (orig_thres_matrix >= 0).all() + # ensure all thresholds are integer + assert np.equal( + np.mod(orig_thres_matrix, 1), 0 + ).all(), "Need int threshold tensor" + ret = orig_thres_matrix + # workaround for vivado_hls threshold bug + if ret[0][0] == 0 and n_thres_steps == 1: + ret = np.copy(ret) + ret[0][0] = 1 + warnings.warn( + "Setting 0-valued first threshold to 1 to avoid vivado_hls bug" + ) + # ensure channels = mh , duplicating if necessary + if ret.shape[0] == 1: + ret = np.tile(ret, (mh, 1)) + assert ( + ret.shape[0] == mh + ), "Channels of threshold matrix are not as expected (mh)" + # distribute rows between PEs + ret = interleave_matrix_outer_dim_from_partitions(ret, pe) + assert ( + ret.shape[0] == pe + ), """First dimension after distribution of the + rows between PEs is not as expected (pe)""" + assert ( + ret.shape[1] == tmem + ), """Second dimension after distribution of the + rows between PEs is not as expected (tmem)""" + assert ( + ret.shape[2] == n_thres_steps + ), """Third dimension after distribution of the + rows between PEs is not as expected (n_thres_steps)""" + return ret.reshape(1, pe, tmem, n_thres_steps) + def make_weight_file(self, weights, weight_file_mode, weight_file_name): """Produce a file containing given weights (thresholds) in appropriate format for this layer. 
This file can be used for either synthesis or @@ -183,7 +248,68 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): decoupled_runtime} * weight_file_name : filename for the weight file to be generated """ - return + # There are 'decoupled_*' flavors, just make sure that the flavors are decoupled related + if "decoupled" not in weight_file_mode: raise Exception("Unrecognized memory mode for this node: {}".format(weight_file_mode)) + + threshold_tensor = self.get_hls_compatible_threshold_tensor(weights) + tdt = self.get_weight_datatype() + assert np.vectorize(tdt.allowed)( + threshold_tensor + ).all(), "Thresholds can't be expressed with type %s" % str(tdt) + + # streaming thresholds need to be organized differently + # (1, pe, tmem, n_thres_steps) -> (1, tmem, pe, n_thres_steps) + decoupled_thres = np.transpose(threshold_tensor, (0, 2, 1, 3)) + # (1, tmem, pe, n_thres_steps) -(1, tmem, pe * n_thres_steps) + pe = self.get_nodeattr("PE") + n_thres_steps = self.get_nodeattr("numSteps") + decoupled_thres_pe_flipped = np.flip(decoupled_thres, axis=-2) + decoupled_thres = decoupled_thres.reshape(1, -1, pe * n_thres_steps) + decoupled_thres = decoupled_thres.copy() + decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.reshape( + 1, -1, pe * n_thres_steps + ) + decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.copy() + + if weight_file_mode == "decoupled_npy": + # save weight stream into npy for cppsim + np.save(weight_file_name, decoupled_thres) + elif weight_file_mode == "decoupled_verilog_dat": + # convert weight values into hexstring + weight_width = self.get_weightstream_width() + # pad to nearest 4 bits to get hex strings + weight_width_padded = roundup_to_integer_multiple(weight_width, 4) + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix="" + ) + weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_stream.copy() + with 
open(weight_file_name, "w") as f: + for val in weight_stream: + f.write(val + "\n") + elif weight_file_mode == "decoupled_runtime": + # memstream axi-lite interface will map each mem line to + # one or multiple 32-bit words + weight_width = self.get_weightstream_width() + words_per_memwidth = 2 ** ceil(log2(weight_width / 32)) + if words_per_memwidth < 1: + words_per_memwidth = 1 + weight_width_padded = words_per_memwidth * 32 + # first, pack and ensure padding to 32 bits + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix="" + ) + weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_stream.copy() + with open(weight_file_name, "w") as f: + for val in weight_stream: + # split into groups of 8 hex digits (= 32 bits) + words_32b = textwrap.wrap(val, 8) + words_32b.reverse() + for word_32b in words_32b: + f.write(word_32b + "\n") + else: + raise Exception("Decoupled weight export not yet implemented") # Get the integer from the DataType and string-ify it # This assumes that the data is in the form "INTx" or similar From 560771a1b87a6f25dd2274232be55d86b350f74b Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 16 Nov 2022 11:08:53 +0000 Subject: [PATCH 026/235] [thresholding] add method generate_params() Signed-off-by: Fionn O'Donohoe --- .../thresholding_binary_search.py | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 6ed07287ab..ff9f5f4875 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -408,7 +408,45 @@ def code_generation_ipgen(self, model, fpgapart, clk): self.set_nodeattr("ipgen_path", code_gen_dir) self.set_nodeattr("ip_path", code_gen_dir) + # Generate params for RTLSim + self.generate_params(model, 
code_gen_dir) + def generate_params(self, model, path): + # Only 'decoupled' mode is supported + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode != "decoupled": raise Exception("Unrecognized memory mode for this node: {}".format(mem_mode)) + + code_gen_dir = path + weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir) + thresholds = model.get_initializer(self.onnx_node.input[1]) + self.make_weight_file(thresholds, "decoupled_npy", weight_filename_sim) + + # Verilog.dat thresholds: + # also save weights as Verilog .dat file + # note that we provide two different .dat files, one for sim + # and one for synthesis. this is because URAM-based weights always + # need zero weights for synthesis, otherwise they get inferred + # as BRAM + weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir) + weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir) + # sim weights are always the true weights + self.make_weight_file( + thresholds, "decoupled_verilog_dat", weight_filename_rtl_sim + ) + + # Synthesis thresholds: + ram_style = self.get_nodeattr("ram_style") + if ram_style == "ultra": + # UltraRAM must have no memory initializer, or only zeroes + # otherwise BRAM will be inferred instead of URAM + # as a workaround we provide a zero-weight init here + synth_thresholds = np.zeros_like(thresholds, dtype=np.float32) + else: + synth_thresholds = thresholds + self.make_weight_file( + synth_thresholds, "decoupled_verilog_dat", weight_filename_rtl_synth + ) + return def execute_node(self, context, graph): From e763bf80773be4e362f9f9171a01bb4b9eb4dc8a Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 16 Nov 2022 11:11:49 +0000 Subject: [PATCH 027/235] [thresholding] add method for preparing a Pyverilator object for RTL simulation Signed-off-by: Fionn O'Donohoe --- .../thresholding_binary_search.py | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git
a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index ff9f5f4875..611a75992e 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -28,6 +28,7 @@ import os import numpy as np +import warnings from qonnx.core.datatype import DataType from qonnx.util.basic import ( interleave_matrix_outer_dim_from_partitions, @@ -37,7 +38,12 @@ pack_innermost_dim_as_hex_string, ) from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -import warnings +from finn.util.basic import make_build_dir, get_rtlsim_trace_depth + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None """@package thresholding_binary_search - ONNX i/o tensor shape assumptions for Thresholding: @@ -449,6 +455,31 @@ def generate_params(self, model, path): return + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + verilog_paths = [code_gen_dir] + verilog_files = self.get_rtl_file_list() + + # build the Verilator emulation library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_nodeattr("gen_top_module"), + ) + + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + return sim + def execute_node(self, context, graph): return From 84e08f18a031dbfacec6a11b980c09885552efdf Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 16 Nov 2022 11:14:47 +0000 Subject: [PATCH 028/235] [thresholding] add 
method to run rtlsim on a thresholding binary search simulation object Signed-off-by: Fionn O'Donohoe --- .../thresholding_binary_search.py | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 611a75992e..4c7c67af72 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -35,6 +35,8 @@ roundup_to_integer_multiple, ) from finn.util.data_packing import ( + npy_to_rtlsim_input, + rtlsim_output_to_npy, pack_innermost_dim_as_hex_string, ) from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp @@ -481,6 +483,83 @@ def prepare_rtlsim(self): return sim def execute_node(self, context, graph): + # Perform input checks + if self.get_nodeattr("exec_mode") != "rtlsim": raise Exception("Invalid exec_mode value: {}; exec_mode must be set to 'rtlsim'".format(self.get_nodeattr("exec_mode"))) + if self.get_nodeattr("mem_mode") != "decoupled": raise Exception("Invalid mem_mode value: {}; mem_mode must be set to 'decoupled'".format(self.get_nodeattr("mem_mode"))) + + node = self.onnx_node + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + # the third input are the thresholds + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = 
self.get_input_datatype() + + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for Thresholding_Batch") + in_ind += 1 + + # Create a PyVerilator wrapper of the RTLSim .so + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + + super().reset_rtlsim(sim) + super().toggle_clk(sim) + + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + wei = npy_to_rtlsim_input( + "{}/thresholds.npy".format(code_gen_dir), export_wdt, wnbits + ) + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"s_axis": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + + # Manage output data + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + + rtlsim_output_to_npy( + output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output return def code_generation_ipi(self): From b0be07adb8e2bb0ab5005169ff0f878efc5c7c80 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 16 Nov 2022 11:16:33 +0000 Subject: [PATCH 029/235] [thresholding] add stubbed method for ipgen_singlenode_code() Signed-off-by: Fionn O'Donohoe --- .../custom_op/fpgadataflow/thresholding_binary_search.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py 
b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 4c7c67af72..19140a0090 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -665,6 +665,13 @@ def get_dynamic_config(self, model, address_stride=1): return config + def ipgen_singlenode_code(self): + """Normally: Builds the bash script for IP generation.""" + """This is needed for the HLSSynthIP() transformation. + This is an IP, not a HLS node, so therefore provide an empty hook + to prevent any HLS synthesis.""" + pass + def global_includes(self): pass From 30d22f88a40864257a97f7e9e9ff84f25c1bc32e Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 16 Nov 2022 13:51:10 +0000 Subject: [PATCH 030/235] [thresholding] update class name to a more consistent naming convention Signed-off-by: Fionn O'Donohoe --- src/finn/custom_op/fpgadataflow/__init__.py | 4 ++-- src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 65fbd6e20c..dc9a5a349a 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -59,7 +59,7 @@ from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch from finn.custom_op.fpgadataflow.thresholding_binary_search import ( - Thresholding_Bin_Search, + Thresholding_Binary_Search, ) from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour_Batch @@ -82,7 +82,7 @@ custom_op["Pool_Batch"] = Pool_Batch custom_op["FMPadding_Batch"] = FMPadding_Batch custom_op["Thresholding_Batch"] = Thresholding_Batch -custom_op["Thresholding_Binary_search"] = Thresholding_Bin_Search 
+custom_op["Thresholding_Binary_Search"] = Thresholding_Binary_Search custom_op["AddStreams_Batch"] = AddStreams_Batch custom_op["LabelSelect_Batch"] = LabelSelect_Batch custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 19140a0090..9bf36283da 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -59,7 +59,7 @@ """ -class Thresholding_Bin_Search(HLSCustomOp): +class Thresholding_Binary_Search(HLSCustomOp): """Class that corresponds to finn-rtllib 'thresholding' function.""" def __init__(self, onnx_node): From 3594edddf51f8a13053a6ad99e179d081e15d8d4 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 17 Nov 2022 09:54:46 +0000 Subject: [PATCH 031/235] [thresholding] add fpgadataflow pytests for thresholding binary search node Signed-off-by: Fionn O'Donohoe --- ...fpgadataflow_thresholding_binary_search.py | 417 ++++++++++++++++++ 1 file changed, 417 insertions(+) create mode 100755 tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py new file mode 100755 index 0000000000..0a02503300 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py @@ -0,0 +1,417 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest +import numpy as np +from onnx import TensorProto, helper +from pyverilator.util.axi_utils import axilite_write, reset_rtlsim +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.multithreshold import multithreshold +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import gen_finn_dt_tensor + +from finn.core.rtlsim_exec import rtlsim_exec +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + +test_fpga_part = "xczu3eg-sbva484-1-e" +target_clk_ns = 5 + +# Helper functions +def sort_thresholds_increasing(thresholds): + return np.sort(thresholds, axis=1) + +def generate_random_threshold_values(input_data_type, num_input_channels, num_steps): + return np.random.randint(input_data_type.min(), input_data_type.max() + 1, (num_input_channels, num_steps)).astype(np.float32) + +def generate_pe_value(fold, num_input_channels): + if fold == -1: + fold = num_input_channels + pe = num_input_channels // fold + assert num_input_channels % pe == 0 + return pe + +# n = batch, c = channel, h = height, w = width of feature map +# Standard = NCHW; FINN = NHWC +# Convert from NCHW to NHWC +def convert_np_array_to_finn_data_layout(data): + return np.transpose(data, (0, 2, 3, 1)) + +# n = batch, c = channel, h = height, w = width of feature map +# Standard = NCHW; FINN = NHWC +# Convert from NHWC to 
NCHW +def convert_np_array_to_standard_data_layout(data): + return np.transpose(data, (0, 3, 1, 2)) + +def make_single_thresholding_binary_search_modelwrapper( + thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs +): + NumChannels = thresholds.shape[0] + + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, num_input_vecs + [NumChannels] + ) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, num_input_vecs + [NumChannels] + ) + + node_inp_list = ["inp", "thresh"] + + Thresholding_node = helper.make_node( + "Thresholding_Binary_Search", + node_inp_list, + ["outp"], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=NumChannels, + PE=pe, + numSteps=thresholds.shape[1], + inputDataType=input_data_type.name, + weightDataType=input_data_type.name, + outputDataType=output_data_type.name, + activation_bias=activation_bias, + mem_mode=mem_mode, + numInputVectors=num_input_vecs, + ) + graph = helper.make_graph( + nodes=[Thresholding_node], + name="thresholding_graph", + inputs=[inp], + outputs=[outp], + ) + + model = helper.make_model(graph, producer_name="thresholding-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", input_data_type) + model.set_tensor_datatype("outp", output_data_type) + + model.set_tensor_datatype("thresh", input_data_type) + model.set_initializer("thresh", thresholds) + return model + +# Test brief: a particular method for this class was causing a bug - find_next_power_of_2() +# Weights in the thresholding core are programmed on a per-channel basis and are byte-addressable. +# When a channel is programmed, the next channel can start programming at the next power-of-2 byte boundary. +# This test is to show that the function that calculates that boundary is working correctly. 
+# +# A Thresholding_Binary_Search layer was created and a SW generated dataset with a threshold channel +# depth of 1 weight (1 layer of N channels in the thresholding core). However, find_next_power_of_2() +# was returning a next-power-of-2 address boundary at address '0', instead of '2'. This unit test +# is to prove that this bug no longer occurs. It was originally seen when the input datatype +# was 'DataType["BIPOLAR"]'. +@pytest.mark.tbs_unit +@pytest.mark.tbs_all +def test_fpgadataflow_thresholding_binary_search_unit(): + activation = DataType["BIPOLAR"] + input_data_type = DataType["INT16"] + fold = -1 + num_input_channels = 16 + mem_mode = "decoupled" + + # Handle inputs to the test + pe = generate_pe_value(fold, num_input_channels) + num_steps = activation.get_num_possible_values() - 1 + + # Other non-input parameters + num_input_vecs = [1, 2, 2] + output_data_type = activation + if output_data_type == DataType["BIPOLAR"]: + activation_bias = 0 + else: + activation_bias = output_data_type.min() + + # Generate random thresholds and sort in ascending order + thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps) + + # Generate model from input parameters to the test + model = make_single_thresholding_binary_search_modelwrapper( + thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs + ) + + # Retrieve the class to get the method-under-test + tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] + tbs_inst = getCustomOp(tbs_node) + + test_vector = [ + {"input": -2, "expected_result": 0}, + {"input": -1, "expected_result": 0}, + {"input": 0, "expected_result": 0}, + {"input": 1, "expected_result": 2}, + {"input": 2, "expected_result": 2}, + {"input": 3, "expected_result": 4}, + {"input": 4, "expected_result": 4}, + {"input": 7, "expected_result": 8}, + {"input": 8, "expected_result": 8}, + {"input": 11, "expected_result": 16}, + {"input": 15, "expected_result": 
16}, + {"input": 16, "expected_result": 16}, + {"input": 18, "expected_result": 32}, + {"input": 27, "expected_result": 32}, + {"input": 31, "expected_result": 32}, + {"input": 32, "expected_result": 32}, + {"input": 42, "expected_result": 64}, + {"input": 65, "expected_result": 128}, + ] + + for test_dict in test_vector: + output = tbs_inst.find_next_power_of_2(test_dict["input"]) + assert output >= test_dict["input"] + assert output == test_dict["expected_result"] + + return + +# Test brief: Prove that cppsim is not supported for this class +@pytest.mark.tbs_cppsim +@pytest.mark.tbs_all +def test_fpgadataflow_thresholding_binary_search_cppsim(): + input_data_type = DataType["UINT16"] + act = DataType["BIPOLAR"] + fold = -1 + num_input_channels = 16 + mem_mode = "decoupled" # 'const' is unsupported - see test_fpgadataflow_thresholding_binary_search_const_mem_mode + + pe = generate_pe_value(fold, num_input_channels) + num_steps = act.get_num_possible_values() - 1 + + # Generate random, non-decreasing thresholds + thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps) + + # make the vivado_hls threshold bug appear (incorrect rtlsim result when first + # threshold of first channel is zero, while using BIPOLAR output) + if act == DataType["BIPOLAR"]: + thresholds[0][0] = 0 + thresholds = sort_thresholds_increasing(thresholds) + + # Other non-input parameters + num_input_vecs = [1, 2, 2] + output_data_type = act + if output_data_type == DataType["BIPOLAR"]: + activation_bias = 0 + else: + activation_bias = output_data_type.min() + + # Generate model from input parameters to the test + model = make_single_thresholding_binary_search_modelwrapper( + thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs + ) + + # Cppsim is not supported for this class, catch the specific exception thrown by cppsim + # Exception raised in cppsim: Custom op_type Thresholding_Binary_Search is currently not 
supported. + try: + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + except Exception as e: + if str(e) != "Custom op_type Thresholding_Binary_Search is currently not supported.": + raise + +# Test brief: Prove that memory mode 'const' is not supported for this layer type +@pytest.mark.tbs_const +@pytest.mark.tbs_all +def test_fpgadataflow_thresholding_binary_search_const_mem_mode(): + input_data_type = DataType["INT16"] + activation = DataType["INT4"] + fold = -1 + num_input_channels = 16 + mem_mode = "const" + + pe = generate_pe_value(fold, num_input_channels) + num_input_vecs = [1, 2, 2] + output_data_type = activation + activation_bias = output_data_type.min() + + # Generate random thresholds and sort in ascending order + num_steps = activation.get_num_possible_values() - 1 + thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps) + + # Generate model from input parameters to the test + model = make_single_thresholding_binary_search_modelwrapper( + thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs + ) + + # Prove that 'const' memory mode is not supported for this class + # 'const' memory mode is not supported for this class, catch the specific exception thrown by FINN + # Exception: ('Unrecognized memory mode for this node:', 'const') + try: + model = model.transform(InsertFIFO(True)) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + except Exception as e: + if str(e) != "Unrecognized memory mode for this node: {}".format(mem_mode): + raise + # Caught the expected exception, leave the test early + return + +# Test brief: Test that PrepareRTLSim() runs successfully. 
This function is not +# tested in test_fpgadataflow_thresholding_binary_search() +@pytest.mark.tbs_prep_rtlsim +@pytest.mark.tbs_all +def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): + input_data_type = DataType["INT16"] + act = DataType["INT4"] + fold = -1 + num_input_channels = 16 + mem_mode = "decoupled" + + # Handle inputs to the test + pe = generate_pe_value(fold, num_input_channels) + num_steps = act.get_num_possible_values() - 1 + + # Generate random, non-decreasing thresholds + thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps) + # make the vivado_hls threshold bug appear (incorrect rtlsim result when first + # threshold of first channel is zero, while using BIPOLAR output) + if act == DataType["BIPOLAR"]: + thresholds[0][0] = 0 + thresholds = sort_thresholds_increasing(thresholds) + + # Other non-input parameters + num_input_vecs = [1, 2, 2] + output_data_type = act + if output_data_type == DataType["BIPOLAR"]: + activation_bias = 0 + else: + activation_bias = output_data_type.min() + + # Generate model from input parameters to the test + model = make_single_thresholding_binary_search_modelwrapper( + thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs + ) + + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + return + +# Test brief: Create a Thresholding binary search layer using various parameters +# and test against a SW generated & simulated dataset +# N.B. - fold factor of '-1' is supported only (no PE/SIMD support) +@pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]]) +@pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) +@pytest.mark.parametrize("fold", [-1]) # 1, 2, etc. 
will fail +@pytest.mark.parametrize("num_input_channels", [16]) +# no need to test 'const' mode, it's already done in test_fpgadataflow_thresholding_binary_search_const_mem_mode() +@pytest.mark.parametrize("mem_mode", ["decoupled"]) +@pytest.mark.tbs_soak +@pytest.mark.tbs_all +def test_fpgadataflow_thresholding_binary_search(activation, input_data_type, fold, num_input_channels, mem_mode): + # Handle inputs to the test + pe = generate_pe_value(fold, num_input_channels) + num_steps = activation.get_num_possible_values() - 1 + + # Other non-input parameters + num_input_vecs = [1, 2, 2] + output_data_type = activation + if output_data_type == DataType["BIPOLAR"]: + activation_bias = 0 + else: + activation_bias = output_data_type.min() + + # generate random input data + tensor_shape = tuple(num_input_vecs + [num_input_channels]) + x = gen_finn_dt_tensor(input_data_type, tensor_shape) + + # Generate random thresholds and sort in ascending order + thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps) + + # make the vivado_hls threshold bug appear (incorrect rtlsim result when first + # threshold of first channel is zero, while using BIPOLAR output) + if activation == DataType["BIPOLAR"]: + thresholds[0][0] = 0 + + # provide non-decreasing/ascending thresholds + thresholds = sort_thresholds_increasing(thresholds) + + x_nhwc = convert_np_array_to_standard_data_layout(x) + y = multithreshold(x_nhwc, thresholds) + + # convert back to NHWC for comparison to hw outputs + y = convert_np_array_to_finn_data_layout(y) + if activation == DataType["BIPOLAR"]: + # binary to bipolar + y = 2 * y - 1 + else: + # signed offset + y += activation.min() + + # Generate model from input parameters to the test + model = make_single_thresholding_binary_search_modelwrapper( + thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs + ) + + model = model.transform(InsertFIFO(True)) + model = 
model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + + # Retrieve the axilite programming sequence for the weights - for decoupled mode only + tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] + tbs_inst = getCustomOp(tbs_node) + config = tbs_inst.get_dynamic_config(model) + + # Reshape generated data (not from model) + oshape = model.get_tensor_shape("outp") + y_expected = y.reshape(oshape) + + # Helper function that delivers the hook to program the thresholds via AXI-Lite + def config_hook(config): + if config is None: + return None + + def write_thresh_config(sim): + # axi_name = "s_axilite_0_" # works + axi_name = getCustomOp(model.get_nodes_by_op_type("Thresholding_Binary_Search")[0]).get_verilog_top_module_intf_names()['axilite'][0] + axi_name += "_0_" + + # 1. Write config registers to the Threshold memory, dict defines (addr, value) tuples + for config_entry in config.values(): + addr = config_entry[0] + val = config_entry[1] + axilite_write(sim, addr, val, basename=axi_name) + + reset_rtlsim(sim) + return write_thresh_config + + input_dict = {"inp": x} + rtlsim_exec(model, input_dict, pre_hook=config_hook(config)) + y_produced = input_dict["outp"] + assert (y_produced == y_expected).all() From 0bee70d5e4bc5fd163b8cf8a84931ac709aaac35 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 17 Nov 2022 10:08:38 +0000 Subject: [PATCH 032/235] [thresholding] add linter fixes Signed-off-by: Fionn O'Donohoe --- .../thresholding_binary_search.py | 121 ++++++++++++------ ...fpgadataflow_thresholding_binary_search.py | 103 ++++++++++++--- 2 files changed, 168 insertions(+), 56 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 9bf36283da..b785abcaa8 100755 --- 
a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -26,21 +26,22 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import os import numpy as np +import os import warnings from qonnx.core.datatype import DataType from qonnx.util.basic import ( interleave_matrix_outer_dim_from_partitions, roundup_to_integer_multiple, ) + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import ( npy_to_rtlsim_input, - rtlsim_output_to_npy, pack_innermost_dim_as_hex_string, + rtlsim_output_to_npy, ) -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.basic import make_build_dir, get_rtlsim_trace_depth try: from pyverilator import PyVerilator @@ -151,7 +152,10 @@ def get_outstream_width(self): def get_weightstream_width(self): # Only 'decoupled' mode is supported mem_mode = self.get_nodeattr("mem_mode") - if mem_mode != "decoupled": raise Exception("Unrecognized memory mode for this node: {}".format(mem_mode)) + if mem_mode != "decoupled": + raise Exception( + "Unrecognized memory mode for this node: {}".format(mem_mode) + ) pe = self.get_nodeattr("PE") wp = self.get_weight_datatype().bitwidth() n_thres_steps = self.get_nodeattr("numSteps") @@ -257,7 +261,10 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): * weight_file_name : filename for the weight file to be generated """ # There are 'decoupled_*' flavors, just make sure that the flavors are decoupled related - if "decoupled" not in weight_file_mode: raise Exception("Unrecognized memory mode for this node: {}".format(weight_file_mode)) + if "decoupled" not in weight_file_mode: + raise Exception( + "Unrecognized memory mode for this node: {}".format(weight_file_mode) + ) 
threshold_tensor = self.get_hls_compatible_threshold_tensor(weights) tdt = self.get_weight_datatype() @@ -334,21 +341,35 @@ def prepare_codegen_rtl_values(self): # Identify the module names code_gen_dict["$MODULE_NAME$"] = [self.get_verilog_top_module_name()] - code_gen_dict["$MODULE_NAME_AXI$"] = [self.get_verilog_top_module_name() + "_axi"] - code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name() + "_axi_wrapper"] + code_gen_dict["$MODULE_NAME_AXI$"] = [ + self.get_verilog_top_module_name() + "_axi" + ] + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [ + self.get_verilog_top_module_name() + "_axi_wrapper" + ] # Set the top module name - AXI wrapper code_gen_dict["$TOP_MODULE$"] = code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] # Identify the module variables - output_data_type = self.get_nodeattr("outputDataType") # output precision - input_data_type = self.get_nodeattr("inputDataType") # input/threshold precision - num_channels = self.get_nodeattr("NumChannels") # number of channels - bias = self.get_nodeattr("activation_bias") # activation bias value - - code_gen_dict["$N$"] = [self.conv_datatype_to_str(output_data_type)] # output precision - code_gen_dict["$M$"] = [self.conv_datatype_to_str(input_data_type)] # input/threshold precision - code_gen_dict["$C$"] = [self.conv_datatype_to_str(num_channels)] # number of channels - code_gen_dict["$BIAS$"] = [self.conv_datatype_to_str(bias)] # activation bias value + output_data_type = self.get_nodeattr("outputDataType") # output precision + input_data_type = self.get_nodeattr( + "inputDataType" + ) # input/threshold precision + num_channels = self.get_nodeattr("NumChannels") # number of channels + bias = self.get_nodeattr("activation_bias") # activation bias value + + code_gen_dict["$N$"] = [ + self.conv_datatype_to_str(output_data_type) + ] # output precision + code_gen_dict["$M$"] = [ + self.conv_datatype_to_str(input_data_type) + ] # input/threshold precision + code_gen_dict["$C$"] = [ + 
self.conv_datatype_to_str(num_channels) + ] # number of channels + code_gen_dict["$BIAS$"] = [ + self.conv_datatype_to_str(bias) + ] # activation bias value # Is the input datatype signed or unsigned? The thresholding core needs to know this if self.get_input_datatype().min() < 0: @@ -359,9 +380,7 @@ def prepare_codegen_rtl_values(self): return code_gen_dict def get_rtl_file_list(self): - return ["thresholding.sv", - "thresholding_axi.sv", - "thresholding_axi_wrapper.v"] + return ["thresholding.sv", "thresholding_axi.sv", "thresholding_axi_wrapper.v"] def get_rtl_file_paths(self): rtl_root_dir = os.environ["FINN_ROOT"] + "/finn-rtllib/thresholding/hdl/" @@ -399,7 +418,7 @@ def generate_hdl(self): # apply code generation to templates data = self.fill_in_rtl_template_data(code_gen_dict, template_data) # dump filled-in template to destination directory for compilation - file_only_path = rtl_file_path.split('/')[-1] + file_only_path = rtl_file_path.split("/")[-1] self.dump_rtl_data(code_gen_dir, file_only_path, data) # Before we return - set the 'gen_top_module' attribute for use later by PyVerilator and IPI generation @@ -422,7 +441,10 @@ def code_generation_ipgen(self, model, fpgapart, clk): def generate_params(self, model, path): # Only 'decoupled' mode is supported mem_mode = self.get_nodeattr("mem_mode") - if mem_mode != "decoupled": raise Exception("Unrecognized memory mode for this node: {}".format(mem_mode)) + if mem_mode != "decoupled": + raise Exception( + "Unrecognized memory mode for this node: {}".format(mem_mode) + ) code_gen_dir = path weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir) @@ -484,8 +506,18 @@ def prepare_rtlsim(self): def execute_node(self, context, graph): # Perform input checks - if self.get_nodeattr("exec_mode") != "rtlsim": raise Exception("Invalid exec_mode value: {}; exec_mode must be set to 'rtlsim'".format(self.get_nodeattr("exec_mode"))) - if self.get_nodeattr("mem_mode") != "decoupled": raise Exception("Invalid 
mem_mode value: {}; mem_mode must be set to 'decoupled'".format(self.get_nodeattr("mem_mode"))) + if self.get_nodeattr("exec_mode") != "rtlsim": + raise Exception( + "Invalid exec_mode value: {}; exec_mode must be set to 'rtlsim'".format( + self.get_nodeattr("exec_mode") + ) + ) + if self.get_nodeattr("mem_mode") != "decoupled": + raise Exception( + "Invalid mem_mode value: {}; mem_mode must be set to 'decoupled'".format( + self.get_nodeattr("mem_mode") + ) + ) node = self.onnx_node code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") @@ -569,24 +601,27 @@ def code_generation_ipi(self): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") for rtl_file in rtl_file_list: - cmd.append("add_files -norecurse %s" - % ( - os.path.join( - code_gen_dir, rtl_file - ) - )) + cmd.append( + "add_files -norecurse %s" % (os.path.join(code_gen_dir, rtl_file)) + ) # Create an RTL block, not an IP core (-type ip) - cmd.append("create_bd_cell -type module -reference %s %s" - % (self.get_nodeattr("gen_top_module"), self.onnx_node.name)) + cmd.append( + "create_bd_cell -type module -reference %s %s" + % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) + ) # ERROR: [BD 41-237] Bus Interface property FREQ_HZ does not match between # /Thresholding_Binary_Search_0/s_axis(100000000 and /StreamingFIFO_0/out_V(200000000.000000) - cmd.append("set_property -dict [list CONFIG.FREQ_HZ {200000000}] [get_bd_intf_pins Thresholding_Binary_Search_0/s_axis]") + cmd.append( + "set_property -dict [list CONFIG.FREQ_HZ {200000000}] [get_bd_intf_pins Thresholding_Binary_Search_0/s_axis]" + ) # ERROR: [BD 41-237] Bus Interface property FREQ_HZ does not match between # /StreamingFIFO_1/in0_V(200000000.000000) and /Thresholding_Binary_Search_0/m_axis(100000000) - cmd.append("set_property -dict [list CONFIG.FREQ_HZ {200000000}] [get_bd_intf_pins Thresholding_Binary_Search_0/m_axis]") + cmd.append( + "set_property -dict [list CONFIG.FREQ_HZ {200000000}] [get_bd_intf_pins 
Thresholding_Binary_Search_0/m_axis]" + ) return cmd @@ -603,7 +638,10 @@ def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() # Only 'decoupled' mode is supported - check before adding axilite interface mem_mode = self.get_nodeattr("mem_mode") - if mem_mode != "decoupled": raise Exception("Unrecognized memory mode for this node: {}".format(mem_mode)) + if mem_mode != "decoupled": + raise Exception( + "Unrecognized memory mode for this node: {}".format(mem_mode) + ) intf_names["axilite"] = ["s_axilite"] intf_names["s_axis"] = [["s_axis"]] intf_names["m_axis"] = [["m_axis"]] @@ -618,7 +656,7 @@ def find_next_power_of_2(self, n): return 0 # If '1' is requested, output will be '0' in the loop below, so avoid this earlier. elif n == 1: - return 2 # i.e. 2**1 + return 2 # i.e. 2**1 # decrement 'n' (to handle cases when `n` itself is a power of 2) n = n - 1 @@ -651,12 +689,17 @@ def get_dynamic_config(self, model, address_stride=1): config = {} channel_cntr = 0 for channel in thresholds: - channel_start_addr = (channel_cntr * weight_addr_boundary * address_stride) + channel_start_addr = channel_cntr * weight_addr_boundary * address_stride weight_cntr = 0 addr = 0 for weight in channel: - key_name = "{}_{}{}_{}{}".format("axilite", "ch", str(channel_cntr), "w", str(weight_cntr)) - config[key_name] = (channel_start_addr + addr, self.prep_axilite_val(weight)) + key_name = "{}_{}{}_{}{}".format( + "axilite", "ch", str(channel_cntr), "w", str(weight_cntr) + ) + config[key_name] = ( + channel_start_addr + addr, + self.prep_axilite_val(weight), + ) weight_cntr += 1 addr += address_stride diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py index 0a02503300..579b6fe83c 100755 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py @@ -27,6 +27,7 @@ 
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import pytest + import numpy as np from onnx import TensorProto, helper from pyverilator.util.axi_utils import axilite_write, reset_rtlsim @@ -54,8 +55,14 @@ def sort_thresholds_increasing(thresholds): return np.sort(thresholds, axis=1) + def generate_random_threshold_values(input_data_type, num_input_channels, num_steps): - return np.random.randint(input_data_type.min(), input_data_type.max() + 1, (num_input_channels, num_steps)).astype(np.float32) + return np.random.randint( + input_data_type.min(), + input_data_type.max() + 1, + (num_input_channels, num_steps), + ).astype(np.float32) + def generate_pe_value(fold, num_input_channels): if fold == -1: @@ -64,20 +71,29 @@ def generate_pe_value(fold, num_input_channels): assert num_input_channels % pe == 0 return pe + # n = batch, c = channel, h = height, w = width of feature map # Standard = NCHW; FINN = NHWC # Convert from NCHW to NHWC def convert_np_array_to_finn_data_layout(data): return np.transpose(data, (0, 2, 3, 1)) + # n = batch, c = channel, h = height, w = width of feature map # Standard = NCHW; FINN = NHWC # Convert from NHWC to NCHW def convert_np_array_to_standard_data_layout(data): return np.transpose(data, (0, 3, 1, 2)) + def make_single_thresholding_binary_search_modelwrapper( - thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + mem_mode, + num_input_vecs, ): NumChannels = thresholds.shape[0] @@ -123,6 +139,7 @@ def make_single_thresholding_binary_search_modelwrapper( model.set_initializer("thresh", thresholds) return model + # Test brief: a particular method for this class was causing a bug - find_next_power_of_2() # Weights in the thresholding core are programmed on a per-channel basis and are byte-addressable. 
# When a channel is programmed, the next channel can start programming at the next power-of-2 byte boundary. @@ -155,11 +172,19 @@ def test_fpgadataflow_thresholding_binary_search_unit(): activation_bias = output_data_type.min() # Generate random thresholds and sort in ascending order - thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps) + thresholds = generate_random_threshold_values( + input_data_type, num_input_channels, num_steps + ) # Generate model from input parameters to the test model = make_single_thresholding_binary_search_modelwrapper( - thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + mem_mode, + num_input_vecs, ) # Retrieve the class to get the method-under-test @@ -194,6 +219,7 @@ def test_fpgadataflow_thresholding_binary_search_unit(): return + # Test brief: Prove that cppsim is not supported for this class @pytest.mark.tbs_cppsim @pytest.mark.tbs_all @@ -202,13 +228,15 @@ def test_fpgadataflow_thresholding_binary_search_cppsim(): act = DataType["BIPOLAR"] fold = -1 num_input_channels = 16 - mem_mode = "decoupled" # 'const' is unsupported - see test_fpgadataflow_thresholding_binary_search_const_mem_mode + mem_mode = "decoupled" # 'const' is unsupported - see test_fpgadataflow_thresholding_binary_search_const_mem_mode pe = generate_pe_value(fold, num_input_channels) num_steps = act.get_num_possible_values() - 1 # Generate random, non-decreasing thresholds - thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps) + thresholds = generate_random_threshold_values( + input_data_type, num_input_channels, num_steps + ) # make the vivado_hls threshold bug appear (incorrect rtlsim result when first # threshold of first channel is zero, while using BIPOLAR output) @@ -226,7 +254,13 @@ def test_fpgadataflow_thresholding_binary_search_cppsim(): # Generate model 
from input parameters to the test model = make_single_thresholding_binary_search_modelwrapper( - thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + mem_mode, + num_input_vecs, ) # Cppsim is not supported for this class, catch the specific exception thrown by cppsim @@ -236,9 +270,13 @@ def test_fpgadataflow_thresholding_binary_search_cppsim(): model = model.transform(CompileCppSim()) model = model.transform(SetExecMode("cppsim")) except Exception as e: - if str(e) != "Custom op_type Thresholding_Binary_Search is currently not supported.": + if ( + str(e) + != "Custom op_type Thresholding_Binary_Search is currently not supported." + ): raise + # Test brief: Prove that memory mode 'const' is not supported for this layer type @pytest.mark.tbs_const @pytest.mark.tbs_all @@ -256,11 +294,19 @@ def test_fpgadataflow_thresholding_binary_search_const_mem_mode(): # Generate random thresholds and sort in ascending order num_steps = activation.get_num_possible_values() - 1 - thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps) + thresholds = generate_random_threshold_values( + input_data_type, num_input_channels, num_steps + ) # Generate model from input parameters to the test model = make_single_thresholding_binary_search_modelwrapper( - thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + mem_mode, + num_input_vecs, ) # Prove that 'const' memory mode is not supported for this class @@ -278,6 +324,7 @@ def test_fpgadataflow_thresholding_binary_search_const_mem_mode(): # Caught the expected exception, leave the test early return + # Test brief: Test that PrepareRTLSim() runs successfully. 
This function is not # tested in test_fpgadataflow_thresholding_binary_search() @pytest.mark.tbs_prep_rtlsim @@ -294,7 +341,9 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): num_steps = act.get_num_possible_values() - 1 # Generate random, non-decreasing thresholds - thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps) + thresholds = generate_random_threshold_values( + input_data_type, num_input_channels, num_steps + ) # make the vivado_hls threshold bug appear (incorrect rtlsim result when first # threshold of first channel is zero, while using BIPOLAR output) if act == DataType["BIPOLAR"]: @@ -311,7 +360,13 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): # Generate model from input parameters to the test model = make_single_thresholding_binary_search_modelwrapper( - thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + mem_mode, + num_input_vecs, ) model = model.transform(SetExecMode("rtlsim")) @@ -321,18 +376,21 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): model = model.transform(PrepareRTLSim()) return + # Test brief: Create a Thresholding binary search layer using various parameters # and test against a SW generated & simulated dataset # N.B. - fold factor of '-1' is supported only (no PE/SIMD support) @pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]]) @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) -@pytest.mark.parametrize("fold", [-1]) # 1, 2, etc. will fail +@pytest.mark.parametrize("fold", [-1]) # 1, 2, etc. 
will fail @pytest.mark.parametrize("num_input_channels", [16]) # no need to test 'const' mode, it's already done in test_fpgadataflow_thresholding_binary_search_const_mem_mode() @pytest.mark.parametrize("mem_mode", ["decoupled"]) @pytest.mark.tbs_soak @pytest.mark.tbs_all -def test_fpgadataflow_thresholding_binary_search(activation, input_data_type, fold, num_input_channels, mem_mode): +def test_fpgadataflow_thresholding_binary_search( + activation, input_data_type, fold, num_input_channels, mem_mode +): # Handle inputs to the test pe = generate_pe_value(fold, num_input_channels) num_steps = activation.get_num_possible_values() - 1 @@ -350,7 +408,9 @@ def test_fpgadataflow_thresholding_binary_search(activation, input_data_type, fo x = gen_finn_dt_tensor(input_data_type, tensor_shape) # Generate random thresholds and sort in ascending order - thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps) + thresholds = generate_random_threshold_values( + input_data_type, num_input_channels, num_steps + ) # make the vivado_hls threshold bug appear (incorrect rtlsim result when first # threshold of first channel is zero, while using BIPOLAR output) @@ -374,7 +434,13 @@ def test_fpgadataflow_thresholding_binary_search(activation, input_data_type, fo # Generate model from input parameters to the test model = make_single_thresholding_binary_search_modelwrapper( - thresholds, pe, input_data_type, output_data_type, activation_bias, mem_mode, num_input_vecs + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + mem_mode, + num_input_vecs, ) model = model.transform(InsertFIFO(True)) @@ -399,7 +465,9 @@ def config_hook(config): def write_thresh_config(sim): # axi_name = "s_axilite_0_" # works - axi_name = getCustomOp(model.get_nodes_by_op_type("Thresholding_Binary_Search")[0]).get_verilog_top_module_intf_names()['axilite'][0] + axi_name = getCustomOp( + model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] + 
).get_verilog_top_module_intf_names()["axilite"][0] axi_name += "_0_" # 1. Write config registers to the Threshold memory, dict defines (addr, value) tuples @@ -409,6 +477,7 @@ def write_thresh_config(sim): axilite_write(sim, addr, val, basename=axi_name) reset_rtlsim(sim) + return write_thresh_config input_dict = {"inp": x} From 0689c6a6a03cbc2e9b3982af971144ac186a2c76 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 17 Nov 2022 10:30:50 +0000 Subject: [PATCH 033/235] [thresholding] add flake8 fixes Signed-off-by: Fionn O'Donohoe --- .../thresholding_binary_search.py | 37 +++++++++------- ...fpgadataflow_thresholding_binary_search.py | 42 +++++++++++-------- 2 files changed, 48 insertions(+), 31 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index b785abcaa8..003dbb2fd9 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -28,7 +28,9 @@ import numpy as np import os +import textwrap import warnings +from math import ceil, log2 from qonnx.core.datatype import DataType from qonnx.util.basic import ( interleave_matrix_outer_dim_from_partitions, @@ -260,7 +262,8 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): decoupled_runtime} * weight_file_name : filename for the weight file to be generated """ - # There are 'decoupled_*' flavors, just make sure that the flavors are decoupled related + # There are 'decoupled_*' flavors, just make sure that the flavors + # are decoupled related if "decoupled" not in weight_file_mode: raise Exception( "Unrecognized memory mode for this node: {}".format(weight_file_mode) @@ -371,7 +374,8 @@ def prepare_codegen_rtl_values(self): self.conv_datatype_to_str(bias) ] # activation bias value - # Is the input datatype signed or unsigned? 
The thresholding core needs to know this + # Is the input datatype signed or unsigned? + # The thresholding core needs to know this when comparing weights to inputs if self.get_input_datatype().min() < 0: code_gen_dict["$SIGN$"] = ["signed"] else: @@ -421,7 +425,8 @@ def generate_hdl(self): file_only_path = rtl_file_path.split("/")[-1] self.dump_rtl_data(code_gen_dir, file_only_path, data) - # Before we return - set the 'gen_top_module' attribute for use later by PyVerilator and IPI generation + # Before we return - set the 'gen_top_module' attribute for use later + # by PyVerilator and IPI generation self.set_nodeattr("gen_top_module", code_gen_dict["$TOP_MODULE$"][0]) return @@ -508,14 +513,14 @@ def execute_node(self, context, graph): # Perform input checks if self.get_nodeattr("exec_mode") != "rtlsim": raise Exception( - "Invalid exec_mode value: {}; exec_mode must be set to 'rtlsim'".format( - self.get_nodeattr("exec_mode") + "Invalid exec_mode value: {}; exec_mode must be set to '{}'".format( + self.get_nodeattr("exec_mode"), "rtlsim" ) ) if self.get_nodeattr("mem_mode") != "decoupled": raise Exception( - "Invalid mem_mode value: {}; mem_mode must be set to 'decoupled'".format( - self.get_nodeattr("mem_mode") + "Invalid mem_mode value: {}; mem_mode must be set to '{}'".format( + self.get_nodeattr("mem_mode"), "decoupled" ) ) @@ -595,7 +600,8 @@ def execute_node(self, context, graph): return def code_generation_ipi(self): - """Constructs and returns the TCL commands for node instantiation as an RTL block.""" + """Constructs and returns the TCL commands for node instantiation as an RTL + block.""" cmd = [] rtl_file_list = self.get_rtl_file_list() code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") @@ -612,15 +618,19 @@ def code_generation_ipi(self): ) # ERROR: [BD 41-237] Bus Interface property FREQ_HZ does not match between - # /Thresholding_Binary_Search_0/s_axis(100000000 and /StreamingFIFO_0/out_V(200000000.000000) + # 
/Thresholding_Binary_Search_0/s_axis(100000000 and + # /StreamingFIFO_0/out_V(200000000.000000) cmd.append( - "set_property -dict [list CONFIG.FREQ_HZ {200000000}] [get_bd_intf_pins Thresholding_Binary_Search_0/s_axis]" + "set_property -dict [list CONFIG.FREQ_HZ {200000000}] [%s %s]" + % ("get_bd_intf_pins", "Thresholding_Binary_Search_0/s_axis") ) # ERROR: [BD 41-237] Bus Interface property FREQ_HZ does not match between - # /StreamingFIFO_1/in0_V(200000000.000000) and /Thresholding_Binary_Search_0/m_axis(100000000) + # /StreamingFIFO_1/in0_V(200000000.000000) and + # /Thresholding_Binary_Search_0/m_axis(100000000) cmd.append( - "set_property -dict [list CONFIG.FREQ_HZ {200000000}] [get_bd_intf_pins Thresholding_Binary_Search_0/m_axis]" + "set_property -dict [list CONFIG.FREQ_HZ {200000000}] [%s %s]" + % ("get_bd_intf_pins", "Thresholding_Binary_Search_0/m_axis") ) return cmd @@ -654,7 +664,7 @@ def find_next_power_of_2(self, n): # Negative values will loop infinitely below - return 0 if n <= 0: return 0 - # If '1' is requested, output will be '0' in the loop below, so avoid this earlier. + # If '1' is requested, output will be '0' in the loop below, avoid this now. elif n == 1: return 2 # i.e. 2**1 @@ -674,7 +684,6 @@ def prep_axilite_val(self, val): return self.twos_comp(int(val), self.get_weight_datatype().bitwidth()) def get_dynamic_config(self, model, address_stride=1): - ## TODO - not sure this description is correct """Returns a configuration dictionary containing axilite write commands in order to program the thresholds into the RTL core during runtime. 
The default address stride for the weights is 1 byte.""" diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py index 579b6fe83c..81a089844d 100755 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py @@ -51,6 +51,7 @@ test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 + # Helper functions def sort_thresholds_increasing(thresholds): return np.sort(thresholds, axis=1) @@ -140,16 +141,18 @@ def make_single_thresholding_binary_search_modelwrapper( return model -# Test brief: a particular method for this class was causing a bug - find_next_power_of_2() -# Weights in the thresholding core are programmed on a per-channel basis and are byte-addressable. -# When a channel is programmed, the next channel can start programming at the next power-of-2 byte boundary. -# This test is to show that the function that calculates that boundary is working correctly. +# Test brief: a particular method for this class was causing a bug: +# find_next_power_of_2() +# Weights in the thresholding core are programmed on a per-channel basis and are +# byte-addressable. When a channel is programmed, the next channel can start +# programming at the next power-of-2 byte boundary. This test is to show that the +# function that calculates that boundary is working correctly. # -# A Thresholding_Binary_Search layer was created and a SW generated dataset with a threshold channel -# depth of 1 weight (1 layer of N channels in the thresholding core). However, find_next_power_of_2() -# was returning a next-power-of-2 address boundary at address '0', instead of '2'. This unit test -# is to prove that this bug no longer occurs. It was originally seen when the input datatype -# was 'DataType["BIPOLAR"]'. 
+# A Thresholding_Binary_Search layer was created and a SW generated dataset with a +# threshold channel depth of 1 weight (1 layer of N channels in the thresholding core). +# However, find_next_power_of_2() was returning a next-power-of-2 address boundary at +# address '0', instead of '2'. This unit test is to prove that this bug no longer +# occurs. It was originally seen when the input datatype was 'DataType["BIPOLAR"]'. @pytest.mark.tbs_unit @pytest.mark.tbs_all def test_fpgadataflow_thresholding_binary_search_unit(): @@ -228,7 +231,9 @@ def test_fpgadataflow_thresholding_binary_search_cppsim(): act = DataType["BIPOLAR"] fold = -1 num_input_channels = 16 - mem_mode = "decoupled" # 'const' is unsupported - see test_fpgadataflow_thresholding_binary_search_const_mem_mode + # 'const' is unsupported see test: + # test_fpgadataflow_thresholding_binary_search_const_mem_mode() + mem_mode = "decoupled" pe = generate_pe_value(fold, num_input_channels) num_steps = act.get_num_possible_values() - 1 @@ -263,8 +268,9 @@ def test_fpgadataflow_thresholding_binary_search_cppsim(): num_input_vecs, ) - # Cppsim is not supported for this class, catch the specific exception thrown by cppsim - # Exception raised in cppsim: Custom op_type Thresholding_Binary_Search is currently not supported. + # Cppsim is not supported for this class, catch the specific exception thrown by + # cppsim. Exception raised in cppsim: Custom op_type Thresholding_Binary_Search is + # currently not supported. 
try: model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) @@ -310,8 +316,8 @@ def test_fpgadataflow_thresholding_binary_search_const_mem_mode(): ) # Prove that 'const' memory mode is not supported for this class - # 'const' memory mode is not supported for this class, catch the specific exception thrown by FINN - # Exception: ('Unrecognized memory mode for this node:', 'const') + # 'const' memory mode is not supported for this class, catch the specific exception + # thrown by FINN. Exception: ('Unrecognized memory mode for this node:', 'const') try: model = model.transform(InsertFIFO(True)) model = model.transform(GiveUniqueNodeNames()) @@ -384,7 +390,8 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) @pytest.mark.parametrize("fold", [-1]) # 1, 2, etc. will fail @pytest.mark.parametrize("num_input_channels", [16]) -# no need to test 'const' mode, it's already done in test_fpgadataflow_thresholding_binary_search_const_mem_mode() +# no need to test 'const' mode, it's already done in: +# test_fpgadataflow_thresholding_binary_search_const_mem_mode() @pytest.mark.parametrize("mem_mode", ["decoupled"]) @pytest.mark.tbs_soak @pytest.mark.tbs_all @@ -449,7 +456,7 @@ def test_fpgadataflow_thresholding_binary_search( model = model.transform(HLSSynthIP()) model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) - # Retrieve the axilite programming sequence for the weights - for decoupled mode only + # Retrieve the axilite programming sequence for weights - for decoupled mode only tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] tbs_inst = getCustomOp(tbs_node) config = tbs_inst.get_dynamic_config(model) @@ -470,7 +477,8 @@ def write_thresh_config(sim): ).get_verilog_top_module_intf_names()["axilite"][0] axi_name += "_0_" - # 1. 
Write config registers to the Threshold memory, dict defines (addr, value) tuples + # Write config registers to the Threshold memory. + # The dictionary defines (addr, value) tuples. for config_entry in config.values(): addr = config_entry[0] val = config_entry[1] From e9a4a7bb9dbdcc6dd2a7dd900f62851891793017 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 17 Nov 2022 12:01:52 +0000 Subject: [PATCH 034/235] [thresholding] change the pytest markers to omit tests from quicktest Signed-off-by: Fionn O'Donohoe --- ...fpgadataflow_thresholding_binary_search.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py index 81a089844d..e2189c4c79 100755 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py @@ -153,8 +153,8 @@ def make_single_thresholding_binary_search_modelwrapper( # However, find_next_power_of_2() was returning a next-power-of-2 address boundary at # address '0', instead of '2'. This unit test is to prove that this bug no longer # occurs. It was originally seen when the input datatype was 'DataType["BIPOLAR"]'. 
-@pytest.mark.tbs_unit -@pytest.mark.tbs_all +@pytest.mark.fpgadataflow +@pytest.mark.vivado def test_fpgadataflow_thresholding_binary_search_unit(): activation = DataType["BIPOLAR"] input_data_type = DataType["INT16"] @@ -224,8 +224,8 @@ def test_fpgadataflow_thresholding_binary_search_unit(): # Test brief: Prove that cppsim is not supported for this class -@pytest.mark.tbs_cppsim -@pytest.mark.tbs_all +@pytest.mark.fpgadataflow +@pytest.mark.vivado def test_fpgadataflow_thresholding_binary_search_cppsim(): input_data_type = DataType["UINT16"] act = DataType["BIPOLAR"] @@ -284,8 +284,8 @@ def test_fpgadataflow_thresholding_binary_search_cppsim(): # Test brief: Prove that memory mode 'const' is not supported for this layer type -@pytest.mark.tbs_const -@pytest.mark.tbs_all +@pytest.mark.fpgadataflow +@pytest.mark.vivado def test_fpgadataflow_thresholding_binary_search_const_mem_mode(): input_data_type = DataType["INT16"] activation = DataType["INT4"] @@ -333,8 +333,8 @@ def test_fpgadataflow_thresholding_binary_search_const_mem_mode(): # Test brief: Test that PrepareRTLSim() runs successfully. 
This function is not # tested in test_fpgadataflow_thresholding_binary_search() -@pytest.mark.tbs_prep_rtlsim -@pytest.mark.tbs_all +@pytest.mark.fpgadataflow +@pytest.mark.vivado def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): input_data_type = DataType["INT16"] act = DataType["INT4"] @@ -393,8 +393,9 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): # no need to test 'const' mode, it's already done in: # test_fpgadataflow_thresholding_binary_search_const_mem_mode() @pytest.mark.parametrize("mem_mode", ["decoupled"]) -@pytest.mark.tbs_soak -@pytest.mark.tbs_all +@pytest.mark.fpgadataflow +@pytest.mark.vivado +@pytest.mark.slow def test_fpgadataflow_thresholding_binary_search( activation, input_data_type, fold, num_input_channels, mem_mode ): From 41c0b4b0799674cd468b9aabfe47a5992891e873 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Fri, 25 Nov 2022 14:57:39 +0000 Subject: [PATCH 035/235] [thresholding] update copyright banners of files I have added/changed Signed-off-by: Fionn O'Donohoe --- src/finn/custom_op/fpgadataflow/__init__.py | 2 +- src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 2 +- .../test_fpgadataflow_thresholding_binary_search.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index dc9a5a349a..0e17726d48 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2022, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 003dbb2fd9..7df755ae1b 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2022, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py index e2189c4c79..1e3521a610 100755 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2022, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without From 71ef39b38d70365f4812cfd6f0d46a1d0198b269 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Thu, 1 Dec 2022 13:12:33 +0000 Subject: [PATCH 036/235] Translate byte to parameter word addressing in AXI adapter. --- finn-rtllib/thresholding/hdl/thresholding_axi.sv | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv index 97cdfd3e12..c766e60b9e 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -30,6 +30,12 @@ * * @brief All-AXI interface adapter for thresholding module. * @author Thomas B. 
Preußer + * + * @description + * This AXI adapter fits the core thresholding functionality: + * - with AXI stream data interfaces with flow control + * - with implicit round-robin channel rotation as used by FINN, and + * - performs aligned byte address to parameter word address translation. *****************************************************************************/ module $MODULE_NAME_AXI$ #( @@ -49,7 +55,7 @@ module $MODULE_NAME_AXI$ #( // Writing input logic s_axilite_AWVALID, output logic s_axilite_AWREADY, - input logic [$clog2(C)+N-1:0] s_axilite_AWADDR, + input logic [$clog2(C)+N+1:0] s_axilite_AWADDR, // lowest 2 bits (byte selectors) are ignored input logic s_axilite_WVALID, output logic s_axilite_WREADY, @@ -109,7 +115,7 @@ module $MODULE_NAME_AXI$ #( else begin if(!WABusy) begin WABusy <= s_axilite_AWVALID; - Addr <= s_axilite_AWADDR[$clog2(C)+N-1:0]; + Addr <= s_axilite_AWADDR[$clog2(C)+N+1:2]; end if(!WDBusy) begin WDBusy <= s_axilite_WVALID; From d44a66c949177163099e36ce4e57c9ac992ee70b Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Mon, 19 Dec 2022 15:05:08 +0000 Subject: [PATCH 037/235] [thresholding] remove unused attribute Signed-off-by: Fionn O'Donohoe --- src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 7df755ae1b..2ebe6f0a39 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -103,7 +103,6 @@ def get_nodeattr_types(self): # always "flush" the accelerator by first passing a dummy input # vector through the accelerator. This will get rid of any old # weight data from the weight FIFOs. 
- "runtime_writeable_weights": ("i", False, 0, {0, 1}), "gen_top_module": ("s", False, ""), "activation_bias": ("i", False, 0), } @@ -656,8 +655,6 @@ def get_verilog_top_module_intf_names(self): intf_names["s_axis"] = [["s_axis"]] intf_names["m_axis"] = [["m_axis"]] - self.set_nodeattr("runtime_writeable_weights", 1) - return intf_names def find_next_power_of_2(self, n): From f79b9ec3e19d83d6469e6e563422fbba70f7a87a Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Mon, 19 Dec 2022 15:53:20 +0000 Subject: [PATCH 038/235] [thresholding] remove unnecessary HLS bug prevention check Signed-off-by: Fionn O'Donohoe --- ...test_fpgadataflow_thresholding_binary_search.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py index 1e3521a610..ab98189ea5 100755 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py @@ -242,11 +242,6 @@ def test_fpgadataflow_thresholding_binary_search_cppsim(): thresholds = generate_random_threshold_values( input_data_type, num_input_channels, num_steps ) - - # make the vivado_hls threshold bug appear (incorrect rtlsim result when first - # threshold of first channel is zero, while using BIPOLAR output) - if act == DataType["BIPOLAR"]: - thresholds[0][0] = 0 thresholds = sort_thresholds_increasing(thresholds) # Other non-input parameters @@ -350,10 +345,6 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): thresholds = generate_random_threshold_values( input_data_type, num_input_channels, num_steps ) - # make the vivado_hls threshold bug appear (incorrect rtlsim result when first - # threshold of first channel is zero, while using BIPOLAR output) - if act == DataType["BIPOLAR"]: - thresholds[0][0] = 0 thresholds = sort_thresholds_increasing(thresholds) # Other non-input parameters @@ 
-420,11 +411,6 @@ def test_fpgadataflow_thresholding_binary_search( input_data_type, num_input_channels, num_steps ) - # make the vivado_hls threshold bug appear (incorrect rtlsim result when first - # threshold of first channel is zero, while using BIPOLAR output) - if activation == DataType["BIPOLAR"]: - thresholds[0][0] = 0 - # provide non-decreasing/ascending thresholds thresholds = sort_thresholds_increasing(thresholds) From 7b82de2c78e14f9dc2017e7c5e9378865011e9da Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Mon, 19 Dec 2022 16:40:57 +0000 Subject: [PATCH 039/235] [thresholding] align methods with hlscustom class by adding in additional input parameter Signed-off-by: Fionn O'Donohoe --- .../fpgadataflow/thresholding_binary_search.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 2ebe6f0a39..d69c7e47b7 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -129,10 +129,10 @@ def bram_estimation(self): def lut_estimation(self): return 0 - def get_input_datatype(self): + def get_input_datatype(self, ind=0): return DataType[self.get_nodeattr("inputDataType")] - def get_output_datatype(self): + def get_output_datatype(self, ind=0): return DataType[self.get_nodeattr("outputDataType")] def get_weight_datatype(self): @@ -142,11 +142,11 @@ def get_weight_datatype(self): def minimize_accumulator_width(self, model): return None - def get_instream_width(self): + def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() return i_bits * self.get_nodeattr("PE") - def get_outstream_width(self): + def get_outstream_width(self, ind=0): o_bits = self.get_output_datatype().bitwidth() return o_bits * self.get_nodeattr("PE") @@ -163,24 +163,24 @@ def get_weightstream_width(self): w_width = pe * 
wp * n_thres_steps return w_width - def get_folded_input_shape(self): + def get_folded_input_shape(self, ind=0): fold = self.calc_tmem() pe = self.get_nodeattr("PE") vecs = list(self.get_nodeattr("numInputVectors")) folded_input_shape = tuple(vecs + [fold, pe]) return folded_input_shape - def get_folded_output_shape(self): + def get_folded_output_shape(self, ind=0): # same shape as input return self.get_folded_input_shape() - def get_normal_input_shape(self): + def get_normal_input_shape(self, ind=0): num_channels = self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) normal_input_shape = tuple(vecs + [num_channels]) return normal_input_shape - def get_normal_output_shape(self): + def get_normal_output_shape(self, ind=0): # same shape as input return self.get_normal_input_shape() From e2816d3e1c8ce75ad9f0b1aafbef25af8b305a6c Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Mon, 19 Dec 2022 16:50:26 +0000 Subject: [PATCH 040/235] [thresholding] replace hardcoded tcl commands with node attributes Signed-off-by: Fionn O'Donohoe --- .../thresholding_binary_search.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index d69c7e47b7..fe976c7dbe 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -105,6 +105,7 @@ def get_nodeattr_types(self): # weight data from the weight FIFOs. 
"gen_top_module": ("s", False, ""), "activation_bias": ("i", False, 0), + "clkFreq": ("i", False, 200000000), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -604,6 +605,10 @@ def code_generation_ipi(self): cmd = [] rtl_file_list = self.get_rtl_file_list() code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + node_name = self.onnx_node.name + dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] + din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] + clock_freq = self.get_nodeattr("clkFreq") for rtl_file in rtl_file_list: cmd.append( @@ -616,20 +621,14 @@ def code_generation_ipi(self): % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) ) - # ERROR: [BD 41-237] Bus Interface property FREQ_HZ does not match between - # /Thresholding_Binary_Search_0/s_axis(100000000 and - # /StreamingFIFO_0/out_V(200000000.000000) cmd.append( - "set_property -dict [list CONFIG.FREQ_HZ {200000000}] [%s %s]" - % ("get_bd_intf_pins", "Thresholding_Binary_Search_0/s_axis") + "set_property -dict [list CONFIG.FREQ_HZ {%d}] [%s %s/%s]" + % (clock_freq, "get_bd_intf_pins", node_name, din_name) ) - # ERROR: [BD 41-237] Bus Interface property FREQ_HZ does not match between - # /StreamingFIFO_1/in0_V(200000000.000000) and - # /Thresholding_Binary_Search_0/m_axis(100000000) cmd.append( - "set_property -dict [list CONFIG.FREQ_HZ {200000000}] [%s %s]" - % ("get_bd_intf_pins", "Thresholding_Binary_Search_0/m_axis") + "set_property -dict [list CONFIG.FREQ_HZ {%d}] [%s %s/%s]" + % (clock_freq, "get_bd_intf_pins", node_name, dout_name) ) return cmd From bda05ae16e62627d414c80452caa012dee7aa0d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Tue, 20 Dec 2022 09:24:04 +0000 Subject: [PATCH 041/235] Fix BIAS parameter specification. 
--- finn-rtllib/thresholding/hdl/thresholding.sv | 2 +- finn-rtllib/thresholding/hdl/thresholding_axi.sv | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv index 25d6ff3112..b26747d1ff 100644 --- a/finn-rtllib/thresholding/hdl/thresholding.sv +++ b/finn-rtllib/thresholding/hdl/thresholding.sv @@ -48,7 +48,7 @@ module $MODULE_NAME$ #( int unsigned M, // input/threshold precision int unsigned C, // number of channels - int BIAS, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) + int BIAS, // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS) int unsigned C_BITS, int unsigned O_BITS diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv index c766e60b9e..5cd7746b82 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -43,7 +43,7 @@ module $MODULE_NAME_AXI$ #( int unsigned M, // input/threshold precision int unsigned C, // Channels - int BIAS, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) + int BIAS, // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS) int unsigned O_BITS )( From 7388e7613ef38b6caa1fafb1129973cefef8716a Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Tue, 20 Dec 2022 14:08:07 +0000 Subject: [PATCH 042/235] [thresholding] remove unused ram_style attribute Signed-off-by: Fionn O'Donohoe --- .../fpgadataflow/thresholding_binary_search.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index fe976c7dbe..9cbe049be3 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -76,8 +76,6 @@ def get_nodeattr_types(self): "NumChannels": ("i", True, 
0), # number of steps in thresholding function. Used only in decoupled mode "numSteps": ("i", True, 1), - # string defining memory type - "ram_style": ("s", False, "distributed", {"distributed", "block"}), # FINN DataTypes for inputs, outputs "inputDataType": ("s", True, ""), "weightDataType": ("s", True, ""), @@ -470,14 +468,7 @@ def generate_params(self, model, path): ) # Synthesis thresholds: - ram_style = self.get_nodeattr("ram_style") - if ram_style == "ultra": - # UltraRAM must have no memory initializer, or only zeroes - # otherwise BRAM will be inferred instead of URAM - # as a workaround we provide a zero-weight init here - synth_thresholds = np.zeros_like(thresholds, dtype=np.float32) - else: - synth_thresholds = thresholds + synth_thresholds = thresholds self.make_weight_file( synth_thresholds, "decoupled_verilog_dat", weight_filename_rtl_synth ) From be1503a0c78fd4c4d903b1ffbf61964659725bb6 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 3 Jan 2023 15:37:42 +0000 Subject: [PATCH 043/235] First changes to custom_op for RTL-based MVAU --- .../matrixvectoractivation_rtl.py | 1036 +++++++++++++++++ 1 file changed, 1036 insertions(+) create mode 100644 src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py new file mode 100644 index 0000000000..c8a0aa675b --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -0,0 +1,1036 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import math +import numpy as np +import os +import textwrap +import warnings +from qonnx.core.datatype import DataType +from qonnx.util.basic import ( + calculate_matvec_accumulator_range, + interleave_matrix_outer_dim_from_partitions, + roundup_to_integer_multiple, +) + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.data_packing import ( + npy_to_rtlsim_input, + numpy_to_hls_code, + pack_innermost_dim_as_hex_string, + rtlsim_output_to_npy, +) + +from . 
import templates + +# ONNX i/o tensor shape assumptions for MatrixVectorActivation: +# input 0 is the input tensor, shape (.., i_size) = (..., MW) +# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH) +# (optional) input 2 is the thresholds tensor, shape (o_size, n_thres) +# output 0 is the output tensor, shape (.., o_size) = (..., MH) +# the ... here can be any shape (representing groups of vectors) + + +class MatrixVectorActivation_rtl(HLSCustomOp): + """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch + function.""" + + def __init__(self, onnx_node): + super().__init__(onnx_node) + self.decoupled_wrapper = templates.decoupled_wrapper + + def get_nodeattr_types(self): + my_attrs = { + "PE": ("i", True, 0), + "SIMD": ("i", True, 0), + "MW": ("i", True, 0), + "MH": ("i", True, 0), + "resType": ("s", False, "lut", {"auto", "lut", "dsp"}), + "ActVal": ("i", False, 0), + # FINN DataTypes for inputs, weights, outputs + "inputDataType": ("s", True, ""), + "weightDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + # FINN DataType for accumulator -- auto-computed and updated + "accDataType": ("s", False, "INT32"), + # use xnor-popcount for binary weights/inputs, thus treating them + # as bipolar + "binaryXnorMode": ("i", False, 0, {0, 1}), + # no-activation mode (produce accumulators) + "noActivation": ("i", False, 0, {0, 1}), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + # memory mode for the FC weights + # const -- embedded weights, default, long compile/synth times + # decoupled -- streaming weights with weight streamer packaged inside IP + # external -- streaming weights with external streamer + "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}), + # FPGA resource type for memories 
in decoupled mode + # auto -- let Vivado decide + # block -- use BRAM + # distributed -- use LUTRAM + # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1 + # see also https://www.xilinx.com/support/answers/38070.html + "ram_style": ( + "s", + False, + "auto", + {"auto", "block", "distributed", "ultra"}, + ), + # FPGA resource type for threshold memories (if noActivation is False) + # auto -- let Vivado decide + # block -- use BRAM + # distributed -- use LUTRAM + "ram_style_thresholds": ( + "s", + False, + "auto", + {"auto", "block", "distributed"}, + ), + # (mem_mode = decoupled only) whether weights will be writable through + # an AXI-lite interface during runtime + # 1 for enabled, 0 for disabled. + # see finn-rtllib/memstream/doc/README for more about the memory + # address map used for writable weights + # IMPORTANT: After using AXI lite to either read or write the weights, + # always "flush" the accelerator by first passing a dummy input + # vector through the accelerator. This will get rid of any old + # weight data from the weight FIFOs. + "runtime_writeable_weights": ("i", False, 0, {0, 1}), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def calc_wmem(self): + """Calculates and returns WMEM.""" + mw = self.get_nodeattr("MW") + mh = self.get_nodeattr("MH") + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + assert mh % pe == 0, "Requirement MH divisable by PE is violated." + assert mw % simd == 0, "Requirement MW divisable by SIMD is violated." 
+ wmem = mw * mh // (pe * simd) + return wmem + + def calc_tmem(self): + """Calculates and returns TMEM.""" + assert self.get_nodeattr("noActivation")==1, "RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer" + return 0 + + def make_shape_compatible_op(self, model): + oshape = self.get_normal_output_shape() + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + # set output datatype from property + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + # TODO collect automatically from get_nodeattr_types + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("resType") + self.get_nodeattr("MW") + self.get_nodeattr("MH") + self.get_nodeattr("SIMD") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType") + self.get_nodeattr("weightDataType") + self.get_nodeattr("outputDataType") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append( + """The required MatrixVectorActivation attributes do not exist.""" + ) + + # verify the number of inputs depending on noActivation value + # check noActivation value to determine the number of inputs + no_act = 
self.get_nodeattr("noActivation") + + if no_act == 1: + if len(self.onnx_node.input) == 2: + info_messages.append("The number of inputs is correct") + else: + info_messages.append( + """RTL-based MatrixVectorActivation needs in no + activation mode 2 inputs (data input and weights)""" + ) + elif no_act == 0: + info_messages.append("RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer") + else: + info_messages.append( + """noActivation attribute contains {} should + be 1 for RTL-based MatrixVectorActivation""".format( + no_act + ) + ) + + mem_mode = self.get_nodeattr("mem_mode") + + if mem_mode != "decoupled": + info_messages.append("RTL-based MVAU supports only decoupled weights currently") + + return info_messages + + def uram_estimation(self): + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") + omega = (D_in * D_out) / (Q * P) + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle != "ultra") + or (mmode == "const" and self.calc_wmem() <= 128) + or (mmode == "external") + ): + return 0 + width_multiplier = math.ceil(mem_width / 72) + depth_multiplier = math.ceil(omega / 4096) + return width_multiplier * depth_multiplier + + def bram_estimation(self): + """Calculates resource estimation for BRAM based on: + - FINN-R: An End-to-End Deep-Learning Framework for Fast + Exploration of Quantized Neural Networks + - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, + Y. Umuroglu, M. Leeser and K. Vissers + - 12. 
Sep 2018 + """ + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") + omega = (D_in * D_out) / (Q * P) + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) + or (mmode == "const" and self.calc_wmem() <= 128) + or (mmode == "external") + ): + return 0 + # assuming SDP mode RAMB18s (see UG573 Table 1-10) + # assuming decoupled (RTL) memory, which is more efficient than const (HLS) + if mem_width == 1: + return math.ceil(omega / 16384) + elif mem_width == 2: + return math.ceil(omega / 8192) + elif mem_width <= 4: + return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) + elif mem_width <= 9: + return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 9)) + elif mem_width <= 18 or omega > 512: + return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 18)) + else: + return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36)) + + def bram_efficiency_estimation(self): + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") + bram16_est = self.bram_estimation() + if bram16_est == 0: + return 1 + wbits = W * D_in * D_out + bram16_est_capacity = bram16_est * 36 * 512 + return wbits / bram16_est_capacity + + def uram_efficiency_estimation(self): + """Function for URAM efficiency estimation: actual parameter storage + needed divided by the allocated URAM storage (from estimation)""" + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") + uram_est = self.uram_estimation() + if uram_est == 0: + return 1 + wbits = W * D_in * D_out + uram_est_capacity = uram_est * 72 * 4096 + return wbits / uram_est_capacity + +#TODO: FIX + def 
lut_estimation(self): + """Calculates resource estimations for LUTs based on: + - FINN-R: An End-to-End Deep-Learning Framework for Fast + Exploration of Quantized Neural Networks + - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, + Y. Umuroglu, M. Leeser and K. Vissers + - 12. Sep 2018 + """ + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + MW = self.get_nodeattr("MW") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + # determine tdt with input and weight data types + idt = self.get_input_datatype() + A = idt.bitwidth() + # parameters from experiments in paper mentioned above + c0 = 300 + c1 = 1.1 + c2 = 0 + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "decoupled" and mstyle == "distributed") or ( + mmode == "const" and self.calc_wmem() <= 128 + ): + c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) + + # multiplication + res_type = self.get_nodeattr("resType") + if res_type == "dsp": + mult_luts = 0 + else: + mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) + # adder tree + addertree_luts = (W + A) * (2 * Q - 1) + # accumulator + acc_bits = W + A + np.ceil(math.log(MW, 2)) + acc_luts = acc_bits + # thresholds and threshold comparators + thr_luts = 0 + comp_luts = 0 + noact = self.get_nodeattr("noActivation") + if noact == 0: + odt = self.get_output_datatype() + B = odt.bitwidth() + thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64) + comp_luts = (2**B - 1) * acc_bits + + return int( + c0 + + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + + c2 + ) + +#TODO: FIX + def dsp_estimation(self): + # multiplication + P = self.get_nodeattr("PE") + res_type = self.get_nodeattr("resType") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + idt = self.get_input_datatype() + A = idt.bitwidth() + if res_type == "dsp": + mult_dsp = P * Q * np.ceil((W + A) 
/ 48) # TODO: more accurate modelling + else: + mult_dsp = 0 + return int(mult_dsp) + +#TODO: FIX + def get_exp_cycles(self): + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + num_inp_vec = self.get_nodeattr("numInputVectors") + mh = self.get_nodeattr("MH") + mw = self.get_nodeattr("MW") + # since mmv != 1 is not supported yet, we set mmv for now to 1 + mmv = 1 + exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv + return int(exp_cycles) + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + # when performing FIFO insertion on an FC layer with ext weights, the ind + # parameter can be > 0 (referring to the weights) so handle that here + if ind == 0: + return DataType[self.get_nodeattr("inputDataType")] + elif ind == 1: + return DataType[self.get_nodeattr("weightDataType")] + else: + raise Exception("Undefined input ind for this layer type") + + def get_weight_datatype(self): + """Returns FINN DataType of weights.""" + return DataType[self.get_nodeattr("weightDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_instream_width(self, ind=0): + i_bits = self.get_input_datatype().bitwidth() + assert i_bits<=9, "RTL-based MVAU only supports activations with bit-width up to 9-bits" + in_width = i_bits * self.get_nodeattr("SIMD") + return in_width + + def get_outstream_width(self, ind=0): + o_bits = self.get_output_datatype().bitwidth() + out_width = o_bits * self.get_nodeattr("PE") + return out_width + + def get_weightstream_width(self): + """Returns weight stream width. 
Used only in decoupled mode.""" + if ( + self.get_nodeattr("mem_mode") == "decoupled" + or self.get_nodeattr("mem_mode") == "external" + ): + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + wp = self.get_weight_datatype().bitwidth() + w_width = pe * simd * wp + assert wp<=8, "RTL-based MVAU only supports weights with bit-width up to 8-bits" + return w_width + else: + return 0 + + def get_weightstream_width_padded(self): + """Returns weight stream width padded to a multiple of 8. This is required + by the AXI Stream spec. Used in decoupled mode.""" + weight_width = self.get_weightstream_width() + return roundup_to_integer_multiple(weight_width, 8) + + def get_ap_int_max_w(self): + # base class impl (max of inp/out stream widths) + max_of_io = super().get_ap_int_max_w() + # decoupled mode weight stream + weightstream = self.get_weightstream_width() + # single PE weight entry + weight_bits = self.get_weight_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + single_pe_w = simd * weight_bits + return max([weightstream, max_of_io, single_pe_w]) + + def get_folded_input_shape(self, ind=0): + mw = self.get_nodeattr("MW") + mh = self.get_nodeattr("MH") + simd = self.get_nodeattr("SIMD") + pe = self.get_nodeattr("PE") + sf = mw // simd + nf = mh // pe + vecs = list(self.get_nodeattr("numInputVectors")) + + if ind == 0: + # calculate shape of input 0 + folded_input_shape = tuple(vecs + [sf, simd]) + elif ind == 1 and self.get_nodeattr("mem_mode") == "external": + # calculate shape of input 1 (weights) + folded_input_shape = tuple(vecs + [sf * nf, simd * pe]) + else: + raise Exception("Undefined input shape for requested input") + + return folded_input_shape + + def get_folded_output_shape(self, ind=0): + mh = self.get_nodeattr("MH") + pe = self.get_nodeattr("PE") + nf = mh // pe + vecs = list(self.get_nodeattr("numInputVectors")) + folded_output_shape = tuple(vecs + [nf, pe]) + return folded_output_shape + + def get_normal_input_shape(self, ind=0): + 
mw = self.get_nodeattr("MW")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        normal_input_shape = tuple(vecs + [mw])
+        return normal_input_shape
+
+    def get_normal_output_shape(self, ind=0):
+        mh = self.get_nodeattr("MH")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        normal_output_shape = tuple(vecs + [mh])
+        return normal_output_shape
+
+    def get_number_output_values(self):
+        nf = np.prod(self.get_folded_output_shape()[:-1])
+        return nf
+
+    def get_hls_compatible_weight_tensor(self, orig_weight_matrix):
+        """Convert the original numpy weight matrix orig_weight_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure shape is (MW, MH), MH % PE == 0 and MW % SIMD == 0
+        * transpose from (MW, MH) to (MH, MW)
+        * interleave rows between PEs
+        * reshape into (1, PE, WMEM, SIMD), reverse the SIMD dim and return
+        """
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        wmem = self.calc_wmem()
+        assert orig_weight_matrix.shape == (
+            mw,
+            mh,
+        ), """Weights matrix doesn't
+        have expected shape (mw, mh)"""
+        assert mw % simd == 0, "Requirement MW divisable by SIMD is violated."
+        assert mh % pe == 0, "Requirement MH divisable by PE is violated."
+        # start by transposing the original weight matrix, since ONNX and
+        # finn-hlslib use different assumptions
+        # ONNX uses (in_features, out_features) and matmul(x, W)
+        # finn-hlslib uses (out_features, in_features) and matmul(W, x)
+        ret = orig_weight_matrix.T
+        # interleave rows between PEs and reshape
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        # create SIMD as innermost dimension and add a dummy outer dim
+        ret = ret.reshape(1, pe, wmem, simd)
+        # reverse the SIMD dimension
+        ret = np.flip(ret, axis=-1)
+        return ret
+
+    def minimize_accumulator_width(self, model):
+        weights = model.get_initializer(self.onnx_node.input[1])
+        idt = self.get_input_datatype()
+        # calculate minimum and maximum values of accumulator
+        (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt)
+        if acc_min < 0:
+            if abs(acc_min) > acc_max:
+                adt = DataType.get_smallest_possible(acc_min)
+            else:
+                adt = DataType.get_smallest_possible(-acc_max - 1)
+        else:
+            adt = DataType.get_smallest_possible(acc_max)
+        # ensure a datatype divisible by 8-bits in case this is the last node
+        bw = roundup_to_integer_multiple(adt.bitwidth(), 8)
+        new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw))
+        adt = DataType[new_adt_name]
+        self.set_nodeattr("accDataType", adt.name)
+        # for no-activation nodes, output dt = acc dt
+        self.set_nodeattr("outputDataType", adt.name)
+        return DataType[self.get_nodeattr("accDataType")]
+
+    def make_weight_file(self, weights, weight_file_mode, weight_file_name):
+        """Produce a file containing given weights in appropriate format for this
+        layer. This file can be used for simulation, synthesis or run-time
+        reconfig of weights. Raises Exception for any other weight_file_mode.
+
+        Arguments:
+        * weights : numpy array with weights to be put into the file
+        * weight_file_mode : one of {decoupled_npy, decoupled_verilog_dat,
+          decoupled_runtime}
+        * weight_file_name : filename for the weight file to be generated
+        """
+        # convert weights into hlslib-compatible format
+        weight_tensor = self.get_hls_compatible_weight_tensor(weights)
+        export_wdt = self.get_weight_datatype()
+        if "decoupled" in weight_file_mode:
+            # create a weight stream for various flavors of decoupled mode:
+            # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
+            weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
+            # reverse SIMD flip for saving weights in .npy
+            weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1)
+            # PE flip for saving weights in .dat
+            weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2)
+            # reshape weight tensor (simd_flipped and pe_flipped) to desired shape
+            pe = self.get_nodeattr("PE")
+            simd = self.get_nodeattr("SIMD")
+            # simd_flipped
+            weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape(
+                1, -1, pe * simd
+            )
+            weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy()
+            # flipped
+            weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(
+                1, -1, pe * simd
+            )
+            weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy()
+            if weight_file_mode == "decoupled_npy":
+                np.save(weight_file_name, weight_tensor_simd_flipped)
+            elif weight_file_mode == "decoupled_verilog_dat":
+                # convert weight values into hexstring
+                weight_width = self.get_weightstream_width()
+                # pad to nearest 4 bits to get hex strings
+                weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+                )
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                weight_stream = weight_stream.copy()
+                with open(weight_file_name, "w") as f:
+                    for val in weight_stream:
+                        f.write(val + "\n")
+            elif weight_file_mode == "decoupled_runtime":
+                # memstream axi-lite interface will map each mem line to
+                # one or multiple 32-bit words
+                weight_width = self.get_weightstream_width()
+                words_per_memwidth = 2 ** math.ceil(math.log2(weight_width / 32))
+                if words_per_memwidth < 1:
+                    words_per_memwidth = 1
+                weight_width_padded = words_per_memwidth * 32
+                # first, pack and ensure padding to 32 bits
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+                )
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                weight_stream = weight_stream.copy()
+                with open(weight_file_name, "w") as f:
+                    for val in weight_stream:
+                        # split into groups of 8 hex digits (= 32 bits)
+                        words_32b = textwrap.wrap(val, 8)
+                        words_32b.reverse()
+                        for word_32b in words_32b:
+                            f.write(word_32b + "\n")
+            else:
+                raise Exception("Unknown/unsupported weight_file_mode")
+        else:
+            raise Exception("Unknown/unsupported weight_file_mode")
+
+    def generate_params(self, model, path):
+        mem_mode = self.get_nodeattr("mem_mode")
+        code_gen_dir = path
+        # weights, if not external
+        weights = model.get_initializer(self.onnx_node.input[1])
+        if mem_mode == "decoupled":
+            weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
+            # save decoupled weights as .npy (execute_node reads weights.npy for rtlsim)
+            self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
+            # also save weights as Verilog .dat file
+            # note that we provide two different .dat files, one for synth
+            # and one for simulation.
this is because URAM-based weights always + # need zero weights for synthesis, otherwise they get inferred + # as BRAM + weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format( + code_gen_dir + ) + weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir) + # sim weights are always the true weights + self.make_weight_file( + weights, "decoupled_verilog_dat", weight_filename_rtl_sim + ) + ram_style = self.get_nodeattr("ram_style") + if ram_style == "ultra": + # UltraRAM must have no memory initializer, or only zeroes + # otherwise BRAM will be inferred instead of URAM + # as a workaround we provide a zero-weight init here + synth_weights = np.zeros_like(weights, dtype=np.float32) + else: + synth_weights = weights + self.make_weight_file( + synth_weights, "decoupled_verilog_dat", weight_filename_rtl_synth + ) + else: + raise Exception( + """Please set mem_mode to "decoupled", + currently no other parameter value is supported!""" + ) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") + node = self.onnx_node + + # TODO ensure codegen dir exists + if mode == "cppsim": + raise Exception( + "cppsim not possible for RTL MVAU, please set exec_mode to rtlsim" + ) + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + # the third input are the thresholds + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + export_idt = self.get_input_datatype() + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for MatrixVectorActivation") + in_ind += 1 + + if mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + if mem_mode == "external" or mem_mode == "decoupled": + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + wei = npy_to_rtlsim_input( + "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits + ) + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and 
reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def code_generation_ipgen(self, model, fpgapart, clk): + """Normally: Generates C++ code and tcl script for IP generation. + Here: Generates (System-)Verilog code for IP generation.""" + self.generate_hdl() + + def ipgen_singlenode_code(self): + """Normally: Builds the bash script for IP generation.""" + pass + + def code_generation_cppsim(self, model): + """Normally: Generates C++ code for simulation (cppsim).""" + pass + + def compile_singlenode_code(self): + pass + + def global_includes(self): + pass + + def defines(self, var): + pass + + def read_npy_data(self): + pass + + def strm_decl(self): + pass + + def docompute(self): + pass + + def dataoutstrm(self): + pass + + def save_as_npy(self): + pass + + def blackboxfunction(self): + pass + + def pragmas(self): + pass + + def code_generation_ipi(self): + cmd = [] + # add streamer if needed + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled": + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if self.get_nodeattr("ram_style") == "ultra": + assert ( + runtime_writable == 1 + ), "Layer with URAM weights must have runtime_writeable_weights=1" + node_name = self.onnx_node.name + sname = self.hls_sname() + # create a hierarchy for this layer, with the same port names + clk_name = self.get_verilog_top_module_intf_names()["clk"][0] + rst_name = self.get_verilog_top_module_intf_names()["rst"][0] + dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] + din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] + cmd.append("create_bd_cell -type hier %s" % node_name) + 
cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) + cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) + cmd.append( + "create_bd_intf_pin -mode Master " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" + % (node_name, dout_name) + ) + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) + ) + # instantiate the hls ip + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (self.get_nodeattr("ip_vlnv"), node_name, node_name) + ) + # instantiate a streamer and connect it to the HLS IP + strm_vlnv = "xilinx.com:user:memstream:1.0" + strm_inst = node_name + "_wstrm" + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (strm_vlnv, node_name, strm_inst) + ) + cmd.append( + "set_property -dict [list " + "CONFIG.NSTREAMS {1} " + "CONFIG.MEM_DEPTH {%d} " + "CONFIG.MEM_WIDTH {%d} " + "CONFIG.MEM_INIT {%s} " + "CONFIG.RAM_STYLE {%s} " + "CONFIG.STRM0_DEPTH {%d} " + "CONFIG.STRM0_WIDTH {%d} " + "CONFIG.STRM0_OFFSET {0} " + "] [get_bd_cells /%s/%s]" + % ( + self.calc_wmem(), + self.get_weightstream_width_padded(), + self.get_nodeattr("code_gen_dir_ipgen") + "/", + self.get_nodeattr("ram_style"), + self.calc_wmem(), + self.get_weightstream_width_padded(), + node_name, + strm_inst, + ) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] " + "[get_bd_intf_pins %s/%s/weights_%s]" + % (node_name, strm_inst, node_name, node_name, sname) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]" + % (node_name, rst_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]" + % (node_name, clk_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, rst_name, node_name, node_name, rst_name) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, 
clk_name, node_name, node_name, clk_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, din_name, node_name, node_name, din_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, dout_name, node_name, node_name, dout_name) + ) + if runtime_writable: + # expose axi lite interface for writeable weights + axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0] + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" + % (node_name, axilite_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, axilite_name, node_name, strm_inst, axilite_name) + ) + # TODO calculate and pass in segment size here + cmd.append("assign_bd_address") + cmd.append("save_bd_design") + elif mem_mode == "const" or mem_mode == "external": + # base class impl sufficient for const/external modes + return super().code_generation_ipi() + else: + raise Exception("Unrecognized mem_mode for MatrixVectorActivation") + return cmd + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append( + ("weights_" + sname, self.get_weightstream_width_padded()) + ) + if mem_mode == "decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names + + def get_op_and_param_counts(self): + in_features = self.get_nodeattr("MW") + out_features = self.get_nodeattr("MH") + weight_bits = self.get_weight_datatype().bitwidth() + inp_bits = self.get_input_datatype().bitwidth() + num_inp_vec = self.get_nodeattr("numInputVectors") + 
num_repetitions = int(np.prod(num_inp_vec)) + mac_count = in_features * out_features * num_repetitions + # cannonicalize op type: highest bitwidth operand first s.t. + # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types + bw1 = min(inp_bits, weight_bits) + bw2 = max(inp_bits, weight_bits) + mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) + weight_param_type = "param_weight_%db" % (weight_bits) + weight_count = in_features * out_features + ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} + if self.get_nodeattr("noActivation") == 0: + tdt = DataType[self.get_nodeattr("accDataType")] + thres_bits = tdt.bitwidth() + thres_param_type = "param_threshold_%db" % (thres_bits) + thres_count = out_features + ret_dict[thres_param_type] = thres_count + return ret_dict + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [ + 0 for i in range(num_w_reps * n_weight_inps) + ] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + + def generate_hdl(self): +#TODO: add distinction between (PE=MH or PE=1) and where MH dimension is folded + template_path, code_gen_dict = self.prepare_codegen_default() + + # add general parameters to dictionary + code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()] + # save top module name so we can refer to it after this node has been renamed + # (e.g. 
by GiveUniqueNodeNames(prefix) during MakeZynqProject) + self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) +#TODO: currently only ram_style=auto is supported + ram_style = self.get_nodeattr("ram_style") + if ram_style == "auto": + continue + else: + raise Exception("Unrecognized ram_style for MatrixVectorActivation") + + # apply code generation to templates + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + with open(template_path, "r") as f: + template = f.read() + for key in code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(code_gen_dict[key]) + template = template.replace(key, code_gen_line) + template_wrapper = template_wrapper.replace(key, code_gen_line) + with open( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv" + ), + "w", + ) as f: + f.write(template) + with open( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + "w", + ) as f: + f.write(template_wrapper) + + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def prepare_codegen_default(self): + # TODO: Differentiate between PE folding and fully unrolled along MH dimension + template_path = ( + os.environ["FINN_ROOT"] + "/finn-rtllib/mvau/dsp58_mvau_template.vhdl" + ) + code_gen_dict = {} + + code_gen_dict["$PE$"] = self.get_nodeattr("PE") + code_gen_dict["$SIMD$"] = self.get_nodeattr("SIMD") + code_gen_dict["$MW$"] = self.get_nodeattr("MW") + code_gen_dict["$MH$"] = self.get_nodeattr("MH") + code_gen_dict["$ACTIVATION_WIDTH$"] = self.get_input_datatype(0).bitwidth() + code_gen_dict["$WEIGHT_WIDTH$"] = self.get_input_datatype(1).bitwidth() + code_gen_dict["$ACCU_WIDTH_BA$"] = self.get_output_datatype().bitwidth() + + return template_path, code_gen_dict + From e965396e4ddf4848fc9a17b04fa4908a0924568e Mon 
Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 5 Jan 2023 10:40:52 +0000 Subject: [PATCH 044/235] [thresholding] skip test for unsupported cppsim configuration and merge tests Signed-off-by: Fionn O'Donohoe --- ...fpgadataflow_thresholding_binary_search.py | 65 +++---------------- 1 file changed, 9 insertions(+), 56 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py index ab98189ea5..947109794e 100755 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py @@ -96,6 +96,7 @@ def make_single_thresholding_binary_search_modelwrapper( mem_mode, num_input_vecs, ): + NumChannels = thresholds.shape[0] inp = helper.make_tensor_value_info( @@ -223,61 +224,6 @@ def test_fpgadataflow_thresholding_binary_search_unit(): return -# Test brief: Prove that cppsim is not supported for this class -@pytest.mark.fpgadataflow -@pytest.mark.vivado -def test_fpgadataflow_thresholding_binary_search_cppsim(): - input_data_type = DataType["UINT16"] - act = DataType["BIPOLAR"] - fold = -1 - num_input_channels = 16 - # 'const' is unsupported see test: - # test_fpgadataflow_thresholding_binary_search_const_mem_mode() - mem_mode = "decoupled" - - pe = generate_pe_value(fold, num_input_channels) - num_steps = act.get_num_possible_values() - 1 - - # Generate random, non-decreasing thresholds - thresholds = generate_random_threshold_values( - input_data_type, num_input_channels, num_steps - ) - thresholds = sort_thresholds_increasing(thresholds) - - # Other non-input parameters - num_input_vecs = [1, 2, 2] - output_data_type = act - if output_data_type == DataType["BIPOLAR"]: - activation_bias = 0 - else: - activation_bias = output_data_type.min() - - # Generate model from input parameters to the test - model = make_single_thresholding_binary_search_modelwrapper( - thresholds, - pe, - 
input_data_type, - output_data_type, - activation_bias, - mem_mode, - num_input_vecs, - ) - - # Cppsim is not supported for this class, catch the specific exception thrown by - # cppsim. Exception raised in cppsim: Custom op_type Thresholding_Binary_Search is - # currently not supported. - try: - model = model.transform(PrepareCppSim()) - model = model.transform(CompileCppSim()) - model = model.transform(SetExecMode("cppsim")) - except Exception as e: - if ( - str(e) - != "Custom op_type Thresholding_Binary_Search is currently not supported." - ): - raise - - # Test brief: Prove that memory mode 'const' is not supported for this layer type @pytest.mark.fpgadataflow @pytest.mark.vivado @@ -384,16 +330,23 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): # no need to test 'const' mode, it's already done in: # test_fpgadataflow_thresholding_binary_search_const_mem_mode() @pytest.mark.parametrize("mem_mode", ["decoupled"]) +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow def test_fpgadataflow_thresholding_binary_search( - activation, input_data_type, fold, num_input_channels, mem_mode + activation, input_data_type, fold, num_input_channels, mem_mode, exec_mode ): # Handle inputs to the test pe = generate_pe_value(fold, num_input_channels) num_steps = activation.get_num_possible_values() - 1 + # Cppsim is not supported for this node (as it is an RTL node) + if exec_mode == "cppsim": + pytest.skip("cppsim not supported for RTL Thresholding Binary Search node") + elif exec_mode != "rtlsim": + raise Exception("Unknown exec_mode: {}".format(exec_mode)) + # Other non-input parameters num_input_vecs = [1, 2, 2] output_data_type = activation From 2b8a674573e3415e54665ff05a2db75d5c20f30f Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 5 Jan 2023 11:07:40 +0000 Subject: [PATCH 045/235] [thresholding] moving find_next_power_of_2() to the util suite Signed-off-by: Fionn 
O'Donohoe --- src/finn/util/basic.py | 17 ++++ ...fpgadataflow_thresholding_binary_search.py | 82 ------------------- tests/util/test_basic.py | 62 ++++++++++++++ 3 files changed, 79 insertions(+), 82 deletions(-) create mode 100755 tests/util/test_basic.py diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 4aba87216c..9a66cf90eb 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -216,3 +216,20 @@ def is_exe(fpath): return exe_file return None + +def find_next_power_of_2(n): + # Negative values will loop infinitely below - return 0 + if n <= 0: + return 0 + # If '1' is requested, output will be '0' in the loop below, avoid this now. + elif n == 1: + return 2 # i.e. 2**1 + + # decrement 'n' (to handle cases when `n` itself is a power of 2) + n = n - 1 + + # loop until only one bit is left + while n & n - 1: + # unset rightmost bit + n = n & n - 1 + return n << 1 diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py index 947109794e..29fc2828b6 100755 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py @@ -142,88 +142,6 @@ def make_single_thresholding_binary_search_modelwrapper( return model -# Test brief: a particular method for this class was causing a bug: -# find_next_power_of_2() -# Weights in the thresholding core are programmed on a per-channel basis and are -# byte-addressable. When a channel is programmed, the next channel can start -# programming at the next power-of-2 byte boundary. This test is to show that the -# function that calculates that boundary is working correctly. -# -# A Thresholding_Binary_Search layer was created and a SW generated dataset with a -# threshold channel depth of 1 weight (1 layer of N channels in the thresholding core). 
-# However, find_next_power_of_2() was returning a next-power-of-2 address boundary at -# address '0', instead of '2'. This unit test is to prove that this bug no longer -# occurs. It was originally seen when the input datatype was 'DataType["BIPOLAR"]'. -@pytest.mark.fpgadataflow -@pytest.mark.vivado -def test_fpgadataflow_thresholding_binary_search_unit(): - activation = DataType["BIPOLAR"] - input_data_type = DataType["INT16"] - fold = -1 - num_input_channels = 16 - mem_mode = "decoupled" - - # Handle inputs to the test - pe = generate_pe_value(fold, num_input_channels) - num_steps = activation.get_num_possible_values() - 1 - - # Other non-input parameters - num_input_vecs = [1, 2, 2] - output_data_type = activation - if output_data_type == DataType["BIPOLAR"]: - activation_bias = 0 - else: - activation_bias = output_data_type.min() - - # Generate random thresholds and sort in ascending order - thresholds = generate_random_threshold_values( - input_data_type, num_input_channels, num_steps - ) - - # Generate model from input parameters to the test - model = make_single_thresholding_binary_search_modelwrapper( - thresholds, - pe, - input_data_type, - output_data_type, - activation_bias, - mem_mode, - num_input_vecs, - ) - - # Retrieve the class to get the method-under-test - tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] - tbs_inst = getCustomOp(tbs_node) - - test_vector = [ - {"input": -2, "expected_result": 0}, - {"input": -1, "expected_result": 0}, - {"input": 0, "expected_result": 0}, - {"input": 1, "expected_result": 2}, - {"input": 2, "expected_result": 2}, - {"input": 3, "expected_result": 4}, - {"input": 4, "expected_result": 4}, - {"input": 7, "expected_result": 8}, - {"input": 8, "expected_result": 8}, - {"input": 11, "expected_result": 16}, - {"input": 15, "expected_result": 16}, - {"input": 16, "expected_result": 16}, - {"input": 18, "expected_result": 32}, - {"input": 27, "expected_result": 32}, - {"input": 31, 
"expected_result": 32}, - {"input": 32, "expected_result": 32}, - {"input": 42, "expected_result": 64}, - {"input": 65, "expected_result": 128}, - ] - - for test_dict in test_vector: - output = tbs_inst.find_next_power_of_2(test_dict["input"]) - assert output >= test_dict["input"] - assert output == test_dict["expected_result"] - - return - - # Test brief: Prove that memory mode 'const' is not supported for this layer type @pytest.mark.fpgadataflow @pytest.mark.vivado diff --git a/tests/util/test_basic.py b/tests/util/test_basic.py new file mode 100755 index 0000000000..d2586f4f19 --- /dev/null +++ b/tests/util/test_basic.py @@ -0,0 +1,62 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import finn.util.basic as basic + + +@pytest.mark.util +def test_next_power_of_2(): + test_vector = [ + {"input": -2, "expected_result": 0}, + {"input": -1, "expected_result": 0}, + {"input": 0, "expected_result": 0}, + {"input": 1, "expected_result": 2}, + {"input": 2, "expected_result": 2}, + {"input": 3, "expected_result": 4}, + {"input": 4, "expected_result": 4}, + {"input": 7, "expected_result": 8}, + {"input": 8, "expected_result": 8}, + {"input": 11, "expected_result": 16}, + {"input": 15, "expected_result": 16}, + {"input": 16, "expected_result": 16}, + {"input": 18, "expected_result": 32}, + {"input": 27, "expected_result": 32}, + {"input": 31, "expected_result": 32}, + {"input": 32, "expected_result": 32}, + {"input": 42, "expected_result": 64}, + {"input": 65, "expected_result": 128}, + ] + + for test_dict in test_vector: + output = basic.find_next_power_of_2(test_dict["input"]) + assert output >= test_dict["input"] + assert output == test_dict["expected_result"] + + return From 45bb19f2821bde10cf7303a193869160fd46c72e Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 5 Jan 2023 11:22:08 +0000 Subject: [PATCH 046/235] [thresholding] remove find_next_power_of_2() from thresholding binary search CustomOp class Signed-off-by: Fionn O'Donohoe --- .../thresholding_binary_search.py | 21 ++----------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git 
a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 9cbe049be3..c681bb2631 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -38,7 +38,7 @@ ) from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir, find_next_power_of_2 from finn.util.data_packing import ( npy_to_rtlsim_input, pack_innermost_dim_as_hex_string, @@ -647,23 +647,6 @@ def get_verilog_top_module_intf_names(self): return intf_names - def find_next_power_of_2(self, n): - # Negative values will loop infinitely below - return 0 - if n <= 0: - return 0 - # If '1' is requested, output will be '0' in the loop below, avoid this now. - elif n == 1: - return 2 # i.e. 2**1 - - # decrement 'n' (to handle cases when `n` itself is a power of 2) - n = n - 1 - - # loop until only one bit is left - while n & n - 1: - # unset rightmost bit - n = n & n - 1 - return n << 1 - def twos_comp(self, val, bitwidth): return (val + (1 << bitwidth)) % (1 << bitwidth) @@ -678,7 +661,7 @@ def get_dynamic_config(self, model, address_stride=1): thresholds = model.get_initializer(self.onnx_node.input[1]) num_channels, num_weights_per_channel = thresholds.shape - weight_addr_boundary = self.find_next_power_of_2(num_weights_per_channel) + weight_addr_boundary = find_next_power_of_2(num_weights_per_channel) # Make sure that the next power of 2 (output) is greater than the input assert weight_addr_boundary >= num_weights_per_channel From ca0042225c006d4545e26b0e0f1221ecd4ab68c3 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 5 Jan 2023 12:58:01 +0000 Subject: [PATCH 047/235] [thresholding] replace math functions with existing functions Signed-off-by: Fionn O'Donohoe --- 
.../custom_op/fpgadataflow/thresholding_binary_search.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index c681bb2631..9113e4f9d9 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -647,12 +647,6 @@ def get_verilog_top_module_intf_names(self): return intf_names - def twos_comp(self, val, bitwidth): - return (val + (1 << bitwidth)) % (1 << bitwidth) - - def prep_axilite_val(self, val): - return self.twos_comp(int(val), self.get_weight_datatype().bitwidth()) - def get_dynamic_config(self, model, address_stride=1): """Returns a configuration dictionary containing axilite write commands in order to program the thresholds into the RTL core during runtime. @@ -677,7 +671,7 @@ def get_dynamic_config(self, model, address_stride=1): ) config[key_name] = ( channel_start_addr + addr, - self.prep_axilite_val(weight), + int(str(pack_innermost_dim_as_hex_string([weight], self.get_weight_datatype(), self.get_weight_datatype().bitwidth())), 0), ) weight_cntr += 1 From 7f3455fc0d1dafedaf8cdfca8144dea41747a624 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 5 Jan 2023 13:16:11 +0000 Subject: [PATCH 048/235] [thresholding] remove convept of mem_mode for RTL thresholding binary search node Signed-off-by: Fionn O'Donohoe --- .../thresholding_binary_search.py | 38 ------------ ...fpgadataflow_thresholding_binary_search.py | 58 +------------------ 2 files changed, 1 insertion(+), 95 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 9113e4f9d9..954850562e 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -88,19 +88,6 @@ def 
get_nodeattr_types(self): # [4] is four vectors (like a FC layer with batch=4) # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) "numInputVectors": ("ints", False, [1]), - # memory mode for the thresholds - # const -- embedded thresholds, default - # decoupled -- streaming thresholds with streamer packaged inside IP - "mem_mode": ("s", False, "const", {"const", "decoupled"}), - # (mem_mode = decoupled only) whether weights (thresholds) will be - # writable through an AXI-lite interface during runtime - # 1 for enabled, 0 for disabled. - # see finn-rtllib/memstream/doc/README for more about the memory - # address map used for writable weights - # IMPORTANT: After using AXI lite to either read or write the weights, - # always "flush" the accelerator by first passing a dummy input - # vector through the accelerator. This will get rid of any old - # weight data from the weight FIFOs. "gen_top_module": ("s", False, ""), "activation_bias": ("i", False, 0), "clkFreq": ("i", False, 200000000), @@ -150,12 +137,6 @@ def get_outstream_width(self, ind=0): return o_bits * self.get_nodeattr("PE") def get_weightstream_width(self): - # Only 'decoupled' mode is supported - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode != "decoupled": - raise Exception( - "Unrecognized memory mode for this node: {}".format(mem_mode) - ) pe = self.get_nodeattr("PE") wp = self.get_weight_datatype().bitwidth() n_thres_steps = self.get_nodeattr("numSteps") @@ -442,13 +423,6 @@ def code_generation_ipgen(self, model, fpgapart, clk): self.generate_params(model, code_gen_dir) def generate_params(self, model, path): - # Only 'decoupled' mode is supported - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode != "decoupled": - raise Exception( - "Unrecognized memory mode for this node: {}".format(mem_mode) - ) - code_gen_dir = path weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir) thresholds = model.get_initializer(self.onnx_node.input[1]) @@ -508,12 +482,6 @@ def 
execute_node(self, context, graph): self.get_nodeattr("exec_mode"), "rtlsim" ) ) - if self.get_nodeattr("mem_mode") != "decoupled": - raise Exception( - "Invalid mem_mode value: {}; mem_mode must be set to '{}'".format( - self.get_nodeattr("mem_mode"), "decoupled" - ) - ) node = self.onnx_node code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") @@ -635,12 +603,6 @@ def get_verilog_top_module_intf_names(self): Each block must have at most one aximm and one axilite.""" intf_names = super().get_verilog_top_module_intf_names() - # Only 'decoupled' mode is supported - check before adding axilite interface - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode != "decoupled": - raise Exception( - "Unrecognized memory mode for this node: {}".format(mem_mode) - ) intf_names["axilite"] = ["s_axilite"] intf_names["s_axis"] = [["s_axis"]] intf_names["m_axis"] = [["m_axis"]] diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py index 29fc2828b6..7ef5da8f23 100755 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py @@ -93,7 +93,6 @@ def make_single_thresholding_binary_search_modelwrapper( input_data_type, output_data_type, activation_bias, - mem_mode, num_input_vecs, ): @@ -121,7 +120,6 @@ def make_single_thresholding_binary_search_modelwrapper( weightDataType=input_data_type.name, outputDataType=output_data_type.name, activation_bias=activation_bias, - mem_mode=mem_mode, numInputVectors=num_input_vecs, ) graph = helper.make_graph( @@ -142,54 +140,6 @@ def make_single_thresholding_binary_search_modelwrapper( return model -# Test brief: Prove that memory mode 'const' is not supported for this layer type -@pytest.mark.fpgadataflow -@pytest.mark.vivado -def test_fpgadataflow_thresholding_binary_search_const_mem_mode(): - input_data_type = DataType["INT16"] - activation = DataType["INT4"] - 
fold = -1 - num_input_channels = 16 - mem_mode = "const" - - pe = generate_pe_value(fold, num_input_channels) - num_input_vecs = [1, 2, 2] - output_data_type = activation - activation_bias = output_data_type.min() - - # Generate random thresholds and sort in ascending order - num_steps = activation.get_num_possible_values() - 1 - thresholds = generate_random_threshold_values( - input_data_type, num_input_channels, num_steps - ) - - # Generate model from input parameters to the test - model = make_single_thresholding_binary_search_modelwrapper( - thresholds, - pe, - input_data_type, - output_data_type, - activation_bias, - mem_mode, - num_input_vecs, - ) - - # Prove that 'const' memory mode is not supported for this class - # 'const' memory mode is not supported for this class, catch the specific exception - # thrown by FINN. Exception: ('Unrecognized memory mode for this node:', 'const') - try: - model = model.transform(InsertFIFO(True)) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) - model = model.transform(HLSSynthIP()) - model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) - except Exception as e: - if str(e) != "Unrecognized memory mode for this node: {}".format(mem_mode): - raise - # Caught the expected exception, leave the test early - return - - # Test brief: Test that PrepareRTLSim() runs successfully. 
This function is not # tested in test_fpgadataflow_thresholding_binary_search() @pytest.mark.fpgadataflow @@ -199,7 +149,6 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): act = DataType["INT4"] fold = -1 num_input_channels = 16 - mem_mode = "decoupled" # Handle inputs to the test pe = generate_pe_value(fold, num_input_channels) @@ -226,7 +175,6 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): input_data_type, output_data_type, activation_bias, - mem_mode, num_input_vecs, ) @@ -245,15 +193,12 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) @pytest.mark.parametrize("fold", [-1]) # 1, 2, etc. will fail @pytest.mark.parametrize("num_input_channels", [16]) -# no need to test 'const' mode, it's already done in: -# test_fpgadataflow_thresholding_binary_search_const_mem_mode() -@pytest.mark.parametrize("mem_mode", ["decoupled"]) @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow def test_fpgadataflow_thresholding_binary_search( - activation, input_data_type, fold, num_input_channels, mem_mode, exec_mode + activation, input_data_type, fold, num_input_channels, exec_mode ): # Handle inputs to the test pe = generate_pe_value(fold, num_input_channels) @@ -304,7 +249,6 @@ def test_fpgadataflow_thresholding_binary_search( input_data_type, output_data_type, activation_bias, - mem_mode, num_input_vecs, ) From 4bc69f1a374821b16b80826946223a0a36cae787 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 5 Jan 2023 16:20:02 +0000 Subject: [PATCH 049/235] [thresholding] add methods needed for convertingToHls transformation Signed-off-by: Fionn O'Donohoe --- .../thresholding_binary_search.py | 40 +++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py 
b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 954850562e..c342d235d9 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -101,10 +101,23 @@ def calc_tmem(self): return num_channels // pe def make_shape_compatible_op(self, model): - return [] + oshape = self.get_normal_output_shape() + return super().make_const_shape_op(oshape) def infer_node_datatype(self, model): - return + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype().name), + str(idt.name), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + # set output datatype from property + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) def verify_node(self): return [] @@ -126,7 +139,28 @@ def get_weight_datatype(self): return DataType[self.get_nodeattr("weightDataType")] def minimize_accumulator_width(self, model): - return None + "Minimize threshold width ('accumulator width' here due to convention)" + thresholds = model.get_initializer(self.onnx_node.input[1]) + threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + min_threshold = thresholds.min() + max_threshold = thresholds.max() + min_input = self.get_input_datatype().min() + max_input = self.get_input_datatype().max() + # get range required by threshold values + tdt_min = min(min_input, min_threshold) + tdt_max = max(max_input, max_threshold) + if tdt_min < 0: + if abs(tdt_min) > tdt_max: + tdt = DataType.get_smallest_possible(tdt_min) + else: + tdt = DataType.get_smallest_possible(-tdt_max - 1) + else: + tdt = DataType.get_smallest_possible(tdt_max) + assert np.vectorize(tdt.allowed)( + threshold_tensor + ).all(), "Thresholds can't be expressed with type %s" % str(tdt) + 
self.set_nodeattr("weightDataType", tdt.name) + return DataType[self.get_nodeattr("weightDataType")] def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() From 3b6a1980b8ac28f5a809125d1e06eeb5ab2ba3b5 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 5 Jan 2023 16:32:23 +0000 Subject: [PATCH 050/235] [thresholding] add convertingToHls transformation for thresholding binary search RTL node Signed-off-by: Fionn O'Donohoe --- .../fpgadataflow/convert_to_hls_layers.py | 93 +++++++++++++++---- 1 file changed, 73 insertions(+), 20 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index 525af7ea92..17f839c5c5 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -1062,9 +1062,10 @@ def apply(self, model): class InferThresholdingLayer(Transformation): """Convert any MultiThreshold into a standalone thresholding HLS layer.""" - def __init__(self, mem_mode="const"): + def __init__(self, mem_mode="const", use_rtl_variant=False): super().__init__() self.mem_mode = mem_mode + self.use_rtl_variant = use_rtl_variant def apply(self, model): graph = model.graph @@ -1118,26 +1119,78 @@ def apply(self, model): ) actval = int(actval) assert (not odt.signed()) or (actval < 0), ( - node.name + ": Signed output requres actval < 0" - ) - # create and insert new Thresholding_Batch node - new_node = helper.make_node( - "Thresholding_Batch", - [thl_input, thl_threshold], - [thl_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=ifc, - PE=pe, - numSteps=thl_thres_shape[1], - inputDataType=idt.name, - weightDataType=idt.name, # will be set by MinimizeAccumulatorWidth - outputDataType=odt.name, - numInputVectors=list(thl_in_shape[:-1]), - ActVal=actval, - mem_mode=self.mem_mode, - name="Thresholding_Batch_" + node.name, + 
node.name + ": Signed output requires actval < 0" ) + + # Ensure that RTL variant is not inserted for unsupported configuration + is_rtl_variant_compatible = True + + # Perform checks for RTL variant if chosen + if self.use_rtl_variant: + # Check memory mode (the RTL variant only supports "decoupled") + if self.mem_mode != "decoupled": + warnings.warn( + """%s : RTL Thresholding only supports 'decoupled' memory mode. + Falling back to HLS implementation.""" + % node.name + ) + is_rtl_variant_compatible = False + + # Check PE/SIMD value + if pe != 1: + warnings.warn( + """%s : RTL Thresholding does not support paralellisation. + Only a PE value of 1 is supported. + Falling back to HLS implementation.""" + % node.name + ) + is_rtl_variant_compatible = False + + if self.use_rtl_variant and is_rtl_variant_compatible: + new_node = helper.make_node( + "Thresholding_Binary_Search", + [thl_input, thl_threshold], + [thl_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=ifc, + PE=pe, + numSteps=thl_thres_shape[1], + inputDataType=idt.name, + weightDataType=idt.name, # will be set by MinimizeAccumulatorWidth + outputDataType=odt.name, + numInputVectors=list(thl_in_shape[:-1]), + activation_bias=actval, + mem_mode=self.mem_mode, + name="Thresholding_Binary_Search_" + node.name, + ) + else: + if self.use_rtl_variant: + warnings.warn( + """%s : RTL Thresholding requested for unsupported + configuration.
Falling back to HLS implementation.""" + % node.name + ) + + # create and insert new Thresholding_Batch node + new_node = helper.make_node( + "Thresholding_Batch", + [thl_input, thl_threshold], + [thl_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=ifc, + PE=pe, + numSteps=thl_thres_shape[1], + inputDataType=idt.name, + weightDataType=idt.name, # will be set by MinimizeAccumulatorWidth + outputDataType=odt.name, + numInputVectors=list(thl_in_shape[:-1]), + ActVal=actval, + mem_mode=self.mem_mode, + name="Thresholding_Batch_" + node.name, + ) + graph.node.insert(insert_point, new_node) # remove old node graph.node.remove(node) From b3800cd7e258cecb0466cb9238eeb37ff738d660 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 5 Jan 2023 16:34:09 +0000 Subject: [PATCH 051/235] [thresholding] add test for convertingToHls transformation for thresholding binary search node Signed-off-by: Fionn O'Donohoe --- .../test_convert_to_hls_thresholding.py | 322 ++++++++++++++++++ 1 file changed, 322 insertions(+) create mode 100755 tests/fpgadataflow/test_convert_to_hls_thresholding.py diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py new file mode 100755 index 0000000000..30932638b6 --- /dev/null +++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py @@ -0,0 +1,322 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import numpy as np +from onnx import TensorProto, helper +from pyverilator.util.axi_utils import axilite_write, reset_rtlsim +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.multithreshold import multithreshold +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor + +from finn.core.rtlsim_exec import rtlsim_exec +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from 
finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + +test_fpga_part = "xczu3eg-sbva484-1-e" +target_clk_ns = 5 + + +# Helper functions +def sort_thresholds_increasing(thresholds): + return np.sort(thresholds, axis=1) + + +def generate_random_threshold_values(input_data_type, num_input_channels, num_steps): + return np.random.randint( + input_data_type.min(), + input_data_type.max() + 1, + (num_input_channels, num_steps), + ).astype(np.float32) + + +def generate_pe_value(fold, num_input_channels): + if fold == -1: + fold = num_input_channels + pe = num_input_channels // fold + assert num_input_channels % pe == 0 + return pe + + +# n = batch, c = channel, h = height, w = width of feature map +# Standard = NCHW; FINN = NHWC +# Convert from NCHW to NHWC +def convert_np_array_to_finn_data_layout(data): + return np.transpose(data, (0, 2, 3, 1)) + + +# n = batch, c = channel, h = height, w = width of feature map +# Standard = NCHW; FINN = NHWC +# Convert from NHWC to NCHW +def convert_np_array_to_standard_data_layout(data): + return np.transpose(data, (0, 3, 1, 2)) + + +def make_single_thresholding_binary_search_modelwrapper( + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + num_input_vecs, +): + NumChannels = thresholds.shape[0] + + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, num_input_vecs + [NumChannels] + ) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, num_input_vecs + [NumChannels] + ) + + node_inp_list = ["inp", "thresh"] + + Thresholding_node = helper.make_node( + "Thresholding_Binary_Search", + node_inp_list, + ["outp"], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=NumChannels, + PE=pe, + numSteps=thresholds.shape[1], + 
inputDataType=input_data_type.name, + weightDataType=input_data_type.name, + outputDataType=output_data_type.name, + numInputVectors=num_input_vecs, + activation_bias=activation_bias, + ) + graph = helper.make_graph( + nodes=[Thresholding_node], + name="thresholding_graph", + inputs=[inp], + outputs=[outp], + ) + + model = helper.make_model(graph, producer_name="thresholding-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", input_data_type) + model.set_tensor_datatype("outp", output_data_type) + + model.set_tensor_datatype("thresh", input_data_type) + model.set_initializer("thresh", thresholds) + return model + + +def make_single_multithresholding_modelwrapper( + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + num_input_vecs, +): + NumChannels = thresholds.shape[0] + + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, num_input_vecs + [NumChannels] + ) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, num_input_vecs + [NumChannels] + ) + + node_inp_list = ["inp", "thresh"] + + Multithresholding_node = helper.make_node( + "MultiThreshold", + node_inp_list, + ["outp"], + domain="qonnx.custom_op.general", + out_dtype=output_data_type.name, + out_bias=float(activation_bias), + out_scale=1.0, + ) + + graph = helper.make_graph( + nodes=[Multithresholding_node], + name="multithresholding_graph", + inputs=[inp], + outputs=[outp], + ) + + model = helper.make_model(graph, producer_name="multithresholding-model") + model = ModelWrapper(model) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(GiveUniqueNodeNames()) + + model.set_tensor_datatype("inp", input_data_type) + model.set_tensor_datatype("outp", output_data_type) + + model.set_tensor_datatype("thresh", input_data_type) + model.set_initializer("thresh", thresholds) + return model + + +@pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]]) 
+@pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) +@pytest.mark.parametrize("fold", [-1]) +@pytest.mark.parametrize("num_input_channels", [16]) +@pytest.mark.parametrize("mem_mode", ["decoupled", "const"]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +def test_convert_to_hls_tbs_rtl_variant(activation, input_data_type, fold, num_input_channels, mem_mode): + # Handle inputs to the test + pe = generate_pe_value(fold, num_input_channels) + num_steps = activation.get_num_possible_values() - 1 + + # Cppsim is not supported for this node (as it is an RTL node) + if mem_mode == "const": + pytest.skip("const memory mode not supported for RTL Thresholding Binary Search node") + elif mem_mode != "decoupled": + raise Exception("Unknown mem_mode: {}".format(mem_mode)) + + if activation == DataType["BIPOLAR"]: + pytest.skip("Only negative activations are supported for RTL Thresholding Binary Search node") + + # Other non-input parameters + num_input_vecs = [1, 2, 2] + output_data_type = activation + if output_data_type == DataType["BIPOLAR"]: + activation_bias = 0 + else: + activation_bias = output_data_type.min() + + # generate random input data + tensor_shape = tuple(num_input_vecs + [num_input_channels]) + x = gen_finn_dt_tensor(input_data_type, tensor_shape) + + # Generate random thresholds and sort in ascending order + thresholds = generate_random_threshold_values( + input_data_type, num_input_channels, num_steps + ) + + # provide non-decreasing/ascending thresholds + thresholds = sort_thresholds_increasing(thresholds) + + x_nhwc = convert_np_array_to_standard_data_layout(x) + y = multithreshold(x_nhwc, thresholds) + + # convert back to NHWC for comparison to hw outputs + y = convert_np_array_to_finn_data_layout(y) + if activation == DataType["BIPOLAR"]: + # binary to bipolar + y = 2 * y - 1 + else: + # signed offset + y += activation.min() + + # Generate model from input parameters to the test + model = 
make_single_thresholding_binary_search_modelwrapper( + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + num_input_vecs, + ) + + model = model.transform(InsertFIFO(True)) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + + # Retrieve the axilite programming sequence for weights - for decoupled mode only + tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] + tbs_inst = getCustomOp(tbs_node) + config = tbs_inst.get_dynamic_config(model) + + # Reshape generated data (not from model) + oshape = model.get_tensor_shape("outp") + y_expected = y.reshape(oshape) + + # Helper function that delivers the hook to program the thresholds via AXI-Lite + def config_hook(config): + if config is None: + return None + + def write_thresh_config(sim): + # axi_name = "s_axilite_0_" # works + axi_name = getCustomOp( + model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] + ).get_verilog_top_module_intf_names()["axilite"][0] + axi_name += "_0_" + + # Write config registers to the Threshold memory. + # The dictionary defines (addr, value) tuples. 
+ for config_entry in config.values(): + addr = config_entry[0] + val = config_entry[1] + axilite_write(sim, addr, val, basename=axi_name) + + reset_rtlsim(sim) + + return write_thresh_config + + input_dict = {"inp": x} + rtlsim_exec(model, input_dict, pre_hook=config_hook(config)) + y_produced = input_dict["outp"] + assert (y_produced == y_expected).all() + + #### Make a Multithreshold graph and convert to thresholding binary search node + new_model = make_single_multithresholding_modelwrapper( + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + num_input_vecs, + ) + + # Recreate the model using the ConvertToHLS transform + new_model = new_model.transform(to_hls.InferThresholdingLayer(mem_mode=mem_mode, use_rtl_variant=True)) + new_model = new_model.transform(InsertFIFO(True)) + new_model = new_model.transform(GiveUniqueNodeNames()) + new_model = new_model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + new_model = new_model.transform(HLSSynthIP()) + new_model = new_model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + + input_dict = {"inp": x} + rtlsim_exec(new_model, input_dict, pre_hook=config_hook(config)) + y_produced_new = input_dict["outp"] + assert (y_produced_new == y_expected).all() From 11464d87c4857dd2227935c198adbb6115250fe3 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 5 Jan 2023 16:35:32 +0000 Subject: [PATCH 052/235] [thresholding] skip tests with unsupported folding factor input Signed-off-by: Fionn O'Donohoe --- tests/fpgadataflow/test_convert_to_hls_thresholding.py | 6 +++++- .../test_fpgadataflow_thresholding_binary_search.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py index 30932638b6..3b56f40d9c 100755 --- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py @@ -194,7 +194,7 @@ def 
make_single_multithresholding_modelwrapper( @pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]]) @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) -@pytest.mark.parametrize("fold", [-1]) +@pytest.mark.parametrize("fold", [-1, 1, 2]) @pytest.mark.parametrize("num_input_channels", [16]) @pytest.mark.parametrize("mem_mode", ["decoupled", "const"]) @pytest.mark.fpgadataflow @@ -213,6 +213,10 @@ def test_convert_to_hls_tbs_rtl_variant(activation, input_data_type, fold, num_i if activation == DataType["BIPOLAR"]: pytest.skip("Only negative activations are supported for RTL Thresholding Binary Search node") + # Paralellisation not supported for thresholding binary search rtl node + if pe != 1: + pytest.skip("Paralellisation of IP not supported for RTL Thresholding Binary Search node") + # Other non-input parameters num_input_vecs = [1, 2, 2] output_data_type = activation diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py index 7ef5da8f23..0be91a2569 100755 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py @@ -191,7 +191,7 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): # N.B. - fold factor of '-1' is supported only (no PE/SIMD support) @pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]]) @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) -@pytest.mark.parametrize("fold", [-1]) # 1, 2, etc. 
will fail +@pytest.mark.parametrize("fold", [-1, 1, 2]) @pytest.mark.parametrize("num_input_channels", [16]) @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow @@ -204,6 +204,10 @@ def test_fpgadataflow_thresholding_binary_search( pe = generate_pe_value(fold, num_input_channels) num_steps = activation.get_num_possible_values() - 1 + # Paralellisation not supported for thresholding binary search rtl node + if pe != 1: + pytest.skip("Paralellisation of IP not supported for RTL Thresholding Binary Search node") + # Cppsim is not supported for this node (as it is an RTL node) if exec_mode == "cppsim": pytest.skip("cppsim not supported for RTL Thresholding Binary Search node") From e71b1c0e1487befd8ec04ac6ebcc0caf8d63b4a3 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 5 Jan 2023 16:45:16 +0000 Subject: [PATCH 053/235] [thresholding] add comments for attributes Signed-off-by: Fionn O'Donohoe --- src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index c342d235d9..711e3a8270 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -88,8 +88,12 @@ def get_nodeattr_types(self): # [4] is four vectors (like a FC layer with batch=4) # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) "numInputVectors": ("ints", False, [1]), + # name of the top module in verilog template. 
Used by PyVerilator + # and IPI generation "gen_top_module": ("s", False, ""), + # bias to be applied to outputs of the node "activation_bias": ("i", False, 0), + # used for IPI step "clkFreq": ("i", False, 200000000), } my_attrs.update(super().get_nodeattr_types()) From 3be1140fe68058c55fc1e3685609b6964ce7e993 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 5 Jan 2023 17:01:05 +0000 Subject: [PATCH 054/235] [thresholding] replace min() with signed() function Signed-off-by: Fionn O'Donohoe --- src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 711e3a8270..2073e95b41 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -393,7 +393,7 @@ def prepare_codegen_rtl_values(self): # Is the input datatype signed or unsigned? 
# The thresholding core needs to know this when comparing weights to inputs - if self.get_input_datatype().min() < 0: + if self.get_input_datatype().signed(): code_gen_dict["$SIGN$"] = ["signed"] else: code_gen_dict["$SIGN$"] = ["unsigned"] From e05effc20cd2e357f5bba38d2e320144b313c9f5 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 5 Jan 2023 17:40:42 +0000 Subject: [PATCH 055/235] [thresholding] fix formatting from pre-commit Signed-off-by: Fionn O'Donohoe --- .../thresholding_binary_search.py | 13 +++++++++-- .../fpgadataflow/convert_to_hls_layers.py | 6 ++--- src/finn/util/basic.py | 1 + .../test_convert_to_hls_thresholding.py | 22 ++++++++++++++----- ...fpgadataflow_thresholding_binary_search.py | 4 +++- 5 files changed, 34 insertions(+), 12 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 2073e95b41..d5d5c48cce 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -38,7 +38,7 @@ ) from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir, find_next_power_of_2 +from finn.util.basic import find_next_power_of_2, get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import ( npy_to_rtlsim_input, pack_innermost_dim_as_hex_string, @@ -671,7 +671,16 @@ def get_dynamic_config(self, model, address_stride=1): ) config[key_name] = ( channel_start_addr + addr, - int(str(pack_innermost_dim_as_hex_string([weight], self.get_weight_datatype(), self.get_weight_datatype().bitwidth())), 0), + int( + str( + pack_innermost_dim_as_hex_string( + [weight], + self.get_weight_datatype(), + self.get_weight_datatype().bitwidth(), + ) + ), + 0, + ), ) weight_cntr += 1 diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py 
b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index 17f839c5c5..a0461bda82 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -1167,10 +1167,10 @@ def apply(self, model): else: if self.use_rtl_variant: warnings.warn( - """%s : RTL Thresholding requested for unsupported + """%s : RTL Thresholding requested for unsupported configuration. Falling back to HLS implementation.""" - % node.name - ) + % node.name + ) # create and insert new Thresholding_Batch node new_node = helper.make_node( diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 9a66cf90eb..8782bd7f8c 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -217,6 +217,7 @@ def is_exe(fpath): return None + def find_next_power_of_2(n): # Negative values will loop infinitely below - return 0 if n <= 0: diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py index 3b56f40d9c..d0502a9b74 100755 --- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py @@ -40,8 +40,8 @@ from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.basic import gen_finn_dt_tensor -from finn.core.rtlsim_exec import rtlsim_exec import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.core.rtlsim_exec import rtlsim_exec from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -199,23 +199,31 @@ def make_single_multithresholding_modelwrapper( @pytest.mark.parametrize("mem_mode", ["decoupled", "const"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_convert_to_hls_tbs_rtl_variant(activation, input_data_type, fold, num_input_channels, mem_mode): +def 
test_convert_to_hls_tbs_rtl_variant( + activation, input_data_type, fold, num_input_channels, mem_mode +): # Handle inputs to the test pe = generate_pe_value(fold, num_input_channels) num_steps = activation.get_num_possible_values() - 1 # Cppsim is not supported for this node (as it is an RTL node) if mem_mode == "const": - pytest.skip("const memory mode not supported for RTL Thresholding Binary Search node") + pytest.skip( + "const memory mode not supported for RTL Thresholding Binary Search node" + ) elif mem_mode != "decoupled": raise Exception("Unknown mem_mode: {}".format(mem_mode)) if activation == DataType["BIPOLAR"]: - pytest.skip("Only negative activations are supported for RTL Thresholding Binary Search node") + pytest.skip( + "Only negative activations are supported for RTL Thresholding Binary Search node" + ) # Paralellisation not supported for thresholding binary search rtl node if pe != 1: - pytest.skip("Paralellisation of IP not supported for RTL Thresholding Binary Search node") + pytest.skip( + "Paralellisation of IP not supported for RTL Thresholding Binary Search node" + ) # Other non-input parameters num_input_vecs = [1, 2, 2] @@ -313,7 +321,9 @@ def write_thresh_config(sim): ) # Recreate the model using the ConvertToHLS transform - new_model = new_model.transform(to_hls.InferThresholdingLayer(mem_mode=mem_mode, use_rtl_variant=True)) + new_model = new_model.transform( + to_hls.InferThresholdingLayer(mem_mode=mem_mode, use_rtl_variant=True) + ) new_model = new_model.transform(InsertFIFO(True)) new_model = new_model.transform(GiveUniqueNodeNames()) new_model = new_model.transform(PrepareIP(test_fpga_part, target_clk_ns)) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py index 0be91a2569..f1a03a3a89 100755 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py 
@@ -206,7 +206,9 @@ def test_fpgadataflow_thresholding_binary_search( # Paralellisation not supported for thresholding binary search rtl node if pe != 1: - pytest.skip("Paralellisation of IP not supported for RTL Thresholding Binary Search node") + pytest.skip( + "Paralellisation of IP not supported for RTL Thresholding Binary Search node" + ) # Cppsim is not supported for this node (as it is an RTL node) if exec_mode == "cppsim": From 48c33042bbc7b17f98510a8299504e4d36c3a2e8 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 5 Jan 2023 17:47:16 +0000 Subject: [PATCH 056/235] [thresholding] fix more flake8 formatting Signed-off-by: Fionn O'Donohoe --- .../fpgadataflow/convert_to_hls_layers.py | 4 ++-- .../fpgadataflow/test_convert_to_hls_thresholding.py | 12 +++++------- .../test_fpgadataflow_thresholding_binary_search.py | 5 ++--- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index a0461bda82..f6dd466fab 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -1157,7 +1157,7 @@ def apply(self, model): PE=pe, numSteps=thl_thres_shape[1], inputDataType=idt.name, - weightDataType=idt.name, # will be set by MinimizeAccumulatorWidth + weightDataType=idt.name, outputDataType=odt.name, numInputVectors=list(thl_in_shape[:-1]), activation_bias=actval, @@ -1183,7 +1183,7 @@ def apply(self, model): PE=pe, numSteps=thl_thres_shape[1], inputDataType=idt.name, - weightDataType=idt.name, # will be set by MinimizeAccumulatorWidth + weightDataType=idt.name, outputDataType=odt.name, numInputVectors=list(thl_in_shape[:-1]), ActVal=actval, diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py index d0502a9b74..2785d91617 100755 --- 
a/tests/fpgadataflow/test_convert_to_hls_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py @@ -42,14 +42,10 @@ import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls from finn.core.rtlsim_exec import rtlsim_exec -from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO -from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -209,14 +205,16 @@ def test_convert_to_hls_tbs_rtl_variant( # Cppsim is not supported for this node (as it is an RTL node) if mem_mode == "const": pytest.skip( - "const memory mode not supported for RTL Thresholding Binary Search node" + "const memory mode not supported for " \ + "RTL Thresholding Binary Search node" ) elif mem_mode != "decoupled": raise Exception("Unknown mem_mode: {}".format(mem_mode)) if activation == DataType["BIPOLAR"]: pytest.skip( - "Only negative activations are supported for RTL Thresholding Binary Search node" + "Only negative activations are supported for " \ + "RTL Thresholding Binary Search node" ) # Paralellisation not supported for thresholding binary search rtl node @@ -310,7 +308,7 @@ def write_thresh_config(sim): y_produced = input_dict["outp"] assert (y_produced == y_expected).all() - #### Make a Multithreshold graph and convert to thresholding binary search node + # Make a Multithreshold graph and convert to thresholding binary search node new_model = make_single_multithresholding_modelwrapper( thresholds, pe, diff --git 
a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py index f1a03a3a89..a4eab1e181 100755 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py @@ -39,11 +39,9 @@ from qonnx.util.basic import gen_finn_dt_tensor from finn.core.rtlsim_exec import rtlsim_exec -from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO -from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode @@ -207,7 +205,8 @@ def test_fpgadataflow_thresholding_binary_search( # Paralellisation not supported for thresholding binary search rtl node if pe != 1: pytest.skip( - "Paralellisation of IP not supported for RTL Thresholding Binary Search node" + "Paralellisation of IP not supported for " \ + "RTL Thresholding Binary Search node" ) # Cppsim is not supported for this node (as it is an RTL node) From 1e8a36ca3712100caeed506976a92c7e2ee4b4c4 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 5 Jan 2023 17:55:20 +0000 Subject: [PATCH 057/235] [thresholding] remove backslashes for flake8 Signed-off-by: Fionn O'Donohoe --- tests/fpgadataflow/test_convert_to_hls_thresholding.py | 4 ++-- .../test_fpgadataflow_thresholding_binary_search.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py index 2785d91617..217ee39d74 100755 --- 
a/tests/fpgadataflow/test_convert_to_hls_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py @@ -205,7 +205,7 @@ def test_convert_to_hls_tbs_rtl_variant( # Cppsim is not supported for this node (as it is an RTL node) if mem_mode == "const": pytest.skip( - "const memory mode not supported for " \ + "const memory mode not supported for " "RTL Thresholding Binary Search node" ) elif mem_mode != "decoupled": @@ -213,7 +213,7 @@ def test_convert_to_hls_tbs_rtl_variant( if activation == DataType["BIPOLAR"]: pytest.skip( - "Only negative activations are supported for " \ + "Only negative activations are supported for " "RTL Thresholding Binary Search node" ) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py index a4eab1e181..049d65835f 100755 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py @@ -205,7 +205,7 @@ def test_fpgadataflow_thresholding_binary_search( # Paralellisation not supported for thresholding binary search rtl node if pe != 1: pytest.skip( - "Paralellisation of IP not supported for " \ + "Paralellisation of IP not supported for " "RTL Thresholding Binary Search node" ) From 08f1b5f49e0d5180fa739056209bc5f0a8589c7e Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 5 Jan 2023 18:00:57 +0000 Subject: [PATCH 058/235] [thresholding] more flake8 fixes Signed-off-by: Fionn O'Donohoe --- tests/fpgadataflow/test_convert_to_hls_thresholding.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py index 217ee39d74..45705dc833 100755 --- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py @@ -205,8 +205,7 @@ def test_convert_to_hls_tbs_rtl_variant( # 
Cppsim is not supported for this node (as it is an RTL node) if mem_mode == "const": pytest.skip( - "const memory mode not supported for " - "RTL Thresholding Binary Search node" + "const memory mode not supported for this node" ) elif mem_mode != "decoupled": raise Exception("Unknown mem_mode: {}".format(mem_mode)) From 481d773257e41ad04f2bb5e1b614decfac4312ab Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 5 Jan 2023 18:02:55 +0000 Subject: [PATCH 059/235] [thresholding] undo flake8 fixes Signed-off-by: Fionn O'Donohoe --- tests/fpgadataflow/test_convert_to_hls_thresholding.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py index 45705dc833..cee06ebec9 100755 --- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py @@ -204,9 +204,7 @@ def test_convert_to_hls_tbs_rtl_variant( # Cppsim is not supported for this node (as it is an RTL node) if mem_mode == "const": - pytest.skip( - "const memory mode not supported for this node" - ) + pytest.skip("const memory mode not supported for this node") elif mem_mode != "decoupled": raise Exception("Unknown mem_mode: {}".format(mem_mode)) From a51bef4e3ea906b056eb7fb3fbb114a2ae12b6aa Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 5 Jan 2023 18:04:59 +0000 Subject: [PATCH 060/235] [thresholding] another flake8 fix Signed-off-by: Fionn O'Donohoe --- tests/fpgadataflow/test_convert_to_hls_thresholding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py index cee06ebec9..07821983e1 100755 --- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py @@ -217,7 +217,7 @@ def test_convert_to_hls_tbs_rtl_variant( # Paralellisation not 
supported for thresholding binary search rtl node if pe != 1: pytest.skip( - "Paralellisation of IP not supported for RTL Thresholding Binary Search node" + "Paralellisation not supported for RTL Thresholding Binary Search node" ) # Other non-input parameters From 2c313ad01465f66a9e6f367cf6552f64b6a1dab3 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Fri, 6 Jan 2023 11:11:25 +0000 Subject: [PATCH 061/235] [thresholding] remove cppsim test file generation Signed-off-by: Fionn O'Donohoe --- .../custom_op/fpgadataflow/thresholding_binary_search.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index d5d5c48cce..1a5faad72a 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -239,13 +239,6 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): np.mod(orig_thres_matrix, 1), 0 ).all(), "Need int threshold tensor" ret = orig_thres_matrix - # workaround for vivado_hls threshold bug - if ret[0][0] == 0 and n_thres_steps == 1: - ret = np.copy(ret) - ret[0][0] = 1 - warnings.warn( - "Setting 0-valued first threshold to 1 to avoid vivado_hls bug" - ) # ensure channels = mh , duplicating if necessary if ret.shape[0] == 1: ret = np.tile(ret, (mh, 1)) From 49bdd28e4edc3d47ccb57161e073fcde2a2cb216 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Fri, 6 Jan 2023 11:14:22 +0000 Subject: [PATCH 062/235] [thresholding] remove unnecessary data generation functions for simulators Signed-off-by: Fionn O'Donohoe --- .../thresholding_binary_search.py | 107 ------------------ 1 file changed, 107 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 1a5faad72a..7b37b2029a 100755 --- 
a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -261,84 +261,6 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): rows between PEs is not as expected (n_thres_steps)""" return ret.reshape(1, pe, tmem, n_thres_steps) - def make_weight_file(self, weights, weight_file_mode, weight_file_name): - """Produce a file containing given weights (thresholds) in appropriate - format for this layer. This file can be used for either synthesis or - run-time reconfig of weights. - - Arguments: - * weights : numpy array with weights to be put into the file - * weight_file_mode : one of {hls_header, decoupled_verilog_dat, - decoupled_runtime} - * weight_file_name : filename for the weight file to be generated - """ - # There are 'decoupled_*' flavors, just make sure that the flavors - # are decoupled related - if "decoupled" not in weight_file_mode: - raise Exception( - "Unrecognized memory mode for this node: {}".format(weight_file_mode) - ) - - threshold_tensor = self.get_hls_compatible_threshold_tensor(weights) - tdt = self.get_weight_datatype() - assert np.vectorize(tdt.allowed)( - threshold_tensor - ).all(), "Thresholds can't be expressed with type %s" % str(tdt) - - # streaming thresholds need to be organized differently - # (1, pe, tmem, n_thres_steps) -> (1, tmem, pe, n_thres_steps) - decoupled_thres = np.transpose(threshold_tensor, (0, 2, 1, 3)) - # (1, tmem, pe, n_thres_steps) -(1, tmem, pe * n_thres_steps) - pe = self.get_nodeattr("PE") - n_thres_steps = self.get_nodeattr("numSteps") - decoupled_thres_pe_flipped = np.flip(decoupled_thres, axis=-2) - decoupled_thres = decoupled_thres.reshape(1, -1, pe * n_thres_steps) - decoupled_thres = decoupled_thres.copy() - decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.reshape( - 1, -1, pe * n_thres_steps - ) - decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.copy() - - if weight_file_mode == "decoupled_npy": - # 
save weight stream into npy for cppsim - np.save(weight_file_name, decoupled_thres) - elif weight_file_mode == "decoupled_verilog_dat": - # convert weight values into hexstring - weight_width = self.get_weightstream_width() - # pad to nearest 4 bits to get hex strings - weight_width_padded = roundup_to_integer_multiple(weight_width, 4) - weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( - decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix="" - ) - weight_stream = weight_tensor_pe_flipped.flatten() - weight_stream = weight_stream.copy() - with open(weight_file_name, "w") as f: - for val in weight_stream: - f.write(val + "\n") - elif weight_file_mode == "decoupled_runtime": - # memstream axi-lite interface will map each mem line to - # one or multiple 32-bit words - weight_width = self.get_weightstream_width() - words_per_memwidth = 2 ** ceil(log2(weight_width / 32)) - if words_per_memwidth < 1: - words_per_memwidth = 1 - weight_width_padded = words_per_memwidth * 32 - # first, pack and ensure padding to 32 bits - weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( - decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix="" - ) - weight_stream = weight_tensor_pe_flipped.flatten() - weight_stream = weight_stream.copy() - with open(weight_file_name, "w") as f: - for val in weight_stream: - # split into groups of 8 hex digits (= 32 bits) - words_32b = textwrap.wrap(val, 8) - words_32b.reverse() - for word_32b in words_32b: - f.write(word_32b + "\n") - else: - raise Exception("Decoupled weight export not yet implemented") - # Get the integer from the DataType and string-ify it # This assumes that the data is in the form "INTx" or similar def conv_datatype_to_str(self, data_type): @@ -449,35 +371,6 @@ def code_generation_ipgen(self, model, fpgapart, clk): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") self.set_nodeattr("ipgen_path", code_gen_dir) self.set_nodeattr("ip_path", code_gen_dir) - - # Generate params for RTLSim - 
self.generate_params(model, code_gen_dir) - - def generate_params(self, model, path): - code_gen_dir = path - weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir) - thresholds = model.get_initializer(self.onnx_node.input[1]) - self.make_weight_file(thresholds, "decoupled_npy", weight_filename_sim) - - # Verilog.dat thresholds: - # also save weights as Verilog .dat file - # note that we provide two different .dat files, one for synth - # and one for synthesis. this is because URAM-based weights always - # need zero weights for synthesis, otherwise they get inferred - # as BRAM - weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir) - weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir) - # sim weights are always the true weights - self.make_weight_file( - thresholds, "decoupled_verilog_dat", weight_filename_rtl_sim - ) - - # Synthesis thresholds: - synth_thresholds = thresholds - self.make_weight_file( - synth_thresholds, "decoupled_verilog_dat", weight_filename_rtl_synth - ) - return def prepare_rtlsim(self): From e663030e98dc6c1f194ccec1d8e5d65b9599c19c Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Fri, 6 Jan 2023 11:27:46 +0000 Subject: [PATCH 063/235] [thresholding] remove potentially problematic helper function Signed-off-by: Fionn O'Donohoe --- .../thresholding_binary_search.py | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 7b37b2029a..b14eaa1669 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -261,14 +261,6 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): rows between PEs is not as expected (n_thres_steps)""" return ret.reshape(1, pe, tmem, n_thres_steps) - # Get the integer from the DataType and string-ify it - # This 
assumes that the data is in the form "INTx" or similar - def conv_datatype_to_str(self, data_type): - # Handle the case that an int is passed to the function - if isinstance(data_type, int): - return str(data_type) - return str(DataType[data_type].bitwidth()) - def prepare_codegen_rtl_values(self): """All dictionary values produced in this function are to replace their key value(s) in the RTL template files""" @@ -294,16 +286,16 @@ def prepare_codegen_rtl_values(self): bias = self.get_nodeattr("activation_bias") # activation bias value code_gen_dict["$N$"] = [ - self.conv_datatype_to_str(output_data_type) - ] # output precision + str(DataType[output_data_type].bitwidth()) + ] # output precision - convert bitwidth to string code_gen_dict["$M$"] = [ - self.conv_datatype_to_str(input_data_type) - ] # input/threshold precision + str(DataType[input_data_type].bitwidth()) + ] # input/threshold precision - convert bitwidth to string code_gen_dict["$C$"] = [ - self.conv_datatype_to_str(num_channels) + str(num_channels) ] # number of channels code_gen_dict["$BIAS$"] = [ - self.conv_datatype_to_str(bias) + str(bias) ] # activation bias value # Is the input datatype signed or unsigned? 
From 42dbf23938fdd1a302e88706302980c718a66d05 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Fri, 6 Jan 2023 11:35:36 +0000 Subject: [PATCH 064/235] [thresholding] implement flake8 formatting Signed-off-by: Fionn O'Donohoe --- .../custom_op/fpgadataflow/thresholding_binary_search.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index b14eaa1669..6dc9130792 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -291,12 +291,8 @@ def prepare_codegen_rtl_values(self): code_gen_dict["$M$"] = [ str(DataType[input_data_type].bitwidth()) ] # input/threshold precision - convert bitwidth to string - code_gen_dict["$C$"] = [ - str(num_channels) - ] # number of channels - code_gen_dict["$BIAS$"] = [ - str(bias) - ] # activation bias value + code_gen_dict["$C$"] = [str(num_channels)] # number of channels + code_gen_dict["$BIAS$"] = [str(bias)] # activation bias value # Is the input datatype signed or unsigned? 
# The thresholding core needs to know this when comparing weights to inputs From 933d7476d3336a6aec9c4dea852acb25ebdf4b46 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Fri, 6 Jan 2023 11:38:11 +0000 Subject: [PATCH 065/235] [thresholding] remove unused imports Signed-off-by: Fionn O'Donohoe --- src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 6dc9130792..9e1dd454f1 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -28,9 +28,7 @@ import numpy as np import os -import textwrap import warnings -from math import ceil, log2 from qonnx.core.datatype import DataType from qonnx.util.basic import ( interleave_matrix_outer_dim_from_partitions, From 5c6dcd9b89a7b35328676855c5c5ac13e06da90f Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Fri, 6 Jan 2023 11:40:13 +0000 Subject: [PATCH 066/235] [thresholding] remove last unused import Signed-off-by: Fionn O'Donohoe --- src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 9e1dd454f1..a2e0f404b2 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -32,7 +32,6 @@ from qonnx.core.datatype import DataType from qonnx.util.basic import ( interleave_matrix_outer_dim_from_partitions, - roundup_to_integer_multiple, ) from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp From 51acd119eb8864ff302d0f040fcb0307c2778ccf Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Fri, 6 Jan 2023 11:42:12 +0000 Subject: [PATCH 067/235] [thresholding] reformat 
existing import Signed-off-by: Fionn O'Donohoe --- src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index a2e0f404b2..595a643acc 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -30,9 +30,7 @@ import os import warnings from qonnx.core.datatype import DataType -from qonnx.util.basic import ( - interleave_matrix_outer_dim_from_partitions, -) +from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp from finn.util.basic import find_next_power_of_2, get_rtlsim_trace_depth, make_build_dir From b886a5ae08e608808795bc584da0650eb8ff260f Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 18 Jan 2023 11:25:51 +0000 Subject: [PATCH 068/235] [Docs] Add bin search thresholding to docs generation --- docs/finn/source_code/finn.custom_op.fpgadataflow.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst index fdcf44c6d9..3627855cfb 100644 --- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst @@ -203,6 +203,14 @@ finn.custom\_op.fpgadataflow.thresholding\_batch :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.thresholding\_binary\_search +----------------------------------------------------------- + +.. 
automodule:: finn.custom_op.fpgadataflow.thresholding_binary_search + :members: + :undoc-members: + :show-inheritance: + finn.custom\_op.fpgadataflow.tlastmarker ----------------------------------------------- From 2c3de2ab7ad12c89ee4af52e611532ff4255e258 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Mon, 23 Jan 2023 07:28:39 +0000 Subject: [PATCH 069/235] Corrected address width in Verilog wrapper for thresholding. --- finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v index e3f8596bc8..5068cb549c 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v +++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v @@ -51,7 +51,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( // Writing input s_axilite_AWVALID, output s_axilite_AWREADY, - input [C_BITS+N-1:0] s_axilite_AWADDR, + input [C_BITS+N+1:0] s_axilite_AWADDR, input s_axilite_WVALID, output s_axilite_WREADY, From 7c9f5d8805b288a299cd1970d797af0d24327577 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Mon, 23 Jan 2023 11:57:16 +0000 Subject: [PATCH 070/235] [thresholding] remove bug affecting input width in top level wrapper The C_BITS parameter is calculating the correct width needed for the top level wrapper for the thresholding binary search IP. However, the parameter is not 'synthesizing' correctly and does not update the width for the affected s_axilite_AWADDR signal. This results in the MSBs of the input signal being truncated. These missing bits affected addressing when writing weights into the core. Weights were written to the incorrect addresses in the core causing incorrect thresholding to occur. 
Signed-off-by: Fionn O'Donohoe --- finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v index 5068cb549c..768e7b6a5b 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v +++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v @@ -38,7 +38,6 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter C = $C$, // Channels int BIAS = $BIAS$, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) - parameter C_BITS = C < 2 ? 1 : $clog2(C), parameter O_BITS = BIAS > 0? /* unsigned */ $clog2(2**N-BIAS) : /* signed */ 1+$clog2(BIAS >= 2**(N-1)? BIAS : 2**N-BIAS) @@ -49,9 +48,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #( //- AXI Lite ------------------------ // Writing - input s_axilite_AWVALID, - output s_axilite_AWREADY, - input [C_BITS+N+1:0] s_axilite_AWADDR, + input s_axilite_AWVALID, + output s_axilite_AWREADY, + input [$clog2(C)+N+1:0] s_axilite_AWADDR, input s_axilite_WVALID, output s_axilite_WREADY, From 3a0d59dd6717daedb043ea83d6873e6c663b0d06 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Mon, 23 Jan 2023 11:59:17 +0000 Subject: [PATCH 071/235] [thresholding] adjust thresholding binary search tests to use word addressing for programming thresholds with axilite Signed-off-by: Fionn O'Donohoe --- tests/fpgadataflow/test_convert_to_hls_thresholding.py | 2 +- .../test_fpgadataflow_thresholding_binary_search.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py index 07821983e1..9486513402 100755 --- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py @@ -271,7 +271,7 @@ def test_convert_to_hls_tbs_rtl_variant( # Retrieve the axilite programming sequence for weights - 
for decoupled mode only tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] tbs_inst = getCustomOp(tbs_node) - config = tbs_inst.get_dynamic_config(model) + config = tbs_inst.get_dynamic_config(model, 4) # Reshape generated data (not from model) oshape = model.get_tensor_shape("outp") diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py index 049d65835f..2a34971f0d 100755 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py @@ -266,7 +266,7 @@ def test_fpgadataflow_thresholding_binary_search( # Retrieve the axilite programming sequence for weights - for decoupled mode only tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] tbs_inst = getCustomOp(tbs_node) - config = tbs_inst.get_dynamic_config(model) + config = tbs_inst.get_dynamic_config(model, 4) # Reshape generated data (not from model) oshape = model.get_tensor_shape("outp") From 757e3a1398948878e866f4fe5fe1747206a1c7d9 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Mon, 23 Jan 2023 12:05:40 +0000 Subject: [PATCH 072/235] [thresholding] adjust typo in exception Signed-off-by: Fionn O'Donohoe --- src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 595a643acc..f2f9e133b2 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -421,7 +421,7 @@ def execute_node(self, context, graph): reshaped_input, ) elif in_ind > 2: - raise Exception("Unexpected input found for Thresholding_Batch") + raise Exception("Unexpected input found for Thresholding_Binary_Search") in_ind += 1 # Create a PyVerilator 
wrapper of the RTLSim .so From 479575b224559680c559c7af5fd4f09582529919 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Mon, 23 Jan 2023 12:07:22 +0000 Subject: [PATCH 073/235] [thresholding] undo copyright header change - only needed for new files Signed-off-by: Fionn O'Donohoe --- src/finn/custom_op/fpgadataflow/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 0e17726d48..dc9a5a349a 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. +# Copyright (c) 2020, Xilinx # All rights reserved. # # Redistribution and use in source and binary forms, with or without From 0d99b6c8ed358b2feea41cc8af242d40b30c8d97 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Mon, 23 Jan 2023 12:54:19 +0000 Subject: [PATCH 074/235] [thresholding] add docstring for migrated find_next_power_of_2() function Signed-off-by: Fionn O'Donohoe --- src/finn/util/basic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 8782bd7f8c..ee185aa94f 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -219,6 +219,7 @@ def is_exe(fpath): def find_next_power_of_2(n): + """For any integer 'n', find the next greatest power of 2""" # Negative values will loop infinitely below - return 0 if n <= 0: return 0 From 5a77a326558de1ecd59e61aae38575b73ac54b1b Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Mon, 23 Jan 2023 12:55:40 +0000 Subject: [PATCH 075/235] [thresholding] add docstrings for methods not in base class Signed-off-by: Fionn O'Donohoe --- .../fpgadataflow/thresholding_binary_search.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 
f2f9e133b2..7dfcd91d58 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -95,6 +95,7 @@ def get_nodeattr_types(self): return my_attrs def calc_tmem(self): + """Calculates and returns TMEM.""" num_channels = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") return num_channels // pe @@ -104,6 +105,8 @@ def make_shape_compatible_op(self, model): return super().make_const_shape_op(oshape) def infer_node_datatype(self, model): + """Used for FINN DataType inference: set the output tensors' datatypes + accordingly for this node""" node = self.onnx_node idt = model.get_tensor_datatype(node.input[0]) if idt != self.get_input_datatype(): @@ -119,6 +122,8 @@ def infer_node_datatype(self, model): model.set_tensor_datatype(node.output[0], odt) def verify_node(self): + """Required by the FINN analysis module. Checks if custom ops in graph + are correctly built, with all attributes and inputs.""" return [] def bram_estimation(self): @@ -170,6 +175,7 @@ def get_outstream_width(self, ind=0): return o_bits * self.get_nodeattr("PE") def get_weightstream_width(self): + """Returns weight stream width""" pe = self.get_nodeattr("PE") wp = self.get_weight_datatype().bitwidth() n_thres_steps = self.get_nodeattr("numSteps") @@ -299,20 +305,24 @@ def prepare_codegen_rtl_values(self): return code_gen_dict def get_rtl_file_list(self): + """Thresholding binary search RTL file list""" return ["thresholding.sv", "thresholding_axi.sv", "thresholding_axi_wrapper.v"] def get_rtl_file_paths(self): + """Get full path of all RTL files""" rtl_root_dir = os.environ["FINN_ROOT"] + "/finn-rtllib/thresholding/hdl/" rtl_file_list = self.get_rtl_file_list() rtl_file_paths = [rtl_root_dir + file for file in rtl_file_list] return rtl_file_paths def get_rtl_template_data(self, path): + """Return RTL file contents as a template""" with open(path, "r") as f: template = f.read() return template def 
fill_in_rtl_template_data(self, replace_dict, template_data): + """Use attribute values to finn in RTL template placeholders""" template_data_cp = template_data for key in replace_dict: replacement_line = "\n".join(replace_dict[key]) @@ -320,11 +330,13 @@ def fill_in_rtl_template_data(self, replace_dict, template_data): return template_data_cp def dump_rtl_data(self, dest_dir, filename, data): + """Dump filled-in-template RTL files for future synthesis step""" with open(os.path.join(dest_dir, filename), "w") as f: f.write(data) return def generate_hdl(self): + """Prepare HDL files from templates for synthesis""" # Generate a dictionary of values to put in RTL template code_gen_dict = self.prepare_codegen_rtl_values() From eeed0702125de77c293a4a702f213a1035829179 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Mon, 23 Jan 2023 12:56:22 +0000 Subject: [PATCH 076/235] [thresholding] remove unused method Signed-off-by: Fionn O'Donohoe --- src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 7dfcd91d58..94182b4ea0 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -209,9 +209,6 @@ def get_number_output_values(self): def get_exp_cycles(self): return 0 - def get_template_param_values(self): - return dict() - def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): """Convert the original numpy weight matrix orig_weight_matrix into a form suitable for passing to the hlslib call: From c2708686e22c9eaff18a5314c2f470fbbcb819f0 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Fri, 27 Jan 2023 14:55:16 +0000 Subject: [PATCH 077/235] [thresholding] remove 'return' at end of function - not needed Signed-off-by: Fionn O'Donohoe --- tests/util/test_basic.py | 2 -- 1 file changed, 2 
deletions(-) diff --git a/tests/util/test_basic.py b/tests/util/test_basic.py index d2586f4f19..97a8c50261 100755 --- a/tests/util/test_basic.py +++ b/tests/util/test_basic.py @@ -58,5 +58,3 @@ def test_next_power_of_2(): output = basic.find_next_power_of_2(test_dict["input"]) assert output >= test_dict["input"] assert output == test_dict["expected_result"] - - return From af22177e50ae808072d87a9d0c5260ccb6c3a67f Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Fri, 27 Jan 2023 14:59:48 +0000 Subject: [PATCH 078/235] [thresholding] remove cppsim exec_mode from test - not exercised Signed-off-by: Fionn O'Donohoe --- .../test_fpgadataflow_thresholding_binary_search.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py index 2a34971f0d..e57c4942c8 100755 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py @@ -191,12 +191,11 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) @pytest.mark.parametrize("fold", [-1, 1, 2]) @pytest.mark.parametrize("num_input_channels", [16]) -@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow def test_fpgadataflow_thresholding_binary_search( - activation, input_data_type, fold, num_input_channels, exec_mode + activation, input_data_type, fold, num_input_channels ): # Handle inputs to the test pe = generate_pe_value(fold, num_input_channels) @@ -209,12 +208,6 @@ def test_fpgadataflow_thresholding_binary_search( "RTL Thresholding Binary Search node" ) - # Cppsim is not supported for this node (as it is an RTL node) - if exec_mode == "cppsim": - pytest.skip("cppsim not supported for RTL Thresholding Binary Search 
node") - elif exec_mode != "rtlsim": - raise Exception("Unknown exec_mode: {}".format(exec_mode)) - # Other non-input parameters num_input_vecs = [1, 2, 2] output_data_type = activation From fab120b8218b2bacf8a94a23c7d250d0c5df12b6 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Fri, 27 Jan 2023 15:02:38 +0000 Subject: [PATCH 079/235] [thresholding] remove unused attributes Signed-off-by: Fionn O'Donohoe --- src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 94182b4ea0..43ae8e8233 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -75,9 +75,6 @@ def get_nodeattr_types(self): "inputDataType": ("s", True, ""), "weightDataType": ("s", True, ""), "outputDataType": ("s", True, ""), - # input and output FIFO depths - "inFIFODepth": ("i", False, 0), - "outFIFODepth": ("i", False, 0), # number of input vectors, examples: # [1] is a single vector (like a FC layer with batch=1) # [4] is four vectors (like a FC layer with batch=4) From 5d6c964443e0c41865a18e862830e0c27a307dd1 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Fri, 27 Jan 2023 15:47:41 +0000 Subject: [PATCH 080/235] [thresholding] adjust i/o port names on thresholding RTL wrapper Originally s_axis and m_axis port names on the thresholding RTL wrapper could synthesise, but did not adhere to the FINN i/o signal naming convention. The FINN compiler would not recognise the IP being synthesised and would rely on the IP having the correct IP/signal wiring steps in place. The FINN compiler did not recognise s_axis/m_axis signal naming and therefore did not automatically set the clock frequency of the IP to match the rest of the network. 
This required a Tcl command to set the clock frequency of the IP, as well as a user-configurable attribute to set the clock frequency for ease-of-use. It turns out that this actually reduces user ease-of-use. Having the compiler take care of the clock signalling is preferred. To do this, the s_axis/m_axis signals are renamed to in0_V/out_V, as the compiler expects, and this extra 'user configurability' can therefore be removed. Signed-off-by: Fionn O'Donohoe --- .../hdl/thresholding_axi_wrapper.v | 26 ++++++++++--------- .../thresholding_binary_search.py | 19 -------------- 2 files changed, 14 insertions(+), 31 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v index 768e7b6a5b..c16bf264dd 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v +++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v @@ -43,7 +43,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #( /* signed */ 1+$clog2(BIAS >= 2**(N-1)? 
BIAS : 2**N-BIAS) )( //- Global Control ------------------ + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) input ap_clk, + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) input ap_rst_n, //- AXI Lite ------------------------ @@ -72,14 +74,14 @@ module $MODULE_NAME_AXI_WRAPPER$ #( output [ 1:0] s_axilite_RRESP, //- AXI Stream - Input -------------- - output s_axis_tready, - input s_axis_tvalid, - input [((M+7)/8)*8-1:0] s_axis_tdata, + output in0_V_TREADY, + input in0_V_TVALID, + input [((M+7)/8)*8-1:0] in0_V_TDATA, //- AXI Stream - Output ------------- - input m_axis_tready, - output m_axis_tvalid, - output [((O_BITS+7)/8)*8-1:0] m_axis_tdata + input out_V_TREADY, + output out_V_TVALID, + output [((O_BITS+7)/8)*8-1:0] out_V_TDATA ); $MODULE_NAME_AXI$ #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS)) inst ( @@ -113,14 +115,14 @@ module $MODULE_NAME_AXI_WRAPPER$ #( .s_axilite_RRESP(s_axilite_RRESP), //- AXI Stream - Input -------------- - .s_axis_tready(s_axis_tready), - .s_axis_tvalid(s_axis_tvalid), - .s_axis_tdata(s_axis_tdata), + .s_axis_tready(in0_V_TREADY), + .s_axis_tvalid(in0_V_TVALID), + .s_axis_tdata(in0_V_TDATA), //- AXI Stream - Output ------------- - .m_axis_tready(m_axis_tready), - .m_axis_tvalid(m_axis_tvalid), - .m_axis_tdata(m_axis_tdata) + .m_axis_tready(out_V_TREADY), + .m_axis_tvalid(out_V_TVALID), + .m_axis_tdata(out_V_TDATA) ); endmodule : $MODULE_NAME_AXI_WRAPPER$ diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 43ae8e8233..97c9dd82c6 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -85,8 +85,6 @@ def get_nodeattr_types(self): "gen_top_module": ("s", False, ""), # bias to be applied to outputs of the node "activation_bias": ("i", False, 0), - # used for IPI step - "clkFreq": ("i", False, 200000000), } 
my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -477,10 +475,6 @@ def code_generation_ipi(self): cmd = [] rtl_file_list = self.get_rtl_file_list() code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - node_name = self.onnx_node.name - dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] - din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] - clock_freq = self.get_nodeattr("clkFreq") for rtl_file in rtl_file_list: cmd.append( @@ -493,16 +487,6 @@ def code_generation_ipi(self): % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) ) - cmd.append( - "set_property -dict [list CONFIG.FREQ_HZ {%d}] [%s %s/%s]" - % (clock_freq, "get_bd_intf_pins", node_name, din_name) - ) - - cmd.append( - "set_property -dict [list CONFIG.FREQ_HZ {%d}] [%s %s/%s]" - % (clock_freq, "get_bd_intf_pins", node_name, dout_name) - ) - return cmd def get_verilog_top_module_intf_names(self): @@ -517,9 +501,6 @@ def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() intf_names["axilite"] = ["s_axilite"] - intf_names["s_axis"] = [["s_axis"]] - intf_names["m_axis"] = [["m_axis"]] - return intf_names def get_dynamic_config(self, model, address_stride=1): From bdfa6cb97096680247b6648edf20d4c519dcad16 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Tue, 31 Jan 2023 11:41:24 +0000 Subject: [PATCH 081/235] [thresholding] remove duplicated test helper function Signed-off-by: Fionn O'Donohoe --- .../test_convert_to_hls_thresholding.py | 54 +------------------ 1 file changed, 2 insertions(+), 52 deletions(-) diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py index 9486513402..84521b395c 100755 --- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py @@ -46,6 +46,8 @@ from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from 
finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from test_fpgadataflow_thresholding_binary_search import make_single_thresholding_binary_search_modelwrapper + test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -86,58 +88,6 @@ def convert_np_array_to_standard_data_layout(data): return np.transpose(data, (0, 3, 1, 2)) -def make_single_thresholding_binary_search_modelwrapper( - thresholds, - pe, - input_data_type, - output_data_type, - activation_bias, - num_input_vecs, -): - NumChannels = thresholds.shape[0] - - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, num_input_vecs + [NumChannels] - ) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, num_input_vecs + [NumChannels] - ) - - node_inp_list = ["inp", "thresh"] - - Thresholding_node = helper.make_node( - "Thresholding_Binary_Search", - node_inp_list, - ["outp"], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=NumChannels, - PE=pe, - numSteps=thresholds.shape[1], - inputDataType=input_data_type.name, - weightDataType=input_data_type.name, - outputDataType=output_data_type.name, - numInputVectors=num_input_vecs, - activation_bias=activation_bias, - ) - graph = helper.make_graph( - nodes=[Thresholding_node], - name="thresholding_graph", - inputs=[inp], - outputs=[outp], - ) - - model = helper.make_model(graph, producer_name="thresholding-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", input_data_type) - model.set_tensor_datatype("outp", output_data_type) - - model.set_tensor_datatype("thresh", input_data_type) - model.set_initializer("thresh", thresholds) - return model - - def make_single_multithresholding_modelwrapper( thresholds, pe, From 6809351c5210c87a199e8b4167fa54b2dd9a48c8 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Tue, 31 Jan 2023 12:24:41 +0000 Subject: [PATCH 082/235] [thresholding] assert on finding unsupported 
memory mode for thresholding binary search HLS conversion function Signed-off-by: Fionn O'Donohoe --- .../fpgadataflow/convert_to_hls_layers.py | 12 ++++-------- .../fpgadataflow/test_convert_to_hls_thresholding.py | 11 ++--------- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index f6dd466fab..1a331b059f 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -1127,14 +1127,10 @@ def apply(self, model): # Perform checks for RTL variant if chosen if self.use_rtl_variant: - # Check memory mode - if self.mem_mode != "decoupled": - warnings.warn( - """%s : RTL Thresholding does not support 'decoupled' memory mode. - Falling back to HLS implementation.""" - % node.name - ) - is_rtl_variant_compatible = False + assert self.mem_mode == "decoupled", ( + """%s : RTL Thresholding only supports 'decoupled' memory mode.""" + % node.name + ) # Check PE/SIMD value if pe != 1: diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py index 84521b395c..d07ffd2cbf 100755 --- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py @@ -142,22 +142,15 @@ def make_single_multithresholding_modelwrapper( @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) @pytest.mark.parametrize("fold", [-1, 1, 2]) @pytest.mark.parametrize("num_input_channels", [16]) -@pytest.mark.parametrize("mem_mode", ["decoupled", "const"]) @pytest.mark.fpgadataflow @pytest.mark.vivado def test_convert_to_hls_tbs_rtl_variant( - activation, input_data_type, fold, num_input_channels, mem_mode + activation, input_data_type, fold, num_input_channels, ): # Handle inputs to the test pe = generate_pe_value(fold, num_input_channels) 
num_steps = activation.get_num_possible_values() - 1 - # Cppsim is not supported for this node (as it is an RTL node) - if mem_mode == "const": - pytest.skip("const memory mode not supported for this node") - elif mem_mode != "decoupled": - raise Exception("Unknown mem_mode: {}".format(mem_mode)) - if activation == DataType["BIPOLAR"]: pytest.skip( "Only negative activations are supported for " @@ -267,7 +260,7 @@ def write_thresh_config(sim): # Recreate the model using the ConvertToHLS transform new_model = new_model.transform( - to_hls.InferThresholdingLayer(mem_mode=mem_mode, use_rtl_variant=True) + to_hls.InferThresholdingLayer(mem_mode="decoupled", use_rtl_variant=True) ) new_model = new_model.transform(InsertFIFO(True)) new_model = new_model.transform(GiveUniqueNodeNames()) From 4515cf7c6d4e55f8dfca62b52b504e2666a6b497 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Tue, 31 Jan 2023 12:29:42 +0000 Subject: [PATCH 083/235] [thresholding] precommit fix Signed-off-by: Fionn O'Donohoe --- tests/fpgadataflow/test_convert_to_hls_thresholding.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py index d07ffd2cbf..75c4ef599c 100755 --- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py @@ -46,7 +46,9 @@ from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from test_fpgadataflow_thresholding_binary_search import make_single_thresholding_binary_search_modelwrapper +from test_fpgadataflow_thresholding_binary_search import ( + make_single_thresholding_binary_search_modelwrapper, +) test_fpga_part = "xczu3eg-sbva484-1-e" @@ -145,7 +147,10 @@ def make_single_multithresholding_modelwrapper( @pytest.mark.fpgadataflow 
@pytest.mark.vivado def test_convert_to_hls_tbs_rtl_variant( - activation, input_data_type, fold, num_input_channels, + activation, + input_data_type, + fold, + num_input_channels, ): # Handle inputs to the test pe = generate_pe_value(fold, num_input_channels) From b51498ef84edcd9362f4f83270f9ae39f5d7980f Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Tue, 31 Jan 2023 12:39:39 +0000 Subject: [PATCH 084/235] [thresholding] precommit fix 2 Signed-off-by: Fionn O'Donohoe --- tests/fpgadataflow/test_convert_to_hls_thresholding.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py index 75c4ef599c..09067564eb 100755 --- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py @@ -39,6 +39,9 @@ from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.basic import gen_finn_dt_tensor +from test_fpgadataflow_thresholding_binary_search import ( + make_single_thresholding_binary_search_modelwrapper, +) import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls from finn.core.rtlsim_exec import rtlsim_exec @@ -46,10 +49,6 @@ from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from test_fpgadataflow_thresholding_binary_search import ( - make_single_thresholding_binary_search_modelwrapper, -) - test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 From ff3b2014d5de4bf2a98c321d14bce15a9862bf74 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Tue, 31 Jan 2023 12:48:23 +0000 Subject: [PATCH 085/235] [thresholding] precommit fix 3 Signed-off-by: Fionn O'Donohoe --- src/finn/transformation/fpgadataflow/convert_to_hls_layers.py | 3 ++- 1 
file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index 1a331b059f..1bc5fee664 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -1128,7 +1128,8 @@ def apply(self, model): # Perform checks for RTL variant if chosen if self.use_rtl_variant: assert self.mem_mode == "decoupled", ( - """%s : RTL Thresholding only supports 'decoupled' memory mode.""" + """%s : RTL Thresholding only supports 'decoupled' memory + mode.""" % node.name ) From fc7e00db46414b88f3e1c3d3dc9dff4cf6bc84ff Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 23 Mar 2023 14:59:46 +0000 Subject: [PATCH 086/235] [thresholding] adjust templates so that .sv files are modular and can be used as standalone IP Signed-off-by: Fionn O'Donohoe --- finn-rtllib/thresholding/hdl/thresholding.sv | 4 ++-- finn-rtllib/thresholding/hdl/thresholding_axi.sv | 6 +++--- finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v | 2 +- .../custom_op/fpgadataflow/thresholding_binary_search.py | 6 +----- 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv index b26747d1ff..c7d5c86f6d 100644 --- a/finn-rtllib/thresholding/hdl/thresholding.sv +++ b/finn-rtllib/thresholding/hdl/thresholding.sv @@ -43,7 +43,7 @@ * threshold configuration relies on a channel address prefix. Inputs are * accompanied by a channel selector. 
*****************************************************************************/ -module $MODULE_NAME$ #( +module thresholding #( int unsigned N, // output precision int unsigned M, // input/threshold precision int unsigned C, // number of channels @@ -153,4 +153,4 @@ module $MODULE_NAME$ #( assign ocnl = pipe[N].cnl; assign odat = pipe[N].res + BIAS; -endmodule : $MODULE_NAME$ +endmodule : thresholding diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv index 5cd7746b82..79383c7996 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -38,7 +38,7 @@ * - performs aligned byte address to parameter word address translation. *****************************************************************************/ -module $MODULE_NAME_AXI$ #( +module thresholding_axi #( int unsigned N, // output precision int unsigned M, // input/threshold precision int unsigned C, // Channels @@ -197,7 +197,7 @@ module $MODULE_NAME_AXI$ #( end // Core Thresholding Module - $MODULE_NAME$ #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS), .C_BITS(C_BITS)) core ( + thresholding #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS), .C_BITS(C_BITS)) core ( .clk, .rst, .twe, .twa, .twd, .en, @@ -205,4 +205,4 @@ module $MODULE_NAME_AXI$ #( .ovld, .ocnl(), .odat ); -endmodule : $MODULE_NAME_AXI$ +endmodule : thresholding_axi diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v index c16bf264dd..e46d0046ee 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v +++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v @@ -84,7 +84,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( output [((O_BITS+7)/8)*8-1:0] out_V_TDATA ); - $MODULE_NAME_AXI$ #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS)) inst ( + thresholding_axi #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS)) inst ( //- Global Control 
------------------ .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 97c9dd82c6..9b02248185 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -259,11 +259,7 @@ def prepare_codegen_rtl_values(self): their key value(s) in the RTL template files""" code_gen_dict = {} - # Identify the module names - code_gen_dict["$MODULE_NAME$"] = [self.get_verilog_top_module_name()] - code_gen_dict["$MODULE_NAME_AXI$"] = [ - self.get_verilog_top_module_name() + "_axi" - ] + # Identify the module name code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [ self.get_verilog_top_module_name() + "_axi_wrapper" ] From f530aba05b05a59c5cd05b749666f89b82706cba Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 23 Mar 2023 15:50:43 +0000 Subject: [PATCH 087/235] [thresholding]: remove SIGN template in thresholding RTL and create parameter instead for more modular RTL Signed-off-by: Fionn O'Donohoe --- finn-rtllib/thresholding/hdl/thresholding.sv | 18 +++++++++++------- .../thresholding/hdl/thresholding_axi.sv | 3 ++- .../hdl/thresholding_axi_wrapper.v | 3 ++- .../fpgadataflow/thresholding_binary_search.py | 4 ++-- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv index c7d5c86f6d..deff4fe0f8 100644 --- a/finn-rtllib/thresholding/hdl/thresholding.sv +++ b/finn-rtllib/thresholding/hdl/thresholding.sv @@ -48,6 +48,7 @@ module thresholding #( int unsigned M, // input/threshold precision int unsigned C, // number of channels + bit SIGNED, // signed inputs int BIAS, // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS) int unsigned C_BITS, @@ -68,7 +69,7 @@ module thresholding #( // Input Stream input logic ivld, input logic [C_BITS-1:0] icnl, // Ignored for C == 1 
- input logic $SIGN$ [M -1:0] idat, + input logic [M -1:0] idat, // Output Stream output logic ovld, @@ -80,7 +81,7 @@ module thresholding #( typedef struct packed { logic vld; // Valid data identification logic [C_BITS-1:0] cnl; // Channel - logic $SIGN$ [M -1:0] val; // Original input value + logic [M -1:0] val; // Original input value logic [0:N-1] res; // Assembling result with valid prefix [0:stage] after stage #stage } pipe_t; uwire pipe_t pipe[0:N]; @@ -91,13 +92,13 @@ module thresholding #( for(genvar stage = 0; stage < N; stage++) begin : genStages // Threshold Memory - uwire $SIGN$ [M-1:0] thresh; + uwire [M-1:0] thresh; if(1) begin : blkUpdate // Write control: local select from global address uwire we = twe && tws[stage]; if((C == 1) && (stage == 0)) begin - logic $SIGN$ [M-1:0] Thresh = 'x; + logic [M-1:0] Thresh = 'x; always_ff @(posedge clk) begin if(rst) Thresh <= 'x; else if(we) Thresh <= twd; @@ -105,7 +106,7 @@ module thresholding #( assign thresh = Thresh; end else begin - logic $SIGN$ [M-1:0] Threshs[C * 2**stage]; + logic [M-1:0] Threshs[C * 2**stage]; uwire [$clog2(C)+stage-1:0] wa = twa[$left(twa):N-stage]; uwire [$clog2(C)+stage-1:0] ra; if(C > 1) assign ra[stage+:C_BITS] = pipe[stage].cnl; @@ -117,7 +118,7 @@ module thresholding #( end // Read - logic $SIGN$ [M-1:0] RdReg; + logic [M-1:0] RdReg; always_ff @(posedge clk) begin if(en) RdReg <= Threshs[ra]; end @@ -135,9 +136,12 @@ module thresholding #( // Assemble pipeline data logic [0:N-1] res; + uwire cmp = + SIGNED? 
$signed(thresh) <= $signed(State.val) : + /* else */ $unsigned(thresh) <= $unsigned(State.val); always_comb begin res = State.res; - res[stage] = thresh <= State.val; // Patch in next result bit + res[stage] = cmp; // Patch in next result bit end assign pipe[stage+1] = '{ vld: State.vld, diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv index 79383c7996..6099a64746 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -43,6 +43,7 @@ module thresholding_axi #( int unsigned M, // input/threshold precision int unsigned C, // Channels + bit SIGNED, // signed inputs int BIAS, // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS) int unsigned O_BITS @@ -197,7 +198,7 @@ module thresholding_axi #( end // Core Thresholding Module - thresholding #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS), .C_BITS(C_BITS)) core ( + thresholding #(.N(N), .M(M), .C(C), .SIGNED(SIGNED), .BIAS(BIAS), .O_BITS(O_BITS), .C_BITS(C_BITS)) core ( .clk, .rst, .twe, .twa, .twd, .en, diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v index e46d0046ee..caf850b5bc 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v +++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v @@ -36,6 +36,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter N = $N$, // output precision parameter M = $M$, // input/threshold precision parameter C = $C$, // Channels + parameter SIGNED = $SIGNED$, // signed inputs int BIAS = $BIAS$, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) parameter O_BITS = BIAS > 0? 
@@ -84,7 +85,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( output [((O_BITS+7)/8)*8-1:0] out_V_TDATA ); - thresholding_axi #(.N(N), .M(M), .C(C), .BIAS(BIAS), .O_BITS(O_BITS)) inst ( + thresholding_axi #(.N(N), .M(M), .C(C), .SIGNED(SIGNED), .BIAS(BIAS), .O_BITS(O_BITS)) inst ( //- Global Control ------------------ .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 9b02248185..af9e1173fb 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -286,9 +286,9 @@ def prepare_codegen_rtl_values(self): # Is the input datatype signed or unsigned? # The thresholding core needs to know this when comparing weights to inputs if self.get_input_datatype().signed(): - code_gen_dict["$SIGN$"] = ["signed"] + code_gen_dict["$SIGNED$"] = [str(1)] else: - code_gen_dict["$SIGN$"] = ["unsigned"] + code_gen_dict["$SIGNED$"] = [str(0)] return code_gen_dict From 3cd600cce8e1ff98161c55dce232d703173fa569 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 23 Mar 2023 16:20:49 +0000 Subject: [PATCH 088/235] [thresholding]: decouple thresholding core from axi wrapper by removing input parameters Signed-off-by: Fionn O'Donohoe --- finn-rtllib/thresholding/hdl/thresholding.sv | 6 ++++-- finn-rtllib/thresholding/hdl/thresholding_axi.sv | 8 +++++--- finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv index deff4fe0f8..52d0b41b33 100644 --- a/finn-rtllib/thresholding/hdl/thresholding.sv +++ b/finn-rtllib/thresholding/hdl/thresholding.sv @@ -51,8 +51,10 @@ module thresholding #( bit SIGNED, // signed inputs int BIAS, // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS) - int unsigned C_BITS, - int unsigned 
O_BITS + localparam int unsigned C_BITS = C < 2? 1 : $clog2(C), + localparam int unsigned O_BITS = BIAS >= 0? + /* unsigned */ $clog2(2**N+BIAS) : + /* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS) )( // Global Control input logic clk, diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv index 6099a64746..4bb3add13b 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -46,7 +46,10 @@ module thresholding_axi #( bit SIGNED, // signed inputs int BIAS, // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS) - int unsigned O_BITS + localparam int unsigned C_BITS = C < 2? 1 : $clog2(C), + localparam int unsigned O_BITS = BIAS >= 0? + /* unsigned */ $clog2(2**N+BIAS) : + /* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS) )( //- Global Control ------------------ input logic ap_clk, @@ -173,7 +176,6 @@ module thresholding_axi #( end : blkOutputDecouple - localparam int unsigned C_BITS = C < 2? 
1 : $clog2(C); uwire ivld = s_axis_tvalid; uwire [C_BITS-1:0] icnl; uwire [M -1:0] idat = s_axis_tdata[M-1:0]; @@ -198,7 +200,7 @@ module thresholding_axi #( end // Core Thresholding Module - thresholding #(.N(N), .M(M), .C(C), .SIGNED(SIGNED), .BIAS(BIAS), .O_BITS(O_BITS), .C_BITS(C_BITS)) core ( + thresholding #(.N(N), .M(M), .C(C), .SIGNED(SIGNED), .BIAS(BIAS)) core ( .clk, .rst, .twe, .twa, .twd, .en, diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v index caf850b5bc..da013b667a 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v +++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v @@ -85,7 +85,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( output [((O_BITS+7)/8)*8-1:0] out_V_TDATA ); - thresholding_axi #(.N(N), .M(M), .C(C), .SIGNED(SIGNED), .BIAS(BIAS), .O_BITS(O_BITS)) inst ( + thresholding_axi #(.N(N), .M(M), .C(C), .SIGNED(SIGNED), .BIAS(BIAS)) inst ( //- Global Control ------------------ .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), From 54afa637d2b7beac8beca99979e2d727385b90f3 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Tue, 28 Mar 2023 17:17:13 +0100 Subject: [PATCH 089/235] [thresholding]: patch in PE value to the thresholding AXI module and wrapper Signed-off-by: Fionn O'Donohoe --- .../thresholding/hdl/thresholding_axi.sv | 117 ++++++++++++------ .../hdl/thresholding_axi_wrapper.v | 7 +- .../thresholding_binary_search.py | 2 + 3 files changed, 82 insertions(+), 44 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv index 4bb3add13b..506e31b215 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -42,11 +42,14 @@ module thresholding_axi #( int unsigned N, // output precision int unsigned M, // input/threshold precision int unsigned C, // Channels + int unsigned PE, // Processing Parallelism, requires C = M*PE bit 
SIGNED, // signed inputs int BIAS, // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS) - localparam int unsigned C_BITS = C < 2? 1 : $clog2(C), + localparam int unsigned CF = 1 + (C-1)/PE, // Channel Fold + localparam int unsigned ADDR_BITS = $clog2(CF) + $clog2(PE) + N + 2, + localparam int unsigned C_BITS = C/PE < 2? 1 : $clog2(C/PE), localparam int unsigned O_BITS = BIAS >= 0? /* unsigned */ $clog2(2**N+BIAS) : /* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS) @@ -57,9 +60,9 @@ module thresholding_axi #( //- AXI Lite ------------------------ // Writing - input logic s_axilite_AWVALID, - output logic s_axilite_AWREADY, - input logic [$clog2(C)+N+1:0] s_axilite_AWADDR, // lowest 2 bits (byte selectors) are ignored + input logic s_axilite_AWVALID, + output logic s_axilite_AWREADY, + input logic [ADDR_BITS-1:0] s_axilite_AWADDR, // lowest 2 bits (byte selectors) are ignored input logic s_axilite_WVALID, output logic s_axilite_WREADY, @@ -83,33 +86,53 @@ module thresholding_axi #( //- AXI Stream - Input -------------- output logic s_axis_tready, input logic s_axis_tvalid, - input logic [((M+7)/8)*8-1:0] s_axis_tdata, + input logic [((PE*M+7)/8)*8-1:0] s_axis_tdata, //- AXI Stream - Output ------------- input logic m_axis_tready, output logic m_axis_tvalid, - output logic [((O_BITS+7)/8)*8-1:0] m_axis_tdata + output logic [((PE*O_BITS+7)/8)*8-1:0] m_axis_tdata ); + //- Parameter Constraints Checking -------------------------------------- + initial begin + if(C%PE != 0) begin + $error("%m: Channel count C=%0d is not a multiple of PE=%0d.", C, PE); + $finish; + end + end + //- Global Control ------------------------------------------------------ uwire clk = ap_clk; uwire rst = !ap_rst_n; //- AXI Lite: Threshold Configuration ----------------------------------- - uwire twe; - uwire [$clog2(C)+N-1:0] twa; - uwire [ M-1:0] twd; + uwire twe[PE]; + uwire [$clog2(CF)+N-1:0] twa; + uwire [ M-1:0] twd; if(1) begin : blkAxiLite logic WABusy = 0; logic WDBusy = 
0; - logic [$clog2(C)+N-1:0] Addr = 'x; - logic [ M-1:0] Data = 'x; + logic Sel[PE] = '{ default: 'x }; + logic [$clog2(CF)+N-1:0] Addr = 'x; + logic [ M-1:0] Data = 'x; - assign twe = WABusy && WDBusy; + for(genvar pe = 0; pe < PE; pe++) begin + assign twe[pe] = WABusy && WDBusy && Sel[pe]; + end assign twa = Addr; assign twd = Data; - uwire clr_wr = rst || (twe && s_axilite_BREADY); - always_ff @(posedge clk) begin : blockName + if(PE == 1) always_comb Sel[0] = 1; + else begin + always_ff @(posedge clk) begin + if(!WABusy) begin + foreach(Sel[pe]) Sel[pe] <= s_axilite_AWADDR[N+2+:$clog2(PE)] == pe; + end + end + end + + uwire clr_wr = rst || (WABusy && WDBusy && s_axilite_BREADY); + always_ff @(posedge clk) begin if(clr_wr) begin WABusy <= 0; Addr <= 'x; @@ -119,7 +142,8 @@ module thresholding_axi #( else begin if(!WABusy) begin WABusy <= s_axilite_AWVALID; - Addr <= s_axilite_AWADDR[$clog2(C)+N+1:2]; + Addr[0+:N] <= s_axilite_AWADDR[2+:N]; + if(C > 1) Addr[N+:$clog2(CF)] <= s_axilite_AWADDR[2+N+$clog2(PE)+:$clog2(CF)]; end if(!WDBusy) begin WDBusy <= s_axilite_WVALID; @@ -148,39 +172,48 @@ module thresholding_axi #( //- IO-Sandwich with two-stage output buffer for containing a local enable uwire en; - uwire [O_BITS-1:0] odat; - uwire ovld; + uwire [PE-1:0][O_BITS-1:0] odat; + uwire ovld[PE]; if(1) begin : blkOutputDecouple typedef struct { - logic vld; - logic [O_BITS-1:0] dat; + logic vld; + logic [PE-1:0][O_BITS-1:0] dat; } buf_t; - buf_t Buf[2] = '{ default: '{ vld: 0, dat: 'x } }; + buf_t A = '{ vld: 0, dat: 'x }; + buf_t B = '{ vld: 0, dat: 'x }; always_ff @(posedge clk) begin - if(rst) Buf <= '{ default: '{ vld: 0, dat: 'x } }; + if(rst) begin + A <= '{ vld: 0, dat: 'x }; + B <= '{ vld: 0, dat: 'x }; + end else begin - if(!Buf[1].vld || m_axis_tready) begin - Buf[1] <= '{ - vld: Buf[0].vld || ovld, - dat: Buf[0].vld? Buf[0].dat : odat + if(!B.vld || m_axis_tready) begin + B <= '{ + vld: A.vld || ovld[0], + dat: A.vld? 
A.dat : odat }; end - Buf[0].vld <= Buf[1].vld && !m_axis_tready && (Buf[0].vld || ovld); - if(!Buf[0].vld) Buf[0].dat <= odat; + A.vld <= B.vld && !m_axis_tready && (A.vld || ovld[0]); + if(!A.vld) A.dat <= odat; end end - assign en = !Buf[0].vld; + assign en = !A.vld; - assign m_axis_tvalid = Buf[1].vld; - assign m_axis_tdata = Buf[1].dat; + assign m_axis_tvalid = B.vld; + assign m_axis_tdata = B.dat; end : blkOutputDecouple + // localparam int unsigned C_BITS = C/PE < 2? 1 : $clog2(C/PE); uwire ivld = s_axis_tvalid; uwire [C_BITS-1:0] icnl; - uwire [M -1:0] idat = s_axis_tdata[M-1:0]; + uwire [M -1:0] idat[PE]; + for(genvar pe = 0; pe < PE; pe++) begin + assign idat[pe] = s_axis_tdata[pe*M+:M]; + end + assign s_axis_tready = en; - if(C == 1) assign icnl = 'x; + if(C == PE) assign icnl = 'x; else begin logic [C_BITS-1:0] Chnl = 0; logic Last = 0; @@ -193,19 +226,21 @@ module thresholding_axi #( end else if(inc) begin Chnl <= Chnl + 1; - Last <= (~Chnl & (C-2)) == 0; + Last <= (~Chnl & (C/PE-2)) == 0; end end assign icnl = Chnl; end - // Core Thresholding Module - thresholding #(.N(N), .M(M), .C(C), .SIGNED(SIGNED), .BIAS(BIAS)) core ( - .clk, .rst, - .twe, .twa, .twd, - .en, - .ivld, .icnl, .idat, - .ovld, .ocnl(), .odat - ); + // Core Thresholding Modules + for(genvar pe = 0; pe < PE; pe++) begin : genCores + thresholding #(.N(N), .M(M), .C(C/PE), .SIGNED(SIGNED), .BIAS(BIAS)) core ( + .clk, .rst, + .twe(twe[pe]), .twa, .twd, + .en, + .ivld, .icnl, .idat(idat[pe]), + .ovld(ovld[pe]), .ocnl(), .odat(odat[pe]) + ); + end : genCores endmodule : thresholding_axi diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v index da013b667a..c27480f388 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v +++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v @@ -38,6 +38,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter C = $C$, // Channels parameter SIGNED = $SIGNED$, // signed 
inputs int BIAS = $BIAS$, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) + parameter PE = $PE$, parameter O_BITS = BIAS > 0? /* unsigned */ $clog2(2**N-BIAS) : @@ -77,15 +78,15 @@ module $MODULE_NAME_AXI_WRAPPER$ #( //- AXI Stream - Input -------------- output in0_V_TREADY, input in0_V_TVALID, - input [((M+7)/8)*8-1:0] in0_V_TDATA, + input [((PE*M+7)/8)*8-1:0] in0_V_TDATA, //- AXI Stream - Output ------------- input out_V_TREADY, output out_V_TVALID, - output [((O_BITS+7)/8)*8-1:0] out_V_TDATA + output [((PE*O_BITS+7)/8)*8-1:0] out_V_TDATA ); - thresholding_axi #(.N(N), .M(M), .C(C), .SIGNED(SIGNED), .BIAS(BIAS)) inst ( + thresholding_axi #(.N(N), .M(M), .C(C), .PE(PE), .SIGNED(SIGNED), .BIAS(BIAS)) inst ( //- Global Control ------------------ .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index af9e1173fb..e2453fcaad 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -273,6 +273,7 @@ def prepare_codegen_rtl_values(self): ) # input/threshold precision num_channels = self.get_nodeattr("NumChannels") # number of channels bias = self.get_nodeattr("activation_bias") # activation bias value + pe = self.get_nodeattr("PE") code_gen_dict["$N$"] = [ str(DataType[output_data_type].bitwidth()) @@ -282,6 +283,7 @@ def prepare_codegen_rtl_values(self): ] # input/threshold precision - convert bitwidth to string code_gen_dict["$C$"] = [str(num_channels)] # number of channels code_gen_dict["$BIAS$"] = [str(bias)] # activation bias value + code_gen_dict["$PE$"] = [str(pe)] # requires C = M*PE # Is the input datatype signed or unsigned? 
# The thresholding core needs to know this when comparing weights to inputs From 29f9e1ce637d5df00cb1dae2ada3438070da0852 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Thu, 30 Mar 2023 15:43:36 +0100 Subject: [PATCH 090/235] [thresholding]: remove reset that erases the 0th stage threshold value There is a corner case when the number of channels configured for the thresholding core is 1 and the stage 0 threshold parameter has been programmed. For each other stage in this case (and all other cases) the threshold parameters are non-volatile. When a reset happens after programming the threshold parameters, all would still be intact except for the 0th stage threshold value. Signed-off-by: Fionn O'Donohoe --- finn-rtllib/thresholding/hdl/thresholding.sv | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv index 52d0b41b33..0ce95ed3f9 100644 --- a/finn-rtllib/thresholding/hdl/thresholding.sv +++ b/finn-rtllib/thresholding/hdl/thresholding.sv @@ -102,8 +102,7 @@ module thresholding #( if((C == 1) && (stage == 0)) begin logic [M-1:0] Thresh = 'x; always_ff @(posedge clk) begin - if(rst) Thresh <= 'x; - else if(we) Thresh <= twd; + if(we) Thresh <= twd; end assign thresh = Thresh; end From 2c4c8e224f8921848713f6d121532ff345c84fd0 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Fri, 31 Mar 2023 10:43:00 +0100 Subject: [PATCH 091/235] [thresholding]: enable PE testing of RTL thresholding binary search node Signed-off-by: Fionn O'Donohoe --- .../fpgadataflow/convert_to_hls_layers.py | 10 ---------- .../fpgadataflow/test_convert_to_hls_thresholding.py | 10 ++-------- .../test_fpgadataflow_thresholding_binary_search.py | 11 ++--------- 3 files changed, 4 insertions(+), 27 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index dedcc30a38..4c06a28b75 100644 --- 
a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -1137,16 +1137,6 @@ def apply(self, model): % node.name ) - # Check PE/SIMD value - if pe != 1: - warnings.warn( - """%s : RTL Thresholding does not support paralellisation. - Only a PE value of 1 is supported. - Falling back to HLS implementation.""" - % node.name - ) - is_rtl_variant_compatible = False - if self.use_rtl_variant and is_rtl_variant_compatible: new_node = helper.make_node( "Thresholding_Binary_Search", diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py index 09067564eb..895c82d4ca 100755 --- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py @@ -138,10 +138,10 @@ def make_single_multithresholding_modelwrapper( model.set_initializer("thresh", thresholds) return model - +# N.B. Fold values where C % PE != 0 fail @pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]]) @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) -@pytest.mark.parametrize("fold", [-1, 1, 2]) +@pytest.mark.parametrize("fold", [-1, 1, 2, 4, 6]) @pytest.mark.parametrize("num_input_channels", [16]) @pytest.mark.fpgadataflow @pytest.mark.vivado @@ -161,12 +161,6 @@ def test_convert_to_hls_tbs_rtl_variant( "RTL Thresholding Binary Search node" ) - # Paralellisation not supported for thresholding binary search rtl node - if pe != 1: - pytest.skip( - "Paralellisation not supported for RTL Thresholding Binary Search node" - ) - # Other non-input parameters num_input_vecs = [1, 2, 2] output_data_type = activation diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py index e57c4942c8..24b60f5ea5 100755 --- 
a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py @@ -186,10 +186,10 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): # Test brief: Create a Thresholding binary search layer using various parameters # and test against a SW generated & simulated dataset -# N.B. - fold factor of '-1' is supported only (no PE/SIMD support) +# N.B. Fold values where C % PE != 0 fail @pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]]) @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) -@pytest.mark.parametrize("fold", [-1, 1, 2]) +@pytest.mark.parametrize("fold", [-1, 1, 2, 4, 6]) @pytest.mark.parametrize("num_input_channels", [16]) @pytest.mark.fpgadataflow @pytest.mark.vivado @@ -201,13 +201,6 @@ def test_fpgadataflow_thresholding_binary_search( pe = generate_pe_value(fold, num_input_channels) num_steps = activation.get_num_possible_values() - 1 - # Paralellisation not supported for thresholding binary search rtl node - if pe != 1: - pytest.skip( - "Paralellisation of IP not supported for " - "RTL Thresholding Binary Search node" - ) - # Other non-input parameters num_input_vecs = [1, 2, 2] output_data_type = activation From 5d07a435c2994f0238fb41ec21381d75ea049796 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Fri, 31 Mar 2023 10:45:34 +0100 Subject: [PATCH 092/235] [thresholding]: add comment about why bipolar activations are skipped for thresholding binary search node Signed-off-by: Fionn O'Donohoe --- tests/fpgadataflow/test_convert_to_hls_thresholding.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py index 895c82d4ca..f2d76c8416 100755 --- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py @@ -155,6 +155,9 @@ def 
test_convert_to_hls_tbs_rtl_variant( pe = generate_pe_value(fold, num_input_channels) num_steps = activation.get_num_possible_values() - 1 + # See convert_to_hls_layers::InferThresholdingLayer: + # assert (not odt.signed()) or (actval < 0) + # This implies that it expects a negative activation, BIPOLAR does not provide that if activation == DataType["BIPOLAR"]: pytest.skip( "Only negative activations are supported for " From fcf579ce01075bbeb997580fbafc8cd9d64ed50c Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Fri, 31 Mar 2023 11:32:42 +0100 Subject: [PATCH 093/235] fix precommit issues Signed-off-by: Fionn O'Donohoe --- src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 2 +- tests/fpgadataflow/test_convert_to_hls_thresholding.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index e2453fcaad..694d25bfaa 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -283,7 +283,7 @@ def prepare_codegen_rtl_values(self): ] # input/threshold precision - convert bitwidth to string code_gen_dict["$C$"] = [str(num_channels)] # number of channels code_gen_dict["$BIAS$"] = [str(bias)] # activation bias value - code_gen_dict["$PE$"] = [str(pe)] # requires C = M*PE + code_gen_dict["$PE$"] = [str(pe)] # requires C = M*PE # Is the input datatype signed or unsigned? 
# The thresholding core needs to know this when comparing weights to inputs diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py index f2d76c8416..9c233bdd06 100755 --- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py @@ -138,6 +138,7 @@ def make_single_multithresholding_modelwrapper( model.set_initializer("thresh", thresholds) return model + # N.B. Fold values where C % PE != 0 fail @pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]]) @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) From 6c9d1f50177de5bb1c91eacc061d0aa8adb9cf56 Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 5 Apr 2023 16:28:55 +0100 Subject: [PATCH 094/235] [thresholding] only adjust MSB thresholding addressing bits when channel fold factor is present In the case where channel fold is not present (i.e. CF is 1), we saw incorrect threshold address programming. Without this commit and when no channel folding is present, this if statement is always stepped through and was damaging LSBs of the Addr signal, causing incorrect threshold address programming for a PE core. Although the logic in the if statement looks correct (i.e. programming 0 bits (clog2(CF) => 0)) and should not harm the Addr signal, it's best to avoid stepping through a case that does not exist (i.e., there is no channel folding and each channel has its own PE; therefore no extra bits are needed to program multiple channel thresholds into a single PE core). 
Signed-off-by: Fionn O'Donohoe --- finn-rtllib/thresholding/hdl/thresholding_axi.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv index 506e31b215..d2a7420a99 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -143,7 +143,7 @@ module thresholding_axi #( if(!WABusy) begin WABusy <= s_axilite_AWVALID; Addr[0+:N] <= s_axilite_AWADDR[2+:N]; - if(C > 1) Addr[N+:$clog2(CF)] <= s_axilite_AWADDR[2+N+$clog2(PE)+:$clog2(CF)]; + if(CF > 1) Addr[N+:$clog2(CF)] <= s_axilite_AWADDR[2+N+$clog2(PE)+:$clog2(CF)]; end if(!WDBusy) begin WDBusy <= s_axilite_WVALID; From b247ffbc258ec628a51c14822ec4343283ef5a2e Mon Sep 17 00:00:00 2001 From: Fionn O'Donohoe Date: Wed, 5 Apr 2023 19:44:36 +0100 Subject: [PATCH 095/235] [thresholding] update binary search to match qonnx 0.2.0 commit 65822357a7dba4f917c852d5f08bdebc7dd22e9d on dev moved all custom_ops to be compatible with qonnx 0.2.0 Signed-off-by: Fionn O'Donohoe --- src/finn/custom_op/fpgadataflow/thresholding_binary_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 694d25bfaa..d02b778823 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -60,8 +60,8 @@ class Thresholding_Binary_Search(HLSCustomOp): """Class that corresponds to finn-rtllib 'thresholding' function.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { From afab9cd6543b4fe1f612c329074d30d59706ac08 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 6 Apr 2023 12:34:01 +0100 Subject: [PATCH 
096/235] [rtl custom op]: initial implementation of mvu_8sx9 --- finn-rtllib/mvu/mvu_8sx9.sv | 284 ++++++++++++++++++++++++++++++++++++ 1 file changed, 284 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_8sx9.sv diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv new file mode 100644 index 0000000000..c992990d9f --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -0,0 +1,284 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). 
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP58. + *****************************************************************************/ + +module mvu_8sx9 #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + int unsigned SEGMENTLEN = 0 // Default to 0 (which implies a single segment) + ) + ( + input logic clk, + input logic rst, + input logic en, + input logic last, + input logic zero, + input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, + input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, + output logic vld, + output logic [PE-1:0][57:0] p + ); + +//-------------------- Declare global signals --------------------\\ +localparam int unsigned CHAINLEN = (SIMD+2)/3; +localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length +uwire [26:0] a_in_i [CHAINLEN]; +uwire [23:0] b_in_i [PE][CHAINLEN]; +uwire [57:0] pcout [PE][CHAINLEN]; + +//-------------------- Shift register for opmode select signal --------------------\\ +localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) +logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). 
Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) + +always_ff @(posedge clk) begin + if(rst) L <= '{default: 0}; + else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last }; +end +assign vld = L[0]; + +//-------------------- Shift register for ZERO flag --------------------\\ +logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) + +if (MAX_PIPELINE_STAGES > 1) begin : genZreg + always_ff @(posedge clk) begin + if (rst) Z <= '{default: 0}; + else if(en) begin + Z[0] <= zero; + if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2]; + end + end +end; + +//-------------------- Buffer for input activations --------------------\\ +localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; +typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t; + +for (genvar i=0; i1 ? TOTAL_PREGS-1 : 0; + + if (EXTERNAL_PREGS > 0) begin : genExternalPregAct + a_buffer_t A [0:EXTERNAL_PREGS-1]; + always_ff @(posedge clk) begin + if (rst) A <= '{default: 0}; + else if(en) begin + A[EXTERNAL_PREGS-1] <= a[3*i +: 3]; + if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; + end + end + assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]} + : { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ; + end : genExternalPregAct + else begin : genInpDSPAct + assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? 
{ {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]} + : { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ; + end : genInpDSPAct + +end : genActSIMD + +//-------------------- Buffer for weights --------------------\\ +localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH; +typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t; + +for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0; + + if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight + b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1]; + always_ff @(posedge clk) begin + if (rst) B <= '{default: 0}; + else if (en) begin + B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3]; + if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1]; + end + end + assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] }; + end : genExternalPregWeight + else begin : genInpDSPWeight + assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] }; + end : genInpDSPWeight + end : genWeightSIMD + +end : genWeightPE + +//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\ +for (genvar j=0; j0 ? 
2 : 1; // 1 : 0 + localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1; + localparam bit FIRST = i == 0; + localparam bit LAST = i == CHAINLEN-1; + uwire [57:0] pp; + + if (LAST) begin : genPOUT + assign p[j] = pp; + end + + DSP58 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("A"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for + // legacy mode. + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND(58'h000000000000000), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
+ .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK + .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE + .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE + .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 + FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN + 2'b01, // Y : M + 2'b01 // X: M + }), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA + .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC + .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM + .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(0), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) + .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between 
B/BCIN and BCOUT (0-2) + .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(1), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(PREG), // Number of pipeline stages for P (0-1) + .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). + ) + DSP58_inst ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(pcout[j][i]), // 58-bit output: Cascade output + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN(FIRST ? 'x : pcout[j][i-1]), // 58-bit input: P cascade + // Control inputs: Control Inputs/Status Bits + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .CLK(clk), // 1-bit input: Clock + .INMODE({ + INTERNAL_PREGS==2 ? 1'b0 : 1'b1, + 2'b00, + TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, + INTERNAL_PREGS==2 ? 
1'b0 : 1'b1 + }), // 5-bit input: INMODE control + .NEGATE('0), // 3-bit input: Negates the input of the multiplier + .OPMODE({ + LAST ? {1'b0, L[1]} : 2'b00, + 7'b000_0000 + }), // 9-bit input: Operation mode + // Data inputs: Data Ports + .A({ 7'bx, a_in_i[i] }), // 34-bit input: A data + .B(b_in_i[j][i]), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D('x), // 27-bit input: D data + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. + .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG + .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD('0), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(PREG && en), // 1-bit input: Clock enable for PREG + .RSTA(rst), // 1-bit input: Reset for AREG + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTB(rst), // 1-bit input: Reset for BREG + .RSTC('0), // 1-bit input: Reset for CREG + .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTD('0), // 1-bit input: Reset for DREG and ADREG + .RSTINMODE(rst), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(PREG && rst) // 1-bit input: Reset for PREG + ); + end : genDSPChain +end : genDSPPE + +endmodule From 
a94fc3bb0759ecd4b9af212d1629236894a1b520 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 6 Apr 2023 12:34:22 +0100 Subject: [PATCH 097/235] [rtl custom op]: testbench for mvu_8sx9 --- finn-rtllib/mvu/mvu_8sx9_tb.sv | 165 +++++++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_8sx9_tb.sv diff --git a/finn-rtllib/mvu/mvu_8sx9_tb.sv b/finn-rtllib/mvu/mvu_8sx9_tb.sv new file mode 100644 index 0000000000..ea3ecbbd70 --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx9_tb.sv @@ -0,0 +1,165 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). 
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU core compute kernel. + *****************************************************************************/ + +module mvu_8sx9_tb(); + + //-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MH = 256; + localparam int unsigned PE = 16; + localparam int unsigned MW = 600; + localparam int unsigned SIMD = 60; + localparam int unsigned SEGMENTLEN = 4; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + typedef logic signed [PE-1:0][57:0] output_t; + typedef output_t output_vector_t [NF]; + + function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); + automatic output_vector_t res = '{default: 0}; + for (int j = 0; j 1) && !rst; + end + + // Compare computed output against golden output when vld flag is raised by DUT + always_ff @(posedge clk iff (vld && en)) begin + foreach(p[i]) begin + assert ($signed(p[i]) == 
$signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + NF_CNT += 1; + end + + // Instantiate DUT + mvu_8sx9 #( + .PE(PE), + .SIMD(SIMD), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .SEGMENTLEN(SEGMENTLEN) + ) + dut ( + .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p + ); + +endmodule From 98f9accb40bed3445215e15d30398e09948e0b9f Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 6 Apr 2023 12:35:30 +0100 Subject: [PATCH 098/235] [rtl custom op]: initial implementation of flow control component for mvu_8sx9 --- finn-rtllib/mvu/mvu_8sx9_axi.sv | 179 ++++++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi.sv diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv new file mode 100644 index 0000000000..8765c50a26 --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx9_axi.sv @@ -0,0 +1,179 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) AXI-lite interface wrapper. 
+ *****************************************************************************/
+
+/**
+ * AXI-Stream flow-control wrapper around the mvu_8sx9 compute core.
+ *
+ * Data path, as wired below:
+ *  - Activations enter on s_axis_input and pass through a replay_buffer
+ *    instantiated with LEN=SF and REP=NF.
+ *  - Weights enter on s_axis_weights. An activation/weight pair is consumed
+ *    only in a cycle in which both are valid and the core is enabled (`en`).
+ *    Whenever no such pair is available, the core's `zero` input is asserted.
+ *  - Each of the PE 58-bit core results is truncated to ACCU_WIDTH bits and
+ *    passed through a two-stage output register slice (A -> B) that drives
+ *    m_axis_output.
+ *
+ * Parameters:
+ *  MW, MH              matrix width / height; must be multiples of SIMD / PE
+ *  PE                  number of results per output beat (output lanes)
+ *  SIMD                number of activations per input beat
+ *  ACTIVATION_WIDTH    bits per activation (<= 9, <= 8 when unsigned)
+ *  WEIGHT_WIDTH        bits per weight (<= 8)
+ *  ACCU_WIDTH          bits kept of each 58-bit core result (<= 58)
+ *  SIGNED_ACTIVATIONS  forwarded to the core
+ *  SEGMENTLEN          forwarded to the core; 0 defaults to the chain length
+ *  RAM_STYLE           forwarded to the activation replay buffer
+ *
+ * The *_BA localparams are the stream widths rounded up to a multiple of 8.
+ */
+module mvu_8sx9_axi #(
+    int unsigned MW,
+    int unsigned MH,
+    int unsigned PE,
+    int unsigned SIMD,
+    int unsigned ACTIVATION_WIDTH,
+    int unsigned WEIGHT_WIDTH,
+    int unsigned ACCU_WIDTH,
+    bit SIGNED_ACTIVATIONS = 0,
+    int unsigned SEGMENTLEN = 0,
+    parameter RAM_STYLE = "auto",
+
+    localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+    localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+    localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH,
+    localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH,
+    localparam int unsigned SF = MW/SIMD,
+    localparam int unsigned NF = MH/PE,
+    localparam int unsigned OUTPUT_LANES = PE,
+    localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+)
+(
+    // Global Control
+    input logic ap_clk,
+    input logic ap_rst_n,
+
+    // Weight Stream
+    input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata,
+    input logic s_axis_weights_tvalid,
+    output logic s_axis_weights_tready,
+
+    // Input Stream
+    input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata,
+    input logic s_axis_input_tvalid,
+    output logic s_axis_input_tready,
+
+    // Output Stream
+    output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata,
+    output logic m_axis_output_tvalid,
+    input logic m_axis_output_tready
+);
+
+//-------------------- Parameter sanity checks --------------------\\
+    initial begin
+        if (MW % SIMD != 0) begin
+            $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
+            $finish;
+        end
+        if (MH % PE != 0) begin
+            $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
+            $finish;
+        end
+        if (ACTIVATION_WIDTH > 9) begin
+            $error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH);
+            $finish;
+        end
+        if (WEIGHT_WIDTH > 8) begin
+            $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
+            $finish;
+        end
+        if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin
+            $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH);
+            $finish;
+        end
+        // The output slice selects v[ACCU_WIDTH-1:0] from a 58-bit value.
+        if (ACCU_WIDTH > 58) begin
+            $error("Accumulator width of %0d-bits exceeds the 58-bit core output", ACCU_WIDTH);
+            $finish;
+        end
+        if (SEGMENTLEN == 0) begin
+            $warning("Segment length of %0d defaults to chain length", SEGMENTLEN);
+        end
+        if (SEGMENTLEN > (SIMD+2)/3) begin
+            $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+            $finish;
+        end
+    end
+
+    uwire clk = ap_clk;
+    uwire rst = !ap_rst_n;    // internal reset is active-high
+
+    typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t;
+
+    // Activation stream after the replay buffer
+    uwire mvauin_t amvau;
+    uwire alast;
+    uwire afin;    // connected to the buffer's ofin output, not used in this module
+    uwire avld;
+    uwire ardy;
+
+    // The cast drops the byte-alignment padding bits of the input stream.
+    replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay (
+        .clk, .rst,
+        .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)),
+        .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
+    );
+
+    //-------------------- Input control --------------------\\
+    uwire en;
+    // Strobe: activation and weight are both valid in this cycle.
+    uwire istb = avld && s_axis_weights_tvalid;
+    // Each stream is only acknowledged when its partner is valid as well,
+    // so activations and weights always advance together.
+    assign ardy = en && s_axis_weights_tvalid;
+    assign s_axis_weights_tready = en && avld;
+
+    //-------------------- Core MVU --------------------\\
+    uwire ovld;
+    uwire [PE-1:0][57:0] odat;
+    typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t;
+    mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+        .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core (
+        .clk, .rst, .en,
+        .last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau),
+        .vld(ovld), .p(odat)
+    );
+
+    //-------------------- Output register slice --------------------\\
+    // Stage A captures the core result, stage B drives the output stream.
+    struct {
+        logic vld;
+        logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+    } A = '{ vld: 0, default: 'x};
+
+    // The core is stalled only while A is occupied and a further result is pending.
+    assign en = !A.vld || !ovld;
+
+    uwire b_load;
+    always_ff @(posedge clk) begin
+        if(rst) A <= '{ vld: 0, default: 'x };
+        else if(!A.vld || b_load) begin
+            A.vld <= ovld && en;
+            for(int unsigned i = 0; i < PE; i++) begin
+                // CR-1148862:
+                // A.dat[i] <= odat[i];
+                automatic logic [57:0] v = odat[i];
+                A.dat[i] <= v[ACCU_WIDTH-1:0];
+            end
+        end
+    end
+
+    struct {
+        logic vld;
+        logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+    } B = '{ vld: 0, default: 'x};
+
+    assign b_load = !B.vld || m_axis_output_tready;
+    always_ff @(posedge clk) begin
+        // The valid flag must reset to a defined 0 (as for stage A and as in
+        // the initializer above): it drives m_axis_output_tvalid and b_load.
+        if(rst) B <= '{ vld: 0, default: 'x };
+        else begin
+            if(b_load) B <= '{ vld: A.vld, dat: A.dat};
+        end
+    end
+
+    assign m_axis_output_tvalid = B.vld;
+    assign m_axis_output_tdata = B.dat;
+
+endmodule
\ No newline at end of file From 96925a929877ce084466438128678250b09784a9 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 6 Apr 2023 12:36:00 +0100 Subject: [PATCH 099/235] [rtl custom op]: implementation of replay buffer for mvu --- finn-rtllib/mvu/replay_buffer.sv | 109 +++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 finn-rtllib/mvu/replay_buffer.sv diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv new file mode 100644 index 0000000000..685ac03137 --- /dev/null +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -0,0 +1,109 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3.
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Replay buffer for counted sequences on an AXI-lite stream. + * @author Thomas B. 
Preußer
+ *****************************************************************************/
+
+// Replay buffer with valid/ready handshakes on both sides.
+//
+// During the first pass, LEN input words are forwarded from (ivld, irdy, idat)
+// to (ovld, ordy, odat) and, if REP > 1, recorded in a buffer. The recorded
+// sequence is then replayed from the buffer until REP passes have been output
+// in total; the input is not ready during the replay passes.
+// With REP == 1 no buffer is generated and the data simply passes through.
+//
+// olast is high while the counter points at the last word of a pass, ofin
+// while it points at the last word of the last pass. Both are derived from
+// the counter only and are not qualified by ovld.
+module replay_buffer #(
+    int unsigned LEN, // Sequence length
+    int unsigned REP, // Sequence replay count
+    int unsigned W, // Data width
+    parameter RAM_STYLE = "auto" // ram style for buffer {block, distributed, ultra, auto}
+)(
+    input logic clk,
+    input logic rst,
+
+    input logic [W-1:0] idat,
+    input logic ivld,
+    output logic irdy,
+
+    output logic [W-1:0] odat,
+    output logic olast,
+    output logic ofin,
+    output logic ovld,
+    input logic ordy
+);
+
+    // Count is split into two fields: the upper $clog2(REP) bits hold the
+    // index of the current pass, the lower $clog2(LEN) bits the position
+    // within the sequence.
+    // NOTE(review): for LEN == 1, $clog2(LEN) is 0 and the select
+    // Count[$clog2(LEN)-1:0] becomes Count[-1:0], which reaches outside the
+    // declared range when REP > 1 - confirm that the targeted tools accept this.
+    typedef logic [$clog2(REP)+$clog2(LEN)-1:0] count_t;
+    count_t Count = 0;
+    // True when every bit set in LEN-1 is also set in the position field.
+    uwire done_len = ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0;
+    uwire done_rep;
+    uwire done_all = done_len && done_rep;
+
+    uwire shift;    // a word is transferred on the output in this cycle
+    uwire clr = rst || (done_all && shift);
+    always_ff @(posedge clk) begin
+        if(clr) Count <= 0;
+        // At the end of a pass (REP > 1), the increment skips the unused codes of
+        // the position field so that it wraps to 0 and carries into the pass index.
+        else if(shift) Count <= Count + ((REP > 1) && done_len? 2**$clog2(LEN)-LEN+1 : 1);
+    end
+
+    typedef logic [W-1:0] data_t;
+    uwire data_t rdat;    // word read back from the buffer
+    uwire first_rep;      // high during the first (recording) pass
+    if(REP == 1) begin
+        // Single pass: nothing to record, nothing to replay.
+        assign done_rep = 1;
+        assign first_rep = 1;
+        assign rdat = 'x;
+    end
+    else begin
+        // True when every bit set in REP-1 is also set in the pass-index field.
+        assign done_rep = ((REP-1) & ~Count[$left(Count):$clog2(LEN)]) == 0;
+
+        // Set on clr, cleared by the transfer of the last word of the first pass.
+        logic FirstRep = 1;
+        always_ff @(posedge clk) begin
+            if(clr) FirstRep <= 1;
+            else if(shift) FirstRep <= FirstRep && !done_len;
+        end
+        assign first_rep = FirstRep;
+
+        (* RAM_STYLE = RAM_STYLE *)
+        data_t Buf[LEN];
+        if(LEN == 1) begin : genTrivial
+            // A single word is captured once during the first pass.
+            always_ff @(posedge clk) begin
+                if(shift && FirstRep) Buf[0] <= idat;
+            end
+        end : genTrivial
+        else begin : genShift
+            // Every transferred word enters at Buf[0] while the older entries move
+            // up by one index. During replay odat is rdat, so the contents recirculate.
+            always_ff @(posedge clk) begin
+                if(shift) Buf <= { odat, Buf[0:LEN-2] };
+            end
+        end : genShift
+
+        assign rdat = Buf[LEN-1];
+    end
+
+    // Input is only accepted during the first pass; replayed words are always valid.
+    assign irdy = ordy && first_rep;
+    assign odat = first_rep? idat : rdat;
+    assign olast = done_len;
+    assign ofin = done_all;
+    assign ovld = first_rep? ivld : 1;
+    assign shift = ovld && ordy;
+
+endmodule : replay_buffer
\ No newline at end of file From a3d11567468899bbcf33c83b509c26f908a807a3 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 6 Apr 2023 12:37:16 +0100 Subject: [PATCH 100/235] [rtl custom op]: testbench for mvu_8sx9_axi (including axi_wrapper & compute kernel) --- finn-rtllib/mvu/mvu_8sx9_axi_tb.sv | 208 +++++++++++++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv new file mode 100644 index 0000000000..ea97e0708c --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv @@ -0,0 +1,208 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU AXI-lite interface wrapper. + *****************************************************************************/ + +module mvu_8sx9_axi_tb(); + + //-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MW = 600; + localparam int unsigned MH = 256; + localparam int unsigned SIMD = 60; + localparam int unsigned PE = 16; + localparam int unsigned SEGMENTLEN = 4; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n = 0; + initial begin + repeat(16) @(posedge clk); + ap_rst_n <= 1; + end + + uwire 
ap_clk = clk; + + // Generate activations + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } activations; + + initial begin + activations.vld = 0; + activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain + @(posedge clk iff ap_rst_n); + + for (int i=0; i 1; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i= 1; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_8sx9_axi #( + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + +endmodule From 2aea664b2260a4ea759909d0a3168b5f62b114a2 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 6 Apr 2023 12:37:55 +0100 Subject: [PATCH 101/235] [rtl custom op]: initial implementation of verilog wrapper for mvu_8sx9_axi --- finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 90 ++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v new file mode 100644 index 0000000000..ff3779d211 --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v @@ -0,0 +1,90 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Verilog AXI-lite wrapper for MVU. 
+ *****************************************************************************/
+
+// Plain-Verilog wrapper that instantiates mvu_8sx9_axi and forwards all
+// parameters and AXI-Stream ports one-to-one.
+//
+// This file is a template: tokens of the form $NAME$ are not valid Verilog and
+// are presumably substituted by the code generator before the file is used.
+// NOTE(review): mvu_8sx9_axi defaults RAM_STYLE to the string "auto"; confirm
+// that the value substituted for $IBUF_RAM_STYLE$ carries its own quotes.
+module $MODULE_NAME_AXI_WRAPPER$ #(
+    parameter MW = $MW$,
+    parameter MH = $MH$,
+    parameter PE = $PE$,
+    parameter SIMD = $SIMD$,
+    parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
+    parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$,
+    parameter ACCU_WIDTH = $ACCU_WIDTH$,
+    parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
+    parameter SEGMENTLEN = $SEGMENTLEN$,
+    parameter RAM_STYLE = $IBUF_RAM_STYLE$,
+
+    // Safely deducible parameters
+    // (stream widths rounded up to a multiple of 8 bits)
+    parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8,
+    parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8,
+    parameter OUTPUT_LANES = PE,
+    parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8
+)(
+    // Ports are declared without the SystemVerilog-only `logic` keyword so
+    // that the file is valid Verilog, matching its .v extension.
+
+    // Global Control
+    input ap_clk,
+    input ap_rst_n,
+
+    // Weight Stream
+    input [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata,
+    input s_axis_weights_tvalid,
+    output s_axis_weights_tready,
+
+    // Input Stream
+    input [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata,
+    input s_axis_input_tvalid,
+    output s_axis_input_tready,
+
+    // Output Stream
+    output [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata,
+    output m_axis_output_tvalid,
+    input m_axis_output_tready
+);
+
+mvu_8sx9_axi #(
+    .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+    .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+    .SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE)
+    ) inst (
+    .ap_clk(ap_clk),
+    .ap_rst_n(ap_rst_n),
+    .s_axis_weights_tdata(s_axis_weights_tdata),
+    .s_axis_weights_tvalid(s_axis_weights_tvalid),
+    .s_axis_weights_tready(s_axis_weights_tready),
+    .s_axis_input_tdata(s_axis_input_tdata),
+    .s_axis_input_tvalid(s_axis_input_tvalid),
+    .s_axis_input_tready(s_axis_input_tready),
+    .m_axis_output_tdata(m_axis_output_tdata),
+    .m_axis_output_tvalid(m_axis_output_tvalid),
+    .m_axis_output_tready(m_axis_output_tready)
+);
+
+endmodule // $MODULE_NAME_AXI_WRAPPER$
\ No newline at end of file From 8b57849bb47c3119b177e78dcbaa48954f69b811 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 11 Apr 2023 15:50:24 +0100 Subject: [PATCH 102/235] [rtl mvu]: fix tab indentation --- finn-rtllib/mvu/mvu_8sx9.sv | 424 ++++++++++++------------- finn-rtllib/mvu/mvu_8sx9_axi.sv | 32 +- finn-rtllib/mvu/mvu_8sx9_axi_tb.sv | 342 ++++++++++---------- finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 26 +- finn-rtllib/mvu/mvu_8sx9_tb.sv | 258 +++++++-------- 5 files changed, 541 insertions(+), 541 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index c992990d9f..d082d4fb2e 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -52,233 +52,233 @@ module mvu_8sx9 #( ); //-------------------- Declare global signals --------------------\\ -localparam int unsigned CHAINLEN = (SIMD+2)/3; -localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length -uwire [26:0] a_in_i [CHAINLEN]; -uwire [23:0] b_in_i [PE][CHAINLEN]; -uwire [57:0] pcout [PE][CHAINLEN]; + localparam int unsigned CHAINLEN = (SIMD+2)/3; + localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length + uwire [26:0] a_in_i [CHAINLEN]; + uwire [23:0] b_in_i [PE][CHAINLEN]; + uwire [57:0] pcout [PE][CHAINLEN]; //-------------------- Shift register for opmode select signal --------------------\\ -localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) -logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg).
Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) + localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) + logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) -always_ff @(posedge clk) begin - if(rst) L <= '{default: 0}; - else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last }; -end -assign vld = L[0]; + always_ff @(posedge clk) begin + if(rst) L <= '{default: 0}; + else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last }; + end + assign vld = L[0]; //-------------------- Shift register for ZERO flag --------------------\\ -logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) + logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) -if (MAX_PIPELINE_STAGES > 1) begin : genZreg - always_ff @(posedge clk) begin - if (rst) Z <= '{default: 0}; - else if(en) begin - Z[0] <= zero; - if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2]; - end - end -end; + if (MAX_PIPELINE_STAGES > 1) begin : genZreg + always_ff @(posedge clk) begin + if (rst) Z <= '{default: 0}; + else if(en) begin + Z[0] <= zero; + if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2]; + end + end + end; //-------------------- Buffer for input activations --------------------\\ -localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; -typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t; + localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; + typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t; -for (genvar i=0; i1 ? 
TOTAL_PREGS-1 : 0; - - if (EXTERNAL_PREGS > 0) begin : genExternalPregAct - a_buffer_t A [0:EXTERNAL_PREGS-1]; - always_ff @(posedge clk) begin - if (rst) A <= '{default: 0}; - else if(en) begin - A[EXTERNAL_PREGS-1] <= a[3*i +: 3]; - if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; - end - end - assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]} - : { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ; - end : genExternalPregAct - else begin : genInpDSPAct - assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]} - : { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ; - end : genInpDSPAct + for (genvar i=0; i1 ? TOTAL_PREGS-1 : 0; -end : genActSIMD + if (EXTERNAL_PREGS > 0) begin : genExternalPregAct + a_buffer_t A [0:EXTERNAL_PREGS-1]; + always_ff @(posedge clk) begin + if (rst) A <= '{default: 0}; + else if(en) begin + A[EXTERNAL_PREGS-1] <= a[3*i +: 3]; + if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; + end + end + assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]} + : { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ; + end : genExternalPregAct + else begin : genInpDSPAct + assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? 
{ {PAD_BITS_ACT{a[3*i+2][ACTIVATION_WIDTH-1]}}, a[3*i+2], {PAD_BITS_ACT{a[3*i+1][ACTIVATION_WIDTH-1]}}, a[3*i+1], {PAD_BITS_ACT{a[3*i][ACTIVATION_WIDTH-1]}}, a[3*i]} + : { {PAD_BITS_ACT{1'b0}}, a[3*i+2], {PAD_BITS_ACT{1'b0}}, a[3*i+1], {PAD_BITS_ACT{1'b0}}, a[3*i]} ; + end : genInpDSPAct + + end : genActSIMD //-------------------- Buffer for weights --------------------\\ -localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH; -typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t; + localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH; + typedef logic [2:0][WEIGHT_WIDTH-1:0] b_buffer_t; -for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0; - - if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight - b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1]; - always_ff @(posedge clk) begin - if (rst) B <= '{default: 0}; - else if (en) begin - B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3]; - if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1]; - end - end - assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] }; - end : genExternalPregWeight - else begin : genInpDSPWeight - assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] }; - end : genInpDSPWeight - end : genWeightSIMD + for (genvar j=0; j1 ? 
TOTAL_PREGS-1 : 0; -end : genWeightPE + if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight + b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1]; + always_ff @(posedge clk) begin + if (rst) B <= '{default: 0}; + else if (en) begin + B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3]; + if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1]; + end + end + assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] }; + end : genExternalPregWeight + else begin : genInpDSPWeight + assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] }; + end : genInpDSPWeight + end : genWeightSIMD + + end : genWeightPE //-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\ -for (genvar j=0; j0 ? 2 : 1; // 1 : 0 - localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1; - localparam bit FIRST = i == 0; - localparam bit LAST = i == CHAINLEN-1; - uwire [57:0] pp; - - if (LAST) begin : genPOUT - assign p[j] = pp; - end - - DSP58 #( - // Feature Control Attributes: Data Path Selection - .AMULTSEL("A"), // Selects A input to multiplier (A, AD) - .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) - .BMULTSEL("B"), // Selects B input to multiplier (AD, B) - .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) - .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for - // legacy mode. 
- .PREADDINSEL("A"), // Selects input to pre-adder (A, B) - .RND(58'h000000000000000), // Rounding Constant - .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) - .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) - .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) - .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) - // Pattern Detector Attributes: Pattern Detection Configuration - .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH - .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). - .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) - .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect - .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 - .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) - .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) - // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins - .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE - .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN - .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK - .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE - .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE - .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 - FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 
0 : PCIN - 2'b01, // Y : M - 2'b01 // X: M - }), // Optional inversion for OPMODE - .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN - .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE - .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA - .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB - .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A - .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC - .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD - .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE - .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM - .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP - // Register Control Attributes: Pipeline Register Configuration - .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) - .ADREG(0), // Pipeline stages for pre-adder (0-1) - .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) - .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) - .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between B/BCIN and BCOUT (0-2) - .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) - .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) - .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) - .CREG(0), // Pipeline stages for C (0-1) - .DREG(0), // Pipeline stages for D (0-1) - .INMODEREG(1), // Pipeline stages for INMODE (0-1) - .MREG(1), // Multiplier pipeline stages (0-1) - .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) - .PREG(PREG), // Number of pipeline stages for P (0-1) - .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). 
- ) - DSP58_inst ( - // Cascade outputs: Cascade Ports - .ACOUT(), // 34-bit output: A port cascade - .BCOUT(), // 24-bit output: B cascade - .CARRYCASCOUT(), // 1-bit output: Cascade carry - .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade - .PCOUT(pcout[j][i]), // 58-bit output: Cascade output - // Control outputs: Control Inputs/Status Bits - .OVERFLOW(), // 1-bit output: Overflow in add/acc - .PATTERNBDETECT(), // 1-bit output: Pattern bar detect - .PATTERNDETECT(), // 1-bit output: Pattern detect - .UNDERFLOW(), // 1-bit output: Underflow in add/acc - // Data outputs: Data Ports - .CARRYOUT(), // 4-bit output: Carry - .P(pp), // 58-bit output: Primary data - .XOROUT(), // 8-bit output: XOR data - // Cascade inputs: Cascade Ports - .ACIN('x), // 34-bit input: A cascade data - .BCIN('x), // 24-bit input: B cascade - .CARRYCASCIN('x), // 1-bit input: Cascade carry - .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade - .PCIN(FIRST ? 'x : pcout[j][i-1]), // 58-bit input: P cascade - // Control inputs: Control Inputs/Status Bits - .ALUMODE(4'h0), // 4-bit input: ALU control - .CARRYINSEL('0), // 3-bit input: Carry select - .CLK(clk), // 1-bit input: Clock - .INMODE({ - INTERNAL_PREGS==2 ? 1'b0 : 1'b1, - 2'b00, - TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, - INTERNAL_PREGS==2 ? 1'b0 : 1'b1 - }), // 5-bit input: INMODE control - .NEGATE('0), // 3-bit input: Negates the input of the multiplier - .OPMODE({ - LAST ? {1'b0, L[1]} : 2'b00, - 7'b000_0000 - }), // 9-bit input: Operation mode - // Data inputs: Data Ports - .A({ 7'bx, a_in_i[i] }), // 34-bit input: A data - .B(b_in_i[j][i]), // 24-bit input: B data - .C('x), // 58-bit input: C data - .CARRYIN('0), // 1-bit input: Carry-in - .D('x), // 27-bit input: D data - // Reset/Clock Enable inputs: Reset/Clock Enable Inputs - .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. - .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG - .CEA2(INTERNAL_PREGS==2 ? 
en : '0), // 1-bit input: Clock enable for 2nd stage AREG - .CEAD('0), // 1-bit input: Clock enable for ADREG - .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE - .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG - .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG - .CEC('0), // 1-bit input: Clock enable for CREG - .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG - .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG - .CED('0), // 1-bit input: Clock enable for DREG - .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG - .CEM(en), // 1-bit input: Clock enable for MREG - .CEP(PREG && en), // 1-bit input: Clock enable for PREG - .RSTA(rst), // 1-bit input: Reset for AREG - .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG - .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG - .RSTB(rst), // 1-bit input: Reset for BREG - .RSTC('0), // 1-bit input: Reset for CREG - .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG - .RSTD('0), // 1-bit input: Reset for DREG and ADREG - .RSTINMODE(rst), // 1-bit input: Reset for INMODE register - .RSTM(rst), // 1-bit input: Reset for MREG - .RSTP(PREG && rst) // 1-bit input: Reset for PREG - ); - end : genDSPChain -end : genDSPPE + for (genvar j=0; j0 ? 2 : 1; // 1 : 0 + localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1; + localparam bit FIRST = i == 0; + localparam bit LAST = i == CHAINLEN-1; + uwire [57:0] pp; + + if (LAST) begin : genPOUT + assign p[j] = pp; + end + + DSP58 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("A"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. 
Set to INT24 for + // legacy mode. + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND(58'h000000000000000), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). + .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK + .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE + .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE + .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 + FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 
0 : PCIN + 2'b01, // Y : M + 2'b01 // X: M + }), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA + .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC + .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM + .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(0), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) + .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(1), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(PREG), // Number of pipeline stages for P (0-1) + .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). 
+ ) + DSP58_inst ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(pcout[j][i]), // 58-bit output: Cascade output + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN(FIRST ? 'x : pcout[j][i-1]), // 58-bit input: P cascade + // Control inputs: Control Inputs/Status Bits + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .CLK(clk), // 1-bit input: Clock + .INMODE({ + INTERNAL_PREGS==2 ? 1'b0 : 1'b1, + 2'b00, + TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, + INTERNAL_PREGS==2 ? 1'b0 : 1'b1 + }), // 5-bit input: INMODE control + .NEGATE('0), // 3-bit input: Negates the input of the multiplier + .OPMODE({ + LAST ? {1'b0, L[1]} : 2'b00, + 7'b000_0000 + }), // 9-bit input: Operation mode + // Data inputs: Data Ports + .A({ 7'bx, a_in_i[i] }), // 34-bit input: A data + .B(b_in_i[j][i]), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D('x), // 27-bit input: D data + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. + .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG + .CEA2(INTERNAL_PREGS==2 ? 
en : '0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD('0), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(PREG && en), // 1-bit input: Clock enable for PREG + .RSTA(rst), // 1-bit input: Reset for AREG + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTB(rst), // 1-bit input: Reset for BREG + .RSTC('0), // 1-bit input: Reset for CREG + .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTD('0), // 1-bit input: Reset for DREG and ADREG + .RSTINMODE(rst), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(PREG && rst) // 1-bit input: Reset for PREG + ); + end : genDSPChain + end : genDSPPE endmodule diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv index 8765c50a26..6c7eaeaeca 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi.sv +++ b/finn-rtllib/mvu/mvu_8sx9_axi.sv @@ -41,36 +41,36 @@ module mvu_8sx9_axi #( int unsigned ACCU_WIDTH, bit SIGNED_ACTIVATIONS = 0, int unsigned SEGMENTLEN = 0, - parameter RAM_STYLE = "auto", + parameter RAM_STYLE = "auto", localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, - localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH, + 
localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH, localparam int unsigned SF = MW/SIMD, - localparam int unsigned NF = MH/PE, + localparam int unsigned NF = MH/PE, localparam int unsigned OUTPUT_LANES = PE, localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 ) ( // Global Control - input logic ap_clk, - input logic ap_rst_n, + input logic ap_clk, + input logic ap_rst_n, // Weight Stream - input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - input logic s_axis_weights_tvalid, + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, output logic s_axis_weights_tready, // Input Stream - input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input logic s_axis_input_tvalid, + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, output logic s_axis_input_tready, // Output Stream output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, output logic m_axis_output_tvalid, - input logic m_axis_output_tready + input logic m_axis_output_tready ); //-------------------- Parameter sanity checks --------------------\\ @@ -121,13 +121,13 @@ module mvu_8sx9_axi #( .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) ); - //-------------------- Input control --------------------\\ +//-------------------- Input control --------------------\\ uwire en; uwire istb = avld && s_axis_weights_tvalid; assign ardy = en && s_axis_weights_tvalid; assign s_axis_weights_tready = en && avld; - //-------------------- Core MVU --------------------\\ +//-------------------- Core MVU --------------------\\ uwire ovld; uwire [PE-1:0][57:0] odat; typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; @@ -138,7 +138,7 @@ module mvu_8sx9_axi #( .vld(ovld), .p(odat) ); - //-------------------- Output register slice --------------------\\ 
+//-------------------- Output register slice --------------------\\ struct { logic vld; logic [PE-1:0][ACCU_WIDTH-1:0] dat; @@ -148,7 +148,7 @@ module mvu_8sx9_axi #( uwire b_load; always_ff @(posedge clk) begin - if(rst) A <= '{ vld: 0, default: 'x }; + if(rst) A <= '{ vld: 0, default: 'x }; else if(!A.vld || b_load) begin A.vld <= ovld && en; for(int unsigned i = 0; i < PE; i++) begin @@ -169,7 +169,7 @@ module mvu_8sx9_axi #( always_ff @(posedge clk) begin if(rst) B <= '{ default: 'x }; else begin - if(b_load) B <= '{ vld: A.vld, dat: A.dat}; + if(b_load) B <= '{ vld: A.vld, dat: A.dat}; end end diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv index ea97e0708c..70ffa096ef 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv +++ b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv @@ -33,176 +33,176 @@ module mvu_8sx9_axi_tb(); - //-------------------- Simulation parameters --------------------\\ - // Matrix & parallelism config - localparam int unsigned MW = 600; - localparam int unsigned MH = 256; - localparam int unsigned SIMD = 60; - localparam int unsigned PE = 16; - localparam int unsigned SEGMENTLEN = 4; - // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int unsigned WEIGHT_WIDTH = 4; - localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); - localparam bit SIGNED_ACTIVATIONS = 1; - // Simulation constants - localparam int unsigned NF = MH/PE; - localparam int unsigned SF = MW/SIMD; - localparam int unsigned NUM_OF_DSP = SIMD/3; - localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; - localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; - localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; - localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; - - // Generate clk and reset signal - logic 
clk = 0; - always #5ns clk = !clk; - - logic ap_rst_n = 0; - initial begin - repeat(16) @(posedge clk); - ap_rst_n <= 1; - end - - uwire ap_clk = clk; - - // Generate activations - typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[SF]; - - function activation_vector_t init_ACTIVATIONS; - automatic activation_vector_t res; - std::randomize(res); - return res; - endfunction : init_ACTIVATIONS - - activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); - - struct { - activation_t dat; - logic vld; - logic rdy; - } activations; - - initial begin - activations.vld = 0; - activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain - @(posedge clk iff ap_rst_n); - - for (int i=0; i 1; - @(posedge clk); - end while (!(activations.vld === 1 && activations.rdy === 1)); - end - - activations.vld <= 0; - activations.dat <= 'x; - end - - // Generate weights - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; - - function weight_matrix_t init_WEIGHTS; - automatic weight_matrix_t res; - std::randomize(res); - return res; - endfunction : init_WEIGHTS; - - weight_matrix_t WEIGHTS = init_WEIGHTS(); - - struct { - weight_t dat; - logic vld; - logic rdy; - } weights; - - initial begin - weights.vld = 0; - weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain - @(posedge clk iff ap_rst_n); - - weights.vld <= 1; - for (int i=0; i= 1; - @(posedge clk iff ap_rst_n); - end while (!(outputs.rdy === 1 && outputs.vld === 1)); - - // Compare produced outputs against golden outputs - foreach(outputs.dat[i]) begin - assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin - $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - $stop; - end - end - - NF_CNT += 1; - end - - $finish; - end - - // Instantiate DUT - mvu_8sx9_axi #( - .MW(MW), - .MH(MH), - .PE(PE), - .SIMD(SIMD), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN) - ) - dut ( - .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), - .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), - .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), - .m_axis_output_tready(outputs.rdy) - ); +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MW = 600; + localparam int unsigned MH = 256; + localparam int unsigned SIMD = 60; + localparam int unsigned PE = 16; + localparam int unsigned SEGMENTLEN = 4; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned 
ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n = 0; + initial begin + repeat(16) @(posedge clk); + ap_rst_n <= 1; + end + + uwire ap_clk = clk; + + // Generate activations + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } activations; + + initial begin + activations.vld = 0; + activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain + @(posedge clk iff ap_rst_n); + + for (int i=0; i 1; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i= 1; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == 
$signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_8sx9_axi #( + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); endmodule diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v index ff3779d211..2456eb3a47 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v @@ -33,7 +33,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter MW = $MW$, - parameter MH = $MH$, + parameter MH = $MH$, parameter PE = $PE$, parameter SIMD = $SIMD$, parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, @@ -44,29 +44,29 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter RAM_STYLE = $IBUF_RAM_STYLE$, // Safely deducible parameters - parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, - parameter OUTPUT_LANES = PE, - parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 + parameter WEIGHT_STREAM_WIDTH_BA = 
(PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, + parameter OUTPUT_LANES = PE, + parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 )( - // Global Control - input logic ap_clk, - input logic ap_rst_n, + // Global Control + input logic ap_clk, + input logic ap_rst_n, // Weight Stream - input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - input logic s_axis_weights_tvalid, + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, output logic s_axis_weights_tready, // Input Stream - input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input logic s_axis_input_tvalid, + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, output logic s_axis_input_tready, // Output Stream output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, output logic m_axis_output_tvalid, - input logic m_axis_output_tready + input logic m_axis_output_tready ); mvu_8sx9_axi #( diff --git a/finn-rtllib/mvu/mvu_8sx9_tb.sv b/finn-rtllib/mvu/mvu_8sx9_tb.sv index ea3ecbbd70..adf6a8f9c2 100644 --- a/finn-rtllib/mvu/mvu_8sx9_tb.sv +++ b/finn-rtllib/mvu/mvu_8sx9_tb.sv @@ -33,133 +33,133 @@ module mvu_8sx9_tb(); - //-------------------- Simulation parameters --------------------\\ - // Matrix & parallelism config - localparam int unsigned MH = 256; - localparam int unsigned PE = 16; - localparam int unsigned MW = 600; - localparam int unsigned SIMD = 60; - localparam int unsigned SEGMENTLEN = 4; - // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int unsigned WEIGHT_WIDTH = 4; - localparam bit SIGNED_ACTIVATIONS = 1; - // Simulation constants - localparam int unsigned NF = MH/PE; - localparam int unsigned SF = MW/SIMD; - localparam int unsigned NUM_OF_DSP = SIMD/3; - - typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[SF]; - - function 
activation_vector_t init_ACTIVATIONS; - automatic activation_vector_t res; - std::randomize(res); - return res; - endfunction : init_ACTIVATIONS - - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; - - function weight_matrix_t init_WEIGHTS; - automatic weight_matrix_t res; - std::randomize(res); - return res; - endfunction : init_WEIGHTS; - - typedef logic signed [PE-1:0][57:0] output_t; - typedef output_t output_vector_t [NF]; - - function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); - automatic output_vector_t res = '{default: 0}; - for (int j = 0; j 1) && !rst; - end - - // Compare computed output against golden output when vld flag is raised by DUT - always_ff @(posedge clk iff (vld && en)) begin - foreach(p[i]) begin - assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin - $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - $stop; - end - end - NF_CNT += 1; - end - - // Instantiate DUT - mvu_8sx9 #( - .PE(PE), - .SIMD(SIMD), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .SEGMENTLEN(SEGMENTLEN) - ) - dut ( - .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p - ); - +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MH = 256; + localparam int unsigned PE = 16; + localparam int unsigned MW = 600; + localparam int unsigned SIMD = 60; + localparam int unsigned SEGMENTLEN = 4; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + typedef logic signed [PE-1:0][57:0] output_t; + typedef output_t output_vector_t [NF]; + + function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); + automatic output_vector_t res = '{default: 0}; + for (int j = 0; j 1) && !rst; + end + + // Compare computed output against golden output when vld flag is raised by DUT + always_ff @(posedge clk iff (vld && en)) begin + foreach(p[i]) begin + assert ($signed(p[i]) == 
$signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + NF_CNT += 1; + end + + // Instantiate DUT + mvu_8sx9 #( + .PE(PE), + .SIMD(SIMD), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .SEGMENTLEN(SEGMENTLEN) + ) + dut ( + .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p + ); + endmodule From 5e61f42afd991233153ee8b7fe0fb6e9e8ac562d Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 12 Apr 2023 08:54:45 +0100 Subject: [PATCH 103/235] [rtl custom op]: fix to indentation --- finn-rtllib/mvu/mvu_8sx9_axi.sv | 54 ++++++++++++++++----------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv index 6c7eaeaeca..5f215927d8 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi.sv +++ b/finn-rtllib/mvu/mvu_8sx9_axi.sv @@ -32,25 +32,25 @@ *****************************************************************************/ module mvu_8sx9_axi #( - int unsigned MW, - int unsigned MH, - int unsigned PE, - int unsigned SIMD, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - int unsigned ACCU_WIDTH, - bit SIGNED_ACTIVATIONS = 0, - int unsigned SEGMENTLEN = 0, + int unsigned MW, + int unsigned MH, + int unsigned PE, + int unsigned SIMD, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + int unsigned SEGMENTLEN = 0, parameter RAM_STYLE = "auto", - localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = 
(PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH, - localparam int unsigned SF = MW/SIMD, + localparam int unsigned SF = MW/SIMD, localparam int unsigned NF = MH/PE, - localparam int unsigned OUTPUT_LANES = PE, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 + localparam int unsigned OUTPUT_LANES = PE, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 ) ( // Global Control @@ -76,31 +76,31 @@ module mvu_8sx9_axi #( //-------------------- Parameter sanity checks --------------------\\ initial begin if (MW % SIMD != 0) begin - $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); - $finish; + $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); + $finish; end if (MH % PE != 0) begin - $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); - $finish; + $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); + $finish; end if (ACTIVATION_WIDTH > 9) begin - $error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH); - $finish; + $error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH); + $finish; end if (WEIGHT_WIDTH > 8) begin - $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); - $finish; + $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); + $finish; end if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin - $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH); - $finish; + $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH); + $finish; end if (SEGMENTLEN == 0) begin - $warning("Segment length of %0d defaults to chain 
length", SEGMENTLEN); + $warning("Segment length of %0d defaults to chain length", SEGMENTLEN); end if (SEGMENTLEN > (SIMD+2)/3) begin - $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); - $finish; + $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + $finish; end end From cbee193d746763044a870bdf1af248bbe8d31156 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 12 Apr 2023 14:33:13 +0100 Subject: [PATCH 104/235] [rtl custom-op]: minor changes for compiler integration --- finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v index 2456eb3a47..502a72d3f2 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v @@ -41,7 +41,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter ACCU_WIDTH = $ACCU_WIDTH$, parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, parameter SEGMENTLEN = $SEGMENTLEN$, - parameter RAM_STYLE = $IBUF_RAM_STYLE$, + parameter RAM_STYLE = "$IBUF_RAM_STYLE$", // Safely deducible parameters parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, @@ -85,6 +85,6 @@ mvu_8sx9_axi #( .m_axis_output_tdata(m_axis_output_tdata), .m_axis_output_tvalid(m_axis_output_tvalid), .m_axis_output_tready(m_axis_output_tready) -) +); -endmodule : mvau_8sx9_axi_wrapper \ No newline at end of file +endmodule : $MODULE_NAME_AXI_WRAPPER$ \ No newline at end of file From ba5e77bde008fff2a445d6ef469072dd67f67f42 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 12 Apr 2023 23:26:05 +0100 Subject: [PATCH 105/235] [rtl custom op]: moved testbenches to separate directory --- finn-rtllib/mvu/tb/mvu_8sx9_tb.sv | 165 +++++++++++++++++++++++ finn-rtllib/mvu/tb/mvu_axi_tb.sv | 213 ++++++++++++++++++++++++++++++ 2 files changed, 378 insertions(+) create mode 100644 finn-rtllib/mvu/tb/mvu_8sx9_tb.sv create mode 100644 
finn-rtllib/mvu/tb/mvu_axi_tb.sv diff --git a/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv new file mode 100644 index 0000000000..c8bfe5370a --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv @@ -0,0 +1,165 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU core compute kernel. 
+ *****************************************************************************/ + +module mvu_8sx9_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MH = 256; + localparam int unsigned PE = 16; + localparam int unsigned MW = 600; + localparam int unsigned SIMD = 60; + localparam int unsigned SEGMENTLEN = 4; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + typedef logic signed [PE-1:0][57:0] output_t; + typedef output_t output_vector_t [NF]; + + function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); + automatic output_vector_t res = '{default: 0}; + for (int j = 0; j 1) && !rst; + end + + // Compare computed output against golden output when vld flag is raised by DUT + always_ff @(posedge clk iff (vld && en)) begin + foreach(p[i]) begin + assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + NF_CNT += 1; + end + + // Instantiate DUT + mvu_8sx9 #( + .PE(PE), + .SIMD(SIMD), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .SEGMENTLEN(SEGMENTLEN) + ) + dut ( + .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p + ); + +endmodule : mvu_8sx9_tb diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv new file mode 100644 index 0000000000..08a349da84 --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -0,0 +1,213 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU AXI-lite interface wrapper. + *****************************************************************************/ + +module mvu_axi_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MW = 90; + localparam int unsigned MH = 16; + localparam int unsigned SIMD = 9; + localparam int unsigned PE = 4; + localparam int unsigned SEGMENTLEN = 1; + localparam string MVU_IMPL_STYLE = "mvu_8sx9"; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n = 0; + initial begin + repeat(16) @(posedge 
clk); + ap_rst_n <= 1; + end + + uwire ap_clk = clk; + + // Generate activations + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } activations; + + initial begin + activations.vld = 0; + activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain + @(posedge clk iff ap_rst_n); + + for (int i=0; i 1; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i= 1; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_axi #( + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), + .MVU_IMPL_STYLE(MVU_IMPL_STYLE) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + +endmodule : mvu_axi_tb From 69310b4e6d2ee4bf2e60b236582656fd7f364a6d Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 12 Apr 2023 23:27:50 +0100 Subject: [PATCH 106/235] [rtl custom op]: fixed output width to ACCU_WIDTH --- finn-rtllib/mvu/mvu_8sx9.sv | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index d082d4fb2e..5af27ab0ce 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -36,19 +36,25 @@ module mvu_8sx9 #( int unsigned SIMD, int unsigned ACTIVATION_WIDTH, int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, bit SIGNED_ACTIVATIONS = 0, int unsigned SEGMENTLEN = 0 // Default to 0 (which implies a single segment) ) ( - input logic clk, + // Global Control + input logic clk, input logic rst, input logic en, + + // Input input logic last, - input logic zero, - input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, - input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, - output logic vld, - output logic [PE-1:0][57:0] p + input logic zero, // ignore 
current inputs and force this partial product to zero + input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights + input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // activations + + // Output + output logic vld, + output logic [PE-1:0][ACCU_WIDTH-1:0] p ); //-------------------- Declare global signals --------------------\\ @@ -146,7 +152,7 @@ module mvu_8sx9 #( uwire [57:0] pp; if (LAST) begin : genPOUT - assign p[j] = pp; + assign p[j] = pp[ACCU_WIDTH-1:0]; end DSP58 #( @@ -281,4 +287,4 @@ module mvu_8sx9 #( end : genDSPChain end : genDSPPE -endmodule +endmodule : mvu_8sx9 From cfcff0040c85a76d7c5a16b2bf1b6b966b62e87d Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 12 Apr 2023 23:29:06 +0100 Subject: [PATCH 107/235] [rtl custom op]: renamed file and added generic to switch between compute kernels --- finn-rtllib/mvu/mvu_axi.sv | 194 +++++++++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_axi.sv diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv new file mode 100644 index 0000000000..5d8700738f --- /dev/null +++ b/finn-rtllib/mvu/mvu_axi.sv @@ -0,0 +1,194 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3.
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) AXI-Stream interface wrapper.
+ *****************************************************************************/ + +module mvu_axi #( + int unsigned MW, + int unsigned MH, + int unsigned PE, + int unsigned SIMD, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + int unsigned SEGMENTLEN = 0, + parameter RAM_STYLE = "auto", + parameter MVU_IMPL_STYLE, + + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, + localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH, + localparam int unsigned SF = MW/SIMD, + localparam int unsigned NF = MH/PE, + localparam int unsigned OUTPUT_LANES = PE, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 +) +( + // Global Control + input logic ap_clk, + input logic ap_rst_n, + + // Weight Stream + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, + output logic s_axis_weights_tready, + + // Input Stream + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, + output logic s_axis_input_tready, + + // Output Stream + output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, + output logic m_axis_output_tvalid, + input logic m_axis_output_tready +); + +//-------------------- Parameter sanity checks --------------------\\ + initial begin + if (MW % SIMD != 0) begin + $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); + $finish; + end + if (MH % PE != 0) begin + $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); + $finish; + end + if (ACTIVATION_WIDTH > 9) begin + $error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH); + $finish; + end + if (WEIGHT_WIDTH > 8) begin + $error("Weight width of %0d-bits exceeds maximum 
of 8-bits", WEIGHT_WIDTH); + $finish; + end + if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin + $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH); + $finish; + end + if (SEGMENTLEN == 0) begin + $warning("Segment length of %0d defaults to chain length", SEGMENTLEN); + end + if (SEGMENTLEN > (SIMD+2)/3) begin + $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + $finish; + end + end + + uwire clk = ap_clk; + uwire rst = !ap_rst_n; + + typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t; + + uwire mvauin_t amvau; + uwire alast; + uwire afin; + uwire avld; + uwire ardy; + + replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay ( + .clk, .rst, + .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), + .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) + ); + +//-------------------- Input control --------------------\\ + uwire en; + uwire istb = avld && s_axis_weights_tvalid; + assign ardy = en && s_axis_weights_tvalid; + assign s_axis_weights_tready = en && avld; + +//-------------------- Core MVU --------------------\\ + uwire ovld; + uwire [PE-1:0][ACCU_WIDTH-1:0] odat; + typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; + + if (MVU_IMPL_STYLE == "mvu_8sx9") begin : genMVU8sx9 + mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core ( + .clk, .rst, .en, + .last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .vld(ovld), .p(odat) + ); + end + else if (MVU_IMPL_STYLE == "mvu_4sx4u") begin : genMVU4sx4u + mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(0)) core ( + .clk, .rst, .en, + .last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + 
.vld(ovld), .p(odat) + ); + end + //else begin + // $error("Unrecognized MVU_IMPL_STYLE!"); + // $finish; + //end + +//-------------------- Output register slice --------------------\\ + struct { + logic vld; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } A = '{ vld: 0, default: 'x}; + + assign en = !A.vld || !ovld; + + uwire b_load; + always_ff @(posedge clk) begin + if(rst) A <= '{ vld: 0, default: 'x }; + else if(!A.vld || b_load) begin + A.vld <= ovld && en; + for(int unsigned i = 0; i < PE; i++) begin + // CR-1148862: + // A.dat[i] <= odat[i]; + automatic logic [ACCU_WIDTH-1:0] v = odat[i]; + A.dat[i] <= v[ACCU_WIDTH-1:0]; + end + end + end + + struct { + logic vld; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } B = '{ vld: 0, default: 'x}; + + assign b_load = !B.vld || m_axis_output_tready; + always_ff @(posedge clk) begin + if(rst) B <= '{ default: 'x }; + else begin + if(b_load) B <= '{ vld: A.vld, dat: A.dat}; + end + end + + assign m_axis_output_tvalid = B.vld; + assign m_axis_output_tdata = B.dat; + +endmodule : mvu_axi \ No newline at end of file From 72b519691369b9ebc31983a6723485860837e37b Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 12 Apr 2023 23:29:45 +0100 Subject: [PATCH 108/235] [rtl custom op]: renamed file and added generic to switch between compute kernels --- finn-rtllib/mvu/mvu_axi_wrapper.v | 90 +++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_axi_wrapper.v diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v new file mode 100644 index 0000000000..323d2711e4 --- /dev/null +++ b/finn-rtllib/mvu/mvu_axi_wrapper.v @@ -0,0 +1,90 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Verilog AXI-Stream wrapper for MVU.
+ *****************************************************************************/ + +module $MODULE_NAME_AXI_WRAPPER$ #( + parameter MW = $MW$, + parameter MH = $MH$, + parameter PE = $PE$, + parameter SIMD = $SIMD$, + parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, + parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, + parameter ACCU_WIDTH = $ACCU_WIDTH$, + parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, + parameter SEGMENTLEN = $SEGMENTLEN$, + parameter RAM_STYLE = "$IBUF_RAM_STYLE$", + + // Safely deducible parameters + parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, + parameter OUTPUT_LANES = PE, + parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 +)( + // Global Control + input logic ap_clk, + input logic ap_rst_n, + + // Weight Stream + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, + output logic s_axis_weights_tready, + + // Input Stream + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, + output logic s_axis_input_tready, + + // Output Stream + output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, + output logic m_axis_output_tvalid, + input logic m_axis_output_tready +); + +mvu_axi #( + .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE) + ) inst ( + .ap_clk(ap_clk), + .ap_rst_n(ap_rst_n), + .s_axis_weights_tdata(s_axis_weights_tdata), + .s_axis_weights_tvalid(s_axis_weights_tvalid), + .s_axis_weights_tready(s_axis_weights_tready), + .s_axis_input_tdata(s_axis_input_tdata), + .s_axis_input_tvalid(s_axis_input_tvalid), + .s_axis_input_tready(s_axis_input_tready), + .m_axis_output_tdata(m_axis_output_tdata), + .m_axis_output_tvalid(m_axis_output_tvalid), + 
.m_axis_output_tready(m_axis_output_tready) +); + +endmodule : $MODULE_NAME_AXI_WRAPPER$ \ No newline at end of file From 7be5ce412e5747f17fe0062769cd2cc476b5bfa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Mon, 17 Apr 2023 07:53:44 +0100 Subject: [PATCH 109/235] Defaulting BIAS and SIGNED parameters. Renaming M to K avoiding naming collision with uniform option. --- finn-rtllib/thresholding/hdl/thresholding.sv | 28 +++++++++---------- .../thresholding/hdl/thresholding_axi.sv | 27 +++++++++--------- .../hdl/thresholding_axi_wrapper.v | 18 ++++++------ 3 files changed, 36 insertions(+), 37 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv index 0ce95ed3f9..d16a9219d7 100644 --- a/finn-rtllib/thresholding/hdl/thresholding.sv +++ b/finn-rtllib/thresholding/hdl/thresholding.sv @@ -45,11 +45,11 @@ *****************************************************************************/ module thresholding #( int unsigned N, // output precision - int unsigned M, // input/threshold precision + int unsigned K, // input/threshold precision int unsigned C, // number of channels - bit SIGNED, // signed inputs - int BIAS, // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS) + bit SIGNED = 1, // signed inputs + int BIAS = 0, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS] localparam int unsigned C_BITS = C < 2? 1 : $clog2(C), localparam int unsigned O_BITS = BIAS >= 0? 
@@ -63,15 +63,15 @@ module thresholding #( // Threshold Configuration input logic twe, input logic [$clog2(C)+N-1:0] twa, - input logic [ M-1:0] twd, + input logic [ K-1:0] twd, // Clock Enable for Stream Processing input logic en, // Input Stream input logic ivld, - input logic [C_BITS-1:0] icnl, // Ignored for C == 1 - input logic [M -1:0] idat, + input logic [C_BITS-1:0] icnl, // Ignored for C == 1 + input logic [K -1:0] idat, // Output Stream output logic ovld, @@ -81,10 +81,10 @@ module thresholding #( // Pipeline Links & Feed typedef struct packed { - logic vld; // Valid data identification - logic [C_BITS-1:0] cnl; // Channel - logic [M -1:0] val; // Original input value - logic [0:N-1] res; // Assembling result with valid prefix [0:stage] after stage #stage + logic vld; // Valid data identification + logic [C_BITS-1:0] cnl; // Channel + logic [K -1:0] val; // Original input value + logic [0:N-1] res; // Assembling result with valid prefix [0:stage] after stage #stage } pipe_t; uwire pipe_t pipe[0:N]; assign pipe[0] = pipe_t'{ vld: ivld, cnl: icnl, val: idat, res: {N{1'bx}} }; // Feed original input @@ -94,20 +94,20 @@ module thresholding #( for(genvar stage = 0; stage < N; stage++) begin : genStages // Threshold Memory - uwire [M-1:0] thresh; + uwire [K-1:0] thresh; if(1) begin : blkUpdate // Write control: local select from global address uwire we = twe && tws[stage]; if((C == 1) && (stage == 0)) begin - logic [M-1:0] Thresh = 'x; + logic [K-1:0] Thresh = 'x; always_ff @(posedge clk) begin if(we) Thresh <= twd; end assign thresh = Thresh; end else begin - logic [M-1:0] Threshs[C * 2**stage]; + logic [K-1:0] Threshs[C * 2**stage]; uwire [$clog2(C)+stage-1:0] wa = twa[$left(twa):N-stage]; uwire [$clog2(C)+stage-1:0] ra; if(C > 1) assign ra[stage+:C_BITS] = pipe[stage].cnl; @@ -119,7 +119,7 @@ module thresholding #( end // Read - logic [M-1:0] RdReg; + logic [K-1:0] RdReg; always_ff @(posedge clk) begin if(en) RdReg <= Threshs[ra]; end diff --git 
a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv index d2a7420a99..2f0393a3e7 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -40,16 +40,15 @@ module thresholding_axi #( int unsigned N, // output precision - int unsigned M, // input/threshold precision + int unsigned K, // input/threshold precision int unsigned C, // Channels - int unsigned PE, // Processing Parallelism, requires C = M*PE + int unsigned PE, // Processing Parallelism, requires C = k*PE - bit SIGNED, // signed inputs - int BIAS, // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS) + bit SIGNED = 1, // signed inputs + int BIAS = 0, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS] - localparam int unsigned CF = 1 + (C-1)/PE, // Channel Fold + localparam int unsigned CF = 1 + (C-1)/PE, // Channel Fold localparam int unsigned ADDR_BITS = $clog2(CF) + $clog2(PE) + N + 2, - localparam int unsigned C_BITS = C/PE < 2? 1 : $clog2(C/PE), localparam int unsigned O_BITS = BIAS >= 0? /* unsigned */ $clog2(2**N+BIAS) : /* signed */ 1+$clog2(-BIAS >= 2**(N-1)? 
-BIAS : 2**N+BIAS) @@ -86,7 +85,7 @@ module thresholding_axi #( //- AXI Stream - Input -------------- output logic s_axis_tready, input logic s_axis_tvalid, - input logic [((PE*M+7)/8)*8-1:0] s_axis_tdata, + input logic [((PE*K+7)/8)*8-1:0] s_axis_tdata, //- AXI Stream - Output ------------- input logic m_axis_tready, @@ -108,13 +107,13 @@ module thresholding_axi #( //- AXI Lite: Threshold Configuration ----------------------------------- uwire twe[PE]; uwire [$clog2(CF)+N-1:0] twa; - uwire [ M-1:0] twd; + uwire [ K-1:0] twd; if(1) begin : blkAxiLite logic WABusy = 0; logic WDBusy = 0; logic Sel[PE] = '{ default: 'x }; logic [$clog2(CF)+N-1:0] Addr = 'x; - logic [ M-1:0] Data = 'x; + logic [ K-1:0] Data = 'x; for(genvar pe = 0; pe < PE; pe++) begin assign twe[pe] = WABusy && WDBusy && Sel[pe]; @@ -147,7 +146,7 @@ module thresholding_axi #( end if(!WDBusy) begin WDBusy <= s_axilite_WVALID; - Data <= s_axilite_WDATA[M-1:0]; + Data <= s_axilite_WDATA[K-1:0]; end end end @@ -204,12 +203,12 @@ module thresholding_axi #( end : blkOutputDecouple - // localparam int unsigned C_BITS = C/PE < 2? 1 : $clog2(C/PE); + localparam int unsigned C_BITS = C/PE < 2? 
1 : $clog2(C/PE); uwire ivld = s_axis_tvalid; uwire [C_BITS-1:0] icnl; - uwire [M -1:0] idat[PE]; + uwire [K -1:0] idat[PE]; for(genvar pe = 0; pe < PE; pe++) begin - assign idat[pe] = s_axis_tdata[pe*M+:M]; + assign idat[pe] = s_axis_tdata[pe*K+:K]; end assign s_axis_tready = en; @@ -234,7 +233,7 @@ module thresholding_axi #( // Core Thresholding Modules for(genvar pe = 0; pe < PE; pe++) begin : genCores - thresholding #(.N(N), .M(M), .C(C/PE), .SIGNED(SIGNED), .BIAS(BIAS)) core ( + thresholding #(.N(N), .K(K), .C(C/PE), .SIGNED(SIGNED), .BIAS(BIAS)) core ( .clk, .rst, .twe(twe[pe]), .twa, .twd, .en, diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v index c27480f388..2657b39d98 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v +++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v @@ -34,20 +34,20 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter N = $N$, // output precision - parameter M = $M$, // input/threshold precision + parameter K = $M$, // input/threshold precision parameter C = $C$, // Channels - parameter SIGNED = $SIGNED$, // signed inputs - int BIAS = $BIAS$, // offsetting the output [0, 2^N-1) -> [-BIAS, 2^N-1 - BIAS) parameter PE = $PE$, + parameter SIGNED = $SIGNED$, // signed inputs + parameter BIAS = $BIAS$, // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS) parameter O_BITS = BIAS > 0? - /* unsigned */ $clog2(2**N-BIAS) : - /* signed */ 1+$clog2(BIAS >= 2**(N-1)? BIAS : 2**N-BIAS) + /* unsigned */ $clog2(2**N+BIAS) : + /* signed */ 1+$clog2(-BIAS >= 2**(N-1)? 
-BIAS : 2**N+BIAS) )( //- Global Control ------------------ - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) input ap_clk, - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) input ap_rst_n, //- AXI Lite ------------------------ @@ -78,7 +78,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( //- AXI Stream - Input -------------- output in0_V_TREADY, input in0_V_TVALID, - input [((PE*M+7)/8)*8-1:0] in0_V_TDATA, + input [((PE*K+7)/8)*8-1:0] in0_V_TDATA, //- AXI Stream - Output ------------- input out_V_TREADY, @@ -86,7 +86,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( output [((PE*O_BITS+7)/8)*8-1:0] out_V_TDATA ); - thresholding_axi #(.N(N), .M(M), .C(C), .PE(PE), .SIGNED(SIGNED), .BIAS(BIAS)) inst ( + thresholding_axi #(.N(N), .K(K), .C(C), .PE(PE), .SIGNED(SIGNED), .BIAS(BIAS)) inst ( //- Global Control ------------------ .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), From c068bb65c6a4b877876c5b1278e7b2663b81d8e1 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:15:16 +0100 Subject: [PATCH 110/235] [rtl mvu]: added behavioral model DSP58 --- finn-rtllib/mvu/mvu_8sx9.sv | 343 ++++++++++++++++++++++-------------- 1 file changed, 212 insertions(+), 131 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index 5af27ab0ce..2d1da26efb 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -38,7 +38,8 @@ module mvu_8sx9 #( int unsigned WEIGHT_WIDTH, int unsigned ACCU_WIDTH, bit SIGNED_ACTIVATIONS = 0, - int unsigned SEGMENTLEN = 0 // Default to 0 (which implies a single segment) + int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) + bit FORCE_BEHAVIORAL = 0 ) ( // Global Control @@ -70,7 +71,10 @@ module mvu_8sx9 #( always_ff @(posedge clk) begin if(rst) L <= '{default: 0}; - else if(en) L <= { L[1:1+MAX_PIPELINE_STAGES], last }; + else if(en) 
begin + L[1+MAX_PIPELINE_STAGES] <= last; + L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES]; + end end assign vld = L[0]; @@ -155,135 +159,212 @@ module mvu_8sx9 #( assign p[j] = pp[ACCU_WIDTH-1:0]; end - DSP58 #( - // Feature Control Attributes: Data Path Selection - .AMULTSEL("A"), // Selects A input to multiplier (A, AD) - .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) - .BMULTSEL("B"), // Selects B input to multiplier (AD, B) - .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) - .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for - // legacy mode. - .PREADDINSEL("A"), // Selects input to pre-adder (A, B) - .RND(58'h000000000000000), // Rounding Constant - .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) - .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) - .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) - .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) - // Pattern Detector Attributes: Pattern Detection Configuration - .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH - .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
- .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) - .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect - .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 - .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) - .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) - // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins - .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE - .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN - .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK - .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE - .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE - .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 - FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN - 2'b01, // Y : M - 2'b01 // X: M - }), // Optional inversion for OPMODE - .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN - .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE - .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA - .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB - .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A - .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC - .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD - .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE - .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM - .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP - // Register Control Attributes: Pipeline Register Configuration - .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) - .ADREG(0), // Pipeline stages for pre-adder (0-1) - .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) - .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) - .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between 
B/BCIN and BCOUT (0-2) - .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) - .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) - .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) - .CREG(0), // Pipeline stages for C (0-1) - .DREG(0), // Pipeline stages for D (0-1) - .INMODEREG(1), // Pipeline stages for INMODE (0-1) - .MREG(1), // Multiplier pipeline stages (0-1) - .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) - .PREG(PREG), // Number of pipeline stages for P (0-1) - .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). - ) - DSP58_inst ( - // Cascade outputs: Cascade Ports - .ACOUT(), // 34-bit output: A port cascade - .BCOUT(), // 24-bit output: B cascade - .CARRYCASCOUT(), // 1-bit output: Cascade carry - .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade - .PCOUT(pcout[j][i]), // 58-bit output: Cascade output - // Control outputs: Control Inputs/Status Bits - .OVERFLOW(), // 1-bit output: Overflow in add/acc - .PATTERNBDETECT(), // 1-bit output: Pattern bar detect - .PATTERNDETECT(), // 1-bit output: Pattern detect - .UNDERFLOW(), // 1-bit output: Underflow in add/acc - // Data outputs: Data Ports - .CARRYOUT(), // 4-bit output: Carry - .P(pp), // 58-bit output: Primary data - .XOROUT(), // 8-bit output: XOR data - // Cascade inputs: Cascade Ports - .ACIN('x), // 34-bit input: A cascade data - .BCIN('x), // 24-bit input: B cascade - .CARRYCASCIN('x), // 1-bit input: Cascade carry - .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade - .PCIN(FIRST ? 'x : pcout[j][i-1]), // 58-bit input: P cascade - // Control inputs: Control Inputs/Status Bits - .ALUMODE(4'h0), // 4-bit input: ALU control - .CARRYINSEL('0), // 3-bit input: Carry select - .CLK(clk), // 1-bit input: Clock - .INMODE({ - INTERNAL_PREGS==2 ? 1'b0 : 1'b1, - 2'b00, - TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, - INTERNAL_PREGS==2 ? 
1'b0 : 1'b1 - }), // 5-bit input: INMODE control - .NEGATE('0), // 3-bit input: Negates the input of the multiplier - .OPMODE({ - LAST ? {1'b0, L[1]} : 2'b00, - 7'b000_0000 - }), // 9-bit input: Operation mode - // Data inputs: Data Ports - .A({ 7'bx, a_in_i[i] }), // 34-bit input: A data - .B(b_in_i[j][i]), // 24-bit input: B data - .C('x), // 58-bit input: C data - .CARRYIN('0), // 1-bit input: Carry-in - .D('x), // 27-bit input: D data - // Reset/Clock Enable inputs: Reset/Clock Enable Inputs - .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. - .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG - .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG - .CEAD('0), // 1-bit input: Clock enable for ADREG - .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE - .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG - .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG - .CEC('0), // 1-bit input: Clock enable for CREG - .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG - .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG - .CED('0), // 1-bit input: Clock enable for DREG - .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG - .CEM(en), // 1-bit input: Clock enable for MREG - .CEP(PREG && en), // 1-bit input: Clock enable for PREG - .RSTA(rst), // 1-bit input: Reset for AREG - .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG - .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG - .RSTB(rst), // 1-bit input: Reset for BREG - .RSTC('0), // 1-bit input: Reset for CREG - .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG - .RSTD('0), // 1-bit input: Reset for DREG and ADREG - .RSTINMODE(rst), // 1-bit input: Reset for INMODE register - .RSTM(rst), // 1-bit input: Reset for MREG - .RSTP(PREG && rst) // 1-bit input: Reset for PREG - ); + // Note: Since the product B * AD is computed, + // rst can be only applied 
to AD and zero only to B + // with the same effect as zeroing both. + if (FORCE_BEHAVIORAL) begin : genBehav + // Stage #1: Input A/B + logic signed [33:0] Areg [INTERNAL_PREGS]; + always_ff @(posedge clk) begin + if (rst) Areg <= '{ default : 0}; + else if (en) begin + Areg[0] <= { 7'bx, a_in_i[i] }; + if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0]; + end + end + logic signed [23:0] Breg [INTERNAL_PREGS]; + always_ff @(posedge clk) begin + if (rst) Breg <= '{ default : 0}; + else if (en) begin + Breg[0] <= b_in_i[j][i]; + if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0]; + end + end + + // Stage #2: Multiply-Accumulate + logic signed [57:0] Mreg; + logic InmodeZero = 0; + always_ff @(posedge clk) begin + if (rst) InmodeZero <= 0; + else if (en) InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero ); + end + always_ff @(posedge clk) begin + if (rst) Mreg <= 0; + else if (en) begin + automatic logic signed [57:0] m = 0; + for (int k = 0; k < 3; k++) begin + m = m + (InmodeZero ? 0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8])); + end + Mreg <= m; + end + end + + // Stage #3: Accumulate + logic signed [57:0] Preg; + logic Opmode = 0; + if (FIRST && !LAST) begin : genFirst + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg; + end + end + else assign Preg = Mreg; + end + else if (LAST) begin : genLast + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 
0 : Preg) + Mreg + pcout[j][i-1]; + end + end + else begin : genMid + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg + pcout[j][i-1]; + end + end + else assign Preg = Mreg + pcout[j][i-1]; + end + assign pp = Preg; + assign pcout[j][i] = pp; + end : genBehav + + else begin: genDSP + DSP58 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("A"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for + // legacy mode. + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND(58'h000000000000000), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
+ .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK + .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE + .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE + .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 + FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN + 2'b01, // Y : M + 2'b01 // X: M + }), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA + .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC + .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM + .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(0), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) + .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between 
B/BCIN and BCOUT (0-2) + .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(1), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(PREG), // Number of pipeline stages for P (0-1) + .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). + ) + DSP58_inst ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(pcout[j][i]), // 58-bit output: Cascade output + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN(FIRST ? 'x : pcout[j][i-1]), // 58-bit input: P cascade + // Control inputs: Control Inputs/Status Bits + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .CLK(clk), // 1-bit input: Clock + .INMODE({ + INTERNAL_PREGS==2 ? 1'b0 : 1'b1, + 2'b00, + TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, + INTERNAL_PREGS==2 ? 
1'b0 : 1'b1 + }), // 5-bit input: INMODE control + .NEGATE('0), // 3-bit input: Negates the input of the multiplier + .OPMODE({ + LAST ? {1'b0, L[1]} : 2'b00, + 7'b000_0000 + }), // 9-bit input: Operation mode + // Data inputs: Data Ports + .A({ 7'bx, a_in_i[i] }), // 34-bit input: A data + .B(b_in_i[j][i]), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D('x), // 27-bit input: D data + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. + .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG + .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD('0), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(PREG && en), // 1-bit input: Clock enable for PREG + .RSTA(rst), // 1-bit input: Reset for AREG + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTB(rst), // 1-bit input: Reset for BREG + .RSTC('0), // 1-bit input: Reset for CREG + .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTD('0), // 1-bit input: Reset for DREG and ADREG + .RSTINMODE(rst), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(PREG && rst) // 1-bit input: Reset for PREG + ); + end : genDSP end : genDSPChain end : genDSPPE From 
18f94e7ab03a3034083680faa91a80359858589e Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:18:58 +0100 Subject: [PATCH 111/235] [rtl mvu]: extended flow control wrapper with additional compute core and other minor changes --- finn-rtllib/mvu/mvu_axi.sv | 51 +++++++++++++++++++------------ finn-rtllib/mvu/mvu_axi_wrapper.v | 48 ++++++++++++++--------------- 2 files changed, 54 insertions(+), 45 deletions(-) diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv index 5d8700738f..e4a919ba88 100644 --- a/finn-rtllib/mvu/mvu_axi.sv +++ b/finn-rtllib/mvu/mvu_axi.sv @@ -41,8 +41,8 @@ module mvu_axi #( int unsigned ACCU_WIDTH, bit SIGNED_ACTIVATIONS = 0, int unsigned SEGMENTLEN = 0, - parameter RAM_STYLE = "auto", - parameter MVU_IMPL_STYLE, + bit FORCE_BEHAVIORAL = 0, + string MVU_IMPL_STYLE, localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, @@ -96,12 +96,14 @@ module mvu_axi #( $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH); $finish; end - if (SEGMENTLEN == 0) begin - $warning("Segment length of %0d defaults to chain length", SEGMENTLEN); - end - if (SEGMENTLEN > (SIMD+2)/3) begin - $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); - $finish; + if (MVU_IMPL_STYLE == "mvu_8sx9") begin + if (SEGMENTLEN == 0) begin + $warning("Segment length of %0d defaults to chain length", SEGMENTLEN); + end + if (SEGMENTLEN > (SIMD+2)/3) begin + $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + $finish; + end end end @@ -116,7 +118,7 @@ module mvu_axi #( uwire avld; uwire ardy; - replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay ( + replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay ( .clk, .rst, .ivld(s_axis_input_tvalid), 
.irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) @@ -133,28 +135,37 @@ module mvu_axi #( uwire [PE-1:0][ACCU_WIDTH-1:0] odat; typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - if (MVU_IMPL_STYLE == "mvu_8sx9") begin : genMVU8sx9 + if (MVU_IMPL_STYLE == "mvu_8sx9_dsp58") begin : genMVU8sx9 mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core ( + .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), .vld(ovld), .p(odat) ); end else if (MVU_IMPL_STYLE == "mvu_4sx4u") begin : genMVU4sx4u - mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(0)) core ( + mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), .vld(ovld), .p(odat) ); end - //else begin - // $error("Unrecognized MVU_IMPL_STYLE!"); - // $finish; - //end + else if (MVU_IMPL_STYLE == "mvu_8sx8u_dsp48") begin : genMVU8sx8u + mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk, .rst, .en, + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .vld(ovld), .p(odat) + ); + end + else initial begin + $error("Unrecognized MVU_IMPL_STYLE!"); + $finish; + end 
//-------------------- Output register slice --------------------\\ - struct { + struct packed { logic vld; logic [PE-1:0][ACCU_WIDTH-1:0] dat; } A = '{ vld: 0, default: 'x}; @@ -175,7 +186,7 @@ module mvu_axi #( end end - struct { + struct packed { logic vld; logic [PE-1:0][ACCU_WIDTH-1:0] dat; } B = '{ vld: 0, default: 'x}; diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v index 323d2711e4..b79ba6bbd1 100644 --- a/finn-rtllib/mvu/mvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_axi_wrapper.v @@ -41,7 +41,8 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter ACCU_WIDTH = $ACCU_WIDTH$, parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, parameter SEGMENTLEN = $SEGMENTLEN$, - parameter RAM_STYLE = "$IBUF_RAM_STYLE$", + parameter MVU_IMPL_STYLE = "$MVU_IMPL_STYLE$", + parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, // Safely deducible parameters parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, @@ -50,41 +51,38 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 )( // Global Control - input logic ap_clk, - input logic ap_rst_n, - + input ap_clk, + input ap_rst_n, // Weight Stream - input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - input logic s_axis_weights_tvalid, - output logic s_axis_weights_tready, - + input [WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, + input weights_V_TVALID, + output weights_V_TREADY, // Input Stream - input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input logic s_axis_input_tvalid, - output logic s_axis_input_tready, - + input [INPUT_STREAM_WIDTH_BA-1:0] in0_V_TDATA, + input in0_V_TVALID, + output in0_V_TREADY, // Output Stream - output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, - output logic m_axis_output_tvalid, - input logic m_axis_output_tready + output [OUTPUT_STREAM_WIDTH_BA-1:0] out_V_TDATA, + output out_V_TVALID, + input out_V_TREADY ); mvu_axi #( .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), 
.ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE) + .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), .MVU_IMPL_STYLE(MVU_IMPL_STYLE) ) inst ( .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), - .s_axis_weights_tdata(s_axis_weights_tdata), - .s_axis_weights_tvalid(s_axis_weights_tvalid), - .s_axis_weights_tready(s_axis_weights_tready), - .s_axis_input_tdata(s_axis_input_tdata), - .s_axis_input_tvalid(s_axis_input_tvalid), - .s_axis_input_tready(s_axis_input_tready), - .m_axis_output_tdata(m_axis_output_tdata), - .m_axis_output_tvalid(m_axis_output_tvalid), - .m_axis_output_tready(m_axis_output_tready) + .s_axis_weights_tdata(weights_V_TDATA), + .s_axis_weights_tvalid(weights_V_TVALID), + .s_axis_weights_tready(weights_V_TREADY), + .s_axis_input_tdata(in0_V_TDATA), + .s_axis_input_tvalid(in0_V_TVALID), + .s_axis_input_tready(in0_V_TREADY), + .m_axis_output_tdata(out_V_TDATA), + .m_axis_output_tvalid(out_V_TVALID), + .m_axis_output_tready(out_V_TREADY) ); endmodule : $MODULE_NAME_AXI_WRAPPER$ \ No newline at end of file From 6d4a0a764e0e6ded16d7034e0d69f5408c76ca75 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:22:51 +0100 Subject: [PATCH 112/235] [rtl mvu]: fix to done_len flag when SIMD dimension fully unrolled and PyVerilator-related syntax change --- finn-rtllib/mvu/replay_buffer.sv | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv index 685ac03137..89bbbdb88f 100644 --- a/finn-rtllib/mvu/replay_buffer.sv +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -35,8 +35,7 @@ module replay_buffer #( int unsigned LEN, // Sequence length int unsigned REP, // Sequence replay count - int unsigned W, // Data width - parameter RAM_STYLE = "auto" // ram style for buffer {block, distributed, ultra, auto} + int unsigned W // Data width )( 
input logic clk, input logic rst, @@ -54,7 +53,7 @@ module replay_buffer #( typedef logic [$clog2(REP)+$clog2(LEN)-1:0] count_t; count_t Count = 0; - uwire done_len = ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0; + uwire done_len = LEN == 1 ? 1 : ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0; uwire done_rep; uwire done_all = done_len && done_rep; @@ -83,7 +82,6 @@ module replay_buffer #( end assign first_rep = FirstRep; - (* RAM_STYLE = RAM_STYLE *) data_t Buf[LEN]; if(LEN == 1) begin : genTrivial always_ff @(posedge clk) begin @@ -92,7 +90,10 @@ module replay_buffer #( end : genTrivial else begin : genShift always_ff @(posedge clk) begin - if(shift) Buf <= { odat, Buf[0:LEN-2] }; + if(shift) begin + Buf[0] <= odat; + Buf[1:LEN-1] <= Buf[0:LEN-2]; + end end end : genShift From 90c547d54756aed2aa101862fb6f55c05149173c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:23:22 +0100 Subject: [PATCH 113/235] [rtl mvu tb]: updated testbench --- finn-rtllib/mvu/tb/mvu_axi_tb.sv | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv index 08a349da84..ef5fa7d682 100644 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -35,17 +35,18 @@ module mvu_axi_tb(); //-------------------- Simulation parameters --------------------\\ // Matrix & parallelism config - localparam int unsigned MW = 90; - localparam int unsigned MH = 16; - localparam int unsigned SIMD = 9; - localparam int unsigned PE = 4; - localparam int unsigned SEGMENTLEN = 1; - localparam string MVU_IMPL_STYLE = "mvu_8sx9"; + localparam int unsigned MW = 50; + localparam int unsigned MH = 8; + localparam int unsigned SIMD = 10; + localparam int unsigned PE = 2; + localparam int unsigned SEGMENTLEN = 2; + localparam string MVU_IMPL_STYLE = "mvu_8sx8u_dsp48"; + localparam bit FORCE_BEHAVIORAL = 1; // Bit-width config localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int 
unsigned WEIGHT_WIDTH = 4; + localparam int unsigned WEIGHT_WIDTH = 8; localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); - localparam bit SIGNED_ACTIVATIONS = 1; + localparam bit SIGNED_ACTIVATIONS = 0; // Simulation constants localparam int unsigned NF = MH/PE; localparam int unsigned SF = MW/SIMD; @@ -94,7 +95,7 @@ module mvu_axi_tb(); for (int i=0; i 1; + activations.vld = $urandom()%7 >= 1; @(posedge clk); end while (!(activations.vld === 1 && activations.rdy === 1)); end @@ -201,6 +202,7 @@ module mvu_axi_tb(); .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), .MVU_IMPL_STYLE(MVU_IMPL_STYLE) ) dut ( From 0c37f1f7bed1143833649accceb59bd6821bed3c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:25:10 +0100 Subject: [PATCH 114/235] [builder]: added specialize_to_rtl step and changed standalone threshold layers to be by default true --- src/finn/builder/build_dataflow_config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 4c3e4ff899..24940489df 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -121,6 +121,7 @@ class VerificationStepType(str, Enum): "step_apply_folding_config", "step_minimize_bit_width", "step_generate_estimate_reports", + "step_specialize_to_rtl", "step_hls_codegen", "step_hls_ipgen", "step_set_fifo_depths", @@ -233,7 +234,7 @@ class DataflowBuildConfig: #: activations in FINN) will be implemented as stand-alone HLS layers, #: instead of being part of MatrixVectorActivation layer. This gives larger #: flexibility, and makes it possible to have runtime-writable thresholds. 
- standalone_thresholds: Optional[bool] = False + standalone_thresholds: Optional[bool] = True #: (Optional) Whether optimizations that minimize the bit width of the #: weights and accumulator will be applied. Because this optimization relies From 5ccb016a640dbed6818a9f1f3ef46136ce949c0d Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:26:03 +0100 Subject: [PATCH 115/235] [builder]: added specialize_to_rtl step --- src/finn/builder/build_dataflow_steps.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index e43a29d632..3e4d047a51 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -123,6 +123,7 @@ ) from finn.util.pyverilator import verilator_fifosim from finn.util.test import execute_parent +import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl def verify_step( @@ -483,6 +484,16 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig return model +def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig): + """Convert layers implemented in HLS to an equivalent specialized RTL implementation if possible.""" + specialize_to_rtl_transforms = [ + to_rtl.InferRTLMatrixVectorActivation() + ] + for trn in specialize_to_rtl_transforms: + model = model.transform(trn) + return model + + def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig): """Tighten the weight and accumulator bit widths for each layer.""" if cfg.minimize_bit_width: @@ -855,6 +866,7 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig): "step_apply_folding_config": step_apply_folding_config, "step_minimize_bit_width": step_minimize_bit_width, "step_generate_estimate_reports": step_generate_estimate_reports, + "step_specialize_to_rtl": step_specialize_to_rtl, "step_hls_codegen": step_hls_codegen, "step_hls_ipgen": step_hls_ipgen, 
"step_set_fifo_depths": step_set_fifo_depths, From f099f4bbfd01b628a89c6099f637a4a85a8158ca Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:26:44 +0100 Subject: [PATCH 116/235] [custom op]: added custom op MatrixVectorActivation_rtl --- src/finn/custom_op/fpgadataflow/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 56d4230a3a..19c0ddd999 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -49,6 +49,7 @@ from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch from finn.custom_op.fpgadataflow.lookup import Lookup from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation +from finn.custom_op.fpgadataflow.matrixvectoractivation_rtl import MatrixVectorActivation_rtl from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch from finn.custom_op.fpgadataflow.streamingdataflowpartition import ( StreamingDataflowPartition, @@ -70,6 +71,7 @@ custom_op["DownSampler"] = DownSampler custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch custom_op["MatrixVectorActivation"] = MatrixVectorActivation +custom_op["MatrixVectorActivation_rtl"] = MatrixVectorActivation_rtl custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl From 9a3b0fdc54f8c7c1b541c8cfdaaf6e96315da092 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:28:34 +0100 Subject: [PATCH 117/235] [custom op]: added additional attribute to enable conversion to RTL (custom-op) --- src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py 
b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index aa987384dd..e54abb0c3f 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -70,7 +70,7 @@ def get_nodeattr_types(self): "SIMD": ("i", True, 0), "MW": ("i", True, 0), "MH": ("i", True, 0), - "resType": ("s", False, "lut", {"auto", "lut", "dsp"}), + "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}), "ActVal": ("i", False, 0), # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), @@ -125,6 +125,8 @@ def get_nodeattr_types(self): # vector through the accelerator. This will get rid of any old # weight data from the weight FIFOs. "runtime_writeable_weights": ("i", False, 0, {0, 1}), + # Flag to specify whether RTL-based or HLS-based implementation is preferred + "impl": ("s", False, "rtl", {"hls", "rtl"}) } my_attrs.update(super().get_nodeattr_types()) return my_attrs From 38aa930baa1296a7099f9df22e3d0d000c8d5a05 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:30:15 +0100 Subject: [PATCH 118/235] [custom op]: modified ip-stitching and code generation --- .../matrixvectoractivation_rtl.py | 231 ++++++++++-------- 1 file changed, 127 insertions(+), 104 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index c8a0aa675b..6b1c2f3be7 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import math +from shutil import copy import numpy as np import os import textwrap @@ -45,6 +46,12 @@ pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None from . import templates @@ -60,8 +67,8 @@ class MatrixVectorActivation_rtl(HLSCustomOp): """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch function.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) self.decoupled_wrapper = templates.decoupled_wrapper def get_nodeattr_types(self): @@ -78,11 +85,6 @@ def get_nodeattr_types(self): "outputDataType": ("s", True, ""), # FINN DataType for accumulator -- auto-computed and updated "accDataType": ("s", False, "INT32"), - # use xnor-popcount for binary weights/inputs, thus treating them - # as bipolar - "binaryXnorMode": ("i", False, 0, {0, 1}), - # no-activation mode (produce accumulators) - "noActivation": ("i", False, 0, {0, 1}), # number of input vectors, examples: # [1] is a single vector (like a FC layer with batch=1) # [4] is four vectors (like a FC layer with batch=4) @@ -105,16 +107,6 @@ def get_nodeattr_types(self): "auto", {"auto", "block", "distributed", "ultra"}, ), - # FPGA resource type for threshold memories (if noActivation is False) - # auto -- let Vivado decide - # block -- use BRAM - # distributed -- use LUTRAM - "ram_style_thresholds": ( - "s", - False, - "auto", - {"auto", "block", "distributed"}, - ), # (mem_mode = decoupled only) whether weights will be writable through # an AXI-lite interface during runtime # 1 for enabled, 0 for disabled. @@ -125,6 +117,8 @@ def get_nodeattr_types(self): # vector through the accelerator. This will get rid of any old # weight data from the weight FIFOs. 
"runtime_writeable_weights": ("i", False, 0, {0, 1}), + # attribute to save top module name - not user configurable + "gen_top_module": ("s", False, ""), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -142,7 +136,6 @@ def calc_wmem(self): def calc_tmem(self): """Calculates and returns TMEM.""" - assert self.get_nodeattr("noActivation")==1, "RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer" return 0 def make_shape_compatible_op(self, model): @@ -192,27 +185,9 @@ def verify_node(self): """The required MatrixVectorActivation attributes do not exist.""" ) - # verify the number of inputs depending on noActivation value - # check noActivation value to determine the number of inputs - no_act = self.get_nodeattr("noActivation") - - if no_act == 1: - if len(self.onnx_node.input) == 2: - info_messages.append("The number of inputs is correct") - else: - info_messages.append( - """RTL-based MatrixVectorActivation needs in no - activation mode 2 inputs (data input and weights)""" - ) - elif no_act == 0: - info_messages.append("RTL-based MVAU does not support thresholding currently, please infer a standalone Thresholding_Batch layer") - else: - info_messages.append( - """noActivation attribute contains {} should - be 1 for RTL-based MatrixVectorActivation""".format( - no_act - ) - ) + num_of_inputs = len(self.onnx_node.input) + if num_of_inputs!=2: + info_messages.append("RTL-based MatrixVectorActivation expects two inputs (weights and activation), but got {} inputs.".format(len(self.onnx_node.input))) mem_mode = self.get_nodeattr("mem_mode") @@ -221,6 +196,7 @@ def verify_node(self): return info_messages +# TODO: Add in replay_buffer estimation def uram_estimation(self): P = self.get_nodeattr("PE") Q = self.get_nodeattr("SIMD") @@ -242,6 +218,7 @@ def uram_estimation(self): depth_multiplier = math.ceil(omega / 4096) return width_multiplier * depth_multiplier +# TODO: Add in replay_buffer estimation 
def bram_estimation(self): """Calculates resource estimation for BRAM based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -268,7 +245,7 @@ def bram_estimation(self): ): return 0 # assuming SDP mode RAMB18s (see UG573 Table 1-10) - # assuming decoupled (RTL) memory, which is more efficient than const (HLS) + # assuming decoupled (RTL) memory if mem_width == 1: return math.ceil(omega / 16384) elif mem_width == 2: @@ -282,6 +259,7 @@ def bram_estimation(self): else: return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36)) +# TODO: Add in replay_buffer estimation def bram_efficiency_estimation(self): wdt = self.get_weight_datatype() W = wdt.bitwidth() @@ -294,6 +272,7 @@ def bram_efficiency_estimation(self): bram16_est_capacity = bram16_est * 36 * 512 return wbits / bram16_est_capacity +# TODO: Add in replay_buffer estimation def uram_efficiency_estimation(self): """Function for URAM efficiency estimation: actual parameter storage needed divided by the allocated URAM storage (from estimation)""" @@ -308,7 +287,7 @@ def uram_efficiency_estimation(self): uram_est_capacity = uram_est * 72 * 4096 return wbits / uram_est_capacity -#TODO: FIX +#TODO: FIX: worst case estimates since segmentlen is not known at this point? 
def lut_estimation(self): """Calculates resource estimations for LUTs based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -348,23 +327,14 @@ def lut_estimation(self): # accumulator acc_bits = W + A + np.ceil(math.log(MW, 2)) acc_luts = acc_bits - # thresholds and threshold comparators - thr_luts = 0 - comp_luts = 0 - noact = self.get_nodeattr("noActivation") - if noact == 0: - odt = self.get_output_datatype() - B = odt.bitwidth() - thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64) - comp_luts = (2**B - 1) * acc_bits return int( c0 - + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2 ) -#TODO: FIX +#TODO: FIX: worst case estimates since segmentlen is not known at this point? def dsp_estimation(self): # multiplication P = self.get_nodeattr("PE") @@ -380,7 +350,7 @@ def dsp_estimation(self): mult_dsp = 0 return int(mult_dsp) -#TODO: FIX +#TODO: FIX: worst case estimates since segmentlen is not known at this point def get_exp_cycles(self): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") @@ -389,6 +359,7 @@ def get_exp_cycles(self): mw = self.get_nodeattr("MW") # since mmv != 1 is not supported yet, we set mmv for now to 1 mmv = 1 + # Actual exp_cycles is probably slightly larger (say 3 cycles (DSP A/B, M, P - reg) + additional pipeline buffer cycles. 
Most probably <10) exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv return int(exp_cycles) @@ -413,7 +384,7 @@ def get_output_datatype(self, ind=0): def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() - assert i_bits<=9, "RTL-based MVAU only supports activations with bit-width up to 9-bits" + assert (i_bits<=9), "RTL-based MVAU only supports activations with bit-width up to 9-bits" in_width = i_bits * self.get_nodeattr("SIMD") return in_width @@ -431,8 +402,8 @@ def get_weightstream_width(self): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") wp = self.get_weight_datatype().bitwidth() + assert (wp <= 8), "RTL-based MVAU only supports weights with bit-width up to 8-bits" w_width = pe * simd * wp - assert wp<=8, "RTL-based MVAU only supports weights with bit-width up to 8-bits" return w_width else: return 0 @@ -544,10 +515,8 @@ def minimize_accumulator_width(self, model): adt = DataType.get_smallest_possible(-acc_max - 1) else: adt = DataType.get_smallest_possible(acc_max) - # ensure a datatype divisible by 8-bits in case this is the last node - bw = roundup_to_integer_multiple(adt.bitwidth(), 8) - new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw)) - adt = DataType[new_adt_name] + # Note: we are interested in simply the width of the output dot product. 
+ # Padding the actual output stream to a multiple of 8-bits is done in the RTL component self.set_nodeattr("accDataType", adt.name) # for no-activation nodes, output dt = acc dt self.set_nodeattr("outputDataType", adt.name) @@ -588,7 +557,10 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): 1, -1, pe * simd ) weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy() - if weight_file_mode == "decoupled_verilog_dat": + if weight_file_mode == "decoupled_npy": + # save weight stream into npy for cppsim + np.save(weight_file_name, weight_tensor_simd_flipped) + elif weight_file_mode == "decoupled_verilog_dat": # convert weight values into hexstring weight_width = self.get_weightstream_width() # pad to nearest 4 bits to get hex strings @@ -638,7 +610,7 @@ def generate_params(self, model, path): weight_filename_sim = "{}/weights.npy".format(code_gen_dir) # save decoupled weights for cppsim self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) - # also save weights as Verilog .dat file + # Also save weights as Verilog .dat file # note that we provide two different .dat files, one for synth # and one for synthesis. 
this is because URAM-based weights always # need zero weights for synthesis, otherwise they get inferred @@ -693,7 +665,6 @@ def execute_node(self, context, graph): for inputs in node.input: # it is assumed that the first input of the node is the data input # the second input are the weights - # the third input are the thresholds if in_ind == 0: assert ( str(context[inputs].dtype) == "float32" @@ -709,7 +680,7 @@ def execute_node(self, context, graph): reshaped_input, ) elif in_ind > 2: - raise Exception("Unexpected input found for MatrixVectorActivation") + raise Exception("Unexpected input found for MatrixVectorActivation_rtl") in_ind += 1 if mode == "rtlsim": @@ -759,7 +730,7 @@ def execute_node(self, context, graph): def code_generation_ipgen(self, model, fpgapart, clk): """Normally: Generates C++ code and tcl script for IP generation. Here: Generates (System-)Verilog code for IP generation.""" - self.generate_hdl() + self.generate_hdl(model, fpgapart, clk) def ipgen_singlenode_code(self): """Normally: Builds the bash script for IP generation.""" @@ -828,11 +799,21 @@ def code_generation_ipi(self): "create_bd_intf_pin -mode Slave " "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) ) - # instantiate the hls ip - cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (self.get_nodeattr("ip_vlnv"), node_name, node_name) - ) + # instantiate the RTL block + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + rtllib_dir + "mvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_8sx9.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv" + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append("create_bd_cell -type hier -reference %s /%s/%s" % (self.get_nodeattr("gen_top_module"), self.onnx_node.name, 
self.onnx_node.name)) + # instantiate a streamer and connect it to the HLS IP strm_vlnv = "xilinx.com:user:memstream:1.0" strm_inst = node_name + "_wstrm" @@ -947,12 +928,6 @@ def get_op_and_param_counts(self): weight_param_type = "param_weight_%db" % (weight_bits) weight_count = in_features * out_features ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} - if self.get_nodeattr("noActivation") == 0: - tdt = DataType[self.get_nodeattr("accDataType")] - thres_bits = tdt.bitwidth() - thres_param_type = "param_threshold_%db" % (thres_bits) - thres_count = out_features - ret_dict[thres_param_type] = thres_count return ret_dict def derive_characteristic_fxns(self, period): @@ -972,65 +947,113 @@ def derive_characteristic_fxns(self, period): ] super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) - def generate_hdl(self): -#TODO: add distinction between (PE=MH or PE=1) and where MH dimension is folded - template_path, code_gen_dict = self.prepare_codegen_default() +# TODO: characterize max_clk and implement this function in look-up style + def _resolve_segment_len(self, clk): + # Insert pipeline registers in the DSP chain to meet target clock frequency + segmentlen = 0 + return segmentlen + + def _resolve_impl_style(self, fpgapart): + # Based on target device and activation/weight-width, choose the supported RTL module + act_width = self.get_input_datatype(0).bitwidth() + weight_width = self.get_input_datatype(1).bitwidth() + is_versal = fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpgapart[0:5] == "xqrvc" + if (act_width == 4 and weight_width == 4): + return "mvu_4sx4u" + else: + if (is_versal): + return "mvu_8sx9_dsp58" + else: + return "mvu_8sx8u_dsp48" + + def generate_hdl(self, model, fpgapart, clk): + # Generate params as part of IP preparation + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + self.generate_params(model, code_gen_dir) + template_path, code_gen_dict = 
self.prepare_codegen_default(fpgapart, clk) # add general parameters to dictionary - code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()] + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] # save top module name so we can refer to it after this node has been renamed # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) -#TODO: currently only ram_style=auto is supported + ram_style = self.get_nodeattr("ram_style") - if ram_style == "auto": - continue - else: - raise Exception("Unrecognized ram_style for MatrixVectorActivation") + assert (ram_style=="auto"), "Unrecognized ram_style for MatrixVectorActivation_rtl" - # apply code generation to templates - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # apply code generation to template with open(template_path, "r") as f: - template = f.read() + template_wrapper = f.read() for key in code_gen_dict: # transform list into long string separated by '\n' code_gen_line = "\n".join(code_gen_dict[key]) - template = template.replace(key, code_gen_line) template_wrapper = template_wrapper.replace(key, code_gen_line) with open( os.path.join( - code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv" + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" ), "w", ) as f: - f.write(template) + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(0))) with open( os.path.join( - code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v" ), "w", ) as f: - f.write(template_wrapper) + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(1))) # set ipgen_path and ip_path so that HLS-Synth transformation # and stich_ip transformation do not complain self.set_nodeattr("ipgen_path", code_gen_dir) - self.set_nodeattr("ip_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) - def 
prepare_codegen_default(self): - # TODO: Differentiate between PE folding and fully unrolled along MH dimension + def prepare_codegen_default(self, fpgapart, clk): template_path = ( - os.environ["FINN_ROOT"] + "/finn-rtllib/mvau/dsp58_mvau_template.vhdl" + os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v" ) + code_gen_dict = {} - - code_gen_dict["$PE$"] = self.get_nodeattr("PE") - code_gen_dict["$SIMD$"] = self.get_nodeattr("SIMD") - code_gen_dict["$MW$"] = self.get_nodeattr("MW") - code_gen_dict["$MH$"] = self.get_nodeattr("MH") - code_gen_dict["$ACTIVATION_WIDTH$"] = self.get_input_datatype(0).bitwidth() - code_gen_dict["$WEIGHT_WIDTH$"] = self.get_input_datatype(1).bitwidth() - code_gen_dict["$ACCU_WIDTH_BA$"] = self.get_output_datatype().bitwidth() + code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))] + code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))] + code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] + code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))] + code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())] + code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())] + code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())] + code_gen_dict["$SIGNED_ACTIVATIONS$"] = [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] + code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] + code_gen_dict["$MVU_IMPL_STYLE$"] = [self._resolve_impl_style(fpgapart)] return template_path, code_gen_dict + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # Path to (System-)Verilog files used by top-module & path to top-module + verilog_paths = [ + 
code_gen_dir, + os.environ["FINN_ROOT"] + "/finn-rtllib/mvu" + ] + verilog_files = [ + self.get_nodeattr("gen_top_module") + "_wrapper_sim.v" + ] + + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name() + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + + return sim \ No newline at end of file From 4e44934c3001174e52c62caf5d320104a308e611 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:31:35 +0100 Subject: [PATCH 119/235] [tests]: initial version of unit test for RTL custom op and specialize_to_rtl transformation for MVU --- .../test_fpgadataflow_mvau_rtl.py | 172 ++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py new file mode 100644 index 0000000000..20a249bd08 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py @@ -0,0 +1,172 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest +import os + +import numpy as np +from onnx import TensorProto, helper +from qonnx.util.basic import ( + qonnx_make_model, + gen_finn_dt_tensor +) +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.core.datatype import DataType +from qonnx.transformation.general import GiveUniqueNodeNames +import finn.core.onnx_exec as oxe +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from qonnx.transformation.general import ApplyConfig +import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl +#import qonnx.core.data_layout as DataLayout + +build_dir = os.environ["FINN_BUILD_DIR"] + +def make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt): + (ofm_h, ofm_w) = ofm_shape + ofm = helper.make_tensor_value_info( + "ofm", + TensorProto.FLOAT, + (1, ofm_h, ofm_w, mh) + ) + + matmul_node = helper.make_node( + "MatMul", + ["ifm", "weights"], + 
def prepare_inputs(input_tensor):
    """Wrap the stimulus tensor into the input dictionary used for execution.

    The dictionary key has to be the name of the graph input. The model
    built in this file declares exactly one graph input, the value info
    named "ifm", so that name is used here. The previous key "inp" does not
    name any tensor of this graph.
    NOTE(review): confirm how the executor treats keys that match no graph
    tensor -- if they are ignored, the old key left the model input unset.

    Args:
        input_tensor: the activation tensor to feed into the model.

    Returns:
        dict mapping the graph input name "ifm" to ``input_tensor``.
    """
    return {"ifm": input_tensor}
prepare_inputs(A) + + ## Execute ONNX model + output_matmul = oxe.execute_onnx(model, input_dict) + + # Create MVAU (HLS) + model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")) + model = model.transform(GiveUniqueNodeNames()) + + # Apply folding (i.e. specify to use DSPs) + folding_config = { + "Defaults": {}, + "MatrixVectorActivation_0": { + "PE" : pe, + "SIMD" : simd, + "mem_mode" : "decoupled", + "ram_style" : "auto", + "resType" : "dsp", + "impl" : "rtl" + } + } + model = model.transform(ApplyConfig(folding_config)) + model.save(build_dir+"/mvau_hls.onnx") + + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareIP(part, 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + output_mvau_hls = oxe.execute_onnx(model, input_dict)["ofm"] + + # Apply convert-to-rtl step + model = model.transform(to_rtl.InferRTLMatrixVectorActivation()) + model = model.transform(GiveUniqueNodeNames()) + model.save(build_dir+"/mvau_rtl.onnx") + + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareIP("xcvm1802-vsvd1760-2MP-e-S", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + output_mvau_rtl = oxe.execute_onnx(model, input_dict)["ofm"] + + model.save(build_dir+"/mvau_rtl_sim.onnx") + + assert (output_mvau_hls == output_mvau_rtl).all() + assert (output_mvau_hls.size > 0) + + +# python setup.py test --addopts "-k test_fpgadataflow_mvau_rtl" +# python setup.py test --addopts "-k test_fpgadataflow_fclayer_rtlsim" \ No newline at end of file From cc361d9fd4ea082e04d7a1a6bc3932406b0a4f14 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:32:52 +0100 Subject: [PATCH 120/235] [rtl mvu]: specialized compute core for 4-bit weights and activations for DSP48/DSP58 --- finn-rtllib/mvu/mvu_4sx4u.sv | 359 +++++++++++++++++++++++++++++++++++ 1 file changed, 359 insertions(+) create mode 100644 
finn-rtllib/mvu/mvu_4sx4u.sv diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv new file mode 100644 index 0000000000..5993154355 --- /dev/null +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -0,0 +1,359 @@ +module mvu_4sx4u #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + bit FORCE_BEHAVIORAL = 0 +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][3:0] w, // signed weights + input logic [SIMD-1:0][3:0] a, // unsigned activations + + // Ouput + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + logic [1:5] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if(en) L <= { last, L[1:4] }; + end + assign vld = L[5]; + + // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism + localparam int unsigned D[4:0] = '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; // Lane offsets + + localparam int unsigned PIPE_COUNT = (PE+3)/4; + for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes + + localparam int unsigned PE_BEG = 4*c; + localparam int unsigned PE_END = PE < 4*(c+1)? 
PE : 4*(c+1); + + uwire [57:0] p3[SIMD]; + uwire signed [ 1:0] h3[SIMD][3]; + for(genvar s = 0; s < SIMD; s++) begin : genSIMD + + // Input Lane Assembly + uwire [23:0] bb = a[s]; + logic [33:0] aa; + logic [26:0] dd; + logic [ 1:0] xx[3:1]; + if(1) begin : blkVectorize + uwire [3:0] ww[PE_END - PE_BEG]; + for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin + assign ww[pe] = w[PE_BEG + pe][s]; + if(pe) begin +// assign xx[pe] = zero? 0 : ww[pe] * a[s]; + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[pe][1]), + .O5(xx[pe][0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end + end + always_comb begin + dd = '0; + aa = '0; + for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin + dd[D[pe]+:3] = ww[pe]; + aa[D[pe]+ 3] = ww[pe][3]; + end + end + end : blkVectorize + + uwire [57:0] pp; + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. + if (FORCE_BEHAVIORAL) begin : genBehav + // Stage #1: Input Refine + logic signed [23:0] B1 = 0; + always_ff @(posedge clk) begin + if(zero) B1 <= 0; + else if(en) B1 <= bb; + end + + logic signed [26:0] AD1 = 0; + always_ff @(posedge clk) begin + if(rst) AD1 <= 0; + else if(en) AD1 <= dd - aa; + end + + // Stage #2: Multiply + logic signed [50:0] M2 = 0; + always_ff @(posedge clk) begin + if(rst) M2 <= 0; + else if(en) M2 <= +// synthesis translate off + (B1 === '0) || (AD1 === '0)? 0 : +// synthesis translate on + B1 * AD1; + end + + // Stage #3: Accumulate + logic signed [57:0] P3 = 0; + always_ff @(posedge clk) begin + if(rst) P3 <= 0; + else if(en) P3 <= M2 + (L[3]? 
0 : P3); + end + + assign pp = P3; + end : genBehav + else begin : genDSP + DSP48E2 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND('0), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE48"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_48_96"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
+ .MASK('1), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_OPMODE_INVERTED(9'b00_010_01_01), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + .IS_RSTB_INVERTED('0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(1), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(0), // Pipeline stages for A (0-2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(1), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(0), // Pipeline 
stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(1) // Number of pipeline stages for P (0-1) + ) dsp ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(), // 58-bit output: Cascade output + + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN('x), // 58-bit input: P cascade + + // Control inputs: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .INMODE(5'b01100), // 5-bit input: INMODE control + .OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }), // 9-bit input: Operation mode + + // Data inputs: Data Ports + .A(aa), // 34-bit input: A data + .B(bb), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D(dd), // 27-bit input: D data + + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1('0), // 1-bit input: 
Clock enable for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(en), // 1-bit input: Clock enable for PREG + .RSTA('0), // 1-bit input: Reset for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + end : genDSP + + // External Canary Pipeline + logic [1:0] X1[3:1] = '{ default: 0 }; + logic [1:0] X2[3:1] = '{ default: 0 }; + logic [1:0] X3[3:1] = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) begin + X1 <= '{ default: 0 }; + X2 <= '{ default: 0 }; + X3 <= '{ default: 0 }; + end + else if(en) begin + X1 <= xx; + X2 <= X1; + foreach(X3[i]) begin + X3[i] <= X2[i] + (L[3]? 
2'h0 : pp[D[i]+:2]); + end + end + end + + // Derive actual cross-lane overflows + for(genvar i = 0; i < 3; i++) begin + assign h3[s][i] = pp[D[i+1]+:2] - X3[i+1]; + end + assign p3[s] = pp; + + end : genSIMD + + // Stage #4: Cross-SIMD Reduction + + // Count leaves reachable from each node + localparam leave_load_t LEAVE_LOAD = init_leave_loads(); + + uwire signed [ACCU_WIDTH -1:0] up4; + uwire signed [ACCU_WIDTH -8:0] hi4[3]; + uwire [$clog2(SIMD)+7:0] lo4[3]; + for(genvar i = 0; i < 4; i++) begin + localparam int unsigned LO_WIDTH = D[i+1] - D[i]; + localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; + + // Conclusive high part accumulation + if(i < 3) begin : genHi + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) Hi4 <= (L[4]? 
0 : Hi4) + tree[0]; + end + assign hi4[i] = Hi4; + end : genHi + + // Conclusive low part accumulation + if(1) begin : blkLo + // Adder Tree across all SIMD low contributions + localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); + uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + logic [ROOT_WIDTH-1:0] Lo4 = 0; + always_ff @(posedge clk) begin + if(rst) Lo4 <= 0; + else if(en) Lo4 <= tree[0]; + end + + if(i == 3) assign up4 = Lo4; + else assign lo4[i] = Lo4; + end : blkLo + + end + + // Stage #5: Resolve lane totals + logic signed [3:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) Res5 <= '{ default: 0 }; + else if(en) begin + Res5[3] <= up4 - hi4[2]; + Res5[2] <= $signed({ hi4[2], {(D[3] - D[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1]; + Res5[1] <= $signed({ hi4[1], {(D[2] - D[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0]; + Res5[0] <= $signed({ hi4[0], {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] }); + end + end + + // Output + for(genvar pe = PE_BEG; pe < PE_END; pe++) begin + assign p[pe] = Res5[pe - PE_BEG]; + end + + end : genPipes + +endmodule : mvu_4sx4u \ No newline at end of file From 8eefb535c3da6482f95465df05b8d3e1c610be21 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:33:31 +0100 Subject: [PATCH 121/235] [rtl mvu]: specialized compute core for > 4-bit weights and activations for DSP48 --- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 358 +++++++++++++++++++++++++++++ 1 file changed, 358 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv 
b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv new file mode 100644 index 0000000000..e06a92c8fa --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -0,0 +1,358 @@ +module mvu_8sx8u_dsp48 #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + bit FORCE_BEHAVIORAL = 0, + + localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights + input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // unsigned activations + + // Ouput + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + logic [1:5] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if(en) L <= { last, L[1:4] }; + end + assign vld = L[5]; + + // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism + localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets + + localparam int unsigned PIPE_COUNT = (PE+1)/2; + for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes + + localparam int unsigned PE_BEG = 2*c; + localparam int unsigned PE_END = PE < 2*(c+1)? 
PE : 2*(c+1); + + uwire [57:0] p3[SIMD]; + uwire signed [ 1:0] h3[SIMD]; + for(genvar s = 0; s < SIMD; s++) begin : genSIMD + + // Input Lane Assembly + uwire [23:0] bb = a[s]; + logic [33:0] aa; + logic [26:0] dd; + logic [ 1:0] xx; + if(1) begin : blkVectorize + uwire [WEIGHT_WIDTH-1:0] ww[PE_END - PE_BEG]; + for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin + assign ww[pe] = w[PE_BEG + pe][s]; + if(pe) begin +// assign xx[pe] = zero? 0 : ww[pe] * a[s]; + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[1]), + .O5(xx[0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end + end + always_comb begin + dd = '0; + aa = '0; + for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin + dd[D[pe] +: WEIGHT_WIDTH-1] = ww[pe]; + aa[D[pe] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; + end + end + end : blkVectorize + + uwire [57:0] pp; + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. + if (FORCE_BEHAVIORAL) begin : genBehav + // Stage #1: Input Refine + logic signed [23:0] B1 = 0; + always_ff @(posedge clk) begin + if(zero) B1 <= 0; + else if(en) B1 <= bb; + end + + logic signed [26:0] AD1 = 0; + always_ff @(posedge clk) begin + if(rst) AD1 <= 0; + else if(en) AD1 <= dd - aa; + end + + // Stage #2: Multiply + logic signed [50:0] M2 = 0; + always_ff @(posedge clk) begin + if(rst) M2 <= 0; + else if(en) M2 <= +// synthesis translate off + (B1 === '0) || (AD1 === '0)? 0 : +// synthesis translate on + B1 * AD1; + end + + // Stage #3: Accumulate + logic signed [57:0] P3 = 0; + always_ff @(posedge clk) begin + if(rst) P3 <= 0; + else if(en) P3 <= M2 + (L[3]? 
0 : P3); + end + + assign pp = P3; + end : genBehav + else begin : genDSP + DSP48E2 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND('0), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE48"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_48_96"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
+ .MASK('1), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_OPMODE_INVERTED(9'b00_010_01_01), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + .IS_RSTB_INVERTED('0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(1), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(0), // Pipeline stages for A (0-2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(1), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(0), // Pipeline 
stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(1) // Number of pipeline stages for P (0-1) + ) dsp ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(), // 58-bit output: Cascade output + + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN('x), // 58-bit input: P cascade + + // Control inputs: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .INMODE(5'b01100), // 5-bit input: INMODE control + .OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }), // 9-bit input: Operation mode + + // Data inputs: Data Ports + .A(aa), // 34-bit input: A data + .B(bb), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D(dd), // 27-bit input: D data + + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1('0), // 1-bit input: 
Clock enable for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(en), // 1-bit input: Clock enable for PREG + .RSTA('0), // 1-bit input: Reset for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + end : genDSP + + // External Canary Pipeline + logic [1:0] X1 = '{ default: 0 }; + logic [1:0] X2 = '{ default: 0 }; + logic [1:0] X3 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) begin + X1 <= '{ default: 0 }; + X2 <= '{ default: 0 }; + X3 <= '{ default: 0 }; + end + else if(en) begin + X1 <= xx; + X2 <= X1; + X3 <= X2 + (L[3]? 
2'h0 : pp[D[1]+:2]); + end + end + + // Derive actual cross-lane overflows + assign h3[s] = pp[D[1]+:2] - X3; + + assign p3[s] = pp; + + end : genSIMD + + // Stage #4: Cross-SIMD Reduction + + // Count leaves reachable from each node + localparam leave_load_t LEAVE_LOAD = init_leave_loads(); + + uwire signed [ACCU_WIDTH -1:0] up4; + uwire signed [ACCU_WIDTH -SINGLE_PROD_WIDTH:0] hi4; + uwire [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0] lo4; + for(genvar i = 0; i < 2; i++) begin + localparam int unsigned LO_WIDTH = D[i+1] - D[i]; + localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; + + // Conclusive high part accumulation + if(i == 0) begin : genHi + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) Hi4 <= (L[4]? 
0 : Hi4) + tree[0]; + end + assign hi4 = Hi4; + end : genHi + + // Conclusive low part accumulation + if(1) begin : blkLo + // Adder Tree across all SIMD low contributions + localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); + uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + logic [ROOT_WIDTH-1:0] Lo4 = 0; + always_ff @(posedge clk) begin + if(rst) Lo4 <= 0; + else if(en) Lo4 <= tree[0]; + end + + if(i == 1) assign up4 = Lo4; + else assign lo4 = Lo4; + end : blkLo + + end + + // Stage #5: Resolve lane totals + logic signed [1:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) Res5 <= '{ default: 0 }; + else if(en) begin + Res5[1] <= up4 - hi4; + Res5[0] <= $signed({ hi4, {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4 }); + end + end + + // Output + for(genvar pe = PE_BEG; pe < PE_END; pe++) begin + assign p[pe] = Res5[pe - PE_BEG]; + end + + end : genPipes + +endmodule : mvu_8sx8u_dsp48 \ No newline at end of file From e7109e75161774280b24e5884f6c9b9c17a07f7b Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 May 2023 10:34:23 +0100 Subject: [PATCH 122/235] [fpgadataflow transform]: initial specialize_to_rtl_layers-transform for MVU --- .../fpgadataflow/specialize_to_rtl_layers.py | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py new file mode 100644 index 0000000000..7d677ec216 --- /dev/null +++ 
b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py @@ -0,0 +1,105 @@ +# Copyright (c) 2023, AMD +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from qonnx.transformation.base import Transformation +from qonnx.custom_op.registry import getCustomOp +from qonnx.core.datatype import DataType +from onnx import helper +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.infer_datatypes import InferDataTypes +from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth + +class InferRTLMatrixVectorActivation(Transformation): + """Convert (HLS-based) MatrixVectorActivation layers to specialized RTL layers if supported.""" + + def __init__(self): + super().__init__() + + def _is_rtl_variant_compatible(self, n): + no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 + act_width_in_range = (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8) or (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0) + weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 + folding_supported = (getCustomOp(n).get_nodeattr("MH") % getCustomOp(n).get_nodeattr("PE") == 0) and (getCustomOp(n).get_nodeattr("MW") % getCustomOp(n).get_nodeattr("SIMD") == 0) + + if (no_activation and act_width_in_range and weight_width_in_range and folding_supported): + return True + else: + return False + + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "MatrixVectorActivation": + preferred_in_rtl = getCustomOp(n).get_nodeattr("impl") == "rtl" and getCustomOp(n).get_nodeattr("resType") == "dsp" + supported_in_rtl = self._is_rtl_variant_compatible(n) + if (preferred_in_rtl and supported_in_rtl): + mvau_input = n.input[0] + mvau_weight = n.input[1] + mvau_output = n.output[0] + inputDataType = getCustomOp(n).get_nodeattr("inputDataType") + weightDataType = getCustomOp(n).get_nodeattr("weightDataType") + outputDataType = 
getCustomOp(n).get_nodeattr("outputDataType") + numInputVectors = getCustomOp(n).get_nodeattr("numInputVectors") + mw = getCustomOp(n).get_nodeattr("MW") + mh = getCustomOp(n).get_nodeattr("MH") + simd = getCustomOp(n).get_nodeattr("SIMD") + pe = getCustomOp(n).get_nodeattr("PE") + mem_mode = getCustomOp(n).get_nodeattr("mem_mode") + + new_node = helper.make_node( + "MatrixVectorActivation_rtl", + [mvau_input, mvau_weight], + [mvau_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + inputDataType=inputDataType, + weightDataType=weightDataType, + outputDataType=outputDataType, + numInputVectors=numInputVectors, + mem_mode=mem_mode, + name=n.name + "_rtl", + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(n) + graph_modified=True + + if graph_modified: + model = model.transform(MinimizeAccumulatorWidth()) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + + return (model, graph_modified) \ No newline at end of file From 5a868d19e5955abdb894bf1e8b93d2d1f6f8410d Mon Sep 17 00:00:00 2001 From: Yaman Umuroglu Date: Tue, 9 May 2023 09:41:15 +0200 Subject: [PATCH 123/235] [rtl mvu] fixes for latest memstream + linting --- .../matrixvectoractivation_rtl.py | 136 ++++++++++-------- 1 file changed, 77 insertions(+), 59 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index 6b1c2f3be7..8fd261d395 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -27,7 +27,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import math -from shutil import copy import numpy as np import os import textwrap @@ -40,20 +39,18 @@ ) from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import ( npy_to_rtlsim_input, - numpy_to_hls_code, pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir try: from pyverilator import PyVerilator except ModuleNotFoundError: PyVerilator = None -from . import templates # ONNX i/o tensor shape assumptions for MatrixVectorActivation: # input 0 is the input tensor, shape (.., i_size) = (..., MW) @@ -69,7 +66,6 @@ class MatrixVectorActivation_rtl(HLSCustomOp): def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) - self.decoupled_wrapper = templates.decoupled_wrapper def get_nodeattr_types(self): my_attrs = { @@ -186,17 +182,24 @@ def verify_node(self): ) num_of_inputs = len(self.onnx_node.input) - if num_of_inputs!=2: - info_messages.append("RTL-based MatrixVectorActivation expects two inputs (weights and activation), but got {} inputs.".format(len(self.onnx_node.input))) + if num_of_inputs != 2: + info_messages.append( + "RTL-based MatrixVectorActivation expects two inputs " + "(weights and activation), but got {} inputs.".format( + len(self.onnx_node.input) + ) + ) mem_mode = self.get_nodeattr("mem_mode") if mem_mode != "decoupled": - info_messages.append("RTL-based MVAU supports only decoupled weights currently") + info_messages.append( + "RTL-based MVAU supports only decoupled weights currently" + ) return info_messages -# TODO: Add in replay_buffer estimation + # TODO: Add in replay_buffer estimation def uram_estimation(self): P = self.get_nodeattr("PE") Q = self.get_nodeattr("SIMD") @@ -218,7 +221,7 @@ def uram_estimation(self): depth_multiplier = math.ceil(omega / 4096) return width_multiplier * depth_multiplier -# TODO: Add in replay_buffer estimation + # 
TODO: Add in replay_buffer estimation def bram_estimation(self): """Calculates resource estimation for BRAM based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -259,7 +262,7 @@ def bram_estimation(self): else: return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36)) -# TODO: Add in replay_buffer estimation + # TODO: Add in replay_buffer estimation def bram_efficiency_estimation(self): wdt = self.get_weight_datatype() W = wdt.bitwidth() @@ -272,7 +275,7 @@ def bram_efficiency_estimation(self): bram16_est_capacity = bram16_est * 36 * 512 return wbits / bram16_est_capacity -# TODO: Add in replay_buffer estimation + # TODO: Add in replay_buffer estimation def uram_efficiency_estimation(self): """Function for URAM efficiency estimation: actual parameter storage needed divided by the allocated URAM storage (from estimation)""" @@ -287,7 +290,7 @@ def uram_efficiency_estimation(self): uram_est_capacity = uram_est * 72 * 4096 return wbits / uram_est_capacity -#TODO: FIX: worst case estimates since segmentlen is not known at this point? + # TODO: FIX: worst case estimates since segmentlen is not known at this point? def lut_estimation(self): """Calculates resource estimations for LUTs based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -328,13 +331,9 @@ def lut_estimation(self): acc_bits = W + A + np.ceil(math.log(MW, 2)) acc_luts = acc_bits - return int( - c0 - + c1 * (P * (mult_luts + addertree_luts + acc_luts)) - + c2 - ) + return int(c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2) -#TODO: FIX: worst case estimates since segmentlen is not known at this point? + # TODO: FIX: worst case estimates since segmentlen is not known at this point? 
def dsp_estimation(self): # multiplication P = self.get_nodeattr("PE") @@ -350,7 +349,7 @@ def dsp_estimation(self): mult_dsp = 0 return int(mult_dsp) -#TODO: FIX: worst case estimates since segmentlen is not known at this point + # TODO: FIX: worst case estimates since segmentlen is not known at this point def get_exp_cycles(self): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") @@ -359,7 +358,9 @@ def get_exp_cycles(self): mw = self.get_nodeattr("MW") # since mmv != 1 is not supported yet, we set mmv for now to 1 mmv = 1 - # Actual exp_cycles is probably slightly larger (say 3 cycles (DSP A/B, M, P - reg) + additional pipeline buffer cycles. Most probably <10) + # Actual exp_cycles is probably slightly larger (say 3 cycles + # (DSP A/B, M, P - reg) + additional pipeline buffer cycles. + # Most probably <10) exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv return int(exp_cycles) @@ -384,7 +385,9 @@ def get_output_datatype(self, ind=0): def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() - assert (i_bits<=9), "RTL-based MVAU only supports activations with bit-width up to 9-bits" + assert ( + i_bits <= 9 + ), "RTL-based MVAU only supports activations with bit-width up to 9-bits" in_width = i_bits * self.get_nodeattr("SIMD") return in_width @@ -402,7 +405,9 @@ def get_weightstream_width(self): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") wp = self.get_weight_datatype().bitwidth() - assert (wp <= 8), "RTL-based MVAU only supports weights with bit-width up to 8-bits" + assert ( + wp <= 8 + ), "RTL-based MVAU only supports weights with bit-width up to 8-bits" w_width = pe * simd * wp return w_width else: @@ -516,7 +521,8 @@ def minimize_accumulator_width(self, model): else: adt = DataType.get_smallest_possible(acc_max) # Note: we are interested in simply the width of the output dot product. 
- # Padding the actual output stream to a multiple of 8-bits is done in the RTL component + # Padding the actual output stream to a multiple of 8-bits is done in + # the RTL component self.set_nodeattr("accDataType", adt.name) # for no-activation nodes, output dt = acc dt self.set_nodeattr("outputDataType", adt.name) @@ -615,9 +621,7 @@ def generate_params(self, model, path): # and one for synthesis. this is because URAM-based weights always # need zero weights for synthesis, otherwise they get inferred # as BRAM - weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format( - code_gen_dir - ) + weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir) weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir) # sim weights are always the true weights self.make_weight_file( @@ -734,11 +738,11 @@ def code_generation_ipgen(self, model, fpgapart, clk): def ipgen_singlenode_code(self): """Normally: Builds the bash script for IP generation.""" - pass + pass def code_generation_cppsim(self, model): """Normally: Generates C++ code for simulation (cppsim).""" - pass + pass def compile_singlenode_code(self): pass @@ -803,19 +807,28 @@ def code_generation_ipi(self): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") sourcefiles = [ - os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), rtllib_dir + "mvu_axi.sv", rtllib_dir + "replay_buffer.sv", rtllib_dir + "mvu_4sx4u.sv", rtllib_dir + "mvu_8sx9.sv", - rtllib_dir + "mvu_8sx8u_dsp48.sv" + rtllib_dir + "mvu_8sx8u_dsp48.sv", ] for f in sourcefiles: cmd.append("add_files -norecurse %s" % (f)) - cmd.append("create_bd_cell -type hier -reference %s /%s/%s" % (self.get_nodeattr("gen_top_module"), self.onnx_node.name, self.onnx_node.name)) + cmd.append( + "create_bd_cell -type hier -reference %s /%s/%s" + % ( + 
self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + self.onnx_node.name, + ) + ) # instantiate a streamer and connect it to the HLS IP - strm_vlnv = "xilinx.com:user:memstream:1.0" + strm_vlnv = "amd.com:FINN:memstream:1.0" strm_inst = node_name + "_wstrm" cmd.append( "create_bd_cell -type ip -vlnv %s /%s/%s" @@ -849,11 +862,11 @@ def code_generation_ipi(self): % (node_name, strm_inst, node_name, node_name, sname) ) cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]" + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]" % (node_name, rst_name, node_name, strm_inst) ) cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]" + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]" % (node_name, clk_name, node_name, strm_inst) ) cmd.append( @@ -947,21 +960,25 @@ def derive_characteristic_fxns(self, period): ] super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) -# TODO: characterize max_clk and implement this function in look-up style + # TODO: characterize max_clk and implement this function in look-up style def _resolve_segment_len(self, clk): # Insert pipeline registers in the DSP chain to meet target clock frequency segmentlen = 0 return segmentlen def _resolve_impl_style(self, fpgapart): - # Based on target device and activation/weight-width, choose the supported RTL module + # Based on target device and activation/weight-width, choose the + # supported RTL module act_width = self.get_input_datatype(0).bitwidth() weight_width = self.get_input_datatype(1).bitwidth() - is_versal = fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpgapart[0:5] == "xqrvc" - if (act_width == 4 and weight_width == 4): + is_versal = ( + fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] + or fpgapart[0:5] == "xqrvc" + ) + if act_width == 4 and weight_width == 4: return "mvu_4sx4u" else: - if (is_versal): + if is_versal: return "mvu_8sx9_dsp58" else: 
return "mvu_8sx8u_dsp48" @@ -973,13 +990,17 @@ def generate_hdl(self, model, fpgapart, clk): template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk) # add general parameters to dictionary - code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [ + self.get_verilog_top_module_name() + ] # save top module name so we can refer to it after this node has been renamed # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) ram_style = self.get_nodeattr("ram_style") - assert (ram_style=="auto"), "Unrecognized ram_style for MatrixVectorActivation_rtl" + assert ( + ram_style == "auto" + ), "Unrecognized ram_style for MatrixVectorActivation_rtl" # apply code generation to template with open(template_path, "r") as f: @@ -1009,19 +1030,21 @@ def generate_hdl(self, model, fpgapart, clk): self.set_nodeattr("ip_path", code_gen_dir) def prepare_codegen_default(self, fpgapart, clk): - template_path = ( - os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v" - ) - + template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v" + code_gen_dict = {} code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))] code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))] code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))] - code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())] + code_gen_dict["$ACTIVATION_WIDTH$"] = [ + str(self.get_input_datatype(0).bitwidth()) + ] code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())] code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())] - code_gen_dict["$SIGNED_ACTIVATIONS$"] = [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] + code_gen_dict["$SIGNED_ACTIVATIONS$"] = ( + [str(1)] if (self.get_input_datatype(0).min() 
< 0) else [str(0)] + ) code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] code_gen_dict["$MVU_IMPL_STYLE$"] = [self._resolve_impl_style(fpgapart)] @@ -1035,15 +1058,10 @@ def prepare_rtlsim(self): if PyVerilator is None: raise ImportError("Installation of PyVerilator is required.") - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") # Path to (System-)Verilog files used by top-module & path to top-module - verilog_paths = [ - code_gen_dir, - os.environ["FINN_ROOT"] + "/finn-rtllib/mvu" - ] - verilog_files = [ - self.get_nodeattr("gen_top_module") + "_wrapper_sim.v" - ] + verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"] + verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"] # build the Verilator emu library sim = PyVerilator.build( @@ -1051,9 +1069,9 @@ def prepare_rtlsim(self): build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), verilog_path=verilog_paths, trace_depth=get_rtlsim_trace_depth(), - top_module_name=self.get_verilog_top_module_name() + top_module_name=self.get_verilog_top_module_name(), ) # save generated lib filename in attribute self.set_nodeattr("rtlsim_so", sim.lib._name) - - return sim \ No newline at end of file + + return sim From 4a9cfa1c7a17497578faad3f76c25b80c116ba58 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 11 May 2023 10:56:07 +0100 Subject: [PATCH 124/235] [rtl custom_op]: add support for external weights --- .../matrixvectoractivation_rtl.py | 67 ++++++++++--------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index 8fd261d395..162b5e2e16 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -192,9 +192,9 @@ def verify_node(self): mem_mode = 
self.get_nodeattr("mem_mode") - if mem_mode != "decoupled": + if mem_mode not in ["decoupled", "external"]: info_messages.append( - "RTL-based MVAU supports only decoupled weights currently" + "RTL-based MVAU supports only decoupled or external weights." ) return info_messages @@ -612,35 +612,20 @@ def generate_params(self, model, path): code_gen_dir = path # weights, if not external weights = model.get_initializer(self.onnx_node.input[1]) - if mem_mode == "decoupled": + if mem_mode in ["decoupled", "external"]: weight_filename_sim = "{}/weights.npy".format(code_gen_dir) # save decoupled weights for cppsim self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) - # Also save weights as Verilog .dat file - # note that we provide two different .dat files, one for synth - # and one for synthesis. this is because URAM-based weights always - # need zero weights for synthesis, otherwise they get inferred - # as BRAM - weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir) - weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir) - # sim weights are always the true weights - self.make_weight_file( - weights, "decoupled_verilog_dat", weight_filename_rtl_sim - ) - ram_style = self.get_nodeattr("ram_style") - if ram_style == "ultra": - # UltraRAM must have no memory initializer, or only zeroes - # otherwise BRAM will be inferred instead of URAM - # as a workaround we provide a zero-weight init here - synth_weights = np.zeros_like(weights, dtype=np.float32) - else: - synth_weights = weights - self.make_weight_file( - synth_weights, "decoupled_verilog_dat", weight_filename_rtl_synth - ) + if mem_mode == "decoupled": + # also save weights as Verilog .dat file + # This file will be ignored when synthesizing UltraScale memory. 
+ weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) + self.make_weight_file( + weights, "decoupled_verilog_dat", weight_filename_rtl + ) else: raise Exception( - """Please set mem_mode to "decoupled", + """Please set mem_mode to "const", "decoupled", or "external", currently no other parameter value is supported!""" ) @@ -695,7 +680,7 @@ def execute_node(self, context, graph): ) super().reset_rtlsim(sim) super().toggle_clk(sim) - if mem_mode == "external" or mem_mode == "decoupled": + if mem_mode in ["external", "decoupled"]: wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() wei = npy_to_rtlsim_input( @@ -903,9 +888,31 @@ def code_generation_ipi(self): # TODO calculate and pass in segment size here cmd.append("assign_bd_address") cmd.append("save_bd_design") - elif mem_mode == "const" or mem_mode == "external": - # base class impl sufficient for const/external modes - return super().code_generation_ipi() + elif mem_mode == "external": + # instantiate the RTL block + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + rtllib_dir + "mvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_8sx9.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append( + "create_bd_cell -type module -reference %s %s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + ) + ) + cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/in0_V]" % (self.onnx_node.name)) + cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/out_V]" % (self.onnx_node.name)) else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") return cmd From 8a9ac1af4d6c62e7c9557ab41992b84cf2c37ae1 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Thu, 11 May 2023 11:04:28 +0100 Subject: [PATCH 125/235] Specify clock and reset associations of bus interfaces. --- finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 4 +++- finn-rtllib/mvu/mvu_axi_wrapper.v | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v index 502a72d3f2..fb3c62a15a 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v @@ -49,8 +49,10 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter OUTPUT_LANES = PE, parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 )( - // Global Control + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *) input logic ap_clk, + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *) input logic ap_rst_n, // Weight Stream diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v index b79ba6bbd1..d8acaefcc7 100644 --- a/finn-rtllib/mvu/mvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_axi_wrapper.v @@ -50,8 +50,10 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter OUTPUT_LANES = PE, parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 )( - // Global Control + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *) input ap_clk, + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *) input ap_rst_n, // Weight Stream input [WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, From d9b90793bd54a5e112531c737fa7c60a51b21d34 Mon Sep 17 00:00:00 2001 From: Yaman Umuroglu Date: Mon, 15 May 2023 10:16:48 +0200 Subject: [PATCH 126/235] [rtlmvu] More fixes for memstream and param gen --- .../fpgadataflow/matrixvectoractivation_rtl.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git 
a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index 162b5e2e16..1791327e78 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -612,7 +612,11 @@ def generate_params(self, model, path): code_gen_dir = path # weights, if not external weights = model.get_initializer(self.onnx_node.input[1]) +<<<<<<< HEAD if mem_mode in ["decoupled", "external"]: +======= + if mem_mode == "decoupled" or mem_mode == "external": +>>>>>>> 72fe4c5b ([rtlmvu] More fixes for memstream and param gen) weight_filename_sim = "{}/weights.npy".format(code_gen_dir) # save decoupled weights for cppsim self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) @@ -821,22 +825,16 @@ def code_generation_ipi(self): ) cmd.append( "set_property -dict [list " - "CONFIG.NSTREAMS {1} " - "CONFIG.MEM_DEPTH {%d} " - "CONFIG.MEM_WIDTH {%d} " - "CONFIG.MEM_INIT {%s} " + "CONFIG.DEPTH {%d} " + "CONFIG.WIDTH {%d} " + "CONFIG.INIT_FILE {%s} " "CONFIG.RAM_STYLE {%s} " - "CONFIG.STRM0_DEPTH {%d} " - "CONFIG.STRM0_WIDTH {%d} " - "CONFIG.STRM0_OFFSET {0} " "] [get_bd_cells /%s/%s]" % ( self.calc_wmem(), self.get_weightstream_width_padded(), - self.get_nodeattr("code_gen_dir_ipgen") + "/", + self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat", self.get_nodeattr("ram_style"), - self.calc_wmem(), - self.get_weightstream_width_padded(), node_name, strm_inst, ) From a5f2a83897e33acb4b3e2231d9bfa534e56bb6b2 Mon Sep 17 00:00:00 2001 From: Yaman Umuroglu Date: Thu, 11 May 2023 23:49:10 +0200 Subject: [PATCH 127/235] [Build] apply config to only FIFO nodes in step_set_fifo_depths --- src/finn/builder/build_dataflow_steps.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 65ab2b0b93..d4af757491 100644 --- 
a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -53,6 +53,7 @@ from shutil import copy import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl import finn.transformation.streamline.absorb as absorb from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -123,7 +124,6 @@ ) from finn.util.pyverilator import verilator_fifosim from finn.util.test import execute_parent -import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl def verify_step( @@ -486,14 +486,13 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig): - """Convert layers implemented in HLS to an equivalent specialized RTL implementation if possible.""" - specialize_to_rtl_transforms = [ - to_rtl.InferRTLMatrixVectorActivation() - ] + """Convert layers implemented in HLS to an equivalent specialized RTL + implementation if possible.""" + specialize_to_rtl_transforms = [to_rtl.InferRTLMatrixVectorActivation()] for trn in specialize_to_rtl_transforms: model = model.transform(trn) return model - + def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig): """Tighten the weight and accumulator bit widths for each layer.""" @@ -594,7 +593,12 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) if cfg.folding_config_file is not None: - model = model.transform(ApplyConfig(cfg.folding_config_file)) + model = model.transform( + ApplyConfig( + cfg.folding_config_file, + node_filter=lambda x: x.op_type == "StreamingFIFO", + ) + ) # extract the final configuration and save it as json hw_attrs = [ From 
08cbdc59a95ed6281c3234c5e8b0b9d7327a2988 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 24 May 2023 07:58:41 +0100 Subject: [PATCH 128/235] Revised control interface attributes. --- finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 29 +++++++++++++------------- finn-rtllib/mvu/mvu_axi_wrapper.v | 8 ++++--- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v index fb3c62a15a..e15f77fbae 100644 --- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v @@ -50,25 +50,26 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 )( // Global Control - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *) - input logic ap_clk, - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output" *) - input logic ap_rst_n, + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, // Weight Stream - input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - input logic s_axis_weights_tvalid, - output logic s_axis_weights_tready, + input [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input s_axis_weights_tvalid, + output s_axis_weights_tready, // Input Stream - input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input logic s_axis_input_tvalid, - output logic s_axis_input_tready, + input [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input s_axis_input_tvalid, + output s_axis_input_tready, // Output Stream - output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, - output logic m_axis_output_tvalid, - input logic m_axis_output_tready + output [OUTPUT_STREAM_WIDTH_BA-1:0] 
m_axis_output_tdata, + output m_axis_output_tvalid, + input m_axis_output_tready ); mvu_8sx9_axi #( @@ -89,4 +90,4 @@ mvu_8sx9_axi #( .m_axis_output_tready(m_axis_output_tready) ); -endmodule : $MODULE_NAME_AXI_WRAPPER$ \ No newline at end of file +endmodule : $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v index d8acaefcc7..239c5bbacd 100644 --- a/finn-rtllib/mvu/mvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_axi_wrapper.v @@ -51,10 +51,12 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 )( // Global Control - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *) + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) input ap_clk, - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V" *) + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) input ap_rst_n, + // Weight Stream input [WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, input weights_V_TVALID, @@ -87,4 +89,4 @@ mvu_axi #( .m_axis_output_tready(out_V_TREADY) ); -endmodule : $MODULE_NAME_AXI_WRAPPER$ \ No newline at end of file +endmodule : $MODULE_NAME_AXI_WRAPPER$ From d058cc2a5c1ed71a2c2ea12034cfa921818381ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 24 May 2023 09:16:50 +0100 Subject: [PATCH 129/235] Mask device primitives from Verilator in favor of using behavioral code. 
--- finn-rtllib/mvu/mvu_4sx4u.sv | 38 ++++++++++++++++++++---------- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 38 ++++++++++++++++++++---------- finn-rtllib/mvu/mvu_8sx9.sv | 29 ++++++++++++++--------- 3 files changed, 68 insertions(+), 37 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 5993154355..21594e46ac 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -19,6 +19,12 @@ module mvu_4sx4u #( output logic vld, output logic signed [PE-1:0][ACCU_WIDTH-1:0] p ); + // Verilator always to use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; typedef int unsigned leave_load_t[2*SIMD-1]; function leave_load_t init_leave_loads(); @@ -59,17 +65,21 @@ module mvu_4sx4u #( for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin assign ww[pe] = w[PE_BEG + pe][s]; if(pe) begin -// assign xx[pe] = zero? 0 : ww[pe] * a[s]; - LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( - .O6(xx[pe][1]), - .O5(xx[pe][0]), - .I5(1'b1), - .I4(zero), - .I3(ww[pe][1]), - .I2(a[s][1]), - .I1(ww[pe][0]), - .I0(a[s][0]) - ); + if(BEHAVIORAL) assign xx[pe] = zero? 0 : ww[pe] * a[s]; +`ifndef VERILATOR + else begin + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[pe][1]), + .O5(xx[pe][0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end +`endif end end always_comb begin @@ -87,7 +97,7 @@ module mvu_4sx4u #( // Note: Since the product B * AD is computed, // rst can be only applied to AD and zero only to B // with the same effect as zeroing both. 
- if (FORCE_BEHAVIORAL) begin : genBehav + if (BEHAVIORAL) begin : genBehav // Stage #1: Input Refine logic signed [23:0] B1 = 0; always_ff @(posedge clk) begin @@ -121,6 +131,7 @@ module mvu_4sx4u #( assign pp = P3; end : genBehav +`ifndef VERILATOR else begin : genDSP DSP48E2 #( // Feature Control Attributes: Data Path Selection @@ -252,6 +263,7 @@ module mvu_4sx4u #( .RSTP(rst) // 1-bit input: Reset for PREG ); end : genDSP +`endif // External Canary Pipeline logic [1:0] X1[3:1] = '{ default: 0 }; @@ -356,4 +368,4 @@ module mvu_4sx4u #( end : genPipes -endmodule : mvu_4sx4u \ No newline at end of file +endmodule : mvu_4sx4u diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index e06a92c8fa..09db360b77 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -23,6 +23,12 @@ module mvu_8sx8u_dsp48 #( output logic vld, output logic signed [PE-1:0][ACCU_WIDTH-1:0] p ); + // Verilator always to use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; typedef int unsigned leave_load_t[2*SIMD-1]; function leave_load_t init_leave_loads(); @@ -63,17 +69,21 @@ module mvu_8sx8u_dsp48 #( for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin assign ww[pe] = w[PE_BEG + pe][s]; if(pe) begin -// assign xx[pe] = zero? 0 : ww[pe] * a[s]; - LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( - .O6(xx[1]), - .O5(xx[0]), - .I5(1'b1), - .I4(zero), - .I3(ww[pe][1]), - .I2(a[s][1]), - .I1(ww[pe][0]), - .I0(a[s][0]) - ); + if(BEHAVIORAL) assign xx[pe] = zero? 
0 : ww[pe] * a[s]; +`ifndef VERILATOR + else begin + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[1]), + .O5(xx[0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end +`endif end end always_comb begin @@ -91,7 +101,7 @@ module mvu_8sx8u_dsp48 #( // Note: Since the product B * AD is computed, // rst can be only applied to AD and zero only to B // with the same effect as zeroing both. - if (FORCE_BEHAVIORAL) begin : genBehav + if(BEHAVIORAL) begin : genBehav // Stage #1: Input Refine logic signed [23:0] B1 = 0; always_ff @(posedge clk) begin @@ -125,6 +135,7 @@ module mvu_8sx8u_dsp48 #( assign pp = P3; end : genBehav +`ifndef VERILATOR else begin : genDSP DSP48E2 #( // Feature Control Attributes: Data Path Selection @@ -256,6 +267,7 @@ module mvu_8sx8u_dsp48 #( .RSTP(rst) // 1-bit input: Reset for PREG ); end : genDSP +`endif // External Canary Pipeline logic [1:0] X1 = '{ default: 0 }; @@ -355,4 +367,4 @@ module mvu_8sx8u_dsp48 #( end : genPipes -endmodule : mvu_8sx8u_dsp48 \ No newline at end of file +endmodule : mvu_8sx8u_dsp48 diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index 2d1da26efb..f8e2ab3985 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -52,11 +52,17 @@ module mvu_8sx9 #( input logic zero, // ignore current inputs and force this partial product to zero input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // activations - + // Ouput output logic vld, output logic [PE-1:0][ACCU_WIDTH-1:0] p ); + // Verilator always to use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; //-------------------- Declare global signals --------------------\\ localparam int unsigned CHAINLEN = (SIMD+2)/3; @@ -75,7 +81,7 @@ module mvu_8sx9 #( L[1+MAX_PIPELINE_STAGES] <= last; L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES]; end - end + end 
assign vld = L[0]; //-------------------- Shift register for ZERO flag --------------------\\ @@ -87,7 +93,7 @@ module mvu_8sx9 #( else if(en) begin Z[0] <= zero; if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2]; - end + end end end; @@ -157,12 +163,12 @@ module mvu_8sx9 #( if (LAST) begin : genPOUT assign p[j] = pp[ACCU_WIDTH-1:0]; - end + end // Note: Since the product B * AD is computed, // rst can be only applied to AD and zero only to B // with the same effect as zeroing both. - if (FORCE_BEHAVIORAL) begin : genBehav + if(BEHAVIORAL) begin : genBehav // Stage #1: Input A/B logic signed [33:0] Areg [INTERNAL_PREGS]; always_ff @(posedge clk) begin @@ -233,7 +239,7 @@ module mvu_8sx9 #( assign pp = Preg; assign pcout[j][i] = pp; end : genBehav - +`ifndef VERILATOR else begin: genDSP DSP58 #( // Feature Control Attributes: Data Path Selection @@ -263,8 +269,8 @@ module mvu_8sx9 #( .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE - .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 - FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN + .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 + FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN 2'b01, // Y : M 2'b01 // X: M }), // Optional inversion for OPMODE @@ -325,7 +331,7 @@ module mvu_8sx9 #( INTERNAL_PREGS==2 ? 1'b0 : 1'b1, 2'b00, TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, - INTERNAL_PREGS==2 ? 1'b0 : 1'b1 + INTERNAL_PREGS==2 ? 
1'b0 : 1'b1 }), // 5-bit input: INMODE control .NEGATE('0), // 3-bit input: Negates the input of the multiplier .OPMODE({ @@ -365,7 +371,8 @@ module mvu_8sx9 #( .RSTP(PREG && rst) // 1-bit input: Reset for PREG ); end : genDSP - end : genDSPChain +`endif + end : genDSPChain end : genDSPPE - + endmodule : mvu_8sx9 From a66f38f2d06901fd27cf874701572268ea4793d6 Mon Sep 17 00:00:00 2001 From: Yaman Umuroglu Date: Thu, 11 May 2023 23:48:36 +0200 Subject: [PATCH 130/235] [Deps] update qonnx --- fetch-repos.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fetch-repos.sh b/fetch-repos.sh index e039ca9144..f1cf8754f2 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -27,7 +27,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -QONNX_COMMIT="20a34289cf2297d2b2bbbe75d6ac152ece86e3b4" +QONNX_COMMIT="bc36fd56bf1e4abfcf98cd76a001cad13d57baac" FINN_EXP_COMMIT="0aa7e1c44b20cf085b6fe42cff360f0a832afd2c" BREVITAS_COMMIT="c65f9c13dc124971f14739349531bbcda5c2a4aa" PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f" From 8f9bd04b3311e56da4684a58d4de868d61f342ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 24 May 2023 12:44:53 +0100 Subject: [PATCH 131/235] Adding folding hints. Impl selection by case statement. --- finn-rtllib/mvu/mvu_axi.sv | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv index e4a919ba88..a181f54ac5 100644 --- a/finn-rtllib/mvu/mvu_axi.sv +++ b/finn-rtllib/mvu/mvu_axi.sv @@ -29,6 +29,14 @@ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * @brief Matrix Vector Unit (MVU) AXI-lite interface wrapper. + * @details + * Folding hints: + * - 4-bit MVU: PE scaling should aim at a full multiple of 4. + * - 8-bit MVU - DSP48: PE scaling should aim at a full multiple of 2. 
+ * - 8-bit MVU - DSP58: SIMD scaling should aim at a full multiple of 3. + * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to + * impact critical paths more than PE scaling. PE scaling implies a + * bigger fanout on the input activations. *****************************************************************************/ module mvu_axi #( @@ -134,8 +142,9 @@ module mvu_axi #( uwire ovld; uwire [PE-1:0][ACCU_WIDTH-1:0] odat; typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - - if (MVU_IMPL_STYLE == "mvu_8sx9_dsp58") begin : genMVU8sx9 + + case(MVU_IMPL_STYLE) + "mvu_8sx9_dsp58": mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( @@ -143,26 +152,27 @@ module mvu_axi #( .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), .vld(ovld), .p(odat) ); - end - else if (MVU_IMPL_STYLE == "mvu_4sx4u") begin : genMVU4sx4u + + "mvu_4sx4u": mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), .vld(ovld), .p(odat) ); - end - else if (MVU_IMPL_STYLE == "mvu_8sx8u_dsp48") begin : genMVU8sx8u + + "mvu_8sx8u_dsp48": mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), .vld(ovld), .p(odat) ); - end - else initial begin - $error("Unrecognized MVU_IMPL_STYLE!"); + + default: initial begin + $error("Unrecognized MVU_IMPL_STYLE '%s'", MVU_IMPL_STYLE); $finish; end + endcase //-------------------- Output register slice --------------------\\ struct packed { @@ 
-185,7 +195,7 @@ module mvu_axi #( end end end - + struct packed { logic vld; logic [PE-1:0][ACCU_WIDTH-1:0] dat; @@ -196,10 +206,10 @@ module mvu_axi #( if(rst) B <= '{ default: 'x }; else begin if(b_load) B <= '{ vld: A.vld, dat: A.dat}; - end + end end assign m_axis_output_tvalid = B.vld; assign m_axis_output_tdata = B.dat; -endmodule : mvu_axi \ No newline at end of file +endmodule : mvu_axi From 9de5ed6f7b459f37bb127f0cd105e6f927d25611 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 24 May 2023 13:52:40 +0100 Subject: [PATCH 132/235] Fixed behavioral sideband prediction. --- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 09db360b77..bd1f813af6 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -69,7 +69,7 @@ module mvu_8sx8u_dsp48 #( for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin assign ww[pe] = w[PE_BEG + pe][s]; if(pe) begin - if(BEHAVIORAL) assign xx[pe] = zero? 0 : ww[pe] * a[s]; + if(BEHAVIORAL) assign xx = zero? 0 : ww[pe] * a[s]; `ifndef VERILATOR else begin LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( From 239759a6a4b8cb008aa9b80d52d15f53f77e5965 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 24 May 2023 15:49:19 +0100 Subject: [PATCH 133/235] [rtl mvu]: extension to allow selecting PE values that are not multiples of 4 --- finn-rtllib/mvu/mvu_4sx4u.sv | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 21594e46ac..111d651cf5 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -50,6 +50,7 @@ module mvu_4sx4u #( localparam int unsigned PE_BEG = 4*c; localparam int unsigned PE_END = PE < 4*(c+1)? 
PE : 4*(c+1); + localparam int unsigned PE_REM = 4*(c+1) - PE_END; uwire [57:0] p3[SIMD]; uwire signed [ 1:0] h3[SIMD][3]; @@ -65,12 +66,12 @@ module mvu_4sx4u #( for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin assign ww[pe] = w[PE_BEG + pe][s]; if(pe) begin - if(BEHAVIORAL) assign xx[pe] = zero? 0 : ww[pe] * a[s]; + if(BEHAVIORAL) assign xx[pe + PE_REM] = zero? 0 : ww[pe] * a[s]; `ifndef VERILATOR else begin LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( - .O6(xx[pe][1]), - .O5(xx[pe][0]), + .O6(xx[pe + PE_REM][1]), + .O5(xx[pe + PE_REM][0]), .I5(1'b1), .I4(zero), .I3(ww[pe][1]), @@ -86,8 +87,8 @@ module mvu_4sx4u #( dd = '0; aa = '0; for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin - dd[D[pe]+:3] = ww[pe]; - aa[D[pe]+ 3] = ww[pe][3]; + dd[D[pe + PE_REM]+:3] = ww[pe]; + aa[D[pe + PE_REM]+ 3] = ww[pe][3]; end end end : blkVectorize @@ -305,7 +306,7 @@ module mvu_4sx4u #( localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; // Conclusive high part accumulation - if(i < 3) begin : genHi + if(i >= PE_REM && i < 3) begin : genHi // Adder Tree across all SIMD high contributions, each from [-1:1] uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; @@ -323,9 +324,12 @@ module mvu_4sx4u #( end assign hi4[i] = Hi4; end : genHi + else begin : genHiZero + assign hi4[i] = '0; + end : genHiZero // Conclusive low part accumulation - if(1) begin : blkLo + if(i >= PE_REM) begin : blkLo // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; @@ -346,6 +350,9 @@ module mvu_4sx4u #( if(i == 3) assign up4 = Lo4; else assign lo4[i] = Lo4; end : blkLo + else begin : blkLoZero + assign lo4[i] = '0; + end : blkLoZero end @@ -363,7 +370,7 @@ module mvu_4sx4u #( // Output for(genvar pe = PE_BEG; pe < PE_END; pe++) begin - assign p[pe] = Res5[pe - PE_BEG]; + assign p[pe] = Res5[pe - PE_BEG + PE_REM]; end end 
: genPipes From 8d3247ccf7657aeb534147a5dd9511fa397d4eb2 Mon Sep 17 00:00:00 2001 From: Yaman Umuroglu Date: Wed, 24 May 2023 15:56:07 +0200 Subject: [PATCH 134/235] [rtlmvu] Avoid unintentional verilator metacomments --- finn-rtllib/mvu/mvu_4sx4u.sv | 2 +- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +- finn-rtllib/mvu/mvu_8sx9.sv | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 21594e46ac..9f101e8c29 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -19,7 +19,7 @@ module mvu_4sx4u #( output logic vld, output logic signed [PE-1:0][ACCU_WIDTH-1:0] p ); - // Verilator always to use behavioral code + // for verilator always use behavioral code localparam bit BEHAVIORAL = `ifdef VERILATOR 1 || diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index bd1f813af6..6b54e91b6a 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -23,7 +23,7 @@ module mvu_8sx8u_dsp48 #( output logic vld, output logic signed [PE-1:0][ACCU_WIDTH-1:0] p ); - // Verilator always to use behavioral code + // for verilator always use behavioral code localparam bit BEHAVIORAL = `ifdef VERILATOR 1 || diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index f8e2ab3985..a601066cfd 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -57,7 +57,7 @@ module mvu_8sx9 #( output logic vld, output logic [PE-1:0][ACCU_WIDTH-1:0] p ); - // Verilator always to use behavioral code + // for verilator always use behavioral code localparam bit BEHAVIORAL = `ifdef VERILATOR 1 || From c8663505dcd2c2eeb3ddad05d361f82be32040eb Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 24 May 2023 17:14:23 +0100 Subject: [PATCH 135/235] [rtl mvu]: extension to allow selecting PE values that are not multiples of 2 --- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 57 +++++++++++++++++------------- 1 file 
changed, 32 insertions(+), 25 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 6b54e91b6a..5cc3fa4c49 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -54,6 +54,7 @@ module mvu_8sx8u_dsp48 #( localparam int unsigned PE_BEG = 2*c; localparam int unsigned PE_END = PE < 2*(c+1)? PE : 2*(c+1); + localparam int unsigned PE_RES = 2*(c+1) - PE_END; uwire [57:0] p3[SIMD]; uwire signed [ 1:0] h3[SIMD]; @@ -90,8 +91,8 @@ module mvu_8sx8u_dsp48 #( dd = '0; aa = '0; for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin - dd[D[pe] +: WEIGHT_WIDTH-1] = ww[pe]; - aa[D[pe] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; + dd[D[pe + PE_RES] +: WEIGHT_WIDTH-1] = ww[pe]; + aa[D[pe + PE_RES] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; end end end : blkVectorize @@ -301,32 +302,35 @@ module mvu_8sx8u_dsp48 #( uwire signed [ACCU_WIDTH -1:0] up4; uwire signed [ACCU_WIDTH -SINGLE_PROD_WIDTH:0] hi4; uwire [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0] lo4; - for(genvar i = 0; i < 2; i++) begin - localparam int unsigned LO_WIDTH = D[i+1] - D[i]; - localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; - // Conclusive high part accumulation - if(i == 0) begin : genHi - // Adder Tree across all SIMD high contributions, each from [-1:1] - uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; - for(genvar n = 0; n < SIMD-1; n++) begin - // Sum truncated to actual maximum bit width at this node - uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = tree[2*n+1] + tree[2*n+2]; - assign tree[n] = s; - end + // Conclusive high part accumulation + if(PE_RES == 0) begin : genHi + localparam int unsigned HI_WIDTH = ACCU_WIDTH - D[1]; + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; + for(genvar n = 0; n < SIMD-1; n++) begin + 
// Sum truncated to actual maximum bit width at this node + uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end - // High Sideband Accumulation - logic signed [HI_WIDTH-1:0] Hi4 = 0; - always_ff @(posedge clk) begin - if(rst) Hi4 <= 0; - else if(en) Hi4 <= (L[4]? 0 : Hi4) + tree[0]; - end - assign hi4 = Hi4; - end : genHi + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) Hi4 <= (L[4]? 0 : Hi4) + tree[0]; + end + assign hi4 = Hi4; + end : genHi + else begin : genHiZero + assign hi4 = '0; + end : genHiZero + for(genvar i = 0; i < 2; i++) begin + localparam int unsigned LO_WIDTH = D[i+1] - D[i]; // Conclusive low part accumulation - if(1) begin : blkLo + if(i >= PE_RES) begin : blkLo // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; @@ -347,6 +351,9 @@ module mvu_8sx8u_dsp48 #( if(i == 1) assign up4 = Lo4; else assign lo4 = Lo4; end : blkLo + else begin : blkLoZero + assign lo4 = '0; + end : blkLoZero end @@ -362,7 +369,7 @@ module mvu_8sx8u_dsp48 #( // Output for(genvar pe = PE_BEG; pe < PE_END; pe++) begin - assign p[pe] = Res5[pe - PE_BEG]; + assign p[pe] = Res5[pe - PE_BEG + PE_RES]; end end : genPipes From fd1e038c643c05199b38320f8815f430e538d936 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 24 May 2023 17:21:56 +0100 Subject: [PATCH 136/235] [rtl mvu axi]: updated comments on folding hints --- finn-rtllib/mvu/mvu_axi.sv | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv index a181f54ac5..cef55949ed 100644 --- a/finn-rtllib/mvu/mvu_axi.sv +++ b/finn-rtllib/mvu/mvu_axi.sv @@ -31,12 +31,13 @@ * @brief Matrix Vector Unit (MVU) AXI-lite interface wrapper. 
* @details * Folding hints: - * - 4-bit MVU: PE scaling should aim at a full multiple of 4. - * - 8-bit MVU - DSP48: PE scaling should aim at a full multiple of 2. - * - 8-bit MVU - DSP58: SIMD scaling should aim at a full multiple of 3. + * - 4-bit MVU: PE scaling should divide MH. + * - 8-bit MVU - DSP48: PE scaling should divide MH. + * - 8-bit MVU - DSP58: SIMD scaling should aim at a full multiple of 3 and divide MW. * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to * impact critical paths more than PE scaling. PE scaling implies a * bigger fanout on the input activations. + * - Full unfolding along MH (PE=MH) results in no replay buffer instantiated *****************************************************************************/ module mvu_axi #( From f60d4c6fa105bd29689b93aafd880ec92c32358c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Jun 2023 11:48:26 +0100 Subject: [PATCH 137/235] [rtl custom op]: minor fixes to codegen --- .../fpgadataflow/matrixvectoractivation_rtl.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index 1791327e78..9f8130806b 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -612,11 +612,7 @@ def generate_params(self, model, path): code_gen_dir = path # weights, if not external weights = model.get_initializer(self.onnx_node.input[1]) -<<<<<<< HEAD - if mem_mode in ["decoupled", "external"]: -======= if mem_mode == "decoupled" or mem_mode == "external": ->>>>>>> 72fe4c5b ([rtlmvu] More fixes for memstream and param gen) weight_filename_sim = "{}/weights.npy".format(code_gen_dir) # save decoupled weights for cppsim self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) @@ -909,8 +905,6 @@ def code_generation_ipi(self): self.onnx_node.name, ) ) 
- cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/in0_V]" % (self.onnx_node.name)) - cmd.append("set_property CONFIG.FREQ_HZ 333333333.333333 [get_bd_intf_pins %s/out_V]" % (self.onnx_node.name)) else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") return cmd @@ -968,8 +962,7 @@ def derive_characteristic_fxns(self, period): # TODO: characterize max_clk and implement this function in look-up style def _resolve_segment_len(self, clk): # Insert pipeline registers in the DSP chain to meet target clock frequency - segmentlen = 0 - return segmentlen + return 4 # default to 4 for now def _resolve_impl_style(self, fpgapart): # Based on target device and activation/weight-width, choose the @@ -1002,11 +995,6 @@ def generate_hdl(self, model, fpgapart, clk): # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) - ram_style = self.get_nodeattr("ram_style") - assert ( - ram_style == "auto" - ), "Unrecognized ram_style for MatrixVectorActivation_rtl" - # apply code generation to template with open(template_path, "r") as f: template_wrapper = f.read() From a1ad304a42bf89b36d6507cf9f749a7a1a7d130a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Jun 2023 11:48:58 +0100 Subject: [PATCH 138/235] [specialize-to-rtl]: add ram_style and rt_writeable_weights support --- .../transformation/fpgadataflow/specialize_to_rtl_layers.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py index 7d677ec216..23b6e59abe 100644 --- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py @@ -74,6 +74,8 @@ def apply(self, model): simd = getCustomOp(n).get_nodeattr("SIMD") pe = getCustomOp(n).get_nodeattr("PE") mem_mode = 
getCustomOp(n).get_nodeattr("mem_mode") + ram_style = getCustomOp(n).get_nodeattr("ram_style") + runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights") new_node = helper.make_node( "MatrixVectorActivation_rtl", @@ -91,6 +93,8 @@ def apply(self, model): numInputVectors=numInputVectors, mem_mode=mem_mode, name=n.name + "_rtl", + ram_style=ram_style, + runtime_writeable_weights=runtime_writeable_weights ) graph.node.insert(node_ind, new_node) # remove old node From 2cbb68fe016ff7ea292ffa071741b352222d1a4c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Jun 2023 11:50:05 +0100 Subject: [PATCH 139/235] [rtllib]: change string type to parameter type due to Vivado error --- finn-rtllib/mvu/mvu_axi.sv | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv index cef55949ed..46167af95b 100644 --- a/finn-rtllib/mvu/mvu_axi.sv +++ b/finn-rtllib/mvu/mvu_axi.sv @@ -51,7 +51,7 @@ module mvu_axi #( bit SIGNED_ACTIVATIONS = 0, int unsigned SEGMENTLEN = 0, bit FORCE_BEHAVIORAL = 0, - string MVU_IMPL_STYLE, + parameter MVU_IMPL_STYLE, // string type causes error in Vivado localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, @@ -163,12 +163,11 @@ module mvu_axi #( "mvu_8sx8u_dsp48": mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), .vld(ovld), .p(odat) ); - default: initial begin $error("Unrecognized MVU_IMPL_STYLE '%s'", MVU_IMPL_STYLE); $finish; From 92eb0edba2d059b8b170ed7e6d8ac7a224c9208c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Jun 2023 11:51:40 +0100 Subject: [PATCH 
140/235] [rtllib]: renamed variable for consistency --- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 5cc3fa4c49..3cd9cef560 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -54,7 +54,7 @@ module mvu_8sx8u_dsp48 #( localparam int unsigned PE_BEG = 2*c; localparam int unsigned PE_END = PE < 2*(c+1)? PE : 2*(c+1); - localparam int unsigned PE_RES = 2*(c+1) - PE_END; + localparam int unsigned PE_REM = 2*(c+1) - PE_END; uwire [57:0] p3[SIMD]; uwire signed [ 1:0] h3[SIMD]; @@ -91,8 +91,8 @@ module mvu_8sx8u_dsp48 #( dd = '0; aa = '0; for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin - dd[D[pe + PE_RES] +: WEIGHT_WIDTH-1] = ww[pe]; - aa[D[pe + PE_RES] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; + dd[D[pe + PE_REM] +: WEIGHT_WIDTH-1] = ww[pe]; + aa[D[pe + PE_REM] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; end end end : blkVectorize @@ -304,7 +304,7 @@ module mvu_8sx8u_dsp48 #( uwire [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0] lo4; // Conclusive high part accumulation - if(PE_RES == 0) begin : genHi + if(PE_REM == 0) begin : genHi localparam int unsigned HI_WIDTH = ACCU_WIDTH - D[1]; // Adder Tree across all SIMD high contributions, each from [-1:1] uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; @@ -330,7 +330,7 @@ module mvu_8sx8u_dsp48 #( for(genvar i = 0; i < 2; i++) begin localparam int unsigned LO_WIDTH = D[i+1] - D[i]; // Conclusive low part accumulation - if(i >= PE_RES) begin : blkLo + if(i >= PE_REM) begin : blkLo // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; @@ -369,7 +369,7 @@ module mvu_8sx8u_dsp48 #( // Output for(genvar pe = PE_BEG; pe < PE_END; pe++) begin - assign p[pe] = Res5[pe - PE_BEG + PE_RES]; + assign p[pe] = Res5[pe - PE_BEG + PE_REM]; end end : 
genPipes From 471a221b975e549e462e7ff9488c65ad182fe278 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Fri, 2 Jun 2023 12:39:14 +0100 Subject: [PATCH 141/235] Fix improper blocking assignment & linting. --- finn-rtllib/mvu/tb/mvu_axi_tb.sv | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv index ef5fa7d682..b89b58f55b 100644 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -42,12 +42,12 @@ module mvu_axi_tb(); localparam int unsigned SEGMENTLEN = 2; localparam string MVU_IMPL_STYLE = "mvu_8sx8u_dsp48"; localparam bit FORCE_BEHAVIORAL = 1; - // Bit-width config + // Bit-width config localparam int unsigned ACTIVATION_WIDTH = 8; localparam int unsigned WEIGHT_WIDTH = 8; localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); localparam bit SIGNED_ACTIVATIONS = 0; - // Simulation constants + // Simulation constants localparam int unsigned NF = MH/PE; localparam int unsigned SF = MW/SIMD; localparam int unsigned NUM_OF_DSP = SIMD/3; @@ -57,7 +57,7 @@ module mvu_axi_tb(); localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; - // Generate clk and reset signal + // Generate clk and reset signal logic clk = 0; always #5ns clk = !clk; @@ -69,7 +69,7 @@ module mvu_axi_tb(); uwire ap_clk = clk; - // Generate activations + // Generate activations typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; typedef activation_t activation_vector_t[SF]; @@ -94,8 +94,8 @@ module mvu_axi_tb(); for (int i=0; i= 1; + do begin + activations.vld <= $urandom()%7 >= 1; @(posedge clk); end while (!(activations.vld === 1 && activations.rdy === 1)); end @@ -104,9 +104,9 @@ module mvu_axi_tb(); activations.dat <= 'x; end - // Generate weights + // Generate weights typedef 
logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; + typedef weight_t weight_matrix_t[NF][SF]; function weight_matrix_t init_WEIGHTS; automatic weight_matrix_t res; @@ -139,7 +139,7 @@ module mvu_axi_tb(); weights.dat <= 'x; end - // Function to compute golden output + // Function to compute golden output // a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0] // w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t; @@ -155,12 +155,12 @@ module mvu_axi_tb(); automatic output_vector_t res = '{default: 0}; for (int j = 0; j>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin + else begin $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); $stop; - end + end end - + NF_CNT += 1; end - $finish; + $finish; end // Instantiate DUT @@ -211,5 +211,5 @@ module mvu_axi_tb(); .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), .m_axis_output_tready(outputs.rdy) ); - + endmodule : mvu_axi_tb From 5c5dc09c98d4e1a07a7e4cae17ca358b197a57c8 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Jun 2023 13:35:04 +0100 Subject: [PATCH 142/235] [test rtl mvu]: modified/extended test cases --- tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py index 20a249bd08..3db7a718f5 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py @@ -86,13 +86,12 @@ def prepare_inputs(input_tensor): return {"inp": input_tensor} @pytest.mark.parametrize("mh", [16]) -@pytest.mark.parametrize("mw", [90]) 
-#@pytest.mark.parametrize("pe", [1, 2, 4, 8, 16]) -@pytest.mark.parametrize("pe", [16]) +@pytest.mark.parametrize("mw", [32]) +@pytest.mark.parametrize("pe", [1, 4, 16]) #@pytest.mark.parametrize("simd", [1, 30, 90]) -@pytest.mark.parametrize("simd", [90]) -@pytest.mark.parametrize("idt", [DataType["INT8"]]) -@pytest.mark.parametrize("wdt", [DataType["UINT4"]]) +@pytest.mark.parametrize("simd", [1, 4, 32]) +@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) +@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]]) #@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"]) @pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"]) @pytest.mark.parametrize("segmentlen", [1]) @@ -166,7 +165,3 @@ def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen): assert (output_mvau_hls == output_mvau_rtl).all() assert (output_mvau_hls.size > 0) - - -# python setup.py test --addopts "-k test_fpgadataflow_mvau_rtl" -# python setup.py test --addopts "-k test_fpgadataflow_fclayer_rtlsim" \ No newline at end of file From b4eb9b69a8a6920fdb3141752395e672f78479e3 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 30 Jun 2023 15:36:17 +0100 Subject: [PATCH 143/235] [rtl mvu]: updated DSP58 >4-bit variant to lift SIMD%3==0 restriction --- finn-rtllib/mvu/mvu_8sx9.sv | 103 +++++++++++++++++++++++------------- 1 file changed, 65 insertions(+), 38 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index a601066cfd..439fbc44f9 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -92,77 +92,95 @@ module mvu_8sx9 #( if (rst) Z <= '{default: 0}; else if(en) begin Z[0] <= zero; - if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-1] <= Z[0:MAX_PIPELINE_STAGES-2]; + if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3]; end end end; //-------------------- Buffer for input activations --------------------\\ 
localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; - typedef logic [2:0][ACTIVATION_WIDTH-1:0] a_buffer_t; for (genvar i=0; i1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3; if (EXTERNAL_PREGS > 0) begin : genExternalPregAct - a_buffer_t A [0:EXTERNAL_PREGS-1]; + logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0}; always_ff @(posedge clk) begin if (rst) A <= '{default: 0}; else if(en) begin - A[EXTERNAL_PREGS-1] <= a[3*i +: 3]; + A[EXTERNAL_PREGS-1] <= a[3*i +: LANES_OCCUPIED]; if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; end end - assign a_in_i[i][26:0] = SIGNED_ACTIVATIONS ? { {PAD_BITS_ACT{A[0][2][ACTIVATION_WIDTH-1]}}, A[0][2], {PAD_BITS_ACT{A[0][1][ACTIVATION_WIDTH-1]}}, A[0][1], {PAD_BITS_ACT{A[0][0][ACTIVATION_WIDTH-1]}}, A[0][0]} - : { {PAD_BITS_ACT{1'b0}}, A[0][2], {PAD_BITS_ACT{1'b0}}, A[0][1], {PAD_BITS_ACT{1'b0}}, A[0][0]} ; + for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3; if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight - b_buffer_t B [0:PE-1][0:EXTERNAL_PREGS-1]; + logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0}; always_ff @(posedge clk) begin if (rst) B <= '{default: 0}; else if (en) begin - B[j][EXTERNAL_PREGS-1] <= w[j][3*i +: 3]; - if (EXTERNAL_PREGS > 1) B[j][0:EXTERNAL_PREGS-2] <= B[j][1:EXTERNAL_PREGS-1]; + B[i][EXTERNAL_PREGS-1] <= w[i][3*j +: LANES_OCCUPIED]; + if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; end end - assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{B[j][0][2][WEIGHT_WIDTH-1]}}, B[j][0][2], {PAD_BITS_WEIGHT{B[j][0][1][WEIGHT_WIDTH-1]}}, B[j][0][1], {PAD_BITS_WEIGHT{B[j][0][0][WEIGHT_WIDTH-1]}}, B[j][0][0] }; + for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin + assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? 
B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] }; + end : genBin + for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero + assign b_in_i[i][j][8*k +: 8] = 8'b0; + end : genBinZero end : genExternalPregWeight else begin : genInpDSPWeight - assign b_in_i[j][i][23:0] = { {PAD_BITS_WEIGHT{w[j][3*i+2][WEIGHT_WIDTH-1]}}, w[j][3*i+2], {PAD_BITS_WEIGHT{w[j][3*i+1][WEIGHT_WIDTH-1]}}, w[j][3*i+1], {PAD_BITS_WEIGHT{w[j][3*i][WEIGHT_WIDTH-1]}}, w[j][3*i] }; + for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin + assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; + end : genBin + for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero + assign b_in_i[i][j][8*k +: 8] = 8'b0; + end : genBinZero end : genInpDSPWeight end : genWeightSIMD - end : genWeightPE //-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\ - for (genvar j=0; j0 ? 2 : 1; // 1 : 0 - localparam bit PREG = (i+1)%SEGLEN==0 || i == CHAINLEN-1; - localparam bit FIRST = i == 0; - localparam bit LAST = i == CHAINLEN-1; - uwire [57:0] pp; + localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1; + localparam bit FIRST = j == 0; + localparam bit LAST = j == CHAINLEN-1; if (LAST) begin : genPOUT - assign p[j] = pp[ACCU_WIDTH-1:0]; + assign p[i] = pcout[i][j][ACCU_WIDTH-1:0]; end // Note: Since the product B * AD is computed, @@ -174,7 +192,7 @@ module mvu_8sx9 #( always_ff @(posedge clk) begin if (rst) Areg <= '{ default : 0}; else if (en) begin - Areg[0] <= { 7'bx, a_in_i[i] }; + Areg[0] <= { 7'bx, a_in_i[j] }; if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0]; end end @@ -182,7 +200,7 @@ module mvu_8sx9 #( always_ff @(posedge clk) begin if (rst) Breg <= '{ default : 0}; else if (en) begin - Breg[0] <= b_in_i[j][i]; + Breg[0] <= b_in_i[i][j]; if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0]; end end @@ -217,27 +235,36 @@ module mvu_8sx9 #( end else assign Preg = Mreg; end - else if 
(LAST) begin : genLast + else if (FIRST && LAST) begin : genSingle + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg; + end + end + else if (!FIRST && LAST) begin : genLast always_ff @(posedge clk) begin if (rst) Opmode <= 0; else if (en) Opmode <= L[1]; end always_ff @(posedge clk) begin if (rst) Preg <= 0; - else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[j][i-1]; + else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1]; end end else begin : genMid if (PREG) begin : genPregBehav always_ff @(posedge clk) begin if (rst) Preg <= 0; - else if (en) Preg <= Mreg + pcout[j][i-1]; + else if (en) Preg <= Mreg + pcout[i][j-1]; end end - else assign Preg = Mreg + pcout[j][i-1]; + else assign Preg = Mreg + pcout[i][j-1]; end - assign pp = Preg; - assign pcout[j][i] = pp; + assign pcout[i][j] = Preg; end : genBehav `ifndef VERILATOR else begin: genDSP @@ -307,7 +334,7 @@ module mvu_8sx9 #( .BCOUT(), // 24-bit output: B cascade .CARRYCASCOUT(), // 1-bit output: Cascade carry .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade - .PCOUT(pcout[j][i]), // 58-bit output: Cascade output + .PCOUT(pcout[i][j]), // 58-bit output: Cascade output // Control outputs: Control Inputs/Status Bits .OVERFLOW(), // 1-bit output: Overflow in add/acc .PATTERNBDETECT(), // 1-bit output: Pattern bar detect @@ -322,7 +349,7 @@ module mvu_8sx9 #( .BCIN('x), // 24-bit input: B cascade .CARRYCASCIN('x), // 1-bit input: Cascade carry .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade - .PCIN(FIRST ? 'x : pcout[j][i-1]), // 58-bit input: P cascade + .PCIN(FIRST ? 
'x : pcout[i][j-1]), // 58-bit input: P cascade // Control inputs: Control Inputs/Status Bits .ALUMODE(4'h0), // 4-bit input: ALU control .CARRYINSEL('0), // 3-bit input: Carry select @@ -339,8 +366,8 @@ module mvu_8sx9 #( 7'b000_0000 }), // 9-bit input: Operation mode // Data inputs: Data Ports - .A({ 7'bx, a_in_i[i] }), // 34-bit input: A data - .B(b_in_i[j][i]), // 24-bit input: B data + .A({ 7'bx, a_in_i[j] }), // 34-bit input: A data + .B(b_in_i[i][j]), // 24-bit input: B data .C('x), // 58-bit input: C data .CARRYIN('0), // 1-bit input: Carry-in .D('x), // 27-bit input: D data From ad63673cda849ecf0df993bc83d00e676998ab03 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 30 Jun 2023 15:45:26 +0100 Subject: [PATCH 144/235] [rtl mvu]: bug fix for SIMD=1 init_leave_loads --- finn-rtllib/mvu/mvu_4sx4u.sv | 2 +- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 4674576d23..ac95b5f8a9 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -296,7 +296,7 @@ module mvu_4sx4u #( // Stage #4: Cross-SIMD Reduction // Count leaves reachable from each node - localparam leave_load_t LEAVE_LOAD = init_leave_loads(); + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop uwire signed [ACCU_WIDTH -1:0] up4; uwire signed [ACCU_WIDTH -8:0] hi4[3]; diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 3cd9cef560..416c12c1cc 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -297,7 +297,7 @@ module mvu_8sx8u_dsp48 #( // Stage #4: Cross-SIMD Reduction // Count leaves reachable from each node - localparam leave_load_t LEAVE_LOAD = init_leave_loads(); + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? 
init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop uwire signed [ACCU_WIDTH -1:0] up4; uwire signed [ACCU_WIDTH -SINGLE_PROD_WIDTH:0] hi4; From 79e8a5ef208f7bcdeafa231a5a3dff74177008c9 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 13 Jul 2023 18:34:05 +0100 Subject: [PATCH 145/235] [mvu rtl]: restrict index i to be less than 3 (within bounds of hi4) --- finn-rtllib/mvu/mvu_4sx4u.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index ac95b5f8a9..88985312c9 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -324,7 +324,7 @@ module mvu_4sx4u #( end assign hi4[i] = Hi4; end : genHi - else begin : genHiZero + else if (i < 3) begin : genHiZero assign hi4[i] = '0; end : genHiZero From e3493c30529949a77a3f384fd75c030c551cd2cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Fri, 2 Jun 2023 12:47:53 +0100 Subject: [PATCH 146/235] Rewrite replay_buffer for input elasticity. --- finn-rtllib/mvu/replay_buffer.sv | 153 ++++++++++++++++++------- finn-rtllib/mvu/tb/replay_buffer_tb.sv | 130 +++++++++++++++++++++ 2 files changed, 242 insertions(+), 41 deletions(-) create mode 100644 finn-rtllib/mvu/tb/replay_buffer_tb.sv diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv index 89bbbdb88f..3dfe72d6c6 100644 --- a/finn-rtllib/mvu/replay_buffer.sv +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -51,60 +51,131 @@ module replay_buffer #( input logic ordy ); - typedef logic [$clog2(REP)+$clog2(LEN)-1:0] count_t; - count_t Count = 0; - uwire done_len = LEN == 1 ? 1 : ((LEN-1) & ~Count[$clog2(LEN)-1:0]) == 0; - uwire done_rep; - uwire done_all = done_len && done_rep; + if(LEN == 0) initial begin + $error("%m: Illegal zero sequence LEN."); + $finish; + end + if(REP == 0) initial begin + $error("%m: Illegal zero REP count."); + $finish; + end + // Track position in Sequence + uwire last_item; uwire shift; - uwire clr = rst || (done_all && shift); - always_ff @(posedge clk) begin - if(clr) Count <= 0; - else if(shift) Count <= Count + ((REP > 1) && done_len? 2**$clog2(LEN)-LEN+1 : 1); + if(LEN == 1) assign last_item = 1; + else begin + typedef logic [$clog2(LEN)-1:0] count_t; + count_t Count = 0; + logic Last = 0; + always_ff @(posedge clk) begin + if(rst) begin + Count <= 0; + Last <= 0; + end + else if(shift) begin + Count <= Count + (Last? 2**$clog2(LEN)-LEN+1 : 1); + Last <= (((LEN-2) & ~Count) == 0) && ((LEN&1) || !Last); + end + end + assign last_item = Last; end - typedef logic [W-1:0] data_t; - uwire data_t rdat; - uwire first_rep; if(REP == 1) begin - assign done_rep = 1; - assign first_rep = 1; - assign rdat = 'x; + assign shift = ivld && ordy; + + assign irdy = ordy; + assign odat = idat; + assign olast = last_item; + assign ofin = last_item; + assign ovld = ivld; end else begin - assign done_rep = ((REP-1) & ~Count[$left(Count):$clog2(LEN)]) == 0; - logic FirstRep = 1; + // Track Repetitions + uwire last_rep; + if(1) begin : blkRep + typedef logic [$clog2(REP)-1:0] rep_t; + rep_t RepCnt = 0; + logic RepLst = 0; + always_ff @(posedge clk) begin + if(rst) begin + RepCnt <= 0; + RepLst <= 0; + end + else if(last_item && shift) begin + RepCnt <= RepCnt + (RepLst? 
2**$clog2(REP)-REP+1 : 1); + RepLst <= (((REP-2) & ~RepCnt) == 0) && ((REP&1) || !RepLst); + end + end + assign last_rep = RepLst; + end : blkRep + + localparam int unsigned AWIDTH = $clog2(LEN); + typedef logic [AWIDTH :0] ptr_t; // pointers with additional generational MSB + typedef logic [W -1:0] data_t; + + // Output Registers + data_t ODat; + logic OVld = 0; + logic OLst = 'x; + logic OFin = 'x; + assign odat = ODat; + assign olast = OLst; + assign ofin = OFin; + assign ovld = OVld; + + // Buffer Memory Management + data_t Mem[2**AWIDTH]; + ptr_t WP = 0; // Write Pointer + ptr_t RP = 0; // Read Pointer + ptr_t FP = 0; // Free Pointer + + // Operational Guards + // Occupancy: WP-FP + // WP-FP < 2**AWIDTH -> writing allowed + // - increments WP + // Availability: WP-RP + // WP-RP > 0 -> reading allowed + // - increments RP, last in sequence rewinds to FP for non-final repetition + // - increments FP in last repetition + assign irdy = !((WP-FP) >> AWIDTH); + + uwire wr = irdy && ivld; + uwire rd = !OVld || ordy; always_ff @(posedge clk) begin - if(clr) FirstRep <= 1; - else if(shift) FirstRep <= FirstRep && !done_len; + if(wr) Mem[WP[AWIDTH-1:0]] <= idat; + if(rd) ODat <= Mem[RP[AWIDTH-1:0]]; end - assign first_rep = FirstRep; - data_t Buf[LEN]; - if(LEN == 1) begin : genTrivial - always_ff @(posedge clk) begin - if(shift && FirstRep) Buf[0] <= idat; + uwire vld = (RP != WP); + assign shift = rd && vld; + always_ff @(posedge clk) begin + if(rst) begin + WP <= 0; + RP <= 0; + FP <= 0; + + OVld <= 0; + OLst <= 'x; + OFin <= 'x; end - end : genTrivial - else begin : genShift - always_ff @(posedge clk) begin - if(shift) begin - Buf[0] <= odat; - Buf[1:LEN-1] <= Buf[0:LEN-2]; + else begin + if(wr) WP <= WP + 1; + if(rd) begin + if(vld) begin + automatic logic rewind = last_item && !last_rep; + RP <= RP + (rewind? 
2**(AWIDTH+1)-LEN+1 : 1); + FP <= FP + last_rep; + end + + OVld <= vld; + OLst <= last_item; + OFin <= last_rep && last_item; end end - end : genShift + end - assign rdat = Buf[LEN-1]; end - assign irdy = ordy && first_rep; - assign odat = first_rep? idat : rdat; - assign olast = done_len; - assign ofin = done_all; - assign ovld = first_rep? ivld : 1; - assign shift = ovld && ordy; - -endmodule : replay_buffer \ No newline at end of file +endmodule : replay_buffer diff --git a/finn-rtllib/mvu/tb/replay_buffer_tb.sv b/finn-rtllib/mvu/tb/replay_buffer_tb.sv new file mode 100644 index 0000000000..5581354e0e --- /dev/null +++ b/finn-rtllib/mvu/tb/replay_buffer_tb.sv @@ -0,0 +1,130 @@ +/****************************************************************************** + * Copyright (C) 2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for replay_buffer module. + * @author Thomas B. Preußer + *****************************************************************************/ + +module replay_buffer_tb; + + // Global Control + logic clk = 0; + always #5ns clk = !clk; + uwire rst = 0; + + // DUT Geometries + localparam int unsigned DIMS[3] = '{ 7, 8, 10 }; + localparam int unsigned W = 8; + typedef logic [W-1:0] data_t; + + bit [2**$size(DIMS)-1:0] done = 0; + always_comb begin + if(&done) begin + $display("Test completed."); + $finish; + end + end + + // Parallel DUT Instantiations + for(genvar r = 0; r < $size(DIMS); r++) begin + for(genvar l = 0; l < $size(DIMS); l++) begin + localparam int unsigned REP = DIMS[r]; + localparam int unsigned LEN = DIMS[l]; + + data_t idat; + logic ivld; + uwire irdy; + + uwire data_t odat; + uwire olast; + uwire ofin; + uwire ovld; + logic ordy; + + replay_buffer #(.LEN(LEN), .REP(REP), .W(W)) dut ( + .clk, .rst, + .idat, .ivld, .irdy, + .odat, .olast, .ofin, .ovld, .ordy + ); + + // Input Feed: 0, 1, ..., 10*LEN-1 + initial begin + idat = 'x; + ivld = 0; + @(posedge clk iff !rst); + + for(int unsigned i = 0; i < 10*LEN; i++) begin + idat <= i; + ivld <= 1; + @(posedge clk iff irdy); + idat <= 'x; + ivld <= 0; + while($urandom()%(REP-1) != 0) @(posedge clk); + end + end + + // Output Check + initial begin + automatic int unsigned base = 0; + + ordy = 0; + @(posedge clk iff !rst); + + for(int unsigned k = 0; k < 
10; k++) begin + for(int unsigned j = 0; j < REP; j++) begin + for(int unsigned i = 0; i < LEN; i++) begin + ordy <= 1; + @(posedge clk iff ovld); + assert(odat == base+i) else begin + $error("#%0d.%0d: Data mismatch: %0d instead of %0d.", r, l, odat, base+i); + $stop; + end + assert(olast == (i == LEN-1)) else begin + $error("#%0d.%0d: Last mismatch.", r, l); + $stop; + end + assert(ofin == ((i == LEN-1) && (j == REP-1))) else begin + $error("#%0d.%0d: Fin mismatch.", r, l); + $stop; + end + + ordy <= 0; + while($urandom()%13 == 0) @(posedge clk); + end + end + base += LEN; + end + + done[$size(DIMS)*r + l] <= 1; + end + end + end + +endmodule : replay_buffer_tb From 2efba6854267873c84d58f6d8fe6b64f649eaa99 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 5 Sep 2023 13:53:01 +0100 Subject: [PATCH 147/235] [to-rtl]: Infer unique node names after transformation is applied --- .../transformation/fpgadataflow/specialize_to_rtl_layers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py index 23b6e59abe..47ed5ce863 100644 --- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py @@ -32,6 +32,7 @@ from onnx import helper from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.general import GiveUniqueNodeNames from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth class InferRTLMatrixVectorActivation(Transformation): @@ -105,5 +106,6 @@ def apply(self, model): model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) + model = model.transform(GiveUniqueNodeNames()) return (model, graph_modified) \ No newline at end of file From 
114ea1bfed2dd2f14196f98aea97d6cac9d1d57e Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 18 Sep 2023 14:56:07 +0100 Subject: [PATCH 148/235] [mvu rtl]: add synthesis directive to handle 'X in simulation --- finn-rtllib/mvu/mvu_8sx9.sv | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index 439fbc44f9..34aa856b1b 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -110,13 +110,17 @@ module mvu_8sx9 #( always_ff @(posedge clk) begin if (rst) A <= '{default: 0}; else if(en) begin - A[EXTERNAL_PREGS-1] <= a[3*i +: LANES_OCCUPIED]; + A[EXTERNAL_PREGS-1] <= +// synthesis translate_off + zero ? '1 : +// synthesis translate_on + a[3*i +: LANES_OCCUPIED]; if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; end end for (genvar j=0; j 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; end end @@ -161,7 +173,11 @@ module mvu_8sx9 #( end : genExternalPregWeight else begin : genInpDSPWeight for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin - assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; + assign b_in_i[i][j][8*k +: 8] = +// synthesis translate_off + zero ? '1 : +// synthesis translate_on + PAD_BITS_WEIGHT == 0 ? 
w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; end : genBin for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero assign b_in_i[i][j][8*k +: 8] = 8'b0; @@ -178,9 +194,10 @@ module mvu_8sx9 #( localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1; localparam bit FIRST = j == 0; localparam bit LAST = j == CHAINLEN-1; + uwire [57:0] pp; if (LAST) begin : genPOUT - assign p[i] = pcout[i][j][ACCU_WIDTH-1:0]; + assign p[i] = pp[ACCU_WIDTH-1:0]; end // Note: Since the product B * AD is computed, @@ -264,6 +281,7 @@ module mvu_8sx9 #( end else assign Preg = Mreg + pcout[i][j-1]; end + assign pp = Preg; assign pcout[i][j] = Preg; end : genBehav `ifndef VERILATOR From 79fafdb25a8707f740a0a7e21aa4f55ef7101882 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 18 Sep 2023 15:06:36 +0100 Subject: [PATCH 149/235] [replay buffer rtl]: minor fix to when LEN=1 (= AWIDTH=0) --- finn-rtllib/mvu/replay_buffer.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv index 3dfe72d6c6..942f1823ca 100644 --- a/finn-rtllib/mvu/replay_buffer.sv +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -144,8 +144,8 @@ module replay_buffer #( uwire wr = irdy && ivld; uwire rd = !OVld || ordy; always_ff @(posedge clk) begin - if(wr) Mem[WP[AWIDTH-1:0]] <= idat; - if(rd) ODat <= Mem[RP[AWIDTH-1:0]]; + if(wr) Mem[WP[AWIDTH:0]] <= idat; + if(rd) ODat <= Mem[RP[AWIDTH:0]]; end uwire vld = (RP != WP); From 619d9db0d5872d1afd72b1d1df841e1f87a9f33a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 18 Sep 2023 15:09:45 +0100 Subject: [PATCH 150/235] [mvu lut]: LUT-based MVU compute core --- finn-rtllib/mvu/mvu_lut.sv | 102 +++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_lut.sv diff --git a/finn-rtllib/mvu/mvu_lut.sv b/finn-rtllib/mvu/mvu_lut.sv new file mode 100644 index 0000000000..b100a589e8 --- /dev/null +++ 
b/finn-rtllib/mvu/mvu_lut.sv @@ -0,0 +1,102 @@ +module mvu_lut #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + bit SIGNED_ACTIVATIONS, + bit M_REG = 1, + + localparam unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights + input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // (un)signed activations + + // Ouput + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + uwire last_i; + generate if (M_REG) begin + logic [0:1] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if (en) L <= {last, L[0]}; + end + assign last_i = L[1]; + end + else begin + logic L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if (en) L <= last; + end + assign last_i = L; + end + endgenerate + + // For each PE generate + for (genvar i = 0; i < PE; i++) begin : genPE + // Stage #1: SIMD multipliers in parallel + uwire [MULT_WIDTH-1 : 0] m1 [SIMD]; + for (genvar j = 0; j < SIMD; j++) begin : genSIMD + if (M_REG) begin : genMreg + logic [MULT_WIDTH-1 : 0] M [SIMD]; + always_ff @(posedge clk) begin + if(rst) M[j] = '{ default : 0 }; + else if (en) M[j] = zero ? 0 : + SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) : + $signed({1'b0, a[j]}) * $signed(w[i][j]); + // (SIGNED_ACTIVATIONS ? 
$signed(a[j]) : a[j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication + end + assign m1[j] = M[j]; + end : genMreg + else begin : genNoMreg + assign m1[j] = zero ? 0 : + SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) : + $signed({1'b0, a[j]}) * $signed(w[i][j]); + end : genNoMreg + end : genSIMD + + // Stage #2: Adder tree to reduce SIMD products + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 }; + localparam int unsigned ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1)); + uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = $signed(m1[s]); + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1)); + uwire signed [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + // Stage #3: Buffer output + logic [ACCU_WIDTH-1:0] P2 [PE]; + always_ff @(posedge clk) begin + if(rst) P2[i] = '{ default : 0}; + else if (en) P2[i] = (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]); + end + + assign vld = last_i; + assign p[i] = P2[i]; + end : genPE + +endmodule : mvu_lut From 090f2ac4adf4b0523b23b27fce05f7422269d72a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 19 Sep 2023 12:23:55 +0100 Subject: [PATCH 151/235] [custom op]: add preferred_backend attribute --- src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 73d39ce642..4f24d71ccc 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -123,7 +123,7 @@ def get_nodeattr_types(self): # weight data from the weight FIFOs. 
"runtime_writeable_weights": ("i", False, 0, {0, 1}), # Flag to specify whether RTL-based or HLS-based implementation is preferred - "impl": ("s", False, "rtl", {"hls", "rtl"}) + "preferred_backend": ("s", False, "rtl", {"hls", "rtl"}) } my_attrs.update(super().get_nodeattr_types()) return my_attrs From ac5e82d9944f5b7475eb13546affd1bc03d57f4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Thu, 21 Sep 2023 13:03:27 +0100 Subject: [PATCH 152/235] Ensure a minimum of two buffer slots even for length-1 sequences. --- finn-rtllib/mvu/replay_buffer.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv index 942f1823ca..d4342f705c 100644 --- a/finn-rtllib/mvu/replay_buffer.sv +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -111,7 +111,7 @@ module replay_buffer #( assign last_rep = RepLst; end : blkRep - localparam int unsigned AWIDTH = $clog2(LEN); + localparam int unsigned AWIDTH = LEN < 2? 1 : $clog2(LEN); typedef logic [AWIDTH :0] ptr_t; // pointers with additional generational MSB typedef logic [W -1:0] data_t; From 85156935163fc803d453db5ce2c1c5163808bc9f Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 15:07:12 +0100 Subject: [PATCH 153/235] [rtl mvu wrapper]: support for vvu layer and rename --- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 92 +++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v new file mode 100644 index 0000000000..6dbf82cb7b --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -0,0 +1,92 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Verilog AXI-lite wrapper for MVU. 
+ *****************************************************************************/ + +module $MODULE_NAME_AXI_WRAPPER$ #( + parameter IS_MVU = "$IS_MVU$", + parameter COMPUTE_CORE = "$COMPUTE_CORE$", + parameter MW = $MW$, + parameter MH = $MH$, + parameter PE = $PE$, + parameter SIMD = $SIMD$, + parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, + parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, + parameter ACCU_WIDTH = $ACCU_WIDTH$, + parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, + parameter SEGMENTLEN = $SEGMENTLEN$, + parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, + + // Safely deducible parameters + parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, + parameter OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 +)( + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + + // Weight Stream + input [WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, + input weights_V_TVALID, + output weights_V_TREADY, + // Input Stream + input [INPUT_STREAM_WIDTH_BA-1:0] in0_V_TDATA, + input in0_V_TVALID, + output in0_V_TREADY, + // Output Stream + output [OUTPUT_STREAM_WIDTH_BA-1:0] out_V_TDATA, + output out_V_TVALID, + input out_V_TREADY +); + +mvu_vvu_axi #( + .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) + ) inst ( + .ap_clk(ap_clk), + .ap_rst_n(ap_rst_n), + .s_axis_weights_tdata(weights_V_TDATA), + .s_axis_weights_tvalid(weights_V_TVALID), + .s_axis_weights_tready(weights_V_TREADY), + .s_axis_input_tdata(in0_V_TDATA), 
+ .s_axis_input_tvalid(in0_V_TVALID), + .s_axis_input_tready(in0_V_TREADY), + .m_axis_output_tdata(out_V_TDATA), + .m_axis_output_tvalid(out_V_TVALID), + .m_axis_output_tready(out_V_TREADY) +); + +endmodule // $MODULE_NAME_AXI_WRAPPER$ From cf28d780041fec1effdf743e62390eebc5c81f98 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:24:18 +0100 Subject: [PATCH 154/235] [mvu vvu tb]: modified testbench to also support testing VVU on DSP58 --- finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv | 222 +++++++++++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv diff --git a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv new file mode 100644 index 0000000000..82c2e8e7b0 --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv @@ -0,0 +1,222 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU AXI-lite interface wrapper. + *****************************************************************************/ + +module mvu_vvu_axi_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam bit IS_MVU = 1; + localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58"; + localparam int unsigned MW = 1500; + localparam int unsigned MH = 256; + localparam int unsigned SIMD = 60; + localparam int unsigned PE = 16; + localparam int unsigned SEGMENTLEN = 2.0; + localparam bit FORCE_BEHAVIORAL = 1; + localparam bit M_REG_LUT = 1; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 4; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = 21; // == ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW) + localparam bit SIGNED_ACTIVATIONS = 0; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = ((IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - (IS_MVU ? 
1 : PE)*SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n = 0; + initial begin + repeat(16) @(posedge clk); + ap_rst_n <= 1; + end + + uwire ap_clk = clk; + + // Generate activations + typedef logic [(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[(IS_MVU ? 1 : NF)*SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } activations; + + initial begin + activations.vld = 0; + activations.dat = 'X; + @(posedge clk iff ap_rst_n); + + for (int j=0; j<(IS_MVU ? 1 : NF); j++) begin + for (int i=0; i= 0; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = 'X; + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i 1 ? $signed(a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]) : $signed(a[j/PE*SF+i/SIMD][i%SIMD]) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]); + else + res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : + $signed(res[j/PE][j%PE]) + ( PE > 1 ? 
$signed({1'b0, a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]}) : $signed({1'b0, a[j/PE*SF+i/SIMD][i%SIMD]}) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]); + end + end + return res; + endfunction : check_output; + + output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS); + + int unsigned NF_CNT = 0; + initial begin + outputs.rdy = 0; + while (NF_CNT < NF) begin + // Loop until both rdy & vld are asserted + do begin + outputs.rdy <= $urandom()%7 >= 0; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_vvu_axi #( + .IS_MVU(IS_MVU), + .COMPUTE_CORE(COMPUTE_CORE), + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), + .M_REG_LUT(M_REG_LUT) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + +endmodule : mvu_vvu_axi_tb From 2617c391e1d2c9b19fb881acb6012fc56df35eae Mon Sep 17
00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:25:22 +0100 Subject: [PATCH 155/235] [axi wrapper]: minor modification to comment description --- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index 6dbf82cb7b..788e49a71b 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -28,7 +28,7 @@ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * @brief Verilog AXI-lite wrapper for MVU. + * @brief Verilog AXI-lite wrapper for MVU & VVU. *****************************************************************************/ module $MODULE_NAME_AXI_WRAPPER$ #( From 8ca5fe73c003aec3e7998d83e233102c012dd531 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:34:12 +0100 Subject: [PATCH 156/235] [mvu axi]: add support for VVU on DSP58 --- finn-rtllib/mvu/mvu_axi.sv | 105 ++++++++++++++++++++++++------------- 1 file changed, 69 insertions(+), 36 deletions(-) diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv index 46167af95b..07ad32e6c8 100644 --- a/finn-rtllib/mvu/mvu_axi.sv +++ b/finn-rtllib/mvu/mvu_axi.sv @@ -28,19 +28,25 @@ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * @brief Matrix Vector Unit (MVU) AXI-lite interface wrapper. + * @brief Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper. * @details + * The following compute cores are supported: + * - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, + * (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP, + * [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP, + * 'unconstrained' LUT-based MVU and VVU. * Folding hints: - * - 4-bit MVU: PE scaling should divide MH. - * - 8-bit MVU - DSP48: PE scaling should divide MH. 
- * - 8-bit MVU - DSP58: SIMD scaling should aim at a full multiple of 3 and divide MW. + * - PE scaling should divide MH. + * - SIMD scaling should divide MW. * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to * impact critical paths more than PE scaling. PE scaling implies a * bigger fanout on the input activations. * - Full unfolding along MH (PE=MH) results in no replay buffer instantiated *****************************************************************************/ -module mvu_axi #( +module mvu_vvu_axi #( + bit IS_MVU, // string type causes error in Vivado + parameter COMPUTE_CORE, int unsigned MW, int unsigned MH, int unsigned PE, @@ -51,16 +57,16 @@ module mvu_axi #( bit SIGNED_ACTIVATIONS = 0, int unsigned SEGMENTLEN = 0, bit FORCE_BEHAVIORAL = 0, - parameter MVU_IMPL_STYLE, // string type causes error in Vivado + bit M_REG_LUT = 1, + // Safely deducible parameters localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH, localparam int unsigned SF = MW/SIMD, localparam int unsigned NF = MH/PE, - localparam int unsigned OUTPUT_LANES = PE, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 ) ( // Global Control @@ -93,27 +99,31 @@ module mvu_axi #( $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); $finish; end - if (ACTIVATION_WIDTH > 9) begin - $error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH); - $finish; - end if (WEIGHT_WIDTH > 8) begin $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); $finish; end - if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin - $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH); - $finish; + if (ACTIVATION_WIDTH > 8) begin + if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin + $error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH); + $finish; + end end - if (MVU_IMPL_STYLE == "mvu_8sx9") begin + if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin if (SEGMENTLEN == 0) begin - $warning("Segment length of %0d defaults to chain length", SEGMENTLEN); + $warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3); end if (SEGMENTLEN > (SIMD+2)/3) begin $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); $finish; end end + if (!IS_MVU) begin + if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin + $error("VVU only supported on DSP58 or LUT-based implementation"); + $finish; + end + end end uwire clk = ap_clk; @@ -127,10 +137,10 @@ module mvu_axi #( uwire avld; uwire ardy; - replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay ( - .clk, .rst, - 
.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), - .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) + replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay ( + .clk, .rst, + .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), + .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) ); //-------------------- Input control --------------------\\ @@ -139,37 +149,60 @@ module mvu_axi #( assign ardy = en && s_axis_weights_tvalid; assign s_axis_weights_tready = en && avld; -//-------------------- Core MVU --------------------\\ +//-------------------- Core MVU/VVU --------------------\\ uwire ovld; uwire [PE-1:0][ACCU_WIDTH-1:0] odat; typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - - case(MVU_IMPL_STYLE) - "mvu_8sx9_dsp58": - mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + uwire mvauin_t amvau_i; + + if (IS_MVU) begin : genMVUInput + assign amvau_i = amvau; + end : genMVUInput + else begin : genVVUInput + // The input stream will have the channels interleaved for VVU when PE>1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH; + for (genvar i=0; i 1) ? 
+ amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] + : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; + end : genRewire + end : genVVUInput + + case(COMPUTE_CORE) + "mvu_vvu_8sx9_dsp58": + mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) ); - "mvu_4sx4u": mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) ); - "mvu_8sx8u_dsp48": mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), + .vld(ovld), .p(odat) + ); + "mvu_vvu_lut": + mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( + .clk, .rst, .en, + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) ); default: initial begin - $error("Unrecognized MVU_IMPL_STYLE '%s'", MVU_IMPL_STYLE); + $error("Unrecognized COMPUTE_CORE 
'%s'", COMPUTE_CORE); $finish; end endcase @@ -203,7 +236,7 @@ module mvu_axi #( assign b_load = !B.vld || m_axis_output_tready; always_ff @(posedge clk) begin - if(rst) B <= '{ default: 'x }; + if(rst) B <= '{ vld: 0, default: 'x }; else begin if(b_load) B <= '{ vld: A.vld, dat: A.dat}; end @@ -212,4 +245,4 @@ module mvu_axi #( assign m_axis_output_tvalid = B.vld; assign m_axis_output_tdata = B.dat; -endmodule : mvu_axi +endmodule : mvu_vvu_axi From 32d6338c626b26d2e48cdb21cde438d557cc9bcd Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:34:36 +0100 Subject: [PATCH 157/235] [mvu vvu axi]: renamed file for consistency purposes --- finn-rtllib/mvu/mvu_vvu_axi.sv | 248 +++++++++++++++++++++++++++++++++ 1 file changed, 248 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_vvu_axi.sv diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv new file mode 100644 index 0000000000..07ad32e6c8 --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -0,0 +1,248 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper. + * @details + * The following compute cores are supported: + * - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, + * (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP, + * [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP, + * 'unconstrained' LUT-based MVU and VVU. + * Folding hints: + * - PE scaling should divide MH. + * - SIMD scaling should divide MW. + * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to + * impact critical paths more than PE scaling. PE scaling implies a + * bigger fanout on the input activations. 
+ * - Full unfolding along MH (PE=MH) results in no replay buffer instantiated + *****************************************************************************/ + +module mvu_vvu_axi #( + bit IS_MVU, // string type causes error in Vivado + parameter COMPUTE_CORE, + int unsigned MW, + int unsigned MH, + int unsigned PE, + int unsigned SIMD, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + int unsigned SEGMENTLEN = 0, + bit FORCE_BEHAVIORAL = 0, + bit M_REG_LUT = 1, + + // Safely deducible parameters + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, + localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned SF = MW/SIMD, + localparam int unsigned NF = MH/PE, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 +) +( + // Global Control + input logic ap_clk, + input logic ap_rst_n, + + // Weight Stream + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, + output logic s_axis_weights_tready, + + // Input Stream + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, + output logic s_axis_input_tready, + + // Output Stream + output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, + output logic m_axis_output_tvalid, + input logic m_axis_output_tready +); + +//-------------------- Parameter sanity checks --------------------\\ + initial begin + if (MW % SIMD != 0) begin + $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); + $finish; + end + if (MH % PE != 0) begin + $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); + $finish; + end + if (WEIGHT_WIDTH > 8) begin + 
$error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); + $finish; + end + if (ACTIVATION_WIDTH > 8) begin + if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin + $error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH); + $finish; + end + end + if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin + if (SEGMENTLEN == 0) begin + $warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + end + if (SEGMENTLEN > (SIMD+2)/3) begin + $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + $finish; + end + end + if (!IS_MVU) begin + if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin + $error("VVU only supported on DSP58 or LUT-based implementation"); + $finish; + end + end + end + + uwire clk = ap_clk; + uwire rst = !ap_rst_n; + + typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t; + + uwire mvauin_t amvau; + uwire alast; + uwire afin; + uwire avld; + uwire ardy; + + replay_buffer #(.LEN(SF), .REP(IS_MVU ? 
NF : 1), .W($bits(mvauin_t))) activation_replay ( + .clk, .rst, + .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), + .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) + ); + +//-------------------- Input control --------------------\\ + uwire en; + uwire istb = avld && s_axis_weights_tvalid; + assign ardy = en && s_axis_weights_tvalid; + assign s_axis_weights_tready = en && avld; + +//-------------------- Core MVU/VVU --------------------\\ + uwire ovld; + uwire [PE-1:0][ACCU_WIDTH-1:0] odat; + typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; + uwire mvauin_t amvau_i; + + if (IS_MVU) begin : genMVUInput + assign amvau_i = amvau; + end : genMVUInput + else begin : genVVUInput + // The input stream will have the channels interleaved for VVU when PE>1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH; + for (genvar i=0; i 1) ? 
+ amvau[(i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH +: ACTIVATION_WIDTH] + : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; + end : genRewire + end : genVVUInput + + case(COMPUTE_CORE) + "mvu_vvu_8sx9_dsp58": + mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk, .rst, .en, + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), + .vld(ovld), .p(odat) + ); + "mvu_4sx4u": + mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk, .rst, .en, + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), + .vld(ovld), .p(odat) + ); + "mvu_8sx8u_dsp48": + mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk, .rst, .en, + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), + .vld(ovld), .p(odat) + ); + "mvu_vvu_lut": + mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( + .clk, .rst, .en, + .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), + .vld(ovld), .p(odat) + ); + default: initial begin + $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); + $finish; + end + endcase + +//-------------------- Output register slice --------------------\\ + struct packed { + logic vld; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } A = '{ vld: 0, default: 'x}; + + assign en = !A.vld || !ovld; + + uwire b_load; + always_ff @(posedge clk) begin +
if(rst) A <= '{ vld: 0, default: 'x }; + else if(!A.vld || b_load) begin + A.vld <= ovld && en; + for(int unsigned i = 0; i < PE; i++) begin + // CR-1148862: + // A.dat[i] <= odat[i]; + automatic logic [ACCU_WIDTH-1:0] v = odat[i]; + A.dat[i] <= v[ACCU_WIDTH-1:0]; + end + end + end + + struct packed { + logic vld; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } B = '{ vld: 0, default: 'x}; + + assign b_load = !B.vld || m_axis_output_tready; + always_ff @(posedge clk) begin + if(rst) B <= '{ vld: 0, default: 'x }; + else begin + if(b_load) B <= '{ vld: A.vld, dat: A.dat}; + end + end + + assign m_axis_output_tvalid = B.vld; + assign m_axis_output_tdata = B.dat; + +endmodule : mvu_vvu_axi From 031406d73fa36a02638a94affd6a0bef36956c3c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:39:22 +0100 Subject: [PATCH 158/235] [mvu 8sx9]: added support for VVU on DSP58, resolved PyVerilator-caused error and added synthesis directive to handle 'X in input data --- finn-rtllib/mvu/mvu_8sx9.sv | 100 +++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 48 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv index 34aa856b1b..52a93739d6 100644 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_8sx9.sv @@ -31,7 +31,8 @@ * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP58. *****************************************************************************/ -module mvu_8sx9 #( +module mvu_vvu_8sx9 #( + parameter IS_MVU, int unsigned PE, int unsigned SIMD, int unsigned ACTIVATION_WIDTH, @@ -39,7 +40,9 @@ module mvu_8sx9 #( int unsigned ACCU_WIDTH, bit SIGNED_ACTIVATIONS = 0, int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) - bit FORCE_BEHAVIORAL = 0 + bit FORCE_BEHAVIORAL = 0, + + int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 
1 : PE) * SIMD ) ( // Global Control @@ -51,7 +54,7 @@ module mvu_8sx9 #( input logic last, input logic zero, // ignore current inputs and force this partial product to zero input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights - input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // activations + input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations // Ouput output logic vld, @@ -67,9 +70,10 @@ module mvu_8sx9 #( //-------------------- Declare global signals --------------------\\ localparam int unsigned CHAINLEN = (SIMD+2)/3; localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length - uwire [26:0] a_in_i [CHAINLEN]; + localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE; + uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN]; uwire [23:0] b_in_i [PE][CHAINLEN]; - uwire [57:0] pcout [PE][CHAINLEN]; + uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator //-------------------- Shift register for opmode select signal --------------------\\ localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) @@ -99,48 +103,48 @@ module mvu_8sx9 #( //-------------------- Buffer for input activations --------------------\\ localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; + for (genvar k=0; k1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3; - for (genvar i=0; i1 ? TOTAL_PREGS-1 : 0; - localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? 
SIMD - 3*i : 3; - - if (EXTERNAL_PREGS > 0) begin : genExternalPregAct - logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0}; - always_ff @(posedge clk) begin - if (rst) A <= '{default: 0}; - else if(en) begin - A[EXTERNAL_PREGS-1] <= -// synthesis translate_off - zero ? '1 : -// synthesis translate_on - a[3*i +: LANES_OCCUPIED]; - if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; + if (EXTERNAL_PREGS > 0) begin : genExternalPregAct + logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0}; + always_ff @(posedge clk) begin + if (rst) A <= '{default: 0}; + else if(en) begin + A[EXTERNAL_PREGS-1] <= + // synthesis translate_off + zero ? '1 : + // synthesis translate_on + a[SIMD*k + 3*i +: LANES_OCCUPIED]; + if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; + end end - end - for (genvar j=0; j Date: Thu, 21 Sep 2023 16:39:52 +0100 Subject: [PATCH 159/235] [mvu vvu 8sx9]: renamed compute core for consistency --- finn-rtllib/mvu/mvu_vvu_8sx9.sv | 427 ++++++++++++++++++++++++++++++++ 1 file changed, 427 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_vvu_8sx9.sv diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9.sv b/finn-rtllib/mvu/mvu_vvu_8sx9.sv new file mode 100644 index 0000000000..52a93739d6 --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_8sx9.sv @@ -0,0 +1,427 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP58. + *****************************************************************************/ + +module mvu_vvu_8sx9 #( + parameter IS_MVU, + int unsigned PE, + int unsigned SIMD, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) + bit FORCE_BEHAVIORAL = 0, + + int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 
1 : PE) * SIMD + ) + ( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights + input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations + + // Ouput + output logic vld, + output logic [PE-1:0][ACCU_WIDTH-1:0] p + ); + // for verilator always use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; + +//-------------------- Declare global signals --------------------\\ + localparam int unsigned CHAINLEN = (SIMD+2)/3; + localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length + localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE; + uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN]; + uwire [23:0] b_in_i [PE][CHAINLEN]; + uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator + +//-------------------- Shift register for opmode select signal --------------------\\ + localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) + logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). 
Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) + + always_ff @(posedge clk) begin + if(rst) L <= '{default: 0}; + else if(en) begin + L[1+MAX_PIPELINE_STAGES] <= last; + L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES]; + end + end + assign vld = L[0]; + +//-------------------- Shift register for ZERO flag --------------------\\ + logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) + + if (MAX_PIPELINE_STAGES > 1) begin : genZreg + always_ff @(posedge clk) begin + if (rst) Z <= '{default: 0}; + else if(en) begin + Z[0] <= zero; + if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3]; + end + end + end; + +//-------------------- Buffer for input activations --------------------\\ + localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; + for (genvar k=0; k1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3; + + if (EXTERNAL_PREGS > 0) begin : genExternalPregAct + logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0}; + always_ff @(posedge clk) begin + if (rst) A <= '{default: 0}; + else if(en) begin + A[EXTERNAL_PREGS-1] <= + // synthesis translate_off + zero ? '1 : + // synthesis translate_on + a[SIMD*k + 3*i +: LANES_OCCUPIED]; + if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; + end + end + for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3; + + if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight + logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0}; + always_ff @(posedge clk) begin + if (rst) B <= '{default: 0}; + else if (en) begin + B[i][EXTERNAL_PREGS-1] <= +// synthesis translate_off + zero ? 
'1 : +// synthesis translate_on + w[i][3*j +: LANES_OCCUPIED]; + if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; + end + end + for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin + assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] }; + end : genBin + for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero + assign b_in_i[i][j][8*k +: 8] = 8'b0; + end : genBinZero + end : genExternalPregWeight + else begin : genInpDSPWeight + for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin + assign b_in_i[i][j][8*k +: 8] = +// synthesis translate_off + zero ? '1 : +// synthesis translate_on + PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; + end : genBin + for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero + assign b_in_i[i][j][8*k +: 8] = 8'b0; + end : genBinZero + end : genInpDSPWeight + end : genWeightSIMD + end : genWeightPE + +//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\ + for (genvar i=0; i0 ? 2 : 1; // 1 : 0 + localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1; + localparam bit FIRST = j == 0; + localparam bit LAST = j == CHAINLEN-1; + uwire [57:0] pp; + + if (LAST) begin : genPOUT + assign p[i] = pp[ACCU_WIDTH-1:0]; + end + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. + if(BEHAVIORAL) begin : genBehav + // Stage #1: Input A/B + logic signed [33:0] Areg [INTERNAL_PREGS]; + always_ff @(posedge clk) begin + if (rst) Areg <= '{ default : 0}; + else if (en) begin + Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 
0 : CHAINLEN*i) + j] }; + if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0]; + end + end + logic signed [23:0] Breg [INTERNAL_PREGS]; + always_ff @(posedge clk) begin + if (rst) Breg <= '{ default : 0}; + else if (en) begin + Breg[0] <= b_in_i[i][j]; + if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0]; + end + end + + // Stage #2: Multiply-Accumulate + logic signed [57:0] Mreg; + logic InmodeZero = 0; + always_ff @(posedge clk) begin + if (rst) InmodeZero <= 0; + else if (en) InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero ); + end + always_ff @(posedge clk) begin + if (rst) Mreg <= 0; + else if (en) begin + automatic logic signed [57:0] m = 0; + for (int k = 0; k < 3; k++) begin + m = m + (InmodeZero ? 0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8])); + end + Mreg <= m; + end + end + + // Stage #3: Accumulate + logic signed [57:0] Preg; + logic Opmode = 0; + if (FIRST && !LAST) begin : genFirst + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg; + end + end + else assign Preg = Mreg; + end + else if (FIRST && LAST) begin : genSingle + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg; + end + end + else if (!FIRST && LAST) begin : genLast + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 
0 : Preg) + Mreg + pcout[i][j-1]; + end + end + else begin : genMid + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg + pcout[i][j-1]; + end + end + else assign Preg = Mreg + pcout[i][j-1]; + end + assign pp = Preg; + assign pcout[i][j] = Preg; + end : genBehav +`ifndef VERILATOR + else begin: genDSP + DSP58 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("A"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for + // legacy mode. + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND(58'h000000000000000), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). 
+ .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK + .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE + .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE + .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 + FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN + 2'b01, // Y : M + 2'b01 // X: M + }), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA + .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC + .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM + .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(0), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) + .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between 
B/BCIN and BCOUT (0-2) + .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(1), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(PREG), // Number of pipeline stages for P (0-1) + .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). + ) + DSP58_inst ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(pcout[i][j]), // 58-bit output: Cascade output + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN(FIRST ? 'x : pcout[i][j-1]), // 58-bit input: P cascade + // Control inputs: Control Inputs/Status Bits + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .CLK(clk), // 1-bit input: Clock + .INMODE({ + INTERNAL_PREGS==2 ? 1'b0 : 1'b1, + 2'b00, + TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, + INTERNAL_PREGS==2 ? 
1'b0 : 1'b1 + }), // 5-bit input: INMODE control + .NEGATE('0), // 3-bit input: Negates the input of the multiplier + .OPMODE({ + LAST ? {1'b0, L[1]} : 2'b00, + 7'b000_0000 + }), // 9-bit input: Operation mode + // Data inputs: Data Ports + .A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }), // 34-bit input: A data + .B(b_in_i[i][j]), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D('x), // 27-bit input: D data + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. + .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG + .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD('0), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(PREG && en), // 1-bit input: Clock enable for PREG + .RSTA(rst), // 1-bit input: Reset for AREG + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTB(rst), // 1-bit input: Reset for BREG + .RSTC('0), // 1-bit input: Reset for CREG + .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTD('0), // 1-bit input: Reset for DREG and ADREG + .RSTINMODE(rst), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(PREG && rst) // 1-bit input: Reset for PREG + ); + end : genDSP +`endif + end : genDSPChain + end : 
genDSPPE + +endmodule : mvu_vvu_8sx9 From adb58694be36bd0fa2e8558f760d1642f14a2a38 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:58:20 +0100 Subject: [PATCH 160/235] [axi wrapper]: changed parameter to localparam --- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index 788e49a71b..270fe7351f 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -46,9 +46,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, // Safely deducible parameters - parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - parameter INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, - parameter OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 + localparam WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + localparam INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 
1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, + localparam OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 )( // Global Control (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) From f54d438f78fe4ce78c84fdd7bcbc514048bd2fe0 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 16:59:32 +0100 Subject: [PATCH 161/235] [axi]: added support for LUT-based VVU --- finn-rtllib/mvu/mvu_vvu_axi.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 07ad32e6c8..ff677fc244 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -195,8 +195,8 @@ module mvu_vvu_axi #( .vld(ovld), .p(odat) ); "mvu_vvu_lut": - mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( + mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( .clk, .rst, .en, .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) From a4e2ac7146afeab4271344785f638c88cf78da73 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 17:00:07 +0100 Subject: [PATCH 162/235] [mvu vvu 8sx9]: minor change to list of generics --- finn-rtllib/mvu/mvu_vvu_8sx9.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9.sv b/finn-rtllib/mvu/mvu_vvu_8sx9.sv index 52a93739d6..2aa9d71b6c 100644 --- a/finn-rtllib/mvu/mvu_vvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_vvu_8sx9.sv @@ -32,7 +32,7 @@ *****************************************************************************/ module mvu_vvu_8sx9 #( - parameter IS_MVU, + bit IS_MVU, int unsigned PE, int unsigned 
SIMD, int unsigned ACTIVATION_WIDTH, @@ -42,7 +42,7 @@ module mvu_vvu_8sx9 #( int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) bit FORCE_BEHAVIORAL = 0, - int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD + localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD ) ( // Global Control From 40ad0b46c03b10b47ec4d72dd04a4ad96149fa89 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 17:00:51 +0100 Subject: [PATCH 163/235] [mvu lut]: added support for VVU --- finn-rtllib/mvu/mvu_lut.sv | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/finn-rtllib/mvu/mvu_lut.sv b/finn-rtllib/mvu/mvu_lut.sv index b100a589e8..c100910d75 100644 --- a/finn-rtllib/mvu/mvu_lut.sv +++ b/finn-rtllib/mvu/mvu_lut.sv @@ -1,13 +1,15 @@ -module mvu_lut #( - int unsigned PE, - int unsigned SIMD, +module mvu_vvu_lut #( + bit IS_MVU, + int unsigned PE, + int unsigned SIMD, int unsigned ACCU_WIDTH, int unsigned ACTIVATION_WIDTH, int unsigned WEIGHT_WIDTH, bit SIGNED_ACTIVATIONS, bit M_REG = 1, - localparam unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH + localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH, + localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD )( // Global Control input logic clk, @@ -17,8 +19,8 @@ module mvu_lut #( // Input input logic last, input logic zero, // ignore current inputs and force this partial product to zero - input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights - input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // (un)signed activations + input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights + input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // (un)signed activations // Ouput output logic vld, @@ -63,16 +65,16 @@ module mvu_lut #( always_ff @(posedge clk) begin if(rst) M[j] = '{ default : 0 }; else if (en) M[j] = zero ? 0 : - SIGNED_ACTIVATIONS ? 
$signed(a[j]) * $signed(w[i][j]) : - $signed({1'b0, a[j]}) * $signed(w[i][j]); - // (SIGNED_ACTIVATIONS ? $signed(a[j]) : a[j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication + SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) : + $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); + // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication end assign m1[j] = M[j]; end : genMreg else begin : genNoMreg assign m1[j] = zero ? 0 : - SIGNED_ACTIVATIONS ? $signed(a[j]) * $signed(w[i][j]) : - $signed({1'b0, a[j]}) * $signed(w[i][j]); + SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) : + $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); end : genNoMreg end : genSIMD @@ -99,4 +101,4 @@ module mvu_lut #( assign p[i] = P2[i]; end : genPE -endmodule : mvu_lut +endmodule : mvu_vvu_lut From 30fcb5b734f86d0032549a4efe29d96b13ee5451 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 17:01:10 +0100 Subject: [PATCH 164/235] [mvu vvu lut]: renamed file for consistency --- finn-rtllib/mvu/mvu_vvu_lut.sv | 104 +++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_vvu_lut.sv diff --git a/finn-rtllib/mvu/mvu_vvu_lut.sv b/finn-rtllib/mvu/mvu_vvu_lut.sv new file mode 100644 index 0000000000..c100910d75 --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_lut.sv @@ -0,0 +1,104 @@ +module mvu_vvu_lut #( + bit IS_MVU, + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + bit SIGNED_ACTIVATIONS, + bit M_REG = 1, + + localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH, + localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 
1 : PE) * SIMD +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights + input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // (un)signed activations + + // Ouput + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + uwire last_i; + generate if (M_REG) begin + logic [0:1] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if (en) L <= {last, L[0]}; + end + assign last_i = L[1]; + end + else begin + logic L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if (en) L <= last; + end + assign last_i = L; + end + endgenerate + + // For each PE generate + for (genvar i = 0; i < PE; i++) begin : genPE + // Stage #1: SIMD multipliers in parallel + uwire [MULT_WIDTH-1 : 0] m1 [SIMD]; + for (genvar j = 0; j < SIMD; j++) begin : genSIMD + if (M_REG) begin : genMreg + logic [MULT_WIDTH-1 : 0] M [SIMD]; + always_ff @(posedge clk) begin + if(rst) M[j] = '{ default : 0 }; + else if (en) M[j] = zero ? 0 : + SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) : + $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); + // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication + end + assign m1[j] = M[j]; + end : genMreg + else begin : genNoMreg + assign m1[j] = zero ? 0 : + SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 
0 : SIMD*i) + j]) * $signed(w[i][j]) : + $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); + end : genNoMreg + end : genSIMD + + // Stage #2: Adder tree to reduce SIMD products + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 }; + localparam int unsigned ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1)); + uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = $signed(m1[s]); + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1)); + uwire signed [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; + assign tree[n] = s; + end + + // Stage #3: Buffer output + logic [ACCU_WIDTH-1:0] P2 [PE]; + always_ff @(posedge clk) begin + if(rst) P2[i] = '{ default : 0}; + else if (en) P2[i] = (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]); + end + + assign vld = last_i; + assign p[i] = P2[i]; + end : genPE + +endmodule : mvu_vvu_lut From cb434386fa8bf6f63964dd889c8025c3e9616a6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Thu, 21 Sep 2023 15:58:34 +0100 Subject: [PATCH 165/235] Revert to proper address truncation without generation bit. 
--- finn-rtllib/mvu/replay_buffer.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv index d4342f705c..3e2766f63d 100644 --- a/finn-rtllib/mvu/replay_buffer.sv +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -144,8 +144,8 @@ module replay_buffer #( uwire wr = irdy && ivld; uwire rd = !OVld || ordy; always_ff @(posedge clk) begin - if(wr) Mem[WP[AWIDTH:0]] <= idat; - if(rd) ODat <= Mem[RP[AWIDTH:0]]; + if(wr) Mem[WP[AWIDTH-1:0]] <= idat; + if(rd) ODat <= Mem[RP[AWIDTH-1:0]]; end uwire vld = (RP != WP); From b4b69f3fa7caae4be9357abf596aff4a66561228 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 17:04:05 +0100 Subject: [PATCH 166/235] remove deletd/renamed files --- finn-rtllib/mvu/mvu_8sx9.sv | 427 ------------------------- finn-rtllib/mvu/mvu_8sx9_axi.sv | 179 ----------- finn-rtllib/mvu/mvu_8sx9_axi_tb.sv | 208 ------------ finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v | 93 ------ finn-rtllib/mvu/mvu_8sx9_tb.sv | 165 ---------- finn-rtllib/mvu/mvu_axi.sv | 248 -------------- finn-rtllib/mvu/mvu_axi_wrapper.v | 92 ------ finn-rtllib/mvu/mvu_lut.sv | 104 ------ finn-rtllib/mvu/tb/mvu_axi_tb.sv | 215 ------------- 9 files changed, 1731 deletions(-) delete mode 100644 finn-rtllib/mvu/mvu_8sx9.sv delete mode 100644 finn-rtllib/mvu/mvu_8sx9_axi.sv delete mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_tb.sv delete mode 100644 finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v delete mode 100644 finn-rtllib/mvu/mvu_8sx9_tb.sv delete mode 100644 finn-rtllib/mvu/mvu_axi.sv delete mode 100644 finn-rtllib/mvu/mvu_axi_wrapper.v delete mode 100644 finn-rtllib/mvu/mvu_lut.sv delete mode 100644 finn-rtllib/mvu/tb/mvu_axi_tb.sv diff --git a/finn-rtllib/mvu/mvu_8sx9.sv b/finn-rtllib/mvu/mvu_8sx9.sv deleted file mode 100644 index 52a93739d6..0000000000 --- a/finn-rtllib/mvu/mvu_8sx9.sv +++ /dev/null @@ -1,427 +0,0 @@ -/****************************************************************************** - * 
Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP58. 
- *****************************************************************************/ - -module mvu_vvu_8sx9 #( - parameter IS_MVU, - int unsigned PE, - int unsigned SIMD, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - int unsigned ACCU_WIDTH, - bit SIGNED_ACTIVATIONS = 0, - int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) - bit FORCE_BEHAVIORAL = 0, - - int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD - ) - ( - // Global Control - input logic clk, - input logic rst, - input logic en, - - // Input - input logic last, - input logic zero, // ignore current inputs and force this partial product to zero - input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights - input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations - - // Ouput - output logic vld, - output logic [PE-1:0][ACCU_WIDTH-1:0] p - ); - // for verilator always use behavioral code - localparam bit BEHAVIORAL = -`ifdef VERILATOR - 1 || -`endif - FORCE_BEHAVIORAL; - -//-------------------- Declare global signals --------------------\\ - localparam int unsigned CHAINLEN = (SIMD+2)/3; - localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length - localparam int unsigned PE_ACTIVATION = IS_MVU ? 
1 : PE; - uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN]; - uwire [23:0] b_in_i [PE][CHAINLEN]; - uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator - -//-------------------- Shift register for opmode select signal --------------------\\ - localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) - logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) - - always_ff @(posedge clk) begin - if(rst) L <= '{default: 0}; - else if(en) begin - L[1+MAX_PIPELINE_STAGES] <= last; - L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES]; - end - end - assign vld = L[0]; - -//-------------------- Shift register for ZERO flag --------------------\\ - logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) - - if (MAX_PIPELINE_STAGES > 1) begin : genZreg - always_ff @(posedge clk) begin - if (rst) Z <= '{default: 0}; - else if(en) begin - Z[0] <= zero; - if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3]; - end - end - end; - -//-------------------- Buffer for input activations --------------------\\ - localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; - for (genvar k=0; k1 ? TOTAL_PREGS-1 : 0; - localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? 
SIMD - 3*i : 3; - - if (EXTERNAL_PREGS > 0) begin : genExternalPregAct - logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0}; - always_ff @(posedge clk) begin - if (rst) A <= '{default: 0}; - else if(en) begin - A[EXTERNAL_PREGS-1] <= - // synthesis translate_off - zero ? '1 : - // synthesis translate_on - a[SIMD*k + 3*i +: LANES_OCCUPIED]; - if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; - end - end - for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0; - localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3; - - if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight - logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0}; - always_ff @(posedge clk) begin - if (rst) B <= '{default: 0}; - else if (en) begin - B[i][EXTERNAL_PREGS-1] <= -// synthesis translate_off - zero ? '1 : -// synthesis translate_on - w[i][3*j +: LANES_OCCUPIED]; - if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; - end - end - for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin - assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] }; - end : genBin - for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero - assign b_in_i[i][j][8*k +: 8] = 8'b0; - end : genBinZero - end : genExternalPregWeight - else begin : genInpDSPWeight - for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin - assign b_in_i[i][j][8*k +: 8] = -// synthesis translate_off - zero ? '1 : -// synthesis translate_on - PAD_BITS_WEIGHT == 0 ? 
w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; - end : genBin - for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero - assign b_in_i[i][j][8*k +: 8] = 8'b0; - end : genBinZero - end : genInpDSPWeight - end : genWeightSIMD - end : genWeightPE - -//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\ - for (genvar i=0; i0 ? 2 : 1; // 1 : 0 - localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1; - localparam bit FIRST = j == 0; - localparam bit LAST = j == CHAINLEN-1; - uwire [57:0] pp; - - if (LAST) begin : genPOUT - assign p[i] = pp[ACCU_WIDTH-1:0]; - end - - // Note: Since the product B * AD is computed, - // rst can be only applied to AD and zero only to B - // with the same effect as zeroing both. - if(BEHAVIORAL) begin : genBehav - // Stage #1: Input A/B - logic signed [33:0] Areg [INTERNAL_PREGS]; - always_ff @(posedge clk) begin - if (rst) Areg <= '{ default : 0}; - else if (en) begin - Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }; - if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0]; - end - end - logic signed [23:0] Breg [INTERNAL_PREGS]; - always_ff @(posedge clk) begin - if (rst) Breg <= '{ default : 0}; - else if (en) begin - Breg[0] <= b_in_i[i][j]; - if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0]; - end - end - - // Stage #2: Multiply-Accumulate - logic signed [57:0] Mreg; - logic InmodeZero = 0; - always_ff @(posedge clk) begin - if (rst) InmodeZero <= 0; - else if (en) InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero ); - end - always_ff @(posedge clk) begin - if (rst) Mreg <= 0; - else if (en) begin - automatic logic signed [57:0] m = 0; - for (int k = 0; k < 3; k++) begin - m = m + (InmodeZero ? 
0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8])); - end - Mreg <= m; - end - end - - // Stage #3: Accumulate - logic signed [57:0] Preg; - logic Opmode = 0; - if (FIRST && !LAST) begin : genFirst - if (PREG) begin : genPregBehav - always_ff @(posedge clk) begin - if (rst) Preg <= 0; - else if (en) Preg <= Mreg; - end - end - else assign Preg = Mreg; - end - else if (FIRST && LAST) begin : genSingle - always_ff @(posedge clk) begin - if (rst) Opmode <= 0; - else if (en) Opmode <= L[1]; - end - always_ff @(posedge clk) begin - if (rst) Preg <= 0; - else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg; - end - end - else if (!FIRST && LAST) begin : genLast - always_ff @(posedge clk) begin - if (rst) Opmode <= 0; - else if (en) Opmode <= L[1]; - end - always_ff @(posedge clk) begin - if (rst) Preg <= 0; - else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1]; - end - end - else begin : genMid - if (PREG) begin : genPregBehav - always_ff @(posedge clk) begin - if (rst) Preg <= 0; - else if (en) Preg <= Mreg + pcout[i][j-1]; - end - end - else assign Preg = Mreg + pcout[i][j-1]; - end - assign pp = Preg; - assign pcout[i][j] = Preg; - end : genBehav -`ifndef VERILATOR - else begin: genDSP - DSP58 #( - // Feature Control Attributes: Data Path Selection - .AMULTSEL("A"), // Selects A input to multiplier (A, AD) - .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) - .BMULTSEL("B"), // Selects B input to multiplier (AD, B) - .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) - .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for - // legacy mode. 
- .PREADDINSEL("A"), // Selects input to pre-adder (A, B) - .RND(58'h000000000000000), // Rounding Constant - .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) - .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) - .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) - .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) - // Pattern Detector Attributes: Pattern Detection Configuration - .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH - .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). - .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) - .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect - .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 - .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) - .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) - // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins - .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE - .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN - .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK - .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE - .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE - .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 - FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 
0 : PCIN - 2'b01, // Y : M - 2'b01 // X: M - }), // Optional inversion for OPMODE - .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN - .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE - .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA - .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB - .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A - .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC - .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD - .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE - .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM - .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP - // Register Control Attributes: Pipeline Register Configuration - .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) - .ADREG(0), // Pipeline stages for pre-adder (0-1) - .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) - .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) - .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between B/BCIN and BCOUT (0-2) - .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) - .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) - .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) - .CREG(0), // Pipeline stages for C (0-1) - .DREG(0), // Pipeline stages for D (0-1) - .INMODEREG(1), // Pipeline stages for INMODE (0-1) - .MREG(1), // Multiplier pipeline stages (0-1) - .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) - .PREG(PREG), // Number of pipeline stages for P (0-1) - .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). 
- ) - DSP58_inst ( - // Cascade outputs: Cascade Ports - .ACOUT(), // 34-bit output: A port cascade - .BCOUT(), // 24-bit output: B cascade - .CARRYCASCOUT(), // 1-bit output: Cascade carry - .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade - .PCOUT(pcout[i][j]), // 58-bit output: Cascade output - // Control outputs: Control Inputs/Status Bits - .OVERFLOW(), // 1-bit output: Overflow in add/acc - .PATTERNBDETECT(), // 1-bit output: Pattern bar detect - .PATTERNDETECT(), // 1-bit output: Pattern detect - .UNDERFLOW(), // 1-bit output: Underflow in add/acc - // Data outputs: Data Ports - .CARRYOUT(), // 4-bit output: Carry - .P(pp), // 58-bit output: Primary data - .XOROUT(), // 8-bit output: XOR data - // Cascade inputs: Cascade Ports - .ACIN('x), // 34-bit input: A cascade data - .BCIN('x), // 24-bit input: B cascade - .CARRYCASCIN('x), // 1-bit input: Cascade carry - .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade - .PCIN(FIRST ? 'x : pcout[i][j-1]), // 58-bit input: P cascade - // Control inputs: Control Inputs/Status Bits - .ALUMODE(4'h0), // 4-bit input: ALU control - .CARRYINSEL('0), // 3-bit input: Carry select - .CLK(clk), // 1-bit input: Clock - .INMODE({ - INTERNAL_PREGS==2 ? 1'b0 : 1'b1, - 2'b00, - TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, - INTERNAL_PREGS==2 ? 1'b0 : 1'b1 - }), // 5-bit input: INMODE control - .NEGATE('0), // 3-bit input: Negates the input of the multiplier - .OPMODE({ - LAST ? {1'b0, L[1]} : 2'b00, - 7'b000_0000 - }), // 9-bit input: Operation mode - // Data inputs: Data Ports - .A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }), // 34-bit input: A data - .B(b_in_i[i][j]), // 24-bit input: B data - .C('x), // 58-bit input: C data - .CARRYIN('0), // 1-bit input: Carry-in - .D('x), // 27-bit input: D data - // Reset/Clock Enable inputs: Reset/Clock Enable Inputs - .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. - .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG - .CEA2(INTERNAL_PREGS==2 ? 
en : '0), // 1-bit input: Clock enable for 2nd stage AREG - .CEAD('0), // 1-bit input: Clock enable for ADREG - .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE - .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG - .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG - .CEC('0), // 1-bit input: Clock enable for CREG - .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG - .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG - .CED('0), // 1-bit input: Clock enable for DREG - .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG - .CEM(en), // 1-bit input: Clock enable for MREG - .CEP(PREG && en), // 1-bit input: Clock enable for PREG - .RSTA(rst), // 1-bit input: Reset for AREG - .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG - .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG - .RSTB(rst), // 1-bit input: Reset for BREG - .RSTC('0), // 1-bit input: Reset for CREG - .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG - .RSTD('0), // 1-bit input: Reset for DREG and ADREG - .RSTINMODE(rst), // 1-bit input: Reset for INMODE register - .RSTM(rst), // 1-bit input: Reset for MREG - .RSTP(PREG && rst) // 1-bit input: Reset for PREG - ); - end : genDSP -`endif - end : genDSPChain - end : genDSPPE - -endmodule : mvu_vvu_8sx9 diff --git a/finn-rtllib/mvu/mvu_8sx9_axi.sv b/finn-rtllib/mvu/mvu_8sx9_axi.sv deleted file mode 100644 index 5f215927d8..0000000000 --- a/finn-rtllib/mvu/mvu_8sx9_axi.sv +++ /dev/null @@ -1,179 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Matrix Vector Unit (MVU) AXI-lite interface wrapper. 
- *****************************************************************************/ - -module mvu_8sx9_axi #( - int unsigned MW, - int unsigned MH, - int unsigned PE, - int unsigned SIMD, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - int unsigned ACCU_WIDTH, - bit SIGNED_ACTIVATIONS = 0, - int unsigned SEGMENTLEN = 0, - parameter RAM_STYLE = "auto", - - localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, - localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH = SIMD*ACTIVATION_WIDTH, - localparam int unsigned SF = MW/SIMD, - localparam int unsigned NF = MH/PE, - localparam int unsigned OUTPUT_LANES = PE, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 -) -( - // Global Control - input logic ap_clk, - input logic ap_rst_n, - - // Weight Stream - input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - input logic s_axis_weights_tvalid, - output logic s_axis_weights_tready, - - // Input Stream - input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input logic s_axis_input_tvalid, - output logic s_axis_input_tready, - - // Output Stream - output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, - output logic m_axis_output_tvalid, - input logic m_axis_output_tready -); - -//-------------------- Parameter sanity checks --------------------\\ - initial begin - if (MW % SIMD != 0) begin - $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); - $finish; - end - if (MH % PE != 0) begin - $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); - $finish; - end - if (ACTIVATION_WIDTH > 9) begin - $error("Activation width of %0d-bits exceeds maximum of 9-bits", ACTIVATION_WIDTH); - $finish; - end - if (WEIGHT_WIDTH > 8) begin - $error("Weight width of %0d-bits exceeds maximum of 8-bits", 
WEIGHT_WIDTH); - $finish; - end - if (SIGNED_ACTIVATIONS == 0 && ACTIVATION_WIDTH==9) begin - $error("Activation width of %0d-bits exceeds maximum of 8-bits for unsigned numbers", ACTIVATION_WIDTH); - $finish; - end - if (SEGMENTLEN == 0) begin - $warning("Segment length of %0d defaults to chain length", SEGMENTLEN); - end - if (SEGMENTLEN > (SIMD+2)/3) begin - $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); - $finish; - end - end - - uwire clk = ap_clk; - uwire rst = !ap_rst_n; - - typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t; - - uwire mvauin_t amvau; - uwire alast; - uwire afin; - uwire avld; - uwire ardy; - - replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t)), .RAM_STYLE(RAM_STYLE)) activation_replay ( - .clk, .rst, - .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), - .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) - ); - -//-------------------- Input control --------------------\\ - uwire en; - uwire istb = avld && s_axis_weights_tvalid; - assign ardy = en && s_axis_weights_tvalid; - assign s_axis_weights_tready = en && avld; - -//-------------------- Core MVU --------------------\\ - uwire ovld; - uwire [PE-1:0][57:0] odat; - typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - mvu_8sx9 #(.PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN)) core ( - .clk, .rst, .en, - .last(alast), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau), - .vld(ovld), .p(odat) - ); - -//-------------------- Output register slice --------------------\\ - struct { - logic vld; - logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } A = '{ vld: 0, default: 'x}; - - assign en = !A.vld || !ovld; - - uwire b_load; - always_ff @(posedge clk) begin - if(rst) A <= '{ vld: 0, default: 'x }; - else if(!A.vld || b_load) begin - A.vld <= ovld && en; - for(int unsigned i = 
0; i < PE; i++) begin - // CR-1148862: - // A.dat[i] <= odat[i]; - automatic logic [57:0] v = odat[i]; - A.dat[i] <= v[ACCU_WIDTH-1:0]; - end - end - end - - struct { - logic vld; - logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } B = '{ vld: 0, default: 'x}; - - assign b_load = !B.vld || m_axis_output_tready; - always_ff @(posedge clk) begin - if(rst) B <= '{ default: 'x }; - else begin - if(b_load) B <= '{ vld: A.vld, dat: A.dat}; - end - end - - assign m_axis_output_tvalid = B.vld; - assign m_axis_output_tdata = B.dat; - -endmodule \ No newline at end of file diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv b/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv deleted file mode 100644 index 70ffa096ef..0000000000 --- a/finn-rtllib/mvu/mvu_8sx9_axi_tb.sv +++ /dev/null @@ -1,208 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Testbench for MVU AXI-lite interface wrapper. - *****************************************************************************/ - -module mvu_8sx9_axi_tb(); - -//-------------------- Simulation parameters --------------------\\ - // Matrix & parallelism config - localparam int unsigned MW = 600; - localparam int unsigned MH = 256; - localparam int unsigned SIMD = 60; - localparam int unsigned PE = 16; - localparam int unsigned SEGMENTLEN = 4; - // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int unsigned WEIGHT_WIDTH = 4; - localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); - localparam bit SIGNED_ACTIVATIONS = 1; - // Simulation constants - localparam int unsigned NF = MH/PE; - localparam int unsigned SF = MW/SIMD; - localparam int unsigned NUM_OF_DSP = SIMD/3; - localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; - localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; - localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; - localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; - - // Generate clk and reset signal - logic clk = 0; - always #5ns clk = !clk; - - logic ap_rst_n = 0; - initial begin - repeat(16) @(posedge clk); - ap_rst_n <= 1; - end - - uwire 
ap_clk = clk; - - // Generate activations - typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[SF]; - - function activation_vector_t init_ACTIVATIONS; - automatic activation_vector_t res; - std::randomize(res); - return res; - endfunction : init_ACTIVATIONS - - activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); - - struct { - activation_t dat; - logic vld; - logic rdy; - } activations; - - initial begin - activations.vld = 0; - activations.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain - @(posedge clk iff ap_rst_n); - - for (int i=0; i 1; - @(posedge clk); - end while (!(activations.vld === 1 && activations.rdy === 1)); - end - - activations.vld <= 0; - activations.dat <= 'x; - end - - // Generate weights - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; - - function weight_matrix_t init_WEIGHTS; - automatic weight_matrix_t res; - std::randomize(res); - return res; - endfunction : init_WEIGHTS; - - weight_matrix_t WEIGHTS = init_WEIGHTS(); - - struct { - weight_t dat; - logic vld; - logic rdy; - } weights; - - initial begin - weights.vld = 0; - weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain - @(posedge clk iff ap_rst_n); - - weights.vld <= 1; - for (int i=0; i= 1; - @(posedge clk iff ap_rst_n); - end while (!(outputs.rdy === 1 && outputs.vld === 1)); - - // Compare produced outputs against golden outputs - foreach(outputs.dat[i]) begin - assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin - $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - $stop; - end - end - - NF_CNT += 1; - end - - $finish; - end - - // Instantiate DUT - mvu_8sx9_axi #( - .MW(MW), - .MH(MH), - .PE(PE), - .SIMD(SIMD), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN) - ) - dut ( - .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), - .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), - .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), - .m_axis_output_tready(outputs.rdy) - ); - -endmodule diff --git a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v b/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v deleted file mode 100644 index e15f77fbae..0000000000 --- a/finn-rtllib/mvu/mvu_8sx9_axi_wrapper.v +++ /dev/null @@ -1,93 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Verilog AXI-lite wrapper for MVU. - *****************************************************************************/ - -module $MODULE_NAME_AXI_WRAPPER$ #( - parameter MW = $MW$, - parameter MH = $MH$, - parameter PE = $PE$, - parameter SIMD = $SIMD$, - parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, - parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, - parameter ACCU_WIDTH = $ACCU_WIDTH$, - parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, - parameter SEGMENTLEN = $SEGMENTLEN$, - parameter RAM_STYLE = "$IBUF_RAM_STYLE$", - - // Safely deducible parameters - parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, - parameter OUTPUT_LANES = PE, - parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 -)( - // Global Control - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axis_weights:s_axis_input:m_axis_output, ASSOCIATED_RESET ap_rst_n" *) - (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) - input ap_clk, - (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) - input ap_rst_n, - - // Weight Stream - input [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - 
input s_axis_weights_tvalid, - output s_axis_weights_tready, - - // Input Stream - input [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input s_axis_input_tvalid, - output s_axis_input_tready, - - // Output Stream - output [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, - output m_axis_output_tvalid, - input m_axis_output_tready -); - -mvu_8sx9_axi #( - .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN), .RAM_STYLE(RAM_STYLE) - ) inst ( - .ap_clk(ap_clk), - .ap_rst_n(ap_rst_n), - .s_axis_weights_tdata(s_axis_weights_tdata), - .s_axis_weights_tvalid(s_axis_weights_tvalid), - .s_axis_weights_tready(s_axis_weights_tready), - .s_axis_input_tdata(s_axis_input_tdata), - .s_axis_input_tvalid(s_axis_input_tvalid), - .s_axis_input_tready(s_axis_input_tready), - .m_axis_output_tdata(m_axis_output_tdata), - .m_axis_output_tvalid(m_axis_output_tvalid), - .m_axis_output_tready(m_axis_output_tready) -); - -endmodule : $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/mvu/mvu_8sx9_tb.sv b/finn-rtllib/mvu/mvu_8sx9_tb.sv deleted file mode 100644 index adf6a8f9c2..0000000000 --- a/finn-rtllib/mvu/mvu_8sx9_tb.sv +++ /dev/null @@ -1,165 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Testbench for MVU core compute kernel. 
- *****************************************************************************/ - -module mvu_8sx9_tb(); - -//-------------------- Simulation parameters --------------------\\ - // Matrix & parallelism config - localparam int unsigned MH = 256; - localparam int unsigned PE = 16; - localparam int unsigned MW = 600; - localparam int unsigned SIMD = 60; - localparam int unsigned SEGMENTLEN = 4; - // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int unsigned WEIGHT_WIDTH = 4; - localparam bit SIGNED_ACTIVATIONS = 1; - // Simulation constants - localparam int unsigned NF = MH/PE; - localparam int unsigned SF = MW/SIMD; - localparam int unsigned NUM_OF_DSP = SIMD/3; - - typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[SF]; - - function activation_vector_t init_ACTIVATIONS; - automatic activation_vector_t res; - std::randomize(res); - return res; - endfunction : init_ACTIVATIONS - - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; - - function weight_matrix_t init_WEIGHTS; - automatic weight_matrix_t res; - std::randomize(res); - return res; - endfunction : init_WEIGHTS; - - typedef logic signed [PE-1:0][57:0] output_t; - typedef output_t output_vector_t [NF]; - - function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); - automatic output_vector_t res = '{default: 0}; - for (int j = 0; j 1) && !rst; - end - - // Compare computed output against golden output when vld flag is raised by DUT - always_ff @(posedge clk iff (vld && en)) begin - foreach(p[i]) begin - assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin - $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - $stop; - end - end - NF_CNT += 1; - end - - // Instantiate DUT - mvu_8sx9 #( - .PE(PE), - .SIMD(SIMD), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .SEGMENTLEN(SEGMENTLEN) - ) - dut ( - .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p - ); - -endmodule diff --git a/finn-rtllib/mvu/mvu_axi.sv b/finn-rtllib/mvu/mvu_axi.sv deleted file mode 100644 index 07ad32e6c8..0000000000 --- a/finn-rtllib/mvu/mvu_axi.sv +++ /dev/null @@ -1,248 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper. - * @details - * The following compute cores are supported: - * - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, - * (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP, - * [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP, - * 'unconstrained' LUT-based MVU and VVU. - * Folding hints: - * - PE scaling should divide MH. - * - SIMD scaling should divide MW. - * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to - * impact critical paths more than PE scaling. PE scaling implies a - * bigger fanout on the input activations. - * - Full unfolding along MH (PE=MH) results in no replay buffer instantiated - *****************************************************************************/ - -module mvu_vvu_axi #( - bit IS_MVU, // string type causes error in Vivado - parameter COMPUTE_CORE, - int unsigned MW, - int unsigned MH, - int unsigned PE, - int unsigned SIMD, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - int unsigned ACCU_WIDTH, - bit SIGNED_ACTIVATIONS = 0, - int unsigned SEGMENTLEN = 0, - bit FORCE_BEHAVIORAL = 0, - bit M_REG_LUT = 1, - - // Safely deducible parameters - localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, - localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, - localparam int unsigned SF = MW/SIMD, - localparam int unsigned NF = MH/PE, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 -) -( - // Global Control - input logic ap_clk, - input logic ap_rst_n, - - // Weight Stream - input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, - input logic s_axis_weights_tvalid, - output logic s_axis_weights_tready, - - // Input Stream - input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, - input logic s_axis_input_tvalid, - output logic s_axis_input_tready, - - // Output Stream - output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, - output logic m_axis_output_tvalid, - input logic m_axis_output_tready -); - -//-------------------- Parameter sanity checks --------------------\\ - initial begin - if (MW % SIMD != 0) begin - $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); - $finish; - end - if (MH % PE != 0) begin - $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); - $finish; - end - if (WEIGHT_WIDTH > 8) begin - $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); - $finish; - end - if (ACTIVATION_WIDTH > 8) begin - if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin - $error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH); - $finish; - end - end - if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin - if (SEGMENTLEN == 0) begin - $warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3); - end - if (SEGMENTLEN > (SIMD+2)/3) begin - $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); - $finish; - end - end - if (!IS_MVU) begin - if 
(COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin - $error("VVU only supported on DSP58 or LUT-based implementation"); - $finish; - end - end - end - - uwire clk = ap_clk; - uwire rst = !ap_rst_n; - - typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t; - - uwire mvauin_t amvau; - uwire alast; - uwire afin; - uwire avld; - uwire ardy; - - replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay ( - .clk, .rst, - .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), - .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) - ); - -//-------------------- Input control --------------------\\ - uwire en; - uwire istb = avld && s_axis_weights_tvalid; - assign ardy = en && s_axis_weights_tvalid; - assign s_axis_weights_tready = en && avld; - -//-------------------- Core MVU/VVU --------------------\\ - uwire ovld; - uwire [PE-1:0][ACCU_WIDTH-1:0] odat; - typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - uwire mvauin_t amvau_i; - - if (IS_MVU) begin : genMVUInput - assign amvau_i = amvau; - end : genMVUInput - else begin : genVVUInput - // The input stream will have the channels interleaved for VVU when PE>1 - // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] - // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: - // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to - // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) - localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH; - for (genvar i=0; i 1) ? 
- amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] - : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; - end : genRewire - end : genVVUInput - - case(COMPUTE_CORE) - "mvu_vvu_8sx9_dsp58": - mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) - ); - "mvu_4sx4u": - mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) - ); - "mvu_8sx8u_dsp48": - mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) - ); - "mvu_vvu_lut": - mvu_vvu_lut #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) - ); - default: initial begin - $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); - $finish; - end - endcase - -//-------------------- Output register slice --------------------\\ - struct packed { - logic vld; - logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } A = '{ vld: 0, default: 'x}; - - assign en = !A.vld || !ovld; - - uwire b_load; - always_ff @(posedge clk) begin - 
if(rst) A <= '{ vld: 0, default: 'x }; - else if(!A.vld || b_load) begin - A.vld <= ovld && en; - for(int unsigned i = 0; i < PE; i++) begin - // CR-1148862: - // A.dat[i] <= odat[i]; - automatic logic [ACCU_WIDTH-1:0] v = odat[i]; - A.dat[i] <= v[ACCU_WIDTH-1:0]; - end - end - end - - struct packed { - logic vld; - logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } B = '{ vld: 0, default: 'x}; - - assign b_load = !B.vld || m_axis_output_tready; - always_ff @(posedge clk) begin - if(rst) B <= '{ vld: 0, default: 'x }; - else begin - if(b_load) B <= '{ vld: A.vld, dat: A.dat}; - end - end - - assign m_axis_output_tvalid = B.vld; - assign m_axis_output_tdata = B.dat; - -endmodule : mvu_vvu_axi diff --git a/finn-rtllib/mvu/mvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_axi_wrapper.v deleted file mode 100644 index 239c5bbacd..0000000000 --- a/finn-rtllib/mvu/mvu_axi_wrapper.v +++ /dev/null @@ -1,92 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Verilog AXI-lite wrapper for MVU. - *****************************************************************************/ - -module $MODULE_NAME_AXI_WRAPPER$ #( - parameter MW = $MW$, - parameter MH = $MH$, - parameter PE = $PE$, - parameter SIMD = $SIMD$, - parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, - parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, - parameter ACCU_WIDTH = $ACCU_WIDTH$, - parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, - parameter SEGMENTLEN = $SEGMENTLEN$, - parameter MVU_IMPL_STYLE = "$MVU_IMPL_STYLE$", - parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, - - // Safely deducible parameters - parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - parameter INPUT_STREAM_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8 * 8, - parameter OUTPUT_LANES = PE, - parameter OUTPUT_STREAM_WIDTH_BA = (OUTPUT_LANES*ACCU_WIDTH + 7)/8 * 8 -)( - // Global Control - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) - (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) - input ap_clk, - (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) - input ap_rst_n, - - // Weight Stream - input 
[WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, - input weights_V_TVALID, - output weights_V_TREADY, - // Input Stream - input [INPUT_STREAM_WIDTH_BA-1:0] in0_V_TDATA, - input in0_V_TVALID, - output in0_V_TREADY, - // Output Stream - output [OUTPUT_STREAM_WIDTH_BA-1:0] out_V_TDATA, - output out_V_TVALID, - input out_V_TREADY -); - -mvu_axi #( - .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), .MVU_IMPL_STYLE(MVU_IMPL_STYLE) - ) inst ( - .ap_clk(ap_clk), - .ap_rst_n(ap_rst_n), - .s_axis_weights_tdata(weights_V_TDATA), - .s_axis_weights_tvalid(weights_V_TVALID), - .s_axis_weights_tready(weights_V_TREADY), - .s_axis_input_tdata(in0_V_TDATA), - .s_axis_input_tvalid(in0_V_TVALID), - .s_axis_input_tready(in0_V_TREADY), - .m_axis_output_tdata(out_V_TDATA), - .m_axis_output_tvalid(out_V_TVALID), - .m_axis_output_tready(out_V_TREADY) -); - -endmodule : $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/mvu/mvu_lut.sv b/finn-rtllib/mvu/mvu_lut.sv deleted file mode 100644 index c100910d75..0000000000 --- a/finn-rtllib/mvu/mvu_lut.sv +++ /dev/null @@ -1,104 +0,0 @@ -module mvu_vvu_lut #( - bit IS_MVU, - int unsigned PE, - int unsigned SIMD, - int unsigned ACCU_WIDTH, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - bit SIGNED_ACTIVATIONS, - bit M_REG = 1, - - localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH, - localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 
1 : PE) * SIMD -)( - // Global Control - input logic clk, - input logic rst, - input logic en, - - // Input - input logic last, - input logic zero, // ignore current inputs and force this partial product to zero - input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights - input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // (un)signed activations - - // Ouput - output logic vld, - output logic signed [PE-1:0][ACCU_WIDTH-1:0] p -); - - typedef int unsigned leave_load_t[2*SIMD-1]; - function leave_load_t init_leave_loads(); - automatic leave_load_t res; - for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; - for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; - return res; - endfunction : init_leave_loads - - // Pipeline for last indicator flag - uwire last_i; - generate if (M_REG) begin - logic [0:1] L = '0; - always_ff @(posedge clk) begin - if(rst) L <= '0; - else if (en) L <= {last, L[0]}; - end - assign last_i = L[1]; - end - else begin - logic L = '0; - always_ff @(posedge clk) begin - if(rst) L <= '0; - else if (en) L <= last; - end - assign last_i = L; - end - endgenerate - - // For each PE generate - for (genvar i = 0; i < PE; i++) begin : genPE - // Stage #1: SIMD multipliers in parallel - uwire [MULT_WIDTH-1 : 0] m1 [SIMD]; - for (genvar j = 0; j < SIMD; j++) begin : genSIMD - if (M_REG) begin : genMreg - logic [MULT_WIDTH-1 : 0] M [SIMD]; - always_ff @(posedge clk) begin - if(rst) M[j] = '{ default : 0 }; - else if (en) M[j] = zero ? 0 : - SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) : - $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); - // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication - end - assign m1[j] = M[j]; - end : genMreg - else begin : genNoMreg - assign m1[j] = zero ? 0 : - SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 
0 : SIMD*i) + j]) * $signed(w[i][j]) : - $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); - end : genNoMreg - end : genSIMD - - // Stage #2: Adder tree to reduce SIMD products - localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 }; - localparam int unsigned ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1)); - uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = $signed(m1[s]); - for(genvar n = 0; n < SIMD-1; n++) begin - // Sum truncated to actual maximum bit width at this node - localparam int unsigned NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1)); - uwire signed [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; - assign tree[n] = s; - end - - // Stage #3: Buffer output - logic [ACCU_WIDTH-1:0] P2 [PE]; - always_ff @(posedge clk) begin - if(rst) P2[i] = '{ default : 0}; - else if (en) P2[i] = (last_i ? 0 : $signed(P2[i])) + $signed(tree[0]); - end - - assign vld = last_i; - assign p[i] = P2[i]; - end : genPE - -endmodule : mvu_vvu_lut diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv deleted file mode 100644 index b89b58f55b..0000000000 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ /dev/null @@ -1,215 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Testbench for MVU AXI-lite interface wrapper. 
- *****************************************************************************/ - -module mvu_axi_tb(); - -//-------------------- Simulation parameters --------------------\\ - // Matrix & parallelism config - localparam int unsigned MW = 50; - localparam int unsigned MH = 8; - localparam int unsigned SIMD = 10; - localparam int unsigned PE = 2; - localparam int unsigned SEGMENTLEN = 2; - localparam string MVU_IMPL_STYLE = "mvu_8sx8u_dsp48"; - localparam bit FORCE_BEHAVIORAL = 1; - // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int unsigned WEIGHT_WIDTH = 8; - localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); - localparam bit SIGNED_ACTIVATIONS = 0; - // Simulation constants - localparam int unsigned NF = MH/PE; - localparam int unsigned SF = MW/SIMD; - localparam int unsigned NUM_OF_DSP = SIMD/3; - localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; - localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; - localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; - localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; - - // Generate clk and reset signal - logic clk = 0; - always #5ns clk = !clk; - - logic ap_rst_n = 0; - initial begin - repeat(16) @(posedge clk); - ap_rst_n <= 1; - end - - uwire ap_clk = clk; - - // Generate activations - typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[SF]; - - function activation_vector_t init_ACTIVATIONS; - automatic activation_vector_t res; - std::randomize(res); - return res; - endfunction : init_ACTIVATIONS - - activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); - - struct { - activation_t dat; - logic vld; - logic rdy; - } activations; - - initial begin - activations.vld = 0; - activations.dat = '1; // Since ('X 
AND 0) will result in 'X in simulation, which would be propagated through the DSP chain - @(posedge clk iff ap_rst_n); - - for (int i=0; i= 1; - @(posedge clk); - end while (!(activations.vld === 1 && activations.rdy === 1)); - end - - activations.vld <= 0; - activations.dat <= 'x; - end - - // Generate weights - typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; - typedef weight_t weight_matrix_t[NF][SF]; - - function weight_matrix_t init_WEIGHTS; - automatic weight_matrix_t res; - std::randomize(res); - return res; - endfunction : init_WEIGHTS; - - weight_matrix_t WEIGHTS = init_WEIGHTS(); - - struct { - weight_t dat; - logic vld; - logic rdy; - } weights; - - initial begin - weights.vld = 0; - weights.dat = '1; // Since ('X AND 0) will result in 'X in simulation, which would be propagated through the DSP chain - @(posedge clk iff ap_rst_n); - - weights.vld <= 1; - for (int i=0; i= 1; - @(posedge clk iff ap_rst_n); - end while (!(outputs.rdy === 1 && outputs.vld === 1)); - - // Compare produced outputs against golden outputs - foreach(outputs.dat[i]) begin - assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin - $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - $stop; - end - end - - NF_CNT += 1; - end - - $finish; - end - - // Instantiate DUT - mvu_axi #( - .MW(MW), - .MH(MH), - .PE(PE), - .SIMD(SIMD), - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), - .SEGMENTLEN(SEGMENTLEN), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), - .MVU_IMPL_STYLE(MVU_IMPL_STYLE) - ) - dut ( - .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), - .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), - .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), - .m_axis_output_tready(outputs.rdy) - ); - -endmodule : mvu_axi_tb From 14c5fa902820396e3489a244dc4d705fd1ebe532 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 17:12:47 +0100 Subject: [PATCH 167/235] [mvu vvu 8sx9]: renamed for consistency --- finn-rtllib/mvu/{mvu_vvu_8sx9.sv => mvu_vvu_8sx9_dsp58.sv} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename finn-rtllib/mvu/{mvu_vvu_8sx9.sv => mvu_vvu_8sx9_dsp58.sv} (99%) diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv similarity index 99% rename from finn-rtllib/mvu/mvu_vvu_8sx9.sv rename to finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv index 2aa9d71b6c..6ae117e3ab 100644 --- a/finn-rtllib/mvu/mvu_vvu_8sx9.sv +++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv @@ -31,7 +31,7 @@ * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP58. 
*****************************************************************************/ -module mvu_vvu_8sx9 #( +module mvu_vvu_8sx9_dsp58 #( bit IS_MVU, int unsigned PE, int unsigned SIMD, @@ -424,4 +424,4 @@ module mvu_vvu_8sx9 #( end : genDSPChain end : genDSPPE -endmodule : mvu_vvu_8sx9 +endmodule : mvu_vvu_8sx9_dsp58 From 3a3758826512fd3d5ed0bcdd23358d5fd5b724cd Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Sep 2023 17:13:25 +0100 Subject: [PATCH 168/235] [mvu vvu axi]: changes for renamed module --- finn-rtllib/mvu/mvu_vvu_axi.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index ff677fc244..416480da79 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -174,7 +174,7 @@ module mvu_vvu_axi #( case(COMPUTE_CORE) "mvu_vvu_8sx9_dsp58": - mvu_vvu_8sx9 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, From afe36baa134b947718db34d140c8d6500b91cb2a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 25 Sep 2023 13:44:17 +0100 Subject: [PATCH 169/235] [mvu vvu wrapper]: convert localparam to param --- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index 270fe7351f..9c65dbc06e 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -46,9 +46,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, // Safely deducible parameters - localparam WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - 
localparam INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, - localparam OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 + parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, + parameter OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 )( // Global Control (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) From e4f2f9e0e4f1cb0bae2bf7e439c57356b3670620 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 25 Sep 2023 13:45:48 +0100 Subject: [PATCH 170/235] [mvau-rtl custom-op]: bugfix to instantiate memstreamer, modified renamed files and axi wrapper template fill-out --- .../matrixvectoractivation_rtl.py | 92 ++++++++++--------- 1 file changed, 51 insertions(+), 41 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index 9f8130806b..c7fb855884 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -61,8 +61,7 @@ class MatrixVectorActivation_rtl(HLSCustomOp): - """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch - function.""" + """Class that corresponds to finn-rtl Matrix Vector Unit.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -73,8 +72,7 @@ def get_nodeattr_types(self): "SIMD": ("i", True, 0), "MW": ("i", True, 0), "MH": ("i", True, 0), - "resType": ("s", False, "lut", {"auto", "lut", "dsp"}), - "ActVal": ("i", False, 0), + "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}), # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), "weightDataType": ("s", True, ""), @@ -165,7 +163,6 @@ def verify_node(self): # verify that all necessary attributes exist # TODO collect automatically 
from get_nodeattr_types try: - self.get_nodeattr("code_gen_dir_cppsim") self.get_nodeattr("executable_path") self.get_nodeattr("resType") self.get_nodeattr("MW") @@ -199,7 +196,6 @@ def verify_node(self): return info_messages - # TODO: Add in replay_buffer estimation def uram_estimation(self): P = self.get_nodeattr("PE") Q = self.get_nodeattr("SIMD") @@ -213,7 +209,6 @@ def uram_estimation(self): mstyle = self.get_nodeattr("ram_style") if ( (mmode == "decoupled" and mstyle != "ultra") - or (mmode == "const" and self.calc_wmem() <= 128) or (mmode == "external") ): return 0 @@ -221,7 +216,6 @@ def uram_estimation(self): depth_multiplier = math.ceil(omega / 4096) return width_multiplier * depth_multiplier - # TODO: Add in replay_buffer estimation def bram_estimation(self): """Calculates resource estimation for BRAM based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -243,7 +237,6 @@ def bram_estimation(self): mstyle = self.get_nodeattr("ram_style") if ( (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) - or (mmode == "const" and self.calc_wmem() <= 128) or (mmode == "external") ): return 0 @@ -262,7 +255,6 @@ def bram_estimation(self): else: return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36)) - # TODO: Add in replay_buffer estimation def bram_efficiency_estimation(self): wdt = self.get_weight_datatype() W = wdt.bitwidth() @@ -275,7 +267,6 @@ def bram_efficiency_estimation(self): bram16_est_capacity = bram16_est * 36 * 512 return wbits / bram16_est_capacity - # TODO: Add in replay_buffer estimation def uram_efficiency_estimation(self): """Function for URAM efficiency estimation: actual parameter storage needed divided by the allocated URAM storage (from estimation)""" @@ -290,7 +281,7 @@ def uram_efficiency_estimation(self): uram_est_capacity = uram_est * 72 * 4096 return wbits / uram_est_capacity - # TODO: FIX: worst case estimates since segmentlen is not known at this point? 
+# TODO: fix lut estimations def lut_estimation(self): """Calculates resource estimations for LUTs based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -333,9 +324,13 @@ def lut_estimation(self): return int(c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2) - # TODO: FIX: worst case estimates since segmentlen is not known at this point? +# TODO: fix DSP estimations --> depends on fpga_part def dsp_estimation(self): # multiplication + # mvu_8sx9 (DSP58): ceil(SIMD/3) + # mvu_4sx4u (DSP48/DSP58): ceil(PE/4) + # mvu_8sx8u (DSP48): ceil(PE/2) + # mvu_lut: 0 P = self.get_nodeattr("PE") res_type = self.get_nodeattr("resType") Q = self.get_nodeattr("SIMD") @@ -349,18 +344,24 @@ def dsp_estimation(self): mult_dsp = 0 return int(mult_dsp) - # TODO: FIX: worst case estimates since segmentlen is not known at this point +# TODO: fix exp_cycles estimations --> depends on fpga_part and clk def get_exp_cycles(self): + # mvu_8sx9 (DSP58): + # 2 (replay_buffer) + ceil(chainlen/seglen) + 2 (MREG, PREG) + 2 (output reg slice) + # + MW/SIMD * MH/PE + # mvu_4sx4u (DSP48/DSP58) / mvu_8sx8u (DSP48): + # 3 (IN_REG, MREG, PREG) + 2 (replay_buffer) + 2 (output reg slice) + 1 (adder tree SIMD) + 1 (output lane) + # + MW/SIMD * MH/PE + # mvu_lut: + # 2 (replay_buffer) + 1 OR 2 (no MREG OR MREG) + 2 (output reg slice) + # + MW/SIMD * MH/PE pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") num_inp_vec = self.get_nodeattr("numInputVectors") mh = self.get_nodeattr("MH") mw = self.get_nodeattr("MW") # since mmv != 1 is not supported yet, we set mmv for now to 1 - mmv = 1 - # Actual exp_cycles is probably slightly larger (say 3 cycles - # (DSP A/B, M, P - reg) + additional pipeline buffer cycles. - # Most probably <10) + mmv = 1 exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv return int(exp_cycles) @@ -711,7 +712,7 @@ def execute_node(self, context, graph): else: raise Exception( """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( + has to be set to "rtlsim" """.format( mode ) ) @@ -795,11 +796,12 @@ def code_generation_ipi(self): os.path.join( code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" ), - rtllib_dir + "mvu_axi.sv", + rtllib_dir + "mvu_vvu_axi.sv", rtllib_dir + "replay_buffer.sv", rtllib_dir + "mvu_4sx4u.sv", - rtllib_dir + "mvu_8sx9.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", rtllib_dir + "mvu_8sx8u_dsp48.sv", + rtllib_dir + "mvu_vvu_lut.sv", ] for f in sourcefiles: cmd.append("add_files -norecurse %s" % (f)) @@ -813,7 +815,7 @@ def code_generation_ipi(self): ) # instantiate a streamer and connect it to the HLS IP - strm_vlnv = "amd.com:FINN:memstream:1.0" + strm_vlnv = "amd.com:finn:memstream:1.0" strm_inst = node_name + "_wstrm" cmd.append( "create_bd_cell -type ip -vlnv %s /%s/%s" @@ -890,11 +892,12 @@ def code_generation_ipi(self): os.path.join( code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" ), - rtllib_dir + "mvu_axi.sv", + rtllib_dir + "mvu_vvu_axi.sv", rtllib_dir + "replay_buffer.sv", rtllib_dir + "mvu_4sx4u.sv", - rtllib_dir + "mvu_8sx9.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", rtllib_dir + "mvu_8sx8u_dsp48.sv", + rtllib_dir + "mvu_vvu_lut.sv", ] for f in sourcefiles: cmd.append("add_files -norecurse %s" % (f)) @@ -959,27 +962,32 @@ def derive_characteristic_fxns(self, period): ] super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) - # TODO: characterize max_clk and implement this function in look-up style def _resolve_segment_len(self, clk): - # Insert pipeline registers in the DSP chain to meet target clock frequency - return 4 # default to 4 for now + # Insert pipeline registers in the DSP58 chain to meet target clock frequency + # 0.741 ns seems the worst-case delay through first DSP + # 0.605 ns seems to be (on average) delay for all subsequent DSPs + dsp_chain_len = np.floor((clk - 0.741) / 0.605) + return 
max(1, dsp_chain_len) def _resolve_impl_style(self, fpgapart): # Based on target device and activation/weight-width, choose the - # supported RTL module - act_width = self.get_input_datatype(0).bitwidth() - weight_width = self.get_input_datatype(1).bitwidth() - is_versal = ( - fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] - or fpgapart[0:5] == "xqrvc" - ) - if act_width == 4 and weight_width == 4: - return "mvu_4sx4u" + # supported RTL compute core + if self.get_nodeattr("resType") == "lut": + return "mvu_vvu_lut" else: - if is_versal: - return "mvu_8sx9_dsp58" + act_width = self.get_input_datatype(0).bitwidth() + weight_width = self.get_input_datatype(1).bitwidth() + is_versal = ( + fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] + or fpgapart[0:5] == "xqrvc" + ) + if act_width == 4 and weight_width == 4: + return "mvu_4sx4u" else: - return "mvu_8sx8u_dsp48" + if is_versal: + return "mvu_vvu_8sx9_dsp58" + else: + return "mvu_8sx8u_dsp48" def generate_hdl(self, model, fpgapart, clk): # Generate params as part of IP preparation @@ -1023,9 +1031,11 @@ def generate_hdl(self, model, fpgapart, clk): self.set_nodeattr("ip_path", code_gen_dir) def prepare_codegen_default(self, fpgapart, clk): - template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_axi_wrapper.v" + template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v" code_gen_dict = {} + code_gen_dict["$IS_MVU$"] = [str(1)] + code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)] code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))] code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))] code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] @@ -1039,7 +1049,7 @@ def prepare_codegen_default(self, fpgapart, clk): [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] ) code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] - code_gen_dict["$MVU_IMPL_STYLE$"] = [self._resolve_impl_style(fpgapart)] + 
code_gen_dict["$FORCE_BEHAVIORAL$"] = [str(0)] return template_path, code_gen_dict From b49b79a0a669caad9355e59e1ee877ca59b65d27 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 25 Sep 2023 13:47:50 +0100 Subject: [PATCH 171/235] [specialize to rtl]: fix to changed attribute name and added support for converting HLS-based VVU custom-ops to RTL-based custom-ops --- .../fpgadataflow/specialize_to_rtl_layers.py | 82 ++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py index 47ed5ce863..5061282695 100644 --- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py @@ -26,6 +26,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import numpy as np from qonnx.transformation.base import Transformation from qonnx.custom_op.registry import getCustomOp from qonnx.core.datatype import DataType @@ -60,7 +61,7 @@ def apply(self, model): for n in graph.node: node_ind += 1 if n.op_type == "MatrixVectorActivation": - preferred_in_rtl = getCustomOp(n).get_nodeattr("impl") == "rtl" and getCustomOp(n).get_nodeattr("resType") == "dsp" + preferred_in_rtl = getCustomOp(n).get_nodeattr("preferred_backend") == "rtl" supported_in_rtl = self._is_rtl_variant_compatible(n) if (preferred_in_rtl and supported_in_rtl): mvau_input = n.input[0] @@ -76,6 +77,7 @@ def apply(self, model): pe = getCustomOp(n).get_nodeattr("PE") mem_mode = getCustomOp(n).get_nodeattr("mem_mode") ram_style = getCustomOp(n).get_nodeattr("ram_style") + resType = getCustomOp(n).get_nodeattr("resType") runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights") new_node = helper.make_node( @@ -93,6 +95,7 @@ def apply(self, model): 
outputDataType=outputDataType, numInputVectors=numInputVectors, mem_mode=mem_mode, + resType=resType, name=n.name + "_rtl", ram_style=ram_style, runtime_writeable_weights=runtime_writeable_weights @@ -108,4 +111,81 @@ def apply(self, model): model = model.transform(InferDataTypes()) model = model.transform(GiveUniqueNodeNames()) + return (model, graph_modified) + +class InferRTLVectorVectorActivation(Transformation): + """Convert (HLS-based) VectorVectorActivation layers to specialized RTL layers if supported.""" + + def __init__(self): + super().__init__() + + def _is_rtl_variant_compatible(self, n): + no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 + act_width_in_range = (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8) or (DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0) + weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 + folding_supported = (getCustomOp(n).get_nodeattr("Channels") % getCustomOp(n).get_nodeattr("PE") == 0) and (np.prod(getCustomOp(n).get_nodeattr("Kernel")) % getCustomOp(n).get_nodeattr("SIMD") == 0) + + if (no_activation and act_width_in_range and weight_width_in_range and folding_supported): + return True + else: + return False + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "VectorVectorActivation": + preferred_in_rtl = getCustomOp(n).get_nodeattr("preferred_backend") == "rtl" + supported_in_rtl = self._is_rtl_variant_compatible(n) + if (preferred_in_rtl and supported_in_rtl): + vvau_input = n.input[0] + vvau_weight = n.input[1] + vvau_output = n.output[0] + inputDataType = getCustomOp(n).get_nodeattr("inputDataType") + weightDataType = getCustomOp(n).get_nodeattr("weightDataType") + outputDataType = getCustomOp(n).get_nodeattr("outputDataType") + pe = 
getCustomOp(n).get_nodeattr("PE") + simd = getCustomOp(n).get_nodeattr("SIMD") + dim = getCustomOp(n).get_nodeattr("Dim") + channels = getCustomOp(n).get_nodeattr("Channels") + kernel = getCustomOp(n).get_nodeattr("Kernel") + resType = getCustomOp(n).get_nodeattr("resType") + mem_mode = getCustomOp(n).get_nodeattr("mem_mode") + runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights") + ram_style = getCustomOp(n).get_nodeattr("ram_style") + resType = getCustomOp(n).get_nodeattr("resType") + + new_node = helper.make_node( + "VectorVectorActivation_rtl", + [vvau_input, vvau_weight], + [vvau_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + name=n.name + "_rtl", + PE=pe, + SIMD=simd, + Dim=dim, + Channels=channels, + Kernel=kernel, + resType=resType, + inputDataType=inputDataType, + weightDataType=weightDataType, + outputDataType=outputDataType, + mem_mode=mem_mode, + runtime_writeable_weights=runtime_writeable_weights, + ram_style=ram_style + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(n) + graph_modified=True + + if graph_modified: + model = model.transform(MinimizeAccumulatorWidth()) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(GiveUniqueNodeNames()) + return (model, graph_modified) \ No newline at end of file From 9bdba031df228a2afbe99b8ea2fb576b678bba86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Tue, 19 Sep 2023 15:27:28 +0100 Subject: [PATCH 172/235] Adding core for DSP48 backport. 
--- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 416c12c1cc..07c44cf89a 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -4,7 +4,9 @@ module mvu_8sx8u_dsp48 #( int unsigned ACCU_WIDTH, int unsigned ACTIVATION_WIDTH, int unsigned WEIGHT_WIDTH, - bit FORCE_BEHAVIORAL = 0, + + bit SIGNED_ACTIVATIONS = 0, + bit FORCE_BEHAVIORAL = 0, localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH )( @@ -16,8 +18,8 @@ module mvu_8sx8u_dsp48 #( // Input input logic last, input logic zero, // ignore current inputs and force this partial product to zero - input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights - input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // unsigned activations + input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] w, // signed weights + input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS) // Ouput output logic vld, @@ -47,7 +49,7 @@ module mvu_8sx8u_dsp48 #( assign vld = L[5]; // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism - localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets + localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets localparam int unsigned PIPE_COUNT = (PE+1)/2; for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes @@ -61,7 +63,7 @@ module mvu_8sx8u_dsp48 #( for(genvar s = 0; s < SIMD; s++) begin : genSIMD // Input Lane Assembly - uwire [23:0] bb = a[s]; + uwire [23:0] bb = { {(24-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; logic [33:0] aa; logic [26:0] dd; logic [ 1:0] xx; From 2cf1ef70306339b1409ed61d8e18eda243bf56ad Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 25 Sep 2023 
14:48:34 +0100 Subject: [PATCH 173/235] [mvu rtl core]: added support for signed activations for DSP48-based MVUs --- finn-rtllib/mvu/mvu_4sx4u.sv | 3 ++- finn-rtllib/mvu/mvu_vvu_axi.sv | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 88985312c9..706347d700 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -2,6 +2,7 @@ module mvu_4sx4u #( int unsigned PE, int unsigned SIMD, int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, bit FORCE_BEHAVIORAL = 0 )( // Global Control @@ -57,7 +58,7 @@ module mvu_4sx4u #( for(genvar s = 0; s < SIMD; s++) begin : genSIMD // Input Lane Assembly - uwire [23:0] bb = a[s]; + uwire [23:0] bb = { {(20){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] }; logic [33:0] aa; logic [26:0] dd; logic [ 1:0] xx[3:1]; diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 416480da79..da7e00cc55 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -182,14 +182,14 @@ module mvu_vvu_axi #( .vld(ovld), .p(odat) ); "mvu_4sx4u": - mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) ); "mvu_8sx8u_dsp48": mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) From ab8d4a8e075ac9b3ccf78d2a08907d5dcc116fdb Mon Sep 17 00:00:00 2001 
From: mmrahorovic Date: Mon, 25 Sep 2023 16:17:38 +0100 Subject: [PATCH 174/235] [rtl mvu custom-op]: add upper bound to SEGMENTLEN equal to number of DSP58s chained together --- src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index c7fb855884..d0a638475a 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -966,7 +966,9 @@ def _resolve_segment_len(self, clk): # Insert pipeline registers in the DSP58 chain to meet target clock frequency # 0.741 ns seems the worst-case delay through first DSP # 0.605 ns seems to be (on average) delay for all subsequent DSPs - dsp_chain_len = np.floor((clk - 0.741) / 0.605) + critical_path_dsps = np.floor((clk - 0.741) / 0.605) + max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3) + dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len return max(1, dsp_chain_len) def _resolve_impl_style(self, fpgapart): From 74eb42bc2266071ccbd5e6fcfadc5bdf835463d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Fri, 29 Sep 2023 15:24:28 +0100 Subject: [PATCH 175/235] Starting on pumped DSP compute. 
--- finn-rtllib/mvu/mvu_vvu_axi.sv | 260 ++++++++++++++++++++++++--------- 1 file changed, 194 insertions(+), 66 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index da7e00cc55..54a4c092d7 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -51,26 +51,27 @@ module mvu_vvu_axi #( int unsigned MH, int unsigned PE, int unsigned SIMD, + int unsigned SEGMENTLEN = 0, + int unsigned ACTIVATION_WIDTH, int unsigned WEIGHT_WIDTH, int unsigned ACCU_WIDTH, bit SIGNED_ACTIVATIONS = 0, - int unsigned SEGMENTLEN = 0, + + bit PUMPED_COMPUTE = 0, // requires an even SIMD % 2 == 0 bit FORCE_BEHAVIORAL = 0, bit M_REG_LUT = 1, // Safely deducible parameters - localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, - localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, - localparam int unsigned SF = MW/SIMD, - localparam int unsigned NF = MH/PE, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 -) -( + localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7)/8 * 8, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 +)( // Global Control input logic ap_clk, + input logic ap_clk2x, // only used when PUMPED_COMPUTE input logic ap_rst_n, // Weight Stream @@ -124,38 +125,45 @@ module mvu_vvu_axi #( $finish; end end + + //- Pumping Constraints --------- + if(PUMPED_COMPUTE) begin + if(SIMD % 2 != 0) begin + $error("Odd SIMD=%0d is incompatible with pumped compute.", SIMD); + $finish; + end + end end uwire clk = ap_clk; uwire rst = !ap_rst_n; - typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t; - - uwire mvauin_t amvau; + //- Replay to Accommodate Neuron Fold ----------------------------------- + typedef logic [(IS_MVU? 1:PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t; + uwire mvu_flatin_t amvau; uwire alast; uwire afin; uwire avld; uwire ardy; - replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay ( - .clk, .rst, - .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), - .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NF = MH/PE; + replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvu_flatin_t))) activation_replay ( + .clk, .rst, + .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)), + .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) ); -//-------------------- Input control --------------------\\ - uwire en; - uwire istb = avld && s_axis_weights_tvalid; - assign ardy = en && s_axis_weights_tvalid; - assign s_axis_weights_tready = en && avld; + //- Unflatten inputs into structured matrices --------------------------- + localparam int unsigned ACT_PE = IS_MVU? 
1 : PE; + typedef logic [PE -1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] mvu_w_t; + typedef logic [ACT_PE-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_a_t; -//-------------------- Core MVU/VVU --------------------\\ - uwire ovld; - uwire [PE-1:0][ACCU_WIDTH-1:0] odat; - typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - uwire mvauin_t amvau_i; + uwire mvu_w_t mvu_w = s_axis_weights_tdata; - if (IS_MVU) begin : genMVUInput + //- Conditional Activations Layout Adjustment for VVU + uwire mvu_a_t amvau_i; + if (IS_MVU || (PE == 1)) begin : genMVUInput assign amvau_i = amvau; end : genMVUInput else begin : genVVUInput @@ -163,49 +171,169 @@ module mvu_vvu_axi #( // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to - // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i) + for(genvar pe = 0; pe < (IS_MVU? 1:PE); pe++) begin + for(genvar simd = 0; simd < SIMD; simd++) begin + assign amvau_i[pe][simd] = amvau[]; + end + end + localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH; for (genvar i=0; i 1) ? 
- amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] - : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; + assign amvau_i[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH] = + amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH]; end : genRewire end : genVVUInput - case(COMPUTE_CORE) - "mvu_vvu_8sx9_dsp58": - mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), - .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) - ); - "mvu_4sx4u": - mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) - ); - "mvu_8sx8u_dsp48": - mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), - .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) - ); - "mvu_vvu_lut": - mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) - ); - default: initial begin - $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); - 
$finish; - end - endcase + //- Flow Control Bracket around Compute Core ---------------------------- + uwire en; + uwire istb = avld && s_axis_weights_tvalid; + assign ardy = en && s_axis_weights_tvalid; + assign s_axis_weights_tready = en && avld; + + //- Conditionally Pumped DSP Compute ------------------------------------ + typedef logic [PE-1:0][ACCU_WIDTH-1:0] dsp_p_t; + uwire ovld; + uwire dsp_p_t odat; + if(1) begin : blkDsp + localparam int unsigned DSP_SIMD = SIMD/(PUMPED_COMPUTE+1); + typedef logic [PE -1:0][DSP_SIMD-1:0][WEIGHT_WIDTH -1:0] dsp_w_t; + typedef logic [ACT_PE-1:0][DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0] dsp_a_t; + + uwire dsp_clk; + uwire dsp_en; + + uwire dsp_last; + uwire dsp_zero; + uwire dsp_w_t dsp_w; + uwire dsp_a_t dsp_a; + + uwire dsp_vld; + uwire dsp_p_t dsp_p; + + if(!PUMPED_COMPUTE) begin : genUnpumpedCompute + assign dsp_clk = clk; + assign dsp_en = en; + + assign dsp_last = alast && avld; + assign dsp_zero = !istb; + assign dsp_w = mvu_w; + assign dsp_a = amvau_i; + + assign ovld = dsp_vld; + assign odat = dsp_p; + end : genUnpumpedCompute + else begin : genPumpedCompute + assign dsp_clk = clk2x; + + // Identify second fast cycle before active slow clock edge + logic Active = 0; + always_ff @(posedge clk2x) Active <= clk; + + // The input for a slow cycle is split across two fast cycles along the SIMD dimension. + // - Both fast cycles are controlled by the same enable state. + // - A zero cycle is duplicated across both fast cycles. + // - The last flag must be restricted to the second fast cycle. 
+ logic En = 0; + logic Last[1:0] = '{ default: 1'b0 }; + logic Zero = 1; + dsp_w_t W[1:0] = '{ default: 'x }; + dsp_a_t A[1:0] = '{ default: 'x }; + always_ff @(posedge clk2x) begin + if(rst) begin + En <= 0; + Last <= '{ default: 1'b0 }; + Zero <= 1; + W <= '{ default: 'x }; + A <= '{ default: 'x }; + end + else begin + if(Active) begin + En <= en; + if(en) begin + Last <= '{ alast && avld, 1'b0 }; + Zero <= !istb; + for(int unsigned simd = 0; simd < SIMD; simd++) begin + for(int unsigned pe = 0; pe < PE; pe++) begin + W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= mvu_w[pe][simd]; + end + for(int unsigned pe = 0; pe < ACT_PE; pe++) begin + A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= amvau_i[pe][simd]; + end + end + end + end + else if(En) begin + Last <= '{ 'x, Last[1] }; + W <= '{ 'x, W[1] }; + A <= '{ 'x, A[1] }; + end + end + end + assign dsp_en = En; + + assign dsp_last = Last[0]; + assign dsp_zero = Zero; + assign dsp_w = W[0]; + assign dsp_a = A[0]; + + // Since no two consecutive last cycles will ever be asserted on the input, + // valid outputs will also always be spaced by, at least, one other cycle. + // We can always hold a captured output for two cycles to allow the slow + // clock to pick it up. 
+ logic Vld = 0; + dsp_p_t P = 'x; + always_ff @(posedge clk2x) begin + if(rst) begin + Vld <= 0; + P <= 'x; + end + else begin + if(dsp_vld) P <= dsp_p; + Vld <= dsp_vld || (Vld && !Active); + end + end + assign ovld = Vld; + assign odat = P; + + end : genPumpedCompute + + case(COMPUTE_CORE) + "mvu_vvu_8sx9_dsp58": + mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + "mvu_4sx4u": + mvu_4sx4u #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + "mvu_8sx8u_dsp48": + mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + "mvu_vvu_lut": + mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + default: initial begin + $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); + $finish; + end + endcase + + end : blkDsp //-------------------- Output register slice --------------------\\ struct packed { From d9e2fc610a45f4a2acd3970b1606e9389e65db2a Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Fri, 29 Sep 2023 15:36:52 +0100 Subject: [PATCH 176/235] Flag TODO. --- finn-rtllib/mvu/mvu_vvu_axi.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 54a4c092d7..78c9892b33 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -71,7 +71,7 @@ module mvu_vvu_axi #( )( // Global Control input logic ap_clk, - input logic ap_clk2x, // only used when PUMPED_COMPUTE + input logic ap_clk2x, // synchronous, double-speed clock; only used for PUMPED_COMPUTE input logic ap_rst_n, // Weight Stream @@ -174,7 +174,7 @@ module mvu_vvu_axi #( // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i) for(genvar pe = 0; pe < (IS_MVU? 1:PE); pe++) begin for(genvar simd = 0; simd < SIMD; simd++) begin - assign amvau_i[pe][simd] = amvau[]; + assign amvau_i[pe][simd] = amvau[]; // TODO: Do the right thing as below here. end end From 5a429fcbe14ca6177082fab472549407f47f97d6 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 13 Oct 2023 23:29:39 +0100 Subject: [PATCH 177/235] [mvu_vvu dsp58]: change weight input to 2D instead of 3D array --- finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv index 6ae117e3ab..53cf71fd5f 100644 --- a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv +++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv @@ -42,7 +42,8 @@ module mvu_vvu_8sx9_dsp58 #( int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) bit FORCE_BEHAVIORAL = 0, - localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD + localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 
1 : PE) * SIMD, + localparam int unsigned WEIGHT_ELEMENTS = PE*SIMD ) ( // Global Control @@ -53,7 +54,7 @@ module mvu_vvu_8sx9_dsp58 #( // Input input logic last, input logic zero, // ignore current inputs and force this partial product to zero - input logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // weights + input logic [WEIGHT_ELEMENTS-1:0][WEIGHT_WIDTH-1:0] w, // weights input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations // Ouput @@ -164,7 +165,8 @@ module mvu_vvu_8sx9_dsp58 #( // synthesis translate_off zero ? '1 : // synthesis translate_on - w[i][3*j +: LANES_OCCUPIED]; + //w[i][3*j +: LANES_OCCUPIED]; + w[SIMD*i+3*j +: LANES_OCCUPIED]; if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; end end @@ -181,7 +183,8 @@ module mvu_vvu_8sx9_dsp58 #( // synthesis translate_off zero ? '1 : // synthesis translate_on - PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; + //PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; + PAD_BITS_WEIGHT == 0 ? 
w[SIMD*i+3*j+k] : { {PAD_BITS_WEIGHT{w[SIMD*i+3*j+k][WEIGHT_WIDTH-1]}}, w[SIMD*i+3*j+k] }; end : genBin for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero assign b_in_i[i][j][8*k +: 8] = 8'b0; From a4a18bb08cef96bb52c02096d54b573b421bcd12 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 13 Oct 2023 23:30:55 +0100 Subject: [PATCH 178/235] [mvu_vvu axi]: re-wire weights appropriately for VVU DSP58 --- finn-rtllib/mvu/mvu_vvu_axi.sv | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index da7e00cc55..f0f75c633a 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -45,7 +45,7 @@ *****************************************************************************/ module mvu_vvu_axi #( - bit IS_MVU, // string type causes error in Vivado + bit IS_MVU, parameter COMPUTE_CORE, int unsigned MW, int unsigned MH, @@ -64,8 +64,8 @@ module mvu_vvu_axi #( localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, - localparam int unsigned SF = MW/SIMD, - localparam int unsigned NF = MH/PE, + localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE), + localparam int unsigned NF = IS_MVU ? 
MH/PE : 1, localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 ) ( @@ -91,11 +91,11 @@ module mvu_vvu_axi #( //-------------------- Parameter sanity checks --------------------\\ initial begin - if (MW % SIMD != 0) begin + if ((MW % SIMD != 0 && IS_MVU) || (MW % (SIMD*PE) != 0 && !IS_MVU)) begin $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); $finish; end - if (MH % PE != 0) begin + if (MH % PE != 0 && IS_MVU) begin $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); $finish; end @@ -137,7 +137,7 @@ module mvu_vvu_axi #( uwire avld; uwire ardy; - replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvauin_t))) activation_replay ( + replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay ( .clk, .rst, .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) @@ -154,9 +154,11 @@ module mvu_vvu_axi #( uwire [PE-1:0][ACCU_WIDTH-1:0] odat; typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; uwire mvauin_t amvau_i; + uwire mvauin_weight_t wmvau_i; if (IS_MVU) begin : genMVUInput assign amvau_i = amvau; + assign wmvau_i = s_axis_weights_tdata; end : genMVUInput else begin : genVVUInput // The input stream will have the channels interleaved for VVU when PE>1 @@ -164,11 +166,14 @@ module mvu_vvu_axi #( // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) - localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH; + localparam int num_of_elements = PE*SIMD; for (genvar i=0; i 1) ? 
amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; + assign wmvau_i[i*WEIGHT_WIDTH +: WEIGHT_WIDTH] = (PE > 1) ? + s_axis_weights_tdata[( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD + 1) * WEIGHT_WIDTH : ( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD ) * WEIGHT_WIDTH] + : s_axis_weights_tdata[i*WEIGHT_WIDTH +: WEIGHT_WIDTH]; end : genRewire end : genVVUInput @@ -178,7 +183,7 @@ module mvu_vvu_axi #( .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), + .last(alast && avld), .zero(!istb), .w(wmvau_i), .a(amvau_i), .vld(ovld), .p(odat) ); "mvu_4sx4u": From cc0737bcd00cdd6df6e3d4ff38215ac5d9eb42e6 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 13 Oct 2023 23:31:35 +0100 Subject: [PATCH 179/235] [mvu_vvu axi wrapper]: fix to IS_MVU parameter --- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index 9c65dbc06e..01deb23840 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -32,7 +32,7 @@ *****************************************************************************/ module $MODULE_NAME_AXI_WRAPPER$ #( - parameter IS_MVU = "$IS_MVU$", + parameter IS_MVU = $IS_MVU$, parameter COMPUTE_CORE = "$COMPUTE_CORE$", parameter MW = $MW$, parameter MH = $MH$, From c0eff0b819828a5e1d1ef80815f63be0042ce742 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 13 Oct 2023 23:32:47 +0100 Subject: [PATCH 180/235] [mvu_vvu tb]: WIP -- changes to self-checker and shape of input data --- finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv | 79 +++++++++++++++++----------- 1 file changed, 49 
insertions(+), 30 deletions(-) diff --git a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv index 82c2e8e7b0..b46fc588c9 100644 --- a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv @@ -35,23 +35,23 @@ module mvu_vvu_axi_tb(); //-------------------- Simulation parameters --------------------\\ // Matrix & parallelism config - localparam bit IS_MVU = 1; + localparam bit IS_MVU = 0; localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58"; - localparam int unsigned MW = 1500; - localparam int unsigned MH = 256; - localparam int unsigned SIMD = 60; - localparam int unsigned PE = 16; - localparam int unsigned SEGMENTLEN = 2.0; + localparam int unsigned MW = 36; + localparam int unsigned MH = 1; + localparam int unsigned SIMD = 3; + localparam int unsigned PE = 4; + localparam int unsigned SEGMENTLEN = 1.0; localparam bit FORCE_BEHAVIORAL = 1; localparam bit M_REG_LUT = 1; // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 4; - localparam int unsigned WEIGHT_WIDTH = 4; - localparam int unsigned ACCU_WIDTH = 21; // == ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW) - localparam bit SIGNED_ACTIVATIONS = 0; + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 6; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 1; // Simulation constants - localparam int unsigned NF = MH/PE; - localparam int unsigned SF = MW/SIMD; + localparam int unsigned NF = IS_MVU ? MH/PE : 1; + localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE); localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; localparam int unsigned ACTIVATION_WIDTH_BA = ((IS_MVU ? 1 : PE)*SIMD*ACTIVATION_WIDTH+7)/8*8; localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; @@ -72,7 +72,7 @@ module mvu_vvu_axi_tb(); // Generate activations typedef logic [(IS_MVU ? 
1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; - typedef activation_t activation_vector_t[(IS_MVU ? 1 : NF)*SF]; + typedef activation_t activation_vector_t[SF]; function activation_vector_t init_ACTIVATIONS; automatic activation_vector_t res; @@ -93,14 +93,12 @@ module mvu_vvu_axi_tb(); activations.dat = 'X; @(posedge clk iff ap_rst_n); - for (int j=0; j<(IS_MVU ? 1 : NF); j++) begin - for (int i=0; i= 0; - @(posedge clk); - end while (!(activations.vld === 1 && activations.rdy === 1)); - end + for (int i=0; i= 0; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); end activations.vld <= 0; @@ -143,7 +141,9 @@ module mvu_vvu_axi_tb(); end // Function to compute golden output - // a: [(IS_MVU?1:NF)*SF][SIMD-1:0][ACTIVATION_WIDTH-1:0] + // a: [SF][(IS_MVU ? 1 : PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] + // a: [SF][SIMD-1:0][ACTIVATION_WIDTH-1:0] + // a: [SF][PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] // w: [NF][SF][PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t; typedef output_t output_vector_t [NF]; @@ -156,14 +156,33 @@ module mvu_vvu_axi_tb(); function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); automatic output_vector_t res = '{default: 0}; - for (int j = 0; j 1 ? $signed(a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]) : $signed(a[j/PE*SF+i/SIMD][i%SIMD]) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]); - else - res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : - $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[j/PE*SF+i/SIMD][(i*PE + j%PE) % (SIMD*PE)]}) : $signed({1'b0, a[j/PE+SF+i/SIMD][i%SIMD]}) ) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]); + // for (int j = 0; j 1 ? $signed(a[i/SIMD/PE][i % (SIMD*PE)]) : $signed(a[i/SIMD/PE][(i)%(SIMD*PE)]) ) * $signed(w[0][i/SIMD/PE][i/PE][i%SIMD]); + // else + // res[j/PE][j%PE] = IS_MVU ? 
$signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : + // $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[i/SIMD/PE][i % (SIMD*PE)]}) : $signed({1'b0, a[i/SIMD/PE][i%(SIMD*PE)]}) ) * $signed(w[0][i/SIMD][0][i%SIMD]); + // end + // end + // The input stream will have the channels interleaved for VVU when PE>1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + for (int i = 0; i < NF; i++) begin + for (int j = 0; j < SF; j++) begin + for (int k = 0; k < PE; k++) begin + for (int l = 0; l < SIMD; l++) begin + if (SIGNED_ACTIVATIONS) + res[i][k] = IS_MVU ? $signed(res[i][k]) + $signed(a[j][l]) * $signed(w[i][j][k][l]) : + $signed(res[i][k]) + $signed(a[j][k + l*PE]) * $signed(w[i][j][k][l]); + else + res[i][k] = IS_MVU ? 
$signed(res[i][k]) + $signed({1'b0, a[j][l]}) * $signed(w[i][j][k][l]) : + $signed(res[i][k]) + $signed({1'b0, a[j][k + l*PE]}) * $signed(w[i][j][k][l]); + end + end end end return res; From 4591bb87baf83e1d5fdb08dbd4a79b866c6076b3 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 13 Oct 2023 23:35:54 +0100 Subject: [PATCH 181/235] [vvu_hls]: add flag to specify preferred backend --- src/finn/custom_op/fpgadataflow/vectorvectoractivation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index bd5bb75f1d..7ddf234544 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -100,6 +100,8 @@ def get_nodeattr_types(self): # use xnor-popcount for binary weights/inputs, thus treating them # as bipolar "binaryXnorMode": ("i", False, 0, {0, 1}), + # Flag to specify whether RTL-based or HLS-based implementation is preferred + "preferred_backend": ("s", False, "rtl", {"hls", "rtl"}) } my_attrs.update(super().get_nodeattr_types()) return my_attrs From ef1cbbe75f05efe72604de05f0e56d2758feecfb Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 13 Oct 2023 23:36:26 +0100 Subject: [PATCH 182/235] [vvu rtl]: added new custom-op VVU_RTL --- .../vectorvectoractivation_rtl.py | 1225 +++++++++++++++++ 1 file changed, 1225 insertions(+) create mode 100644 src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py new file mode 100644 index 0000000000..72976bc9a8 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py @@ -0,0 +1,1225 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import math +import numpy as np +import os +import textwrap +import warnings +from qonnx.core.datatype import DataType +from qonnx.util.basic import ( + calculate_matvec_accumulator_range, + interleave_matrix_outer_dim_from_partitions, + roundup_to_integer_multiple, +) + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.data_packing import ( + npy_to_rtlsim_input, + numpy_to_hls_code, + pack_innermost_dim_as_hex_string, + rtlsim_output_to_npy, +) +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +class VectorVectorActivation_rtl(HLSCustomOp): + """Class that corresponds to finn-rtl Vector Vector Unit.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + "PE": ("i", True, 0), + "SIMD": ("i", False, 1), + "Dim": ("ints", True, []), # [H, W] + "Channels": ("i", True, 0), + "Kernel": ("ints", True, []), # [H, W] + "resType": ("s", False, "auto", {"auto", "lut", "dsp"}), + # FINN DataTypes for inputs, weights, outputs + "inputDataType": ("s", True, ""), + "weightDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + # FINN DataType for accumulator -- auto-computed and updated + "accDataType": ("s", False, "INT32"), + # memory mode for the layer weights + # const -- embedded weights, default, long compile/synth times + # decoupled -- streaming weights with weight streamer packaged inside IP + # external -- streaming weights with external streamer + "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}), + # (mem_mode = decoupled only) whether weights will be writable through + # an AXI-lite interface during runtime + # 1 for enabled, 0 for disabled. 
+ # see finn-rtllib/memstream/doc/README for more about the memory + # address map used for writable weights + # IMPORTANT: After using AXI lite to either read or write the weights, + # always "flush" the accelerator by first passing a dummy input + # vector through the accelerator. This will get rid of any old + # weight data from the weight FIFOs. + "runtime_writeable_weights": ("i", False, 0, {0, 1}), + # FPGA resource type for memories in decoupled mode + # auto -- let Vivado decide + # block -- use BRAM + # distributed -- use LUTRAM + # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1 + # see also https://www.xilinx.com/support/answers/38070.html + "ram_style": ( + "s", + False, + "auto", + {"auto", "block", "distributed", "ultra"}, + ), + # attribute to save top module name - not user configurable + "gen_top_module": ("s", False, "") + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def minimize_accumulator_width(self, model): + """Minimize the accumulator bit width according to the weight values, + input data types, and size of dot product""" + weights = model.get_initializer(self.onnx_node.input[1]) + k_h, k_w = self.get_nodeattr("Kernel") + fm = self.get_nodeattr("Channels") + # put weights into the shape expected by calculate_matvec_accumulator_range + weights = weights.reshape(fm, k_h * k_w).transpose() + if len(self.onnx_node.input) > 2: + thresholds = model.get_initializer(self.onnx_node.input[2]) + else: + thresholds = None + idt = self.get_input_datatype() + + (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt) + # if runtime-writeable weights, then the values of the weights can + # change and we need to use the worst-case values from the datatypes + if self.get_nodeattr("runtime_writeable_weights"): + wdt = self.get_weight_datatype() + lower_worst = wdt.min() * np.ones_like(weights) + lower_range = calculate_matvec_accumulator_range(lower_worst, idt) + upper_worst = wdt.max() * 
np.ones_like(weights) + upper_range = calculate_matvec_accumulator_range(upper_worst, idt) + acc_min = min(min(lower_range), min(upper_range)) + acc_max = max(max(upper_range), max(upper_range)) + + # if the acc_range is always greater than 0, then acc_max <= 2^P - 1 + if acc_min >= 0: + acc_bit_width = np.log2(acc_max + 1) + acc_bit_width = math.ceil(acc_bit_width) + adt = DataType[f"UINT{acc_bit_width}"] + # if the acc_range is signed, then acc_min >= -2^{P-1} and acc_max <= + # 2^{P - 1} - 1, which means 2^{P - 1} >= max(-acc_min, 1 + acc_max) + else: + _acc_max = max(-acc_min, 1 + acc_max) + acc_bit_width = np.log2(_acc_max) + 1 + acc_bit_width = math.ceil(acc_bit_width) + adt = DataType[f"INT{acc_bit_width}"] + + # if this is the last node in the graph, then ensure the datatype is + # divisibly by 8 bits + if model.find_direct_successors(self.onnx_node) is None: + bw = roundup_to_integer_multiple(adt.bitwidth(), 8) + new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw)) + adt = DataType[new_adt_name] + # for no-activation nodes, output dt = acc dt + self.set_nodeattr("outputDataType", adt.name) + self.set_nodeattr("accDataType", adt.name) + + return DataType[self.get_nodeattr("accDataType")] + + def minimize_weight_bit_width(self, model): + """Minimize the bit width based on the values of the weights""" + if not self.get_nodeattr("runtime_writeable_weights"): + weights = model.get_initializer(self.onnx_node.input[1]) + w_min = weights.min() + w_max = weights.max() + if w_min < 0: + if abs(w_min) > w_max: + wdt = DataType.get_smallest_possible(w_min) + else: + wdt = DataType.get_smallest_possible(-w_max - 1) + else: + wdt = DataType.get_smallest_possible(w_max) + self.set_nodeattr("weightDataType", wdt.name) + return DataType[self.get_nodeattr("weightDataType")] + + def calc_wmem(self): + """Calculates and returns WMEM.""" + ch = self.get_nodeattr("Channels") + k_h, k_w = self.get_nodeattr("Kernel") + pe = self.get_nodeattr("PE") + simd = 
self.get_nodeattr("SIMD") + wmem = (k_h * k_w * ch // pe) // simd + return wmem + + def calc_tmem(self): + """Calculates and returns TMEM.""" + return 0 + + def make_shape_compatible_op(self, model): + oshape = self.get_normal_output_shape() + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + # set output datatype from property + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + if ind == 0: + return DataType[self.get_nodeattr("inputDataType")] + elif ind == 1: + return DataType[self.get_nodeattr("weightDataType")] + else: + raise Exception("Undefined input ind for this layer type") + + def get_weight_datatype(self): + """Returns FINN DataType of weights.""" + return DataType[self.get_nodeattr("weightDataType")] + + def get_accumulator_datatype(self): + """Returns FINN DataType of accumulator""" + return DataType[self.get_nodeattr("accDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_instream_width(self, ind=0): + i_bits = self.get_input_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + pe = self.get_nodeattr("PE") + in_width = i_bits * simd * pe + return in_width + + def get_outstream_width(self, ind=0): + o_bits = self.get_output_datatype().bitwidth() + out_width = o_bits * self.get_nodeattr("PE") + return out_width + + def get_folded_input_shape(self, ind=0): + k_h, k_w = self.get_nodeattr("Kernel") + dim_h, dim_w = self.get_nodeattr("Dim") + ch = 
self.get_nodeattr("Channels") + simd = self.get_nodeattr("SIMD") + pe = self.get_nodeattr("PE") + kernel_2 = k_h * k_w + assert kernel_2 % simd == 0, "Requirement kernel (k_h * k_w) divisable by SIMD is violated." + sf = kernel_2 // simd + assert ch % pe == 0, "Requirement Channels divisable by PE is violated." + nf = ch // pe + + if ind == 0: + # calculate shape of input 0 + folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, simd * pe]) + elif ind == 1 and self.get_nodeattr("mem_mode") == "external": + # calculate shape of input 1 (weights) + folded_input_shape = tuple([1, sf * nf, pe]) + else: + raise Exception("Undefined input shape for requested input") + + return folded_input_shape + + def get_folded_output_shape(self, ind=0): + ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + nf = ch // pe + dim_h, dim_w = self.get_nodeattr("Dim") + folded_output_shape = tuple([1, dim_h, dim_w, nf, pe]) + return folded_output_shape + + def get_normal_input_shape(self, ind=0): + dim_h, dim_w = self.get_nodeattr("Dim") + ch = self.get_nodeattr("Channels") + k_h, k_w = self.get_nodeattr("Kernel") + normal_input_shape = tuple([1, dim_h, dim_w, k_h * k_w * ch]) + return normal_input_shape + + def get_normal_output_shape(self, ind=0): + ch = self.get_nodeattr("Channels") + dim_h, dim_w = self.get_nodeattr("Dim") + normal_output_shape = tuple([1, dim_h, dim_w, ch]) + return normal_output_shape + + def get_number_output_values(self): + nf = np.prod(self.get_folded_output_shape()[:-1]) + return nf + +# TODO: fix exp_cycles estimations --> depends on fpga_part and clk + def get_exp_cycles(self): + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + ch = self.get_nodeattr("Channels") + dim_h, dim_w = self.get_nodeattr("Dim") + k_h, k_w = self.get_nodeattr("Kernel") + # currently FINN supports for vvau a batch size of 1 + batch_size = 1 + # since mmv != 1 is not supported yet, we set mmv for now to 1 + mmv = 1 + exp_cycles = ((ch * k_h * k_w) / pe / 
simd) * batch_size * (dim_h * dim_w) / mmv + return int(exp_cycles) + + def get_template_param_values(self): + """Returns the template parameter values according to input, output and weight + data types.""" + ret = dict() + inp_hls_str = self.get_input_datatype().get_hls_datatype_str() + out_hls_str = self.get_output_datatype().get_hls_datatype_str() + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + # out_is_binary = self.get_output_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): + raise Exception("True binary (non-bipolar) inputs not yet supported") + inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] + wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) + # fill in TSrcI and TWeightI + # TODO check these with Giulio + # TODO handle non-bipolar binary inputs + if inp_is_bipolar and wt_is_bipolar: + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and wt_is_bipolar: + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Recast" + elif inp_is_bipolar and (not wt_is_bipolar): + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and (not wt_is_bipolar): + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Identity" + + # fill in TDstI + ret["TDstI"] = "Slice<%s>" % out_hls_str + + return ret + + def get_hls_compatible_weight_tensor(self, orig_weight_matrix): + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + ch = self.get_nodeattr("Channels") + k_h, k_w = self.get_nodeattr("Kernel") + 
wmem = self.calc_wmem() + assert orig_weight_matrix.shape == ( + ch, + 1, + k_h, + k_w, + ), """Weights matrix doesn't + have expected shape (channels, 1, kernel_size, kernel_size)""" + ret = orig_weight_matrix + if self.get_weight_datatype() == DataType["BIPOLAR"]: + # convert bipolar to binary + ret = (ret + 1) / 2 + ret = ret.reshape(ch, k_h * k_w) + # distribute rows between PEs + ret = interleave_matrix_outer_dim_from_partitions(ret, pe) + ret = ret.reshape(1, pe, wmem, simd) + return ret + + def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure MH % PE == 0 + * for bipolar weights&inputs, ensure thresholds are positive + * interleave rows between PEs + * reshape into (PE, TMEM, n_thres_steps) and return + """ + ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + tmem = self.calc_tmem() + assert ch % pe == 0, "Requirement Channels divisable by PE is violated." 
+ assert ( + orig_thres_matrix.ndim == 2 + ), """Threshold matrix dimension is + not as expected (2).""" + n_thres_steps = orig_thres_matrix.shape[1] + inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) + if inp_is_bipolar and wt_is_bipolar: + # ensure all thresholds are nonnegative + assert (orig_thres_matrix >= 0).all() + # ensure all thresholds are integer + assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all() + ret = orig_thres_matrix + # ensure channels = mh , duplicating if necessary + if ret.shape[0] == 1: + ret = np.tile(ret, (ch, 1)) + assert ret.shape[0] == ch, "Channels of threshold matrix are not as expected (ch)" + # distribute rows between PEs + ret = interleave_matrix_outer_dim_from_partitions(ret, pe) + assert ( + ret.shape[0] == pe + ), """First dimension after distribution of the + rows between PEs is not as expected (pe)""" + assert ( + ret.shape[1] == tmem + ), """Second dimension after distribution of the + rows between PEs is not as expected (tmem)""" + assert ( + ret.shape[2] == n_thres_steps + ), """Third dimension after distribution of the + rows between PEs is not as expected (n_thres_steps)""" + return ret.reshape(1, pe, tmem, n_thres_steps) + + def make_weight_file(self, weights, weight_file_mode, weight_file_name): + """Produce a file containing given weights in appropriate format for this + layer. This file can be used for either synthesis or run-time reconfig + of weights. 
+ + Arguments: + + * weights : numpy array with weights to be put into the file + * weight_file_mode : one of {hls_header, decoupled_verilog_dat, + decoupled_runtime} + * weight_file_name : filename for the weight file to be generated + + """ + # convert weights into hlslib-compatible format + weight_tensor = self.get_hls_compatible_weight_tensor(weights) + export_wdt = self.get_weight_datatype() + # we have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + if weight_file_mode == "hls_header": + weight_hls_code = numpy_to_hls_code(weight_tensor, export_wdt, "weights", True, True) + # write weights into C++ header file as dictated by finn-hlslib + f_weights = open(weight_file_name, "w") + if export_wdt.bitwidth() != 1: + f_weights.write( + "const FixedPointWeights<{},{},{},{}> weights = ".format( + self.get_nodeattr("SIMD"), + export_wdt.get_hls_datatype_str(), + self.get_nodeattr("PE"), + self.calc_wmem(), + ) + ) + else: + f_weights.write( + "const BinaryWeights<{},{},{}> weights = ".format( + self.get_nodeattr("SIMD"), + self.get_nodeattr("PE"), + self.calc_wmem(), + ) + ) + f_weights.write(weight_hls_code) + f_weights.close() + elif "decoupled" in weight_file_mode: + # create a weight stream for various flavors of decoupled mode: + # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD) + weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3)) + # reverse SIMD flip for saving weights in .npy + weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1) + # PE flip for saving weights in .dat + weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2) + # reshape weight tensor (simd_flipped and pe_flipped) to desired shape + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + # simd_flipped + weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape(1, -1, pe * 
simd) + weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy() + # flipped + weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(1, -1, pe * simd) + weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy() + if weight_file_mode == "decoupled_npy": + # save weight stream into npy for cppsim + np.save(weight_file_name, weight_tensor_simd_flipped) + elif weight_file_mode == "decoupled_verilog_dat": + # convert weight values into hexstring + weight_width = self.get_weightstream_width() + # pad to nearest 4 bits to get hex strings + weight_width_padded = roundup_to_integer_multiple(weight_width, 4) + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" + ) + # add zeroes to pad out file to 1024 entries + weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_stream.copy() + with open(weight_file_name, "w") as f: + for val in weight_stream: + f.write(val + "\n") + elif weight_file_mode == "decoupled_runtime": + # memstream axi-lite interface will map each mem line to + # one or multiple 32-bit words + weight_width = self.get_weightstream_width() + words_per_memwidth = 2 ** math.ceil(math.log2(weight_width / 32)) + if words_per_memwidth < 1: + words_per_memwidth = 1 + weight_width_padded = words_per_memwidth * 32 + # first, pack and ensure padding to 32 bits + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" + ) + weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_stream.copy() + with open(weight_file_name, "w") as f: + for val in weight_stream: + # split into groups of 8 hex digits (= 32 bits) + words_32b = textwrap.wrap(val, 8) + words_32b.reverse() + for word_32b in words_32b: + f.write(word_32b + "\n") + else: + raise Exception("Unknown weight_file_mode") + + else: + raise Exception("Unknown weight_file_mode") + + def generate_params(self, model, 
path): + mem_mode = self.get_nodeattr("mem_mode") + code_gen_dir = path + # weights, if not external + weights = model.get_initializer(self.onnx_node.input[1]) + if mem_mode == "const": + # save hlslib-compatible weights in params.h + weight_filename = "{}/params.h".format(code_gen_dir) + self.make_weight_file(weights, "hls_header", weight_filename) + elif mem_mode == "decoupled" or mem_mode == "external": + weight_filename_sim = "{}/weights.npy".format(code_gen_dir) + # save decoupled weights for cppsim + self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) + if mem_mode == "decoupled": + # also save weights as Verilog .dat file + # This file will be ignored when synthesizing UltraScale memory. + weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) + self.make_weight_file(weights, "decoupled_verilog_dat", weight_filename_rtl) + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) + + # save thresholds in thresh.h + if len(self.onnx_node.input) > 2: + thresholds = model.get_initializer(self.onnx_node.input[2]) + if thresholds is not None: + threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + # use UINT32 threshold export for bipolar times bipolar + inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) + # get computed threshold datatype from attribute + tdt = DataType[self.get_nodeattr("accDataType")] + + assert np.vectorize(tdt.allowed)( + threshold_tensor + 
).all(), "Thresholds in %s can't be expressed with type %s" % ( + self.onnx_node.name, + str(tdt), + ) + thresholds_hls_code = numpy_to_hls_code( + threshold_tensor, tdt, "thresholds", False, True + ) + # write thresholds into thresh.h + f_thresh = open("{}/thresh.h".format(code_gen_dir), "w") + tdt_hls = tdt.get_hls_datatype_str() + # use binary to export bipolar activations + export_odt = self.get_output_datatype() + if self.get_output_datatype() == DataType["BIPOLAR"]: + export_odt = DataType["BINARY"] + odt_hls = export_odt.get_hls_datatype_str() + f_thresh.write( + "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \ + = ".format( + self.calc_tmem(), + self.get_nodeattr("PE"), + threshold_tensor.shape[-1], + tdt_hls, + odt_hls, + self.get_nodeattr("ActVal"), + "comp::less_equal<%s, %s>" % (tdt_hls, tdt_hls), + ) + ) + f_thresh.write(thresholds_hls_code) + f_thresh.close() + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") + node = self.onnx_node + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + # the third input are the thresholds + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for VectorVectorActivation") + in_ind += 1 + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + # reinterpret binary output as bipolar where needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == self.get_normal_output_shape() + ), "cppsim did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + + if mem_mode == "external" or mem_mode == "decoupled": + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + # we have 
converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) + dim_h, dim_w = self.get_nodeattr("Dim") + num_w_reps = dim_h * dim_w + + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def code_generation_ipgen(self, model, fpgapart, clk): + """Normally: Generates C++ code and tcl script for IP generation. 
+ Here: Generates (System-)Verilog code for IP generation.""" + self.generate_hdl(model, fpgapart, clk) + + def ipgen_singlenode_code(self): + """Normally: Builds the bash script for IP generation.""" + pass + + def code_generation_cppsim(self, model): + """Normally: Generates C++ code for simulation (cppsim).""" + pass + + def compile_singlenode_code(self): + pass + + def global_includes(self): + pass + + def defines(self, var): + pass + + def read_npy_data(self): + pass + + def strm_decl(self): + pass + + def docompute(self): + pass + + def dataoutstrm(self): + pass + + def save_as_npy(self): + pass + + def blackboxfunction(self): + pass + + def pragmas(self): + pass + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) + if mem_mode == "decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names + + def code_generation_ipi(self): + cmd = [] + # add streamer if needed + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled": + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if self.get_nodeattr("ram_style") == "ultra": + assert ( + runtime_writable == 1 + ), "Layer with URAM weights must have runtime_writeable_weights=1" + node_name = self.onnx_node.name + sname = self.hls_sname() + # create a hierarchy for this layer, with the same port names + clk_name = self.get_verilog_top_module_intf_names()["clk"][0] + rst_name = self.get_verilog_top_module_intf_names()["rst"][0] + dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] + din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] + 
cmd.append("create_bd_cell -type hier %s" % node_name) + cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) + cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) + cmd.append( + "create_bd_intf_pin -mode Master " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) + ) + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) + ) + # instantiate the RTL block + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + rtllib_dir + "mvu_vvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + rtllib_dir + "mvu_vvu_lut.sv", + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append( + "create_bd_cell -type hier -reference %s /%s/%s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + self.onnx_node.name, + ) + ) + + # instantiate a streamer and connect it to the HLS IP + strm_vlnv = "amd.com:finn:memstream:1.0" + strm_inst = node_name + "_wstrm" + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" % (strm_vlnv, node_name, strm_inst) + ) + cmd.append( + "set_property -dict [list " + "CONFIG.DEPTH {%d} " + "CONFIG.WIDTH {%d} " + "CONFIG.INIT_FILE {%s} " + "CONFIG.RAM_STYLE {%s} " + "] [get_bd_cells /%s/%s]" + % ( + self.calc_wmem(), + self.get_weightstream_width_padded(), + self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat", + self.get_nodeattr("ram_style"), + node_name, + strm_inst, + ) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] " + "[get_bd_intf_pins %s/%s/weights_%s]" + % (node_name, strm_inst, node_name, node_name, sname) + ) + cmd.append( + "connect_bd_net 
[get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]" + % (node_name, rst_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]" + % (node_name, clk_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, rst_name, node_name, node_name, rst_name) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, clk_name, node_name, node_name, clk_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, din_name, node_name, node_name, din_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, dout_name, node_name, node_name, dout_name) + ) + if runtime_writable: + # expose axi lite interface for writeable weights + axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0] + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" % (node_name, axilite_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, axilite_name, node_name, strm_inst, axilite_name) + ) + # TODO calculate and pass in segment size here + cmd.append("assign_bd_address") + cmd.append("save_bd_design") + elif mem_mode == "external": + # instantiate the RTL block + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + rtllib_dir + "mvu_vvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + rtllib_dir + "mvu_vvu_lut.sv", + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append( + 
"create_bd_cell -type module -reference %s %s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + ) + ) + else: + raise Exception("Unrecognized mem_mode for VectorVectorActivation") + return cmd + + def uram_estimation(self): + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle != "ultra") + or (mmode == "const") + or (mmode == "external") + ): + return 0 + width_multiplier = math.ceil(mem_width / 72) + depth_multiplier = math.ceil(omega / 4096) + return width_multiplier * depth_multiplier + + def bram_estimation(self): + """Calculates resource estimation for BRAM""" + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + mem_width = Q * W * P + # assuming SDP mode RAMB18s (see UG573 Table 1-10) + # since this is HLS memory, not using the full width of a BRAM + # assuming memories up to 128 deep get implemented in LUTs + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) + or (mstyle == "auto" and self.calc_wmem() <= 128) + or (mmode == "const" and self.calc_wmem() <= 128) + or (mmode == "external") + ): + return 0 + + if mem_width == 1: + return math.ceil(omega / 16384) + elif mem_width == 2: + return math.ceil(omega / 8192) + elif mem_width <= 4: + return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) + elif mem_width <= 9: + return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 8)) + elif mem_width <= 18 or omega > 512: + return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 16)) + else: + return (math.ceil(omega / 512)) * (math.ceil(mem_width / 32)) + + def 
bram_efficiency_estimation(self): + P = self.get_nodeattr("PE") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + bram16_est = self.bram_estimation() + if bram16_est == 0: + return 1 + wbits = W * P * omega + bram16_est_capacity = bram16_est * 36 * 512 + return wbits / bram16_est_capacity + +# TODO: fix estimations + def lut_estimation(self): + """Calculates resource estimations for LUTs based on: + - FINN-R: An End-to-End Deep-Learning Framework for Fast + Exploration of Quantized Neural Networks + - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, + Y. Umuroglu, M. Leeser and K. Vissers + - 12. Sep 2018 + """ + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + # determine tdt with input and weight data types + idt = self.get_input_datatype() + A = idt.bitwidth() + # parameters from experiments in paper mentioned above + c0 = 300 + c1 = 1.1 + c2 = 0 + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "decoupled" and mstyle == "distributed") or ( + mmode == "const" and self.calc_wmem() <= 128 + ): + c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) + + # multiplication + res_type = self.get_nodeattr("resType") + if res_type == "dsp": + mult_luts = 0 + else: + mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) + # adder tree + addertree_luts = (W + A) * (2 * Q - 1) + # accumulator + acc_datatype = self.get_accumulator_datatype() + acc_bits = acc_datatype.bitwidth() + k_h, k_w = self.get_nodeattr("Kernel") + # if accDataType is not set, then it will default to INT32, which would + # be a large overestimate in most (if not all) cases. 
In this scenario, + # we would use the minimum accumulator as determined by the data types + # bound, derived in https://arxiv.org/abs/2301.13376 + alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed()) + acc_bits = min( + acc_datatype.bitwidth(), + np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), + ) + acc_luts = acc_bits + # thresholds and threshold comparators + thr_luts = 0 + comp_luts = 0 + noact = self.get_nodeattr("noActivation") + # TODO - add 'ram_style_threshold' node attribute + if noact == 0: + odt = self.get_output_datatype() + B = odt.bitwidth() + thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64 + comp_luts = (2**B - 1) * acc_bits + + return int( + c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 + ) + +# TODO: fix estimations + def dsp_estimation(self): + # multiplication + P = self.get_nodeattr("PE") + res_type = self.get_nodeattr("resType") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + idt = self.get_input_datatype() + A = idt.bitwidth() + if res_type == "dsp": + mult_dsp = P * np.ceil((W + A) / 48) # TODO: more accurate modelling + else: + mult_dsp = 0 + return int(mult_dsp) + + def get_weightstream_width(self): + """Returns weight stream width. Used only in decoupled mode.""" + if ( + self.get_nodeattr("mem_mode") == "decoupled" + or self.get_nodeattr("mem_mode") == "external" + ): + simd = self.get_nodeattr("SIMD") + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + w_width = simd * pe * wp + return w_width + else: + return 0 + + def get_weightstream_width_padded(self): + """Returns weight stream width padded to a multiple of 8. This is required + by the AXI Stream spec. 
Used in decoupled mode.""" + weight_width = self.get_weightstream_width() + return roundup_to_integer_multiple(weight_width, 8) + + def get_op_and_param_counts(self): + k_h, k_w = self.get_nodeattr("Kernel") + fm = self.get_nodeattr("Channels") + dim_h, dim_w = self.get_nodeattr("Dim") + weight_bits = self.get_weight_datatype().bitwidth() + inp_bits = self.get_input_datatype().bitwidth() + num_repetitions = int(dim_h * dim_w) + mac_count = k_h * k_w * fm * num_repetitions + # cannonicalize op type: highest bitwidth operand first s.t. + # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types + bw1 = min(inp_bits, weight_bits) + bw2 = max(inp_bits, weight_bits) + mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) + weight_param_type = "param_weight_%db" % (weight_bits) + weight_count = k_h * k_w * fm + ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} + if self.get_nodeattr("noActivation") == 0: + tdt = DataType[self.get_nodeattr("accDataType")] + thres_bits = tdt.bitwidth() + thres_param_type = "param_threshold_%db" % (thres_bits) + thres_count = fm + ret_dict[thres_param_type] = thres_count + return ret_dict + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + + def generate_hdl(self, model, fpgapart, clk): + # Generate params as part of IP preparation + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + self.generate_params(model, code_gen_dir) + + template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk) + # add general parameters 
to dictionary + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [ + self.get_verilog_top_module_name() + ] + # save top module name so we can refer to it after this node has been renamed + # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) + self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) + + # apply code generation to template + with open(template_path, "r") as f: + template_wrapper = f.read() + for key in code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(code_gen_dict[key]) + template_wrapper = template_wrapper.replace(key, code_gen_line) + with open( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + "w", + ) as f: + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(0))) + with open( + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v" + ), + "w", + ) as f: + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(1))) + + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def _resolve_segment_len(self, clk): + # Insert pipeline registers in the DSP58 chain to meet target clock frequency + # 0.741 ns seems the worst-case delay through first DSP + # 0.605 ns seems to be (on average) delay for all subsequent DSPs + critical_path_dsps = np.floor((clk - 0.741) / 0.605) + max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3) + dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len + return max(1, dsp_chain_len) + + def _resolve_impl_style(self, fpgapart): + # Based on target device and activation/weight-width, choose the + # supported RTL compute core + if self.get_nodeattr("resType") == "lut": + return "mvu_vvu_lut" + else: + is_dsp_targeted = self.get_nodeattr("resType") == "dsp" + is_versal = ( + fpgapart[0:4] 
in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] + or fpgapart[0:5] == "xqrvc" + ) + assert (is_dsp_targeted and is_versal), "DSP-based (RTL) VVU currently only supported on Versal (DSP58) devices" + return "mvu_vvu_8sx9_dsp58" + + + def prepare_codegen_default(self, fpgapart, clk): + template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v" + + code_gen_dict = {} + code_gen_dict["$IS_MVU$"] = [str(0)] + code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)] + mw = int(np.prod(self.get_nodeattr("Kernel")) * self.get_nodeattr("Channels")) + code_gen_dict["$MW$"] = [str(mw)] + code_gen_dict["$MH$"] = [str(1)] + code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] + code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))] + code_gen_dict["$ACTIVATION_WIDTH$"] = [ + str(self.get_input_datatype(0).bitwidth()) + ] + code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())] + code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())] + code_gen_dict["$SIGNED_ACTIVATIONS$"] = ( + [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] + ) + code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] + #code_gen_dict["$FORCE_BEHAVIORAL$"] = [str(0)] + + return template_path, code_gen_dict + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # Path to (System-)Verilog files used by top-module & path to top-module + verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"] + verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"] + + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + 
build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + + return sim \ No newline at end of file From 62cec5056eac6f4a28c0a4ea87051a6c0123dd41 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 13 Oct 2023 23:37:08 +0100 Subject: [PATCH 183/235] [dwc pw]: added new custom-op SDWC operating on SWG with parallel window mode enabled --- ...datawidthconverter_parallelwindow_batch.py | 390 ++++++++++++++++++ 1 file changed, 390 insertions(+) create mode 100644 src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_parallelwindow_batch.py diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_parallelwindow_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_parallelwindow_batch.py new file mode 100644 index 0000000000..6a72f17555 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_parallelwindow_batch.py @@ -0,0 +1,390 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import math +import numpy as np +import os +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +# does not do anything at the ONNX node-by-node level, and input-output +# tensor shapes are the same. performs data width conversion at the rtlsim level + + +class StreamingDataWidthConverter_ParallelWindow_Batch(HLSCustomOp): + """Class that corresponds to finn-hlslib StreamingDataWidthConverter_ParallelWindow_Batch + function. 
To be inserted between an RTL-SWG with parallel window mode enabled and a + VVU.""" + + def get_nodeattr_types(self): + my_attrs = { + # shape of input/output tensors + "shape": ("ints", True, []), + # bit width of input and output streams + "inWidth": ("i", True, 0), + "outWidth": ("i", True, 0), + # FINN DataTypes for inputs/outputs + "dataType": ("s", True, ""), + "SIMD": ("i", True, 0), + "PE": ("i", True, 0), + "Channels": ("i", True, 0), + "Kernel": ("ints", True, []), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("dataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("dataType")] + + def get_normal_input_shape(self, ind=0): + ishape = self.get_nodeattr("shape") + return ishape + + def get_normal_output_shape(self, ind=0): + oshape = self.get_nodeattr("shape") + return oshape + + def get_folded_input_shape(self, ind=0): + iwidth = self.get_nodeattr("inWidth") + ishape = self.get_normal_input_shape() + dummy_t = np.random.randn(*ishape) + ibits = self.get_input_datatype().bitwidth() + assert ( + iwidth % ibits == 0 + ), """DWC input width must be divisible by + input element bitwidth""" + ielems = int(iwidth // ibits) + ichannels = ishape[-1] + new_shape = [] + for i in ishape[:-1]: + new_shape.append(i) + new_shape.append(int(ichannels // ielems)) + new_shape.append(ielems) + dummy_t = dummy_t.reshape(new_shape) + return dummy_t.shape + + def get_folded_output_shape(self, ind=0): + owidth = self.get_nodeattr("outWidth") + oshape = self.get_normal_output_shape() + dummy_t = np.random.randn(*oshape) + obits = self.get_output_datatype().bitwidth() + assert ( + owidth % obits == 0 + ), """DWC output width must be divisible by + input element bitwidth""" + oelems = int(owidth // obits) + ochannels = oshape[-1] + new_shape = [] + for i in oshape[:-1]: 
+ new_shape.append(i) + new_shape.append(int(ochannels // oelems)) + new_shape.append(oelems) + dummy_t = dummy_t.reshape(new_shape) + + return dummy_t.shape + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[:-1]) + + def get_instream_width(self, ind=0): + in_width = self.get_nodeattr("inWidth") + return in_width + + def get_outstream_width(self, ind=0): + out_width = self.get_nodeattr("outWidth") + return out_width + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == tuple(exp_ishape), "Unexpect input shape for StreamingDWC." + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("dataType", idt.name) + # data type stays the same + model.set_tensor_datatype(node.output[0], idt) + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify the number of inputs + if len(self.onnx_node.input) == 1: + info_messages.append("The number of inputs is correct") + else: + info_messages.append("""StreamingDWC needs 1 data input""") + + return info_messages + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] + + def defines(self, var): + numReps = 1 + numInWords = int(np.prod(self.get_folded_input_shape()[:-1])) + 
inWidth = self.get_nodeattr("inWidth") + outWidth = self.get_nodeattr("outWidth") + simd = self.get_nodeattr("SIMD") + pe = self.get_nodeattr("PE") + channels = self.get_nodeattr("Channels") + nf = int(channels/pe) + sf = int(np.prod(self.get_nodeattr("Kernel")) / simd) + actWidth = DataType[self.get_nodeattr("dataType")].bitwidth() + self.code_gen_dict["$DEFINES$"] = [ + "#define InWidth %d " % inWidth, + "#define OutWidth %d " % outWidth, + "#define SIMD %d " % simd, + "#define PE %d " % pe, + "#define Channels %d " % channels, + "#define NF %d " % nf, + "#define SF %d " % sf, + "#define ActWidth %d " % actWidth, + "#define NumInWords %d " % numInWords, + "#define numReps %d" % numReps, + ] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + # TODO continue with fxns below, they are copy-pasted + op = "StreamingDataWidthConverter_ParallelWindow_Batch" + self.code_gen_dict["$DOCOMPUTE$"] = [ + 
"%s(in0_%s, out_%s, numReps);" + % (op, self.hls_sname(), self.hls_sname()) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + in_packed_bits = self.get_instream_width() + in_packed_hls_type = "ap_uint<%d>" % in_packed_bits + out_packed_bits = self.get_outstream_width() + out_packed_hls_type = "ap_uint<%d>" % out_packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + in_packed_hls_type, + self.hls_sname(), + out_packed_hls_type, + self.hls_sname(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_shape = self.get_normal_input_shape() + folded_ishape = self.get_folded_input_shape() + + # TODO ensure codegen dir exists + if mode == "cppsim": + 
code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == tuple(exp_shape), "Input shape does not match expected shape." + + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + inp = (inp + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + # reshape input into folded shape + reshaped_input = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = reshaped_input.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + output = inp + output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) + context[node.output[0]] = output + + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(exp_shape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to "rtlsim" """.format( + mode + ) + ) + # binary -> bipolar if needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert context[node.output[0]].shape == tuple( + exp_shape + ), """Output + shape doesn't match expected shape, should be same as input shape""" + + def code_generation_ipi(self): + return super().code_generation_ipi() + + def lut_estimation(self): + """Calculates resource estimations for LUTs""" + return 0 + + def prepare_rtlsim(self): + super().prepare_rtlsim() + + def code_generation_ipgen(self, model, fpgapart, clk): + super().code_generation_ipgen(model, fpgapart, clk) + + def ipgen_singlenode_code(self): + super().ipgen_singlenode_code() From 511f8353d2a91c66e33761ae3c83cb1e43608988 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 13 Oct 2023 23:38:12 +0100 Subject: [PATCH 184/235] [transformation]: extended InsertDWC transformation to instantiate a StreamingDataWidthConverter_ParallelWindow_Batch when applicable --- .../transformation/fpgadataflow/insert_dwc.py | 55 ++++++++++++++----- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index 140d154b1a..b779241c11 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -7,7 +7,7 @@ def _is_dwc_node(node): - if node.op_type == "StreamingDataWidthConverter_Batch": + if node.op_type in ["StreamingDataWidthConverter_Batch", "StreamingDataWidthConverter_ParallelWindow_Batch"]: return True else: return False @@ -30,6 +30,12 @@ def _suitable_node(node): return False +def _is_parallel_window_mode(producer, consumer): + if producer.get_nodeattr("parallel_window") == 1 and consumer.op_type in ["VectorVectorActivation", "VectorVectorActivation_rtl"]: + return True + else: + return 
False + class InsertDWC(Transformation): """Add data width converters between layers where necessary.""" @@ -98,19 +104,40 @@ def apply(self, model): dwc_shape, ) graph.value_info.append(dwc_output_tensor) - - dwc_node = oh.make_node( - "StreamingDataWidthConverter_Batch", - [output_name], - [dwc_output_tensor.name], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - shape=dwc_shape, - inWidth=dwc_in_width, - outWidth=dwc_out_width, - dataType=str(dtype.name), - impl_style=impl_style, - ) + + if _is_parallel_window_mode(n0, consumer): + simd = n1.get_nodeattr("SIMD") + pe = n1.get_nodeattr("PE") + channels = n1.get_nodeattr("Channels") + kernel = n1.get_nodeattr("Kernel") + dwc_node = oh.make_node( + "StreamingDataWidthConverter_ParallelWindow_Batch", + [output_name], + [dwc_output_tensor.name], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + shape=dwc_shape, + inWidth=dwc_in_width, + outWidth=dwc_out_width, + dataType=str(dtype.name), + SIMD=simd, + PE=pe, + Channels=channels, + Kernel=kernel, + ) + else: + dwc_node = oh.make_node( + "StreamingDataWidthConverter_Batch", + [output_name], + [dwc_output_tensor.name], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + shape=dwc_shape, + inWidth=dwc_in_width, + outWidth=dwc_out_width, + dataType=str(dtype.name), + impl_style=impl_style, + ) # insert dwc graph.node.insert(node_ind + 1, dwc_node) From 4d949d665a2014af3be0476f414c9102001d2db8 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 13 Oct 2023 23:38:54 +0100 Subject: [PATCH 185/235] [custom op]: added 2 new custom-ops --- src/finn/custom_op/fpgadataflow/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 19c0ddd999..001e95cdc7 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -57,12 +57,14 @@ from 
finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import ( StreamingDataWidthConverter_Batch, ) +from finn.custom_op.fpgadataflow.streamingdatawidthconverter_parallelwindow_batch import StreamingDataWidthConverter_ParallelWindow_Batch from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour_Batch from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation +from finn.custom_op.fpgadataflow.vectorvectoractivation_rtl import VectorVectorActivation_rtl custom_op = dict() @@ -77,6 +79,7 @@ custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl custom_op["TLastMarker"] = TLastMarker custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch +custom_op["StreamingDataWidthConverter_ParallelWindow_Batch"] = StreamingDataWidthConverter_ParallelWindow_Batch custom_op["StreamingFIFO"] = StreamingFIFO custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch custom_op["Pool_Batch"] = Pool_Batch @@ -86,6 +89,7 @@ custom_op["LabelSelect_Batch"] = LabelSelect_Batch custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch custom_op["VectorVectorActivation"] = VectorVectorActivation +custom_op["VectorVectorActivation_rtl"] = VectorVectorActivation_rtl custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch custom_op["IODMA"] = IODMA custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition From 05751c447dda411fbf68b2688a5186b350961713 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 13 Oct 2023 23:40:32 +0100 Subject: [PATCH 186/235] [VVU_RTL test]: added test for RTL-based VVU, which includes testing the StreamingDataWidthConverter_ParallelWindow_Batch --- 
.../test_fpgadataflow_vvau_rtl.py | 214 ++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100644 tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py new file mode 100644 index 0000000000..abf6029f59 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py @@ -0,0 +1,214 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest +import os + +import numpy as np +from onnx import TensorProto, helper +from qonnx.util.basic import ( + qonnx_make_model, + gen_finn_dt_tensor +) +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.core.datatype import DataType +from qonnx.transformation.general import GiveUniqueNodeNames +import finn.core.onnx_exec as oxe +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from qonnx.transformation.general import ApplyConfig +import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl +from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.transformation.general import GiveReadableTensorNames +import pickle +from finn.transformation.fpgadataflow.insert_dwc import InsertDWC +from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition +from qonnx.custom_op.registry import getCustomOp +from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth +#import qonnx.core.data_layout as DataLayout + +build_dir = os.environ["FINN_BUILD_DIR"] + +def make_single_dw_conv_modelwrapper(conv_config, idt, wdt): + kernel_size, in_feature_dim, in_chn = conv_config + stride = 1 + pad = 0 + + out_feature_dim = compute_conv_output_dim(in_feature_dim, kernel_size, stride, pad) + group = out_chn 
= in_chn + + conv_param_shape = [out_chn, 1, kernel_size, kernel_size] + input_shape = [1, in_chn, in_feature_dim, in_feature_dim] + output_shape = [1, out_chn, out_feature_dim, out_feature_dim] + + conv_config = {} + conv_config["dilations"] = [1, 1] + conv_config["group"] = group + conv_config["kernel_shape"] = [kernel_size, kernel_size] + conv_config["pads"] = [pad, pad, pad, pad] + conv_config["strides"] = [stride, stride] + + ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, input_shape) + ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, output_shape) + weights = [helper.make_tensor_value_info("weights", TensorProto.FLOAT, conv_param_shape)] + + modelproto = qonnx_make_model( + helper.make_graph( + name="conv_test", + inputs=[ifm], + outputs=[ofm], + value_info=weights, + nodes=[helper.make_node("Conv", ["ifm", "weights"], ["ofm"], **conv_config)], + ) + ) + + model = ModelWrapper(modelproto) + model.set_tensor_datatype("ifm", idt) + model.set_tensor_datatype("weights", wdt) + model.set_initializer("weights", gen_finn_dt_tensor(wdt, conv_param_shape)) + + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + + return model + +def prepare_inputs(input_tensor): + return {"global_in": input_tensor} + +@pytest.mark.parametrize("kernel_size", [3]) +@pytest.mark.parametrize("in_feature_dim", [5]) +@pytest.mark.parametrize("in_chn", [4]) +@pytest.mark.parametrize("idt", [DataType["INT8"]]) +#@pytest.mark.parametrize("idt", [DataType["UINT8"]]) +@pytest.mark.parametrize("wdt", [DataType["INT6"]]) +@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"]) +@pytest.mark.parametrize("segmentlen", [1]) +@pytest.mark.parametrize("pe", [4]) +@pytest.mark.parametrize("simd", [3]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, part, segmentlen, pe, simd): + # Create depthwise-separable convolution + conv_config = 
(kernel_size, in_feature_dim, in_chn) + model = make_single_dw_conv_modelwrapper(conv_config, idt, wdt) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model.save(build_dir+"/dw_conv.onnx") + + # Obtain golden reference output + golden_in = gen_finn_dt_tensor(model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in")) + input_dict = prepare_inputs(golden_in) + golden_out = oxe.execute_onnx(model, input_dict, return_full_exec_context=True) + with open(build_dir+"/onnx_dws_conv.pkl", "wb") as f: + pickle.dump(golden_out, f) + + # Convert to HLS custom-op first + model = model.transform(LowerConvsToMatMul()) + model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True)) + model = model.transform(to_hls.InferVectorVectorActivation()) + model = model.transform(MinimizeAccumulatorWidth()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model.save(build_dir+"/hls_vvau.onnx") + + # Apply folding (i.e. 
specify to use DSPs) + folding_config = { + "Defaults": {}, + "ConvolutionInputGenerator_rtl_0": { + "SIMD" : pe, + "parallel_window" : 1 + }, + "VectorVectorActivation_0": { + "PE" : pe, + "SIMD" : simd, + "mem_mode" : "decoupled", + "ram_style" : "auto", + "resType" : "dsp", + "preferred_backend" : "rtl" + } + } + model = model.transform(ApplyConfig(folding_config)) + model.save(build_dir+"/hls_vvau_folded.onnx") + + # Obtain second reference from HLS-based VVAU layer + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareIP(part, 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + conv_hls_out = oxe.execute_onnx(model, input_dict, return_full_exec_context=True) + with open(build_dir+"/hls_vvau_folded_output.pkl", "wb") as f: + pickle.dump(conv_hls_out, f) + #assert (golden_out["global_out"] == conv_hls_out["global_out"]).all() + + # Stitched-IP RTLsim + model = model.transform(CreateDataflowPartition(partition_model_dir=build_dir)) + model.save(build_dir+"/ip-stitched.onnx") + partition_model_path = getCustomOp(model.get_nodes_by_op_type("StreamingDataflowPartition")[0]).get_nodeattr("model") + partitioned_model = ModelWrapper(partition_model_path) + # FIFOs needed for stitched-ip RTLsim, DWC needed for VVU operating on SIMD parallelism + partitioned_model = partitioned_model.transform(InsertAndSetFIFODepths(part, 5)) + partitioned_model = partitioned_model.transform(PrepareIP(part, 5)) + partitioned_model = partitioned_model.transform(HLSSynthIP()) + partitioned_model.save(build_dir+"/partitioned_model.onnx") + partitioned_model = partitioned_model.transform(CreateStitchedIP(part, 5)) + partitioned_model.save(partition_model_path) + partitioned_model.set_metadata_prop("rtlsim_trace", build_dir+"/hls-vvu.vcd") + # set top-level prop for stitched-ip rtlsim and launch + partitioned_model.set_metadata_prop("exec_mode", "rtlsim") + # transpose input since we're now simulating HW layers (NCHW --> NHWC) + 
input_dict["global_in"] = np.transpose(input_dict["global_in"], (0,2,3,1)) + stitched_ip_out = oxe.execute_onnx(partitioned_model, input_dict, return_full_exec_context=True) + with open(build_dir+"/stitched_ip_output.pkl", "wb") as f: + pickle.dump(stitched_ip_out, f) + + # Apply convert-to-rtl step + partitioned_model = partitioned_model.transform(to_rtl.InferRTLVectorVectorActivation()) + partitioned_model = partitioned_model.transform(GiveUniqueNodeNames()) + partitioned_model = partitioned_model.transform(GiveReadableTensorNames()) + partitioned_model = partitioned_model.transform(PrepareIP(part, 5)) + partitioned_model = partitioned_model.transform(HLSSynthIP()) + partitioned_model = partitioned_model.transform(CreateStitchedIP(part, 5)) + partitioned_model.save(build_dir+"/partition_rtl_vvau.onnx") + partitioned_model.set_metadata_prop("rtlsim_trace", build_dir+"/rtl-vvu.vcd") + # Reset rtlsim_so path to re-generate Pyverilator sim object + partitioned_model.set_metadata_prop("rtlsim_so", "") + # set top-level prop for stitched-ip rtlsim and launch + partitioned_model.set_metadata_prop("exec_mode", "rtlsim") + vvu_rtl_out = oxe.execute_onnx(partitioned_model, input_dict, return_full_exec_context=True) + with open(build_dir+"/rtl_vvau_output.pkl", "wb") as f: + pickle.dump(vvu_rtl_out, f) + # assert (conv_hls_out["global_out"] == vvu_rtl_out["global_out"]).all(), "Mismatch" From 6d4ee089faf8232d0bc54eaf6b8c8118ab93c6f7 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 1 Nov 2023 15:20:07 +0000 Subject: [PATCH 187/235] [mvu vvu axi]: minor bugfixes to enable VVU --- finn-rtllib/mvu/mvu_vvu_axi.sv | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index f0f75c633a..ddedec1e8a 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -64,7 +64,7 @@ module mvu_vvu_axi #( localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, - localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE), + localparam int unsigned SF = MW/SIMD, localparam int unsigned NF = IS_MVU ? MH/PE : 1, localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 ) @@ -91,11 +91,11 @@ module mvu_vvu_axi #( //-------------------- Parameter sanity checks --------------------\\ initial begin - if ((MW % SIMD != 0 && IS_MVU) || (MW % (SIMD*PE) != 0 && !IS_MVU)) begin + if (MW % SIMD != 0) begin $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); $finish; end - if (MH % PE != 0 && IS_MVU) begin + if (MH % PE != 0) begin $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); $finish; end @@ -152,13 +152,10 @@ module mvu_vvu_axi #( //-------------------- Core MVU/VVU --------------------\\ uwire ovld; uwire [PE-1:0][ACCU_WIDTH-1:0] odat; - typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; uwire mvauin_t amvau_i; - uwire mvauin_weight_t wmvau_i; if (IS_MVU) begin : genMVUInput assign amvau_i = amvau; - assign wmvau_i = s_axis_weights_tdata; end : genMVUInput else begin : genVVUInput // The input stream will have the channels interleaved for VVU when PE>1 @@ -169,11 +166,8 @@ module mvu_vvu_axi #( localparam int num_of_elements = PE*SIMD; for (genvar i=0; i 1) ? - amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] + amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH -1: (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; - assign wmvau_i[i*WEIGHT_WIDTH +: WEIGHT_WIDTH] = (PE > 1) ? 
- s_axis_weights_tdata[( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD + 1) * WEIGHT_WIDTH : ( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD ) * WEIGHT_WIDTH] - : s_axis_weights_tdata[i*WEIGHT_WIDTH +: WEIGHT_WIDTH]; end : genRewire end : genVVUInput @@ -183,7 +177,7 @@ module mvu_vvu_axi #( .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(wmvau_i), .a(amvau_i), + .last(alast && avld), .zero(!istb), .w(s_axis_weights_tdata), .a(amvau_i), .vld(ovld), .p(odat) ); "mvu_4sx4u": From 39dc27ac24fcae5999536c504f2150a5f6f7be7e Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 1 Nov 2023 15:26:30 +0000 Subject: [PATCH 188/235] [mvu tb]: created separate vvu testbench and renamed mvu_vvu_axi tb --- .../tb/{mvu_vvu_axi_tb.sv => mvu_axi_tb.sv} | 16 +- finn-rtllib/mvu/tb/vvu_axi_tb.sv | 227 ++++++++++++++++++ 2 files changed, 235 insertions(+), 8 deletions(-) rename finn-rtllib/mvu/tb/{mvu_vvu_axi_tb.sv => mvu_axi_tb.sv} (96%) create mode 100644 finn-rtllib/mvu/tb/vvu_axi_tb.sv diff --git a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv similarity index 96% rename from finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv rename to finn-rtllib/mvu/tb/mvu_axi_tb.sv index b46fc588c9..8614e9f811 100644 --- a/finn-rtllib/mvu/tb/mvu_vvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -31,24 +31,24 @@ * @brief Testbench for MVU AXI-lite interface wrapper. 
*****************************************************************************/ -module mvu_vvu_axi_tb(); +module mvu_axi_tb(); //-------------------- Simulation parameters --------------------\\ // Matrix & parallelism config localparam bit IS_MVU = 0; localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58"; localparam int unsigned MW = 36; - localparam int unsigned MH = 1; - localparam int unsigned SIMD = 3; + localparam int unsigned MH = 4; + localparam int unsigned SIMD = 36; localparam int unsigned PE = 4; - localparam int unsigned SEGMENTLEN = 1.0; + localparam int unsigned SEGMENTLEN = 2.0; localparam bit FORCE_BEHAVIORAL = 1; localparam bit M_REG_LUT = 1; // Bit-width config - localparam int unsigned ACTIVATION_WIDTH = 8; - localparam int unsigned WEIGHT_WIDTH = 6; + localparam int unsigned ACTIVATION_WIDTH = 4; + localparam int unsigned WEIGHT_WIDTH = 4; localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); - localparam bit SIGNED_ACTIVATIONS = 1; + localparam bit SIGNED_ACTIVATIONS = 0; // Simulation constants localparam int unsigned NF = IS_MVU ? MH/PE : 1; localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE); @@ -238,4 +238,4 @@ module mvu_vvu_axi_tb(); .m_axis_output_tready(outputs.rdy) ); -endmodule : mvu_vvu_axi_tb +endmodule : mvu_axi_tb diff --git a/finn-rtllib/mvu/tb/vvu_axi_tb.sv b/finn-rtllib/mvu/tb/vvu_axi_tb.sv new file mode 100644 index 0000000000..fbb45845e1 --- /dev/null +++ b/finn-rtllib/mvu/tb/vvu_axi_tb.sv @@ -0,0 +1,227 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU AXI-lite interface wrapper. 
+ *****************************************************************************/ + +module vvu_axi_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam bit IS_MVU = 0; + localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58"; + localparam int unsigned MW = 25; // Kernel*Kernel + localparam int unsigned MH = 4; // Channels + localparam int unsigned SIMD = 25; // MW%SIMD == 0 + localparam int unsigned PE = 2; // MH%PE == 0 + localparam int unsigned SEGMENTLEN = 3.0; + localparam bit FORCE_BEHAVIORAL = 1; + localparam bit M_REG_LUT = 1; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 4; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = (PE*SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - PE*SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n = 0; + initial begin + repeat(16) @(posedge clk); + ap_rst_n <= 1; + end + + uwire ap_clk = clk; + + // Generate activations + typedef logic [PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[NF*SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } 
activations; + + initial begin + activations.vld = 0; + activations.dat = 'X; + @(posedge clk iff ap_rst_n); + + for (int i=0; i= 0; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = 'X; + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + for (int i = 0; i < NF; i++) begin + for (int j = 0; j < SF; j++) begin + for (int k = 0; k < PE; k++) begin + for (int l = 0; l < SIMD; l++) begin + if (SIGNED_ACTIVATIONS) + res[i][k] = $signed(res[i][k]) + $signed(a[i*SF+j][k + l*PE]) * $signed(w[i][j][k][l]); + else + res[i][k] = $signed(res[i][k]) + $signed({1'b0, a[i*SF+j][k + l*PE]}) * $signed(w[i][j][k][l]); + end + end + end + end + return res; + endfunction : check_output; + + output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS); + + int unsigned NF_CNT = 0; + initial begin + outputs.rdy = 0; + while (NF_CNT < NF) begin + // Loop until both rdy & vld are asserted + do begin + outputs.rdy <= $urandom()%7 >= 0; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden 
outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_vvu_axi #( + .IS_MVU(IS_MVU), + .COMPUTE_CORE(COMPUTE_CORE), + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), + .M_REG_LUT(M_REG_LUT) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + +endmodule : vvu_axi_tb From 87b25f9cca342f004aa79d5c4738ba1e5e8398e7 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 1 Nov 2023 15:29:42 +0000 Subject: [PATCH 189/235] [rtl-vvu custom-op]: flipped weights per SIMD-chunk to match pattern of incoming input activations easier --- .../fpgadataflow/vectorvectoractivation_rtl.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py index 72976bc9a8..3ffef9c3a5 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py +++ 
b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py @@ -467,6 +467,8 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1) # PE flip for saving weights in .dat weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2) + # SIMD & PE flip + weight_tensor_pe_simd_flipped = np.flip(weight_tensor_pe_flipped, axis=-1) # reshape weight tensor (simd_flipped and pe_flipped) to desired shape pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") @@ -476,6 +478,9 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): # flipped weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(1, -1, pe * simd) weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy() + # SIMD & PE flipped + weight_tensor_pe_simd_flipped = weight_tensor_pe_simd_flipped.reshape(1, -1, pe * simd) + weight_tensor_pe_simd_flipped = weight_tensor_pe_simd_flipped.copy() if weight_file_mode == "decoupled_npy": # save weight stream into npy for cppsim np.save(weight_file_name, weight_tensor_simd_flipped) @@ -484,11 +489,11 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): weight_width = self.get_weightstream_width() # pad to nearest 4 bits to get hex strings weight_width_padded = roundup_to_integer_multiple(weight_width, 4) - weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( - weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" + weight_tensor_pe_simd_flipped = pack_innermost_dim_as_hex_string( + weight_tensor_pe_simd_flipped, export_wdt, weight_width_padded, prefix="" ) # add zeroes to pad out file to 1024 entries - weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_tensor_pe_simd_flipped.flatten() weight_stream = weight_stream.copy() with open(weight_file_name, "w") as f: for val in weight_stream: @@ -1180,9 +1185,9 @@ def prepare_codegen_default(self, fpgapart, clk): code_gen_dict = {} 
code_gen_dict["$IS_MVU$"] = [str(0)] code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)] - mw = int(np.prod(self.get_nodeattr("Kernel")) * self.get_nodeattr("Channels")) + mw = int(np.prod(self.get_nodeattr("Kernel"))) code_gen_dict["$MW$"] = [str(mw)] - code_gen_dict["$MH$"] = [str(1)] + code_gen_dict["$MH$"] = [str(self.get_nodeattr("Channels"))] code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))] code_gen_dict["$ACTIVATION_WIDTH$"] = [ From 1476927e957100cff85205e147e1b00d4b1ba198 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 1 Nov 2023 16:01:08 +0000 Subject: [PATCH 190/235] [rtl vvu test]: extended testbench --- tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py index abf6029f59..29132da90e 100644 --- a/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py +++ b/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py @@ -116,8 +116,8 @@ def prepare_inputs(input_tensor): @pytest.mark.parametrize("wdt", [DataType["INT6"]]) @pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"]) @pytest.mark.parametrize("segmentlen", [1]) -@pytest.mark.parametrize("pe", [4]) -@pytest.mark.parametrize("simd", [3]) +@pytest.mark.parametrize("pe", [1, 2, 4]) +@pytest.mark.parametrize("simd", [1, 3, 9]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado @@ -149,7 +149,7 @@ def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, pa folding_config = { "Defaults": {}, "ConvolutionInputGenerator_rtl_0": { - "SIMD" : pe, + "SIMD" : 4, "parallel_window" : 1 }, "VectorVectorActivation_0": { @@ -172,7 +172,6 @@ def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, pa conv_hls_out = oxe.execute_onnx(model, input_dict, return_full_exec_context=True) with 
open(build_dir+"/hls_vvau_folded_output.pkl", "wb") as f: pickle.dump(conv_hls_out, f) - #assert (golden_out["global_out"] == conv_hls_out["global_out"]).all() # Stitched-IP RTLsim model = model.transform(CreateDataflowPartition(partition_model_dir=build_dir)) @@ -211,4 +210,6 @@ def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, pa vvu_rtl_out = oxe.execute_onnx(partitioned_model, input_dict, return_full_exec_context=True) with open(build_dir+"/rtl_vvau_output.pkl", "wb") as f: pickle.dump(vvu_rtl_out, f) - # assert (conv_hls_out["global_out"] == vvu_rtl_out["global_out"]).all(), "Mismatch" + + assert (vvu_rtl_out["global_out"] == golden_out["global_out"]).all(), "Output of ONNX model not matching output of stitched-IP RTL model!" + assert (vvu_rtl_out["global_out"] == stitched_ip_out["global_out"]).all(), "Output of stitched-IP HLS model not matching output of stitched-IP RTL model!" From 7fc173b44b1edd2548535d9e0d9a808f8652a805 Mon Sep 17 00:00:00 2001 From: Yaman Umuroglu Date: Tue, 14 Nov 2023 15:19:04 +0000 Subject: [PATCH 191/235] [RTLThres] compute obits in Python and use placeholder in template --- .../hdl/thresholding_axi_wrapper.v | 4 +- .../thresholding_binary_search.py | 46 ++++++++----------- 2 files changed, 19 insertions(+), 31 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v index 2657b39d98..893c791ccc 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v +++ b/finn-rtllib/thresholding/hdl/thresholding_axi_wrapper.v @@ -40,9 +40,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter SIGNED = $SIGNED$, // signed inputs parameter BIAS = $BIAS$, // offsetting the output [0, 2^N-1) -> [BIAS, 2^N-1 + BIAS) - parameter O_BITS = BIAS > 0? - /* unsigned */ $clog2(2**N+BIAS) : - /* signed */ 1+$clog2(-BIAS >= 2**(N-1)? 
-BIAS : 2**N+BIAS) + parameter O_BITS = $O_BITS$ )( //- Global Control ------------------ (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index d02b778823..5fe818f4ac 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -26,6 +26,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import math import numpy as np import os import warnings @@ -221,23 +222,17 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): ), """Threshold matrix dimension is not as expected (2).""" n_thres_steps = orig_thres_matrix.shape[1] - assert n_thres_steps == self.get_nodeattr( - "numSteps" - ), "Mismatch in threshold steps" + assert n_thres_steps == self.get_nodeattr("numSteps"), "Mismatch in threshold steps" if not self.get_input_datatype().signed(): # ensure all thresholds are nonnegative assert (orig_thres_matrix >= 0).all() # ensure all thresholds are integer - assert np.equal( - np.mod(orig_thres_matrix, 1), 0 - ).all(), "Need int threshold tensor" + assert np.equal(np.mod(orig_thres_matrix, 1), 0).all(), "Need int threshold tensor" ret = orig_thres_matrix # ensure channels = mh , duplicating if necessary if ret.shape[0] == 1: ret = np.tile(ret, (mh, 1)) - assert ( - ret.shape[0] == mh - ), "Channels of threshold matrix are not as expected (mh)" + assert ret.shape[0] == mh, "Channels of threshold matrix are not as expected (mh)" # distribute rows between PEs ret = interleave_matrix_outer_dim_from_partitions(ret, pe) assert ( @@ -268,18 +263,16 @@ def prepare_codegen_rtl_values(self): # Identify the module variables output_data_type = self.get_nodeattr("outputDataType") # output precision - input_data_type = self.get_nodeattr( 
- "inputDataType" - ) # input/threshold precision + input_data_type = self.get_nodeattr("inputDataType") # input/threshold precision num_channels = self.get_nodeattr("NumChannels") # number of channels bias = self.get_nodeattr("activation_bias") # activation bias value pe = self.get_nodeattr("PE") + o_bitwidth = DataType[output_data_type].bitwidth() + i_bitwidth = DataType[input_data_type].bitwidth() - code_gen_dict["$N$"] = [ - str(DataType[output_data_type].bitwidth()) - ] # output precision - convert bitwidth to string + code_gen_dict["$N$"] = [str(o_bitwidth)] # output precision - convert bitwidth to string code_gen_dict["$M$"] = [ - str(DataType[input_data_type].bitwidth()) + str(i_bitwidth) ] # input/threshold precision - convert bitwidth to string code_gen_dict["$C$"] = [str(num_channels)] # number of channels code_gen_dict["$BIAS$"] = [str(bias)] # activation bias value @@ -289,8 +282,13 @@ def prepare_codegen_rtl_values(self): # The thresholding core needs to know this when comparing weights to inputs if self.get_input_datatype().signed(): code_gen_dict["$SIGNED$"] = [str(1)] + o_bits = 1 + math.ceil( + -bias if -bias >= 2 ** (o_bitwidth - 1) else 2**o_bitwidth + bias + ) else: code_gen_dict["$SIGNED$"] = [str(0)] + o_bits = math.ceil(2**o_bitwidth + bias) + code_gen_dict["$O_BITS$"] = [str(o_bits)] return code_gen_dict @@ -429,18 +427,14 @@ def execute_node(self, context, graph): # Create a PyVerilator wrapper of the RTLSim .so sim = self.get_rtlsim() nbits = self.get_instream_width() - inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) super().reset_rtlsim(sim) super().toggle_clk(sim) wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() - wei = npy_to_rtlsim_input( - "{}/thresholds.npy".format(code_gen_dir), export_wdt, wnbits - ) + wei = npy_to_rtlsim_input("{}/thresholds.npy".format(code_gen_dir), 
export_wdt, wnbits) num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) io_dict = { "inputs": {"in0": inp, "weights": wei * num_w_reps}, @@ -456,9 +450,7 @@ def execute_node(self, context, graph): out_npy_path = "{}/output.npy".format(code_gen_dir) out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) # load and reshape output output = np.load(out_npy_path) @@ -475,9 +467,7 @@ def code_generation_ipi(self): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") for rtl_file in rtl_file_list: - cmd.append( - "add_files -norecurse %s" % (os.path.join(code_gen_dir, rtl_file)) - ) + cmd.append("add_files -norecurse %s" % (os.path.join(code_gen_dir, rtl_file))) # Create an RTL block, not an IP core (-type ip) cmd.append( From a62911cda8d882e4e4dbc662815053652cda4edd Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 20 Nov 2023 14:35:45 +0000 Subject: [PATCH 192/235] [mvu vvu axi]: minor fix -- define mvauin_weight_t --- finn-rtllib/mvu/mvu_vvu_axi.sv | 1 + 1 file changed, 1 insertion(+) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index ddedec1e8a..8eb92a93e6 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -153,6 +153,7 @@ module mvu_vvu_axi #( uwire ovld; uwire [PE-1:0][ACCU_WIDTH-1:0] odat; uwire mvauin_t amvau_i; + typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; if (IS_MVU) begin : genMVUInput assign amvau_i = amvau; From 4d4c61b80b01858c5da2b14a2125bd5e513a7c6b Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 20 Nov 2023 14:40:46 +0000 Subject: [PATCH 193/235] [specialize_to_rtl step]: add transformation to infer RTL-VVU --- src/finn/builder/build_dataflow_config.py | 2 +- src/finn/builder/build_dataflow_steps.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git 
a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 073bc9e12b..0d6911035c 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -116,12 +116,12 @@ class VerificationStepType(str, Enum): "step_tidy_up", "step_streamline", "step_convert_to_hls", + "step_specialize_to_rtl", "step_create_dataflow_partition", "step_target_fps_parallelization", "step_apply_folding_config", "step_minimize_bit_width", "step_generate_estimate_reports", - "step_specialize_to_rtl", "step_hls_codegen", "step_hls_ipgen", "step_set_fifo_depths", diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 2629efef11..83278aae41 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -404,6 +404,7 @@ def step_target_fps_parallelization(model: ModelWrapper, cfg: DataflowBuildConfi target_cycles_per_frame, mvau_wwidth_max=cfg.mvau_wwidth_max, two_pass_relaxation=cfg.folding_two_pass_relaxation, + fpga_part=cfg._resolve_fpga_part() ) ) # extract the suggested configuration and save it as json @@ -476,7 +477,7 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig): """Convert layers implemented in HLS to an equivalent specialized RTL implementation if possible.""" - specialize_to_rtl_transforms = [to_rtl.InferRTLMatrixVectorActivation()] + specialize_to_rtl_transforms = [to_rtl.InferRTLMatrixVectorActivation(), to_rtl.InferRTLVectorVectorActivation()] for trn in specialize_to_rtl_transforms: model = model.transform(trn) return model From 612ed8f7ee2869d98ec9ed4084b64932e7b76cb0 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 20 Nov 2023 14:41:37 +0000 Subject: [PATCH 194/235] [rtl vvu custom op]: clean-up of unused functions --- .../vectorvectoractivation_rtl.py | 162 +----------------- 1 file changed, 3 
insertions(+), 159 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py index 3ffef9c3a5..e3f0abb6c5 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py @@ -66,7 +66,7 @@ def get_nodeattr_types(self): "Dim": ("ints", True, []), # [H, W] "Channels": ("i", True, 0), "Kernel": ("ints", True, []), # [H, W] - "resType": ("s", False, "auto", {"auto", "lut", "dsp"}), + "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}), # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), "weightDataType": ("s", True, ""), @@ -77,7 +77,7 @@ def get_nodeattr_types(self): # const -- embedded weights, default, long compile/synth times # decoupled -- streaming weights with weight streamer packaged inside IP # external -- streaming weights with external streamer - "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}), + "mem_mode": ("s", False, "decoupled", {"const", "decoupled", "external"}), # (mem_mode = decoupled only) whether weights will be writable through # an AXI-lite interface during runtime # 1 for enabled, 0 for disabled. 
@@ -303,45 +303,6 @@ def get_exp_cycles(self): exp_cycles = ((ch * k_h * k_w) / pe / simd) * batch_size * (dim_h * dim_w) / mmv return int(exp_cycles) - def get_template_param_values(self): - """Returns the template parameter values according to input, output and weight - data types.""" - ret = dict() - inp_hls_str = self.get_input_datatype().get_hls_datatype_str() - out_hls_str = self.get_output_datatype().get_hls_datatype_str() - inp_is_binary = self.get_input_datatype() == DataType["BINARY"] - # out_is_binary = self.get_output_datatype() == DataType["BINARY"] - wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] - bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 - if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): - raise Exception("True binary (non-bipolar) inputs not yet supported") - inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] - # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] - wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] - # reinterpret inp/wt as bipolar if bin_xnor_mode is iset - inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) - wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) - # fill in TSrcI and TWeightI - # TODO check these with Giulio - # TODO handle non-bipolar binary inputs - if inp_is_bipolar and wt_is_bipolar: - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and wt_is_bipolar: - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Recast" - elif inp_is_bipolar and (not wt_is_bipolar): - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and (not wt_is_bipolar): - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Identity" - - # fill in TDstI - ret["TDstI"] = "Slice<%s>" % out_hls_str - - return ret - def get_hls_compatible_weight_tensor(self, orig_weight_matrix): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") @@ 
-365,57 +326,6 @@ def get_hls_compatible_weight_tensor(self, orig_weight_matrix): ret = ret.reshape(1, pe, wmem, simd) return ret - def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure MH % PE == 0 - * for bipolar weights&inputs, ensure thresholds are positive - * interleave rows between PEs - * reshape into (PE, TMEM, n_thres_steps) and return - """ - ch = self.get_nodeattr("Channels") - pe = self.get_nodeattr("PE") - tmem = self.calc_tmem() - assert ch % pe == 0, "Requirement Channels divisable by PE is violated." - assert ( - orig_thres_matrix.ndim == 2 - ), """Threshold matrix dimension is - not as expected (2).""" - n_thres_steps = orig_thres_matrix.shape[1] - inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] - wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] - # reinterpret inp/wt as bipolar if bin_xnor_mode is iset - inp_is_binary = self.get_input_datatype() == DataType["BINARY"] - wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] - bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 - inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) - wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) - if inp_is_bipolar and wt_is_bipolar: - # ensure all thresholds are nonnegative - assert (orig_thres_matrix >= 0).all() - # ensure all thresholds are integer - assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all() - ret = orig_thres_matrix - # ensure channels = mh , duplicating if necessary - if ret.shape[0] == 1: - ret = np.tile(ret, (ch, 1)) - assert ret.shape[0] == ch, "Channels of threshold matrix are not as expected (ch)" - # distribute rows between PEs - ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - assert ( - ret.shape[0] == pe - ), """First dimension after distribution of the - rows between PEs is not as 
expected (pe)""" - assert ( - ret.shape[1] == tmem - ), """Second dimension after distribution of the - rows between PEs is not as expected (tmem)""" - assert ( - ret.shape[2] == n_thres_steps - ), """Third dimension after distribution of the - rows between PEs is not as expected (n_thres_steps)""" - return ret.reshape(1, pe, tmem, n_thres_steps) - def make_weight_file(self, weights, weight_file_mode, weight_file_name): """Produce a file containing given weights in appropriate format for this layer. This file can be used for either synthesis or run-time reconfig @@ -549,55 +459,6 @@ def generate_params(self, model, path): currently no other parameter value is supported!""" ) - # save thresholds in thresh.h - if len(self.onnx_node.input) > 2: - thresholds = model.get_initializer(self.onnx_node.input[2]) - if thresholds is not None: - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) - # use UINT32 threshold export for bipolar times bipolar - inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] - wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] - # reinterpret inp/wt as bipolar if bin_xnor_mode is iset - inp_is_binary = self.get_input_datatype() == DataType["BINARY"] - wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] - bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 - inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) - wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) - # get computed threshold datatype from attribute - tdt = DataType[self.get_nodeattr("accDataType")] - - assert np.vectorize(tdt.allowed)( - threshold_tensor - ).all(), "Thresholds in %s can't be expressed with type %s" % ( - self.onnx_node.name, - str(tdt), - ) - thresholds_hls_code = numpy_to_hls_code( - threshold_tensor, tdt, "thresholds", False, True - ) - # write thresholds into thresh.h - f_thresh = open("{}/thresh.h".format(code_gen_dir), "w") - tdt_hls = tdt.get_hls_datatype_str() - # 
use binary to export bipolar activations - export_odt = self.get_output_datatype() - if self.get_output_datatype() == DataType["BIPOLAR"]: - export_odt = DataType["BINARY"] - odt_hls = export_odt.get_hls_datatype_str() - f_thresh.write( - "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \ - = ".format( - self.calc_tmem(), - self.get_nodeattr("PE"), - threshold_tensor.shape[-1], - tdt_hls, - odt_hls, - self.get_nodeattr("ActVal"), - "comp::less_equal<%s, %s>" % (tdt_hls, tdt_hls), - ) - ) - f_thresh.write(thresholds_hls_code) - f_thresh.close() - def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") mem_mode = self.get_nodeattr("mem_mode") @@ -1025,19 +886,8 @@ def lut_estimation(self): np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), ) acc_luts = acc_bits - # thresholds and threshold comparators - thr_luts = 0 - comp_luts = 0 - noact = self.get_nodeattr("noActivation") - # TODO - add 'ram_style_threshold' node attribute - if noact == 0: - odt = self.get_output_datatype() - B = odt.bitwidth() - thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64 - comp_luts = (2**B - 1) * acc_bits - return int( - c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 + c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2 ) # TODO: fix estimations @@ -1091,12 +941,6 @@ def get_op_and_param_counts(self): weight_param_type = "param_weight_%db" % (weight_bits) weight_count = k_h * k_w * fm ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} - if self.get_nodeattr("noActivation") == 0: - tdt = DataType[self.get_nodeattr("accDataType")] - thres_bits = tdt.bitwidth() - thres_param_type = "param_threshold_%db" % (thres_bits) - thres_count = fm - ret_dict[thres_param_type] = thres_count return ret_dict def derive_characteristic_fxns(self, period): From 0b31a88be11bac6e545cfda91224200b72b8d468 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 20 Nov 2023 14:43:58 +0000 Subject: [PATCH 
195/235] [folding]: first attempt to extend folding transformation to parallelize multi-packed DSPs in MVU/VVU more efficiently --- .../fpgadataflow/set_folding.py | 75 +++++++++++++++---- 1 file changed, 60 insertions(+), 15 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index eca1053f8f..871919f3f2 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -31,6 +31,7 @@ from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.core.datatype import DataType from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles @@ -80,11 +81,12 @@ class SetFolding(Transformation): unfolded before SIMD is increased """ - def __init__(self, target_cycles_per_frame=1000, mvau_wwidth_max=36, two_pass_relaxation=True): + def __init__(self, target_cycles_per_frame=1000, mvau_wwidth_max=36, two_pass_relaxation=True, fpga_part=None): super().__init__() self.target_cycles_per_frame = target_cycles_per_frame self.mvau_wwidth_max = mvau_wwidth_max self.two_pass_relaxation = two_pass_relaxation + self.fpga_part = fpga_part def optimize_attribute_val(self, node_inst, max_val, attr_name): node_inst.set_nodeattr(attr_name, 1) @@ -95,6 +97,10 @@ def optimize_attribute_val(self, node_inst, max_val, attr_name): # finish if target met break + def _is_versal(self, fpga_part): + assert fpga_part is not None, "Please specify a target board before setting the folding configuration for a more efficient folding configuration for RTL-based MVU/VVU" + return fpga_part[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpga_partt[0:5] == "xqrvc" + def apply(self, model): graph = model.graph # these ops use PE parallelism, up to a max 
value of NumChannels @@ -112,13 +118,14 @@ def apply(self, model): simd_ops = [ "DownSampler", "FMPadding_Batch", + "FMPadding_Batch_rtl", "ConvolutionInputGenerator", "ConvolutionInputGenerator1D", "ConvolutionInputGenerator_rtl", ] # these ops are preceded by depthwise SWG and have special behavior, # as explained in the SetFolding docstring - depthwise_op_exceptions = ["VectorVectorActivation", "Pool_Batch"] + depthwise_op_exceptions = ["VectorVectorActivation", "VectorVectorActivation_rtl", "Pool_Batch"] for node in graph.node: if not is_fpgadataflow_node(node): continue @@ -148,6 +155,37 @@ def apply(self, model): break # increase PE until target met or reached max_pe self.optimize_attribute_val(node_inst, max_pe, "PE") + if op_type == "MatrixVectorActivation_rtl": + max_simd = node_inst.get_nodeattr("MW") + max_pe = node_inst.get_nodeattr("MH") + node_inst.set_nodeattr("PE", 1) + node_inst.set_nodeattr("SIMD", 1) + # Depending on the board and the layer's config, either the + # SIMD or PE folding dimension would be preferred to enable efficient DSP-packing + act_width = DataType[node_inst.get_nodeattr("inputDataType")].bitwidth() + weight_width = DataType[node_inst.get_nodeattr("weightDataType")].bitwidth() + is_versal = self._is_versal(self.fpga_part) + is_dsp48 = act_width < 5 and weight_width < 5 or not(is_versal) + preferred_folding_dimension = "PE" if is_dsp48 else "SIMD" + preferred_folding_max = max_pe if is_dsp48 else max_simd + second_folding_dimension = "SIMD" if is_dsp48 else "PE" + second_folding_max = max_simd if is_dsp48 else max_pe + for fold_val in divisors(preferred_folding_max): + prev_fold_val = node_inst.get_nodeattr(preferred_folding_dimension) + node_inst.set_nodeattr(preferred_folding_dimension, fold_val) + cyc = node_inst.get_exp_cycles() + if cyc < self.target_cycles_per_frame: + # finish if target met + break + if ( + node_inst.get_weight_datatype().bitwidth() * node_inst.get_nodeattr(preferred_folding_dimension) + > 
self.mvau_wwidth_max + ): + # revert if we've gone above width threshold + node_inst.set_nodeattr(preferred_folding_dimension, prev_fold_val) + break + # increase SIMD until target met or reached max_simd + self.optimize_attribute_val(node_inst, second_folding_max, second_folding_dimension) elif op_type in pe_ops: max_pe = node_inst.get_nodeattr("NumChannels") self.optimize_attribute_val(node_inst, max_pe, "PE") @@ -156,37 +194,44 @@ def apply(self, model): self.optimize_attribute_val(node_inst, max_pe, "PE") elif op_type in depthwise_op_exceptions: # init/reset SIMD of VVAU - if op_type == "VectorVectorActivation": - node_inst.set_nodeattr("SIMD", 1) + is_hls_vvu_or_pool = op_type in ["VectorVectorActivation", "Pool_Batch"] max_pe = node_inst.get_nodeattr("Channels") - self.optimize_attribute_val(node_inst, max_pe, "PE") - # increase SIMD for VVAU once PE is exhausted - pe = node_inst.get_nodeattr("PE") + max_simd = np.prod(node_inst.get_nodeattr("Kernel")) if op_type.startswith("VectorVectorActivation") else 0 + preferred_folding_dimension = "PE" if is_hls_vvu_or_pool else "SIMD" + preferred_folding_max = max_pe if is_hls_vvu_or_pool else max_simd + second_folding_dimension = "SIMD" if is_hls_vvu_or_pool else "PE" + second_folding_max = max_simd if is_hls_vvu_or_pool else max_pe + if op_type.startswith("VectorVectorActivation"): + node_inst.set_nodeattr(second_folding_dimension, 1) + self.optimize_attribute_val(node_inst, preferred_folding_max, preferred_folding_dimension) + # increase SIMD(/PE) for VVAU once PE(/SIMD) is exhausted + fold_val = node_inst.get_nodeattr(preferred_folding_dimension) cyc = node_inst.get_exp_cycles() if ( - op_type == "VectorVectorActivation" - and pe == max_pe + op_type.startswith("VectorVectorActivation") + and fold_val == preferred_folding_max and cyc > self.target_cycles_per_frame ): - max_simd = np.prod(node_inst.get_nodeattr("Kernel")) - self.optimize_attribute_val(node_inst, max_simd, "SIMD") - # also set the folding of the 
upsteam DW SWU + self.optimize_attribute_val(node_inst, second_folding_max, second_folding_dimension) + # also set the folding of the upsteam DW SWU (in case of HLS-based VVU) # which must be identical to this node swu_node = model.find_producer(node.input[0]) if swu_node.op_type.startswith("ConvolutionInputGenerator"): swu_node_inst = getCustomOp(swu_node) - swu_node_inst.set_nodeattr("SIMD", pe) # enable parallel_window mode of RTL SWG if needed if swu_node.op_type == "ConvolutionInputGenerator_rtl": if ( - op_type == "VectorVectorActivation" + op_type.startswith("VectorVectorActivation") and node_inst.get_nodeattr("SIMD") > 1 ): swu_node_inst.set_nodeattr("parallel_window", 1) + swu_node_inst.set_nodeattr("SIMD", max_pe) else: swu_node_inst.set_nodeattr("parallel_window", 0) + pe = node_inst.get_nodeattr("PE") + swu_node_inst.set_nodeattr("SIMD", pe) else: - if op_type == "VectorVectorActivation": + if op_type.startswith("VectorVectorActivation"): ksize = np.prod(node_inst.get_nodeattr("Kernel")) elif op_type == "Pool_Batch": ksize = node_inst.get_nodeattr("KernelSize") From 92bc515255114f23fd889f3010924c07a1018fb1 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 20 Nov 2023 14:45:28 +0000 Subject: [PATCH 196/235] [to-rtl transformation]: extended with additional checker to ensure the HLS-based MVU/VVU does not have the activation function embedded --- .../transformation/fpgadataflow/specialize_to_rtl_layers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py index 5061282695..1bd83217ab 100644 --- a/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_to_rtl_layers.py @@ -79,6 +79,7 @@ def apply(self, model): ram_style = getCustomOp(n).get_nodeattr("ram_style") resType = getCustomOp(n).get_nodeattr("resType") runtime_writeable_weights = 
getCustomOp(n).get_nodeattr("runtime_writeable_weights") + assert(getCustomOp(n).get_nodeattr("noActivation")==1), "Layer {} currently has thresholds embedded. Please implement the Thresholding layer standalone to enable the RTL-based MatrixVector unit".format(n.name) new_node = helper.make_node( "MatrixVectorActivation_rtl", @@ -156,6 +157,7 @@ def apply(self, model): runtime_writeable_weights = getCustomOp(n).get_nodeattr("runtime_writeable_weights") ram_style = getCustomOp(n).get_nodeattr("ram_style") resType = getCustomOp(n).get_nodeattr("resType") + assert(getCustomOp(n).get_nodeattr("noActivation")==1), "Layer {} currently has thresholds embedded. Please implement the Thresholding layer standalone to enable the RTL-based MatrixVector unit".format(n.name) new_node = helper.make_node( "VectorVectorActivation_rtl", From 31914b1d243a597f68192b932403d8b247047056 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 27 Nov 2023 11:17:44 +0000 Subject: [PATCH 197/235] [build steps]: move specialize_to_rtl step to be applied after convert_to_hls step --- src/finn/builder/build_dataflow_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 073bc9e12b..0d6911035c 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -116,12 +116,12 @@ class VerificationStepType(str, Enum): "step_tidy_up", "step_streamline", "step_convert_to_hls", + "step_specialize_to_rtl", "step_create_dataflow_partition", "step_target_fps_parallelization", "step_apply_folding_config", "step_minimize_bit_width", "step_generate_estimate_reports", - "step_specialize_to_rtl", "step_hls_codegen", "step_hls_ipgen", "step_set_fifo_depths", From fa1d11624bffc717bd82dc52748c97f73d574ef2 Mon Sep 17 00:00:00 2001 From: Yaman Umuroglu Date: Fri, 24 Nov 2023 15:17:48 +0000 Subject: [PATCH 198/235] [Test] fix data layout for golden/ret comparison in 
RTL MVU test --- .../test_fpgadataflow_vvau_rtl.py | 129 ++++++++++-------- 1 file changed, 74 insertions(+), 55 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py index 29132da90e..25fad308ee 100644 --- a/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py +++ b/tests/fpgadataflow/test_fpgadataflow_vvau_rtl.py @@ -27,41 +27,46 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import pytest -import os import numpy as np +import os +import pickle from onnx import TensorProto, helper -from qonnx.util.basic import ( - qonnx_make_model, - gen_finn_dt_tensor -) -from qonnx.core.modelwrapper import ModelWrapper from qonnx.core.datatype import DataType -from qonnx.transformation.general import GiveUniqueNodeNames -import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from qonnx.transformation.general import ApplyConfig -import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl +from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import ( + ApplyConfig, + GiveReadableTensorNames, + GiveUniqueNodeNames, +) from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul -from qonnx.transformation.general import GiveReadableTensorNames -import pickle -from finn.transformation.fpgadataflow.insert_dwc import InsertDWC -from 
finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model + +import finn.core.onnx_exec as oxe +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl +from finn.transformation.fpgadataflow.create_dataflow_partition import ( + CreateDataflowPartition, +) from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP -from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition -from qonnx.custom_op.registry import getCustomOp -from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth -#import qonnx.core.data_layout as DataLayout +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths + +# import qonnx.core.data_layout as DataLayout build_dir = os.environ["FINN_BUILD_DIR"] + def make_single_dw_conv_modelwrapper(conv_config, idt, wdt): kernel_size, in_feature_dim, in_chn = conv_config stride = 1 @@ -69,7 +74,7 @@ def make_single_dw_conv_modelwrapper(conv_config, idt, wdt): out_feature_dim = compute_conv_output_dim(in_feature_dim, kernel_size, stride, pad) group = out_chn = in_chn - + conv_param_shape = [out_chn, 1, kernel_size, kernel_size] input_shape = [1, in_chn, in_feature_dim, in_feature_dim] output_shape = [1, out_chn, out_feature_dim, out_feature_dim] @@ -105,14 +110,16 @@ def make_single_dw_conv_modelwrapper(conv_config, idt, wdt): return model + def prepare_inputs(input_tensor): 
return {"global_in": input_tensor} + @pytest.mark.parametrize("kernel_size", [3]) @pytest.mark.parametrize("in_feature_dim", [5]) @pytest.mark.parametrize("in_chn", [4]) @pytest.mark.parametrize("idt", [DataType["INT8"]]) -#@pytest.mark.parametrize("idt", [DataType["UINT8"]]) +# @pytest.mark.parametrize("idt", [DataType["UINT8"]]) @pytest.mark.parametrize("wdt", [DataType["INT6"]]) @pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"]) @pytest.mark.parametrize("segmentlen", [1]) @@ -121,19 +128,23 @@ def prepare_inputs(input_tensor): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, part, segmentlen, pe, simd): +def test_fpgadataflow_vvau_rtl( + kernel_size, in_feature_dim, in_chn, idt, wdt, part, segmentlen, pe, simd +): # Create depthwise-separable convolution conv_config = (kernel_size, in_feature_dim, in_chn) model = make_single_dw_conv_modelwrapper(conv_config, idt, wdt) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) - model.save(build_dir+"/dw_conv.onnx") + model.save(build_dir + "/dw_conv.onnx") # Obtain golden reference output - golden_in = gen_finn_dt_tensor(model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in")) + golden_in = gen_finn_dt_tensor( + model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in") + ) input_dict = prepare_inputs(golden_in) golden_out = oxe.execute_onnx(model, input_dict, return_full_exec_context=True) - with open(build_dir+"/onnx_dws_conv.pkl", "wb") as f: + with open(build_dir + "/onnx_dws_conv.pkl", "wb") as f: pickle.dump(golden_out, f) # Convert to HLS custom-op first @@ -143,26 +154,23 @@ def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, pa model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) - 
model.save(build_dir+"/hls_vvau.onnx") + model.save(build_dir + "/hls_vvau.onnx") # Apply folding (i.e. specify to use DSPs) folding_config = { "Defaults": {}, - "ConvolutionInputGenerator_rtl_0": { - "SIMD" : 4, - "parallel_window" : 1 - }, + "ConvolutionInputGenerator_rtl_0": {"SIMD": 4, "parallel_window": 1}, "VectorVectorActivation_0": { - "PE" : pe, - "SIMD" : simd, - "mem_mode" : "decoupled", - "ram_style" : "auto", - "resType" : "dsp", - "preferred_backend" : "rtl" - } + "PE": pe, + "SIMD": simd, + "mem_mode": "decoupled", + "ram_style": "auto", + "resType": "dsp", + "preferred_backend": "rtl", + }, } model = model.transform(ApplyConfig(folding_config)) - model.save(build_dir+"/hls_vvau_folded.onnx") + model.save(build_dir + "/hls_vvau_folded.onnx") # Obtain second reference from HLS-based VVAU layer model = model.transform(SetExecMode("rtlsim")) @@ -170,28 +178,30 @@ def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, pa model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) conv_hls_out = oxe.execute_onnx(model, input_dict, return_full_exec_context=True) - with open(build_dir+"/hls_vvau_folded_output.pkl", "wb") as f: + with open(build_dir + "/hls_vvau_folded_output.pkl", "wb") as f: pickle.dump(conv_hls_out, f) # Stitched-IP RTLsim model = model.transform(CreateDataflowPartition(partition_model_dir=build_dir)) - model.save(build_dir+"/ip-stitched.onnx") - partition_model_path = getCustomOp(model.get_nodes_by_op_type("StreamingDataflowPartition")[0]).get_nodeattr("model") + model.save(build_dir + "/ip-stitched.onnx") + partition_model_path = getCustomOp( + model.get_nodes_by_op_type("StreamingDataflowPartition")[0] + ).get_nodeattr("model") partitioned_model = ModelWrapper(partition_model_path) # FIFOs needed for stitched-ip RTLsim, DWC needed for VVU operating on SIMD parallelism partitioned_model = partitioned_model.transform(InsertAndSetFIFODepths(part, 5)) partitioned_model = 
partitioned_model.transform(PrepareIP(part, 5)) partitioned_model = partitioned_model.transform(HLSSynthIP()) - partitioned_model.save(build_dir+"/partitioned_model.onnx") + partitioned_model.save(build_dir + "/partitioned_model.onnx") partitioned_model = partitioned_model.transform(CreateStitchedIP(part, 5)) partitioned_model.save(partition_model_path) - partitioned_model.set_metadata_prop("rtlsim_trace", build_dir+"/hls-vvu.vcd") + partitioned_model.set_metadata_prop("rtlsim_trace", build_dir + "/hls-vvu.vcd") # set top-level prop for stitched-ip rtlsim and launch partitioned_model.set_metadata_prop("exec_mode", "rtlsim") # transpose input since we're now simulating HW layers (NCHW --> NHWC) - input_dict["global_in"] = np.transpose(input_dict["global_in"], (0,2,3,1)) + input_dict["global_in"] = np.transpose(input_dict["global_in"], (0, 2, 3, 1)) stitched_ip_out = oxe.execute_onnx(partitioned_model, input_dict, return_full_exec_context=True) - with open(build_dir+"/stitched_ip_output.pkl", "wb") as f: + with open(build_dir + "/stitched_ip_output.pkl", "wb") as f: pickle.dump(stitched_ip_out, f) # Apply convert-to-rtl step @@ -201,15 +211,24 @@ def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, pa partitioned_model = partitioned_model.transform(PrepareIP(part, 5)) partitioned_model = partitioned_model.transform(HLSSynthIP()) partitioned_model = partitioned_model.transform(CreateStitchedIP(part, 5)) - partitioned_model.save(build_dir+"/partition_rtl_vvau.onnx") - partitioned_model.set_metadata_prop("rtlsim_trace", build_dir+"/rtl-vvu.vcd") + partitioned_model.save(build_dir + "/partition_rtl_vvau.onnx") + partitioned_model.set_metadata_prop("rtlsim_trace", build_dir + "/rtl-vvu.vcd") # Reset rtlsim_so path to re-generate Pyverilator sim object partitioned_model.set_metadata_prop("rtlsim_so", "") # set top-level prop for stitched-ip rtlsim and launch partitioned_model.set_metadata_prop("exec_mode", "rtlsim") vvu_rtl_out = 
oxe.execute_onnx(partitioned_model, input_dict, return_full_exec_context=True) - with open(build_dir+"/rtl_vvau_output.pkl", "wb") as f: + with open(build_dir + "/rtl_vvau_output.pkl", "wb") as f: pickle.dump(vvu_rtl_out, f) - - assert (vvu_rtl_out["global_out"] == golden_out["global_out"]).all(), "Output of ONNX model not matching output of stitched-IP RTL model!" - assert (vvu_rtl_out["global_out"] == stitched_ip_out["global_out"]).all(), "Output of stitched-IP HLS model not matching output of stitched-IP RTL model!" + + golden_ret = golden_out["global_out"] + # tranpose hardware-generated outputs NHWC -> NCHW to be comparable + vvu_rtl_ret = vvu_rtl_out["global_out"].transpose(0, 3, 1, 2) + hls_ret = stitched_ip_out["global_out"].transpose(0, 3, 1, 2) + + assert ( + vvu_rtl_ret == golden_ret + ).all(), "Output of ONNX model not matching output of stitched-IP RTL model!" + assert ( + vvu_rtl_ret == hls_ret + ).all(), "Output of stitched-IP HLS model not matching output of stitched-IP RTL model!" From becaac706358c724c27717890b9cf6e0e1bbcef1 Mon Sep 17 00:00:00 2001 From: Yaman Umuroglu Date: Fri, 24 Nov 2023 09:16:33 +0000 Subject: [PATCH 199/235] [RTLCustomOp] IP packaging fixes for pDWC+VVU, fix linting too --- ...datawidthconverter_parallelwindow_batch.py | 131 +++++++++++++++++- .../vectorvectoractivation_rtl.py | 77 +++++----- 2 files changed, 166 insertions(+), 42 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_parallelwindow_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_parallelwindow_batch.py index 6a72f17555..2fe7ae8e54 100644 --- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_parallelwindow_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_parallelwindow_batch.py @@ -26,22 +26,27 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import math import numpy as np import os import warnings from qonnx.core.datatype import DataType from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + # does not do anything at the ONNX node-by-node level, and input-output # tensor shapes are the same. performs data width conversion at the rtlsim level class StreamingDataWidthConverter_ParallelWindow_Batch(HLSCustomOp): """Class that corresponds to finn-hlslib StreamingDataWidthConverter_ParallelWindow_Batch - function. To be inserted between an RTL-SWG with parallel window mode enabled and a + function. To be inserted between an RTL-SWG with parallel window mode enabled and a VVU.""" def get_nodeattr_types(self): @@ -57,6 +62,9 @@ def get_nodeattr_types(self): "PE": ("i", True, 0), "Channels": ("i", True, 0), "Kernel": ("ints", True, []), + "Mode": ("s", False, ""), + # attribute to save top module name - not user configurable + "gen_top_module": ("s", False, ""), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -373,8 +381,89 @@ def execute_node(self, context, graph): ), """Output shape doesn't match expected shape, should be same as input shape""" + def prepare_codegen_default(self): + template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/dwc/dwc_axi_wrapper.v" + + code_gen_dict = {} + code_gen_dict["$IN_WIDTH$"] = [str(self.get_nodeattr("inWidth"))] + code_gen_dict["$OUT_WIDTH$"] = [str(self.get_nodeattr("outWidth"))] + code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))] + code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] + code_gen_dict["$CHANNELS$"] = [str(self.get_nodeattr("Channels"))] + code_gen_dict["$KERNEL_PROD$"] = [str(np.prod(self.get_nodeattr("Kernel")))] + code_gen_dict["$ACTIVATION_WIDTH$"] = 
[str(self.get_input_datatype(0).bitwidth())] + code_gen_dict["$MODE$"] = [self.get_nodeattr("Mode")] + + return template_path, code_gen_dict + + def generate_hdl(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + template_path, code_gen_dict = self.prepare_codegen_default() + # add general parameters to dictionary + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] + # save top module name so we can refer to it after this node has been renamed + # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) + self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) + + # apply code generation to template + with open(template_path, "r") as f: + template_wrapper = f.read() + for key in code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(code_gen_dict[key]) + template_wrapper = template_wrapper.replace(key, code_gen_line) + with open( + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), "w" + ) as f: + f.write(template_wrapper) + + shutil.copy2( + os.environ["FINN_ROOT"] + "/finn-rtllib/dwc/dwc_parallelwindow.sv", code_gen_dir + ) + shutil.copy2(os.environ["FINN_ROOT"] + "/finn-rtllib/dwc/dwc_upsample.sv", code_gen_dir) + + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def get_all_verilog_paths(self): + "Return list of all folders containing Verilog code for this node." 
+ + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/dwc/") + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + return [rtllib_dir, code_gen_dir] + def code_generation_ipi(self): - return super().code_generation_ipi() + """Constructs and returns the TCL for node instantiation in Vivado IPI.""" + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/dwc/") + source_files = [ + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + rtllib_dir + "dwc_parallelwindow.sv", + rtllib_dir + "dwc_upsample.sv", + ] + + source_target = "./ip/verilog/rtl_ops/%s" % self.onnx_node.name + cmd = ["file mkdir %s" % source_target] + + for f in source_files: + cmd.append("add_files -copy_to %s -norecurse %s" % (source_target, f)) + cmd += [ + "create_bd_cell -type module -reference %s %s" + % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) + ] + + return cmd + + def hls_sname(self): + """Get the naming convention used by Vitis HLS for stream signals + Example: the TDATA for a stream called "out" would be out_V_TDATA. 
+ """ + # no additional prefix/suffix in interface names since this is an RTL component + return "" def lut_estimation(self): """Calculates resource estimations for LUTs""" @@ -387,4 +476,38 @@ def code_generation_ipgen(self, model, fpgapart, clk): super().code_generation_ipgen(model, fpgapart, clk) def ipgen_singlenode_code(self): - super().ipgen_singlenode_code() + pass + + def code_generation_cppsim(self, model): + """Normally: Generates C++ code for simulation (cppsim).""" + pass + + def compile_singlenode_code(self): + pass + + def global_includes(self): + pass + + def defines(self, var): + pass + + def read_npy_data(self): + pass + + def strm_decl(self): + pass + + def docompute(self): + pass + + def dataoutstrm(self): + pass + + def save_as_npy(self): + pass + + def blackboxfunction(self): + pass + + def pragmas(self): + pass diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py index e3f0abb6c5..8277f36b21 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py @@ -39,13 +39,13 @@ ) from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir try: from pyverilator import PyVerilator @@ -101,7 +101,7 @@ def get_nodeattr_types(self): {"auto", "block", "distributed", "ultra"}, ), # attribute to save top module name - not user configurable - "gen_top_module": ("s", False, "") + "gen_top_module": ("s", False, ""), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -114,10 +114,6 @@ def minimize_accumulator_width(self, model): fm = self.get_nodeattr("Channels") # put weights into the shape expected by 
calculate_matvec_accumulator_range weights = weights.reshape(fm, k_h * k_w).transpose() - if len(self.onnx_node.input) > 2: - thresholds = model.get_initializer(self.onnx_node.input[2]) - else: - thresholds = None idt = self.get_input_datatype() (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt) @@ -289,7 +285,7 @@ def get_number_output_values(self): nf = np.prod(self.get_folded_output_shape()[:-1]) return nf -# TODO: fix exp_cycles estimations --> depends on fpga_part and clk + # TODO: fix exp_cycles estimations --> depends on fpga_part and clk def get_exp_cycles(self): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") @@ -451,7 +447,8 @@ def generate_params(self, model, path): if mem_mode == "decoupled": # also save weights as Verilog .dat file # This file will be ignored when synthesizing UltraScale memory. - weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) + weight_filename_rtl = self.get_decoupled_weight_filename(abspath=False) + weight_filename_rtl = code_gen_dir + "/" + weight_filename_rtl self.make_weight_file(weights, "decoupled_verilog_dat", weight_filename_rtl) else: raise Exception( @@ -622,7 +619,8 @@ def get_verilog_top_module_intf_names(self): return intf_names def code_generation_ipi(self): - cmd = [] + source_target = "./ip/verilog/rtl_ops/%s" % self.onnx_node.name + cmd = ["file mkdir %s" % source_target] # add streamer if needed mem_mode = self.get_nodeattr("mem_mode") if mem_mode == "decoupled": @@ -653,9 +651,7 @@ def code_generation_ipi(self): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") sourcefiles = [ - os.path.join( - code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" - ), + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), rtllib_dir + "mvu_vvu_axi.sv", rtllib_dir + "replay_buffer.sv", rtllib_dir + "mvu_4sx4u.sv", @@ -664,7 +660,7 @@ def code_generation_ipi(self): rtllib_dir 
+ "mvu_vvu_lut.sv", ] for f in sourcefiles: - cmd.append("add_files -norecurse %s" % (f)) + cmd += ["add_files -copy_to %s -norecurse %s" % (source_target, f)] cmd.append( "create_bd_cell -type hier -reference %s /%s/%s" % ( @@ -690,7 +686,7 @@ def code_generation_ipi(self): % ( self.calc_wmem(), self.get_weightstream_width_padded(), - self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat", + self.get_decoupled_weight_filename(abspath=False), self.get_nodeattr("ram_style"), node_name, strm_inst, @@ -747,9 +743,7 @@ def code_generation_ipi(self): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") sourcefiles = [ - os.path.join( - code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" - ), + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), rtllib_dir + "mvu_vvu_axi.sv", rtllib_dir + "replay_buffer.sv", rtllib_dir + "mvu_4sx4u.sv", @@ -836,7 +830,7 @@ def bram_efficiency_estimation(self): bram16_est_capacity = bram16_est * 36 * 512 return wbits / bram16_est_capacity -# TODO: fix estimations + # TODO: fix estimations def lut_estimation(self): """Calculates resource estimations for LUTs based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -886,11 +880,9 @@ def lut_estimation(self): np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), ) acc_luts = acc_bits - return int( - c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2 - ) + return int(c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts)) + c2) -# TODO: fix estimations + # TODO: fix estimations def dsp_estimation(self): # multiplication P = self.get_nodeattr("PE") @@ -965,9 +957,7 @@ def generate_hdl(self, model, fpgapart, clk): template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk) # add general parameters to dictionary - code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [ - self.get_verilog_top_module_name() - ] + 
code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] # save top module name so we can refer to it after this node has been renamed # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) @@ -980,16 +970,12 @@ def generate_hdl(self, model, fpgapart, clk): code_gen_line = "\n".join(code_gen_dict[key]) template_wrapper = template_wrapper.replace(key, code_gen_line) with open( - os.path.join( - code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" - ), + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), "w", ) as f: f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(0))) with open( - os.path.join( - code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v" - ), + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"), "w", ) as f: f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(1))) @@ -1019,9 +1005,10 @@ def _resolve_impl_style(self, fpgapart): fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpgapart[0:5] == "xqrvc" ) - assert (is_dsp_targeted and is_versal), "DSP-based (RTL) VVU currently only supported on Versal (DSP58) devices" + assert ( + is_dsp_targeted and is_versal + ), "DSP-based (RTL) VVU currently only supported on Versal (DSP58) devices" return "mvu_vvu_8sx9_dsp58" - def prepare_codegen_default(self, fpgapart, clk): template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v" @@ -1034,16 +1021,14 @@ def prepare_codegen_default(self, fpgapart, clk): code_gen_dict["$MH$"] = [str(self.get_nodeattr("Channels"))] code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))] - code_gen_dict["$ACTIVATION_WIDTH$"] = [ - str(self.get_input_datatype(0).bitwidth()) - ] + code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())] 
code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())] code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())] code_gen_dict["$SIGNED_ACTIVATIONS$"] = ( [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] ) code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] - #code_gen_dict["$FORCE_BEHAVIORAL$"] = [str(0)] + # code_gen_dict["$FORCE_BEHAVIORAL$"] = [str(0)] return template_path, code_gen_dict @@ -1071,4 +1056,20 @@ def prepare_rtlsim(self): # save generated lib filename in attribute self.set_nodeattr("rtlsim_so", sim.lib._name) - return sim \ No newline at end of file + return sim + + def get_all_verilog_paths(self): + "Return list of all folders containing Verilog code for this node." + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # Path to (System-)Verilog files used by top-module & path to top-module + verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"] + return verilog_paths + + def get_verilog_top_filename(self): + "Return the Verilog top module filename for this node." + + verilog_file = "{}/{}_wrapper.v".format( + self.get_nodeattr("code_gen_dir_ipgen"), self.get_nodeattr("gen_top_module") + ) + return verilog_file From cf7f4946dc44f264de665e8a23893bd858277796 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 1 Nov 2023 15:20:07 +0000 Subject: [PATCH 200/235] [mvu vvu axi]: minor bugfixes to enable VVU --- finn-rtllib/mvu/mvu_vvu_axi.sv | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index f0f75c633a..ddedec1e8a 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -64,7 +64,7 @@ module mvu_vvu_axi #( localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, - localparam int unsigned SF = IS_MVU ? MW/SIMD : MW/(SIMD*PE), + localparam int unsigned SF = MW/SIMD, localparam int unsigned NF = IS_MVU ? MH/PE : 1, localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 ) @@ -91,11 +91,11 @@ module mvu_vvu_axi #( //-------------------- Parameter sanity checks --------------------\\ initial begin - if ((MW % SIMD != 0 && IS_MVU) || (MW % (SIMD*PE) != 0 && !IS_MVU)) begin + if (MW % SIMD != 0) begin $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); $finish; end - if (MH % PE != 0 && IS_MVU) begin + if (MH % PE != 0) begin $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); $finish; end @@ -152,13 +152,10 @@ module mvu_vvu_axi #( //-------------------- Core MVU/VVU --------------------\\ uwire ovld; uwire [PE-1:0][ACCU_WIDTH-1:0] odat; - typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; uwire mvauin_t amvau_i; - uwire mvauin_weight_t wmvau_i; if (IS_MVU) begin : genMVUInput assign amvau_i = amvau; - assign wmvau_i = s_axis_weights_tdata; end : genMVUInput else begin : genVVUInput // The input stream will have the channels interleaved for VVU when PE>1 @@ -169,11 +166,8 @@ module mvu_vvu_axi #( localparam int num_of_elements = PE*SIMD; for (genvar i=0; i 1) ? - amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH : (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] + amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH -1: (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; - assign wmvau_i[i*WEIGHT_WIDTH +: WEIGHT_WIDTH] = (PE > 1) ? 
- s_axis_weights_tdata[( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD + 1) * WEIGHT_WIDTH : ( ((SIMD-1-i) + int'(i/SIMD)*SIMD) + int'(i/SIMD) * SIMD ) * WEIGHT_WIDTH] - : s_axis_weights_tdata[i*WEIGHT_WIDTH +: WEIGHT_WIDTH]; end : genRewire end : genVVUInput @@ -183,7 +177,7 @@ module mvu_vvu_axi #( .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(wmvau_i), .a(amvau_i), + .last(alast && avld), .zero(!istb), .w(s_axis_weights_tdata), .a(amvau_i), .vld(ovld), .p(odat) ); "mvu_4sx4u": From 5ffc221eaa07828001e423551ad05f8207178656 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 20 Nov 2023 14:35:45 +0000 Subject: [PATCH 201/235] [mvu vvu axi]: minor fix -- define mvauin_weight_t --- finn-rtllib/mvu/mvu_vvu_axi.sv | 1 + 1 file changed, 1 insertion(+) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index ddedec1e8a..8eb92a93e6 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -153,6 +153,7 @@ module mvu_vvu_axi #( uwire ovld; uwire [PE-1:0][ACCU_WIDTH-1:0] odat; uwire mvauin_t amvau_i; + typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; if (IS_MVU) begin : genMVUInput assign amvau_i = amvau; From 40d652ccb817295e5668ed765f8e348346584465 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 29 Nov 2023 14:02:33 +0000 Subject: [PATCH 202/235] [rtl mvu op]: minor fix to chain length estimation and enabled behavioral mode for rtl sim --- .../custom_op/fpgadataflow/matrixvectoractivation_rtl.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index d0a638475a..da560d73fd 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ 
b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -966,10 +966,12 @@ def _resolve_segment_len(self, clk): # Insert pipeline registers in the DSP58 chain to meet target clock frequency # 0.741 ns seems the worst-case delay through first DSP # 0.605 ns seems to be (on average) delay for all subsequent DSPs - critical_path_dsps = np.floor((clk - 0.741) / 0.605) + # clk >= (critical_path_dsps - 1) * 0.605 + 0.741 + assert (clk > 0.741), "Infeasible clk target of {} ns has been set, consider lowering the targeted clock frequency!".format(clk) + critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1) max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3) dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len - return max(1, dsp_chain_len) + return dsp_chain_len def _resolve_impl_style(self, fpgapart): # Based on target device and activation/weight-width, choose the @@ -1051,7 +1053,6 @@ def prepare_codegen_default(self, fpgapart, clk): [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] ) code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] - code_gen_dict["$FORCE_BEHAVIORAL$"] = [str(0)] return template_path, code_gen_dict From 3a1d9d26c93451a7d8ec2c63b0832234fd10a598 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 29 Nov 2023 17:42:58 +0000 Subject: [PATCH 203/235] [mvu vvu axi]: minor changes to enable double-pumped DSPs for uneven SIMD --- finn-rtllib/mvu/mvu_vvu_axi.sv | 47 +++++++++++++--------------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 0294d2ce88..98fd522306 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -63,13 +63,12 @@ module mvu_vvu_axi #( bit M_REG_LUT = 1, // Safely deducible parameters - localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, - localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, - localparam int unsigned SF = MW/SIMD, - localparam int unsigned NF = IS_MVU ? MH/PE : 1, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, + localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8, + localparam bit SIMD_UNEVEN = SIMD % 2 ) ( // Global Control @@ -129,17 +128,11 @@ module mvu_vvu_axi #( end end - //- Pumping Constraints --------- - if(PUMPED_COMPUTE) begin - if(SIMD % 2 != 0) begin - $error("Odd SIMD=%0d is incompatible with pumped compute.", SIMD); - $finish; - end - end end - uwire clk = ap_clk; - uwire rst = !ap_rst_n; + uwire clk = ap_clk; + uwire clk2x = ap_clk2x; + uwire rst = !ap_rst_n; //- Replay to Accommodate Neuron Fold ----------------------------------- typedef logic [(IS_MVU? 1:PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t; @@ -175,17 +168,11 @@ module mvu_vvu_axi #( // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i) - for(genvar pe = 0; pe < (IS_MVU? 1:PE); pe++) begin + for(genvar pe = 0; pe < ACT_PE; pe++) begin for(genvar simd = 0; simd < SIMD; simd++) begin - assign amvau_i[pe][simd] = amvau[]; // TODO: Do the right thing as below here. 
+ assign amvau_i[pe][simd] = amvau[simd*ACT_PE+pe]; end end - - localparam int num_of_elements = INPUT_STREAM_WIDTH/ACTIVATION_WIDTH; - for (genvar i=0; i Date: Wed, 29 Nov 2023 17:48:37 +0000 Subject: [PATCH 204/235] [axi wrapper]: add port for double-clock --- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index 01deb23840..11949dec24 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -34,6 +34,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter IS_MVU = $IS_MVU$, parameter COMPUTE_CORE = "$COMPUTE_CORE$", + parameter PUMPED_COMPUTE = $PUMPED_COMPUTE$, parameter MW = $MW$, parameter MH = $MH$, parameter PE = $PE$, @@ -54,6 +55,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #( (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) input ap_clk, + (* X_INTERFACE_PARAMETER = "ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk2x CLK" *) + input ap_clk2x, (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) input ap_rst_n, @@ -72,11 +76,12 @@ module $MODULE_NAME_AXI_WRAPPER$ #( ); mvu_vvu_axi #( - .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), + .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) ) inst ( .ap_clk(ap_clk), + .ap_clk2x(ap_clk2x), .ap_rst_n(ap_rst_n), .s_axis_weights_tdata(weights_V_TDATA), .s_axis_weights_tvalid(weights_V_TVALID), From 58f191e9cb0cf158db4a6dbc7b100cc0507d6ee6 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 1 Dec 2023 
15:05:20 +0000 Subject: [PATCH 205/235] [builder]: add flag for enabling pumped compute --- src/finn/builder/build_dataflow_config.py | 3 +++ src/finn/builder/build_dataflow_steps.py | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 0d6911035c..af1ce12dc0 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -230,6 +230,9 @@ class DataflowBuildConfig: #: very high performance. mvau_wwidth_max: Optional[int] = 36 + #: (Optional) Double-pump DSP58s in MVU/VVU layers if possible + enable_pumped_compute: Optional[bool] = False + #: (Optional) Whether thresholding layers (which implement quantized #: activations in FINN) will be implemented as stand-alone HLS layers, #: instead of being part of MatrixVectorActivation layer. This gives larger diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 83278aae41..7af3f4c3ab 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -480,6 +480,12 @@ def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig): specialize_to_rtl_transforms = [to_rtl.InferRTLMatrixVectorActivation(), to_rtl.InferRTLVectorVectorActivation()] for trn in specialize_to_rtl_transforms: model = model.transform(trn) + + # If double-pumping enabled, annotate relevant MVU/VVU layers + if cfg.enable_double_pump: + for n in model.graph.node: + if n.op_type in ["MatrixVectorActivation_rtl", "VectorVectorActivation_rtl"]: + getCustomOp(n).set_nodeattr("pumpedCompute", 1) return model From f435aed5b2a5ffc5ca03dd65d8d45a3a3c6bb2aa Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 1 Dec 2023 15:06:50 +0000 Subject: [PATCH 206/235] [hls custom op]: add clk2x interface --- src/finn/custom_op/fpgadataflow/hlscustomop.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index 4fed8ed4b5..c0b9f0735f 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -142,6 +142,7 @@ def get_verilog_top_module_intf_names(self): Each block must have at most one aximm and one axilite.""" intf_names = {} intf_names["clk"] = ["ap_clk"] + intf_names["clk2x"] = ["ap_clk2x"] intf_names["rst"] = ["ap_rst_n"] sname = self.hls_sname() intf_names["s_axis"] = [("in0_" + sname, self.get_instream_width_padded())] From 4a8ff5924868be72fdfcd64652181214f07ad388 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 1 Dec 2023 15:07:30 +0000 Subject: [PATCH 207/235] [mvu rtl]: add pumped compute attribute and fill out template accordingly --- .../fpgadataflow/matrixvectoractivation_rtl.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index da560d73fd..a66c6f4b2f 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -73,6 +73,7 @@ def get_nodeattr_types(self): "MW": ("i", True, 0), "MH": ("i", True, 0), "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}), + "pumpedCompute": ("i", False, 0, {0, 1}), # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), "weightDataType": ("s", True, ""), @@ -779,6 +780,9 @@ def code_generation_ipi(self): din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] cmd.append("create_bd_cell -type hier %s" % node_name) cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) + if self.get_nodeattr("pumpedCompute"): + clk2x_name = self.get_verilog_top_module_intf_names()["clk2x"][0] + cmd.append("create_bd_pin -dir I -type clk2x /%s/%s" % (node_name, clk2x_name)) cmd.append("create_bd_pin -dir I 
-type rst /%s/%s" % (node_name, rst_name)) cmd.append( "create_bd_intf_pin -mode Master " @@ -858,6 +862,11 @@ def code_generation_ipi(self): "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" % (node_name, clk_name, node_name, node_name, clk_name) ) + if self.get_nodeattr("pumpedCompute"): + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, clk2x_name, node_name, node_name, clk2x_name) + ) cmd.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " "[get_bd_intf_pins %s/%s/%s]" @@ -1040,6 +1049,7 @@ def prepare_codegen_default(self, fpgapart, clk): code_gen_dict = {} code_gen_dict["$IS_MVU$"] = [str(1)] code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)] + code_gen_dict["$PUMPED_COMPUTE$"] = [str(self.get_nodeattr("pumpedCompute"))] code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))] code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))] code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] From f38fd6b5cc29b5c97e684e1d9f209148cc1e7344 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 1 Dec 2023 15:08:15 +0000 Subject: [PATCH 208/235] [stitched ip]: wire up clk2x interface --- .../fpgadataflow/create_stitched_ip.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 9a653fe404..63f98e6156 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -102,6 +102,7 @@ def __init__(self, fpgapart, clk_ns, ip_name="finn_design", vitis=False, signatu # keep track of top-level interface names self.intf_names = { "clk": [], + "clk2x": [], "rst": [], "s_axis": [], "m_axis": [], @@ -113,19 +114,25 @@ def connect_clk_rst(self, node): inst_name = node.name node_inst = getCustomOp(node) clock_intf_name = node_inst.get_verilog_top_module_intf_names()["clk"][0] + clock2x_intf_name = 
node_inst.get_verilog_top_module_intf_names()["clk2x"][0] reset_intf_name = node_inst.get_verilog_top_module_intf_names()["rst"][0] # make clock and reset external, if they aren't already if not self.clock_reset_are_external: self.connect_cmds.append( "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, clock_intf_name) ) + self.connect_cmds.append( + "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, clock2x_intf_name) + ) self.connect_cmds.append("set_property name ap_clk [get_bd_ports ap_clk_0]") + self.connect_cmds.append("set_property name ap_clk2x [get_bd_ports ap_clk2x_0]") self.connect_cmds.append( "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, reset_intf_name) ) self.connect_cmds.append("set_property name ap_rst_n [get_bd_ports ap_rst_n_0]") self.clock_reset_are_external = True self.intf_names["clk"] = ["ap_clk"] + self.intf_names["clk2x"] = ["ap_clk2x"] self.intf_names["rst"] = ["ap_rst_n"] # otherwise connect clock and reset else: @@ -137,6 +144,10 @@ def connect_clk_rst(self, node): "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/%s]" % (inst_name, clock_intf_name) ) + self.connect_cmds.append( + "connect_bd_net [get_bd_ports ap_clk2x] [get_bd_pins %s/%s]" + % (inst_name, clock2x_intf_name) + ) def connect_axi(self, node): inst_name = node.name @@ -376,6 +387,13 @@ def apply(self, model): fclk_hz = fclk_mhz * 1000000 model.set_metadata_prop("clk_ns", str(self.clk_ns)) tcl.append("set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk]" % round(fclk_hz)) + tcl.append("set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk2x]" % round(2*fclk_hz)) + # tcl.append( + # "set_property CONFIG.FREQ_HZ %d [get_bd_intf_pins MatrixVectorActivation_rtl_0/s_axilite_0]" % round(fclk_hz) + # ) + # tcl.append( + # "set_property CONFIG.FREQ_HZ %d [get_bd_intf_pins MatrixVectorActivation_rtl_0/in0_V]" % round(fclk_hz) + # ) tcl.append("validate_bd_design") tcl.append("save_bd_design") # create wrapper hdl (for rtlsim later on) From 
078888af360baf455e99b473e367b3f5f4dbbaeb Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 1 Dec 2023 15:22:24 +0000 Subject: [PATCH 209/235] [mvu vvu axi]: removed SIMD%2 constraint for double-pumped DSP58 --- finn-rtllib/mvu/mvu_vvu_axi.sv | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 98fd522306..3379577046 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -63,14 +63,13 @@ module mvu_vvu_axi #( bit M_REG_LUT = 1, // Safely deducible parameters - localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, - localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8, - localparam bit SIMD_UNEVEN = SIMD % 2 -) -( + localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7)/8 * 8, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8, + localparam bit SIMD_UNEVEN = SIMD % 2 +)( // Global Control input logic ap_clk, input logic ap_clk2x, // synchronous, double-speed clock; only used for PUMPED_COMPUTE @@ -128,6 +127,13 @@ module mvu_vvu_axi #( end end + // //- Pumping Constraints --------- + // if(PUMPED_COMPUTE) begin + // if(SIMD % 2 != 0) begin + // $error("Odd SIMD=%0d is incompatible with pumped compute.", SIMD); + // $finish; + // end + // end end uwire clk = ap_clk; From bbcbb5a8819601263dd6260137f717c020103629 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 1 Dec 2023 15:42:48 +0000 Subject: [PATCH 210/235] [builder]: minor fix to attribute naming --- src/finn/builder/build_dataflow_steps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 7af3f4c3ab..29401b8f52 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -482,7 +482,7 @@ def step_specialize_to_rtl(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(trn) # If double-pumping enabled, annotate relevant MVU/VVU layers - if cfg.enable_double_pump: + if cfg.enable_pumped_compute: for n in model.graph.node: if n.op_type in ["MatrixVectorActivation_rtl", "VectorVectorActivation_rtl"]: getCustomOp(n).set_nodeattr("pumpedCompute", 1) From b72d00de9bd2d5a947a50e18e2945b832488b471 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Sun, 3 Dec 2023 22:56:02 +0000 Subject: [PATCH 211/235] [stitched-ip]: minor fixes to creating valid stitched-ip with ap_clk2x interface --- .../fpgadataflow/create_stitched_ip.py | 46 ++++++++++++------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py 
b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 63f98e6156..f797e3d841 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -110,32 +110,41 @@ def __init__(self, fpgapart, clk_ns, ip_name="finn_design", vitis=False, signatu "axilite": [], } + def _is_double_pumped(self, node): + try: + pumped_compute = getCustomOp(node).get_nodeattr("pumpedCompute") + return pumped_compute==1 + except: + return False + def connect_clk_rst(self, node): inst_name = node.name node_inst = getCustomOp(node) clock_intf_name = node_inst.get_verilog_top_module_intf_names()["clk"][0] - clock2x_intf_name = node_inst.get_verilog_top_module_intf_names()["clk2x"][0] + if self._is_double_pumped(node): + clock2x_intf_name = node_inst.get_verilog_top_module_intf_names()["clk2x"][0] reset_intf_name = node_inst.get_verilog_top_module_intf_names()["rst"][0] # make clock and reset external, if they aren't already if not self.clock_reset_are_external: self.connect_cmds.append( "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, clock_intf_name) ) - self.connect_cmds.append( - "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, clock2x_intf_name) - ) self.connect_cmds.append("set_property name ap_clk [get_bd_ports ap_clk_0]") - self.connect_cmds.append("set_property name ap_clk2x [get_bd_ports ap_clk2x_0]") self.connect_cmds.append( "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, reset_intf_name) ) self.connect_cmds.append("set_property name ap_rst_n [get_bd_ports ap_rst_n_0]") self.clock_reset_are_external = True self.intf_names["clk"] = ["ap_clk"] - self.intf_names["clk2x"] = ["ap_clk2x"] self.intf_names["rst"] = ["ap_rst_n"] - # otherwise connect clock and reset - else: + # make clk2x external, if it isn't already and connect clk and reset + elif self._is_double_pumped(node) and not self.clock2x_is_external: + self.connect_cmds.append( + "make_bd_pins_external [get_bd_pins 
%s/%s]" % (inst_name, clock2x_intf_name) + ) + self.connect_cmds.append("set_property name ap_clk2x [get_bd_ports ap_clk2x_0]") + self.clock2x_is_external = True + self.intf_names["clk2x"] = ["ap_clk2x"] self.connect_cmds.append( "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/%s]" % (inst_name, reset_intf_name) @@ -144,10 +153,21 @@ def connect_clk_rst(self, node): "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/%s]" % (inst_name, clock_intf_name) ) + # otherwise connect clock and reset + else: self.connect_cmds.append( - "connect_bd_net [get_bd_ports ap_clk2x] [get_bd_pins %s/%s]" - % (inst_name, clock2x_intf_name) + "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/%s]" + % (inst_name, reset_intf_name) ) + self.connect_cmds.append( + "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/%s]" + % (inst_name, clock_intf_name) + ) + if self._is_double_pumped(node): + self.connect_cmds.append( + "connect_bd_net [get_bd_ports ap_clk2x] [get_bd_pins %s/%s]" + % (inst_name, clock2x_intf_name) + ) def connect_axi(self, node): inst_name = node.name @@ -388,12 +408,6 @@ def apply(self, model): model.set_metadata_prop("clk_ns", str(self.clk_ns)) tcl.append("set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk]" % round(fclk_hz)) tcl.append("set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk2x]" % round(2*fclk_hz)) - # tcl.append( - # "set_property CONFIG.FREQ_HZ %d [get_bd_intf_pins MatrixVectorActivation_rtl_0/s_axilite_0]" % round(fclk_hz) - # ) - # tcl.append( - # "set_property CONFIG.FREQ_HZ %d [get_bd_intf_pins MatrixVectorActivation_rtl_0/in0_V]" % round(fclk_hz) - # ) tcl.append("validate_bd_design") tcl.append("save_bd_design") # create wrapper hdl (for rtlsim later on) From 04f5863706103a01a986497a7b6b721cb6fd5979 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Sun, 3 Dec 2023 23:00:54 +0000 Subject: [PATCH 212/235] [rtl-vvu]: add stitching support for pumped compute, minor fix to segment length estimation --- 
.../fpgadataflow/vectorvectoractivation_rtl.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py index 8277f36b21..63a00fc55f 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation_rtl.py @@ -67,6 +67,7 @@ def get_nodeattr_types(self): "Channels": ("i", True, 0), "Kernel": ("ints", True, []), # [H, W] "resType": ("s", False, "dsp", {"auto", "lut", "dsp"}), + "pumpedCompute": ("i", False, 0, {0, 1}), # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), "weightDataType": ("s", True, ""), @@ -638,6 +639,9 @@ def code_generation_ipi(self): din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] cmd.append("create_bd_cell -type hier %s" % node_name) cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) + if self.get_nodeattr("pumpedCompute"): + clk2x_name = self.get_verilog_top_module_intf_names()["clk2x"][0] + cmd.append("create_bd_pin -dir I -type clk2x /%s/%s" % (node_name, clk2x_name)) cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) cmd.append( "create_bd_intf_pin -mode Master " @@ -713,6 +717,11 @@ def code_generation_ipi(self): "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" % (node_name, clk_name, node_name, node_name, clk_name) ) + if self.get_nodeattr("pumpedCompute"): + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, clk2x_name, node_name, node_name, clk2x_name) + ) cmd.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " "[get_bd_intf_pins %s/%s/%s]" @@ -989,10 +998,12 @@ def _resolve_segment_len(self, clk): # Insert pipeline registers in the DSP58 chain to meet target clock frequency # 0.741 ns seems the worst-case delay through first DSP # 0.605 ns seems to be (on average) 
delay for all subsequent DSPs - critical_path_dsps = np.floor((clk - 0.741) / 0.605) + # clk >= (critical_path_dsps - 1) * 0.605 + 0.741 + assert (clk > 0.741), "Infeasible clk target of {} ns has been set, consider lowering the targeted clock frequency!".format(clk) + critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1) max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3) dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len - return max(1, dsp_chain_len) + return dsp_chain_len def _resolve_impl_style(self, fpgapart): # Based on target device and activation/weight-width, choose the @@ -1016,6 +1027,7 @@ def prepare_codegen_default(self, fpgapart, clk): code_gen_dict = {} code_gen_dict["$IS_MVU$"] = [str(0)] code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)] + code_gen_dict["$PUMPED_COMPUTE$"] = [str(self.get_nodeattr("pumpedCompute"))] mw = int(np.prod(self.get_nodeattr("Kernel"))) code_gen_dict["$MW$"] = [str(mw)] code_gen_dict["$MH$"] = [str(self.get_nodeattr("Channels"))] @@ -1028,7 +1040,6 @@ def prepare_codegen_default(self, fpgapart, clk): [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] ) code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] - # code_gen_dict["$FORCE_BEHAVIORAL$"] = [str(0)] return template_path, code_gen_dict From 9b80ac1d0e8baaf4a1b55eabd87e60ebd4a50396 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Sun, 3 Dec 2023 13:30:02 +0000 Subject: [PATCH 213/235] Prevent output register slice from operating in unnecessary ping-pong mode. 
--- finn-rtllib/mvu/mvu_vvu_axi.sv | 45 +++++++++++++++------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 3379577046..4f635bf78d 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -334,40 +334,35 @@ module mvu_vvu_axi #( end : blkDsp //-------------------- Output register slice --------------------\\ - struct packed { + // Make `en`computation independent from external inputs. + // Drive all outputs from registers. + typedef struct packed { logic vld; logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } A = '{ vld: 0, default: 'x}; + } buf_t; + buf_t A = '{ vld: 0, default: 'x }; // side-step register used when encountering backpressure + buf_t B = '{ vld: 0, default: 'x }; // ultimate output register - assign en = !A.vld || !ovld; + assign en = !A.vld || !ovld; + uwire b_load = !B.vld || m_axis_output_tready; - uwire b_load; always_ff @(posedge clk) begin - if(rst) A <= '{ vld: 0, default: 'x }; - else if(!A.vld || b_load) begin - A.vld <= ovld && en; - for(int unsigned i = 0; i < PE; i++) begin - // CR-1148862: - // A.dat[i] <= odat[i]; - automatic logic [ACCU_WIDTH-1:0] v = odat[i]; - A.dat[i] <= v[ACCU_WIDTH-1:0]; - end + if(rst) begin + A <= '{ vld: 0, default: 'x }; + B <= '{ vld: 0, default: 'x }; end - end - - struct packed { - logic vld; - logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } B = '{ vld: 0, default: 'x}; - - assign b_load = !B.vld || m_axis_output_tready; - always_ff @(posedge clk) begin - if(rst) B <= '{ vld: 0, default: 'x }; else begin - if(b_load) B <= '{ vld: A.vld, dat: A.dat}; + if(!A.vld) A.dat <= odat; + A.vld <= (ovld || A.vld) && !b_load; + + if(b_load) begin + B <= '{ + vld: A.vld || ovld, + dat: A.vld? 
A.dat : odat + }; + end end end - assign m_axis_output_tvalid = B.vld; assign m_axis_output_tdata = B.dat; From 60f483a3f8b753c310c1831be73f31f72db301d6 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 7 Dec 2023 15:20:26 +0000 Subject: [PATCH 214/235] [mvu vvu axi]: verilator BLKLOOPINIT-error workaround --- finn-rtllib/mvu/mvu_vvu_axi.sv | 55 ++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 4f635bf78d..114223052a 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -242,8 +242,6 @@ module mvu_vvu_axi #( En <= 0; Last <= '{ default: 1'b0 }; Zero <= 1; - W <= '{ default: 'x }; - A <= '{ default: 'x }; end else begin if(Active) begin @@ -251,23 +249,56 @@ module mvu_vvu_axi #( if(en) begin Last <= '{ alast && avld, 1'b0 }; Zero <= !istb; - for(int unsigned simd = 0; simd < EFFECTIVE_SIMD; simd++) begin - for(int unsigned pe = 0; pe < PE; pe++) begin - W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= (simd==EFFECTIVE_SIMD-1 && SIMD_UNEVEN) ? '0 : mvu_w[pe][simd]; - end - for(int unsigned pe = 0; pe < ACT_PE; pe++) begin - A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= (simd==EFFECTIVE_SIMD-1 && SIMD_UNEVEN) ? '0 : amvau_i[pe][simd]; - end - end end end else if(En) begin Last <= '{ 'x, Last[1] }; - W <= '{ 'x, W[1] }; - A <= '{ 'x, A[1] }; end end end + + for(genvar simd = 0; simd < EFFECTIVE_SIMD; simd++) begin : genSIMDRegW + for(genvar pe = 0; pe < PE; pe++) begin : genPERegW + always_ff @(posedge clk2x) begin + if(rst) begin + W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= '{ default: 'x }; + end + else begin + if(Active) begin + if(en) begin + W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= (simd==EFFECTIVE_SIMD-1 && SIMD_UNEVEN) ? 
'0 : mvu_w[pe][simd]; + end + end + else if(En) begin + W[1][pe][simd % DSP_SIMD] <= 'x; + W[0][pe][simd % DSP_SIMD] <= W[1][pe][simd % DSP_SIMD]; + end + end + end + end : genPERegW + end : genSIMDRegW + + for(genvar simd = 0; simd < EFFECTIVE_SIMD; simd++) begin : genSIMDRegA + for(genvar pe = 0; pe < ACT_PE; pe++) begin : genPERegA + always_ff @(posedge clk2x) begin + if(rst) begin + A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= '{ default: 'x }; + end + else begin + if(Active) begin + if(en) begin + A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= (simd==EFFECTIVE_SIMD-1 && SIMD_UNEVEN) ? '0 : amvau_i[pe][simd]; + end + end + else if(En) begin + A[1][pe][simd % DSP_SIMD] <= 'x; + A[0][pe][simd % DSP_SIMD] <= A[1][pe][simd % DSP_SIMD]; + end + end + end + end : genPERegA + end : genSIMDRegA + assign dsp_en = En; assign dsp_last = Last[0]; From 23fb64f4c4e82af45d8ffac8dee1415fbbe44d25 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 8 Dec 2023 17:12:42 +0000 Subject: [PATCH 215/235] [mvu vvu axi]: sign extend output tdata (byte-aligned) --- finn-rtllib/mvu/mvu_vvu_axi.sv | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 114223052a..b285be076f 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -67,7 +67,8 @@ module mvu_vvu_axi #( localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7)/8 * 8, localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH, localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7)/8 * 8, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8, + localparam int unsigned OUTPUT_STREAM_WIDTH = PE*ACCU_WIDTH, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7)/8 * 8, localparam bit SIMD_UNEVEN = SIMD % 2 )( // Global Control @@ -395,6 +396,6 @@ module mvu_vvu_axi #( end end assign m_axis_output_tvalid = B.vld; - assign m_axis_output_tdata = B.dat; + assign m_axis_output_tdata = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat}; endmodule : mvu_vvu_axi From fdca45b50d0334dcbc888c989ba348fcfe67f1fa Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 11 Dec 2023 16:19:00 +0000 Subject: [PATCH 216/235] [mvu-rtl]: default seglen to 1 for now --- .../custom_op/fpgadataflow/matrixvectoractivation_rtl.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index a66c6f4b2f..968c880980 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -980,7 +980,8 @@ def _resolve_segment_len(self, clk): critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1) max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3) dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len - return dsp_chain_len + #return dsp_chain_len + return 1 def _resolve_impl_style(self, fpgapart): # Based on target device and activation/weight-width, choose the @@ -994,7 +995,7 @@ def _resolve_impl_style(self, fpgapart): fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpgapart[0:5] == "xqrvc" ) - if act_width == 4 and weight_width == 4: + if (act_width == 4 and weight_width == 4) and not(is_versal): return "mvu_4sx4u" else: 
if is_versal: From 45074d964b6405a41d71ed7e16fac5e5ef9b1269 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 11 Dec 2023 16:20:02 +0000 Subject: [PATCH 217/235] update test config --- .../test_fpgadataflow_mvau_rtl.py | 65 ++++++++++++++----- 1 file changed, 50 insertions(+), 15 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py index 3db7a718f5..ebcc87102d 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py @@ -28,6 +28,7 @@ import pytest import os +import pickle import numpy as np from onnx import TensorProto, helper @@ -46,7 +47,8 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from qonnx.transformation.general import ApplyConfig import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl -#import qonnx.core.data_layout as DataLayout +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from qonnx.custom_op.registry import getCustomOp build_dir = os.environ["FINN_BUILD_DIR"] @@ -83,22 +85,28 @@ def make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt): return model def prepare_inputs(input_tensor): - return {"inp": input_tensor} - -@pytest.mark.parametrize("mh", [16]) -@pytest.mark.parametrize("mw", [32]) -@pytest.mark.parametrize("pe", [1, 4, 16]) -#@pytest.mark.parametrize("simd", [1, 30, 90]) -@pytest.mark.parametrize("simd", [1, 4, 32]) -@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) -@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]]) + return {"ifm": input_tensor} + +@pytest.mark.parametrize("mh", [31]) +@pytest.mark.parametrize("mw", [279]) +#@pytest.mark.parametrize("pe", [1,2,4,8]) +@pytest.mark.parametrize("pe", [31]) +#@pytest.mark.parametrize("simd", [1,3,6,9,18,36]) +@pytest.mark.parametrize("simd", [9]) +#@pytest.mark.parametrize("idt", [DataType["UINT4"], 
DataType["UINT8"]]) +@pytest.mark.parametrize("idt", [DataType["UINT8"]]) +#@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]]) +@pytest.mark.parametrize("wdt", [DataType["INT8"]]) #@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"]) -@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"]) +#@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"]) +@pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S"]) @pytest.mark.parametrize("segmentlen", [1]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen): + # Synthesis constants + clk_ns = 5 # Create test input vector (produced by SWG) ofm_shape = (5, 5) ofm_h, ofm_w = ofm_shape @@ -125,6 +133,9 @@ def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen): ## Execute ONNX model output_matmul = oxe.execute_onnx(model, input_dict) + with open(build_dir + "/onnx_output.pkl", "wb") as f: + pickle.dump(output_matmul, f) + # Create MVAU (HLS) model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")) model = model.transform(GiveUniqueNodeNames()) @@ -138,30 +149,54 @@ def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen): "mem_mode" : "decoupled", "ram_style" : "auto", "resType" : "dsp", - "impl" : "rtl" + "preferred_backend" : "rtl" } } model = model.transform(ApplyConfig(folding_config)) model.save(build_dir+"/mvau_hls.onnx") model = model.transform(SetExecMode("rtlsim")) - model = model.transform(PrepareIP(part, 5)) + model = model.transform(PrepareIP(part, clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) + for n in model.graph.node: + getCustomOp(n).set_nodeattr("rtlsim_trace", "mvu_trace_hls.vcd") output_mvau_hls = oxe.execute_onnx(model, input_dict)["ofm"] + # Apply convert-to-rtl step model = model.transform(to_rtl.InferRTLMatrixVectorActivation()) 
model = model.transform(GiveUniqueNodeNames()) + for n in model.graph.node: + if n.op_type=="MatrixVectorActivation_rtl": + getCustomOp(n).set_nodeattr("pumpedCompute", 0) model.save(build_dir+"/mvau_rtl.onnx") + # Reset rtlsim_so and ip-related paths such that new Pyverilator SO and IP is generated + for n in model.graph.node: + getCustomOp(n).set_nodeattr("rtlsim_so", "") + getCustomOp(n).set_nodeattr("code_gen_dir_ipgen", "") + getCustomOp(n).set_nodeattr("ipgen_path", "") + getCustomOp(n).set_nodeattr("ip_path", "") + getCustomOp(n).set_nodeattr("rtlsim_trace", "mvu_trace_rtl.vcd") model = model.transform(SetExecMode("rtlsim")) - model = model.transform(PrepareIP("xcvm1802-vsvd1760-2MP-e-S", 5)) + model = model.transform(PrepareIP(part, clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) output_mvau_rtl = oxe.execute_onnx(model, input_dict)["ofm"] model.save(build_dir+"/mvau_rtl_sim.onnx") + with open(build_dir + "/hls_output.pkl", "wb") as f: + pickle.dump(output_mvau_hls, f) + + with open(build_dir + "/rtl_output.pkl", "wb") as f: + pickle.dump(output_mvau_rtl, f) + + # model = model.transform(PrepareIP(part, clk_ns)) + # model = model.transform(HLSSynthIP()) + # model = model.transform(CreateStitchedIP(fpgapart=part, clk_ns=clk_ns, vitis=True)) + # model.save(build_dir+"/stitched_ip.onnx") + assert (output_mvau_hls == output_mvau_rtl).all() - assert (output_mvau_hls.size > 0) + # assert (output_mvau_hls.size > 0) From 0ed36812a077cba17f5b8c6503540773a5ff6756 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 12 Dec 2023 11:27:00 +0000 Subject: [PATCH 218/235] updated test config --- .../test_fpgadataflow_mvau_rtl.py | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py index ebcc87102d..5091581d75 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py +++ 
b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py @@ -87,12 +87,13 @@ def make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt): def prepare_inputs(input_tensor): return {"ifm": input_tensor} -@pytest.mark.parametrize("mh", [31]) -@pytest.mark.parametrize("mw", [279]) -#@pytest.mark.parametrize("pe", [1,2,4,8]) -@pytest.mark.parametrize("pe", [31]) -#@pytest.mark.parametrize("simd", [1,3,6,9,18,36]) -@pytest.mark.parametrize("simd", [9]) +@pytest.mark.parametrize("mh", [4]) +# @pytest.mark.parametrize("mw", [36]) +@pytest.mark.parametrize("mw", [18]) +# @pytest.mark.parametrize("pe", [1,2,4,8]) +@pytest.mark.parametrize("pe", [2]) +# @pytest.mark.parametrize("simd", [1,3,6,9,18,36]) +@pytest.mark.parametrize("simd", [6]) #@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) @pytest.mark.parametrize("idt", [DataType["UINT8"]]) #@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]]) @@ -121,6 +122,9 @@ def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen): [mw, mh] ) W = gen_finn_dt_tensor(wdt, (mw, mh)) + # np.save("weights.npy", W) + ## + W = np.load("weights.npy") model = make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt) model = model.transform(GiveUniqueNodeNames()) @@ -128,6 +132,9 @@ def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen): # Create MatMul & obtain golden reference output A = gen_finn_dt_tensor(model.get_tensor_datatype("ifm"), model.get_tensor_shape("ifm")) + # np.save("activations.npy", A) + ## + # A = np.load("activations.npy") input_dict = prepare_inputs(A) ## Execute ONNX model @@ -198,5 +205,6 @@ def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen): # model = model.transform(CreateStitchedIP(fpgapart=part, clk_ns=clk_ns, vitis=True)) # model.save(build_dir+"/stitched_ip.onnx") - assert (output_mvau_hls == output_mvau_rtl).all() + #assert (output_mvau_hls == output_mvau_rtl).all() + assert 
(output_matmul['ofm'] == output_mvau_rtl).all() # assert (output_mvau_hls.size > 0) From c39642510b8c55cd2173999c938947d7162371c4 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 13 Dec 2023 09:36:25 +0000 Subject: [PATCH 219/235] [rtlsim]: use pyverilator util functions --- src/finn/custom_op/fpgadataflow/hlscustomop.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index c0b9f0735f..75c9240aeb 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -31,7 +31,7 @@ import subprocess import warnings from abc import abstractmethod -from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io +from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io, toggle_clk from qonnx.core.datatype import DataType from qonnx.custom_op.base import CustomOp from qonnx.util.basic import roundup_to_integer_multiple @@ -492,15 +492,11 @@ def exec_precompiled_singlenode_model(self): def reset_rtlsim(self, sim): """Sets reset input in pyverilator to zero, toggles the clock and set it back to one""" - sim.io.ap_rst_n = 0 - sim.io.ap_clk = 1 - sim.io.ap_clk = 0 - sim.io.ap_rst_n = 1 + reset_rtlsim(sim) def toggle_clk(self, sim): """Toggles the clock input in pyverilator once.""" - sim.io.ap_clk = 1 - sim.io.ap_clk = 0 + toggle_clk(sim) def hls_sname(self): """Get the naming convention used by Vitis HLS for stream signals From 538852d4f42afc5ef4a4be6bc19567034d081727 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 13 Dec 2023 11:44:00 +0000 Subject: [PATCH 220/235] [mvu vvu axi]: fix multiple driver error --- finn-rtllib/mvu/mvu_vvu_axi.sv | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index b285be076f..3fb8fd2455 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ 
b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -271,8 +271,7 @@ module mvu_vvu_axi #( end end else if(En) begin - W[1][pe][simd % DSP_SIMD] <= 'x; - W[0][pe][simd % DSP_SIMD] <= W[1][pe][simd % DSP_SIMD]; + W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= simd / DSP_SIMD == 1 ? 'x : W[1][pe][simd % DSP_SIMD]; end end end @@ -292,8 +291,7 @@ module mvu_vvu_axi #( end end else if(En) begin - A[1][pe][simd % DSP_SIMD] <= 'x; - A[0][pe][simd % DSP_SIMD] <= A[1][pe][simd % DSP_SIMD]; + A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= simd / DSP_SIMD == 1 ? 'x : A[1][pe][simd % DSP_SIMD]; end end end From 7e5306c6439bca6f7a1d1b209709f48e38d47f77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Mon, 18 Dec 2023 09:02:54 +0000 Subject: [PATCH 221/235] Mitigate hold time issues on feed from fast clock net. --- finn-rtllib/mvu/mvu_vvu_axi.sv | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 3fb8fd2455..813ffb69d7 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -226,7 +226,12 @@ module mvu_vvu_axi #( // Identify second fast cycle before active slow clock edge logic Active = 0; - always_ff @(posedge clk2x) Active <= clk; + if(1) begin : blkActive + uwire clk_lut[2]; // Put some LUT delay on the input from the fast clock net + (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut(.O(clk_lut[0]), .I0(clk)); + (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut(.O(clk_lut[1]), .I0(clk_lut[0])); + always_ff @(posedge clk2x) Active <= clk_lut[1]; + end : blkActive // The input for a slow cycle is split across two fast cycles along the SIMD dimension. // - Both fast cycles are controlled by the same enable state. 
From 256931fcf92ff629fd267e0d40efa93a480d4811 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 18 Dec 2023 11:49:24 +0000 Subject: [PATCH 222/235] toggle P and Vld only when no backpressure is applied --- finn-rtllib/mvu/mvu_vvu_axi.sv | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 813ffb69d7..31d40b7cba 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -322,8 +322,10 @@ module mvu_vvu_axi #( P <= 'x; end else begin - if(dsp_vld) P <= dsp_p; - Vld <= dsp_vld || (Vld && Active); + if (dsp_en) begin + if(dsp_vld) P <= dsp_p; + Vld <= dsp_vld || (Vld && Active); + end end end assign ovld = Vld; From 020c4e09ac3e5ba292a3fa43998b842b49909bf3 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 18 Dec 2023 11:56:57 +0000 Subject: [PATCH 223/235] change naming --- finn-rtllib/mvu/mvu_vvu_axi.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 31d40b7cba..8a593713a3 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -229,7 +229,7 @@ module mvu_vvu_axi #( if(1) begin : blkActive uwire clk_lut[2]; // Put some LUT delay on the input from the fast clock net (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut(.O(clk_lut[0]), .I0(clk)); - (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut(.O(clk_lut[1]), .I0(clk_lut[0])); + (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut2(.O(clk_lut[1]), .I0(clk_lut[0])); always_ff @(posedge clk2x) Active <= clk_lut[1]; end : blkActive From 7e12ae4c0902882aa436c1b7b3b82dbdcc5f8dac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Tue, 19 Dec 2023 09:12:48 +0000 Subject: [PATCH 224/235] Reworking pumped DSP integration with simplified enable computation. 
--- finn-rtllib/mvu/mvu_vvu_axi.sv | 141 ++++++++++++---------------- finn-rtllib/mvu/tb/mvu_dsp58_tb.sv | 142 +++++++++++++++++++++++++++++ 2 files changed, 201 insertions(+), 82 deletions(-) create mode 100644 finn-rtllib/mvu/tb/mvu_dsp58_tb.sv diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 8a593713a3..d40c5e1b10 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -224,12 +224,12 @@ module mvu_vvu_axi #( else begin : genPumpedCompute assign dsp_clk = clk2x; - // Identify second fast cycle before active slow clock edge + // Identify second fast cycle just before active slow clock edge logic Active = 0; if(1) begin : blkActive uwire clk_lut[2]; // Put some LUT delay on the input from the fast clock net - (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut(.O(clk_lut[0]), .I0(clk)); - (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut2(.O(clk_lut[1]), .I0(clk_lut[0])); + (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut0(.O(clk_lut[0]), .I0(clk)); + (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut1(.O(clk_lut[1]), .I0(clk_lut[0])); always_ff @(posedge clk2x) Active <= clk_lut[1]; end : blkActive @@ -237,78 +237,53 @@ module mvu_vvu_axi #( // - Both fast cycles are controlled by the same enable state. // - A zero cycle is duplicated across both fast cycles. // - The last flag must be restricted to the second fast cycle. - logic En = 0; - logic Last[1:0] = '{ default: 1'b0 }; - logic Zero = 1; - dsp_w_t W[1:0] = '{ default: 'x }; - dsp_a_t A[1:0] = '{ default: 'x }; + dsp_w_t W = 'x; + for(genvar pe = 0; pe < PE; pe++) begin : genPERegW + + uwire [2*DSP_SIMD-1:0][WEIGHT_WIDTH-1:0] w; + for(genvar i = 0; i < SIMD; i++) assign w[i] = mvu_w[pe][i]; + for(genvar i = SIMD; i < 2*DSP_SIMD; i++) assign w[i] = 0; + + always_ff @(posedge clk2x) begin + if(rst) W[pe] <= 'x; + else if(en) W[pe] <= w[(Active? 
DSP_SIMD : 0) +: DSP_SIMD]; + end + + end : genPERegW + + dsp_a_t A = 'x; + for(genvar pe = 0; pe < ACT_PE; pe++) begin : genPERegA + + uwire [2*DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0] a; + for(genvar i = 0; i < SIMD; i++) assign a[i] = amvau_i[pe][i]; + for(genvar i = SIMD; i < 2*DSP_SIMD; i++) assign a[i] = 0; + + always_ff @(posedge clk2x) begin + if(rst) A[pe] <= 'x; + else if(en) A[pe] <= a[(Active? DSP_SIMD : 0) +: DSP_SIMD]; + end + + end : genPERegA + + logic Zero = 1; + logic Last = 0; always_ff @(posedge clk2x) begin if(rst) begin - En <= 0; - Last <= '{ default: 1'b0 }; - Zero <= 1; + Zero <= 1; + Last <= 0; end - else begin - if(Active) begin - En <= en; - if(en) begin - Last <= '{ alast && avld, 1'b0 }; - Zero <= !istb; - end - end - else if(En) begin - Last <= '{ 'x, Last[1] }; - end + else if(en) begin + Zero <= !istb; + Last <= alast && avld && Active; end end - for(genvar simd = 0; simd < EFFECTIVE_SIMD; simd++) begin : genSIMDRegW - for(genvar pe = 0; pe < PE; pe++) begin : genPERegW - always_ff @(posedge clk2x) begin - if(rst) begin - W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= '{ default: 'x }; - end - else begin - if(Active) begin - if(en) begin - W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= (simd==EFFECTIVE_SIMD-1 && SIMD_UNEVEN) ? '0 : mvu_w[pe][simd]; - end - end - else if(En) begin - W[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= simd / DSP_SIMD == 1 ? 'x : W[1][pe][simd % DSP_SIMD]; - end - end - end - end : genPERegW - end : genSIMDRegW - - for(genvar simd = 0; simd < EFFECTIVE_SIMD; simd++) begin : genSIMDRegA - for(genvar pe = 0; pe < ACT_PE; pe++) begin : genPERegA - always_ff @(posedge clk2x) begin - if(rst) begin - A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= '{ default: 'x }; - end - else begin - if(Active) begin - if(en) begin - A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= (simd==EFFECTIVE_SIMD-1 && SIMD_UNEVEN) ? '0 : amvau_i[pe][simd]; - end - end - else if(En) begin - A[simd / DSP_SIMD][pe][simd % DSP_SIMD] <= simd / DSP_SIMD == 1 ? 
'x : A[1][pe][simd % DSP_SIMD]; - end - end - end - end : genPERegA - end : genSIMDRegA - - assign dsp_en = En; - - assign dsp_last = Last[0]; + assign dsp_en = en; + assign dsp_last = Last; assign dsp_zero = Zero; - assign dsp_w = W[0]; - assign dsp_a = A[0]; + assign dsp_w = W; + assign dsp_a = A; // Since no two consecutive last cycles will ever be asserted on the input, // valid outputs will also always be spaced by, at least, one other cycle. @@ -321,11 +296,9 @@ module mvu_vvu_axi #( Vld <= 0; P <= 'x; end - else begin - if (dsp_en) begin - if(dsp_vld) P <= dsp_p; - Vld <= dsp_vld || (Vld && Active); - end + else if(en) begin + if(dsp_vld) P <= dsp_p; + Vld <= dsp_vld || (Vld && !Active); end end assign ovld = Vld; @@ -373,34 +346,38 @@ module mvu_vvu_axi #( //-------------------- Output register slice --------------------\\ // Make `en`computation independent from external inputs. // Drive all outputs from registers. - typedef struct packed { + struct packed { + logic rdy; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } A = '{ rdy: 1, default: 'x }; // side-step register used when encountering backpressure + struct packed { logic vld; logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } buf_t; - buf_t A = '{ vld: 0, default: 'x }; // side-step register used when encountering backpressure - buf_t B = '{ vld: 0, default: 'x }; // ultimate output register + } B = '{ vld: 0, default: 'x }; // ultimate output register - assign en = !A.vld || !ovld; + assign en = A.rdy; uwire b_load = !B.vld || m_axis_output_tready; always_ff @(posedge clk) begin if(rst) begin - A <= '{ vld: 0, default: 'x }; + A <= '{ rdy: 1, default: 'x }; B <= '{ vld: 0, default: 'x }; end else begin - if(!A.vld) A.dat <= odat; - A.vld <= (ovld || A.vld) && !b_load; + if(A.rdy) A.dat <= odat; + A.rdy <= (A.rdy && !ovld) || b_load; if(b_load) begin B <= '{ - vld: A.vld || ovld, - dat: A.vld? A.dat : odat + vld: ovld || !A.rdy, + dat: A.rdy? 
odat : A.dat }; end end end assign m_axis_output_tvalid = B.vld; + // Why would we need a sign extension here potentially creating a higher signal load into the next FIFO? + // These extra bits should never be used. Why not 'x them out? assign m_axis_output_tdata = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat}; endmodule : mvu_vvu_axi diff --git a/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv new file mode 100644 index 0000000000..108980c497 --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv @@ -0,0 +1,142 @@ +module mvu_dsp58_tb; + + localparam int unsigned N = 1000; + + localparam int unsigned MW = 12; + localparam int unsigned MH = 4; + localparam int unsigned PE = 2; + localparam int unsigned SIMD = 6; + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 8; + localparam int unsigned ACCU_WIDTH = 24; + + //- Global Control ------------------ + logic clk = 1; + logic clk2x = 1; + always #5ns clk = !clk; + always #2.5ns clk2x = !clk2x; + + logic rst = 1; + initial begin + repeat(8) @(posedge clk); + rst <= 0; + end + + //- DUTs ---------------------------- + + // Weight Stream + logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] s_axis_weights_tdata; + logic s_axis_weights_tvalid[2]; + uwire s_axis_weights_tready[2]; + + // Input Stream + logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] s_axis_input_tdata; + logic s_axis_input_tvalid[2]; + uwire s_axis_input_tready[2]; + + // Output Stream + uwire [PE-1:0][ACCU_WIDTH-1:0] m_axis_output_tdata[2]; + uwire m_axis_output_tvalid[2]; + logic m_axis_output_tready[2]; + + for(genvar i = 0; i < 2; i++) begin : genDUTs + mvu_vvu_axi #( + .IS_MVU(1), + .COMPUTE_CORE("mvu_vvu_8sx9_dsp58"), + .MW(MW), .MH(MH), + .PE(PE), .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .PUMPED_COMPUTE(i) + ) dut ( + .ap_clk(clk), .ap_clk2x(clk2x), .ap_rst_n(!rst), + .s_axis_weights_tdata, 
.s_axis_weights_tvalid(s_axis_weights_tvalid[i]), .s_axis_weights_tready(s_axis_weights_tready[i]), + .s_axis_input_tdata, .s_axis_input_tvalid (s_axis_input_tvalid [i]), .s_axis_input_tready (s_axis_input_tready [i]), + .m_axis_output_tdata(m_axis_output_tdata[i]), .m_axis_output_tvalid (m_axis_output_tvalid [i]), .m_axis_output_tready (m_axis_output_tready [i]) + ); + end : genDUTs + + + //- Stimuli ------------------------- + + // Weight Feed + initial begin + s_axis_weights_tvalid = '{ default: 0 }; + s_axis_weights_tdata = 'x; + @(posedge clk iff !rst); + + repeat(N * (MH/PE)*(MW/SIMD)) begin + automatic type(s_axis_weights_tdata) weights; + std::randomize(weights); + s_axis_weights_tdata <= weights; + s_axis_weights_tvalid <= '{ default: 1 }; + fork + begin + @(posedge clk iff s_axis_weights_tready[0]); + s_axis_weights_tvalid[0] <= 0; + end + begin + @(posedge clk iff s_axis_weights_tready[1]); + s_axis_weights_tvalid[1] <= 0; + end + join + end + end + + // Input Feed + initial begin + s_axis_input_tvalid = '{ default: 0 }; + s_axis_input_tdata = 'x; + @(posedge clk iff !rst); + + repeat(N * (MW/SIMD)) begin + automatic type(s_axis_input_tdata) in; + std::randomize(in); + s_axis_input_tdata <= in; + s_axis_input_tvalid <= '{ default: 1 }; + fork + begin + @(posedge clk iff s_axis_input_tready[0]); + s_axis_input_tvalid[0] <= 0; + end + begin + @(posedge clk iff s_axis_input_tready[1]); + s_axis_input_tvalid[1] <= 0; + end + join + end + end + + // Output Capture and Comparison + initial begin + m_axis_output_tready = '{ default: 0 }; + @(posedge clk iff !rst); + + repeat(N * (MH/PE)) begin + automatic type(m_axis_output_tdata) res; + m_axis_output_tready <= '{ default: 1 }; + fork + begin + @(posedge clk iff m_axis_output_tvalid[0]); + m_axis_output_tready[0] <= 0; + res[0] = m_axis_output_tdata[0]; + end + begin + @(posedge clk iff m_axis_output_tvalid[1]); + m_axis_output_tready[1] <= 0; + res[1] = m_axis_output_tdata[1]; + end + join + assert(res[0] == 
res[1]) else begin + $error("Output mismatch: %0x <=> %0x", res[0], res[1]); + $stop; + end + while($urandom()%7 < MW/SIMD) @(posedge clk); // Occassional backpressure + end + + $display("Test completed."); + $finish; + end + +endmodule : mvu_dsp58_tb From 6e98bac42f225e7ed8629e0cb67211e78db61d15 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 13 Dec 2023 09:36:25 +0000 Subject: [PATCH 225/235] [rtlsim]: use pyverilator util functions --- src/finn/custom_op/fpgadataflow/hlscustomop.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index 4fed8ed4b5..01b94c20ca 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -31,7 +31,7 @@ import subprocess import warnings from abc import abstractmethod -from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io +from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io, toggle_clk from qonnx.core.datatype import DataType from qonnx.custom_op.base import CustomOp from qonnx.util.basic import roundup_to_integer_multiple @@ -491,15 +491,11 @@ def exec_precompiled_singlenode_model(self): def reset_rtlsim(self, sim): """Sets reset input in pyverilator to zero, toggles the clock and set it back to one""" - sim.io.ap_rst_n = 0 - sim.io.ap_clk = 1 - sim.io.ap_clk = 0 - sim.io.ap_rst_n = 1 + reset_rtlsim(sim) def toggle_clk(self, sim): """Toggles the clock input in pyverilator once.""" - sim.io.ap_clk = 1 - sim.io.ap_clk = 0 + toggle_clk(sim) def hls_sname(self): """Get the naming convention used by Vitis HLS for stream signals From 5dd74ad1dede3bf2a0405de8c803a4adfb2e65d3 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 8 Dec 2023 17:12:42 +0000 Subject: [PATCH 226/235] [mvu vvu axi]: sign extend output tdata (byte-aligned) --- finn-rtllib/mvu/mvu_vvu_axi.sv | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 8eb92a93e6..699662bd72 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -243,6 +243,6 @@ module mvu_vvu_axi #( end assign m_axis_output_tvalid = B.vld; - assign m_axis_output_tdata = B.dat; + assign m_axis_output_tdata = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat}; endmodule : mvu_vvu_axi From b20410bfd968c27395537b60bba11849b599a33a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 Jan 2024 14:55:56 +0000 Subject: [PATCH 227/235] [mvu core]: dsp48 convert unpacked array to packed array to work around limitation on max array indices in Verilator --- finn-rtllib/mvu/mvu_4sx4u.sv | 4 ++-- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 706347d700..7a2af35742 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -309,7 +309,7 @@ module mvu_4sx4u #( // Conclusive high part accumulation if(i >= PE_REM && i < 3) begin : genHi // Adder Tree across all SIMD high contributions, each from [-1:1] - uwire signed [$clog2(1+SIMD):0] tree[2*SIMD-1]; + uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node @@ -333,7 +333,7 @@ module mvu_4sx4u #( if(i >= PE_REM) begin : blkLo // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); - uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; + uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node diff --git 
a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 07c44cf89a..1e6855f779 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -335,7 +335,7 @@ module mvu_8sx8u_dsp48 #( if(i >= PE_REM) begin : blkLo // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); - uwire [ROOT_WIDTH-1:0] tree[2*SIMD-1]; + uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node From 1c2cc0c2c1d98d7cde569f65eb20873a10e1f12f Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 Jan 2024 14:57:19 +0000 Subject: [PATCH 228/235] [mvu axi]: update list of deduced parameters --- finn-rtllib/mvu/mvu_vvu_axi.sv | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 699662bd72..dd357c94bb 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -60,13 +60,14 @@ module mvu_vvu_axi #( bit M_REG_LUT = 1, // Safely deducible parameters - localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH_BA = ((IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, - localparam int unsigned WEIGHT_STREAM_WIDTH = PE*SIMD*WEIGHT_WIDTH, - localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, - localparam int unsigned SF = MW/SIMD, - localparam int unsigned NF = IS_MVU ? MH/PE : 1, - localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 + localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7) / 8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7) / 8 * 8, + localparam int unsigned OUTPUT_STREAM_WIDTH = PE * ACCU_WIDTH, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7) / 8 * 8, + localparam int unsigned SF = MW / SIMD, + localparam int unsigned NF = IS_MVU ? MH / PE : 1 ) ( // Global Control From eeb3cea623865a13d8da78acb5a9c7fc621caf0e Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 Jan 2024 14:58:02 +0000 Subject: [PATCH 229/235] [mvu custom-op]: remove lut-based implementation and update compute core selection --- .../matrixvectoractivation_rtl.py | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py index da560d73fd..fcab06658c 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation_rtl.py @@ -191,7 +191,12 @@ def verify_node(self): if mem_mode not in ["decoupled", "external"]: info_messages.append( - "RTL-based MVAU supports only decoupled or external weights." + "RTL-based MVU only supports decoupled or external weights." 
+ ) + + if self.get_nodeattr("resType") == "lut": + info_message.append( + "RTL-based MVU only supports DSP-based implementation" ) return info_messages @@ -635,7 +640,6 @@ def execute_node(self, context, graph): mem_mode = self.get_nodeattr("mem_mode") node = self.onnx_node - # TODO ensure codegen dir exists if mode == "cppsim": raise Exception( "cppsim not possible for RTL MVAU, please set exec_mode to rtlsim" @@ -801,7 +805,6 @@ def code_generation_ipi(self): rtllib_dir + "mvu_4sx4u.sv", rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", rtllib_dir + "mvu_8sx8u_dsp48.sv", - rtllib_dir + "mvu_vvu_lut.sv", ] for f in sourcefiles: cmd.append("add_files -norecurse %s" % (f)) @@ -897,7 +900,6 @@ def code_generation_ipi(self): rtllib_dir + "mvu_4sx4u.sv", rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", rtllib_dir + "mvu_8sx8u_dsp48.sv", - rtllib_dir + "mvu_vvu_lut.sv", ] for f in sourcefiles: cmd.append("add_files -norecurse %s" % (f)) @@ -964,8 +966,8 @@ def derive_characteristic_fxns(self, period): def _resolve_segment_len(self, clk): # Insert pipeline registers in the DSP58 chain to meet target clock frequency - # 0.741 ns seems the worst-case delay through first DSP - # 0.605 ns seems to be (on average) delay for all subsequent DSPs + # ~0.741 ns seems the worst-case delay through first DSP + # ~0.605 ns seems to be (on average) delay for all subsequent DSPs # clk >= (critical_path_dsps - 1) * 0.605 + 0.741 assert (clk > 0.741), "Infeasible clk target of {} ns has been set, consider lowering the targeted clock frequency!".format(clk) critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1) @@ -976,22 +978,23 @@ def _resolve_segment_len(self, clk): def _resolve_impl_style(self, fpgapart): # Based on target device and activation/weight-width, choose the # supported RTL compute core - if self.get_nodeattr("resType") == "lut": - return "mvu_vvu_lut" + + assert self.get_nodeattr("resType") != "lut", "LUT-based RTL-MVU implementation currently not supported! 
Please change resType for {}".format(self.onnx_node.name) + + act_width = self.get_input_datatype(0).bitwidth() + weight_width = self.get_input_datatype(1).bitwidth() + is_versal = ( + fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] + or fpgapart[0:5] == "xqrvc" + ) + + if is_versal: + return "mvu_vvu_8sx9_dsp58" else: - act_width = self.get_input_datatype(0).bitwidth() - weight_width = self.get_input_datatype(1).bitwidth() - is_versal = ( - fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] - or fpgapart[0:5] == "xqrvc" - ) if act_width == 4 and weight_width == 4: return "mvu_4sx4u" else: - if is_versal: - return "mvu_vvu_8sx9_dsp58" - else: - return "mvu_8sx8u_dsp48" + return "mvu_8sx8u_dsp48" def generate_hdl(self, model, fpgapart, clk): # Generate params as part of IP preparation From 0813d1463a219384b4666fad2db93a4f7dee1a0f Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 8 Jan 2024 14:59:30 +0000 Subject: [PATCH 230/235] [mvu axi]: remove LUT-based compute core --- finn-rtllib/mvu/mvu_vvu_axi.sv | 11 +--- finn-rtllib/mvu/mvu_vvu_lut.sv | 104 --------------------------------- 2 files changed, 2 insertions(+), 113 deletions(-) delete mode 100644 finn-rtllib/mvu/mvu_vvu_lut.sv diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index dd357c94bb..a3b051c9a1 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -120,8 +120,8 @@ module mvu_vvu_axi #( end end if (!IS_MVU) begin - if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin - $error("VVU only supported on DSP58 or LUT-based implementation"); + if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58") begin + $error("VVU only supported on DSP58"); $finish; end end @@ -195,13 +195,6 @@ module mvu_vvu_axi #( .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), .vld(ovld), .p(odat) ); - "mvu_vvu_lut": - mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), 
.ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) - ); default: initial begin $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); $finish; diff --git a/finn-rtllib/mvu/mvu_vvu_lut.sv b/finn-rtllib/mvu/mvu_vvu_lut.sv deleted file mode 100644 index c100910d75..0000000000 --- a/finn-rtllib/mvu/mvu_vvu_lut.sv +++ /dev/null @@ -1,104 +0,0 @@ -module mvu_vvu_lut #( - bit IS_MVU, - int unsigned PE, - int unsigned SIMD, - int unsigned ACCU_WIDTH, - int unsigned ACTIVATION_WIDTH, - int unsigned WEIGHT_WIDTH, - bit SIGNED_ACTIVATIONS, - bit M_REG = 1, - - localparam int unsigned MULT_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH, - localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD -)( - // Global Control - input logic clk, - input logic rst, - input logic en, - - // Input - input logic last, - input logic zero, // ignore current inputs and force this partial product to zero - input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] w, // signed weights - input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // (un)signed activations - - // Ouput - output logic vld, - output logic signed [PE-1:0][ACCU_WIDTH-1:0] p -); - - typedef int unsigned leave_load_t[2*SIMD-1]; - function leave_load_t init_leave_loads(); - automatic leave_load_t res; - for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; - for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; - return res; - endfunction : init_leave_loads - - // Pipeline for last indicator flag - uwire last_i; - generate if (M_REG) begin - logic [0:1] L = '0; - always_ff @(posedge clk) begin - if(rst) L <= '0; - else if (en) L <= {last, L[0]}; - end - assign last_i = L[1]; - end - else begin - logic L = '0; - always_ff @(posedge clk) begin - if(rst) L 
<= '0; - else if (en) L <= last; - end - assign last_i = L; - end - endgenerate - - // For each PE generate - for (genvar i = 0; i < PE; i++) begin : genPE - // Stage #1: SIMD multipliers in parallel - uwire [MULT_WIDTH-1 : 0] m1 [SIMD]; - for (genvar j = 0; j < SIMD; j++) begin : genSIMD - if (M_REG) begin : genMreg - logic [MULT_WIDTH-1 : 0] M [SIMD]; - always_ff @(posedge clk) begin - if(rst) M[j] = '{ default : 0 }; - else if (en) M[j] = zero ? 0 : - SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) : - $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); - // (SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) : a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) isn't valid -- leads to unsigned multiplication - end - assign m1[j] = M[j]; - end : genMreg - else begin : genNoMreg - assign m1[j] = zero ? 0 : - SIGNED_ACTIVATIONS ? $signed(a[(IS_MVU ? 0 : SIMD*i) + j]) * $signed(w[i][j]) : - $signed({1'b0, a[(IS_MVU ? 0 : SIMD*i) + j]}) * $signed(w[i][j]); - end : genNoMreg - end : genSIMD - - // Stage #2: Adder tree to reduce SIMD products - localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default : 1 }; - localparam int unsigned ROOT_WIDTH = $clog2(SIMD*(2**MULT_WIDTH-1)); - uwire signed [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = $signed(m1[s]); - for(genvar n = 0; n < SIMD-1; n++) begin - // Sum truncated to actual maximum bit width at this node - localparam int unsigned NODE_WIDTH = $clog2(LEAVE_LOAD[n]*(2**MULT_WIDTH-1)); - uwire signed [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; - assign tree[n] = s; - end - - // Stage #3: Buffer output - logic [ACCU_WIDTH-1:0] P2 [PE]; - always_ff @(posedge clk) begin - if(rst) P2[i] = '{ default : 0}; - else if (en) P2[i] = (last_i ? 
0 : $signed(P2[i])) + $signed(tree[0]); - end - - assign vld = last_i; - assign p[i] = P2[i]; - end : genPE - -endmodule : mvu_vvu_lut From 4892d6614b734a08315062b86ec6d5e1f1af0dc1 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 11 Jan 2024 12:02:38 +0000 Subject: [PATCH 231/235] [hls custom-op]: enable reset in sim --- src/finn/custom_op/fpgadataflow/hlscustomop.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index 01b94c20ca..bc59c69192 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -600,6 +600,7 @@ def rtlsim_multi_io(self, sim, io_dict): trace_file=trace_file, sname=sname, liveness_threshold=pyverilate_get_liveness_threshold_cycles(), + do_reset=True, ) self.set_nodeattr("cycles_rtlsim", total_cycle_count) From 44f6e0f3e70eea06408b94a31e555f0f6b9ea358 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 11 Jan 2024 12:21:00 +0000 Subject: [PATCH 232/235] [test mvu rtl]: updated test flow (DSP58 only) --- .../test_fpgadataflow_mvau_rtl.py | 167 +++++++++--------- 1 file changed, 87 insertions(+), 80 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py index 3db7a718f5..1e9de44fb2 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau_rtl.py @@ -27,141 +27,148 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import pytest -import os import numpy as np +import os +import pickle from onnx import TensorProto, helper -from qonnx.util.basic import ( - qonnx_make_model, - gen_finn_dt_tensor -) -from qonnx.core.modelwrapper import ModelWrapper from qonnx.core.datatype import DataType -from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import ApplyConfig, GiveUniqueNodeNames, GiveReadableTensorNames +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model + import finn.core.onnx_exec as oxe import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths + + from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from qonnx.transformation.general import ApplyConfig -import finn.transformation.fpgadataflow.specialize_to_rtl_layers as to_rtl -#import qonnx.core.data_layout as DataLayout +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode build_dir = os.environ["FINN_BUILD_DIR"] -def make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt): - (ofm_h, ofm_w) = ofm_shape - ofm = helper.make_tensor_value_info( - "ofm", - TensorProto.FLOAT, - (1, ofm_h, ofm_w, mh) - ) - - matmul_node = helper.make_node( - "MatMul", - ["ifm", "weights"], - ["ofm"] - ) - graph = helper.make_graph( - nodes=[matmul_node], - name="matmul_graph", - inputs=[ifm], - outputs=[ofm] - ) + 
+def make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W): + matmul_node = helper.make_node("MatMul", ["ifm", "weights"], ["ofm"]) + graph = helper.make_graph(nodes=[matmul_node], name="matmul_graph", inputs=[ifm], outputs=[ofm]) model = qonnx_make_model(graph, producer_name="fclayer-model") model = ModelWrapper(model) model.set_tensor_datatype("ifm", idt) model.set_tensor_datatype("weights", wdt) - model.set_tensor_datatype("ofm", DataType["INT32"]) # At this step, the MatMul layer does not optimize the bit-width of the output datatype + model.set_tensor_datatype( + "ofm", DataType["INT32"] + ) # At this step, the MatMul layer does not optimize the bit-width of the output datatype model.set_initializer("weights", W) - # model.set_tensor_layout("ifm", DataLayout.NHWC) return model + def prepare_inputs(input_tensor): - return {"inp": input_tensor} + return {"global_in": input_tensor} + -@pytest.mark.parametrize("mh", [16]) -@pytest.mark.parametrize("mw", [32]) -@pytest.mark.parametrize("pe", [1, 4, 16]) -#@pytest.mark.parametrize("simd", [1, 30, 90]) -@pytest.mark.parametrize("simd", [1, 4, 32]) +# @pytest.mark.parametrize("mh", [36]) +# @pytest.mark.parametrize("mw", [256]) +@pytest.mark.parametrize("mh", [9]) +@pytest.mark.parametrize("mw", [36]) +# @pytest.mark.parametrize("pe", [1, 4, 9, 36]) +# @pytest.mark.parametrize("simd", [1, 4, 16, 64, 256]) +@pytest.mark.parametrize("pe", [1, 3, 9]) +@pytest.mark.parametrize("simd", [1, 3, 6, 18, 36]) @pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) -@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT6"]]) -#@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S", "xcku3p-ffva676-1-e"]) -@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"]) -@pytest.mark.parametrize("segmentlen", [1]) +@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT8"]]) +# @pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e"]) 
+@pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S"]) +@pytest.mark.parametrize("clk_ns", [1.66, 4]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_mvau_rtl(mh, mw, pe, simd, idt, wdt, part, segmentlen): +def test_fpgadataflow_mvau_rtl( + mh, mw, pe, simd, idt, wdt, part, clk_ns +): + if part == "xcku3p-ffva676-1-e" and clk_ns != 1.66: + pytest.skip("Skip test for varying clk for devices other than Versal, since this variable doesn't change anything for this test") + # Create test input vector (produced by SWG) ofm_shape = (5, 5) ofm_h, ofm_w = ofm_shape - ifm = helper.make_tensor_value_info( - "ifm", - TensorProto.FLOAT, - [1, ofm_h, ofm_w, mw] - ) - weights = helper.make_tensor_value_info( - "weights", - TensorProto.FLOAT, - [mw, mh] - ) + ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw]) + ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh)) W = gen_finn_dt_tensor(wdt, (mw, mh)) - model = make_single_matmul_modelwrapper(W, ofm_shape, mh, ifm, weights, idt, wdt) + model = make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W) model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) - model.save(build_dir+"/matmul.onnx") + model.save(build_dir + "/matmul.onnx") # Create MatMul & obtain golden reference output - A = gen_finn_dt_tensor(model.get_tensor_datatype("ifm"), model.get_tensor_shape("ifm")) + A = gen_finn_dt_tensor(model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in")) input_dict = prepare_inputs(A) - ## Execute ONNX model - output_matmul = oxe.execute_onnx(model, input_dict) + # Execute ONNX model + output_matmul = oxe.execute_onnx(model, input_dict)["global_out"] + + with open(build_dir + "/onnx_output.pkl", "wb") as f: + pickle.dump(output_matmul, f) # Create MVAU (HLS) model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")) model = 
model.transform(GiveUniqueNodeNames()) - + # Apply folding (i.e. specify to use DSPs) folding_config = { "Defaults": {}, "MatrixVectorActivation_0": { - "PE" : pe, - "SIMD" : simd, - "mem_mode" : "decoupled", - "ram_style" : "auto", - "resType" : "dsp", - "impl" : "rtl" - } + "PE": pe, + "SIMD": simd, + "mem_mode": "decoupled", + "ram_style": "auto", + "resType": "dsp", + "preferred_backend" : "rtl" + }, } model = model.transform(ApplyConfig(folding_config)) - model.save(build_dir+"/mvau_hls.onnx") - - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(PrepareIP(part, 5)) - model = model.transform(HLSSynthIP()) - model = model.transform(PrepareRTLSim()) - output_mvau_hls = oxe.execute_onnx(model, input_dict)["ofm"] + model.save(build_dir + "/mvau_hls.onnx") # Apply convert-to-rtl step model = model.transform(to_rtl.InferRTLMatrixVectorActivation()) model = model.transform(GiveUniqueNodeNames()) - model.save(build_dir+"/mvau_rtl.onnx") + model.save(build_dir + "/mvau_rtl.onnx") + # Reset rtlsim_so and ip-related paths such that new Pyverilator SO and IP is generated + for n in model.graph.node: + getCustomOp(n).set_nodeattr("rtlsim_trace", build_dir + "/mvu_trace_rtl_nodebynode.vcd") + model = model.transform(SetExecMode("rtlsim")) - model = model.transform(PrepareIP("xcvm1802-vsvd1760-2MP-e-S", 5)) + model = model.transform(PrepareIP(part, clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) - output_mvau_rtl = oxe.execute_onnx(model, input_dict)["ofm"] + output_mvau_rtl = oxe.execute_onnx(model, input_dict)["global_out"] + + with open(build_dir + "/mvau_rtl_output.pkl", "wb") as f: + pickle.dump(output_mvau_rtl, f) + + model.save(build_dir + "/mvau_rtl_sim.onnx") + assert (output_matmul == output_mvau_rtl).all(), "Output of ONNX model not matching output of node-by-node sim!" 
+ + model = model.transform(InsertAndSetFIFODepths(part, clk_ns)) + model = model.transform(PrepareIP(part, clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP(part, clk_ns)) - model.save(build_dir+"/mvau_rtl_sim.onnx") + os.environ["RTLSIM_TRACE_DEPTH"] = "3" + model.set_metadata_prop("rtlsim_so", "") + model.set_metadata_prop("exec_mode", "rtlsim") + model.set_metadata_prop("rtlsim_trace", build_dir + "/mvu_trace_rtl_stitch.vcd") + model.save(build_dir + "/stitched_ip.onnx") + output_mvau_rtl_stitch = oxe.execute_onnx(model, input_dict)["global_out"] - assert (output_mvau_hls == output_mvau_rtl).all() - assert (output_mvau_hls.size > 0) + assert (output_matmul == output_mvau_rtl_stitch).all(), "Output of ONNX model not matching output of stitched-IP RTL model!" \ No newline at end of file From 9b2ccebba2c3689d6a1e55b6df027f461244d216 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 11 Jan 2024 14:43:46 +0000 Subject: [PATCH 233/235] [mvu vvu axi]: reworked flow control and backpressure handling by tpreusser --- finn-rtllib/mvu/mvu_vvu_axi.sv | 130 ++++++++++++++++----------------- 1 file changed, 61 insertions(+), 69 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index a3b051c9a1..0168f20563 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -62,12 +62,12 @@ module mvu_vvu_axi #( // Safely deducible parameters localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7) / 8 * 8, - localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 
1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH = SIMD * ACTIVATION_WIDTH, localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7) / 8 * 8, localparam int unsigned OUTPUT_STREAM_WIDTH = PE * ACCU_WIDTH, localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7) / 8 * 8, localparam int unsigned SF = MW / SIMD, - localparam int unsigned NF = IS_MVU ? MH / PE : 1 + localparam int unsigned NF = MH / PE ) ( // Global Control @@ -119,81 +119,73 @@ module mvu_vvu_axi #( $finish; end end - if (!IS_MVU) begin - if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58") begin - $error("VVU only supported on DSP58"); - $finish; - end - end end uwire clk = ap_clk; uwire rst = !ap_rst_n; - typedef logic [INPUT_STREAM_WIDTH-1 : 0] mvauin_t; - - uwire mvauin_t amvau; + //- Replay to Accommodate Neuron Fold ----------------------------------- + typedef logic [PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t; + uwire mvu_flatin_t amvau; uwire alast; uwire afin; uwire avld; uwire ardy; - replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvauin_t))) activation_replay ( + replay_buffer #(.LEN(SF), .REP(NF), .W($bits(mvu_flatin_t))) activation_replay ( .clk, .rst, - .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvauin_t'(s_axis_input_tdata)), + .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)), .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) ); -//-------------------- Input control --------------------\\ + //- Unflatten inputs into structured matrices --------------------------- + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] mvu_w_t; + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_a_t; + + uwire mvu_w_t mvu_w = s_axis_weights_tdata; + uwire mvu_a_t mvu_a = amvau; + + //- Flow Control Bracket around Compute Core ---------------------------- uwire en; uwire istb = avld && s_axis_weights_tvalid; assign ardy = en && s_axis_weights_tvalid; assign 
s_axis_weights_tready = en && avld; -//-------------------- Core MVU/VVU --------------------\\ - uwire ovld; - uwire [PE-1:0][ACCU_WIDTH-1:0] odat; - uwire mvauin_t amvau_i; - typedef logic [WEIGHT_STREAM_WIDTH-1 : 0] mvauin_weight_t; - - if (IS_MVU) begin : genMVUInput - assign amvau_i = amvau; - end : genMVUInput - else begin : genVVUInput - // The input stream will have the channels interleaved for VVU when PE>1 - // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] - // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: - // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to - // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) - localparam int num_of_elements = PE*SIMD; - for (genvar i=0; i 1) ? - amvau[(i/SIMD + (i*PE % num_of_elements) + 1) * ACTIVATION_WIDTH -1: (i/SIMD + (i*PE % num_of_elements)) * ACTIVATION_WIDTH] - : amvau[i*ACTIVATION_WIDTH +: ACTIVATION_WIDTH]; - end : genRewire - end : genVVUInput + //- Instantiate compute core ---------------------------- + typedef logic [PE-1:0][ACCU_WIDTH-1:0] dsp_p_t; + uwire dsp_vld; + uwire dsp_p_t dsp_p; + + uwire dsp_clk = ap_clk; + uwire dsp_en = en; + uwire dsp_last = alast && avld; + uwire dsp_zero = !istb; + uwire mvu_w_t dsp_w = mvu_w; + uwire mvu_a_t dsp_a = mvu_a; + uwire ovld = dsp_vld; + uwire dsp_p_t odat = dsp_p; case(COMPUTE_CORE) "mvu_vvu_8sx9_dsp58": mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(s_axis_weights_tdata), .a(amvau_i), - .vld(ovld), .p(odat) + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) ); 
"mvu_4sx4u": mvu_4sx4u #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) ); "mvu_8sx8u_dsp48": mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( - .clk, .rst, .en, - .last(alast && avld), .zero(!istb), .w(mvauin_weight_t'(s_axis_weights_tdata)), .a(amvau_i), - .vld(ovld), .p(odat) + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) ); default: initial begin $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); @@ -202,41 +194,41 @@ module mvu_vvu_axi #( endcase //-------------------- Output register slice --------------------\\ + // Make `en` computation independent from external inputs. + // Drive all outputs from registers.
struct packed { - logic vld; + logic rdy; logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } A = '{ vld: 0, default: 'x}; - - assign en = !A.vld || !ovld; - - uwire b_load; - always_ff @(posedge clk) begin - if(rst) A <= '{ vld: 0, default: 'x }; - else if(!A.vld || b_load) begin - A.vld <= ovld && en; - for(int unsigned i = 0; i < PE; i++) begin - // CR-1148862: - // A.dat[i] <= odat[i]; - automatic logic [ACCU_WIDTH-1:0] v = odat[i]; - A.dat[i] <= v[ACCU_WIDTH-1:0]; - end - end - end - + } A = '{ rdy: 1, default: 'x }; // side-step register used when encountering backpressure struct packed { logic vld; logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } B = '{ vld: 0, default: 'x}; + } B = '{ vld: 0, default: 'x }; // ultimate output register + + assign en = A.rdy; + uwire b_load = !B.vld || m_axis_output_tready; - assign b_load = !B.vld || m_axis_output_tready; always_ff @(posedge clk) begin - if(rst) B <= '{ vld: 0, default: 'x }; + if(rst) begin + A <= '{ rdy: 1, default: 'x }; + B <= '{ vld: 0, default: 'x }; + end else begin - if(b_load) B <= '{ vld: A.vld, dat: A.dat}; + if(A.rdy) A.dat <= odat; + A.rdy <= (A.rdy && !ovld) || b_load; + + if(b_load) begin + B <= '{ + vld: ovld || !A.rdy, + dat: A.rdy? odat : A.dat + }; + end end end - assign m_axis_output_tvalid = B.vld; + // Why would we need a sign extension here potentially creating a higher signal load into the next FIFO? + // These extra bits should never be used. Why not 'x them out? assign m_axis_output_tdata = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat}; + endmodule : mvu_vvu_axi From ee9f027592e0f28deeab5cbe8d008f3be6076c92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 31 Jan 2024 09:59:17 +0000 Subject: [PATCH 234/235] Adding DSP48E1 support for 8-bit compute. Todo: finer core differentiation to select DSP48E2 explicitly again. 
--- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 165 ++++++++++++++++++++++++----- 1 file changed, 139 insertions(+), 26 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 1e6855f779..f3cde9dea9 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -5,10 +5,9 @@ module mvu_8sx8u_dsp48 #( int unsigned ACTIVATION_WIDTH, int unsigned WEIGHT_WIDTH, + int unsigned VERSION = 1, bit SIGNED_ACTIVATIONS = 0, - bit FORCE_BEHAVIORAL = 0, - - localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH + bit FORCE_BEHAVIORAL = 0 )( // Global Control input logic clk, @@ -49,6 +48,7 @@ module mvu_8sx8u_dsp48 #( assign vld = L[5]; // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism + localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH; localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets localparam int unsigned PIPE_COUNT = (PE+1)/2; @@ -63,8 +63,8 @@ module mvu_8sx8u_dsp48 #( for(genvar s = 0; s < SIMD; s++) begin : genSIMD // Input Lane Assembly - uwire [23:0] bb = { {(24-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; - logic [33:0] aa; + uwire [17:0] bb = { {(18-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; + logic [29:0] aa; logic [26:0] dd; logic [ 1:0] xx; if(1) begin : blkVectorize @@ -99,14 +99,14 @@ module mvu_8sx8u_dsp48 #( end end : blkVectorize - uwire [57:0] pp; + uwire [47:0] pp; // Note: Since the product B * AD is computed, // rst can be only applied to AD and zero only to B // with the same effect as zeroing both. 
if(BEHAVIORAL) begin : genBehav // Stage #1: Input Refine - logic signed [23:0] B1 = 0; + logic signed [17:0] B1 = 0; always_ff @(posedge clk) begin if(zero) B1 <= 0; else if(en) B1 <= bb; @@ -119,7 +119,7 @@ module mvu_8sx8u_dsp48 #( end // Stage #2: Multiply - logic signed [50:0] M2 = 0; + logic signed [45:0] M2 = 0; always_ff @(posedge clk) begin if(rst) M2 <= 0; else if(en) M2 <= @@ -130,7 +130,7 @@ module mvu_8sx8u_dsp48 #( end // Stage #3: Accumulate - logic signed [57:0] P3 = 0; + logic signed [47:0] P3 = 0; always_ff @(posedge clk) begin if(rst) P3 <= 0; else if(en) P3 <= M2 + (L[3]? 0 : P3); @@ -140,7 +140,115 @@ module mvu_8sx8u_dsp48 #( end : genBehav `ifndef VERILATOR else begin : genDSP - DSP48E2 #( + localparam logic [6:0] OPMODE_INVERSION = 7'b010_01_01; + uwire [6:0] opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 }; + case(VERSION) + 1: DSP48E1 #( + // Feature Control Attributes: Data Path Selection + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .USE_DPORT("TRUE"), // Select D port usage (TRUE or FALSE) + .USE_MULT("MULTIPLY"), // Select multiplier usage ("MULTIPLY", "DYNAMIC", or "NONE") + .USE_SIMD("ONE48"), // SIMD selection ("ONE48", "TWO24", "FOUR12") + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // "NO_RESET", "RESET_MATCH", "RESET_NOT_MATCH" + .MASK('1), // 48-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 48-bit pattern match for pattern detect + .SEL_MASK("MASK"), // "C", "MASK", "ROUNDING_MODE1", "ROUNDING_MODE2" + .SEL_PATTERN("PATTERN"), // Select pattern value ("PATTERN" or "C") + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect ("PATDET" or "NO_PATDET") + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0, 1 or 2) + .ADREG(1), // Number 
of pipeline stages for pre-adder (0 or 1) + .ALUMODEREG(0), // Number of pipeline stages for ALUMODE (0 or 1) + .AREG(0), // Number of pipeline stages for A (0, 1 or 2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0, 1 or 2) + .BREG(1), // Number of pipeline stages for B (0, 1 or 2) + .CARRYINREG(0), // Number of pipeline stages for CARRYIN (0 or 1) + .CARRYINSELREG(0), // Number of pipeline stages for CARRYINSEL (0 or 1) + .CREG(0), // Number of pipeline stages for C (0 or 1) + .DREG(0), // Number of pipeline stages for D (0 or 1) + .INMODEREG(0), // Number of pipeline stages for INMODE (0 or 1) + .MREG(1), // Number of multiplier pipeline stages (0 or 1) + .OPMODEREG(1), // Number of pipeline stages for OPMODE (0 or 1) + .PREG(1) // Number of pipeline stages for P (0 or 1) + ) dsp ( + // Cascade: 30-bit (each) output: Cascade Ports + .ACOUT(), // 30-bit output: A port cascade output + .BCOUT(), // 18-bit output: B port cascade output + .CARRYCASCOUT(), // 1-bit output: Cascade carry output + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade output + .PCOUT(), // 48-bit output: Cascade output + + // Control: 1-bit (each) output: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc output + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect output + .PATTERNDETECT(), // 1-bit output: Pattern detect output + .UNDERFLOW(), // 1-bit output: Underflow in add/acc output + + // Data: 4-bit (each) output: Data Ports + .CARRYOUT(), // 4-bit output: Carry output + .P(pp), // 48-bit output: Primary data output + + // Cascade: 30-bit (each) input: Cascade Ports + .ACIN('x), // 30-bit input: A cascade data input + .BCIN('x), // 18-bit input: B cascade input + .CARRYCASCIN('x), // 1-bit input: Cascade carry input + .MULTSIGNIN('x), // 1-bit input: Multiplier sign input + .PCIN('x), // 48-bit input: P cascade input + + // Control: 4-bit (each) input: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock input 
+ .ALUMODE('0), // 4-bit input: ALU control input + .CARRYINSEL('0), // 3-bit input: Carry select input + .INMODE(5'b01100), // 5-bit input: INMODE control input + .OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input + + // Data: 30-bit (each) input: Data Ports + .A(aa), // 30-bit input: A data input + .B(bb), // 18-bit input: B data input + .C('x), // 48-bit input: C data input + .CARRYIN('0), // 1-bit input: Carry input signal + .D(dd), // 25-bit input: D data input + + // Reset/Clock Enable: 1-bit (each) input: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable input for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable input for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable input for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable input for ALUMODERE + .CEB1('0), // 1-bit input: Clock enable input for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable input for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable input for CREG + .CECARRYIN('0), // 1-bit input: Clock enable input for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable input for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable input for DREG + .CEINMODE('0), // 1-bit input: Clock enable input for INMODEREG + .CEM(en), // 1-bit input: Clock enable input for MREG + .CEP(en), // 1-bit input: Clock enable input for PREG + .RSTA('0), // 1-bit input: Reset input for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 
1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + 2: DSP48E2 #( // Feature Control Attributes: Data Path Selection .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) @@ -163,21 +271,21 @@ module mvu_8sx8u_dsp48 #( .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins - .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE - .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN - .IS_CLK_INVERTED('0), // Optional inversion for CLK - .IS_INMODE_INVERTED('0), // Optional inversion for INMODE - .IS_OPMODE_INVERTED(9'b00_010_01_01), // Optional inversion for OPMODE - .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN - .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE - .IS_RSTA_INVERTED('0), // Optional inversion for RSTA - .IS_RSTB_INVERTED('0), // Optional inversion for RSTB - .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A - .IS_RSTC_INVERTED('0), // Optional inversion for RSTC - .IS_RSTD_INVERTED('0), // Optional inversion for RSTD - .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE - .IS_RSTM_INVERTED('0), // Optional inversion for RSTM - .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + .IS_RSTB_INVERTED('0), // Optional 
inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP // Register Control Attributes: Pipeline Register Configuration .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) @@ -225,7 +333,7 @@ module mvu_8sx8u_dsp48 #( .ALUMODE(4'h0), // 4-bit input: ALU control .CARRYINSEL('0), // 3-bit input: Carry select .INMODE(5'b01100), // 5-bit input: INMODE control - .OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }), // 9-bit input: Operation mode + .OPMODE({ 2'b00, opmode }), // 9-bit input: Operation mode // Data inputs: Data Ports .A(aa), // 34-bit input: A data @@ -269,6 +377,11 @@ module mvu_8sx8u_dsp48 #( .RSTM(rst), // 1-bit input: Reset for MREG .RSTP(rst) // 1-bit input: Reset for PREG ); + default: initial begin + $error("Unknown version DSP48E%0d.", VERSION); + $finish; + end + endcase end : genDSP `endif From 3ab82966e1af64aa6ddb75f88561c5e6c86196b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 31 Jan 2024 10:15:32 +0000 Subject: [PATCH 235/235] Adding DSP48E1 support for 4-bit compute. Todo: finer core differentiation to select DSP48E2 explicitly again. 
--- finn-rtllib/mvu/mvu_4sx4u.sv | 169 +++++++++++++++++++++++++++++------ 1 file changed, 142 insertions(+), 27 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 7a2af35742..b49315637f 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -2,8 +2,10 @@ module mvu_4sx4u #( int unsigned PE, int unsigned SIMD, int unsigned ACCU_WIDTH, - bit SIGNED_ACTIVATIONS = 0, - bit FORCE_BEHAVIORAL = 0 + + int unsigned VERSION = 1, + bit SIGNED_ACTIVATIONS = 0, + bit FORCE_BEHAVIORAL = 0 )( // Global Control input logic clk, @@ -14,7 +16,7 @@ module mvu_4sx4u #( input logic last, input logic zero, // ignore current inputs and force this partial product to zero input logic signed [PE-1:0][SIMD-1:0][3:0] w, // signed weights - input logic [SIMD-1:0][3:0] a, // unsigned activations + input logic [SIMD-1:0][3:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS) // Ouput output logic vld, @@ -58,8 +60,8 @@ module mvu_4sx4u #( for(genvar s = 0; s < SIMD; s++) begin : genSIMD // Input Lane Assembly - uwire [23:0] bb = { {(20){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] }; - logic [33:0] aa; + uwire [17:0] bb = { {(14){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] }; + logic [29:0] aa; logic [26:0] dd; logic [ 1:0] xx[3:1]; if(1) begin : blkVectorize @@ -94,14 +96,14 @@ module mvu_4sx4u #( end end : blkVectorize - uwire [57:0] pp; + uwire [47:0] pp; // Note: Since the product B * AD is computed, // rst can be only applied to AD and zero only to B // with the same effect as zeroing both. 
- if (BEHAVIORAL) begin : genBehav + if(BEHAVIORAL) begin : genBehav // Stage #1: Input Refine - logic signed [23:0] B1 = 0; + logic signed [17:0] B1 = 0; always_ff @(posedge clk) begin if(zero) B1 <= 0; else if(en) B1 <= bb; @@ -114,7 +116,7 @@ module mvu_4sx4u #( end // Stage #2: Multiply - logic signed [50:0] M2 = 0; + logic signed [45:0] M2 = 0; always_ff @(posedge clk) begin if(rst) M2 <= 0; else if(en) M2 <= @@ -125,7 +127,7 @@ module mvu_4sx4u #( end // Stage #3: Accumulate - logic signed [57:0] P3 = 0; + logic signed [47:0] P3 = 0; always_ff @(posedge clk) begin if(rst) P3 <= 0; else if(en) P3 <= M2 + (L[3]? 0 : P3); @@ -135,7 +137,115 @@ module mvu_4sx4u #( end : genBehav `ifndef VERILATOR else begin : genDSP - DSP48E2 #( + localparam logic [6:0] OPMODE_INVERSION = 7'b010_01_01; + uwire [6:0] opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 }; + case(VERSION) + 1: DSP48E1 #( + // Feature Control Attributes: Data Path Selection + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .USE_DPORT("TRUE"), // Select D port usage (TRUE or FALSE) + .USE_MULT("MULTIPLY"), // Select multiplier usage ("MULTIPLY", "DYNAMIC", or "NONE") + .USE_SIMD("ONE48"), // SIMD selection ("ONE48", "TWO24", "FOUR12") + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // "NO_RESET", "RESET_MATCH", "RESET_NOT_MATCH" + .MASK('1), // 48-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 48-bit pattern match for pattern detect + .SEL_MASK("MASK"), // "C", "MASK", "ROUNDING_MODE1", "ROUNDING_MODE2" + .SEL_PATTERN("PATTERN"), // Select pattern value ("PATTERN" or "C") + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect ("PATDET" or "NO_PATDET") + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0, 1 or 2) + 
.ADREG(1), // Number of pipeline stages for pre-adder (0 or 1) + .ALUMODEREG(0), // Number of pipeline stages for ALUMODE (0 or 1) + .AREG(0), // Number of pipeline stages for A (0, 1 or 2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0, 1 or 2) + .BREG(1), // Number of pipeline stages for B (0, 1 or 2) + .CARRYINREG(0), // Number of pipeline stages for CARRYIN (0 or 1) + .CARRYINSELREG(0), // Number of pipeline stages for CARRYINSEL (0 or 1) + .CREG(0), // Number of pipeline stages for C (0 or 1) + .DREG(0), // Number of pipeline stages for D (0 or 1) + .INMODEREG(0), // Number of pipeline stages for INMODE (0 or 1) + .MREG(1), // Number of multiplier pipeline stages (0 or 1) + .OPMODEREG(1), // Number of pipeline stages for OPMODE (0 or 1) + .PREG(1) // Number of pipeline stages for P (0 or 1) + ) dsp ( + // Cascade: 30-bit (each) output: Cascade Ports + .ACOUT(), // 30-bit output: A port cascade output + .BCOUT(), // 18-bit output: B port cascade output + .CARRYCASCOUT(), // 1-bit output: Cascade carry output + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade output + .PCOUT(), // 48-bit output: Cascade output + + // Control: 1-bit (each) output: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc output + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect output + .PATTERNDETECT(), // 1-bit output: Pattern detect output + .UNDERFLOW(), // 1-bit output: Underflow in add/acc output + + // Data: 4-bit (each) output: Data Ports + .CARRYOUT(), // 4-bit output: Carry output + .P(pp), // 48-bit output: Primary data output + + // Cascade: 30-bit (each) input: Cascade Ports + .ACIN('x), // 30-bit input: A cascade data input + .BCIN('x), // 18-bit input: B cascade input + .CARRYCASCIN('x), // 1-bit input: Cascade carry input + .MULTSIGNIN('x), // 1-bit input: Multiplier sign input + .PCIN('x), // 48-bit input: P cascade input + + // Control: 4-bit (each) input: Control Inputs/Status Bits + .CLK(clk), // 
1-bit input: Clock input + .ALUMODE('0), // 4-bit input: ALU control input + .CARRYINSEL('0), // 3-bit input: Carry select input + .INMODE(5'b01100), // 5-bit input: INMODE control input + .OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input + + // Data: 30-bit (each) input: Data Ports + .A(aa), // 30-bit input: A data input + .B(bb), // 18-bit input: B data input + .C('x), // 48-bit input: C data input + .CARRYIN('0), // 1-bit input: Carry input signal + .D(dd), // 25-bit input: D data input + + // Reset/Clock Enable: 1-bit (each) input: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable input for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable input for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable input for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable input for ALUMODEREG + .CEB1('0), // 1-bit input: Clock enable input for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable input for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable input for CREG + .CECARRYIN('0), // 1-bit input: Clock enable input for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable input for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable input for DREG + .CEINMODE('0), // 1-bit input: Clock enable input for INMODEREG + .CEM(en), // 1-bit input: Clock enable input for MREG + .CEP(en), // 1-bit input: Clock enable input for PREG + .RSTA('0), // 1-bit input: Reset input for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE
register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + 2: DSP48E2 #( // Feature Control Attributes: Data Path Selection .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) @@ -158,21 +268,21 @@ module mvu_4sx4u #( .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins - .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE - .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN - .IS_CLK_INVERTED('0), // Optional inversion for CLK - .IS_INMODE_INVERTED('0), // Optional inversion for INMODE - .IS_OPMODE_INVERTED(9'b00_010_01_01), // Optional inversion for OPMODE - .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN - .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE - .IS_RSTA_INVERTED('0), // Optional inversion for RSTA - .IS_RSTB_INVERTED('0), // Optional inversion for RSTB - .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A - .IS_RSTC_INVERTED('0), // Optional inversion for RSTC - .IS_RSTD_INVERTED('0), // Optional inversion for RSTD - .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE - .IS_RSTM_INVERTED('0), // Optional inversion for RSTM - .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + 
.IS_RSTB_INVERTED('0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for RSTCTRL + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP // Register Control Attributes: Pipeline Register Configuration .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) @@ -220,7 +330,7 @@ module mvu_4sx4u #( .ALUMODE(4'h0), // 4-bit input: ALU control .CARRYINSEL('0), // 3-bit input: Carry select .INMODE(5'b01100), // 5-bit input: INMODE control - .OPMODE({ 2'b00, { 1'b0, L[2], 1'b0 }, 4'b00_00 }), // 9-bit input: Operation mode + .OPMODE({ 2'b00, opmode }), // 9-bit input: Operation mode // Data inputs: Data Ports .A(aa), // 34-bit input: A data @@ -264,6 +374,11 @@ module mvu_4sx4u #( .RSTM(rst), // 1-bit input: Reset for MREG .RSTP(rst) // 1-bit input: Reset for PREG ); + default: initial begin + $error("Unknown version DSP48E%0d.", VERSION); + $finish; + end + endcase end : genDSP `endif