From 7583b420236d0108364e6414a4ec385a589ce0fb Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Sat, 27 Jan 2024 18:40:55 +0000 Subject: [PATCH 01/49] [SETUPTOOLS] add all install requirements Signed-off-by: aziz bahri --- setup.cfg | 66 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/setup.cfg b/setup.cfg index 4834011dea..26158cc4b1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,6 +32,7 @@ [metadata] name = finn +version = 0.9.0 description = A Framework for Fast, Scalable Quantized Neural Network Inference author = Yaman Umuroglu author_email = yamanu@xilinx.com @@ -60,6 +61,60 @@ package_dir = # tests_require = pytest; pytest-cov # Require a specific Python version, e.g. Python 2.7 or >= 3.4 # python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.* +install_requires = + qonnx @ git+https://github.com/fastmachinelearning/qonnx.git@47e4357faf66b5b0d1bf77bf908bb47752421e5b + pyverilator @ git+https://github.com/maltanar/pyverilator.git@766e457465f5c0dd315490d7b9cc5d74f9a76f4f + brevitas @ git+https://github.com/Xilinx/brevitas.git@84f42259ec869eb151af4cb8a8b23ad925f493db + finn-experimental @ git+https://github.com/Xilinx/finn-experimental.git@de99347e936d51715f5356a1b6c64e37b91c23c2 + dataset_loading @ git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading + bitstring==3.1.7 + clize==5.0.1 + dataclasses-json==0.5.7 + gspread==3.6.0 + importlib-resources==6.1.0 + ipython==8.12.2 + numpy==1.24.1 + onnx==1.13.0 + onnxoptimizer + onnxruntime==1.15.0 + pre-commit==3.3.2 + protobuf==3.20.3 + psutil==5.9.4 + pyscaffold==4.4 + scipy==1.10.1 + setupext-janitor>=1.1.2 + setuptools==68.2.2 + sigtools==4.0.1 + toposort==1.7.0 + vcdvcd==1.0.5 + wget==3.2 + torch==1.13.1 + torchvision==0.14.1 + torchaudio==0.13.1 + pygments==2.14.0 + ipykernel==6.21.2 + jupyter==1.0.0 + markupsafe==2.0.1 + matplotlib==3.7.0 + pytest-dependency==0.5.1 + pytest-xdist[setproctitle]==3.2.0 + pytest-parallel==0.1.1 + netron>=5.0.0 + pandas==1.5.3 + scikit-learn==1.2.1 + tqdm==4.64.1 + pytest==6.2.5 + pytest-metadata==1.7.0 + pytest-html==3.0.0 + pytest-html-merger==0.0.8 + pytest-cov==4.1.0 + deap==1.3.1 + mip==1.13.0 + networkx==2.8 + future-annotations==1.0.0 + dependencies==2.0.1 + tokenize-rt==4.2.1 + tclwrapper==0.0.1 [options.packages.find] where = src @@ -164,14 +219,3 @@ exclude = dist .eggs docs/conf.py - -[pyscaffold] -# PyScaffold's parameters when the project was created. -# This will be used when updating. Do not change! 
-version = 3.2.1 -package = finn -extensions = - travis - pre_commit - namespace -namespace = finn From 8ae17bc795f29088ff1bf78c1dc6dc2799839cee Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Sun, 28 Jan 2024 16:33:39 +0000 Subject: [PATCH 02/49] [SETUPTOOLS] Deps directory cleanup Signed-off-by: aziz bahri --- fetch-repos.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fetch-repos.sh b/fetch-repos.sh index 1275ccf31c..9869495fc1 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -115,10 +115,10 @@ fetch_board_files() { cd $OLD_PWD } -fetch_repo $QONNX_URL $QONNX_COMMIT $QONNX_DIR -fetch_repo $FINN_EXP_URL $FINN_EXP_COMMIT $FINN_EXP_DIR -fetch_repo $BREVITAS_URL $BREVITAS_COMMIT $BREVITAS_DIR -fetch_repo $PYVERILATOR_URL $PYVERILATOR_COMMIT $PYVERILATOR_DIR +# fetch_repo $QONNX_URL $QONNX_COMMIT $QONNX_DIR +# fetch_repo $FINN_EXP_URL $FINN_EXP_COMMIT $FINN_EXP_DIR +# fetch_repo $BREVITAS_URL $BREVITAS_COMMIT $BREVITAS_DIR +# fetch_repo $PYVERILATOR_URL $PYVERILATOR_COMMIT $PYVERILATOR_DIR fetch_repo $CNPY_URL $CNPY_COMMIT $CNPY_DIR fetch_repo $HLSLIB_URL $HLSLIB_COMMIT $HLSLIB_DIR fetch_repo $OMX_URL $OMX_COMMIT $OMX_DIR From 3e0b04f019fd41ba8aec49a169443a39f61ae168 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 7 Mar 2024 16:43:39 +0000 Subject: [PATCH 03/49] [QONNX] update to latest main fd61cfe Signed-off-by: aziz bahri --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 26158cc4b1..6168bec5e3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -62,7 +62,7 @@ package_dir = # Require a specific Python version, e.g. Python 2.7 or >= 3.4 # python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.* install_requires = - qonnx @ git+https://github.com/fastmachinelearning/qonnx.git@47e4357faf66b5b0d1bf77bf908bb47752421e5b + qonnx @ git+https://github.com/fastmachinelearning/qonnx.git@fd61cfeebbdaba351abf7e9d54cd785d7776fa4f pyverilator @ git+https://github.com/maltanar/pyverilator.git@766e457465f5c0dd315490d7b9cc5d74f9a76f4f brevitas @ git+https://github.com/Xilinx/brevitas.git@84f42259ec869eb151af4cb8a8b23ad925f493db finn-experimental @ git+https://github.com/Xilinx/finn-experimental.git@de99347e936d51715f5356a1b6c64e37b91c23c2 From b066882f6a912c6eb9cd19f4c6487a7d254dd7db Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 19 Jul 2024 14:39:47 +0100 Subject: [PATCH 04/49] [RTL Thresh] Enable workaround for unsigned narrow quantization --- .../fpgadataflow/rtl/thresholding_rtl.py | 31 +++++++++++++++---- .../test_fpgadataflow_thresholding.py | 8 ++--- .../test_fpgadataflow_thresholding_runtime.py | 26 ++++++++++------ 3 files changed, 45 insertions(+), 20 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index c31f90af0b..230d2879f5 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -186,9 +186,19 @@ def prepare_codegen_rtl_values(self, model): n_thres_steps = self.get_nodeattr("numSteps") wdt = self.get_weight_datatype() if expected_thresholds != n_thres_steps: - min_val = wdt.min() - thresholds = np.insert(thresholds, 0, min_val, axis=1) - bias = bias - 1 + if DataType[output_data_type].signed(): + min_val = wdt.min() + thresholds = np.insert(thresholds, 0, min_val, axis=1) + bias = bias - 1 + # TODO: temporary fix for unsigned narrow quantization + else: + max_val = wdt.max() + if max_val > DataType[input_data_type].max(): + thresholds = 
np.insert(thresholds, len(thresholds[0]), max_val, axis=1) + else: + max_val = max_val + 1 + wdt = DataType.get_smallest_possible(max_val) + thresholds = np.insert(thresholds, len(thresholds[0]), max_val, axis=1) n_thres_steps += 1 # add dummy dimension as final dimension (that's what gets packed with next call) @@ -528,8 +538,18 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): n_thres_steps = self.get_nodeattr("numSteps") wdt = self.get_weight_datatype() if expected_thresholds != n_thres_steps: - min_val = wdt.min() - thresholds = np.insert(thresholds, 0, min_val, axis=1) + if DataType[output_data_type].signed(): + min_val = wdt.min() + thresholds = np.insert(thresholds, 0, min_val, axis=1) + # TODO: temporary fix for unsigned narrow quantization + else: + max_val = wdt.max() + if max_val > self.get_input_datatype().max(): + thresholds = np.insert(thresholds, len(thresholds[0]), max_val, axis=1) + else: + max_val = max_val + 1 + wdt = DataType.get_smallest_possible(max_val) + thresholds = np.insert(thresholds, len(thresholds[0]), max_val, axis=1) n_thres_steps += 1 # If a single threshold value is found, broadcast the value @@ -541,7 +561,6 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): thresh_padded = np.zeros((thresholds.shape[0], width_padded)) thresh_padded[: thresholds.shape[0], :n_thres_steps] = thresholds thresh_stream = [] - wdt = self.get_weight_datatype() bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 32) padding = np.zeros(width_padded, dtype=np.int32) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index e4dd49fc7f..fe7ba3d9fb 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -129,14 +129,14 @@ def make_single_multithresholding_modelwrapper( [1, 2, 2], ], ) -@pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]]) +@pytest.mark.parametrize("activation", [DataType["UINT4"], DataType["INT4"], DataType["BIPOLAR"]]) @pytest.mark.parametrize( "idt_tdt_cfg", [ (DataType["INT8"], DataType["INT8"]), (DataType["INT8"], DataType["INT9"]), - (DataType["UINT8"], DataType["UINT8"]), - (DataType["UINT8"], DataType["UINT9"]), + (DataType["UINT5"], DataType["UINT5"]), + (DataType["UINT5"], DataType["UINT6"]), ], ) @pytest.mark.parametrize("fold", [-1, 1, 2]) @@ -184,7 +184,7 @@ def test_fpgadataflow_thresholding( activation_bias = 0 else: activation_bias = activation.min() - if narrow: + if narrow and activation.signed(): activation_bias += 1 # Generate random thresholds and sort in ascending order diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py index 1ad695bb94..e6175ac58b 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py @@ -122,13 +122,16 @@ def make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp @pytest.mark.parametrize("impl_style", ["rtl", "hls"]) +@pytest.mark.parametrize( + "idt_act_cfg", [(DataType["INT16"], DataType["INT4"]), (DataType["UINT8"], DataType["UINT4"])] +) # configuration (ch, pe) -@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3)]) +@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 6)]) @pytest.mark.parametrize("narrow", [True, False]) @pytest.mark.parametrize("per_tensor", [True, False]) @pytest.mark.fpgadataflow 
@pytest.mark.vivado -def test_runtime_thresholds_read(impl_style, cfg, narrow, per_tensor): +def test_runtime_thresholds_read(impl_style, idt_act_cfg, cfg, narrow, per_tensor): """Read back threshold weights during runtime 1. Create random initial weights T @@ -140,8 +143,8 @@ def test_runtime_thresholds_read(impl_style, cfg, narrow, per_tensor): pe = cfg[1] n_inp_vecs = [1, 2, 2] hls_mem_mode = "internal_decoupled" - act = DataType["INT4"] - idt = DataType["INT16"] + act = idt_act_cfg[1] + idt = idt_act_cfg[0] odt = act n_steps = act.get_num_possible_values() - 1 # Generate random thresholds and sort in ascending order @@ -151,7 +154,7 @@ def test_runtime_thresholds_read(impl_style, cfg, narrow, per_tensor): T = sort_thresholds_increasing(T) actval = act.min() - if narrow: + if narrow and act.signed(): actval += 1 model = make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp_vecs, ch) @@ -219,13 +222,16 @@ def read_weights(sim): @pytest.mark.parametrize("impl_style", ["rtl", "hls"]) +@pytest.mark.parametrize( + "idt_act_cfg", [(DataType["INT16"], DataType["INT4"]), (DataType["UINT8"], DataType["UINT4"])] +) # configuration (ch, pe) -@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3)]) +@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 6)]) @pytest.mark.parametrize("narrow", [True, False]) @pytest.mark.parametrize("per_tensor", [True, False]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_runtime_thresholds_write(impl_style, cfg, narrow, per_tensor): +def test_runtime_thresholds_write(impl_style, idt_act_cfg, cfg, narrow, per_tensor): """Write threshold weights during runtime 1. Create random initial weights T_init @@ -241,8 +247,8 @@ def test_runtime_thresholds_write(impl_style, cfg, narrow, per_tensor): n_inp_vecs = [1, 2, 2] hls_mem_mode = "internal_decoupled" - act = DataType["INT4"] - idt = DataType["INT16"] + act = idt_act_cfg[1] + idt = idt_act_cfg[0] odt = act n_steps = act.get_num_possible_values() - 1 @@ -253,7 +259,7 @@ def test_runtime_thresholds_write(impl_style, cfg, narrow, per_tensor): T_init = sort_thresholds_increasing(T_init) actval = act.min() - if narrow: + if narrow and act.signed(): actval += 1 model = make_single_thresholding_modelwrapper( From 9d95b1b3c34bfabcf4160e4a39f7cc9bc26a363e Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 22 Jul 2024 11:30:31 +0100 Subject: [PATCH 05/49] [RTL thresh] Fix datatype extension for unsigned narrow quantization --- .../custom_op/fpgadataflow/rtl/thresholding_rtl.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 230d2879f5..d1e9387b1b 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -197,7 +197,11 @@ def prepare_codegen_rtl_values(self, model): thresholds = np.insert(thresholds, len(thresholds[0]), max_val, axis=1) else: max_val = max_val + 1 - wdt = DataType.get_smallest_possible(max_val) + # increase wdt + if not wdt.signed(): + wdt = DataType.get_smallest_possible(max_val) + else: + wdt = DataType.get_smallest_possible(-max_val - 1) thresholds = np.insert(thresholds, len(thresholds[0]), max_val, axis=1) n_thres_steps += 1 @@ -548,7 +552,11 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): thresholds = np.insert(thresholds, len(thresholds[0]), max_val, axis=1) else: max_val = max_val + 1 - wdt = 
DataType.get_smallest_possible(max_val) + # increase wdt + if not wdt.signed(): + wdt = DataType.get_smallest_possible(max_val) + else: + wdt = DataType.get_smallest_possible(-max_val - 1) thresholds = np.insert(thresholds, len(thresholds[0]), max_val, axis=1) n_thres_steps += 1 From ec120d54f86d27546301b552cc780d87b859c6e7 Mon Sep 17 00:00:00 2001 From: Hannah Yan Date: Mon, 22 Jul 2024 17:48:32 +0100 Subject: [PATCH 06/49] Updated run-docker.sh to include values needed for verification --- run-docker.sh | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/run-docker.sh b/run-docker.sh index 88fabff2fa..9dd6796782 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -265,6 +265,30 @@ if [ ! -z "$FINN_XILINX_PATH" ];then DOCKER_EXEC+="-e ALVEO_TARGET_DIR=$ALVEO_TARGET_DIR " fi fi + +# If using build verification, set up the necessary Docker variables +if [ "$VERIFICATION_EN" = 1 ]; then + if [ -z "$FINN_EXAMPLES_ROOT" ]; then + recho "FINN_EXAMPLES_ROOT path has not been set." + recho "Please set FINN_EXAMPLES_ROOT path to enable verification." + exit -1 + elif [ ! -d "${FINN_EXAMPLES_ROOT}/ci" ]; then + recho "ci folder not found in ${FINN_EXAMPLES_ROOT}." + recho "Please ensure the FINN-examples repo has been set up correctly, and FINN_EXAMPLES_ROOT path is set correctly, to enable verification." + exit -1 + elif [ -z "$VERIFICATION_IO" ]; then + recho "VERIFICATION_IO paths has not been set." + recho "Please ensure the path to the input and expected output files has been set correctly to eneable verification." + exit -1 + else + DOCKER_EXEC+="-e VERIFICATION_EN=$VERIFICATION_EN " + DOCKER_EXEC+="-e FINN_EXAMPLES_ROOT=$FINN_EXAMPLES_ROOT " + DOCKER_EXEC+="-e VERIFICATION_IO=$VERIFICATION_IO " + FINN_DOCKER_EXTRA+="-v $FINN_EXAMPLES_ROOT/ci:$FINN_EXAMPLES_ROOT/ci " + FINN_DOCKER_EXTRA+="-v $VERIFICATION_IO:$VERIFICATION_IO " + fi +fi + DOCKER_EXEC+="$FINN_DOCKER_EXTRA " if [ -z "$FINN_SINGULARITY" ];then From 6a4406d2d14da298648f0733ec0f744918b98806 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 23 Jul 2024 14:06:19 +0100 Subject: [PATCH 07/49] [Docker] Add additional comment to clarify that additions to bash script are for internal ci --- run-docker.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/run-docker.sh b/run-docker.sh index 9dd6796782..0b45638bda 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -266,7 +266,8 @@ if [ ! -z "$FINN_XILINX_PATH" ];then fi fi -# If using build verification, set up the necessary Docker variables +# This part is used for internal ci for finn-examples +# if using build verification for finn-examples ci, set up the necessary Docker variables if [ "$VERIFICATION_EN" = 1 ]; then if [ -z "$FINN_EXAMPLES_ROOT" ]; then recho "FINN_EXAMPLES_ROOT path has not been set." @@ -289,6 +290,7 @@ if [ "$VERIFICATION_EN" = 1 ]; then fi fi + DOCKER_EXEC+="$FINN_DOCKER_EXTRA " if [ -z "$FINN_SINGULARITY" ];then From 65a356a08f73cb42d750106ffa84fcc0c401826a Mon Sep 17 00:00:00 2001 From: Hannah Yan Date: Thu, 25 Jul 2024 10:13:04 +0100 Subject: [PATCH 08/49] Updated run-docker.sh to check VERIFICATION_IO path --- run-docker.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/run-docker.sh b/run-docker.sh index 0b45638bda..b1fe44eb0c 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -281,6 +281,10 @@ if [ "$VERIFICATION_EN" = 1 ]; then recho "VERIFICATION_IO paths has not been set." recho "Please ensure the path to the input and expected output files has been set correctly to eneable verification." exit -1 + elif [ ! 
-d "$VERIFICATION_IO" ]; then + recho "${VERIFICATION_IO} is not a directory." + recho "Please ensure the VERIFICATION_IO path has been set to the directory containing the input and expected output files for verification." + exit -1 else DOCKER_EXEC+="-e VERIFICATION_EN=$VERIFICATION_EN " DOCKER_EXEC+="-e FINN_EXAMPLES_ROOT=$FINN_EXAMPLES_ROOT " From 43dad3bf55b1a7da7e1d246b6c969e1d80f46480 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 26 Jul 2024 23:58:52 +0100 Subject: [PATCH 09/49] setup: upgrade onnxruntime --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 6168bec5e3..511ce451dd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -76,7 +76,7 @@ install_requires = numpy==1.24.1 onnx==1.13.0 onnxoptimizer - onnxruntime==1.15.0 + onnxruntime==1.16.1 pre-commit==3.3.2 protobuf==3.20.3 psutil==5.9.4 From 0256f043a527f721aed3464d21c0ef8f708715e7 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 26 Jul 2024 23:56:46 +0100 Subject: [PATCH 10/49] softmax: add initial test harness --- .../fpgadataflow/test_fpgadataflow_softmax.py | 165 ++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 tests/fpgadataflow/test_fpgadataflow_softmax.py diff --git a/tests/fpgadataflow/test_fpgadataflow_softmax.py b/tests/fpgadataflow/test_fpgadataflow_softmax.py new file mode 100644 index 0000000000..6eb424f441 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_softmax.py @@ -0,0 +1,165 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
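
A short note on the verification hooks wired into run-docker.sh in the patches
above: the container only receives VERIFICATION_EN, FINN_EXAMPLES_ROOT and
VERIFICATION_IO; how they are consumed is left to the finn-examples ci scripts.
A rough sketch of that consumption, with assumed file names and comparison
logic (not part of these patches):

    import os
    import numpy as np

    if os.environ.get("VERIFICATION_EN", "0") == "1":
        io_dir = os.environ["VERIFICATION_IO"]            # directory mounted by run-docker.sh
        examples_root = os.environ["FINN_EXAMPLES_ROOT"]  # provides the ci/ folder
        inp = np.load(os.path.join(io_dir, "input.npy"))                 # assumed file name
        expected = np.load(os.path.join(io_dir, "expected_output.npy"))  # assumed file name
        # ... feed `inp` through the build step under test and compare against `expected`
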
+ +import pytest +import torch +import onnx +from onnx import helper, numpy_helper +import numpy as np +import os +from brevitas.export import export_qonnx +from qonnx.util.cleanup import cleanup as qonnx_cleanup +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.fold_constants import FoldConstants +from finn.transformation.streamline.absorb import ( + AbsorbAddIntoMultiThreshold, + AbsorbMulIntoMultiThreshold, + FactorOutMulSignMagnitude, + Absorb1BitMulIntoConv, +) +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +from brevitas.quant.scaled_int import Int8ActPerTensorFloat, Int8WeightPerTensorFloat +import finn.core.onnx_exec as oxe +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN +from finn.util.basic import pynq_part_map +from finn.transformation.streamline.reorder import ( + MakeMaxPoolNHWC, + MoveScalarLinearPastInvariants, +) +from finn.transformation.streamline import Streamline +import finn.transformation.streamline.absorb as absorb +import onnx +from onnx import helper +import onnxruntime +import torch +import torch.nn as nn +import brevitas.nn as qnn +test_fpga_part = "xczu3eg-sbva484-1-e" +target_clk_ns = 5 +export_onnx_path = "softmax_dut.onnx" + +### Make model wrapper +# 1. make node, + + +### Test +## 1. Compiler integration +# 1. check all transforms can be applied to a model with a softmax layer +# 2. Check that IP stitching produces valid HLS package + +## 2. Functionality test +# 1. Check that we can run cpp/rtl sims +# 2. check values are correct + +def create_model(): + ''' + Create a quantized softmax model. + Input and output are quantized to Int8ActPerTensorFloat, this is to make sure + that the softmax layer is followed by a Quant node. 
+ ''' + io_shape = (1, 64) + class QuantSoftMaxSimple(nn.Module): + def __init__(self): + super(QuantSoftMaxSimple, self).__init__() + # self.input_identity = qnn.QuantIdentity(act_quant=Int8ActPerTensorFloat) + self.output_identity = qnn.QuantIdentity() + self.softmax = nn.Softmax(dim=1) + + def forward(self, x): + # x = self.input_identity(x) + x = self.softmax(x) + x = self.output_identity(x) + return x + + dut = QuantSoftMaxSimple() + input = torch.randn(io_shape) + export_qonnx(dut, input, export_onnx_path, opset_version=11) + qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) + # set the model input to INT8 + model = ModelWrapper(export_onnx_path) + model.set_tensor_datatype(model.graph.input[0].name, DataType["UINT8"]) + # import pdb; pdb.set_trace() + return model + +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.fpgadataflow +def test_convert_to_hw_softmax_layer(exec_mode): + ''' + Test that all transofrmations can be applied to a model with a softmax layer. + ''' + # Create the qonnx model + # modelproto = create_softmax_graph() + + model = create_model() + try: + model = model.transform(ConvertQONNXtoFINN()) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model.save("qonnx_softmax_dut.onnx") + model = model.transform(to_hw.InferQuantSoftmax()) + model = model.transform(SpecializeLayers(test_fpga_part)) + if exec_mode == "cppsim": + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + except Exception as e: + pytest.fail(f"Failed to transform the model: {str(e)}") + +def test_fpgadataflow_quantsoftmax(): + # Create the qonnx model + # create_model() + model = create_model() + try: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(FoldConstants()) + model = model.transform(to_hw.InferQuantSoftmax()) + model = model.transform(SpecializeLayers(test_fpga_part)) + + except Exception as e: + pytest.fail(f"Failed to transform the model: {str(e)}") \ No newline at end of file From 58da0f67ef1e8b5cbc14962270bc8f67a7b171e7 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 26 Jul 2024 23:57:57 +0100 Subject: [PATCH 11/49] softmax: stub class an infersoftmax --- src/finn/custom_op/fpgadataflow/__init__.py | 3 ++ .../custom_op/fpgadataflow/quantsoftmax.py | 22 +++++++++ .../fpgadataflow/convert_to_hw_layers.py | 47 +++++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 src/finn/custom_op/fpgadataflow/quantsoftmax.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index aed2ab7fe1..9bcbb1e860 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -54,6 +54,8 @@ from finn.custom_op.fpgadataflow.thresholding import Thresholding from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU +from finn.custom_op.fpgadataflow.quantsoftmax import QuantSoftmax + custom_op = dict() @@ -81,3 +83,4 @@ custom_op["StreamingEltwise"] = StreamingEltwise custom_op["StreamingMaxPool"] = StreamingMaxPool custom_op["UpsampleNearestNeighbour"] = 
UpsampleNearestNeighbour +custom_op["QuantSoftmax"] = QuantSoftmax diff --git a/src/finn/custom_op/fpgadataflow/quantsoftmax.py b/src/finn/custom_op/fpgadataflow/quantsoftmax.py new file mode 100644 index 0000000000..16f54cc2af --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/quantsoftmax.py @@ -0,0 +1,22 @@ + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.util.data_packing import numpy_to_hls_code, pack_innermost_dim_as_hex_string + +class QuantSoftmax(HWCustomOp): + """Abstraction layer for HW implementation of VectorVectorActivation layers.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + "simd": ("i", False, 1), + "channels": ("i", True, 0), + # FINN DataTypes for inputs, weights, outputs + "data_type": ("s", True, ""), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_number_output_values(self): + raise NotImplementedError("This function is not yet implemented.") \ No newline at end of file diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index e14181b140..c93bf48393 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1697,3 +1697,50 @@ def apply(self, model): model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) + + +class InferQuantSoftmax(Transformation): + ''' + Find softmax layers that are followed by a MultiThreshold layer and replace them with QuantizedSoftmax + ''' + def __init__(self): + super().__init__() + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + # check that an optype of Softmax is present followed by a MultiThreshold + consumer = model.find_consumer(n.output[0]) + if consumer is not None and consumer.op_type == "MultiThreshold": + print("Found Softmax followed by MultiThreshold") + # get the shape of the input tensor + input_shape = model.get_tensor_shape(n.input[0]) + # get the shape of the output tensor + output_shape = model.get_tensor_shape(n.output[0]) + idt0 = model.get_tensor_datatype(n.input[0]) + num_channels = int(input_shape[-1]) + # create node with no parallelization first + simd = 1 + # create and insert new node + new_node = helper.make_node( + "QuantSoftmax", + [n.input[0]], # input tensor(s) + [n.output[0]], # output tensor(s) + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + channels=num_channels, + data_type = idt0.name, + name=n.name, + simd=simd + ) + graph.node.insert(node_ind, new_node) + graph.node.remove(n) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) From bbbdc4aefc6c60e1ea446573d8117e9b7ebf1f23 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Mon, 29 Jul 2024 14:02:18 +0100 Subject: [PATCH 12/49] softmax: stub abstract methods --- .../custom_op/fpgadataflow/quantsoftmax.py | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/quantsoftmax.py b/src/finn/custom_op/fpgadataflow/quantsoftmax.py index 16f54cc2af..e6f258bde6 100644 --- a/src/finn/custom_op/fpgadataflow/quantsoftmax.py +++ b/src/finn/custom_op/fpgadataflow/quantsoftmax.py @@ -19,4 +19,22 @@ def 
get_nodeattr_types(self): return my_attrs def get_number_output_values(self): - raise NotImplementedError("This function is not yet implemented.") \ No newline at end of file + raise NotImplementedError("This function is not yet implemented.") + + def execute_node(self, context, graph): + raise NotImplementedError + + def get_number_output_values(self): + raise NotImplementedError + + def get_nodeattr_types(self): + raise NotImplementedError + + def make_shape_compatible_op(self, model): + raise NotImplementedError + + def infer_node_datatype(self, model): + raise NotImplementedError + + def verify_node(self): + raise NotImplementedError \ No newline at end of file From cbda331ed543e5b6dc8a48579b037776b561f9d1 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Mon, 29 Jul 2024 14:02:47 +0100 Subject: [PATCH 13/49] softmax: input image dimension attribute --- .../transformation/fpgadataflow/convert_to_hw_layers.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index c93bf48393..52999c4c1a 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1716,9 +1716,10 @@ def apply(self, model): consumer = model.find_consumer(n.output[0]) if consumer is not None and consumer.op_type == "MultiThreshold": print("Found Softmax followed by MultiThreshold") - # get the shape of the input tensor + # get the shape of the input/output tensor input_shape = model.get_tensor_shape(n.input[0]) - # get the shape of the output tensor + dim_h = int(input_shape[1]) + dim_w = int(input_shape[2]) output_shape = model.get_tensor_shape(n.output[0]) idt0 = model.get_tensor_datatype(n.input[0]) num_channels = int(input_shape[-1]) @@ -1731,6 +1732,7 @@ def apply(self, model): [n.output[0]], # output tensor(s) domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", + img_dim=[dim_h, dim_w], channels=num_channels, data_type = idt0.name, name=n.name, From 48738f63cefce35074952a247bd28dc9bbea744d Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Mon, 29 Jul 2024 14:03:07 +0100 Subject: [PATCH 14/49] softmax: use input img dimension to build input shape --- src/finn/custom_op/fpgadataflow/quantsoftmax.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/quantsoftmax.py b/src/finn/custom_op/fpgadataflow/quantsoftmax.py index e6f258bde6..4654feb6cc 100644 --- a/src/finn/custom_op/fpgadataflow/quantsoftmax.py +++ b/src/finn/custom_op/fpgadataflow/quantsoftmax.py @@ -10,6 +10,7 @@ def __init__(self, onnx_node, **kwargs): def get_nodeattr_types(self): my_attrs = { + "img_dim": ("i", True, 0), "simd": ("i", False, 1), "channels": ("i", True, 0), # FINN DataTypes for inputs, weights, outputs @@ -18,6 +19,15 @@ def get_nodeattr_types(self): my_attrs.update(super().get_nodeattr_types()) return my_attrs + def get_normal_input_shape(self, ind=0): + idim_h, idim_w = self.get_nodeattr("img_dim") + num_ch = self.get_nodeattr("channels") + ishape = (1, idim_h, idim_w, num_ch) + return ishape + + def get_normal_output_shape(self, ind=0): + return self.get_normal_input_shape() + def get_number_output_values(self): raise NotImplementedError("This function is not yet implemented.") From ec33c6aa6b89f2e886af1e9487f9d5eefb2c02e0 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Mon, 29 Jul 2024 16:18:29 +0100 Subject: [PATCH 15/49] softmax: softmax on inner dim --- 
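
Note: a minimal usage sketch of the conversion flow these patches build up; the
file names and FPGA part are illustrative, and InferQuantSoftmax only fires on a
Softmax node whose output feeds a MultiThreshold (i.e. a quantized softmax):

    from qonnx.core.modelwrapper import ModelWrapper
    import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw
    from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers

    model = ModelWrapper("softmax_streamlined.onnx")    # assumed: already cleaned up / streamlined
    model = model.transform(to_hw.InferQuantSoftmax())  # Softmax + MultiThreshold -> QuantSoftmax
    model = model.transform(SpecializeLayers("xczu3eg-sbva484-1-e"))  # pick a backend variant for the part
    model.save("softmax_hw.onnx")
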
tests/fpgadataflow/test_fpgadataflow_softmax.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_softmax.py b/tests/fpgadataflow/test_fpgadataflow_softmax.py index 6eb424f441..e1242b7283 100644 --- a/tests/fpgadataflow/test_fpgadataflow_softmax.py +++ b/tests/fpgadataflow/test_fpgadataflow_softmax.py @@ -76,7 +76,7 @@ import brevitas.nn as qnn test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 -export_onnx_path = "softmax_dut.onnx" +export_onnx_path = "softmax_dut_qonnx.onnx" ### Make model wrapper # 1. make node, @@ -97,13 +97,13 @@ def create_model(): Input and output are quantized to Int8ActPerTensorFloat, this is to make sure that the softmax layer is followed by a Quant node. ''' - io_shape = (1, 64) + io_shape = (1, 8, 8, 2) class QuantSoftMaxSimple(nn.Module): def __init__(self): super(QuantSoftMaxSimple, self).__init__() # self.input_identity = qnn.QuantIdentity(act_quant=Int8ActPerTensorFloat) self.output_identity = qnn.QuantIdentity() - self.softmax = nn.Softmax(dim=1) + self.softmax = nn.Softmax(dim=3) # softmax along the last dimension def forward(self, x): # x = self.input_identity(x) From 507b7981731601cb48536fcd1af51c3177a02b5e Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Mon, 29 Jul 2024 16:19:16 +0100 Subject: [PATCH 16/49] softmax: infer softmax helper implementation --- .../custom_op/fpgadataflow/quantsoftmax.py | 43 ++++++++++++++----- .../fpgadataflow/convert_to_hw_layers.py | 16 ++++--- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/quantsoftmax.py b/src/finn/custom_op/fpgadataflow/quantsoftmax.py index 4654feb6cc..c37f791270 100644 --- a/src/finn/custom_op/fpgadataflow/quantsoftmax.py +++ b/src/finn/custom_op/fpgadataflow/quantsoftmax.py @@ -1,6 +1,9 @@ from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp from finn.util.data_packing import numpy_to_hls_code, pack_innermost_dim_as_hex_string +from onnx.helper import make_node +import warnings +from qonnx.core.datatype import DataType class QuantSoftmax(HWCustomOp): """Abstraction layer for HW implementation of VectorVectorActivation layers.""" @@ -10,7 +13,7 @@ def __init__(self, onnx_node, **kwargs): def get_nodeattr_types(self): my_attrs = { - "img_dim": ("i", True, 0), + "ifm_dim": ("ints", True, []), "simd": ("i", False, 1), "channels": ("i", True, 0), # FINN DataTypes for inputs, weights, outputs @@ -20,10 +23,9 @@ def get_nodeattr_types(self): return my_attrs def get_normal_input_shape(self, ind=0): - idim_h, idim_w = self.get_nodeattr("img_dim") - num_ch = self.get_nodeattr("channels") - ishape = (1, idim_h, idim_w, num_ch) - return ishape + h, w = self.get_nodeattr("ifm_dim") + c = self.get_nodeattr("channels") + return (1, h, w, c) def get_normal_output_shape(self, ind=0): return self.get_normal_input_shape() @@ -37,14 +39,35 @@ def execute_node(self, context, graph): def get_number_output_values(self): raise NotImplementedError - def get_nodeattr_types(self): - raise NotImplementedError - def make_shape_compatible_op(self, model): - raise NotImplementedError + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + data_type = DataType[self.get_nodeattr("data_type")] + # the hlslib op always pads with zeros, so ensure that the DataType + # is able to represent zeros + assert data_type.allowed(0), "DataType must support zero" + return data_type + def make_shape_compatible_op(self, model): + shape = self.get_normal_input_shape() + # create an ONNX Softmax node 
with the same shape as this one + return make_node("Softmax", + inputs=[self.onnx_node.input[0]], + outputs=[self.onnx_node.output[0]], + shape=list(shape) + ) def infer_node_datatype(self, model): - raise NotImplementedError + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "data_type changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("data_type", idt.name) + model.set_tensor_datatype(node.output[0], idt) def verify_node(self): raise NotImplementedError \ No newline at end of file diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 52999c4c1a..2966bf1cc0 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1714,15 +1714,17 @@ def apply(self, model): node_ind += 1 # check that an optype of Softmax is present followed by a MultiThreshold consumer = model.find_consumer(n.output[0]) - if consumer is not None and consumer.op_type == "MultiThreshold": + if n.op_type == "Softmax" and consumer is not None and consumer.op_type == "MultiThreshold": print("Found Softmax followed by MultiThreshold") # get the shape of the input/output tensor input_shape = model.get_tensor_shape(n.input[0]) - dim_h = int(input_shape[1]) - dim_w = int(input_shape[2]) - output_shape = model.get_tensor_shape(n.output[0]) + assert input_shape == model.get_tensor_shape(consumer.input[0]), ( + "Softmax and MultiThreshold input shapes do not match" + ) + h = int(input_shape[1]) + w = int(input_shape[2]) + c = int(input_shape[3]) idt0 = model.get_tensor_datatype(n.input[0]) - num_channels = int(input_shape[-1]) # create node with no parallelization first simd = 1 # create and insert new node @@ -1732,8 +1734,8 @@ def apply(self, model): [n.output[0]], # output tensor(s) domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - img_dim=[dim_h, dim_w], - channels=num_channels, + ifm_dim=[h, w], + channels=c, data_type = idt0.name, name=n.name, simd=simd From 7727a3f91cc241d7a7cc85019a82a65190ce024c Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Mon, 29 Jul 2024 16:26:59 +0100 Subject: [PATCH 17/49] softmax: hls class stub --- .../custom_op/fpgadataflow/hls/__init__.py | 2 + .../fpgadataflow/hls/quantsoftmax_hls.py | 53 +++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 405c47a08d..8f5a0a7cc7 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -52,6 +52,7 @@ from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls from finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls import VVAU_hls +from finn.custom_op.fpgadataflow.hls.quantsoftmax_hls import QuantSoftmax_hls custom_op = dict() @@ -79,3 +80,4 @@ custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls custom_op["MVAU_hls"] = MVAU_hls custom_op["VVAU_hls"] = VVAU_hls +custom_op["QuantSoftmax_hls"] = QuantSoftmax_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py new 
file mode 100644 index 0000000000..804fe35ab9 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py @@ -0,0 +1,53 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from finn.custom_op.fpgadataflow.quantsoftmax import QuantSoftmax +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend + +class QuantSoftmax_hls(QuantSoftmax, HLSBackend): + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(QuantSoftmax.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def global_includes(self): + # not implemented + raise NotImplementedError + + def defines(self, var): + raise NotImplementedError + + def docompute(self): + raise NotImplementedError + + def blackboxfunction(self): + raise NotImplementedError \ No newline at end of file From 26899d2649be953faf7a6a4a5e273e9db00c2745 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Tue, 30 Jul 2024 08:56:47 +0100 Subject: [PATCH 18/49] softmax: extend test to apply folding config --- .../fpgadataflow/test_fpgadataflow_softmax.py | 38 ++++++++++++++----- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_softmax.py b/tests/fpgadataflow/test_fpgadataflow_softmax.py index e1242b7283..06e091fc9d 100644 --- a/tests/fpgadataflow/test_fpgadataflow_softmax.py +++ b/tests/fpgadataflow/test_fpgadataflow_softmax.py @@ -32,13 +32,13 @@ from onnx import helper, numpy_helper import numpy as np import os +import finn.core.onnx_exec as oxe from brevitas.export import export_qonnx from qonnx.util.cleanup import cleanup as qonnx_cleanup from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.basic import 
gen_finn_dt_tensor, qonnx_make_model from qonnx.transformation.infer_datatypes import InferDataTypes @@ -66,6 +66,13 @@ MakeMaxPoolNHWC, MoveScalarLinearPastInvariants, ) +from qonnx.transformation.general import ( + ApplyConfig, + GiveReadableTensorNames, + GiveUniqueNodeNames, + RemoveStaticGraphInputs, + RemoveUnusedTensors, +) from finn.transformation.streamline import Streamline import finn.transformation.streamline.absorb as absorb import onnx @@ -79,7 +86,7 @@ export_onnx_path = "softmax_dut_qonnx.onnx" ### Make model wrapper -# 1. make node, +# 1. make node, ### Test @@ -91,13 +98,14 @@ # 1. Check that we can run cpp/rtl sims # 2. check values are correct + def create_model(): ''' Create a quantized softmax model. - Input and output are quantized to Int8ActPerTensorFloat, this is to make sure + Input and output are quantized to Int8ActPerTensorFloat, this is to make sure that the softmax layer is followed by a Quant node. ''' - io_shape = (1, 8, 8, 2) + io_shape = (1, 12, 128, 128) class QuantSoftMaxSimple(nn.Module): def __init__(self): super(QuantSoftMaxSimple, self).__init__() @@ -122,34 +130,46 @@ def forward(self, x): return model @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.parametrize("simd", ["simd1", "simd2", "simd3", "simd4"]) @pytest.mark.fpgadataflow -def test_convert_to_hw_softmax_layer(exec_mode): +def test_convert_to_hw_softmax_layer(exec_mode, simd): ''' Test that all transofrmations can be applied to a model with a softmax layer. ''' # Create the qonnx model # modelproto = create_softmax_graph() - + model = create_model() + simd = int(simd[-1]) + folding_config = { + "Defaults": {}, + "QuantSoftmax_0": { + "simd": simd + } + } try: model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) - model.save("qonnx_softmax_dut.onnx") model = model.transform(to_hw.InferQuantSoftmax()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(ApplyConfig(folding_config)) model = model.transform(SpecializeLayers(test_fpga_part)) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) model = model.transform(SetExecMode("cppsim")) elif exec_mode == "rtlsim": - model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) except Exception as e: pytest.fail(f"Failed to transform the model: {str(e)}") + # oxe.execute_onnx() + def test_fpgadataflow_quantsoftmax(): # Create the qonnx model # create_model() @@ -160,6 +180,6 @@ def test_fpgadataflow_quantsoftmax(): model = model.transform(FoldConstants()) model = model.transform(to_hw.InferQuantSoftmax()) model = model.transform(SpecializeLayers(test_fpga_part)) - + except Exception as e: pytest.fail(f"Failed to transform the model: {str(e)}") \ No newline at end of file From f917bfc33400f3961b958af538d0da29147be581 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Tue, 30 Jul 2024 08:57:16 +0100 Subject: [PATCH 19/49] softmax: add Quant to node name --- src/finn/transformation/fpgadataflow/convert_to_hw_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 2966bf1cc0..aef5f6a64c 
100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1737,7 +1737,7 @@ def apply(self, model): ifm_dim=[h, w], channels=c, data_type = idt0.name, - name=n.name, + name="Quant"+n.name, simd=simd ) graph.node.insert(node_ind, new_node) From 74c338ccd8bc1d104bb8eaf31493918118fe947e Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Tue, 30 Jul 2024 08:57:58 +0100 Subject: [PATCH 20/49] softmax: generate hls code --- .../fpgadataflow/hls/quantsoftmax_hls.py | 61 ++++++++++++++++--- .../custom_op/fpgadataflow/quantsoftmax.py | 25 +++++++- 2 files changed, 75 insertions(+), 11 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py index 804fe35ab9..b4ada72a32 100644 --- a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py @@ -32,22 +32,63 @@ class QuantSoftmax_hls(QuantSoftmax, HLSBackend): def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) - + def get_nodeattr_types(self): my_attrs = {} my_attrs.update(QuantSoftmax.get_nodeattr_types(self)) my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - + def global_includes(self): - # not implemented - raise NotImplementedError - + self.code_gen_dict["$GLOBALS$"] = [ + '#include "softmax.hpp"', + '#include "utils.hpp"' + ] + def defines(self, var): - raise NotImplementedError - + simd = self.get_nodeattr("simd") + ibits = self.get_input_datatype().bitwidth() + channels = self.get_nodeattr("channels") + self.code_gen_dict["$DEFINES$"] = [ + f""" + constexpr unsigned SIMD = {simd}; + constexpr unsigned W = {channels}; + using T = ap_uint<{ibits}>; + using F = float; + """ + ] + def docompute(self): - raise NotImplementedError - + self.code_gen_dict["$DOCOMPUTE$"] = [ + f''' + static hls::stream> src0; + static hls::stream> dst0; + + move(src, src0); + smaxquant(src0, dst0); + move(dst0, dst); + ''' + ] + def blackboxfunction(self): - raise NotImplementedError \ No newline at end of file + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + f''' + void {self.onnx_node.name}( + hls::stream> &src, + hls::stream> &dst + ) + ''' + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + f''' + #pragma HLS interface AXIS port=src + #pragma HLS interface AXIS port=dst + #pragma HLS aggregate variable=src compact=bit + #pragma HLS aggregate variable=dst compact=bit + + #pragma HLS interface ap_ctrl_none port=return + #pragma HLS dataflow disable_start_propagation + ''' + ] \ No newline at end of file diff --git a/src/finn/custom_op/fpgadataflow/quantsoftmax.py b/src/finn/custom_op/fpgadataflow/quantsoftmax.py index c37f791270..cac2c1a327 100644 --- a/src/finn/custom_op/fpgadataflow/quantsoftmax.py +++ b/src/finn/custom_op/fpgadataflow/quantsoftmax.py @@ -70,4 +70,27 @@ def infer_node_datatype(self, model): model.set_tensor_datatype(node.output[0], idt) def verify_node(self): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError + + def get_instream_width(self, ind=0): + ibits = self.get_input_datatype().bitwidth() + simd = self.get_nodeattr("simd") + return ibits * simd + + def get_outstream_width(self, ind=0): + obits = self.get_output_datatype().bitwidth() + simd = self.get_nodeattr("simd") + return obits * simd + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output. 
(Same as input datatype)""" + return self.get_input_datatype() + + def get_folded_output_shape(self, ind=0): + normal_oshape = list(self.get_normal_output_shape()) + ifm_ch = self.get_nodeattr("channels") + simd = self.get_nodeattr("simd") + assert ifm_ch % simd == 0, "SIMD must divide input channels" + fold = int(normal_oshape[-1] / simd) + folded_oshape = normal_oshape[:-1] + [fold, simd] + return tuple(folded_oshape) \ No newline at end of file From 41d0f06354721962d8ac4118d4a5dbe3b5824a3c Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Tue, 30 Jul 2024 12:38:13 +0100 Subject: [PATCH 21/49] softmax: use sname for input name --- .../fpgadataflow/hls/quantsoftmax_hls.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py index b4ada72a32..e78c1a3473 100644 --- a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py @@ -64,9 +64,9 @@ def docompute(self): static hls::stream> src0; static hls::stream> dst0; - move(src, src0); + move(in0_{self.hls_sname()}, src0); smaxquant(src0, dst0); - move(dst0, dst); + move(dst0, out_{self.hls_sname()}); ''' ] @@ -74,8 +74,8 @@ def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ f''' void {self.onnx_node.name}( - hls::stream> &src, - hls::stream> &dst + hls::stream> &in0_{self.hls_sname()}, + hls::stream> &out_{self.hls_sname()} ) ''' ] @@ -83,10 +83,10 @@ def blackboxfunction(self): def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ f''' - #pragma HLS interface AXIS port=src - #pragma HLS interface AXIS port=dst - #pragma HLS aggregate variable=src compact=bit - #pragma HLS aggregate variable=dst compact=bit + #pragma HLS interface AXIS port=in0_{self.hls_sname()} + #pragma HLS interface AXIS port=out_{self.hls_sname()} + #pragma HLS aggregate variable=in0_{self.hls_sname()} compact=bit + #pragma HLS aggregate variable=out_{self.hls_sname()} compact=bit #pragma HLS interface ap_ctrl_none port=return #pragma HLS dataflow disable_start_propagation From 4fc3e2c39a42cb8931f725726b9e258d92c67d67 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Tue, 30 Jul 2024 12:38:30 +0100 Subject: [PATCH 22/49] softmax: quantsoftmax replaces softmax+multithreshold --- src/finn/transformation/fpgadataflow/convert_to_hw_layers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index aef5f6a64c..257db2c79a 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1731,7 +1731,7 @@ def apply(self, model): new_node = helper.make_node( "QuantSoftmax", [n.input[0]], # input tensor(s) - [n.output[0]], # output tensor(s) + [consumer.output[0]], # output tensor(s) domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", ifm_dim=[h, w], @@ -1742,6 +1742,8 @@ def apply(self, model): ) graph.node.insert(node_ind, new_node) graph.node.remove(n) + # remove multithreshold too + graph.node.remove(consumer) graph_modified = True if graph_modified: From e26cc5f535a0ae4b53d1d5c22ee542a0d7d4877b Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Wed, 31 Jul 2024 10:37:00 +0100 Subject: [PATCH 23/49] softmax: run stitchedip --- tests/fpgadataflow/test_fpgadataflow_softmax.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) 
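
Note: a small worked example of how the QuantSoftmax shape attributes defined
above translate into tensor shapes and stream widths (the numbers are
illustrative, not taken from the patches):

    ifm_dim = [8, 8]   # (h, w) of the feature map feeding the softmax
    channels = 12      # innermost dimension; softmax is computed along it
    simd = 4           # elements handled per cycle; must divide channels

    normal_shape = (1, ifm_dim[0], ifm_dim[1], channels)   # (1, 8, 8, 12), NHWC
    fold = channels // simd                                 # 3
    folded_shape = normal_shape[:-1] + (fold, simd)         # (1, 8, 8, 3, 4)
    # stream width = input bitwidth * simd, e.g. 8-bit data at simd=4 -> 32-bit AXI stream
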
diff --git a/tests/fpgadataflow/test_fpgadataflow_softmax.py b/tests/fpgadataflow/test_fpgadataflow_softmax.py index 06e091fc9d..a1ba6ef5bb 100644 --- a/tests/fpgadataflow/test_fpgadataflow_softmax.py +++ b/tests/fpgadataflow/test_fpgadataflow_softmax.py @@ -153,6 +153,11 @@ def test_convert_to_hw_softmax_layer(exec_mode, simd): model = model.transform(InferDataTypes()) model = model.transform(to_hw.InferQuantSoftmax()) model = model.transform(GiveUniqueNodeNames()) + # isolate fpga dataflow layers + parent_model = model.transform(CreateDataflowPartition()) + sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] + sdp_node_path = getCustomOp(sdp_node).get_nodeattr("model") + model = ModelWrapper(sdp_node_path) model = model.transform(ApplyConfig(folding_config)) model = model.transform(SpecializeLayers(test_fpga_part)) if exec_mode == "cppsim": @@ -164,7 +169,8 @@ def test_convert_to_hw_softmax_layer(exec_mode, simd): model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) - model = model.transform(PrepareRTLSim()) + model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + # model = model.transform(PrepareRTLSim()) except Exception as e: pytest.fail(f"Failed to transform the model: {str(e)}") From 85b97a6fccb91a64b03a370873e950d79a53fd1e Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Wed, 31 Jul 2024 10:37:19 +0100 Subject: [PATCH 24/49] softmax: hls execute softmax bin --- .../fpgadataflow/hls/quantsoftmax_hls.py | 27 ++++++++++++++++++- .../custom_op/fpgadataflow/quantsoftmax.py | 20 ++++++++++++-- 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py index e78c1a3473..a3980b0749 100644 --- a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py @@ -26,6 +26,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import os +import numpy as np from finn.custom_op.fpgadataflow.quantsoftmax import QuantSoftmax from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend @@ -91,4 +93,27 @@ def pragmas(self): #pragma HLS interface ap_ctrl_none port=return #pragma HLS dataflow disable_start_propagation ''' - ] \ No newline at end of file + ] + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + + if mode == "cppsim": + print("Executing node with cppsim") + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + inp = context[node.input[0]] + export_idt = self.get_input_datatype() + inp = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), inp) + # # execute the precompiled model + super().exec_precompiled_singlenode_model() + # # load output npy file + super().npy_to_dynamic_output(context) + else: + raise Exception(f"Unsupported execution mode: {mode}") + diff --git a/src/finn/custom_op/fpgadataflow/quantsoftmax.py b/src/finn/custom_op/fpgadataflow/quantsoftmax.py index cac2c1a327..47167cbc3c 100644 --- a/src/finn/custom_op/fpgadataflow/quantsoftmax.py +++ b/src/finn/custom_op/fpgadataflow/quantsoftmax.py @@ -4,6 +4,10 @@ from onnx.helper import make_node import warnings from qonnx.core.datatype import DataType +import onnx +from onnx.helper import make_node, make_tensor_value_info +import numpy as np +import torch class QuantSoftmax(HWCustomOp): """Abstraction layer for HW implementation of VectorVectorActivation layers.""" @@ -34,7 +38,10 @@ def get_number_output_values(self): raise NotImplementedError("This function is not yet implemented.") def execute_node(self, context, graph): - raise NotImplementedError + node = self.onnx_node + input_data = context[node.input[0]] + output_data = torch.softmax(input_data, dim=3) + context[node.output[0]] = output_data def get_number_output_values(self): raise NotImplementedError @@ -93,4 +100,13 @@ def get_folded_output_shape(self, ind=0): assert ifm_ch % simd == 0, "SIMD must divide input channels" fold = int(normal_oshape[-1] / simd) folded_oshape = normal_oshape[:-1] + [fold, simd] - return tuple(folded_oshape) \ No newline at end of file + return tuple(folded_oshape) + + def get_folded_input_shape(self, ind=0): + normal_ishape = list(self.get_normal_input_shape()) + ifm_ch = self.get_nodeattr("channels") + simd = self.get_nodeattr("simd") + assert ifm_ch % simd == 0, "SIMD must divide input channels" + fold = int(normal_ishape[-1] / simd) + folded_ishape = normal_ishape[:-1] + [fold, simd] + return tuple(folded_ishape) \ No newline at end of file From f53a8386501cccc09f3be7aa5ec665cf9b2aaef5 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Wed, 31 Jul 2024 13:52:05 +0100 Subject: [PATCH 25/49] softmax: include correct imports --- tests/fpgadataflow/test_fpgadataflow_softmax.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/fpgadataflow/test_fpgadataflow_softmax.py b/tests/fpgadataflow/test_fpgadataflow_softmax.py index a1ba6ef5bb..8e118d6178 100644 --- a/tests/fpgadataflow/test_fpgadataflow_softmax.py +++ b/tests/fpgadataflow/test_fpgadataflow_softmax.py @@ -61,11 +61,15 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN +from 
finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.util.basic import pynq_part_map from finn.transformation.streamline.reorder import ( MakeMaxPoolNHWC, MoveScalarLinearPastInvariants, ) +from finn.transformation.fpgadataflow.create_dataflow_partition import ( + CreateDataflowPartition, +) from qonnx.transformation.general import ( ApplyConfig, GiveReadableTensorNames, From 9dbcc13e1acb6472af79fb49158e4351980d2916 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Wed, 31 Jul 2024 13:52:33 +0100 Subject: [PATCH 26/49] softmax: set preferred impl style --- tests/fpgadataflow/test_fpgadataflow_softmax.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_softmax.py b/tests/fpgadataflow/test_fpgadataflow_softmax.py index 8e118d6178..ff621dd026 100644 --- a/tests/fpgadataflow/test_fpgadataflow_softmax.py +++ b/tests/fpgadataflow/test_fpgadataflow_softmax.py @@ -148,7 +148,8 @@ def test_convert_to_hw_softmax_layer(exec_mode, simd): folding_config = { "Defaults": {}, "QuantSoftmax_0": { - "simd": simd + "simd": simd, + "preferred_impl_style": "hls" } } try: From 843b7b142e0e14a4bf6df1cb293ce25010b224e3 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Wed, 31 Jul 2024 17:46:32 +0100 Subject: [PATCH 27/49] finn: add hls library paths --- docker/finn_entrypoint.sh | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index c7500bcaa6..4004523bad 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -81,14 +81,14 @@ if [ -f "$VITIS_PATH/settings64.sh" ];then export XILINX_XRT=/opt/xilinx/xrt source $VITIS_PATH/settings64.sh gecho "Found Vitis at $VITIS_PATH" - if [ -f "$XILINX_XRT/setup.sh" ];then - # source XRT - source $XILINX_XRT/setup.sh - gecho "Found XRT at $XILINX_XRT" - else - recho "XRT not found on $XILINX_XRT, did you skip the download or did the installation fail?" - exit -1 - fi + # if [ -f "$XILINX_XRT/setup.sh" ];then + # # source XRT + # source $XILINX_XRT/setup.sh + # gecho "Found XRT at $XILINX_XRT" + # else + # recho "XRT not found on $XILINX_XRT, did you skip the download or did the installation fail?" + # exit -1 + # fi else yecho "Unable to find $VITIS_PATH/settings64.sh" yecho "Functionality dependent on Vitis will not be available." 
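The hunk below extends LD_LIBRARY_PATH so that cppsim executables linked against the Vitis HLS floating-point models and csim support libraries can resolve them at run time. A quick sanity check of that environment from Python might look like the following sketch; it only confirms that the directories named in the exports exist under HLS_PATH and is not part of the patch itself:

    import os

    def hls_csim_libs_visible():
        # relies on the HLS_PATH variable the entrypoint script already expects
        hls = os.environ.get("HLS_PATH", "")
        candidates = [
            os.path.join(hls, "lnx64", "lib", "csim"),
            os.path.join(hls, "lnx64", "tools", "fpo_v7_1"),
        ]
        return all(os.path.isdir(p) for p in candidates)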
@@ -137,6 +137,15 @@ else echo "See https://docs.xilinx.com/r/en-US/ug835-vivado-tcl-commands/Tcl-Initialization-Scripts" fi +# add hls library path to LD_LIBRARY_PATH +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/lnx64/tools/fpo_v7_1" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/lnx64/tools/fft_v9_1" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/lnx64/tools/fir_v7_0" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/lnx64/tools/dds_v6_0" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/tps/lnx64/gcc-8.3.0/lib" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/lib/lnx64.o/Rhel" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/lnx64/lib/csim" + export PATH=$PATH:$HOME/.local/bin # execute the provided command(s) as root exec "$@" From 705294797fb4567bca21d34f7c12752054c7dac7 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 1 Aug 2024 10:54:57 +0100 Subject: [PATCH 28/49] cpp compiler: raise exception if compilation fails --- src/finn/util/basic.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 91c191962f..c2e2cbcd8a 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -192,8 +192,12 @@ def build(self, code_gen_dir): f.write("#!/bin/bash \n") f.write(bash_compile + "\n") bash_command = ["bash", self.compile_script] - process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_compile.communicate() + + with open(str(self.code_gen_dir) + "/compile.log", "w") as f: + try: + subprocess.check_output(bash_command, stderr=f) + except subprocess.CalledProcessError: + raise Exception(f"Error in compiling the generated code. Check {f.name} for more details.") def launch_process_helper(args, proc_env=None, cwd=None): From ffeb69ca4d9ccbdd22c806b8622fe01aef77358a Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 1 Aug 2024 10:56:03 +0100 Subject: [PATCH 29/49] softmax: compile node for cpp sim --- .../fpgadataflow/hls/quantsoftmax_hls.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py index a3980b0749..5337f4561b 100644 --- a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py @@ -30,6 +30,8 @@ import numpy as np from finn.custom_op.fpgadataflow.quantsoftmax import QuantSoftmax from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +import subprocess +from finn.util.basic import CppBuilder, get_rtlsim_trace_depth, make_build_dir class QuantSoftmax_hls(QuantSoftmax, HLSBackend): def __init__(self, onnx_node, **kwargs): @@ -117,3 +119,25 @@ def execute_node(self, context, graph): else: raise Exception(f"Unsupported execution mode: {mode}") + def compile_singlenode_code(self): + """Builds the bash script for compilation using the CppBuilder from + finn.util.basic and executes the script to produce the executable.""" + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + builder = CppBuilder() + # to enable additional debug features please uncommand the next line + # builder.append_includes("-DDEBUG") + builder.append_includes("-I$FINN_ROOT/src/finn/qnn-data/cpp") + builder.append_includes("-I$FINN_ROOT/deps/cnpy/") + builder.append_includes("-I$FINN_ROOT/deps/finn-hlslib") + builder.append_includes("-I$FINN_ROOT/custom_hls") + builder.append_includes("-I{}/include".format(os.environ["HLS_PATH"])) + builder.append_includes("--std=c++14") + 
builder.append_includes("-O3") + builder.append_sources(code_gen_dir + "/*.cpp") + builder.append_sources("$FINN_ROOT/deps/cnpy/cnpy.cpp") + builder.append_includes("-lz") + builder.append_includes("-fno-builtin -fno-inline -Wl,-rpath,\"$HLS_PATH/lnx64/lib/csim\" -L$HLS_PATH/lnx64/lib/csim -lhlsmc++-GCC46") + builder.append_includes("-L$HLS_PATH/lnx64/tools/fpo_v7_1 -lgmp -lmpfr -lIp_floating_point_v7_1_bitacc_cmodel") + builder.set_executable_path(code_gen_dir + "/node_model") + builder.build(code_gen_dir) + self.set_nodeattr("executable_path", builder.executable_path) \ No newline at end of file From ef80c8e412dc1b64e2a701c0e66201221a7405db Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 2 Aug 2024 02:36:40 +0100 Subject: [PATCH 30/49] softmax: generate cppsim code --- .../fpgadataflow/hls/quantsoftmax_hls.py | 47 ++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py index 5337f4561b..41541274d2 100644 --- a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py @@ -30,6 +30,7 @@ import numpy as np from finn.custom_op.fpgadataflow.quantsoftmax import QuantSoftmax from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow import templates import subprocess from finn.util.basic import CppBuilder, get_rtlsim_trace_depth, make_build_dir @@ -140,4 +141,48 @@ def compile_singlenode_code(self): builder.append_includes("-L$HLS_PATH/lnx64/tools/fpo_v7_1 -lgmp -lmpfr -lIp_floating_point_v7_1_bitacc_cmodel") builder.set_executable_path(code_gen_dir + "/node_model") builder.build(code_gen_dir) - self.set_nodeattr("executable_path", builder.executable_path) \ No newline at end of file + self.set_nodeattr("executable_path", builder.executable_path) + + def code_generation_cppsim(self, model): + """Generates c++ code for simulation (cppsim).""" + node = self.onnx_node + path = self.get_nodeattr("code_gen_dir_cppsim") + self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())] + self.generate_params(model, path) + self.global_includes() + self.defines("cppsim") + self.read_npy_data() + self.strm_decl() + self.pragmas() + + self.code_gen_dict["$DOCOMPUTE$"] = [ + f""" + static hls::stream> src0; + static hls::stream> dst0; + + hls::vector x; + for(unsigned i=0; i(src0, dst0); + + for(unsigned i=0; i Date: Fri, 2 Aug 2024 11:43:40 +0100 Subject: [PATCH 31/49] softmax: does not suppor rtlsim --- src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py index 41541274d2..3a3cb2b076 100644 --- a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py @@ -186,3 +186,6 @@ def code_generation_cppsim(self, model): code_gen_line = "\n".join(self.code_gen_dict[key]) template = template.replace(key, code_gen_line) f.write(template) + def prepare_rtlsim(self): + # this node currently does not support rtlsim + raise NotImplementedError("QuantSoftmax_hls does not support rtlsim") \ No newline at end of file From 1c150224e322748790a84cf8b8a3986e2e513775 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 2 Aug 2024 11:44:23 +0100 Subject: [PATCH 32/49] softmax: transformation test --- .../fpgadataflow/test_fpgadataflow_softmax.py | 35 
+++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_softmax.py b/tests/fpgadataflow/test_fpgadataflow_softmax.py index ff621dd026..851e86cddd 100644 --- a/tests/fpgadataflow/test_fpgadataflow_softmax.py +++ b/tests/fpgadataflow/test_fpgadataflow_softmax.py @@ -103,22 +103,19 @@ # 2. check values are correct -def create_model(): +def create_model(io_shape=(1, 12, 128, 128)): ''' Create a quantized softmax model. Input and output are quantized to Int8ActPerTensorFloat, this is to make sure that the softmax layer is followed by a Quant node. ''' - io_shape = (1, 12, 128, 128) class QuantSoftMaxSimple(nn.Module): def __init__(self): super(QuantSoftMaxSimple, self).__init__() - # self.input_identity = qnn.QuantIdentity(act_quant=Int8ActPerTensorFloat) self.output_identity = qnn.QuantIdentity() self.softmax = nn.Softmax(dim=3) # softmax along the last dimension def forward(self, x): - # x = self.input_identity(x) x = self.softmax(x) x = self.output_identity(x) return x @@ -130,20 +127,24 @@ def forward(self, x): # set the model input to INT8 model = ModelWrapper(export_onnx_path) model.set_tensor_datatype(model.graph.input[0].name, DataType["UINT8"]) - # import pdb; pdb.set_trace() return model -@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim", "stitched_ip"]) @pytest.mark.parametrize("simd", ["simd1", "simd2", "simd3", "simd4"]) @pytest.mark.fpgadataflow def test_convert_to_hw_softmax_layer(exec_mode, simd): ''' - Test that all transofrmations can be applied to a model with a softmax layer. + This test checks that the softmax layer can be converted to a HW layer. ''' + if (exec_mode == "stitched_ip" or exec_mode == "rtlsim") and simd != "simd1": + pytest.skip("Skipping this test to avoid long test times") # Create the qonnx model - # modelproto = create_softmax_graph() + io_shape = (1, 12, 128, 128) + # input = torch.randn(io_shape) + input = gen_finn_dt_tensor(DataType["UINT8"], io_shape) + input_t = {"global_in": input} - model = create_model() + model = create_model(io_shape) simd = int(simd[-1]) folding_config = { "Defaults": {}, @@ -165,22 +166,28 @@ def test_convert_to_hw_softmax_layer(exec_mode, simd): model = ModelWrapper(sdp_node_path) model = model.transform(ApplyConfig(folding_config)) model = model.transform(SpecializeLayers(test_fpga_part)) + model = model.transform(GiveUniqueNodeNames()) if exec_mode == "cppsim": + model = model.transform(SetExecMode("cppsim")) model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) - model = model.transform(SetExecMode("cppsim")) elif exec_mode == "rtlsim": model = model.transform(SetExecMode("rtlsim")) - model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + try: + model = model.transform(PrepareRTLSim()) + pytest.fail("PrepareRTLSim should have failed") + except Exception as e: + # expected to fail because this node do not support rtlsim + pass + elif exec_mode == "stitched_ip": model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) - # model = model.transform(PrepareRTLSim()) except Exception as e: pytest.fail(f"Failed to transform the model: {str(e)}") - # oxe.execute_onnx() - def test_fpgadataflow_quantsoftmax(): # Create the qonnx model # create_model() From 
3a78f7953e6172dca7adaf96094588e6c8aaeeb4 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Tue, 6 Aug 2024 09:14:32 +0100 Subject: [PATCH 33/49] softmax: more generic testcase --- .../fpgadataflow/test_fpgadataflow_softmax.py | 66 ++++++++++++++++--- 1 file changed, 58 insertions(+), 8 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_softmax.py b/tests/fpgadataflow/test_fpgadataflow_softmax.py index 851e86cddd..b4e0129a5c 100644 --- a/tests/fpgadataflow/test_fpgadataflow_softmax.py +++ b/tests/fpgadataflow/test_fpgadataflow_softmax.py @@ -129,6 +129,42 @@ def forward(self, x): model.set_tensor_datatype(model.graph.input[0].name, DataType["UINT8"]) return model +def make_single_quantsoftmax_modelwrapper(impl_style="hls", simd=1, idt=DataType["UINT8"], ifm_dim=(128, 128), channels=12): + ''' + Create a single quantized softmax node with variable parameters. + this is before SpecializeLayers() transformation. + ''' + h = ifm_dim[0] + w = ifm_dim[1] + + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, h, w, channels]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, h, w, channels]) + new_node = helper.make_node( + "QuantSoftmax", + ["inp"], + ["outp"], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ifm_dim=[h, w], + channels=channels, + data_type = idt.name, + simd=simd, + preferred_impl_style=impl_style, + ) + graph = helper.make_graph( + [new_node], + "softmax_graph", + inputs=[inp], + outputs=[outp] + ) + model = qonnx_make_model(graph, producer_name="fmpadding-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", idt) + + return model + @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim", "stitched_ip"]) @pytest.mark.parametrize("simd", ["simd1", "simd2", "simd3", "simd4"]) @pytest.mark.fpgadataflow @@ -188,16 +224,30 @@ def test_convert_to_hw_softmax_layer(exec_mode, simd): except Exception as e: pytest.fail(f"Failed to transform the model: {str(e)}") -def test_fpgadataflow_quantsoftmax(): + +@pytest.mark.parametrize("impl_style", ["hls","rtl"]) +@pytest.mark.parametrize("simd", ["simd1", "simd2", "simd3", "simd4"]) +@pytest.mark.parametrize("idt", [DataType["INT2"], DataType["INT4"]]) +@pytest.mark.parametrize("ifm_dim", [(12,128)]) +@pytest.mark.parametrize("channels", [128, 384]) +@pytest.mark.fpgadataflow +def test_fpga_dataflow_quantsoftmax(impl_style, simd, idt, ifm_dim, channels): + simd = int(simd[-1]) + model = make_single_quantsoftmax_modelwrapper(impl_style=impl_style, simd=simd, idt=idt, ifm_dim=ifm_dim, channels=channels) + # Create the qonnx model - # create_model() - model = create_model() + io_shape = (1, 12, 128, 128) + # input = torch.randn(io_shape) + input = gen_finn_dt_tensor(DataType["UINT8"], io_shape) + input_t = {"global_in": input} + try: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - model = model.transform(FoldConstants()) - model = model.transform(to_hw.InferQuantSoftmax()) model = model.transform(SpecializeLayers(test_fpga_part)) - + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(SetExecMode("cppsim")) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + # run the model + oxe.execute_onnx(model, input_t) except Exception as e: pytest.fail(f"Failed to transform the model: {str(e)}") \ No newline at end of file From cd8d27080f32db3f1e360800ae29f5a36532133c Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Wed, 7 Aug 2024 
15:26:31 +0100 Subject: [PATCH 34/49] hlsbackend: handle subprocess exceptions and log them --- src/finn/custom_op/fpgadataflow/hlsbackend.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py index d8210fd684..5436aa31af 100644 --- a/src/finn/custom_op/fpgadataflow/hlsbackend.py +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -307,16 +307,21 @@ def npy_to_dynamic_outputs(self, context, npy_list): def exec_precompiled_singlenode_model(self): """Executes precompiled executable.""" - executable_path = self.get_nodeattr("executable_path") - if executable_path == "": + executable = self.get_nodeattr("executable_path") + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + if executable == "": raise Exception( """ Found no executable for this node, did you run the codegen and compilation transformations? """ ) - process_execute = subprocess.Popen(executable_path, stdout=subprocess.PIPE) - process_execute.communicate() + with open(code_gen_dir + "/sim.log", "w") as f: + try: + subprocess.check_output(executable, stderr=f) + except subprocess.CalledProcessError: + raise Exception(f"Error running the generated code. Check {f.name} for more details.") + def hls_sname(self): """Get the naming convention used by Vitis HLS for stream signals From f6a7b8b5295808f5f634f0122e224fc857e2e2a8 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Wed, 7 Aug 2024 15:28:29 +0100 Subject: [PATCH 35/49] cpp template: try catch in cppsim templates --- src/finn/custom_op/fpgadataflow/templates.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index 3d89a0ab23..920711909a 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -42,18 +42,22 @@ $DEFINES$ int main(){ -$PRAGMAS$ -$STREAMDECLARATIONS$ + $PRAGMAS$ -$READNPYDATA$ + try { + $STREAMDECLARATIONS$ -$DOCOMPUTE$ + $READNPYDATA$ -$DATAOUTSTREAM$ + $DOCOMPUTE$ -$SAVEASCNPY$ + $DATAOUTSTREAM$ + $SAVEASCNPY$ + } catch (const std::exception& e) { + std::cerr << "Error: " << e.what() << std::endl; + } } """ From 618a529e9963e2739158aee689e3414e204b06d8 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 8 Aug 2024 09:27:41 +0100 Subject: [PATCH 36/49] softmax: generate cppsim with npyvector stream --- .../fpgadataflow/hls/quantsoftmax_hls.py | 51 ++++++++----------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py index 3a3cb2b076..ac9abd86c0 100644 --- a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py @@ -31,9 +31,7 @@ from finn.custom_op.fpgadataflow.quantsoftmax import QuantSoftmax from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from finn.custom_op.fpgadataflow import templates -import subprocess -from finn.util.basic import CppBuilder, get_rtlsim_trace_depth, make_build_dir - +from finn.util.basic import CppBuilder class QuantSoftmax_hls(QuantSoftmax, HLSBackend): def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -46,19 +44,21 @@ def get_nodeattr_types(self): def global_includes(self): self.code_gen_dict["$GLOBALS$"] = [ + '#include "npy2vectorstream.hpp"', + '#include "debug_print.hpp"', '#include "softmax.hpp"', 
'#include "utils.hpp"' ] def defines(self, var): simd = self.get_nodeattr("simd") - ibits = self.get_input_datatype().bitwidth() + dtype = self.get_input_datatype() channels = self.get_nodeattr("channels") self.code_gen_dict["$DEFINES$"] = [ f""" constexpr unsigned SIMD = {simd}; constexpr unsigned W = {channels}; - using T = ap_uint<{ibits}>; + using T = {dtype.get_hls_datatype_str()}; using F = float; """ ] @@ -101,17 +101,13 @@ def pragmas(self): def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() folded_ishape = self.get_folded_input_shape() - if mode == "cppsim": - print("Executing node with cppsim") code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") inp = context[node.input[0]] export_idt = self.get_input_datatype() - inp = inp.reshape(folded_ishape) + # inp = inp.reshape(folded_ishape) np.save(os.path.join(code_gen_dir, "input_0.npy"), inp) # # execute the precompiled model super().exec_precompiled_singlenode_model() @@ -145,36 +141,32 @@ def compile_singlenode_code(self): def code_generation_cppsim(self, model): """Generates c++ code for simulation (cppsim).""" + self.code_gen_dict["$READNPYDATA$"] = [""] + self.code_gen_dict["$DATAOUTSTREAM$"] = [""] + self.code_gen_dict["$STREAMDECLARATIONS$"] = [""] node = self.onnx_node path = self.get_nodeattr("code_gen_dir_cppsim") self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())] self.generate_params(model, path) self.global_includes() self.defines("cppsim") - self.read_npy_data() - self.strm_decl() self.pragmas() - + oshape = self.get_normal_output_shape() + oshape_str = str(oshape).replace("(", "{").replace(")", "}") self.code_gen_dict["$DOCOMPUTE$"] = [ - f""" - static hls::stream> src0; - static hls::stream> dst0; + f''' + static hls::stream> in0_V; + static hls::stream> out_V; - hls::vector x; - for(unsigned i=0; i(src0, dst0); + npy2vectorstream("{path}/input_0.npy", in0_V); + + for (unsigned i = 0; i < 300; i++){{ + smaxquant(in0_V, out_V); + }} - for(unsigned i=0; i(out_V,{oshape_str}, "{path}/output.npy"); + ''' ] - self.dataoutstrm() self.save_as_npy() template = templates.docompute_template @@ -186,6 +178,7 @@ def code_generation_cppsim(self, model): code_gen_line = "\n".join(self.code_gen_dict[key]) template = template.replace(key, code_gen_line) f.write(template) + def prepare_rtlsim(self): # this node currently does not support rtlsim raise NotImplementedError("QuantSoftmax_hls does not support rtlsim") \ No newline at end of file From 89dfd56f796616e58c87e8f61f3e88dbe909a1cc Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 8 Aug 2024 09:29:00 +0100 Subject: [PATCH 37/49] softmax: functional model --- .../custom_op/fpgadataflow/quantsoftmax.py | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/quantsoftmax.py b/src/finn/custom_op/fpgadataflow/quantsoftmax.py index 47167cbc3c..ac9c17fb63 100644 --- a/src/finn/custom_op/fpgadataflow/quantsoftmax.py +++ b/src/finn/custom_op/fpgadataflow/quantsoftmax.py @@ -1,14 +1,11 @@ from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -from finn.util.data_packing import numpy_to_hls_code, pack_innermost_dim_as_hex_string from onnx.helper import make_node import warnings from qonnx.core.datatype import DataType -import onnx -from onnx.helper import make_node, make_tensor_value_info +from onnx.helper import make_node import numpy as np -import torch - 
+from scipy.special import softmax class QuantSoftmax(HWCustomOp): """Abstraction layer for HW implementation of VectorVectorActivation layers.""" @@ -37,11 +34,22 @@ def get_normal_output_shape(self, ind=0): def get_number_output_values(self): raise NotImplementedError("This function is not yet implemented.") + def quantise_to_int(self, arr, dtype): + max_val = np.iinfo(dtype).max + output = np.zeros_like(arr, dtype=dtype) + frac_part = arr - np.floor(arr) + scaled_frac = frac_part * max_val + output = scaled_frac.astype(dtype) + output[arr >= 1.0] = max_val + return output + def execute_node(self, context, graph): node = self.onnx_node input_data = context[node.input[0]] - output_data = torch.softmax(input_data, dim=3) - context[node.output[0]] = output_data + output_data = softmax(input_data, axis=-1) + qsm_out = self.quantise_to_int(output_data, np.int8) + context[node.output[0]] = qsm_out + def get_number_output_values(self): raise NotImplementedError From d98561a86e7fad94537feb7f39ceb4b5d9765d01 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 8 Aug 2024 09:30:12 +0100 Subject: [PATCH 38/49] softmax: clean up prints --- .../fpgadataflow/convert_to_hw_layers.py | 1 - tests/fpgadataflow/test_fpgadataflow_softmax.py | 14 -------------- 2 files changed, 15 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 257db2c79a..e400e4335f 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1715,7 +1715,6 @@ def apply(self, model): # check that an optype of Softmax is present followed by a MultiThreshold consumer = model.find_consumer(n.output[0]) if n.op_type == "Softmax" and consumer is not None and consumer.op_type == "MultiThreshold": - print("Found Softmax followed by MultiThreshold") # get the shape of the input/output tensor input_shape = model.get_tensor_shape(n.input[0]) assert input_shape == model.get_tensor_shape(consumer.input[0]), ( diff --git a/tests/fpgadataflow/test_fpgadataflow_softmax.py b/tests/fpgadataflow/test_fpgadataflow_softmax.py index b4e0129a5c..db5f7fe053 100644 --- a/tests/fpgadataflow/test_fpgadataflow_softmax.py +++ b/tests/fpgadataflow/test_fpgadataflow_softmax.py @@ -89,20 +89,6 @@ target_clk_ns = 5 export_onnx_path = "softmax_dut_qonnx.onnx" -### Make model wrapper -# 1. make node, - - -### Test -## 1. Compiler integration -# 1. check all transforms can be applied to a model with a softmax layer -# 2. Check that IP stitching produces valid HLS package - -## 2. Functionality test -# 1. Check that we can run cpp/rtl sims -# 2. check values are correct - - def create_model(io_shape=(1, 12, 128, 128)): ''' Create a quantized softmax model. 
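For reference, the functional model introduced in PATCH 37 above reduces to a softmax over the channel axis followed by a simple fraction-to-integer mapping. A self-contained NumPy sketch that mirrors quantise_to_int and execute_node (function names chosen for illustration):

    import numpy as np
    from scipy.special import softmax

    def quantsoftmax_ref(x, out_dtype=np.int8):
        # softmax over the innermost (channel) axis, as in execute_node
        y = softmax(x, axis=-1)
        max_val = np.iinfo(out_dtype).max
        # keep only the fractional part and scale it into the integer range
        frac = y - np.floor(y)
        q = (frac * max_val).astype(out_dtype)
        q[y >= 1.0] = max_val  # saturate anything that reaches 1.0
        return q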
From 2eaec1e2a7f59539c2aeba86313e50e4dfcc61c0 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 8 Aug 2024 09:31:33 +0100 Subject: [PATCH 39/49] softmax: clear up functional test --- .../fpgadataflow/test_fpgadataflow_softmax.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_softmax.py b/tests/fpgadataflow/test_fpgadataflow_softmax.py index db5f7fe053..6824c02913 100644 --- a/tests/fpgadataflow/test_fpgadataflow_softmax.py +++ b/tests/fpgadataflow/test_fpgadataflow_softmax.py @@ -148,7 +148,7 @@ def make_single_quantsoftmax_modelwrapper(impl_style="hls", simd=1, idt=DataType model.set_tensor_datatype("inp", idt) model.set_tensor_datatype("outp", idt) - + return model @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim", "stitched_ip"]) @@ -213,19 +213,20 @@ def test_convert_to_hw_softmax_layer(exec_mode, simd): @pytest.mark.parametrize("impl_style", ["hls","rtl"]) @pytest.mark.parametrize("simd", ["simd1", "simd2", "simd3", "simd4"]) -@pytest.mark.parametrize("idt", [DataType["INT2"], DataType["INT4"]]) +@pytest.mark.parametrize("idt", [DataType["UINT8"],DataType["INT8"],DataType["INT4"],DataType["UINT4"]]) @pytest.mark.parametrize("ifm_dim", [(12,128)]) @pytest.mark.parametrize("channels", [128, 384]) @pytest.mark.fpgadataflow def test_fpga_dataflow_quantsoftmax(impl_style, simd, idt, ifm_dim, channels): simd = int(simd[-1]) model = make_single_quantsoftmax_modelwrapper(impl_style=impl_style, simd=simd, idt=idt, ifm_dim=ifm_dim, channels=channels) - + # Create the qonnx model - io_shape = (1, 12, 128, 128) - # input = torch.randn(io_shape) - input = gen_finn_dt_tensor(DataType["UINT8"], io_shape) - input_t = {"global_in": input} + io_shape = (1, ifm_dim[0], ifm_dim[1], channels) + input = gen_finn_dt_tensor(idt, io_shape) + input_t = {"inp": input} + + y_expected = oxe.execute_onnx(model, input_t)["outp"] try: model = model.transform(SpecializeLayers(test_fpga_part)) @@ -234,6 +235,7 @@ def test_fpga_dataflow_quantsoftmax(impl_style, simd, idt, ifm_dim, channels): model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) # run the model - oxe.execute_onnx(model, input_t) + y_hw = oxe.execute_onnx(model, input_t)["outp"] + assert (y_hw == y_expected).all(), "HW layer execution failed" except Exception as e: pytest.fail(f"Failed to transform the model: {str(e)}") \ No newline at end of file From 42c810feeb0764b4d23f8cbbe1b1086994d8eec3 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 8 Aug 2024 09:39:19 +0100 Subject: [PATCH 40/49] softmax: clean up unused modules in test --- .../fpgadataflow/test_fpgadataflow_softmax.py | 24 +------------------ 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_softmax.py b/tests/fpgadataflow/test_fpgadataflow_softmax.py index 6824c02913..9ba35e7d8d 100644 --- a/tests/fpgadataflow/test_fpgadataflow_softmax.py +++ b/tests/fpgadataflow/test_fpgadataflow_softmax.py @@ -28,9 +28,7 @@ import pytest import torch -import onnx -from onnx import helper, numpy_helper -import numpy as np +from onnx import helper import os import finn.core.onnx_exec as oxe from brevitas.export import export_qonnx @@ -42,17 +40,8 @@ from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model from qonnx.transformation.infer_datatypes import InferDataTypes -from qonnx.transformation.fold_constants import FoldConstants -from finn.transformation.streamline.absorb import 
( - AbsorbAddIntoMultiThreshold, - AbsorbMulIntoMultiThreshold, - FactorOutMulSignMagnitude, - Absorb1BitMulIntoConv, -) import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw -from brevitas.quant.scaled_int import Int8ActPerTensorFloat, Int8WeightPerTensorFloat import finn.core.onnx_exec as oxe -from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim @@ -62,26 +51,15 @@ from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP -from finn.util.basic import pynq_part_map -from finn.transformation.streamline.reorder import ( - MakeMaxPoolNHWC, - MoveScalarLinearPastInvariants, -) from finn.transformation.fpgadataflow.create_dataflow_partition import ( CreateDataflowPartition, ) from qonnx.transformation.general import ( ApplyConfig, - GiveReadableTensorNames, GiveUniqueNodeNames, - RemoveStaticGraphInputs, - RemoveUnusedTensors, ) -from finn.transformation.streamline import Streamline import finn.transformation.streamline.absorb as absorb -import onnx from onnx import helper -import onnxruntime import torch import torch.nn as nn import brevitas.nn as qnn From 4b49c66569669862fb24909e3d90dd5fb7b6dd9e Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 8 Aug 2024 11:23:42 +0100 Subject: [PATCH 41/49] softmax: use folded output shape --- src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py index ac9abd86c0..71f1b30b40 100644 --- a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py @@ -107,7 +107,7 @@ def execute_node(self, context, graph): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") inp = context[node.input[0]] export_idt = self.get_input_datatype() - # inp = inp.reshape(folded_ishape) + inp = inp.reshape(folded_ishape) np.save(os.path.join(code_gen_dir, "input_0.npy"), inp) # # execute the precompiled model super().exec_precompiled_singlenode_model() @@ -151,7 +151,7 @@ def code_generation_cppsim(self, model): self.global_includes() self.defines("cppsim") self.pragmas() - oshape = self.get_normal_output_shape() + oshape = self.get_folded_output_shape() oshape_str = str(oshape).replace("(", "{").replace(")", "}") self.code_gen_dict["$DOCOMPUTE$"] = [ f''' @@ -160,7 +160,7 @@ def code_generation_cppsim(self, model): npy2vectorstream("{path}/input_0.npy", in0_V); - for (unsigned i = 0; i < 300; i++){{ + for (unsigned i = 0; i < 900; i++){{ smaxquant(in0_V, out_V); }} From 1ff725fb82c02a2e1e8640b3656a0af82d78126a Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 8 Aug 2024 16:46:52 +0100 Subject: [PATCH 42/49] softmax: use onnx graph to validate the finn integration --- .../fpgadataflow/test_fpgadataflow_softmax.py | 82 +++++++++++-------- 1 file changed, 48 insertions(+), 34 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_softmax.py b/tests/fpgadataflow/test_fpgadataflow_softmax.py index 9ba35e7d8d..358a278bb9 100644 --- a/tests/fpgadataflow/test_fpgadataflow_softmax.py +++ 
b/tests/fpgadataflow/test_fpgadataflow_softmax.py @@ -29,7 +29,6 @@ import pytest import torch from onnx import helper -import os import finn.core.onnx_exec as oxe from brevitas.export import export_qonnx from qonnx.util.cleanup import cleanup as qonnx_cleanup @@ -41,7 +40,6 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model from qonnx.transformation.infer_datatypes import InferDataTypes import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw -import finn.core.onnx_exec as oxe from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim @@ -63,35 +61,39 @@ import torch import torch.nn as nn import brevitas.nn as qnn +import numpy as np test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 -export_onnx_path = "softmax_dut_qonnx.onnx" +export_onnx_path = "pytest_quantsoftmax_dut.onnx" + +class QuantSoftMaxSimple(nn.Module): + def __init__(self, bit_width=8, signed=True): + super(QuantSoftMaxSimple, self).__init__() + self.output_identity = qnn.QuantIdentity(bit_width=bit_width, scaling_per_tensor=True, bias=False, signed = signed) + self.softmax = nn.Softmax(dim=3) # softmax along the last dimension + + def get_quant_scale(self): + return self.output_identity.quant_act_scale() -def create_model(io_shape=(1, 12, 128, 128)): + def forward(self, x): + x = self.softmax(x) + x = self.output_identity(x) + return x + +def create_model(io_shape=(1, 12, 128, 128), idt=DataType["INT8"]): ''' Create a quantized softmax model. Input and output are quantized to Int8ActPerTensorFloat, this is to make sure that the softmax layer is followed by a Quant node. ''' - class QuantSoftMaxSimple(nn.Module): - def __init__(self): - super(QuantSoftMaxSimple, self).__init__() - self.output_identity = qnn.QuantIdentity() - self.softmax = nn.Softmax(dim=3) # softmax along the last dimension - - def forward(self, x): - x = self.softmax(x) - x = self.output_identity(x) - return x - - dut = QuantSoftMaxSimple() - input = torch.randn(io_shape) + dut = QuantSoftMaxSimple(idt.bitwidth(), idt.signed()) + input = torch.rand(io_shape) export_qonnx(dut, input, export_onnx_path, opset_version=11) qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) - # set the model input to INT8 + # set the model input to UINT8 model = ModelWrapper(export_onnx_path) - model.set_tensor_datatype(model.graph.input[0].name, DataType["UINT8"]) - return model + model.set_tensor_datatype(model.graph.input[0].name, idt) + return model, dut.get_quant_scale() def make_single_quantsoftmax_modelwrapper(impl_style="hls", simd=1, idt=DataType["UINT8"], ifm_dim=(128, 128), channels=12): ''' @@ -101,12 +103,12 @@ def make_single_quantsoftmax_modelwrapper(impl_style="hls", simd=1, idt=DataType h = ifm_dim[0] w = ifm_dim[1] - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, h, w, channels]) - outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, h, w, channels]) + inp = helper.make_tensor_value_info("global_in", TensorProto.FLOAT, [1, h, w, channels]) + outp = helper.make_tensor_value_info("global_out", TensorProto.FLOAT, [1, h, w, channels]) new_node = helper.make_node( "QuantSoftmax", - ["inp"], - ["outp"], + ["global_in"], + ["global_out"], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", ifm_dim=[h, w], @@ -121,11 +123,11 @@ def make_single_quantsoftmax_modelwrapper(impl_style="hls", simd=1, idt=DataType inputs=[inp], 
outputs=[outp] ) - model = qonnx_make_model(graph, producer_name="fmpadding-model") + model = qonnx_make_model(graph) model = ModelWrapper(model) - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", idt) + model.set_tensor_datatype("global_in", idt) + model.set_tensor_datatype("global_out", idt) return model @@ -144,7 +146,8 @@ def test_convert_to_hw_softmax_layer(exec_mode, simd): input = gen_finn_dt_tensor(DataType["UINT8"], io_shape) input_t = {"global_in": input} - model = create_model(io_shape) + model, _ = create_model(io_shape) + simd = int(simd[-1]) folding_config = { "Defaults": {}, @@ -192,19 +195,28 @@ def test_convert_to_hw_softmax_layer(exec_mode, simd): @pytest.mark.parametrize("impl_style", ["hls","rtl"]) @pytest.mark.parametrize("simd", ["simd1", "simd2", "simd3", "simd4"]) @pytest.mark.parametrize("idt", [DataType["UINT8"],DataType["INT8"],DataType["INT4"],DataType["UINT4"]]) -@pytest.mark.parametrize("ifm_dim", [(12,128)]) -@pytest.mark.parametrize("channels", [128, 384]) +@pytest.mark.parametrize("ifm_dim", [(12,12)]) +@pytest.mark.parametrize("channels", [12, 384]) @pytest.mark.fpgadataflow def test_fpga_dataflow_quantsoftmax(impl_style, simd, idt, ifm_dim, channels): simd = int(simd[-1]) + model = make_single_quantsoftmax_modelwrapper(impl_style=impl_style, simd=simd, idt=idt, ifm_dim=ifm_dim, channels=channels) # Create the qonnx model io_shape = (1, ifm_dim[0], ifm_dim[1], channels) + input = gen_finn_dt_tensor(idt, io_shape) - input_t = {"inp": input} + input_t = {"global_in": input} - y_expected = oxe.execute_onnx(model, input_t)["outp"] + # Create reference values using the qonnx model + ref_model, scale = create_model(io_shape, idt) + y_ref = oxe.execute_onnx(ref_model, input_t)["global_out"] + y_ref = y_ref / scale + y_ref = y_ref.numpy() + + y_out = oxe.execute_onnx(model, input_t)["global_out"] + assert np.allclose(y_ref, y_out, atol=5), "Model output does not match expected output" try: model = model.transform(SpecializeLayers(test_fpga_part)) @@ -213,7 +225,9 @@ def test_fpga_dataflow_quantsoftmax(impl_style, simd, idt, ifm_dim, channels): model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) # run the model - y_hw = oxe.execute_onnx(model, input_t)["outp"] - assert (y_hw == y_expected).all(), "HW layer execution failed" + y_hw = oxe.execute_onnx(model, input_t)["global_out"] + + assert np.allclose(y_ref, y_hw, atol=5), "Model output does not match expected output" + except Exception as e: pytest.fail(f"Failed to transform the model: {str(e)}") \ No newline at end of file From 949d1e8c8c4b8272d38d05c950ba964e7a77b098 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 8 Aug 2024 17:43:11 +0100 Subject: [PATCH 43/49] softmax: update latest hls implementation --- src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py | 8 ++++---- src/finn/custom_op/fpgadataflow/templates.py | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py index 71f1b30b40..697bd2cfa6 100644 --- a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py @@ -44,10 +44,10 @@ def get_nodeattr_types(self): def global_includes(self): self.code_gen_dict["$GLOBALS$"] = [ - '#include "npy2vectorstream.hpp"', - '#include "debug_print.hpp"', + '#include ', '#include "softmax.hpp"', - '#include "utils.hpp"' + '#include "utils.hpp"', + '#include "sm_utils.hpp"' ] 
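In the reworked test above, the Brevitas reference output is divided by the QuantIdentity scale before it is compared against the integer output of the hardware layer, so the accelerator is expected to produce roughly y_float / scale as integer codes. A compact sketch of that comparison (the helper name and the explicit rounding are illustrative; the test itself uses np.allclose with an absolute tolerance):

    import numpy as np

    def matches_reference(y_float, y_hw_int, scale, atol=2):
        # dequantised reference -> integer codes expected from the accelerator
        y_ref_int = np.round(y_float / scale)
        return np.allclose(y_ref_int, y_hw_int, atol=atol)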
def defines(self, var): @@ -70,7 +70,7 @@ def docompute(self): static hls::stream> dst0; move(in0_{self.hls_sname()}, src0); - smaxquant(src0, dst0); + smaxquant(src0, dst0); move(dst0, out_{self.hls_sname()}); ''' ] diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index 920711909a..8c9e99a578 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -32,6 +32,7 @@ #define AP_INT_MAX_W $AP_INT_MAX_W$ #include "cnpy.h" #include "npy2apintstream.hpp" +#include "npy2vectorstream.hpp" #include #include "bnn-library.h" From 98ac566cb1b97df0ac35a6560881bf87748a045e Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Mon, 12 Aug 2024 07:31:38 +0100 Subject: [PATCH 44/49] softmax: cpp sim stream size check --- src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py index 697bd2cfa6..d759535da8 100644 --- a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py @@ -47,7 +47,6 @@ def global_includes(self): '#include ', '#include "softmax.hpp"', '#include "utils.hpp"', - '#include "sm_utils.hpp"' ] def defines(self, var): @@ -159,8 +158,9 @@ def code_generation_cppsim(self, model): static hls::stream> out_V; npy2vectorstream("{path}/input_0.npy", in0_V); + int stream_size = in0_V.size() - 1; - for (unsigned i = 0; i < 900; i++){{ + while(out_V.size() != stream_size){{ smaxquant(in0_V, out_V); }} From 43b15774d8d980e89d931f163ce761ae17fa8f47 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Mon, 12 Aug 2024 14:20:55 +0100 Subject: [PATCH 45/49] softmax: fix expected stream size --- src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py index d759535da8..19903866b3 100644 --- a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py @@ -158,7 +158,7 @@ def code_generation_cppsim(self, model): static hls::stream> out_V; npy2vectorstream("{path}/input_0.npy", in0_V); - int stream_size = in0_V.size() - 1; + int stream_size = in0_V.size(); while(out_V.size() != stream_size){{ smaxquant(in0_V, out_V); From 8297bb44aef271ebda24121332d416e2f8c88420 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Mon, 12 Aug 2024 14:21:15 +0100 Subject: [PATCH 46/49] softmax: add debug prints into testbench --- tests/fpgadataflow/test_fpgadataflow_softmax.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_softmax.py b/tests/fpgadataflow/test_fpgadataflow_softmax.py index 358a278bb9..7d5b1b7782 100644 --- a/tests/fpgadataflow/test_fpgadataflow_softmax.py +++ b/tests/fpgadataflow/test_fpgadataflow_softmax.py @@ -227,7 +227,17 @@ def test_fpga_dataflow_quantsoftmax(impl_style, simd, idt, ifm_dim, channels): # run the model y_hw = oxe.execute_onnx(model, input_t)["global_out"] - assert np.allclose(y_ref, y_hw, atol=5), "Model output does not match expected output" + # loop through the output tensor and compare the values + tollerance = 2 + + # Debug prints to help identify the failing values + for i in range(len(y_ref)): + for j in range(len(y_ref[i])): + for k in 
range(len(y_ref[i][j])): + for l in range(len(y_ref[i][j][k])): + if np.allclose(y_ref[i][j][k][l], y_hw[i][j][k][l], atol=tollerance) == False: + print(f"| {i},{j},{k},{l:<2} | {y_ref[i][j][k][l]:<4.0f} | {y_hw[i][j][k][l]:<4.0f} | {y_ref[i][j][k][l] - y_hw[i][j][k][l]:<4.0f} |") + except Exception as e: pytest.fail(f"Failed to transform the model: {str(e)}") \ No newline at end of file From abefce6cdf593269ee14f8e561b58fbf9853e1da Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Mon, 12 Aug 2024 15:05:24 +0100 Subject: [PATCH 47/49] softmax: simplify testcase --- .../fpgadataflow/test_fpgadataflow_softmax.py | 51 +++++++++---------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_softmax.py b/tests/fpgadataflow/test_fpgadataflow_softmax.py index 7d5b1b7782..c813bc3ff9 100644 --- a/tests/fpgadataflow/test_fpgadataflow_softmax.py +++ b/tests/fpgadataflow/test_fpgadataflow_softmax.py @@ -100,8 +100,8 @@ def make_single_quantsoftmax_modelwrapper(impl_style="hls", simd=1, idt=DataType Create a single quantized softmax node with variable parameters. this is before SpecializeLayers() transformation. ''' - h = ifm_dim[0] - w = ifm_dim[1] + h = ifm_dim[1] + w = ifm_dim[2] inp = helper.make_tensor_value_info("global_in", TensorProto.FLOAT, [1, h, w, channels]) outp = helper.make_tensor_value_info("global_out", TensorProto.FLOAT, [1, h, w, channels]) @@ -192,19 +192,20 @@ def test_convert_to_hw_softmax_layer(exec_mode, simd): pytest.fail(f"Failed to transform the model: {str(e)}") -@pytest.mark.parametrize("impl_style", ["hls","rtl"]) +@pytest.mark.parametrize("impl_style", ["hls"]) @pytest.mark.parametrize("simd", ["simd1", "simd2", "simd3", "simd4"]) -@pytest.mark.parametrize("idt", [DataType["UINT8"],DataType["INT8"],DataType["INT4"],DataType["UINT4"]]) -@pytest.mark.parametrize("ifm_dim", [(12,12)]) -@pytest.mark.parametrize("channels", [12, 384]) +@pytest.mark.parametrize("idt", ["INT8"]) +@pytest.mark.parametrize("ifm_dim", [(1, 12, 12, 12), (1, 128, 128, 384)]) @pytest.mark.fpgadataflow -def test_fpga_dataflow_quantsoftmax(impl_style, simd, idt, ifm_dim, channels): +def test_fpga_dataflow_quantsoftmax(impl_style, simd, idt, ifm_dim): + idt = DataType[idt] simd = int(simd[-1]) + io_shape = (ifm_dim[0], ifm_dim[1], ifm_dim[2], ifm_dim[3]) + tollerance = 2 + model = make_single_quantsoftmax_modelwrapper(impl_style=impl_style, simd=simd, idt=idt, ifm_dim=ifm_dim, channels=ifm_dim[3]) - model = make_single_quantsoftmax_modelwrapper(impl_style=impl_style, simd=simd, idt=idt, ifm_dim=ifm_dim, channels=channels) - - # Create the qonnx model - io_shape = (1, ifm_dim[0], ifm_dim[1], channels) + if(ifm_dim[3] % 3 != 0): + pytest.skip(f"Skipping this test because the number of channels is not a multiple of {simd}") input = gen_finn_dt_tensor(idt, io_shape) input_t = {"global_in": input} @@ -216,7 +217,7 @@ def test_fpga_dataflow_quantsoftmax(impl_style, simd, idt, ifm_dim, channels): y_ref = y_ref.numpy() y_out = oxe.execute_onnx(model, input_t)["global_out"] - assert np.allclose(y_ref, y_out, atol=5), "Model output does not match expected output" + assert np.allclose(y_ref, y_out, atol=tollerance), "Model output does not match expected output" try: model = model.transform(SpecializeLayers(test_fpga_part)) @@ -224,20 +225,18 @@ def test_fpga_dataflow_quantsoftmax(impl_style, simd, idt, ifm_dim, channels): model = model.transform(SetExecMode("cppsim")) model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) - # run the model - 
y_hw = oxe.execute_onnx(model, input_t)["global_out"] - - # loop through the output tensor and compare the values - tollerance = 2 + except Exception as e: + pytest.fail(f"Failed to transform the model: {str(e)}") - # Debug prints to help identify the failing values - for i in range(len(y_ref)): - for j in range(len(y_ref[i])): - for k in range(len(y_ref[i][j])): - for l in range(len(y_ref[i][j][k])): - if np.allclose(y_ref[i][j][k][l], y_hw[i][j][k][l], atol=tollerance) == False: - print(f"| {i},{j},{k},{l:<2} | {y_ref[i][j][k][l]:<4.0f} | {y_hw[i][j][k][l]:<4.0f} | {y_ref[i][j][k][l] - y_hw[i][j][k][l]:<4.0f} |") + # run the model + y_hw = oxe.execute_onnx(model, input_t)["global_out"] + # Debug prints to help identify the failing values + for i in range(len(y_ref)): + for j in range(len(y_ref[i])): + for k in range(len(y_ref[i][j])): + for l in range(len(y_ref[i][j][k])): + if np.allclose(y_ref[i][j][k][l], y_hw[i][j][k][l], atol=tollerance) == False: + print(f"| {i},{j},{k},{l:<2} | {y_ref[i][j][k][l]:<4.0f} | {y_hw[i][j][k][l]:<4.0f} | {y_ref[i][j][k][l] - y_hw[i][j][k][l]:<4.0f} |") - except Exception as e: - pytest.fail(f"Failed to transform the model: {str(e)}") \ No newline at end of file + assert np.allclose(y_ref, y_hw, atol=tollerance), "Model output does not match expected output" \ No newline at end of file From 2b026f87b4f634d06d6b487341d4069f8d7b9a7d Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Wed, 14 Aug 2024 10:48:24 +0100 Subject: [PATCH 48/49] softmax: move hls source to custom hls directory --- custom_hls/sm_utils.hpp | 164 ++++++++++++++++++++++++ custom_hls/softmax.hpp | 275 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 439 insertions(+) create mode 100644 custom_hls/sm_utils.hpp create mode 100644 custom_hls/softmax.hpp diff --git a/custom_hls/sm_utils.hpp b/custom_hls/sm_utils.hpp new file mode 100644 index 0000000000..918f8879bf --- /dev/null +++ b/custom_hls/sm_utils.hpp @@ -0,0 +1,164 @@ +// Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. +// +// This file is subject to the Xilinx Design License Agreement located +// in the LICENSE.md file in the root directory of this repository. +// +// This file contains confidential and proprietary information of Xilinx, Inc. +// and is protected under U.S. and international copyright and other +// intellectual property laws. +// +// DISCLAIMER +// This disclaimer is not a license and does not grant any rights to the materials +// distributed herewith. Except as otherwise provided in a valid license issued to +// you by Xilinx, and to the maximum extent permitted by applicable law: (1) THESE +// MATERIALS ARE MADE AVAILABLE "AS IS" AND WITH ALL FAULTS, AND XILINX HEREBY +// DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, +// INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR +// FITNESS FOR ANY PARTICULAR PURPOSE; and (2) Xilinx shall not be liable (whether +// in contract or tort, including negligence, or under any other theory of +// liability) for any loss or damage of any kind or nature related to, arising +// under or in connection with these materials, including for any direct, or any +// indirect, special, incidental, or consequential loss or damage (including loss +// of data, profits, goodwill, or any type of loss or damage suffered as a result +// of any action brought by a third party) even if such damage or loss was +// reasonably foreseeable or Xilinx had been advised of the possibility of the +// same. 
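The nested debug loop above that prints mismatching elements could also be written with NumPy indexing; a possible equivalent, shown purely as an illustration:

    import numpy as np

    def report_mismatches(y_ref, y_hw, atol=2):
        diff = np.abs(y_ref.astype(float) - y_hw.astype(float))
        for idx in np.argwhere(diff > atol):
            i = tuple(idx)
            print(f"| {i} | {y_ref[i]:<4.0f} | {y_hw[i]:<4.0f} | {y_ref[i] - y_hw[i]:<4.0f} |")
        return not np.any(diff > atol)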
+// +// CRITICAL APPLICATIONS +// Xilinx products are not designed or intended to be fail-safe, or for use in +// any application requiring failsafe performance, such as life-support or safety +// devices or systems, Class III medical devices, nuclear facilities, applications +// related to the deployment of airbags, or any other applications that could lead +// to death, personal injury, or severe property or environmental damage +// (individually and collectively, "Critical Applications"). Customer assumes the +// sole risk and liability of any use of Xilinx products in Critical Applications, +// subject only to applicable laws and regulations governing limitations on product +// liability. +// +// THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS PART OF THIS FILE AT ALL TIMES. +#ifndef SM_UTIL_HPP +#define SM_UTIL_HPP +#include "hls_vector.h" + +//- Compile-Time Functions -------------------------------------------------- + +// ceil(log2(x)) +template +constexpr unsigned clog2(T x) { + return x<2? 0 : 1+clog2((x+1)/2); +} + +//- Streaming Flit with `last` Marking -------------------------------------- +template +struct flit_t { + bool last; + T data; + +public: + flit_t(bool last_, T const &data_) : last(last_), data(data_) {} + ~flit_t() {} +}; + +//- Streaming Copy ---------------------------------------------------------- +template +void move(hls::stream &src, hls::stream &dst) { +#pragma HLS pipeline II=1 style=flp + if(!src.empty()) dst.write(src.read()); +} + +//- Tree Reduce ------------------------------------------------------------- +template< unsigned long N, typename TA, typename TR = TA, typename F > +TR tree_reduce(hls::stream &v, F f) { +#pragma HLS inline +#pragma HLS function_instantiate variable=f + TR tree[2*N-1]; +#pragma HLS array_partition complete dim=1 variable=tree + for(unsigned i = N; i-- > 0;) { +#pragma HLS unroll + tree[N-1 + i] = v.read(); + } + for(unsigned i = N-1; i-- > 0;) { +#pragma HLS unroll + tree[i] = f(tree[2*i+1], tree[2*i+2]); + } + return tree[0]; +} + +// Recursive comparison and count (of max) +// Builds a tree to compute the max of a vector +template +struct MaxReduction { + + static T max(const hls::vector& input) { +#pragma HLS INLINE + constexpr unsigned M = (N + 1) / 2; + hls::vector res; + + for(unsigned i = 0; i < M; ++i) { +#pragma HLS unroll + if (2*i + 1 < N) + res[i] = input[2*i] > input[2*i + 1] ? input[2*i] : input[2*i + 1]; + else + res[i] = input[2*i]; // Handle the case where the input size is odd + } + + return MaxReduction::max(res); + } + +}; + +template +struct MaxReduction<2, T> { + static T max(const hls::vector& input) { +#pragma HLS INLINE + return (input[0] > input[1]) ? 
input[0] : input[1]; + } +}; + +template +struct MaxReduction<1, T> { + static T max(const hls::vector& input) { +#pragma HLS INLINE + return input[0]; + } +}; + +// Recursive reduction tree for the total summation +// Code for the Nth stage +template +struct TreeReduction { + static float reduce(const hls::vector& input) { +#pragma HLS INLINE + constexpr unsigned M = (N + 1) / 2; + hls::vector sum; + + for(unsigned i = 0; i < M; ++i) { +#pragma HLS unroll + if (2*i + 1 < N) + sum[i] = input[2*i] + input[2*i + 1]; + else + sum[i] = input[2*i]; // Handle the case where the input size is odd + } + + return TreeReduction::reduce(sum); + } +}; + +template<> +struct TreeReduction<2> { + static float reduce(const hls::vector& input) { +#pragma HLS INLINE + return input[0] + input[1]; + } +}; + +template<> +struct TreeReduction<1> { + static float reduce(const hls::vector& input) { +#pragma HLS INLINE + return input[0]; + } +}; + + +#endif \ No newline at end of file diff --git a/custom_hls/softmax.hpp b/custom_hls/softmax.hpp new file mode 100644 index 0000000000..61d8bab0e2 --- /dev/null +++ b/custom_hls/softmax.hpp @@ -0,0 +1,275 @@ +// Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. +// +// This file is subject to the Xilinx Design License Agreement located +// in the LICENSE.md file in the root directory of this repository. +// +// This file contains confidential and proprietary information of Xilinx, Inc. +// and is protected under U.S. and international copyright and other +// intellectual property laws. +// +// DISCLAIMER +// This disclaimer is not a license and does not grant any rights to the materials +// distributed herewith. Except as otherwise provided in a valid license issued to +// you by Xilinx, and to the maximum extent permitted by applicable law: (1) THESE +// MATERIALS ARE MADE AVAILABLE "AS IS" AND WITH ALL FAULTS, AND XILINX HEREBY +// DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, +// INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR +// FITNESS FOR ANY PARTICULAR PURPOSE; and (2) Xilinx shall not be liable (whether +// in contract or tort, including negligence, or under any other theory of +// liability) for any loss or damage of any kind or nature related to, arising +// under or in connection with these materials, including for any direct, or any +// indirect, special, incidental, or consequential loss or damage (including loss +// of data, profits, goodwill, or any type of loss or damage suffered as a result +// of any action brought by a third party) even if such damage or loss was +// reasonably foreseeable or Xilinx had been advised of the possibility of the +// same. +// +// CRITICAL APPLICATIONS +// Xilinx products are not designed or intended to be fail-safe, or for use in +// any application requiring failsafe performance, such as life-support or safety +// devices or systems, Class III medical devices, nuclear facilities, applications +// related to the deployment of airbags, or any other applications that could lead +// to death, personal injury, or severe property or environmental damage +// (individually and collectively, "Critical Applications"). Customer assumes the +// sole risk and liability of any use of Xilinx products in Critical Applications, +// subject only to applicable laws and regulations governing limitations on product +// liability. +// +// THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS PART OF THIS FILE AT ALL TIMES. 
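The softmax.hpp file added below is organised as a three-stage stream pipeline (running max, exponentiate and accumulate, divide), followed by a separate quantisation stage. Functionally the stages compute the numerically stabilised softmax; a NumPy sketch of the reference behaviour the hardware stages are meant to match:

    import numpy as np

    def streaming_softmax_ref(x):
        # stage 1: per-row maximum, used for numerical stability
        m = x.max(axis=-1, keepdims=True)
        # stage 2: exponentiate relative to the max and accumulate the row sum
        e = np.exp(x - m)
        s = e.sum(axis=-1, keepdims=True)
        # stage 3: normalise every element by the row sum
        return e / s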
+
+#include <hls_stream.h>
+#include <hls_vector.h>
+#include <hls_math.h>
+#include <ap_int.h>
+#include <ap_fixed.h>
+#include <climits>
+#include <cmath>
+#include "sm_utils.hpp"
+
+// First stage of the pipeline:
+//
+// Trigger: When a vector of SIMD elements is present in the stream
+//
+// Desc: Pass over the input N items and calc the max value
+template<unsigned N, unsigned SIMD, typename T>
+void max_calc_stage(
+    hls::stream<hls::vector<T, SIMD>> &ins,
+    hls::stream<hls::vector<T, SIMD>> &outs,
+    hls::stream<T> &maxs
+) {
+#pragma HLS pipeline II=1 style=flp
+    static ap_uint<clog2(N/SIMD)+1> count = 0;
+    static T max = 0;
+#pragma HLS reset variable=count
+#pragma HLS reset variable=max
+
+    if(!ins.empty()){
+        hls::vector<T, SIMD> out;
+        hls::vector<T, SIMD> max_v;
+        hls::vector<T, SIMD> const in = ins.read();
+
+
+        for(unsigned i=0; i<SIMD; i++){
+#pragma HLS unroll
+            out[i] = in[i];
+            // fold the running max into every lane so the tree reduction
+            // below also covers the chunks seen so far
+            max_v[i] = (max < in[i]) ? in[i] : max;
+        }
+        outs.write(out);
+        max = MaxReduction<SIMD, T>::max(max_v);
+
+        count++;
+        if (count == (N/SIMD)-1) {
+            count = 0;
+            maxs.write(max);
+            max = 0;
+        }
+    }
+}
+
+
+// Second stage of the pipeline
+//
+// Trigger: When a max value is sent from the preceding stage
+//
+// Desc: For each item in an N item sequence calc the (exp - max) in float,
+//       tracking the sum while processing the N items.
+template<unsigned N, unsigned SIMD, typename T>
+void exp_sum_calc(
+    hls::stream<hls::vector<T, SIMD>> &ins,
+    hls::stream<T> &maxs,
+    hls::stream<hls::vector<float, SIMD>> &outs,
+    hls::stream<float> &sums
+){
+#pragma HLS pipeline II=1 style=flp
+    static ap_uint<clog2(N/SIMD)+1> count = 0;
+    static float sum = 0.0f;
+    static bool valid = false;
+    static float max = 0.0f;
+#pragma HLS reset variable=count
+#pragma HLS reset variable=sum
+#pragma HLS reset variable=valid
+#pragma HLS reset variable=max
+
+    if (count == (N/SIMD)) {
+        count = 0;
+        valid = false;
+        sums.write(sum);
+        sum = 0.0f;
+        return;
+    }
+
+    if(valid && !ins.empty()) {
+        hls::vector<T, SIMD> const in = ins.read();
+        hls::vector<float, SIMD> out;
+        for (unsigned i=0; i<SIMD; i++) {
+#pragma HLS unroll
+            out[i] = hls::exp(float(in[i]) - max);
+        }
+        sum += TreeReduction<SIMD>::reduce(out);
+        outs.write(out);
+
+        count++;
+    }
+
+    if (!maxs.empty() && !valid) {
+        max = maxs.read();
+        valid = true;
+    }
+
+}
+
+// Third stage of the pipeline
+//
+// Trigger: When a sum value is sent from the preceding stage
+//
+// Desc: For the N items take the input and divide it by the sum
+template<unsigned N, unsigned SIMD>
+void div_calc(
+    hls::stream<hls::vector<float, SIMD>> &ins,
+    hls::stream<float> &sums,
+    hls::stream<hls::vector<float, SIMD>> &outs
+){
+#pragma HLS pipeline II=1 style=flp
+    static ap_uint<clog2(N/SIMD)+1> count = 0;
+    static bool valid = false;
+    static float sum = 0.0f;
+#pragma HLS reset variable=count
+#pragma HLS reset variable=valid
+#pragma HLS reset variable=sum
+
+    if (count == (N/SIMD)) {
+        count = 0;
+        valid = false;
+        return;
+    }
+
+    if (valid && !ins.empty()) {
+        hls::vector<float, SIMD> const in = ins.read();
+        hls::vector<float, SIMD> out;
+        for(unsigned i=0; i<SIMD; i++){
+#pragma HLS unroll
+            out[i] = in[i] / sum;
+        }
+        outs.write(out);
+
+        count++;
+    }
+
+    if (!sums.empty() && !valid) {
+        sum = sums.read();
+        valid = true;
+    }
+
+}
+
+// Float softmax over a stream of N items, SIMD elements per transaction,
+// chaining the three stages above in a dataflow region.
+template<
+    unsigned N,
+    unsigned SIMD,
+    typename T
+>
+void smax(
+    hls::stream<hls::vector<T, SIMD>> &src,
+    hls::stream<hls::vector<float, SIMD>> &dst
+) {
+#pragma HLS dataflow disable_start_propagation
+    static_assert(N%SIMD == 0, "N must be a multiple of SIMD");
+
+    static hls::stream<hls::vector<T, SIMD>> max_data_s;
+#pragma HLS stream variable=max_data_s depth=N
+    static hls::stream<T> max_s;
+#pragma HLS stream variable=max_s depth=2
+
+    static hls::stream<hls::vector<float, SIMD>> exp_data_s;
+#pragma HLS stream variable=exp_data_s depth=N
+    static hls::stream<float> sum_s;
+#pragma HLS stream variable=sum_s depth=2
+
+    max_calc_stage<N, SIMD, T>(src, max_data_s, max_s);
+    exp_sum_calc<N, SIMD, T>(max_data_s, max_s, exp_data_s, sum_s);
+    div_calc<N, SIMD>(exp_data_s, sum_s, dst);
+
+} // smax()
+
+// Threshold/quantisation at the output of the softmax
+template<
+    typename T, // The quantised output type (Needs to be signed)
+    typename F // The float based input type
+>
+T quant_threshold(F val) {
+#pragma HLS INLINE
+    if(val>=1.0f)
+        return T((~unsigned(0)) >> 1);
+
+    constexpr unsigned N_fracbits = (sizeof(T)*CHAR_BIT);
+
+    ap_fixed<N_fracbits, 1> fixed_point_val = val;
+    T frac_val = fixed_point_val.range(N_fracbits - 2, 0);
+    return frac_val;
+}
+
+// Quantisation pipeline stage
+//
+// Trigger: When a SIMD vector is received from the preceding stage
+//
+// Desc: Apply quantisation to the SIMD elements and write them into the
+//       SIMD width output stream.
+template<
+    unsigned N,
+    unsigned SIMD,
+    typename T
+>
+void quant_stage(
+    hls::stream<hls::vector<float, SIMD>> &in,
+    hls::stream<hls::vector<T, SIMD>> &out
+) {
+#pragma HLS pipeline II=1 style=flp
+    if(!in.empty()) {
+        hls::vector<float, SIMD> const x = in.read();
+        hls::vector<T, SIMD> y;
+        for(unsigned i=0; i<SIMD; i++) {
+#pragma HLS unroll
+            y[i] = quant_threshold<T>(x[i]);
+        }
+        out.write(y);
+    }
+}
+
+// Quantised version of softmax
+// This is the same as the float softmax with an additional baked-in quantisation stage at the end
+template<
+    unsigned N,    // The width of the input dimension
+    unsigned SIMD, // Amount of parallelism (how many items consumed/produced at a time)
+    typename T
+    >
+void smaxquant(
+    hls::stream<hls::vector<T, SIMD>> &src,
+    hls::stream<hls::vector<T, SIMD>> &dst
+) {
+#pragma HLS DATAFLOW disable_start_propagation
+    hls::stream<hls::vector<float, SIMD>> smax_out;
+#pragma HLS stream variable=smax_out depth=2
+    static_assert(N%SIMD == 0, "SIMD must be a factor of N");
+
+    smax<N, SIMD, T>(src, smax_out);
+    quant_stage<N, SIMD, T>(smax_out, dst);
+
+} // smaxquant()

From d629093023f93c3ca42fc41788e31b48cfb90c23 Mon Sep 17 00:00:00 2001
From: aziz bahri
Date: Wed, 14 Aug 2024 15:37:56 +0100
Subject: [PATCH 49/49] softmax: quantization fix

---
 custom_hls/softmax.hpp | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/custom_hls/softmax.hpp b/custom_hls/softmax.hpp
index 61d8bab0e2..9452045a77 100644
--- a/custom_hls/softmax.hpp
+++ b/custom_hls/softmax.hpp
@@ -44,6 +44,7 @@
 #include <ap_fixed.h>
 #include <climits>
 #include <cmath>
+#include <type_traits>
 #include "sm_utils.hpp"
 
 // First stage of the pipeline:
@@ -211,20 +212,29 @@ void smax(
 
 // Threshold/quantisation at the output of the softmax
 template<
-    typename T, // The quantised output type (Needs to be signed)
-    typename F // The float based input type
+    typename T,  // The quantised output type (Needs to be signed)
+    typename TF  // The float based input type
 >
-T quant_threshold(F val) {
+T quant_threshold(TF val) {
 #pragma HLS INLINE
-    if(val>=1.0f)
-        return T((~unsigned(0)) >> 1);
+    constexpr unsigned numBits = sizeof(T)*CHAR_BIT;
+    if(val>=1.0f){
+        T frac_val = ~T(0);
+        if(std::is_signed<T>::value) {
+            return frac_val;
+        } else {
+            T mask = ~(T(1) << (numBits - 1));
+            return frac_val & mask;
+        }
+    }
+
+
+    ap_fixed<numBits, 1> fixed_point_val = val;
+    T frac_val = fixed_point_val.range(numBits - 2, 0);
+    return frac_val;
+}
-
-    constexpr unsigned N_fracbits = (sizeof(T)*CHAR_BIT);
-
-    ap_fixed<N_fracbits, 1> fixed_point_val = val;
-    T frac_val = fixed_point_val.range(N_fracbits - 2, 0);
-    return frac_val;
-}
 
 // Quantisation pipeline stage
 //
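For reference, the three streaming stages added above (max_calc_stage, exp_sum_calc, div_calc) together compute a numerically stabilised softmax, exp(x_i - max(x)) / sum_j exp(x_j - max(x)). The sketch below is a minimal host-side C++ reference model of that computation, under the assumption that a test bench would compare the HLS output against it; it is not part of the patch set and all names are illustrative. Patch 49 only changes the final quantisation step (quant_threshold), not this float pipeline.

    // Hypothetical host-side reference model of the smax() dataflow pipeline.
    // Comments map each loop to its HLS counterpart.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    std::vector<float> softmax_ref(const std::vector<float> &x) {
        // max_calc_stage equivalent: maximum over the whole frame
        float mx = *std::max_element(x.begin(), x.end());
        // exp_sum_calc equivalent: exp(x - max) per element plus their sum
        std::vector<float> e(x.size());
        float sum = 0.0f;
        for (std::size_t i = 0; i < x.size(); ++i) {
            e[i] = std::exp(x[i] - mx);
            sum += e[i];
        }
        // div_calc equivalent: normalise by the sum
        for (float &v : e) v /= sum;
        return e;
    }

    int main() {
        std::vector<float> p = softmax_ref({1.0f, 2.0f, 3.0f, 4.0f});
        for (float v : p) std::printf("%.4f ", v);  // 0.0321 0.0871 0.2369 0.6439
        std::printf("\n");
        return 0;
    }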