diff --git a/configs/openocd.sh b/configs/openocd.sh index ff88497ee..61566bc8d 100644 --- a/configs/openocd.sh +++ b/configs/openocd.sh @@ -9,3 +9,7 @@ else fi export PATH=$GAP_SDK_HOME/install/workstation/openocd/bin:$PATH + +# Path to openocd scripts +export OPENOCD_SCRIPTS=$GAP_SDK_HOME/utils/openocd_tools + diff --git a/examples/pmsis/bsp/ble/ble_nina_b112/gaptest.yml b/examples/pmsis/bsp/ble/ble_nina_b112/gaptest.yml new file mode 100644 index 000000000..d822d0849 --- /dev/null +++ b/examples/pmsis/bsp/ble/ble_nina_b112/gaptest.yml @@ -0,0 +1,17 @@ +name: ble_nina_b112 +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/ble/ble_read_test/gaptest.yml b/examples/pmsis/bsp/ble/ble_read_test/gaptest.yml new file mode 100644 index 000000000..42f5e834a --- /dev/null +++ b/examples/pmsis/bsp/ble/ble_read_test/gaptest.yml @@ -0,0 +1,17 @@ +name: ble_read_test +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/blink_led/gaptest.yml b/examples/pmsis/bsp/blink_led/gaptest.yml new file mode 100644 index 000000000..0c9b4f4fd --- /dev/null +++ b/examples/pmsis/bsp/blink_led/gaptest.yml @@ -0,0 +1,17 @@ +name: blink_led +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/cameras/camera_ir_thermeye/always_on/gaptest.yml b/examples/pmsis/bsp/cameras/camera_ir_thermeye/always_on/gaptest.yml new file mode 100644 index 000000000..650f15570 --- /dev/null +++ b/examples/pmsis/bsp/cameras/camera_ir_thermeye/always_on/gaptest.yml @@ -0,0 +1,25 @@ +name: 
camera_ir_thermeye_always_on +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/cameras/camera_ir_thermeye/lower_power_mode/gaptest.yml b/examples/pmsis/bsp/cameras/camera_ir_thermeye/lower_power_mode/gaptest.yml new file mode 100644 index 000000000..42151ac50 --- /dev/null +++ b/examples/pmsis/bsp/cameras/camera_ir_thermeye/lower_power_mode/gaptest.yml @@ -0,0 +1,25 @@ +name: camera_ir_thermeye_low_power_mode +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/cameras/test_camera_gc0308/gaptest.yml b/examples/pmsis/bsp/cameras/test_camera_gc0308/gaptest.yml new file mode 100644 index 000000000..4392330e0 --- /dev/null +++ b/examples/pmsis/bsp/cameras/test_camera_gc0308/gaptest.yml @@ -0,0 +1,17 @@ +name: camera_gc0308 +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/cameras/test_camera_io/gaptest.yml b/examples/pmsis/bsp/cameras/test_camera_io/gaptest.yml new file mode 100644 index 000000000..15922f463 --- /dev/null +++ b/examples/pmsis/bsp/cameras/test_camera_io/gaptest.yml @@ -0,0 +1,25 @@ +name: camera_io +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + 
tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/cameras/test_camera_lcd/gaptest.yml b/examples/pmsis/bsp/cameras/test_camera_lcd/gaptest.yml new file mode 100644 index 000000000..6c0f08da0 --- /dev/null +++ b/examples/pmsis/bsp/cameras/test_camera_lcd/gaptest.yml @@ -0,0 +1,25 @@ +name: camera_lcd +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/cameras/test_camera_ov5640/gaptest.yml b/examples/pmsis/bsp/cameras/test_camera_ov5640/gaptest.yml new file mode 100644 index 000000000..93049af03 --- /dev/null +++ b/examples/pmsis/bsp/cameras/test_camera_ov5640/gaptest.yml @@ -0,0 +1,17 @@ +name: camera_ov5640 +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/cameras/test_camera_ov7670/gaptest.yml b/examples/pmsis/bsp/cameras/test_camera_ov7670/gaptest.yml new file mode 100644 index 000000000..35f06b52a --- /dev/null +++ b/examples/pmsis/bsp/cameras/test_camera_ov7670/gaptest.yml @@ -0,0 +1,17 @@ +name: camera_ov7670 +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/cameras/test_camera_ov7670/ov7670_config.h b/examples/pmsis/bsp/cameras/test_camera_ov7670/ov7670_config.h old mode 100755 new mode 100644 diff --git a/examples/pmsis/bsp/cameras/test_camera_pixart/gaptest.yml b/examples/pmsis/bsp/cameras/test_camera_pixart/gaptest.yml new file mode 100644 index 
000000000..d496534f3 --- /dev/null +++ b/examples/pmsis/bsp/cameras/test_camera_pixart/gaptest.yml @@ -0,0 +1,25 @@ +name: camera_pixart +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/cameras/test_camera_stream/gaptest.yml b/examples/pmsis/bsp/cameras/test_camera_stream/gaptest.yml new file mode 100644 index 000000000..fea31f653 --- /dev/null +++ b/examples/pmsis/bsp/cameras/test_camera_stream/gaptest.yml @@ -0,0 +1,17 @@ +name: camera_stream +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/filesystem/fs_to_l3_copy/gaptest.yml b/examples/pmsis/bsp/filesystem/fs_to_l3_copy/gaptest.yml new file mode 100644 index 000000000..10d486d39 --- /dev/null +++ b/examples/pmsis/bsp/filesystem/fs_to_l3_copy/gaptest.yml @@ -0,0 +1,18 @@ +name: fs_to_l3_copy +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/filesystem/littlefs/native_lfs/gaptest.yml b/examples/pmsis/bsp/filesystem/littlefs/native_lfs/gaptest.yml new file mode 100644 index 000000000..aba58f9e7 --- /dev/null +++ b/examples/pmsis/bsp/filesystem/littlefs/native_lfs/gaptest.yml @@ -0,0 +1,18 @@ +name: littlefs_native_lfs +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/filesystem/readfs/gaptest.yml 
b/examples/pmsis/bsp/filesystem/readfs/gaptest.yml new file mode 100644 index 000000000..64fe4a473 --- /dev/null +++ b/examples/pmsis/bsp/filesystem/readfs/gaptest.yml @@ -0,0 +1,18 @@ +name: readfs +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/flash/hyper_flash/gaptest.yml b/examples/pmsis/bsp/flash/hyper_flash/gaptest.yml new file mode 100644 index 000000000..b2eb78659 --- /dev/null +++ b/examples/pmsis/bsp/flash/hyper_flash/gaptest.yml @@ -0,0 +1,26 @@ +name: hyper_flash +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/flash/hyper_flash_multi_thread/gaptest.yml b/examples/pmsis/bsp/flash/hyper_flash_multi_thread/gaptest.yml new file mode 100644 index 000000000..81e854c32 --- /dev/null +++ b/examples/pmsis/bsp/flash/hyper_flash_multi_thread/gaptest.yml @@ -0,0 +1,25 @@ +name: hyper_flash_multi_thread +platforms: + - gvsoc +os: + - freertos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/lcd/gapuino_himax_with_lcd/gaptest.yml b/examples/pmsis/bsp/lcd/gapuino_himax_with_lcd/gaptest.yml new file mode 100644 index 000000000..98de5fade --- /dev/null +++ b/examples/pmsis/bsp/lcd/gapuino_himax_with_lcd/gaptest.yml @@ -0,0 +1,17 @@ +name: gapuino_himax_with_lcd +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + 
std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/microphones/vesper/vm3011_wakeup/gaptest.yml b/examples/pmsis/bsp/microphones/vesper/vm3011_wakeup/gaptest.yml new file mode 100644 index 000000000..edfb6290f --- /dev/null +++ b/examples/pmsis/bsp/microphones/vesper/vm3011_wakeup/gaptest.yml @@ -0,0 +1,17 @@ +name: microphones_vesper_vm3011_wakeup +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/ram/hyper_ram/gaptest.yml b/examples/pmsis/bsp/ram/hyper_ram/gaptest.yml new file mode 100644 index 000000000..2539ec88b --- /dev/null +++ b/examples/pmsis/bsp/ram/hyper_ram/gaptest.yml @@ -0,0 +1,26 @@ +name: hyper_ram +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/ram/hyper_ram_flash/gaptest.yml b/examples/pmsis/bsp/ram/hyper_ram_flash/gaptest.yml new file mode 100644 index 000000000..387122e40 --- /dev/null +++ b/examples/pmsis/bsp/ram/hyper_ram_flash/gaptest.yml @@ -0,0 +1,26 @@ +name: hyper_ram_flash +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/ram/hyper_ram_multi_thread/gaptest.yml b/examples/pmsis/bsp/ram/hyper_ram_multi_thread/gaptest.yml new file mode 100644 index 000000000..f0c34f889 --- /dev/null +++ 
b/examples/pmsis/bsp/ram/hyper_ram_multi_thread/gaptest.yml @@ -0,0 +1,25 @@ +name: hyper_ram_multi_thread +platforms: + - gvsoc +os: + - freertos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/wifi/nina_b112_example/gaptest.yml b/examples/pmsis/bsp/wifi/nina_b112_example/gaptest.yml new file mode 100644 index 000000000..67a08ccad --- /dev/null +++ b/examples/pmsis/bsp/wifi/nina_b112_example/gaptest.yml @@ -0,0 +1,17 @@ +name: wifi_nina_b112 +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/features/aes128_sw/gaptest.yml b/examples/pmsis/features/aes128_sw/gaptest.yml new file mode 100644 index 000000000..ec524aa63 --- /dev/null +++ b/examples/pmsis/features/aes128_sw/gaptest.yml @@ -0,0 +1,17 @@ +name: aes128_sw +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ diff --git a/examples/pmsis/features/aes128_sw/main.c b/examples/pmsis/features/aes128_sw/main.c index 8514d59c0..6f12a2188 100644 --- a/examples/pmsis/features/aes128_sw/main.c +++ b/examples/pmsis/features/aes128_sw/main.c @@ -1,16 +1,12 @@ #include "pmsis.h" #include "AesLib.h" -#define TEST_BUFF_SIZE (40600) +#define TEST_BUFF_SIZE (40600) #define TEST_KEY_HI (0x1122334455667788) #define TEST_KEY_LO (0x9900AABBCCDDEEFF) #define TEST_IV (0x1122334455667788) -#if defined (__PULP_OS__) -RT_FC_DATA aes_data_t aes_data; -#else -GAP_FC_DATA aes_data_t aes_data; -#endif +PI_FC_L1 aes_data_t aes_data; static void load_key(unsigned char * key, unsigned char * iv) { @@ 
-58,7 +54,7 @@ void aes128() pi_perf_start(); cycles[0] = pi_perf_read(PI_PERF_CYCLES); - + AesBuildLUT(&aes_data); pi_perf_stop(); @@ -77,7 +73,7 @@ void aes128() cycles[3] = pi_perf_read(PI_PERF_CYCLES); for(int i=0 ; i - - -static int entry() -{ - printf("Entering main controller\n"); - -#ifdef EFUSE_WRITE - printf("Writing efuse 50 with value 0x12\n"); - - // Before writing the efuse, we must activate the program operation - // Once activated, we can wrote as many efuses as we want - plp_efuse_startProgram(); - - plp_efuse_writeByte(80, 0x12); - - // Close the current operation once done - plp_efuse_sleep(); -#else - printf("Efuse has not been written, recompile with make clean all run EFUSE_WRITE=1, be careful that this is a permanent operation !!!\n"); -#endif - - - // Before reading the efuse, we must activate the read operation - // Once activated, we can wrote as many efuses as we want - plp_efuse_startRead(); - - int value = plp_efuse_readWord(80); - - // Close the current operation once done - plp_efuse_sleep(); - - printf("Read efuse 50: 0x%x\n", value); - - return 0; -} - - -static void pmsis_wrapper(void) -{ - int retval = entry(); - pmsis_exit(retval); -} - - -int main(void) -{ - return pmsis_kickoff((void *)pmsis_wrapper); -} - diff --git a/examples/pmsis/features/helloworld_cxx/gaptest.yml b/examples/pmsis/features/helloworld_cxx/gaptest.yml new file mode 100644 index 000000000..7d9864113 --- /dev/null +++ b/examples/pmsis/features/helloworld_cxx/gaptest.yml @@ -0,0 +1,16 @@ +name: helloworld_cxx +platforms: + - gvsoc +os: + - freertos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ diff --git a/examples/pmsis/features/hyper_ram_delegate/gaptest.yml b/examples/pmsis/features/hyper_ram_delegate/gaptest.yml new file mode 100644 index 000000000..811611f76 --- /dev/null +++ b/examples/pmsis/features/hyper_ram_delegate/gaptest.yml @@ -0,0 +1,26 @@ +name: hyper_ram_delegate 
+platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/features/test_malloc/gaptest.yml b/examples/pmsis/features/test_malloc/gaptest.yml new file mode 100644 index 000000000..85043c25c --- /dev/null +++ b/examples/pmsis/features/test_malloc/gaptest.yml @@ -0,0 +1,17 @@ +name: test_malloc +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ diff --git a/examples/pmsis/features/uart_delegate/gaptest.yml b/examples/pmsis/features/uart_delegate/gaptest.yml new file mode 100644 index 000000000..008a9e7a5 --- /dev/null +++ b/examples/pmsis/features/uart_delegate/gaptest.yml @@ -0,0 +1,18 @@ +name: uart_delegate +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/helloworld/gaptest.yml b/examples/pmsis/helloworld/gaptest.yml new file mode 100644 index 000000000..6bedf8220 --- /dev/null +++ b/examples/pmsis/helloworld/gaptest.yml @@ -0,0 +1,18 @@ +name: helloworld +platforms: + - gvsoc + - board +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ diff --git a/examples/pmsis/periph/dmacpy/gaptest.yml b/examples/pmsis/periph/dmacpy/gaptest.yml new file mode 100644 index 000000000..67cd62962 --- /dev/null +++ b/examples/pmsis/periph/dmacpy/gaptest.yml @@ -0,0 +1,26 @@ +name: dmacpy +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: 
standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/periph/gpio/gpio_input/gaptest.yml b/examples/pmsis/periph/gpio/gpio_input/gaptest.yml new file mode 100644 index 000000000..0cb0c762c --- /dev/null +++ b/examples/pmsis/periph/gpio/gpio_input/gaptest.yml @@ -0,0 +1,18 @@ +name: gpio_input +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/gpio/gpio_input/gpio.c b/examples/pmsis/periph/gpio/gpio_input/gpio.c index 0c3005e75..80f4de0ae 100644 --- a/examples/pmsis/periph/gpio/gpio_input/gpio.c +++ b/examples/pmsis/periph/gpio/gpio_input/gpio.c @@ -6,6 +6,15 @@ /* PMSIS includes */ #include "pmsis.h" +/* Defines */ +#if defined(__GAP8__) +#define GPIO_PIN (PI_GPIO_A0_PAD_12_A3) +#elif defined(__GAP9__) +#define GPIO_PIN (PI_GPIO_A68) +#else +#error "Unknown chip" +#endif + /* Variables used. 
*/ struct pi_device gpio; @@ -34,7 +43,7 @@ void test_gpio(void) } pi_task_t cb_gpio; - pi_gpio_e gpio_in = PI_GPIO_A0_PAD_12_A3; + pi_gpio_e gpio_in = GPIO_PIN; pi_gpio_notif_e irq_type = PI_GPIO_NOTIF_RISE; pi_gpio_flags_e cfg_flags = PI_GPIO_INPUT|PI_GPIO_PULL_DISABLE|PI_GPIO_DRIVE_STRENGTH_LOW; diff --git a/examples/pmsis/periph/gpio/gpio_irq_cb/gaptest.yml b/examples/pmsis/periph/gpio/gpio_irq_cb/gaptest.yml new file mode 100644 index 000000000..ead916f96 --- /dev/null +++ b/examples/pmsis/periph/gpio/gpio_irq_cb/gaptest.yml @@ -0,0 +1,18 @@ +name: gpio_irq_cb +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/gpio/gpio_irq_cb/gpio.c b/examples/pmsis/periph/gpio/gpio_irq_cb/gpio.c index 1b6a64a26..851226b49 100644 --- a/examples/pmsis/periph/gpio/gpio_irq_cb/gpio.c +++ b/examples/pmsis/periph/gpio/gpio_irq_cb/gpio.c @@ -6,6 +6,15 @@ /* PMSIS includes */ #include "pmsis.h" +/* Defines */ +#if defined(__GAP8__) +#define GPIO_PIN (PI_GPIO_A0_PAD_12_A3) +#elif defined(__GAP9__) +#define GPIO_PIN (PI_GPIO_A68) +#else +#error "Unknown chip" +#endif + /* Variables used. 
*/ struct pi_device gpio; @@ -51,7 +60,7 @@ void test_gpio(void) pmsis_exit(errors); } - pi_gpio_e gpio_in = PI_GPIO_A0_PAD_12_A3; + pi_gpio_e gpio_in = GPIO_PIN; pi_gpio_notif_e irq_type = PI_GPIO_NOTIF_RISE; pi_gpio_flags_e cfg_flags = PI_GPIO_INPUT|PI_GPIO_PULL_DISABLE|PI_GPIO_DRIVE_STRENGTH_LOW; diff --git a/examples/pmsis/periph/gpio/gpio_output/gaptest.yml b/examples/pmsis/periph/gpio/gpio_output/gaptest.yml new file mode 100644 index 000000000..8bc059d75 --- /dev/null +++ b/examples/pmsis/periph/gpio/gpio_output/gaptest.yml @@ -0,0 +1,18 @@ +name: gpio_output +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/gpio/gpio_output/gpio.c b/examples/pmsis/periph/gpio/gpio_output/gpio.c index 5aeb07ffd..850f22ef2 100644 --- a/examples/pmsis/periph/gpio/gpio_output/gpio.c +++ b/examples/pmsis/periph/gpio/gpio_output/gpio.c @@ -6,6 +6,26 @@ /* PMSIS includes */ #include "pmsis.h" +/* Defines */ +#if defined(__GAP8__) + +#define GPIO_PAD1 (PI_PAD_12_A3_RF_PACTRL0) +#define GPIO_PIN1 (PI_GPIO_A0_PAD_12_A3) + +#define GPIO_PAD2 (PI_PAD_15_B1_RF_PACTRL3) +#define GPIO_PIN2 (PI_GPIO_A3_PAD_15_B1) + +#elif defined(__GAP9__) +#define GPIO_PAD1 (PI_PAD_068) +#define GPIO_PIN1 (PI_GPIO_A68) + +#define GPIO_PAD2 (PI_PAD_086) +#define GPIO_PIN2 (PI_GPIO_A86) + +#else +#error "Unknown chip" +#endif + #define DELAY_MS 500 /* Variables used. 
*/ @@ -19,12 +39,12 @@ void test_gpio(void) uint32_t value = 0; //Setting pad to alternate 1 //GPIO A1 - pi_pad_set_function(PI_PAD_12_A3_RF_PACTRL0, PI_PAD_12_A3_GPIO_A0_FUNC1); + pi_pad_set_function(GPIO_PAD1, PI_PAD_FUNC1); //GPIO LED (A3) - pi_pad_set_function(PI_PAD_15_B1_RF_PACTRL3, PI_PAD_FUNC1); - - pi_gpio_e gpio_out_a1 = PI_GPIO_A0_PAD_12_A3; - pi_gpio_e gpio_out_led = PI_GPIO_A3_PAD_15_B1; + pi_pad_set_function(GPIO_PAD2, PI_PAD_FUNC1); + + pi_gpio_e gpio_out_a1 = GPIO_PIN1; + pi_gpio_e gpio_out_led = GPIO_PIN2; /* Configure gpio output. */ pi_gpio_flags_e cfg_flags = PI_GPIO_OUTPUT; diff --git a/examples/pmsis/periph/i2c/i2c_bmp280/gaptest.yml b/examples/pmsis/periph/i2c/i2c_bmp280/gaptest.yml new file mode 100644 index 000000000..964cb67c7 --- /dev/null +++ b/examples/pmsis/periph/i2c/i2c_bmp280/gaptest.yml @@ -0,0 +1,18 @@ +name: i2c_bmp280 +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2c/i2c_detect/gaptest.yml b/examples/pmsis/periph/i2c/i2c_detect/gaptest.yml new file mode 100644 index 000000000..88c99bd11 --- /dev/null +++ b/examples/pmsis/periph/i2c/i2c_detect/gaptest.yml @@ -0,0 +1,17 @@ +name: i2c_detect +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2c/i2c_eeprom_pulp_fmc/gaptest.yml b/examples/pmsis/periph/i2c/i2c_eeprom_pulp_fmc/gaptest.yml new file mode 100644 index 000000000..2acc38e5e --- /dev/null +++ b/examples/pmsis/periph/i2c/i2c_eeprom_pulp_fmc/gaptest.yml @@ -0,0 +1,17 @@ +name: i2c_eeprom_pulp_fmc +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + 
compile_only: true diff --git a/examples/pmsis/periph/i2c/i2c_scan/gaptest.yml b/examples/pmsis/periph/i2c/i2c_scan/gaptest.yml new file mode 100644 index 000000000..208865a38 --- /dev/null +++ b/examples/pmsis/periph/i2c/i2c_scan/gaptest.yml @@ -0,0 +1,17 @@ +name: i2c_scan +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2c/i2c_slave/gaptest.yml b/examples/pmsis/periph/i2c/i2c_slave/gaptest.yml new file mode 100644 index 000000000..251436da8 --- /dev/null +++ b/examples/pmsis/periph/i2c/i2c_slave/gaptest.yml @@ -0,0 +1,17 @@ +name: i2c_slave_loopback +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2s/pcm/gaptest.yml b/examples/pmsis/periph/i2s/pcm/gaptest.yml new file mode 100644 index 000000000..bbdd53305 --- /dev/null +++ b/examples/pmsis/periph/i2s/pcm/gaptest.yml @@ -0,0 +1,18 @@ +name: i2s_pcm +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2s/pdm/gaptest.yml b/examples/pmsis/periph/i2s/pdm/gaptest.yml new file mode 100644 index 000000000..4cb332ece --- /dev/null +++ b/examples/pmsis/periph/i2s/pdm/gaptest.yml @@ -0,0 +1,18 @@ +name: i2s_pdm +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2s/pdm_4mic/gaptest.yml b/examples/pmsis/periph/i2s/pdm_4mic/gaptest.yml new file mode 100644 index 000000000..cd88be255 --- /dev/null +++ 
b/examples/pmsis/periph/i2s/pdm_4mic/gaptest.yml @@ -0,0 +1,18 @@ +name: i2s_pdm_4mic +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2s/pdm_async/gaptest.yml b/examples/pmsis/periph/i2s/pdm_async/gaptest.yml new file mode 100644 index 000000000..3d99bb3f0 --- /dev/null +++ b/examples/pmsis/periph/i2s/pdm_async/gaptest.yml @@ -0,0 +1,18 @@ +name: i2s_pdm_async +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2s/wav_out/gaptest.yml b/examples/pmsis/periph/i2s/wav_out/gaptest.yml new file mode 100644 index 000000000..e91dfe000 --- /dev/null +++ b/examples/pmsis/periph/i2s/wav_out/gaptest.yml @@ -0,0 +1,18 @@ +name: i2s_wav_out +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2s/wav_out_long/gaptest.yml b/examples/pmsis/periph/i2s/wav_out_long/gaptest.yml new file mode 100644 index 000000000..d9ddabd7a --- /dev/null +++ b/examples/pmsis/periph/i2s/wav_out_long/gaptest.yml @@ -0,0 +1,18 @@ +name: i2s_wav_out_long +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2s/wav_out_one_shot/gaptest.yml b/examples/pmsis/periph/i2s/wav_out_one_shot/gaptest.yml new file mode 100644 index 000000000..6cc8975f6 --- /dev/null +++ b/examples/pmsis/periph/i2s/wav_out_one_shot/gaptest.yml @@ -0,0 +1,18 @@ +name: i2s_wav_out_one_shot +platforms: + - 
gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/perf/gaptest.yml b/examples/pmsis/periph/perf/gaptest.yml new file mode 100644 index 000000000..d7a791e20 --- /dev/null +++ b/examples/pmsis/periph/perf/gaptest.yml @@ -0,0 +1,17 @@ +name: perf +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ diff --git a/examples/pmsis/periph/pwm/gaptest.yml b/examples/pmsis/periph/pwm/gaptest.yml new file mode 100644 index 000000000..fa57415f0 --- /dev/null +++ b/examples/pmsis/periph/pwm/gaptest.yml @@ -0,0 +1,18 @@ +name: pwm +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/rtc/rtc_alarm/gaptest.yml b/examples/pmsis/periph/rtc/rtc_alarm/gaptest.yml new file mode 100644 index 000000000..44cb70c4c --- /dev/null +++ b/examples/pmsis/periph/rtc/rtc_alarm/gaptest.yml @@ -0,0 +1,18 @@ +name: rtc_alarm +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/rtc/rtc_calendar/gaptest.yml b/examples/pmsis/periph/rtc/rtc_calendar/gaptest.yml new file mode 100644 index 000000000..7c35557ff --- /dev/null +++ b/examples/pmsis/periph/rtc/rtc_calendar/gaptest.yml @@ -0,0 +1,18 @@ +name: rtc_calendar +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git 
a/examples/pmsis/periph/rtc/rtc_counter/gaptest.yml b/examples/pmsis/periph/rtc/rtc_counter/gaptest.yml new file mode 100644 index 000000000..ae74bd85d --- /dev/null +++ b/examples/pmsis/periph/rtc/rtc_counter/gaptest.yml @@ -0,0 +1,18 @@ +name: rtc_counter +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/spi/spi_master/gaptest.yml b/examples/pmsis/periph/spi/spi_master/gaptest.yml new file mode 100644 index 000000000..c3ef7ba4f --- /dev/null +++ b/examples/pmsis/periph/spi/spi_master/gaptest.yml @@ -0,0 +1,18 @@ +name: spi_master +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/uart/uart_helloworld/gaptest.yml b/examples/pmsis/periph/uart/uart_helloworld/gaptest.yml new file mode 100644 index 000000000..67a176fa8 --- /dev/null +++ b/examples/pmsis/periph/uart/uart_helloworld/gaptest.yml @@ -0,0 +1,18 @@ +name: uart_helloworld +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/uart/uart_helloworld_timeout/gaptest.yml b/examples/pmsis/periph/uart/uart_helloworld_timeout/gaptest.yml new file mode 100644 index 000000000..df9d7562b --- /dev/null +++ b/examples/pmsis/periph/uart/uart_helloworld_timeout/gaptest.yml @@ -0,0 +1,16 @@ +name: uart_helloworld_timeout +platforms: + - gvsoc +os: + - freertos +chips: + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/uart/uart_input/gaptest.yml 
b/examples/pmsis/periph/uart/uart_input/gaptest.yml new file mode 100644 index 000000000..a7e409203 --- /dev/null +++ b/examples/pmsis/periph/uart/uart_input/gaptest.yml @@ -0,0 +1,18 @@ +name: uart_input +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/uart/uart_input_timeout/gaptest.yml b/examples/pmsis/periph/uart/uart_input_timeout/gaptest.yml new file mode 100644 index 000000000..c49ab3ee9 --- /dev/null +++ b/examples/pmsis/periph/uart/uart_input_timeout/gaptest.yml @@ -0,0 +1,16 @@ +name: uart_input_timeout +platforms: + - gvsoc +os: + - freertos +chips: + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/gvsoc/gvsoc/models/cpu/iss/include/pulp_v2.hpp b/gvsoc/gvsoc/models/cpu/iss/include/pulp_v2.hpp index 778ab4d58..4d23e13f6 100644 --- a/gvsoc/gvsoc/models/cpu/iss/include/pulp_v2.hpp +++ b/gvsoc/gvsoc/models/cpu/iss/include/pulp_v2.hpp @@ -26,13 +26,13 @@ #define PULPV2_HWLOOP_LPEND0 1 #define PULPV2_HWLOOP_LPCOUNT0 2 -#define PULPV2_HWLOOP_LPSTART1 3 -#define PULPV2_HWLOOP_LPEND1 4 -#define PULPV2_HWLOOP_LPCOUNT1 5 +#define PULPV2_HWLOOP_LPSTART1 4 +#define PULPV2_HWLOOP_LPEND1 5 +#define PULPV2_HWLOOP_LPCOUNT1 6 -#define PULPV2_HWLOOP_LPSTART(x) (PULPV2_HWLOOP_LPSTART0 + (x)*3) -#define PULPV2_HWLOOP_LPEND(x) (PULPV2_HWLOOP_LPEND0 + (x)*3) -#define PULPV2_HWLOOP_LPCOUNT(x) (PULPV2_HWLOOP_LPCOUNT0 + (x)*3) +#define PULPV2_HWLOOP_LPSTART(x) (PULPV2_HWLOOP_LPSTART0 + (x)*4) +#define PULPV2_HWLOOP_LPEND(x) (PULPV2_HWLOOP_LPEND0 + (x)*4) +#define PULPV2_HWLOOP_LPCOUNT(x) (PULPV2_HWLOOP_LPCOUNT0 + (x)*4) static inline iss_insn_t *LB_RR_exec_fast(iss_t *iss, iss_insn_t *insn) { @@ -595,7 +595,7 @@ static inline iss_insn_t *hwloop_check_exec(iss_t *iss, iss_insn_t *insn) static inline 
void hwloop_set_start(iss_t *iss, iss_insn_t *insn, int index, iss_reg_t start) { iss->cpu.pulpv2.hwloop_regs[PULPV2_HWLOOP_LPSTART(index)] = start; - iss->cpu.state.hwloop_start_insn[index] = insn_cache_get(iss, start); + iss->cpu.state.hwloop_start_insn[index] = insn_cache_get(iss, start); } static inline void hwloop_set_end(iss_t *iss, iss_insn_t *insn, int index, iss_reg_t end) diff --git a/gvsoc/gvsoc/models/cpu/iss/src/csr.cpp b/gvsoc/gvsoc/models/cpu/iss/src/csr.cpp index 8258e80f0..84f4ca5d2 100644 --- a/gvsoc/gvsoc/models/cpu/iss/src/csr.cpp +++ b/gvsoc/gvsoc/models/cpu/iss/src/csr.cpp @@ -940,6 +940,18 @@ static bool hwloop_read(iss_t *iss, int reg, iss_reg_t *value) { static bool hwloop_write(iss_t *iss, int reg, unsigned int value) { iss->cpu.pulpv2.hwloop_regs[reg] = value; + + // Since the HW loop is using decode instruction for the HW loop start to jump faster + // we need to recompute it when it is modified. + if (reg == 0) + { + iss->cpu.state.hwloop_start_insn[0] = insn_cache_get(iss, value); + } + else if (reg == 4) + { + iss->cpu.state.hwloop_start_insn[1] = insn_cache_get(iss, value); + } + return false; } diff --git a/rtos/freeRTOS/freertos_kernel/include/FreeRTOS.h b/rtos/freeRTOS/freertos_kernel/include/FreeRTOS.h index 5e443a629..05f007802 100644 --- a/rtos/freeRTOS/freertos_kernel/include/FreeRTOS.h +++ b/rtos/freeRTOS/freertos_kernel/include/FreeRTOS.h @@ -871,7 +871,7 @@ #endif #ifndef configTASK_NOTIFICATION_ARRAY_ENTRIES - #define configTASK_NOTIFICATION_ARRAY_ENTRIES 1 + #define configTASK_NOTIFICATION_ARRAY_ENTRIES 2 #endif #if configTASK_NOTIFICATION_ARRAY_ENTRIES < 1 diff --git a/rtos/freeRTOS/vendors/gwt/gap8/src/device/system_gap8.c b/rtos/freeRTOS/vendors/gwt/gap8/src/device/system_gap8.c index e2be80db9..b9d56565e 100644 --- a/rtos/freeRTOS/vendors/gwt/gap8/src/device/system_gap8.c +++ b/rtos/freeRTOS/vendors/gwt/gap8/src/device/system_gap8.c @@ -33,7 +33,7 @@ /* PMSIS includes. 
*/ #include "pmsis.h" -#include "../driver/semihost.h" +#include "semihost.h" /* FC & L2 heaps. */ extern char __heapfcram_start; diff --git a/rtos/freeRTOS/vendors/gwt/gap8/include/driver/semihost.h b/rtos/freeRTOS/vendors/gwt/libs/include/semihost.h similarity index 63% rename from rtos/freeRTOS/vendors/gwt/gap8/include/driver/semihost.h rename to rtos/freeRTOS/vendors/gwt/libs/include/semihost.h index fba61196c..81ba11c44 100644 --- a/rtos/freeRTOS/vendors/gwt/gap8/include/driver/semihost.h +++ b/rtos/freeRTOS/vendors/gwt/libs/include/semihost.h @@ -20,6 +20,10 @@ #include #include +#ifdef __cplusplus +extern "C" { +#endif + enum semihosting_operation_numbers { /* * ARM/openocd semihosting operations. @@ -64,7 +68,7 @@ enum semihosting_operation_numbers { #define SEMIHOST_EXIT_SUCCESS 0x20026 #define SEMIHOST_EXIT_ERROR 0x20023 -extern long __syscall_error(long); +//extern long __syscall_error(long); /* riscv semihosting standard: * IN: a0 holds syscall number @@ -101,20 +105,52 @@ __internal_semihost(long n, long _a1) // roughly this is the last stage of printf: // print a string until '\0' -void semihost_write0(const char *print_string); - -int semihost_open(const char *name, int mode); +static inline void semihost_write0(const char *print_string) +{ + __internal_semihost(SEMIHOSTING_SYS_WRITE0, (long) print_string); +} -int semihost_close(int fd); +static inline int semihost_open(const char *name, int mode) +{ + uint32_t len = strlen(name); + volatile uint32_t args[3] = {(uint32_t)name,mode,len}; + return __internal_semihost(SEMIHOSTING_SYS_OPEN, (long) args); +} -int semihost_read(int fd, uint8_t *buffer, int len); +static inline int semihost_close(int fd) +{ + //uint32_t args[3] = {name,mode,len}; + return __internal_semihost(SEMIHOSTING_SYS_CLOSE, (long) fd); +} -int semihost_write(int fd, uint8_t *buffer, int len); +static inline int semihost_read(int fd, uint8_t *buffer, int len) +{ + volatile uint32_t args[3] = 
{(uint32_t)fd,(uint32_t)buffer,(uint32_t)len}; + return __internal_semihost(SEMIHOSTING_SYS_READ, (long) args); +} -int semihost_seek(int fd, uint32_t pos); +static inline int semihost_write(int fd, uint8_t *buffer, int len) +{ + volatile uint32_t args[3] = {(uint32_t)fd,(uint32_t)buffer,(uint32_t)len}; + return __internal_semihost(SEMIHOSTING_SYS_WRITE, (long) args); +} -int semihost_flen(int fd); +static inline int semihost_seek(int fd, uint32_t pos) +{ + volatile uint32_t args[2] = {(uint32_t)fd,pos}; + return __internal_semihost(SEMIHOSTING_SYS_SEEK, (long) args); +} -int semihost_exit(int code); +static inline int semihost_flen(int fd) +{ + return __internal_semihost(SEMIHOSTING_SYS_FLEN, (long) fd); +} +static inline int semihost_exit(int code) +{ + return __internal_semihost(SEMIHOSTING_SYS_EXIT, (long) code); +} +#ifdef __cplusplus +} +#endif #endif diff --git a/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis/backend/pmsis_backend_native_task_api.h b/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis/backend/pmsis_backend_native_task_api.h index 3fc2f7763..7ce9bfd03 100644 --- a/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis/backend/pmsis_backend_native_task_api.h +++ b/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis/backend/pmsis_backend_native_task_api.h @@ -154,7 +154,7 @@ static inline int __os_native_api_sync_obj_deinit(void *sync_obj) static inline void __os_native_api_sync_obj_take(void *sync_obj) { - ulTaskNotifyTake(pdTRUE, portMAX_DELAY); + ulTaskNotifyTakeIndexed(1, pdTRUE, portMAX_DELAY); } static inline void __os_native_api_sync_obj_release(void *sync_obj) @@ -162,7 +162,7 @@ static inline void __os_native_api_sync_obj_release(void *sync_obj) uint32_t irq = __disable_irq(); BaseType_t higher_priority_task_woken = pdFALSE; TaskHandle_t task_handler = (TaskHandle_t) sync_obj; - vTaskNotifyGiveFromISR(task_handler, &higher_priority_task_woken); + vTaskNotifyGiveIndexedFromISR(task_handler, 1, &higher_priority_task_woken); 
portYIELD_FROM_ISR(higher_priority_task_woken); __restore_irq(irq); } diff --git a/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk b/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk index c2225d9c6..e2e16f5aa 100644 --- a/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk +++ b/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk @@ -413,7 +413,7 @@ flash: $(BIN) flash_noforce: $(BIN) gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --flash --binary=$(BIN) $(runner_args) -flash_fs: $(BIN) +flash_fs: $(BIN) image gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --flash --binary=$(BIN) $(runner_args) $(WSL_ENV) image: $(BIN) diff --git a/rtos/pmsis/pmsis_bsp/CMakeLists.txt b/rtos/pmsis/pmsis_bsp/CMakeLists.txt index 77c951b92..b2abc381a 100644 --- a/rtos/pmsis/pmsis_bsp/CMakeLists.txt +++ b/rtos/pmsis/pmsis_bsp/CMakeLists.txt @@ -1,5 +1,5 @@ set(BSP_READFS_SRC fs/read_fs/read_fs.c) -set(BSP_HOSTFS_SRC fs/host_fs/semihost.c fs/host_fs/host_fs.c) +set(BSP_HOSTFS_SRC fs/host_fs/host_fs.c) set(BSP_LFS_SRC fs/lfs/lfs.c fs/lfs/lfs_util.c fs/lfs/pi_lfs.c) set(BSP_FS_SRC fs/fs.c) set(BSP_FLASH_SRC diff --git a/rtos/pmsis/pmsis_bsp/flash/mram/mram-v2.c b/rtos/pmsis/pmsis_bsp/flash/mram/mram-v2.c index 6fb99e928..3cddb5001 100644 --- a/rtos/pmsis/pmsis_bsp/flash/mram/mram-v2.c +++ b/rtos/pmsis/pmsis_bsp/flash/mram/mram-v2.c @@ -338,7 +338,9 @@ static int mram_open(struct pi_device *device) // In XIP mode, we need to lock XIP refills to avoid having a read while the flash is doing the program operation. 
udma_mram_trans_mode_set(base, UDMA_MRAM_TRANS_MODE_AUTO_ENA(1) | UDMA_MRAM_TRANS_MODE_XIP_EN(1) | UDMA_MRAM_TRANS_MODE_XIP_AUTO_HALTED(1)); #else - udma_mram_trans_mode_set(base, UDMA_MRAM_TRANS_MODE_AUTO_ENA(1)); + udma_mram_trans_mode_set(base, UDMA_MRAM_TRANS_MODE_AUTO_ENA(1) + | UDMA_MRAM_TRANS_MODE_XIP_EN(conf->xip_en) + | UDMA_MRAM_TRANS_MODE_XIP_AUTO_HALTED(conf->xip_en)); #endif #ifndef CONFIG_XIP_MRAM @@ -896,4 +898,5 @@ void pi_mram_conf_init(struct pi_mram_conf *conf) conf->flash.api = &mram_api; conf->itf = 0; conf->baudrate = 15000000; + conf->xip_en = 0; } diff --git a/rtos/pmsis/pmsis_bsp/fs/host_fs/semihost.c b/rtos/pmsis/pmsis_bsp/fs/host_fs/semihost.c deleted file mode 100644 index 016237eee..000000000 --- a/rtos/pmsis/pmsis_bsp/fs/host_fs/semihost.c +++ /dev/null @@ -1,50 +0,0 @@ -#include "semihost.h" -#include "string.h" - -// roughly this is the last stage of printf: -// print a string until '\0' -void semihost_write0(const char *print_string) -{ - __internal_semihost(SEMIHOSTING_SYS_WRITE0, (long) print_string); -} - -int semihost_open(const char *name, int mode) -{ - uint32_t len = strlen(name); - volatile uint32_t args[3] = {(uint32_t)name,mode,len}; - return __internal_semihost(SEMIHOSTING_SYS_OPEN, (long) args); -} - -int semihost_close(int fd) -{ - //uint32_t args[3] = {name,mode,len}; - return __internal_semihost(SEMIHOSTING_SYS_CLOSE, (long) fd); -} - -int semihost_read(int fd, uint8_t *buffer, int len) -{ - volatile uint32_t args[3] = {(uint32_t)fd,(uint32_t)buffer,(uint32_t)len}; - return __internal_semihost(SEMIHOSTING_SYS_READ, (long) args); -} - -int semihost_write(int fd, uint8_t *buffer, int len) -{ - volatile uint32_t args[3] = {(uint32_t)fd,(uint32_t)buffer,(uint32_t)len}; - return __internal_semihost(SEMIHOSTING_SYS_WRITE, (long) args); -} - -int semihost_seek(int fd, uint32_t pos) -{ - volatile uint32_t args[2] = {(uint32_t)fd,pos}; - return __internal_semihost(SEMIHOSTING_SYS_SEEK, (long) args); -} - -int semihost_flen(int 
fd) -{ - return __internal_semihost(SEMIHOSTING_SYS_FLEN, (long) fd); -} - -int semihost_exit(int code) -{ - return __internal_semihost(SEMIHOSTING_SYS_EXIT, (long) code); -} diff --git a/rtos/pmsis/pmsis_bsp/fs/host_fs/semihost.h b/rtos/pmsis/pmsis_bsp/fs/host_fs/semihost.h index fba61196c..4d58a542e 100644 --- a/rtos/pmsis/pmsis_bsp/fs/host_fs/semihost.h +++ b/rtos/pmsis/pmsis_bsp/fs/host_fs/semihost.h @@ -98,23 +98,52 @@ __internal_semihost(long n, long _a1) #endif } - // roughly this is the last stage of printf: // print a string until '\0' -void semihost_write0(const char *print_string); +static inline void semihost_write0(const char *print_string) +{ + __internal_semihost(SEMIHOSTING_SYS_WRITE0, (long) print_string); +} -int semihost_open(const char *name, int mode); +static inline int semihost_open(const char *name, int mode) +{ + uint32_t len = strlen(name); + volatile uint32_t args[3] = {(uint32_t)name,mode,len}; + return __internal_semihost(SEMIHOSTING_SYS_OPEN, (long) args); +} -int semihost_close(int fd); +static inline int semihost_close(int fd) +{ + //uint32_t args[3] = {name,mode,len}; + return __internal_semihost(SEMIHOSTING_SYS_CLOSE, (long) fd); +} -int semihost_read(int fd, uint8_t *buffer, int len); +static inline int semihost_read(int fd, uint8_t *buffer, int len) +{ + volatile uint32_t args[3] = {(uint32_t)fd,(uint32_t)buffer,(uint32_t)len}; + return __internal_semihost(SEMIHOSTING_SYS_READ, (long) args); +} -int semihost_write(int fd, uint8_t *buffer, int len); +static inline int semihost_write(int fd, uint8_t *buffer, int len) +{ + volatile uint32_t args[3] = {(uint32_t)fd,(uint32_t)buffer,(uint32_t)len}; + return __internal_semihost(SEMIHOSTING_SYS_WRITE, (long) args); +} -int semihost_seek(int fd, uint32_t pos); +static inline int semihost_seek(int fd, uint32_t pos) +{ + volatile uint32_t args[2] = {(uint32_t)fd,pos}; + return __internal_semihost(SEMIHOSTING_SYS_SEEK, (long) args); +} -int semihost_flen(int fd); +static inline int 
semihost_flen(int fd) +{ + return __internal_semihost(SEMIHOSTING_SYS_FLEN, (long) fd); +} -int semihost_exit(int code); +static inline int semihost_exit(int code) +{ + return __internal_semihost(SEMIHOSTING_SYS_EXIT, (long) code); +} #endif diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/flash/mram.h b/rtos/pmsis/pmsis_bsp/include/bsp/flash/mram.h index 4e57bc943..face1328e 100644 --- a/rtos/pmsis/pmsis_bsp/include/bsp/flash/mram.h +++ b/rtos/pmsis/pmsis_bsp/include/bsp/flash/mram.h @@ -49,6 +49,7 @@ struct pi_mram_conf int itf; /*!< Mram interface where the flash is connected. */ int baudrate; /*!< Baudrate in byte/s. */ + int xip_en; }; /** \brief Initialize an Mram configuration with default values. diff --git a/rtos/pmsis/pmsis_bsp/src.mk b/rtos/pmsis/pmsis_bsp/src.mk index 505fe287f..326ee544c 100644 --- a/rtos/pmsis/pmsis_bsp/src.mk +++ b/rtos/pmsis/pmsis_bsp/src.mk @@ -1,5 +1,5 @@ BSP_READFS_SRC = fs/read_fs/read_fs.c -BSP_HOSTFS_SRC = fs/host_fs/semihost.c fs/host_fs/host_fs.c +BSP_HOSTFS_SRC = fs/host_fs/host_fs.c BSP_LFS_SRC = fs/lfs/lfs.c fs/lfs/lfs_util.c fs/lfs/pi_lfs.c BSP_FS_SRC = fs/fs.c BSP_FLASH_SRC = flash/flash.c partition/partition.c partition/flash_partition.c \ diff --git a/rtos/pmsis/pmsis_bsp/zephyr/CMakeLists.txt b/rtos/pmsis/pmsis_bsp/zephyr/CMakeLists.txt index 69a891dcc..dcf7d9e70 100644 --- a/rtos/pmsis/pmsis_bsp/zephyr/CMakeLists.txt +++ b/rtos/pmsis/pmsis_bsp/zephyr/CMakeLists.txt @@ -8,7 +8,6 @@ zephyr_sources( ../fs/read_fs/read_fs.c ../fs/fs.c ../fs/host_fs/host_fs.c - ../fs/host_fs/semihost.c ../flash/flash.c ../flash/hyperflash/hyperflash.c ../ram/ram.c @@ -20,4 +19,4 @@ zephyr_compile_options( -DCONFIG_GAPUINO ) -zephyr_include_directories(../include) \ No newline at end of file +zephyr_include_directories(../include) diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c.c b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c.c index 566c8c84a..7bad66e79 100644 --- a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c.c 
+++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c.c @@ -342,7 +342,7 @@ static int __pi_i2c_prepare_write_read_buf(i2c_slave_data_t *slave_data, buffer[index++] = I2C_CMD_LEAD_START(1); buffer[index++] = I2C_CMD_LEAD_SEND_IMM(slave_data->slave_addrh|1); } - buffer[index++] = I2C_CMD_RPT(size1); + buffer[index++] = I2C_CMD_RPT(size1-1); // receive -1 byte because there is a "last" buffer[index++] = I2C_CMD_MISC_RECEIVE(1); buffer[index++] = I2C_CMD_MISC_RECEIVE_LAST(1); diff --git a/tools/autotiler_v3/Autotiler/AutoTilerLibTypes.h b/tools/autotiler_v3/Autotiler/AutoTilerLibTypes.h index e3d807233..4f4395b0d 100644 --- a/tools/autotiler_v3/Autotiler/AutoTilerLibTypes.h +++ b/tools/autotiler_v3/Autotiler/AutoTilerLibTypes.h @@ -49,6 +49,7 @@ typedef enum { KOP_DP_REDUCT_NOSCALE, KOP_DP_REDUCT_CHW2HWC, KOP_DP_REDUCT_IO, + KOP_DP_REDUCT_IO_NOSCALE, KOP_DP_REDUCT_MULBIAS, KOP_DP_REDUCT_IO_MULBIAS, KOP_DP_REDUCT_MULBIAS_SCALAR, diff --git a/tools/autotiler_v3/CNN_Generators/CNN_Copy_Generators.c b/tools/autotiler_v3/CNN_Generators/CNN_Copy_Generators.c index a6030494d..0f69bba89 100644 --- a/tools/autotiler_v3/CNN_Generators/CNN_Copy_Generators.c +++ b/tools/autotiler_v3/CNN_Generators/CNN_Copy_Generators.c @@ -721,6 +721,8 @@ static int CNN_MatTranspose_Internal( add_kernel_arg_func_t AddKArgDimFunc = AddKernelArgDim; cnn_kernel_arg_datatype_func_t CNN_ArgDtype = CNN_ArgDataType; + if (Size < 0) CNN_ArgDtype = CNN_ArgDataTypeUns; + if (Ctrl) { if (Ctrl->TileOrientation != -1) TileOrientation = (Ctrl->TileOrientation==0)?TILE_HOR:TILE_VER; if (Ctrl->ParallelFeatures != -1) ParFeat = Ctrl->ParallelFeatures; @@ -731,6 +733,7 @@ static int CNN_MatTranspose_Internal( if (HWC) { return CNN_3DTensorPermute(Name, Ctrl, Feat, Size, Width, Height, KOP_MATPERM_HWC2WHC); } + if (Size < 0) Size = -Size; unsigned long long int LayerOp = Width*Height*Feat*Size; unsigned long long int LayerBandwidth = 0; @@ -890,6 +893,11 @@ int CNN_3DTensorPermute( add_kernel_arg_func_t 
AddKArgDimFunc = AddKernelArgDim; cnn_kernel_arg_datatype_func_t CNN_ArgDtype = CNN_ArgDataType; + if (Size < 0) { + CNN_ArgDtype = CNN_ArgDataTypeUns; + Size = -Size; + } + if (Ctrl) { if (Ctrl->HWC != -1) HWC = Ctrl->HWC; if (Ctrl->FloatDump != -1&&Ctrl->FloatDump) AddKArgDimFunc = AddKernelFloatArgDim; diff --git a/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c b/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c index ef93bf8b1..0edf74202 100644 --- a/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c +++ b/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c @@ -316,8 +316,6 @@ Kernel_T *CNN_MM_ConvolutionNE16( char *ConvKerName=0, *PoolKerName=0, *ActKerName=0, *SetBiasKerName=0, *DPReductionKerName=0; int NeedFcx, NeedFcy, NeedDcx, NeedDcy, NeedScx, NeedScy, NeedFpx, NeedFpy, NeedDpx, NeedDpy, NeedSpx, NeedSpy; int UsedWidth, UsedHeight, UsedWc, UsedHc; - - unsigned int InTileCons = 16; int OutTileCons = 32; int StandAloneAct = (ActOper!=KOP_NONE); unsigned long long int LayerOp = 0; @@ -331,11 +329,18 @@ Kernel_T *CNN_MM_ConvolutionNE16( if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU || ActOper == KOP_SIGMOID || ActOper == KOP_TANH)) GenTilingError("CNN_MM_ConvolutionNE16 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_HSIGMOID, KOP_HSWISH, KOP_LEAKYRELU, KOP_SIGMOID or KOP_TANH", Name); - Wa |= O_NE16_LIN | O_LINEAR; + Wa |= O_NE16_LIN | O_LINEAR; + int Mode16 = (Abs(In_DataSize) == 2); + if (Mode16) { + Wa |= O_NE16_MODE16; + } + + unsigned int InTileCons = Mode16?8:16; + int NeedSetBias = Mode16; /* When there is a special activation (not supported by the accelerator itself), you need to streamout 32bits and do the act in the cluster but the ((*S) >> N) is done in the accelerator (KOP_DP_REDUCT_NOSCALE) */ int NeedReductNoScale = !(ActOper == KOP_RELU || 
ActOper == KOP_NONE); /* Also when in/out are 16bits you need to streamout 32bits but here the reduction step will be done in the cluster (KOP_DP_REDUCT) */ - int NeedReductScale = Abs(In_DataSize) == 2; + int NeedReductScale = Mode16; int NeedReduct = NeedReductNoScale || NeedReductScale; CNN_LayerOutputDim(Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, @@ -376,26 +381,38 @@ Kernel_T *CNN_MM_ConvolutionNE16( ConvKerName = CNN_FindMatchingKernelAttr(KOP_MM_CONV, KOP_NONE, ParFeat, CALL_NE16_KER, Abs(In_DataSize), Abs(Out_DataSize), Bias_DataSize, 0, 4, Fcx, Fcy, Dcx, Dcy, Scx, Scy, &NeedFcx, &NeedFcy, &NeedDcx, &NeedDcy, &NeedScx, &NeedScy, 0); - if (ConvKerName==0) GenTilingError("CNN_MM_ConvolutionNE16 Kernel: %s, Can't find a matching Convolution basic kernel", Name); - if (PoolOper==KOP_MAXPOOL) { - PoolKerName = CNN_FindMatchingKernelAttr(PoolOper, KOP_NONE, 1, CALL_HWC_KER, In_DataSize, 0, 0, 0, Out_DataSize, Fpx, Fpy, Dpx, Dpy, Spx, Spy, + if (ConvKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Can't find a matching Convolution basic kernel", Name); + + if (PoolOper==KOP_MAXPOOL || PoolOper==KOP_AVGPOOL) { + PoolKerName = CNN_FindMatchingKernelAttr(PoolOper, NeedReduct?KOP_NONE:ActOper, 1, CALL_HWC_KER, In_DataSize, 0, 0, 0, Out_DataSize, Fpx, Fpy, Dpx, Dpy, Spx, Spy, &NeedFpx, &NeedFpy, &NeedDpx, &NeedDpy, &NeedSpx, &NeedSpy, 0); - if (PoolKerName==0) GenTilingError("CNN_MM_ConvolutionNE16 Kernel: %s, Can't find a matching Pooling basic kernel", Name); + if (PoolKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Can't find a matching Pooling basic kernel", Name); + if (NeedReduct) { + DPReductionKerName = CNN_FindMatchingKernelAttr(NeedReductScale?KOP_DP_REDUCT_IO:KOP_DP_REDUCT_IO_NOSCALE, ActOper, 1, CALL_HWC_KER, + 4, 0, 0, 0, Out_DataSize, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + if (DPReductionKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 
Kernel: %s, Can't find a matching Reduction basic kernel %d %s", Name, Out_DataSize, NeedReductNoScale?"NoScale":"Scale"); + } + + } else if (NeedReduct) { + DPReductionKerName = CNN_FindMatchingKernelAttr(NeedReductScale?KOP_DP_REDUCT:KOP_DP_REDUCT_NOSCALE, ActOper, 1, CALL_HWC_KER, + 4, 0, 0, 0, Out_DataSize, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + if (DPReductionKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Can't find a matching Reduction basic kernel %d %s", Name, Out_DataSize, NeedReductNoScale?"NoScale":"Scale"); } - if (NeedReduct) { - DPReductionKerName = CNN_FindMatchingKernelAttr(NeedReductNoScale?KOP_DP_REDUCT_NOSCALE:KOP_DP_REDUCT, ActOper, 1, CALL_HWC_KER, 4, 0, 0, 0, Out_DataSize, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - if (DPReductionKerName==0) GenTilingError("CNN_MM_ConvolutionNE16 Kernel: %s, Can't find a matching Reduction basic kernel %d", Name, Out_DataSize); + if (NeedSetBias) { + SetBiasKerName = CNN_FindMatchingKernelAttr(KOP_SETBIAS, KOP_NONE, ParFeat, CALL_HWC_KER, Bias_DataSize,0,0,0,4, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (SetBiasKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Can't find a matching SetBias basic kernel", Name); } + // If pooling you need an extra buffer for convout but reduction can be done in the accelerator - int NeedConvout = NeedReduct || PoolKerName; + int NeedConvout = NeedReduct || NeedSetBias || PoolKerName; unsigned int Cos = NeedConvout?4:1; if (Log) { printf("InFeat: %d%s, OutFeat: %d, InFeatCons: %d\n", InFeat, " Im2Col", OutFeat, InTileCons); printf("Conv => W: %4d, Pad:[%d,%d] PadT:[%d,%d] => Wc: %d, Filter:[%d,%d]x%d Bits\n", Width, PadInc[0], PadInc[1], PadIncT[0], PadIncT[1], Wc, Fcx, Fcy, Filter_DataSizeBits); printf(" => H: %4d, Pad:[%d,%d] PadT:[%d,%d] => Hc: %d\n", Height, PadInc[2], PadInc[3], PadIncT[2], PadIncT[3], Hc); - printf(" ConvOut_DataSize: %d\n", Cos); - printf("Pool => Wc: %4d, Pad:[%d,%d] => Wo: %d, Filter:[%d,%d]\n", UsedWc, PadInp[0], 
PadInp[1], Wo, Fpx, Fpy); + printf("%s -- >ConvOut_DataSize: %d\n", NeedConvout?"NeedConvOut":"NoConvOut", Cos); + printf("Pool => Wc: %4d, Pad:[%d,%d] => Wo: %d, Filter:[%d,%d] %d\n", UsedWc, PadInp[0], PadInp[1], Wo, Fpx, Fpy, Mode16); printf(" => Hc: %4d, Pad:[%d,%d] => Ho: %d\n", UsedHc, PadInp[2], PadInp[3], Ho); printf("OverlapC: %d\n", OverlapC); printf("OverlapP: %d\n", OverlapP); @@ -406,15 +423,14 @@ Kernel_T *CNN_MM_ConvolutionNE16( if (DPReductionKerName) printf("%20s: %s\n", "DPReductionKerName", DPReductionKerName); if (PoolKerName) printf("%20s: %s\n", "PoolKerName", PoolKerName); printf("Nb Oper : %lld\n", LayerOp); - printf("NeedConvout: %d\n", NeedConvout); + } /* User kernel C arguments */ CKernel_Arg_T **KCArgs = AllocateCArgs(7); Kernel_T *Kernel; - int StreamoutMode = 1; // Streamout = apply *Scale >> ScaleN - int Mode16 = (Abs(In_DataSize) == 2); - int Streamin = 0; // Streamin initialized at 0, set to 1 in the basic kernel if multiple chin tile + int StreamoutMode = !Mode16; // Streamout = apply *Scale >> ScaleN + int Streamin = Mode16; // Streamin initialized at 0, set to 1 in the basic kernel if multiple chin tile int FilterMode = 3; int LinearMode = 1; int StridedMode = 0; @@ -424,7 +440,7 @@ Kernel_T *CNN_MM_ConvolutionNE16( int QuantBits = (NeedReduct)?2:(Abs(Out_DataSize)==2?1:0); // 00: 8bit, 01: 16bit, 10: 32bit --> If tiling the channel input dimension you need to streamin (need 32 bits output) int QuantNoRect = (NeedReduct || (Out_DataSize>0))?1:0; int NormShift = 1; - int NormBias = 1; + int NormBias = !Mode16; unsigned int DEFAULT_NE16_JOB_CFG = NE16_DefaultConfig(Filter_DataSizeBits, Mode16, StreamoutMode, FilterMode, LinearMode, StridedMode, NormBits, Streamin, \ WOffsetCfg, QuantRightShift, QuantBits, QuantNoRect, NormShift, NormBias); @@ -440,9 +456,19 @@ Kernel_T *CNN_MM_ConvolutionNE16( TCArg(CNN_ArgDataType(1, 1,1), "ScaleN"), TCArg(CNN_ArgDataType(1, 1,1), "Infos") ), - Calls(6, + Calls(7, Call("NE16_Enable", 
LOC_D1_PROLOG, Bindings(0)), Call("NE16_SoftReset", LOC_D0, Bindings(0)), + SetBiasKerName?Call(SetBiasKerName, LOC_D0, + Bindings(6, + K_Arg("ConvOut", KER_ARG_TILE), /* SetBias output tile */ + K_Arg("ConvOut", KER_ARG_TILE_W), /* SetBias output tile width */ + K_Arg("ConvOut", KER_ARG_TILE_H), /* SetBias output tile height */ + ParFeat?K_ArgPar("ConvOut", KER_ARG_PARTILE_SIZE, D1):Imm(1), /* Number of output features in this tile */ + K_Arg("Bias", KER_ARG_TILE), /* SetBias Bias tile */ + K_TileOper("Infos", "char *", '@', AT_INF_BIASN) /* Bias Norm */ + ) + ):AT_NO_CALL, Call(ConvKerName, LOC_D0, Bindings(28, K_Arg("In", KER_ARG_TILE), /* Conv input tile */ @@ -452,11 +478,11 @@ Kernel_T *CNN_MM_ConvolutionNE16( K_Arg(NeedConvout?"ConvOut":"Out", KER_ARG_TILE), /* Conv output */ K_Arg("Scale", KER_ARG_TILE), /* Per channel scale tile */ K_Arg("ScaleN", KER_ARG_TILE), /* Per channel scale normalization tile */ - K_ArgPar("In", KER_ARG_PARTILE_SIZE, D0), /* Number of input features in this tile */ - K_ArgPar("In", KER_ARG_LOADEDPARTILE_SIZE, D0), /* Total Number of loaded input features in case of promotion */ + K_ArgPar("In", KER_ARG_PARTILE_SIZE, D0), /* Number of input features in this tile */ + K_ArgPar("In", KER_ARG_LOADEDPARTILE_SIZE, D0), /* Total Number of loaded input features in case of promotion */ K_Arg("In", KER_ARG_TILE_H), /* Conv input tile height */ K_Arg("In", KER_ARG_TILE_W), /* Conv input tile width */ - K_ArgPar(NeedConvout?"ConvOut":"Out", KER_ARG_PARTILE_SIZE, D1), /* Number of output features in this tile */ + K_ArgPar(NeedConvout?"ConvOut":"Out", KER_ARG_PARTILE_SIZE, D1), /* Number of output features in this tile */ K_Arg(NeedConvout?"ConvOut":"Out", KER_ARG_TILE_H), K_Arg(NeedConvout?"ConvOut":"Out", KER_ARG_TILE_W), Imm(PadValue), @@ -490,7 +516,7 @@ Kernel_T *CNN_MM_ConvolutionNE16( ), (PoolKerName==0)?AT_NO_CALL: Call(PoolKerName, LOC_D0_EPILOG, - Bindings(13, + Bindings(14, K_Arg("ConvOut", KER_ARG_TILE), K_Arg("ConvOut", 
KER_ARG_TILE_W), K_Arg("ConvOut", KER_ARG_TILE_H), @@ -503,22 +529,23 @@ Kernel_T *CNN_MM_ConvolutionNE16( K_Arg("Out", KER_ARG_TILE), /* Pooling output tile */ K_ArgPar("ConvOut", KER_ARG_PARTILE_SIZE, D1), /* In Features */ K_Arg("Out", KER_ARG_TILE_W), /* Output tile width */ - K_Arg("Out", KER_ARG_TILE_H) /* Output tile height */ + K_Arg("Out", KER_ARG_TILE_H), /* Output tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ ) ), Call("NE16_Disable", LOC_D1_EPILOG, Bindings(0)) ), KerArgs(9, - KerArgPV("In", KerArgSpace(2,T0,D0), O_IN|O_DB|O_HWC, Width, Height, UsedWidth, UsedHeight, PadIncT, PadInc, PadValue, Abs(In_DataSize), OverlapC, 0, 0, "In"), - KerArg ("ColBuff",KerArgSpace(1,T0), O_BUFF|O_NTILED, BuffS, 1, 1, 0, 0, 0, 0), - KerArg ("Bias", KerArgSpace(1,D1), O_IN|O_DB|O_CONST, 1, 1, Bs, 0, 0, 0, "Bias"), - KerArg ("Scale", KerArgSpace(1,D1), O_IN|O_DB|O_CONST, 1, 1, 1, 0, 0, 0, "Scale"), - KerArg ("ScaleN", KerArgSpace(1,D1), O_IN|O_DB|O_CONST, 1, 1, 1, 0, 0, 0, "ScaleN"), - KerArg ("Filter", KerArgSpace(1,D1), O_IN|O_DB|O_CONST|Wa, 1, WBuffSize, Ws, 0, 0, 0, "Filter"), + KerArgPV("In", KerArgSpace(2,T0,D0), O_IN|O_DB|O_HWC, Width, Height, UsedWidth, UsedHeight, PadIncT, PadInc, PadValue, Abs(In_DataSize), OverlapC, 0, 0, "In"), + KerArg ("ColBuff",KerArgSpace(1,T0), O_BUFF|O_NTILED, BuffS, 1, 1, 0, 0, 0, 0), + KerArg ("Bias", KerArgSpace(1,D1), O_IN|O_DB|O_CONST, 1, 1, Bs, 0, 0, 0, "Bias"), + KerArg ("Scale", KerArgSpace(1,D1), O_IN|O_DB|O_CONST, 1, 1, 1, 0, 0, 0, "Scale"), + KerArg ("ScaleN", KerArgSpace(1,D1), O_IN|O_DB|O_CONST, 1, 1, 1, 0, 0, 0, "ScaleN"), + KerArg ("Filter", KerArgSpace(1,D1), O_IN|O_DB|O_CONST|Wa, 1, WBuffSize, Ws, 0, 0, 0, "Filter"), NeedConvout? 
- KerArgP("ConvOut",KerArgSpace(2,T0,D1), O_BUFF|O_ONETILE|O_HWC,Wc, Hc, UsedWc, UsedHc, PadInp, PadInp, Cos, OverlapP, 0, 0, ""):AT_NO_KER_ARG, - KerArg ("Out", KerArgSpace(2,T0,D1), O_OUT|O_DB|O_HWC, Wo, Ho, Abs(Out_DataSize), 0, 0, 0, "Out"), - KerArg ("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|O_CONST, AT_INF_NE16_DIM, 1, 1, 0, 0, 0, "Infos") + KerArgP("ConvOut",KerArgSpace(2,T0,D1), O_BUFF|O_ONETILE|O_HWC,Wc, Hc, UsedWc, UsedHc, PadInp, PadInp, Cos, OverlapP, 0, 0, ""):AT_NO_KER_ARG, + KerArg ("Out", KerArgSpace(2,T0,D1), O_OUT|O_DB|O_HWC, Wo, Ho, Abs(Out_DataSize), 0, 0, 0, "Out"), + KerArg ("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|O_CONST, AT_INF_NE16_DIM,1, 1, 0, 0, 0, "Infos") ) ); if (Kernel) { @@ -680,14 +707,18 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( ConvKerName = CNN_FindMatchingKernelAttr(ConvOper, KOP_NONE, ParFeat, CALL_NE16_KER, Abs(In_DataSize), Abs(Out_DataSize), Bias_DataSize, 0, 4, Fcx, Fcy, Dcx, Dcy, Scx, Scy, &NeedFcx, &NeedFcy, &NeedDcx, &NeedDcy, &NeedScx, &NeedScy, 0); if (ConvKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Can't find a matching Convolution basic kernel", Name); - int BasicNE16Mode = 0; - // if ((Fcx==1 && Fcy==1) || (Fcx==3 && Fcy==3 && Scx==1 && Scy==1)) {BasicNE16Mode = 1; printf("BASIC MODE\n");} + if (PoolOper==KOP_MAXPOOL || PoolOper==KOP_AVGPOOL) { - PoolKerName = CNN_FindMatchingKernelAttr(PoolOper, KOP_NONE, 1, CALL_HWC_KER, In_DataSize, 0, 0, 0, Out_DataSize, Fpx, Fpy, Dpx, Dpy, Spx, Spy, + PoolKerName = CNN_FindMatchingKernelAttr(PoolOper, NeedReduct?KOP_NONE:ActOper, 1, CALL_HWC_KER, In_DataSize, 0, 0, 0, Out_DataSize, Fpx, Fpy, Dpx, Dpy, Spx, Spy, &NeedFpx, &NeedFpy, &NeedDpx, &NeedDpy, &NeedSpx, &NeedSpy, 0); if (PoolKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Can't find a matching Pooling basic kernel", Name); - } - if (NeedReduct) { + if (NeedReduct) { + DPReductionKerName = 
CNN_FindMatchingKernelAttr(NeedReductScale?KOP_DP_REDUCT_IO:KOP_DP_REDUCT_IO_NOSCALE, ActOper, 1, CALL_HWC_KER, + 4, 0, 0, 0, Out_DataSize, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + if (DPReductionKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Can't find a matching Reduction basic kernel %d %s", Name, Out_DataSize, NeedReductNoScale?"NoScale":"Scale"); + } + + } else if (NeedReduct) { DPReductionKerName = CNN_FindMatchingKernelAttr(NeedReductScale?KOP_DP_REDUCT:KOP_DP_REDUCT_NOSCALE, ActOper, 1, CALL_HWC_KER, 4, 0, 0, 0, Out_DataSize, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); if (DPReductionKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Can't find a matching Reduction basic kernel %d %s", Name, Out_DataSize, NeedReductNoScale?"NoScale":"Scale"); @@ -698,7 +729,7 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( } // If pooling you need an extra buffer for convout but reduction can be done in the accelerator - int NeedConvout = NeedReduct || PoolKerName || NeedSetBias; + int NeedConvout = NeedReduct || NeedSetBias || PoolKerName; unsigned int Cos = NeedConvout?4:1; if (Log) { @@ -750,7 +781,7 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( KCArgs[Ca++] = TCArg(CNN_ArgDataType(1, 1,1), "Infos"); /* User kernel kernel arguments */ - Object_T **KArgs = AllocateKerArgs(NeedConvout?(PoolKerName?9:8):7); + Object_T **KArgs = AllocateKerArgs(NeedConvout?8:7); int Ka=0; KArgs[Ka++] = KerArgPV("In", KerArgSpace(2,T0,D0), O_IN|O_DB|O_HWC, Width, Height, UsedWidth, UsedHeight, PadIncT, PadInc, PadValue, Abs(In_DataSize), OverlapC, 0, TileCons, "In"); if (MinTileDim && (MinTileDim > TileCons)) SetKerArgMinTileSize(KArgs[Ka-1], MinTileDim); @@ -764,8 +795,6 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( } if (NeedConvout) KArgs[Ka++] = KerArgP("ConvOut",KerArgSpace(2,T0,Os), O_BUFF|O_ONETILE|O_HWC, Wc, Hc, UsedWc, UsedHc, PadInp, PadInp, Cos, OverlapP, 0, 0, ""); - if (NeedConvout && PoolKerName) - KArgs[Ka++] = KerArgP("ActOut", 
KerArgSpace(2,T0,Os), O_BUFF|O_ONETILE|O_HWC, Wc, Hc, UsedWc, UsedHc, PadInp, PadInp, 1, OverlapP, 0, 0, ""); KArgs[Ka++] = KerArg ("Out", KerArgSpace(2,T0,Os), O_OUT|O_DB|O_HWC, Wo, Ho, Abs(Out_DataSize),0,0, 0, "Out"); KArgs[Ka++] = KerArg ("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|O_CONST, AT_INF_NE16_DIM, 1, 1, 0, 0, 0, "Infos"); @@ -776,34 +805,9 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( KernelIterSpace(3, IterParSpace(D1, OutFeat, OutTileCons), IterTiledSpace(T0), IterParSpace(D0|InFeatProp, InFeat, InTileCons))), TileOrientation|TILE_HWC, KCArgs, - Calls(10, + Calls(8, Call("NE16_Enable", DWConv?LOC_D0_PROLOG:LOC_D1_PROLOG, Bindings(0)), - BasicNE16Mode?Call("NE16_SoftReset", DWConv?LOC_D0_PROLOG:LOC_D1_PROLOG, Bindings(0)):AT_NO_CALL, - BasicNE16Mode?Call("NE16_PrepareJob", DWConv?LOC_D0_PROLOG:LOC_D1_PROLOG, - Bindings(21, - K_Arg("In", KER_ARG_FIRST_TILE), - K_Arg("In", KER_ARG_FIRST_TILE_W), - K_Arg("In", KER_ARG_FIRST_TILE_H), - K_Arg("In", KER_ARG_FIRST_TILE_PAD), - K_Arg("Filter", KER_ARG_FIRST_TILE), - K_Arg("Bias", KER_ARG_FIRST_TILE), - K_Arg("Out", KER_ARG_FIRST_TILE), - K_Arg("Scale", KER_ARG_FIRST_TILE), - K_Arg("ScaleN", KER_ARG_FIRST_TILE), - K_Arg("Out", KER_ARG_FIRST_TILE_W), - K_Arg("Out", KER_ARG_FIRST_TILE_H), - K_ArgPar("In", KER_ARG_PARTILE_SIZE, D0), - K_ArgPar(NeedConvout?"ConvOut":"Out", KER_ARG_FIRST_PARTILE_SIZE, Os), - Imm(Filter_DataSizeBits), - Imm(DEFAULT_NE16_JOB_CFG), - K_TileOper("Infos", "int *", '@', AT_INF_NE16_WOFFSET/4), - Imm(PadValue), - Imm(1), - K_ArgParOper("In", KER_ARG_PARTILE_DIM, D0, '=', 1), - Imm(0), - Imm(0) - ) - ):AT_NO_CALL, + Call("NE16_SoftReset", DWConv?LOC_D0_PROLOG:LOC_D1_PROLOG, Bindings(0)), SetBiasKerName?Call(SetBiasKerName, DWConv?LOC_LOOP:LOC_D0, Bindings(6, K_Arg("ConvOut", KER_ARG_TILE), /* SetBias output tile */ @@ -815,8 +819,8 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( ) ):AT_NO_CALL, Call("NE16_SoftReset", DWConv?LOC_LOOP:LOC_D0, Bindings(0)), - 
Call(BasicNE16Mode?"NE16_FireJob":ConvKerName, DWConv?LOC_LOOP:LOC_D0, - Bindings(BasicNE16Mode?0:26, + Call(ConvKerName, DWConv?LOC_LOOP:LOC_D0, + Bindings(26, K_Arg("In", KER_ARG_TILE), /* Conv input tile */ K_Arg("Filter", KER_ARG_TILE), /* Conv filter */ K_Arg("Bias", KER_ARG_TILE), /* Conv Bias when depth wise conv*/ @@ -827,7 +831,7 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( K_ArgPar("Filter", KER_ARG_LOADEDPARTILE_SIZE, D0), /* Total Number of loaded input features in case of promotion */ K_Arg("In", KER_ARG_TILE_H), /* Conv input tile height */ K_Arg("In", KER_ARG_TILE_W), /* Conv input tile width */ - K_ArgPar(NeedConvout?"ConvOut":"Out", KER_ARG_PARTILE_SIZE, Os), /* Number of output features in this tile */ + K_ArgPar(NeedConvout?"ConvOut":"Out", KER_ARG_PARTILE_SIZE, Os), /* Number of output features in this tile */ K_Arg(NeedConvout?"ConvOut":"Out", KER_ARG_TILE_H), K_Arg(NeedConvout?"ConvOut":"Out", KER_ARG_TILE_W), Imm(PadValue), @@ -845,36 +849,11 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( NeedDcy?Imm(Dcy):AT_IGNORE_ARG_BINDING /* Pooling Dy */ ) ), - BasicNE16Mode?Call("NE16_PrepareJob", DWConv?LOC_LOOP:LOC_D0, - Bindings(21, - K_Arg("In", KER_ARG_NEXT_TILE), - K_Arg("In", KER_ARG_NEXT_TILE_W), - K_Arg("In", KER_ARG_NEXT_TILE_H), - K_Arg("In", KER_ARG_NEXT_TILE_PAD), - K_Arg("Filter", KER_ARG_NEXT_TILE), - K_Arg("Bias", KER_ARG_NEXT_TILE), - K_Arg("Out", KER_ARG_NEXT_TILE), - K_Arg("Scale", KER_ARG_NEXT_TILE), - K_Arg("ScaleN", KER_ARG_NEXT_TILE), - K_Arg("Out", KER_ARG_NEXT_TILE_W), - K_Arg("Out", KER_ARG_NEXT_TILE_H), - K_ArgPar("In", KER_ARG_NEXT_PARTILE_SIZE, D0), - K_ArgPar(NeedConvout?"ConvOut":"Out", KER_ARG_NEXT_PARTILE_SIZE, Os), - Imm(Filter_DataSizeBits), - Imm(DEFAULT_NE16_JOB_CFG), - K_TileOper("Infos", "int *", '@', AT_INF_NE16_WOFFSET/4), - Imm(PadValue), - K_ArgPred("In", KER_ARG_TILEFIRST, D0), - K_ArgPred("In", KER_ARG_NEXT_TILELAST, D0), - K_ArgPred("In", KER_ARG_NEXT_TILELAST, T0), - Imm(0) - ) - ):AT_NO_CALL, 
(NeedReduct==0)?AT_NO_CALL: - Call(DPReductionKerName, DWConv?LOC_LOOP_EPILOG:LOC_D0_EPILOG, /* DP Reduction also take care of optional activation */ + Call(DPReductionKerName, DWConv?LOC_LOOP_EPILOG:LOC_D0_EPILOG, /* DPReduction also take care of optional activation */ Bindings(8, K_Arg("ConvOut", KER_ARG_TILE), /* Double precision input tile */ - K_Arg(PoolOper?"ActOut":"Out", KER_ARG_TILE), /* Single precision output tile, warning use IO kernel when In=Out */ + K_Arg(PoolOper?"ConvOut":"Out", KER_ARG_TILE), /* Single precision output tile, warning use IO kernel when In=Out */ ParFeat?K_ArgPar("ConvOut", KER_ARG_PARTILE_SIZE, Os):Imm(1), /* Input tile Number of features */ K_Arg("ConvOut", KER_ARG_TILE_W), /* Input tile width */ K_Arg("ConvOut", KER_ARG_TILE_H), /* Input tile height */ @@ -885,20 +864,21 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( ), (PoolKerName==0)?AT_NO_CALL: Call(PoolKerName, DWConv?LOC_LOOP:LOC_D0_EPILOG, - Bindings(13, - K_Arg(NeedReduct?"ActOut":"ConvOut", KER_ARG_TILE), - K_Arg(NeedReduct?"ActOut":"ConvOut", KER_ARG_TILE_W), - K_Arg(NeedReduct?"ActOut":"ConvOut", KER_ARG_TILE_H), + Bindings(14, + K_Arg("ConvOut", KER_ARG_TILE), + K_Arg("ConvOut", KER_ARG_TILE_W), + K_Arg("ConvOut", KER_ARG_TILE_H), NeedFpx?Imm(Fpx):AT_IGNORE_ARG_BINDING, /* Pool Fx */ NeedFpy?Imm(Fpy):AT_IGNORE_ARG_BINDING, /* Pool Fy */ NeedSpx?Imm(Spx):AT_IGNORE_ARG_BINDING, /* Pool Stridex */ NeedSpy?Imm(Spy):AT_IGNORE_ARG_BINDING, /* Pool Stridey */ - K_ArgPred(NeedReduct?"ActOut":"ConvOut", KER_ARG_TILEFIRST, T0), /* First Tile */ - K_Arg(NeedReduct?"ActOut":"ConvOut", KER_ARG_TILE_PAD), /* Pool Padding */ + K_ArgPred("ConvOut", KER_ARG_TILEFIRST, T0), /* First Tile */ + K_Arg("ConvOut", KER_ARG_TILE_PAD), /* Pool Padding */ K_Arg("Out", KER_ARG_TILE), /* Pooling output tile */ - K_ArgPar(NeedReduct?"ActOut":"ConvOut", KER_ARG_PARTILE_SIZE, D1), /* In Features */ + K_ArgPar("ConvOut", KER_ARG_PARTILE_SIZE, D1), /* In Features */ K_Arg("Out", KER_ARG_TILE_W), /* 
Output tile width */ - K_Arg("Out", KER_ARG_TILE_H) /* Output tile height */ + K_Arg("Out", KER_ARG_TILE_H), /* Output tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ ) ), Call("NE16_Disable", DWConv?LOC_D0_EPILOG:LOC_D1_EPILOG, Bindings(0)) @@ -971,6 +951,10 @@ int CNN_ConvolutionNE16( CNN_LinearAct_NE16(Name, Ctrl, In_DataSize, Out_DataSize, Bias_DataSize, Scale_DataSize, Filter_DataSizeBits, InFeat, OutFeat, KOP_LINEAR, ActOper); return 1; } + int HWC = 0; + if (Ctrl) { + if (Ctrl->HWC != -1) HWC = Ctrl->HWC; + } unsigned int MinTile; unsigned int InTileCons; if (PoolOper==KOP_NONE) { @@ -987,6 +971,14 @@ int CNN_ConvolutionNE16( unsigned int Sol1TileCons = TileCons, Sol2TileCons = TileCons; AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); + if (HWC) { + printf("---------------------------------------------------------- CNN_ConvolutionNE16 MM ---------------------------------------------------------------------------\n"); + Ker = CNN_MM_ConvolutionNE16(Name, Ctrl, + In_DataSize, Out_DataSize, Bias_DataSize, Scale_DataSize, Filter_DataSizeBits, InFeat, OutFeat, Width, Height, + ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PadValue, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper); + if (Ker) return 1; + else printf("---------------------------------------------------------- MM NO SOLUTION FOUND ---------------------------------------------------------------------------\n"); + } printf("----------------------------------------------------------CNN_ConvolutionNE16------------------------------------------------------------------------------\n"); Ker = CNN_ConvolutionNE16_Internal(Name, Ctrl, In_DataSize, Out_DataSize, Bias_DataSize, Scale_DataSize, Filter_DataSizeBits, InFeat, OutFeat, Width, Height, diff --git a/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c b/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c index 44b3c2891..2298074f7 100644 --- a/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c +++ 
b/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c @@ -104,7 +104,7 @@ void LoadCNN_SQ8_Library() ); LibKernelTemplate("Ker_MM_Pool_SQ8_T", - CArgs(13, + CArgs(14, TCArg("signed char * __restrict__", "In"), TCArg("unsigned short int", "W"), TCArg("unsigned short int", "H"), @@ -117,11 +117,12 @@ void LoadCNN_SQ8_Library() TCArg("signed char * __restrict__", "Out"), TCArg("unsigned short int", "Feat"), TCArg("unsigned short int", "Wo"), - TCArg("unsigned short int", "Ho") + TCArg("unsigned short int", "Ho"), + TCArg("signed char * __restrict__", "Infos") ) ); LibKernelTemplate("Ker_MM_Pool_USQ8_T", - CArgs(13, + CArgs(14, TCArg("unsigned char * __restrict__", "In"), TCArg("unsigned short int", "W"), TCArg("unsigned short int", "H"), @@ -134,7 +135,8 @@ void LoadCNN_SQ8_Library() TCArg("unsigned char * __restrict__", "Out"), TCArg("unsigned short int", "Feat"), TCArg("unsigned short int", "Wo"), - TCArg("unsigned short int", "Ho") + TCArg("unsigned short int", "Ho"), + TCArg("signed char * __restrict__", "Infos") ) ); LibKernelTemplate("KerConvLinReduct_SQ8_T", @@ -558,12 +560,49 @@ void LoadCNN_SQ8_Library() LibKernel("KerParPoolNxMStrideSxSy_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); - LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", - CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_NONE), 1, - CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); - LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", - CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_NONE), 1, - CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + 
LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_ReLU_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_ReLUN_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_ReLUM_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_ReLUMN_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_LeakyReLU_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_HSwish_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_HSigmoid_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_Sigmoid_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_Tanh_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + + 
LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_ReLU_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_ReLUN_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_ReLUM_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_ReLUMN_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_LeakyReLU_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_HSwish_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_HSigmoid_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_Sigmoid_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + 
LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_Tanh_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_ReLU_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_ReLUN_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_ReLUM_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_ReLUMN_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_LeakyReLU_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_HSwish_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_HSigmoid_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(-1,0,0,0,-1), 
-1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_Sigmoid_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_Tanh_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_TANH), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_ReLU_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_ReLUN_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_ReLUM_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_ReLUMN_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_LeakyReLU_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_HSwish_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_HSWISH), 1, 
CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_HSigmoid_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_Sigmoid_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_Tanh_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_TANH), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); /* Global Pooling (Max or Avg) with tensor centric scaling and optional ReLU or ReLUN activation */ LibKernel("KerParGlobalMaxPoolFullFeat_SQ8", CALL_PARALLEL, 0, "KerGlobalPool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_GLOBAL_MAXPOOL), CNN_OperList(1, KOP_NONE), 1, @@ -1076,47 +1115,38 @@ void LoadCNN_SQ8_Library() LibKernel("KerConvDWNxMDxDyStrideSxSyB32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T",CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), -1,-1,-1,-1,-1,-1)); /* Convolution, Linear output reduction with per channel scaling and optional activation. 
Out != In and In Place (IO) */ - LibKernel("KerReduct_CC_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_NONE), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_ReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELU), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_ReLUN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUN), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_ReLUM_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUM), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUMN), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_HSIGMOID), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_HSwish_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_HSWISH), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_LEAKYRELU), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_SIGMOID), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_Tanh_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_TANH), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - - 
LibKernel("KerReductIO_CC_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_NONE), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReductIO_CC_ReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELU), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReductIO_CC_ReLUN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUN), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReductIO_CC_ReLUM_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUM), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReductIO_CC_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUMN), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReductIO_CC_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_HSIGMOID), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReductIO_CC_HSwish_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_HSWISH), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReductIO_CC_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_LEAKYRELU), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReductIO_CC_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_SIGMOID), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReductIO_CC_Tanh_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_TANH), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + 
LibKernel("KerReduct_CC_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_NONE), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_ReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELU), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_ReLUN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUN), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_ReLUM_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUM), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUMN), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_HSIGMOID), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_HSwish_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_HSWISH), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_LEAKYRELU), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_SIGMOID), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_Tanh_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_TANH), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + + LibKernel("KerReductIO_CC_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", 
CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_NONE), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_ReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELU), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_ReLUN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUN), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_ReLUM_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUM), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUMN), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_HSIGMOID), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HSwish_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_HSWISH), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_LEAKYRELU), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_SIGMOID), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_Tanh_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_TANH), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + + LibKernel("KerReductIO_CC_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", 
CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_NONE), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HWC_ReLU_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELU), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HWC_ReLUN_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUN), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HWC_ReLUM_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUM), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HWC_ReLUMN_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUMN), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HWC_HSigmoid_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_HSIGMOID), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HWC_HSwish_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_HSWISH), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HWC_LeakyReLU_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_LEAKYRELU), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HWC_Sigmoid_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_SIGMOID), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HWC_Tanh_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, 
KOP_TANH), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); /* Activation and reduct for CHW input and HWC output Layer Layout */ LibKernel("KerParReduct_CC_CHW2HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_CHW2HWC), CNN_OperList(1, KOP_NONE), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); @@ -1221,6 +1251,54 @@ void LoadCNN_SQ8_Library() LibKernel("KerReduct_CC_NoScale_Sigmoid_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); LibKernel("KerReduct_CC_NoScale_Tanh_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_TANH), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + + /* Activation and Reduct without PerChannel Scaling */ + LibKernel("KerReductIO_CC_NoScale_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_NONE), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLU_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELU), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUN_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUM_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUMN_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(4,0,0,0,1), 
0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_HSigmoid_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_HSwish_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_LeakyReLU_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_Sigmoid_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_Tanh_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_TANH), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + + LibKernel("KerReductIO_CC_NoScale_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_NONE), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLU_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELU), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUN_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUM_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), 
CNN_OperList(1, KOP_RELUM), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUMN_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_HSigmoid_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_HSwish_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_LeakyReLU_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_Sigmoid_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_Tanh_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_TANH), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + + /* Unsigned */ + LibKernel("KerReductIO_CC_NoScale_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_NONE), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLU_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELU), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUN_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, 
"KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUM_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUMN_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_HSigmoid_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_HSwish_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_LeakyReLU_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_Sigmoid_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_Tanh_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_TANH), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + + LibKernel("KerReductIO_CC_NoScale_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_NONE), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + 
LibKernel("KerReductIO_CC_NoScale_ReLU_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELU), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUN_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUM_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUMN_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_HSigmoid_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_HSwish_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_LeakyReLU_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_Sigmoid_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_Tanh_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, 
KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_TANH), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + + /* Activations with tensor centric scaling */ LibKernel("Ker_ActNone_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_ACT_NONE), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); LibKernel("Ker_ReLU_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_RELU), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); @@ -1255,13 +1333,6 @@ void LoadCNN_SQ8_Library() LibKernel("KerPoolNxMStrideSxSy_ReLUN_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUN), 0, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); - /* Unsigned int8 input/output functions for NE16 */ - LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", - CNN_Match(CNN_OperList(1, KOP_MAXPOOL), 0, 1, - CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); - LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", - CNN_Match(CNN_OperList(1, KOP_AVGPOOL), 0, 1, - CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); LoadCNN_Copy_Library(); } @@ -1549,7 +1620,7 @@ static Kernel_T *CNN_MM_ConvolutionPoolAct_SQ8_Internal( Imm((ActOper==KOP_NONE)), /* Scaling when no activation */ K_Arg("Infos", KER_ARG_TILE) /* Infos */ ): - Bindings(13, + Bindings(14, K_Arg("ConvOut", KER_ARG_TILE), /* Input tile */ K_Arg("ConvOut", KER_ARG_TILE_W), /* Input tile width */ K_Arg("ConvOut", KER_ARG_TILE_H), /* Input tile height */ @@ -1562,7 +1633,8 @@ static Kernel_T *CNN_MM_ConvolutionPoolAct_SQ8_Internal( K_Arg("Out", KER_ARG_TILE), /* Pooling output tile */ K_ArgPar("ConvOut", KER_ARG_PARTILE_SIZE, Os), /* In Features */ K_Arg("Out", KER_ARG_TILE_W), /* Output tile width */ - K_Arg("Out", KER_ARG_TILE_H) /* Output tile height */ + K_Arg("Out", KER_ARG_TILE_H), /* Output tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ ) ), (ActKerName==0)?AT_NO_CALL: @@ -1859,7 
+1931,7 @@ static Kernel_T *CNN_HWC_DWConvolutionPoolAct_SQ8_Internal( ), (PoolKerName==0)?AT_NO_CALL: Call(PoolKerName, LOC_D0, - Bindings(13, + Bindings(14, K_Arg("ConvOut", KER_ARG_TILE), /* Input tile */ K_Arg("ConvOut", KER_ARG_TILE_W), /* Input tile width */ K_Arg("ConvOut", KER_ARG_TILE_H), /* Input tile height */ @@ -1872,7 +1944,8 @@ static Kernel_T *CNN_HWC_DWConvolutionPoolAct_SQ8_Internal( K_Arg("Out", KER_ARG_TILE), /* Pooling output tile */ K_ArgPar("ConvOut", KER_ARG_PARTILE_SIZE, D0), /* In Features */ K_Arg("Out", KER_ARG_TILE_W), /* Output tile width */ - K_Arg("Out", KER_ARG_TILE_H) /* Output tile height */ + K_Arg("Out", KER_ARG_TILE_H), /* Output tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ ) ), (ActKerName==0)?AT_NO_CALL: @@ -2651,7 +2724,7 @@ Kernel_T * CNN_PoolAct_SQ8_Internal( Imm((ActOper==KOP_NONE)), /* Scaling when no activation */ K_Arg("Infos", KER_ARG_TILE) /* Infos */ ): - Bindings(13, + Bindings(14, K_Arg("In", KER_ARG_TILE), /* Input tile */ K_Arg("In", KER_ARG_TILE_W), /* Input tile width */ K_Arg("In", KER_ARG_TILE_H), /* Input tile height */ @@ -2664,7 +2737,8 @@ Kernel_T * CNN_PoolAct_SQ8_Internal( K_Arg("Out", KER_ARG_TILE), /* Pooling output tile */ ParFeat?K_ArgPar("In", KER_ARG_PARTILE_SIZE, D0):Imm(1), /* In Features */ K_Arg("Out", KER_ARG_TILE_W), /* Output tile width */ - K_Arg("Out", KER_ARG_TILE_H) /* Output tile height */ + K_Arg("Out", KER_ARG_TILE_H), /* Output tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ ) ), @@ -3378,13 +3452,13 @@ static Kernel_T * CNN_SoftMax2D_SQ8_Internal( ), (HWC==0)? 
KerArgs(3, - KerArg("In", KerArgSpace(2,D0,T0), OBJ_BUFFER_IN, 1, Dim, 1, 0, 0, 8, "In"), - KerArg("Out", KerArgSpace(2,D0,T0), OBJ_BUFFER_OUT, 1, Dim, OutBytes, 0, 0, 0, "Out"), + KerArg("In", KerArgSpace(2,D0,T0), OBJ_IN_DB, 1, Dim, 1, 0, 0, 8, "In"), + KerArg("Out", KerArgSpace(2,D0,T0), OBJ_OUT_DB, 1, Dim, OutBytes, 0, 0, 0, "Out"), KerArg("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|O_CONST, AT_INF_DIM, 1, 1, 0, 0, 0, "Infos") ): KerArgs(3, - KerArg("In", KerArgSpace(2,T0,D0), OBJ_BUFFER_IN, 1, Dim, 1, 0, 0, 8, "In"), - KerArg("Out", KerArgSpace(2,T0,D0), OBJ_BUFFER_OUT, 1, Dim, OutBytes, 0, 0, 0, "Out"), + KerArg("In", KerArgSpace(2,T0,D0), OBJ_IN_DB, 1, Dim, 1, 0, 0, 8, "In"), + KerArg("Out", KerArgSpace(2,T0,D0), OBJ_OUT_DB, 1, Dim, OutBytes, 0, 0, 0, "Out"), KerArg("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|O_CONST, AT_INF_DIM, 1, 1, 0, 0, 0, "Infos") ) ); @@ -4007,44 +4081,24 @@ Kernel_T *CNN_MatMulAct_SQ8_Internal( ), ColFirst? KerArgs(8, - !Transposed? - KerArg("KerBuff",KerArgSpace(1, T1), O_BUFF|O_NTILED, Nbuff*ColM1, 1, 1, 0, 0, 0, 0):AT_NO_KER_ARG, - (NBatches>1)? - KerArg("In1", KerArgSpace(2,D0,T0), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"): - KerArg("In1", KerArgSpace(1, T0), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), - (NBatches>1)? - KerArg("In2", KerArgSpace(2,D0,T1), O_IN|O_DB, ColM2, LineM2, 1, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"): - KerArg("In2", KerArgSpace(1, T1), O_IN|O_DB, ColM2, LineM2, 1, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), - !NoBias? - KerArg("Bias", KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST, 1, SAxis, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"):AT_NO_KER_ARG, - (NBatches>1)? 
- KerArg("Out", KerArgSpace(2,D0,T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, 0, "Out"): - KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, 0, "Out"), - !ScaleScalar? - KerArg("Scale", KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "Scale"):AT_NO_KER_ARG, - !ScaleScalar? - KerArg("ScaleN", KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "ScaleN"):AT_NO_KER_ARG, - KerArg("Infos", KerArgSpace(1, T1), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos") + !Transposed?KerArg("KerBuff",KerArgSpace(1, T1), O_BUFF|O_NTILED, Nbuff*ColM1, 1, 1, 0, 0, 0, 0):AT_NO_KER_ARG, + KerArg("In1", KerArgSpace(1, T0), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), + KerArg("In2", KerArgSpace(1, T1), O_IN|O_DB, ColM2, LineM2, 1, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), + !NoBias?KerArg("Bias", KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST, 1, SAxis, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"):AT_NO_KER_ARG, + KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, 0, "Out"), + !ScaleScalar?KerArg("Scale", KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "Scale"):AT_NO_KER_ARG, + !ScaleScalar?KerArg("ScaleN", KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "ScaleN"):AT_NO_KER_ARG, + KerArg("Infos", KerArgSpace(1, T1), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos") ): KerArgs(8, - !Transposed? - KerArg("KerBuff",KerArgSpace(1, T0), O_BUFF|O_NTILED, Nbuff*ColM1, 1, 1, 0, 0, 0, 0):AT_NO_KER_ARG, - (NBatches>1)? - KerArg("In1", KerArgSpace(2,D0,T1), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"): - KerArg("In1", KerArgSpace(1, T1), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), - (NBatches>1)? 
- KerArg("In2", KerArgSpace(2,D0,T0), O_IN|O_DB, ColM2, LineM2, 1, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"): - KerArg("In2", KerArgSpace(1, T0), O_IN|O_DB, ColM2, LineM2, 1, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), - !NoBias? - KerArg("Bias", KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST, 1, SAxis, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"):AT_NO_KER_ARG, - (NBatches>1)? - KerArg("Out", KerArgSpace(2,D0,T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Out"): - KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Out"), - !ScaleScalar? - KerArg("Scale", KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "Scale"):AT_NO_KER_ARG, - !ScaleScalar? - KerArg("ScaleN", KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "ScaleN"):AT_NO_KER_ARG, - KerArg("Infos", KerArgSpace(1, T0), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos") + !Transposed?KerArg("KerBuff",KerArgSpace(1, T0), O_BUFF|O_NTILED, Nbuff*ColM1, 1, 1, 0, 0, 0, 0):AT_NO_KER_ARG, + KerArg("In1", KerArgSpace(1, T1), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), + KerArg("In2", KerArgSpace(1, T0), O_IN|O_DB, ColM2, LineM2, 1, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), + !NoBias?KerArg("Bias", KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST, 1, SAxis, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"):AT_NO_KER_ARG, + KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Out"), + !ScaleScalar?KerArg("Scale", KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "Scale"):AT_NO_KER_ARG, + !ScaleScalar?KerArg("ScaleN", KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "ScaleN"):AT_NO_KER_ARG, + KerArg("Infos", KerArgSpace(1, T0), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos") ) ); if (Kernel) { diff --git a/tools/autotiler_v3/CNN_Generators_SQ8/RNN_Generators_SQ8.c 
b/tools/autotiler_v3/CNN_Generators_SQ8/RNN_Generators_SQ8.c index ca31fbf57..527c10588 100644 --- a/tools/autotiler_v3/CNN_Generators_SQ8/RNN_Generators_SQ8.c +++ b/tools/autotiler_v3/CNN_Generators_SQ8/RNN_Generators_SQ8.c @@ -132,7 +132,7 @@ int RNN_Sequence(int Nc, int K0, int K1, int *n1, int *n2, int *n3, int *n2_io) return ((N1!=0) + (N2!=0) + (N3!=0)); } -static Kernel_T *RNN_Stack_Seq_SQ8( +static Kernel_T *RNN_Stack_Seq_SQ8_Internal( char *Name, CNN_GenControl_T *Ctrl, char *RNNKerName, @@ -257,6 +257,45 @@ static Kernel_T *RNN_Stack_Seq_SQ8( return Kernel; } +static Kernel_T *RNN_Stack_Seq_SQ8( + char *Name, + CNN_GenControl_T *Ctrl, + char *RNNKerName, + + int BiasDataSize, + int FeatDataSize, + + int AlwaysReset, + int NCells, + int DimState, + int DimIn, + int UseIn, + int ExposeSequence, + int Buffer, + int FirstSeq, + int LastSeq, + int Revert, + int Dynamic + ) +{ + Kernel_T *Ker = 0; + + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); + Ker = RNN_Stack_Seq_SQ8_Internal(Name, Ctrl, RNNKerName, BiasDataSize, FeatDataSize, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, Buffer, FirstSeq, LastSeq, Revert, Dynamic); + if (Ker) return Ker; + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON); + + printf("\n\n=============================== Solution not found for %s: Trying PARALLELFEATURES=0 ===============================\n\n", Name); + /* If solution not found try with ParallelFeature = 0 */ + CNN_GenControl_T InternalCtrl; + if (!Ctrl) CNN_InitGenCtrl(&InternalCtrl); + else InternalCtrl = *Ctrl; + CNN_SetGenCtrl(&InternalCtrl, "PARALLELFEATURES", AT_OPT_VAL(0)); + Ker = RNN_Stack_Seq_SQ8_Internal(Name, &InternalCtrl, RNNKerName, BiasDataSize, FeatDataSize, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, Buffer, FirstSeq, LastSeq, Revert, Dynamic); + return Ker; +} + + int RNN_Stack_SQ8( char *Name, CNN_GenControl_T *Ctrl, @@ -485,7 +524,7 @@ int RNN_Stack_SQ8( } -static int LSTM_Stack_Seq_SQ8( +static int 
LSTM_Stack_Seq_SQ8_Internal( char *Name, CNN_GenControl_T *Ctrl, char *LSTMKerName, @@ -660,6 +699,44 @@ static int LSTM_Stack_Seq_SQ8( return (Kernel!=0); } +static int LSTM_Stack_Seq_SQ8( + char *Name, + CNN_GenControl_T *Ctrl, + char *LSTMKerName, + + int BiasDataSize, + int FeatDataSize, + + int AlwaysReset, + int NCells, + int DimState, + int DimIn, + int UseIn, + int ExposeSequence, + int FirstSeq, + int LastSeq, + int Revert, + int Dynamic + ) +{ + int Ker = 0; + + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); + Ker = LSTM_Stack_Seq_SQ8_Internal(Name, Ctrl, LSTMKerName, BiasDataSize, FeatDataSize, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, FirstSeq, LastSeq, Revert, Dynamic); + if (Ker) return 1; + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON); + + printf("\n\n=============================== Solution not found for %s: Trying PARALLELFEATURES=0 ===============================\n\n", Name); + /* If solution not found try with ParallelFeature = 0 */ + CNN_GenControl_T InternalCtrl; + if (!Ctrl) CNN_InitGenCtrl(&InternalCtrl); + else InternalCtrl = *Ctrl; + CNN_SetGenCtrl(&InternalCtrl, "PARALLELFEATURES", AT_OPT_VAL(0)); + Ker = LSTM_Stack_Seq_SQ8_Internal(Name, &InternalCtrl, LSTMKerName, BiasDataSize, FeatDataSize, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, FirstSeq, LastSeq, Revert, Dynamic); + return Ker; +} + + int LSTM_Stack_SQ8( char *Name, CNN_GenControl_T *Ctrl, @@ -904,7 +981,7 @@ int LSTM_Stack_SQ8( } -static int GRU_Stack_Seq_SQ8( +static int GRU_Stack_Seq_SQ8_Internal( char *Name, CNN_GenControl_T *Ctrl, char *GRUKerName, @@ -1064,6 +1141,43 @@ static int GRU_Stack_Seq_SQ8( return (Kernel!=0); } +static int GRU_Stack_Seq_SQ8( + char *Name, + CNN_GenControl_T *Ctrl, + char *GRUKerName, + + int BiasDataSize, + int FeatDataSize, + + int AlwaysReset, + int NCells, + int DimState, + int DimIn, + int UseIn, + int ExposeSequence, + int FirstSeq, + int LastSeq, + int Revert, + int Dynamic + ) +{ + 
int Ker = 0; + + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); + Ker = GRU_Stack_Seq_SQ8_Internal(Name, Ctrl, GRUKerName, BiasDataSize, FeatDataSize, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, FirstSeq, LastSeq, Revert, Dynamic); + if (Ker) return 1; + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON); + + printf("\n\n=============================== Solution not found for %s: Trying PARALLELFEATURES=0 ===============================\n\n", Name); + /* If solution not found try with ParallelFeature = 0 */ + CNN_GenControl_T InternalCtrl; + if (!Ctrl) CNN_InitGenCtrl(&InternalCtrl); + else InternalCtrl = *Ctrl; + CNN_SetGenCtrl(&InternalCtrl, "PARALLELFEATURES", AT_OPT_VAL(0)); + Ker = GRU_Stack_Seq_SQ8_Internal(Name, &InternalCtrl, GRUKerName, BiasDataSize, FeatDataSize, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, FirstSeq, LastSeq, Revert, Dynamic); + return Ker; +} + int GRU_Stack_SQ8( char *Name, CNN_GenControl_T *Ctrl, diff --git a/tools/autotiler_v3/CNN_Generators_fp16/RNN_Generators_fp16.c b/tools/autotiler_v3/CNN_Generators_fp16/RNN_Generators_fp16.c index 5a6ca45dc..435e965e6 100644 --- a/tools/autotiler_v3/CNN_Generators_fp16/RNN_Generators_fp16.c +++ b/tools/autotiler_v3/CNN_Generators_fp16/RNN_Generators_fp16.c @@ -106,7 +106,7 @@ int RNN_Sequence_fp16(int Nc, int K0, int K1, int *n1, int *n2, int *n3, int *n2 return ((N1!=0) + (N2!=0) + (N3!=0)); } -static int RNN_Stack_Seq_fp16( +static int RNN_Stack_Seq_fp16_Internal( char *Name, CNN_GenControl_T *Ctrl, char *RNNKerName, @@ -222,6 +222,40 @@ static int RNN_Stack_Seq_fp16( return (Kernel!=0); } +static int RNN_Stack_Seq_fp16( + char *Name, + CNN_GenControl_T *Ctrl, + char *RNNKerName, + + int AlwaysReset, + int NCells, + int DimState, + int DimIn, + int UseIn, + int ExposeSequence, + int Buffer, + int FirstSeq, + int LastSeq, + int Revert, + int Dynamic) +{ + int Ker = 0; + + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); + Ker = 
RNN_Stack_Seq_fp16_Internal(Name, Ctrl, RNNKerName, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, Buffer, FirstSeq, LastSeq, Revert, Dynamic); + if (Ker) return 1; + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON); + + printf("\n\n=============================== Solution not found for %s: Trying PARALLELFEATURES=0 ===============================\n\n", Name); + /* If solution not found try with ParallelFeature = 0 */ + CNN_GenControl_T InternalCtrl; + if (!Ctrl) CNN_InitGenCtrl(&InternalCtrl); + else InternalCtrl = *Ctrl; + CNN_SetGenCtrl(&InternalCtrl, "PARALLELFEATURES", AT_OPT_VAL(0)); + Ker = RNN_Stack_Seq_fp16_Internal(Name, &InternalCtrl, RNNKerName, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, Buffer, FirstSeq, LastSeq, Revert, Dynamic); + return Ker; +} + int RNN_Stack_fp16( char *Name, CNN_GenControl_T *Ctrl, @@ -400,7 +434,7 @@ int RNN_Stack_fp16( } -static int LSTM_Stack_Seq_fp16( +static int LSTM_Stack_Seq_fp16_Internal( char *Name, CNN_GenControl_T *Ctrl, char *LSTMKerName, @@ -568,6 +602,39 @@ static int LSTM_Stack_Seq_fp16( return (Kernel!=0); } +static int LSTM_Stack_Seq_fp16( + char *Name, + CNN_GenControl_T *Ctrl, + char *LSTMKerName, + + int AlwaysReset, + int NCells, + int DimState, + int DimIn, + int UseIn, + int ExposeSequence, + int FirstSeq, + int LastSeq, + int Revert, + int Dynamic) +{ + int Ker = 0; + + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); + Ker = LSTM_Stack_Seq_fp16_Internal(Name, Ctrl, LSTMKerName, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, FirstSeq, LastSeq, Revert, Dynamic); + if (Ker) return 1; + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON); + + printf("\n\n=============================== Solution not found for %s: Trying PARALLELFEATURES=0 ===============================\n\n", Name); + /* If solution not found try with ParallelFeature = 0 */ + CNN_GenControl_T InternalCtrl; + if (!Ctrl) CNN_InitGenCtrl(&InternalCtrl); + else InternalCtrl = *Ctrl; 
+ CNN_SetGenCtrl(&InternalCtrl, "PARALLELFEATURES", AT_OPT_VAL(0)); + Ker = LSTM_Stack_Seq_fp16_Internal(Name, &InternalCtrl, LSTMKerName, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, FirstSeq, LastSeq, Revert, Dynamic); + return Ker; +} + int LSTM_Stack_fp16( char *Name, CNN_GenControl_T *Ctrl, @@ -791,7 +858,7 @@ int LSTM_Stack_fp16( } -static int GRU_Stack_Seq_fp16( +static int GRU_Stack_Seq_fp16_Internal( char *Name, CNN_GenControl_T *Ctrl, char *GRUKerName, @@ -943,6 +1010,40 @@ static int GRU_Stack_Seq_fp16( return (Kernel!=0); } +static int GRU_Stack_Seq_fp16( + char *Name, + CNN_GenControl_T *Ctrl, + char *GRUKerName, + + int AlwaysReset, + int NCells, + int DimState, + int DimIn, + int UseIn, + int ExposeSequence, + int FirstSeq, + int LastSeq, + int Revert, + int Dynamic) +{ + int Ker = 0; + + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); + Ker = GRU_Stack_Seq_fp16_Internal(Name, Ctrl, GRUKerName, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, FirstSeq, LastSeq, Revert, Dynamic); + if (Ker) return 1; + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON); + + printf("\n\n=============================== Solution not found for %s: Trying PARALLELFEATURES=0 ===============================\n\n", Name); + /* If solution not found try with ParallelFeature = 0 */ + CNN_GenControl_T InternalCtrl; + if (!Ctrl) CNN_InitGenCtrl(&InternalCtrl); + else InternalCtrl = *Ctrl; + CNN_SetGenCtrl(&InternalCtrl, "PARALLELFEATURES", AT_OPT_VAL(0)); + Ker = GRU_Stack_Seq_fp16_Internal(Name, &InternalCtrl, GRUKerName, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, FirstSeq, LastSeq, Revert, Dynamic); + return Ker; +} + + int GRU_Stack_fp16( char *Name, CNN_GenControl_T *Ctrl, diff --git a/tools/autotiler_v3/CNN_Libraries/SSD_BasicKernels.c b/tools/autotiler_v3/CNN_Libraries/SSD_BasicKernels.c index b1a395965..09e5e814f 100644 --- a/tools/autotiler_v3/CNN_Libraries/SSD_BasicKernels.c +++ 
b/tools/autotiler_v3/CNN_Libraries/SSD_BasicKernels.c @@ -1,7 +1,3 @@ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wextra" -#pragma GCC diagnostic ignored "-Wpointer-sign" -#pragma GCC diagnostic ignored "-Wsign-compare" /* * Copyright (C) 2020 GreenWaves Technologies * All rights reserved. @@ -10,6 +6,9 @@ * of the BSD license. See the LICENSE file for details. * */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-compare" + #include #include "Gap.h" #include "CNN_BasicKernels.h" diff --git a/tools/autotiler_v3/CNN_Libraries_NE16/CNN_BasicKernels_NE16.c b/tools/autotiler_v3/CNN_Libraries_NE16/CNN_BasicKernels_NE16.c index 1c302417a..e0bd7445e 100644 --- a/tools/autotiler_v3/CNN_Libraries_NE16/CNN_BasicKernels_NE16.c +++ b/tools/autotiler_v3/CNN_Libraries_NE16/CNN_BasicKernels_NE16.c @@ -1275,7 +1275,7 @@ void KerConvDW3x3Stride2_NE16(KerConv_NE16_T *Arg) SetNE16_ScaleNPointer (ScaleN); SetNE16_Strides (Tile_InFeat, Tile_InFeat * Tile_InW, 0, // In_D0, In_D1, In_D2 - unused Out_Stride0, OutBytes * Tile_OutFeat / 2, OutBytes * Tile_OutFeat * Tile_OutW / 2, // Out_D0, Out_D1, Out_D2 div 2 to take into account strideness - 2*3*3, 2*3*3*Arg->Qw*Nb_KI, 0); // Weights_D0, Weights_D1, Weights_D2 + 2*3*3, 0, 0); // Weights_D0, Weights_D1, Weights_D2 SetNE16_Dim (Nb_KI, Nb_KO, Nb_WO, Nb_HO); // Assume first subtile no need for right/bottom pad SetNE16_ConfigPad ((v4s) {PadL, IsLastSubtileW?PadR:0, PadT, IsLastSubtileH?PadB:0}, Arg->Pad_Val); @@ -1343,7 +1343,7 @@ void KerConvDW3x3Stride2_NE16(KerConv_NE16_T *Arg) SetNE16_ScaleNPointer (ScaleN); SetNE16_Strides (Tile_InFeat, Tile_InFeat * Tile_InW, 0, // In_D0, In_D1, In_D2 - unused Out_Stride0, OutBytes * Tile_OutFeat / 2, OutBytes * Tile_OutFeat * Tile_OutW / 2, // Out_D0, Out_D1, Out_D2 div 2 to take into account strideness - 2*3*3, 2*3*3*Arg->Qw*Nb_KI, 0); // Weights_D0, Weights_D1, Weights_D2 + 2*3*3, 0, 0); // Weights_D0, Weights_D1, Weights_D2 SetNE16_Dim (Nb_KI, Nb_KO, Nb_WO, 
Nb_HO); // Moving to next spatial subtile means consider less padding (2 because of the stride) PadL = Max(0, TilePadL-2*subtile_j_major); diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_AT_Misc.c b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_AT_Misc.c index 187fd3ab3..dd82478c9 100644 --- a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_AT_Misc.c +++ b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_AT_Misc.c @@ -25,6 +25,7 @@ #include "CNN_AT_Misc.h" + #ifdef __pulp__ #define Abs(a) __builtin_pulp_abs((a)) #define Min(a, b) __builtin_pulp_minsi((a), (b)) diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_SQ8.c b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_SQ8.c index ef6918d19..233b637bc 100644 --- a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_SQ8.c +++ b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_SQ8.c @@ -14,6 +14,9 @@ * limitations under the License. */ +#include "Gap.h" +#include "CNN_BasicKernels_SQ8.h" + #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wextra" #pragma GCC diagnostic ignored "-Wpointer-sign" @@ -21,9 +24,6 @@ #pragma GCC diagnostic ignored "-Wswitch" #pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "Gap.h" -#include "CNN_BasicKernels_SQ8.h" - static int CoreCountDynamic = 1; static int ActiveCore = gap_ncore(); @@ -135,7 +135,7 @@ int TanhTable(int x, unsigned short * table){ #endif } -#define KER_ACT(Activation, in_d_type, out_d_type, p_type, n_bits, is_unsigned) \ +#define KER_ACT(Activation, in_d_type, out_d_type, p_type, in_n_bits, out_n_bits, is_unsigned) \ do { \ unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); \ decl(in_d_type * __restrict__, In) = decl((in_d_type *__restrict__), Arg->In); \ @@ -147,12 +147,33 @@ do { \ \ for (unsigned int i=First; iW*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); \ + int * 
__restrict__ InOut = (int *__restrict__) Arg->In; \ + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \ + unsigned int Size = Max(0, Last-First); \ + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \ + int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \ +\ + for (unsigned int i=0; iFeat; \ unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); \ @@ -169,14 +190,14 @@ do { \ for (unsigned int c=First; cFeat; \ unsigned S = Arg->W*Arg->H; \ @@ -194,14 +215,14 @@ do { \ for (unsigned int c=0; cFeat; \ unsigned int Size = Arg->W*Arg->H; \ @@ -218,8 +239,8 @@ do { \ for (unsigned int c=First; cFeat); \ } while(0); -#define KER_REDUCT_IO_ACT_CHW(Activation, d_type, p_type, n_bits, is_unsigned) \ +#define KER_REDUCT_IO_ACT_CHW(Activation, d_type, p_type, in_n_bits, out_n_bits, is_unsigned) \ do { \ unsigned int Feat = Arg->Feat; \ unsigned int S = Arg->W*Arg->H; \ @@ -246,15 +267,15 @@ do { \ d_type *Out = (d_type *) (InOut+S*c+First); \ for (unsigned int i=0; iFeat; \ unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat); \ @@ -271,14 +292,14 @@ do { \ for (unsigned int c=First; cFeat; \ unsigned S = Arg->W*Arg->H; \ @@ -296,14 +317,14 @@ do { \ for (unsigned int c=0; cFeat; \ unsigned S = Arg->W*Arg->H; \ @@ -320,14 +341,14 @@ do { \ for (unsigned int i=First; iFeat; \ unsigned S = Arg->W*Arg->H; \ @@ -344,8 +365,8 @@ do { \ for (unsigned int i=First; iInfos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = 
*((unsigned char *) &Infos[AT_INF_C0]); @@ -347,7 +348,7 @@ static inline void __attribute__((always_inline)) KerParLinearLayerFullFeatB8_SQ } if (InDim&0x4) Acc = gap_sumdotp4(VectIn[InDim/4-1], W[InDim/4-1], Acc); for (int j=4*(InDim/4); jInfos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -426,7 +427,7 @@ static inline void __attribute__((always_inline)) KerParLinearLayerFullFeatB16_S } if (InDim&0x4) Acc = gap_sumdotp4(VectIn[InDim/4-1], W[InDim/4-1], Acc); for (int j=4*(InDim/4); jScale; unsigned char *ScaleN = Arg->ScaleN; signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -504,12 +505,63 @@ static inline void __attribute__((always_inline)) KerParLinearLayerFullFeatB32_S } if (InDim&0x4) Acc = gap_sumdotp4(VectIn[InDim/4-1], W[InDim/4-1], Acc); for (int j=4*(InDim/4); j +#include "CNN_BasicKernels_SQ8.h" + #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wextra" #pragma GCC diagnostic ignored "-Wpointer-sign" #pragma GCC diagnostic ignored "-Wsign-compare" #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include -#include "CNN_BasicKernels_SQ8.h" static int CoreCountDynamic = 1; static int ActiveCore = gap_ncore(); @@ -707,10 +708,10 @@ static inline void __attribute__((always_inline)) KerParMatMulB8_SQ8_act( S3 += V0 * BufferColIn2[i+3*H_In2]; } unsigned int Sc = Scale[Line], ScN 
= ScaleN[Line]; - S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S1 = AT_SCALE(S1, Sc, ScN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S2 = AT_SCALE(S2, Sc, ScN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S3 = AT_SCALE(S3, Sc, ScN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S1 = AT_SCALE(S1, Sc, ScN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S2 = AT_SCALE(S2, Sc, ScN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S3 = AT_SCALE(S3, Sc, ScN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); v4s R = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); *((v4s *) (Out+(Line+OffLine)*W_Out+4*Col+0+OffCol)) = R; } @@ -739,8 +740,8 @@ static inline void __attribute__((always_inline)) KerParMatMulB8_SQ8_act( S1 += V0 * BufferColIn2[i+1*H_In2]; } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; - S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S1 = AT_SCALE(S1, Sc, ScN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S1 = AT_SCALE(S1, Sc, ScN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(S0, 7); Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(S1, 7); } @@ -765,7 +766,7 @@ static inline void __attribute__((always_inline)) KerParMatMulB8_SQ8_act( S0 += V0 * BufferColIn2[i]; } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; - S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 
0); Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(S0, 7); } gap_waitbarrier(0); @@ -872,7 +873,7 @@ static inline void __attribute__((always_inline)) KerParMatMulSxSyB8_SQ8_act( if (W_In1&0x4) S = gap_sumdotp4(VIn1[W_In1/4-1], VBuff[W_In1/4-1], S); for (i=(W_In1/4)*4; i +#include "CNN_BasicKernels_SQ8.h" + #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wsign-compare" #pragma GCC diagnostic ignored "-Wswitch" - -#include -#include "CNN_BasicKernels_SQ8.h" +#pragma GCC diagnostic ignored "-Wpointer-sign" static int CoreCountDynamic = 1; static int ActiveCore = gap_ncore(); @@ -80,7 +81,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_SQ8_act( signed char * __restrict__ ColBuff = Arg->ColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -159,7 +160,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_SQ8_act( S0 = gap_sumdotp4(V1, C1, S0); } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; - S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); Out[Line*Wo*Ho + l*Wo + c] = gap_clip(S0, 7); } gap_waitbarrier(0); @@ -235,7 +236,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1x1_HWC_SQ8_act( unsigned char * __restrict__ ScaleN = Arg->ScaleN; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = 
*((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -281,23 +282,23 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1x1_HWC_SQ8_act( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S00, 7); pOut0++; *pOut1 = gap_clip(S01, 7); pOut1++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S11 = AT_SCALE(S11, Sc, ScN); ACT_SWITCH(S11, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S11 = AT_SCALE(S11, Sc, ScN); ACT_SWITCH(S11, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S10, 7); pOut0++; *pOut1 = gap_clip(S11, 7); pOut1++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S21 = AT_SCALE(S21, Sc, ScN); ACT_SWITCH(S21, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S21 = AT_SCALE(S21, Sc, ScN); ACT_SWITCH(S21, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S20, 7); pOut0++; *pOut1 = gap_clip(S21, 7); pOut1++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S31 = AT_SCALE(S31, Sc, ScN); ACT_SWITCH(S31, Activation, 
ActScale, ActScaleN, A0, B0, C0, 8, 0); + S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S31 = AT_SCALE(S31, Sc, ScN); ACT_SWITCH(S31, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S30, 7); pOut0++; *pOut1 = gap_clip(S31, 7); pOut1++; } @@ -316,8 +317,8 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1x1_HWC_SQ8_act( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S00, 7); pOut0++; *pOut1 = gap_clip(S01, 7); pOut1++; } @@ -358,16 +359,16 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1x1_HWC_SQ8_act( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S00, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S10, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S20, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, 
Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S30, 7); pOut0++; } for (int i=4*(IterOut/4); iScaleN; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -507,23 +508,23 @@ static inline void __attribute__((always_inline)) Ker_MM_Conv1x1_HWC_SQ8_act( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S00, 7); pOut0++; *pOut1 = gap_clip(S01, 7); pOut1++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S11 = AT_SCALE(S11, Sc, ScN); ACT_SWITCH(S11, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S11 = AT_SCALE(S11, Sc, ScN); ACT_SWITCH(S11, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S10, 7); pOut0++; *pOut1 = gap_clip(S11, 7); pOut1++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S21 = AT_SCALE(S21, Sc, ScN); ACT_SWITCH(S21, Activation, ActScale, ActScaleN, A0, B0, C0, 
8, 0); + S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S21 = AT_SCALE(S21, Sc, ScN); ACT_SWITCH(S21, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S20, 7); pOut0++; *pOut1 = gap_clip(S21, 7); pOut1++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S31 = AT_SCALE(S31, Sc, ScN); ACT_SWITCH(S31, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S31 = AT_SCALE(S31, Sc, ScN); ACT_SWITCH(S31, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S30, 7); pOut0++; *pOut1 = gap_clip(S31, 7); pOut1++; } @@ -542,8 +543,8 @@ static inline void __attribute__((always_inline)) Ker_MM_Conv1x1_HWC_SQ8_act( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S00, 7); pOut0++; *pOut1 = gap_clip(S01, 7); pOut1++; } @@ -584,16 +585,16 @@ static inline void __attribute__((always_inline)) Ker_MM_Conv1x1_HWC_SQ8_act( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S00, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S10 = AT_SCALE(S10, Sc, ScN); 
ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S10, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S20, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S30, 7); pOut0++; } for (int i=4*(OutFeat/4); iColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -784,14 +785,14 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_HWC_SQ8_act( S3 += V0*C3; S7 += V1*C3; pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++; } - S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S4 = AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S6 = AT_SCALE(S6, 
pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S4 = AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7)); *((v4s *) (pOut0+4*Line)) = R1; @@ -811,8 +812,8 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_HWC_SQ8_act( S0 += V0*C0; S4 += V1*C0; pIn++; pIn1++; pC++; } - S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *(pOut0+i) = gap_clip(S0, 7); *(pOut1+i) = gap_clip(S4, 
7); } @@ -868,10 +869,10 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_HWC_SQ8_act( S3 += V0*C3; pIn++; pC0++; pC1++; pC2++; pC3++; } - S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); *((v4s *) (pOut0+4*Line)) = R1; } @@ -888,7 +889,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_HWC_SQ8_act( S0 += V0*C0; pIn++; pC++; } - S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *(pOut0+i) = gap_clip(S0, 7); } gap_waitbarrier(0); @@ -963,7 +964,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_DxDy_SQ8_act( signed char * __restrict__ ColBuff = Arg->ColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char 
*)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -1010,7 +1011,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_DxDy_SQ8_act( S0 = gap_sumdotp4(V1, C1, S0); } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; - S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); Out[Line*Wo*Ho + l*Wo + c] = gap_clip(S0, 7); } gap_waitbarrier(0); @@ -1089,7 +1090,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_DxDy_HWC_SQ8_ int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -1210,14 +1211,14 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_DxDy_HWC_SQ8_ S3 += V0*C3; S7 += V1*C3; pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++; } - S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S4 = AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, 
C0, 8, 0); - S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S4 = AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7)); *((v4s *) (pOut0+4*Line)) = R1; @@ -1237,8 +1238,8 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_DxDy_HWC_SQ8_ S0 += V0*C0; S4 += V1*C0; pIn++; pIn1++; pC++; } - S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *(pOut0+i) = 
gap_clip(S0, 7); *(pOut1+i) = gap_clip(S4, 7); } @@ -1305,10 +1306,10 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_DxDy_HWC_SQ8_ S3 += V0*C3; pIn++; pC0++; pC1++; pC2++; pC3++; } - S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); *((v4s *) (pOut0+4*Line)) = R1; } @@ -1325,7 +1326,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_DxDy_HWC_SQ8_ S0 += V0*C0; pIn++; pC++; } - S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *(pOut0+i) = gap_clip(S0, 7); } gap_waitbarrier(0); @@ -1397,7 +1398,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_SQ8_act( signed char * __restrict__ ColBuff = Arg->ColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char 
*)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -1504,7 +1505,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_SQ8_act( } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; // printf("Out[F:%d, H:%d, W:%d] = (%d * %d) >> %d = %d\n", Line, l, c, S0, Sc, ScN, gap_clip(AT_SCALE(S0, Sc, ScN), 7)); - S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); Out[Line*Wo*Ho + l*Wo + c] = gap_clip(S0, 7); } gap_waitbarrier(0); @@ -1580,7 +1581,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_HWC_SQ8_act( unsigned char * __restrict__ ScaleN = Arg->ScaleN; signed char * __restrict__ ColBuff = Arg->ColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -1703,14 +1704,14 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_HWC_SQ8_act( S3 += V0*C3; S7 += V1*C3; pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++; } - S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, 
B0, C0, 8, 0); - S4 = AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S4 = AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7)); *((v4s *) (pOut0+4*Line)) = R1; @@ -1730,8 +1731,8 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_HWC_SQ8_act( S0 += V0*C0; S4 += V1*C0; pIn++; pIn1++; pC++; } - S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 
8, 0); + S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *(pOut0+i) = gap_clip(S0, 7); *(pOut1+i) = gap_clip(S4, 7); } @@ -1804,10 +1805,10 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_HWC_SQ8_act( S3 += V0*C3; pIn++; pC0++; pC1++; pC2++; pC3++; } - S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); *((v4s *) (pOut0+4*Line)) = R1; } @@ -1824,7 +1825,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_HWC_SQ8_act( S0 += V0*C0; pIn++; pC++; } - S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *(pOut0+i) = gap_clip(S0, 7); } gap_waitbarrier(0); @@ -1902,7 +1903,7 @@ static inline void __attribute__((always_inline)) Ker_MM_Conv2D_HWC_SQ8_act( 
signed char * __restrict__ ColBuff = Arg->ColBuff; signed char * __restrict__ ColBuff1; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -2021,23 +2022,23 @@ static inline void __attribute__((always_inline)) Ker_MM_Conv2D_HWC_SQ8_act( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S00, 7); pOut0++; *pOut1 = gap_clip(S01, 7); pOut1++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S11 = AT_SCALE(S11, Sc, ScN); ACT_SWITCH(S11, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S11 = AT_SCALE(S11, Sc, ScN); ACT_SWITCH(S11, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S10, 7); pOut0++; *pOut1 = gap_clip(S11, 7); pOut1++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S21 = AT_SCALE(S21, Sc, ScN); ACT_SWITCH(S21, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S21 = AT_SCALE(S21, Sc, 
ScN); ACT_SWITCH(S21, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S20, 7); pOut0++; *pOut1 = gap_clip(S21, 7); pOut1++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S31 = AT_SCALE(S31, Sc, ScN); ACT_SWITCH(S31, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S31 = AT_SCALE(S31, Sc, ScN); ACT_SWITCH(S31, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S30, 7); pOut0++; *pOut1 = gap_clip(S31, 7); pOut1++; } @@ -2056,8 +2057,8 @@ static inline void __attribute__((always_inline)) Ker_MM_Conv2D_HWC_SQ8_act( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S00, 7); pOut0++; *pOut1 = gap_clip(S01, 7); pOut1++; } @@ -2138,16 +2139,16 @@ static inline void __attribute__((always_inline)) Ker_MM_Conv2D_HWC_SQ8_act( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S00, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S10, 7); pOut0++; Sc = *pSc; ScN = *pScN; 
pSc++; pScN++; - S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S20, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S30, 7); pOut0++; } for (int i=4*(IterOut/4); iScaleN; signed char * __restrict__ ColBuff = Arg->ColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -2292,7 +2293,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_DxDy_SQ8_act( S0 = gap_sumdotp4(V1, C1, S0); } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; - S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); Out[Line*Wo*Ho + l*Wo + c] = gap_clip(S0, 7); } gap_waitbarrier(0); @@ -2370,7 +2371,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_DxDy_HWC_SQ8_ unsigned char * __restrict__ ScaleN = Arg->ScaleN; signed char * __restrict__ ColBuff = Arg->ColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) 
&Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -2490,14 +2491,14 @@ This part is more efficient but NOT WORKING ???? TOCHECK S3 += V0*C3; S7 += V1*C3; pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++; } - S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S4 = AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S4 = AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + 
S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7)); *((v4s *) (pOut0+4*Line)) = R1; @@ -2517,8 +2518,8 @@ This part is more efficient but NOT WORKING ???? TOCHECK S0 += V0*C0; S4 += V1*C0; pIn++; pIn1++; pC++; } - S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *(pOut0+i) = gap_clip(S0, 7); *(pOut1+i) = gap_clip(S4, 7); } @@ -2570,16 +2571,16 @@ This part is more efficient but NOT WORKING ???? 
TOCHECK } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S0, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S1 = AT_SCALE(S1, Sc, ScN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S1 = AT_SCALE(S1, Sc, ScN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S1, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S2 = AT_SCALE(S2, Sc, ScN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S2 = AT_SCALE(S2, Sc, ScN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S2, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S3 = AT_SCALE(S3, Sc, ScN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S3 = AT_SCALE(S3, Sc, ScN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S3, 7); pOut0++; } for (int i=4*(IterOut/4); i +#include "Gap.h" +#include "CNN_BasicKernels_SQ8.h" + #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wextra" #pragma GCC diagnostic ignored "-Wpointer-sign" @@ -21,10 +25,6 @@ #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #pragma GCC diagnostic ignored "-Wswitch" -#include -#include "Gap.h" -#include "CNN_BasicKernels_SQ8.h" - static int CoreCountDynamic = 1; static int ActiveCore = gap_ncore(); @@ -3209,11 +3209,28 @@ void KerPoolNxMStrideSxSy_ReLUMN_SQ8(KerPool_SQ8_T *Arg) } +/* HWC Version */ +#define KER_POOL_ACT(Activation, p_type, n_bits, is_unsigned) \ +do { \ + int Size = Wo*Ho*Feat; \ + int CoreId = gap_coreid(), ChunkCell = ChunkSize(Size), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Size); \ + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \ + unsigned int ActScale = ((unsigned 
char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \ + int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \ +\ + for (int i=First; iIn; @@ -3262,11 +3279,55 @@ void KerParMaxPoolNxMStrideSxSy_HWC_SQ8(Ker_MM_Pool_SQ8_T *Arg) PosL += Sy; } gap_waitbarrier(0); - // KerParPoolActivation(Out, Wo, Ho, First, Last, Infos, Arg->Activation); - // gap_waitbarrier(0); + if (Activation != ACT_NONE) { + KER_POOL_ACT(Activation, unsigned char, 8, 0); + } +} + +void KerParMaxPoolNxMStrideSxSy_HWC_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_NONE); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLU_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_RELU); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUN_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_RELUN); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUM_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_RELUM); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUMN_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_RELUMN); } -void KerParMaxPoolNxMStrideSxSy_HWC_USQ8(Ker_MM_Pool_USQ8_T *Arg) +void KerParMaxPoolNxMStrideSxSy_HWC_HSigmoid_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_HSIGMOID); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_HSwish_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_HSWISH); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_LeakyReLU_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_LEAKYRELU); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_Sigmoid_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_SIGMOID); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_Tanh_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, 
ACT_TANH); +} + +static inline void __attribute__((always_inline)) KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act( + Ker_MM_Pool_USQ8_T *Arg, + CNN_ActivationOper_T Activation +) { unsigned char *__restrict__ In = Arg->In; @@ -3282,7 +3343,7 @@ void KerParMaxPoolNxMStrideSxSy_HWC_USQ8(Ker_MM_Pool_USQ8_T *Arg) unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(Feat), First = CoreId*ChunkCell, Last = Min(Feat, First+ChunkCell); int PosL = Arg->FirstTile?(-PadT):0; - int Iter = Last-First; + int Iter = Max(0, Last-First); for (int l=0; lActivation); - // gap_waitbarrier(0); + if (Activation != ACT_NONE) { + KER_POOL_ACT(Activation, unsigned char, 8, 1); + } +} + +void KerParMaxPoolNxMStrideSxSy_HWC_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_NONE); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLU_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_RELU); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUN_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_RELUN); } -void KerParAvgPoolNxMStrideSxSy_HWC_SQ8(Ker_MM_Pool_SQ8_T *Arg) +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUM_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_RELUM); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUMN_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_RELUMN); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_HSigmoid_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_HSIGMOID); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_HSwish_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_HSWISH); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_LeakyReLU_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_LEAKYRELU); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_Sigmoid_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_SIGMOID); +} + +void 
KerParMaxPoolNxMStrideSxSy_HWC_Tanh_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_TANH); +} + +static inline void __attribute__((always_inline)) KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act( + Ker_MM_Pool_SQ8_T *Arg, + CNN_ActivationOper_T Activation +) { signed char *__restrict__ In = Arg->In; @@ -3388,11 +3493,55 @@ void KerParAvgPoolNxMStrideSxSy_HWC_SQ8(Ker_MM_Pool_SQ8_T *Arg) PosL += Sy; } gap_waitbarrier(0); - // KerParPoolActivation(Out, Wo, Ho, First, Last, Infos, Arg->Activation); - // gap_waitbarrier(0); + if (Activation != ACT_NONE) { + KER_POOL_ACT(Activation, unsigned char, 8, 0); + } +} + +void KerParAvgPoolNxMStrideSxSy_HWC_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_NONE); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLU_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_RELU); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUN_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_RELUN); } -void KerParAvgPoolNxMStrideSxSy_HWC_USQ8(Ker_MM_Pool_USQ8_T *Arg) +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUM_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_RELUM); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUMN_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_RELUMN); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_HSigmoid_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_HSIGMOID); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_HSwish_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_HSWISH); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_LeakyReLU_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_LEAKYRELU); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_Sigmoid_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_SIGMOID); +} + +void 
KerParAvgPoolNxMStrideSxSy_HWC_Tanh_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_TANH); +} + +static inline void __attribute__((always_inline)) KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act( + Ker_MM_Pool_USQ8_T *Arg, + CNN_ActivationOper_T Activation +) { unsigned char *__restrict__ In = Arg->In; @@ -3461,14 +3610,56 @@ void KerParAvgPoolNxMStrideSxSy_HWC_USQ8(Ker_MM_Pool_USQ8_T *Arg) PosL += Sy; } gap_waitbarrier(0); - // KerParPoolActivation(Out, Wo, Ho, First, Last, Infos, Arg->Activation); - // gap_waitbarrier(0); + if (Activation != ACT_NONE) { + KER_POOL_ACT(Activation, unsigned char, 8, 1); + } } +void KerParAvgPoolNxMStrideSxSy_HWC_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_NONE); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLU_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_RELU); +} +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUN_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_RELUN); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUM_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_RELUM); +} -void KerParMaxPoolNxMStrideSxSy_HWC_SQ16(Ker_MM_Pool_SQ16_T *Arg) +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUMN_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_RELUMN); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_HSigmoid_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_HSIGMOID); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_HSwish_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_HSWISH); +} +void KerParAvgPoolNxMStrideSxSy_HWC_LeakyReLU_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_LEAKYRELU); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_Sigmoid_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_SIGMOID); +} + +void 
KerParAvgPoolNxMStrideSxSy_HWC_Tanh_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_TANH); +} + + +static inline void __attribute__((always_inline)) KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act( + Ker_MM_Pool_SQ16_T *Arg, + CNN_ActivationOper_T Activation +) { short int *__restrict__ In = Arg->In; int W = Arg->W, H = Arg->H; @@ -3509,21 +3700,64 @@ void KerParMaxPoolNxMStrideSxSy_HWC_SQ16(Ker_MM_Pool_SQ16_T *Arg) PosL += Sy; } gap_waitbarrier(0); - // KerParPoolActivation(Out, Wo, Ho, First, Last, Infos, Arg->Activation); - // gap_waitbarrier(0); + if (Activation != ACT_NONE) { + KER_POOL_ACT(Activation, unsigned char, 8, 1); + } +} + +void KerParMaxPoolNxMStrideSxSy_HWC_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_NONE); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLU_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_RELU); } +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUN_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_RELUN); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUM_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_RELUM); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUMN_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_RELUMN); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_HSigmoid_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_HSIGMOID); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_HSwish_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_HSWISH); +} -void KerParMaxPoolNxMStrideSxSy_HWC_USQ16(Ker_MM_Pool_SQ16_T *Arg) +void KerParMaxPoolNxMStrideSxSy_HWC_LeakyReLU_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_LEAKYRELU); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_Sigmoid_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_SIGMOID); +} 
+void KerParMaxPoolNxMStrideSxSy_HWC_Tanh_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_TANH); +} + + +static inline void __attribute__((always_inline)) KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act( + Ker_MM_Pool_USQ16_T *Arg, + CNN_ActivationOper_T Activation +) { - short int *__restrict__ In = Arg->In; + unsigned short int *__restrict__ In = Arg->In; int W = Arg->W, H = Arg->H; int Fx = Arg->Fx, Sx = Arg->Sx; int Fy = Arg->Fy, Sy = Arg->Sy; int PadL = Arg->Pad[0], PadT = Arg->Pad[2]; int Feat = Arg->Feat; - short int * __restrict__ Out = Arg->Out; + unsigned short int * __restrict__ Out = Arg->Out; int Wo = Arg->Wo, Ho = Arg->Ho; v2u M_Init = (v2u) {-32767,-32767}; @@ -3556,13 +3790,56 @@ void KerParMaxPoolNxMStrideSxSy_HWC_USQ16(Ker_MM_Pool_SQ16_T *Arg) PosL += Sy; } gap_waitbarrier(0); - // KerParPoolActivation(Out, Wo, Ho, First, Last, Infos, Arg->Activation); - // gap_waitbarrier(0); + if (Activation != ACT_NONE) { + KER_POOL_ACT(Activation, unsigned char, 8, 1); + } } +void KerParMaxPoolNxMStrideSxSy_HWC_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_NONE); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLU_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_RELU); +} -void KerParAvgPoolNxMStrideSxSy_HWC_SQ16(Ker_MM_Pool_SQ16_T *Arg) +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUN_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_RELUN); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUM_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_RELUM); +} +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUMN_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_RELUMN); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_HSigmoid_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_HSIGMOID); +} + +void 
KerParMaxPoolNxMStrideSxSy_HWC_HSwish_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_HSWISH); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_LeakyReLU_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_LEAKYRELU); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_Sigmoid_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_SIGMOID); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_Tanh_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_TANH); +} + + +static inline void __attribute__((always_inline)) KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act( + Ker_MM_Pool_SQ16_T *Arg, + CNN_ActivationOper_T Activation +) { signed short *__restrict__ In = Arg->In; int W = Arg->W, H = Arg->H; @@ -3611,12 +3888,56 @@ void KerParAvgPoolNxMStrideSxSy_HWC_SQ16(Ker_MM_Pool_SQ16_T *Arg) PosL += Sy; } gap_waitbarrier(0); - // KerParPoolActivation(Out, Wo, Ho, First, Last, Infos, Arg->Activation); - // gap_waitbarrier(0); + if (Activation != ACT_NONE) { + KER_POOL_ACT(Activation, unsigned char, 8, 1); + } +} + +void KerParAvgPoolNxMStrideSxSy_HWC_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_NONE); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLU_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_RELU); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUN_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_RELUN); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUM_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_RELUM); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUMN_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_RELUMN); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_HSigmoid_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_HSIGMOID); +} + +void 
KerParAvgPoolNxMStrideSxSy_HWC_HSwish_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_HSWISH); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_LeakyReLU_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_LEAKYRELU); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_Sigmoid_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_SIGMOID); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_Tanh_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_TANH); } -void KerParAvgPoolNxMStrideSxSy_HWC_USQ16(Ker_MM_Pool_SQ16_T *Arg) +static inline void __attribute__((always_inline)) KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act( + Ker_MM_Pool_USQ16_T *Arg, + CNN_ActivationOper_T Activation +) { unsigned short *__restrict__ In = Arg->In; int W = Arg->W, H = Arg->H; @@ -3665,6 +3986,49 @@ void KerParAvgPoolNxMStrideSxSy_HWC_USQ16(Ker_MM_Pool_SQ16_T *Arg) PosL += Sy; } gap_waitbarrier(0); - // KerParPoolActivation(Out, Wo, Ho, First, Last, Infos, Arg->Activation); - // gap_waitbarrier(0); -} \ No newline at end of file + if (Activation != ACT_NONE) { + KER_POOL_ACT(Activation, unsigned char, 8, 1); + } +} + +void KerParAvgPoolNxMStrideSxSy_HWC_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_NONE); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLU_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_RELU); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUN_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_RELUN); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUM_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_RELUM); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUMN_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_RELUMN); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_HSigmoid_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + 
KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_HSIGMOID); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_HSwish_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_HSWISH); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_LeakyReLU_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_LEAKYRELU); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_Sigmoid_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_SIGMOID); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_Tanh_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_TANH); +} + +#pragma GCC diagnostic pop diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_SoftMax_SQ8.c b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_SoftMax_SQ8.c index 889d7cfd4..51ae1a98b 100644 --- a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_SoftMax_SQ8.c +++ b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_SoftMax_SQ8.c @@ -14,13 +14,14 @@ * limitations under the License. */ +#include +#include +#include "CNN_BasicKernels_SQ8.h" + #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wextra" #pragma GCC diagnostic ignored "-Wpointer-sign" #pragma GCC diagnostic ignored "-Wsign-compare" -#include -#include -#include "CNN_BasicKernels_SQ8.h" static int CoreCountDynamic = 1; static int ActiveCore = gap_ncore(); diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/RNN_SQ8.c b/tools/autotiler_v3/CNN_Libraries_SQ8/RNN_SQ8.c index 72a17aa83..5c134b67b 100644 --- a/tools/autotiler_v3/CNN_Libraries_SQ8/RNN_SQ8.c +++ b/tools/autotiler_v3/CNN_Libraries_SQ8/RNN_SQ8.c @@ -14,13 +14,15 @@ * limitations under the License. 
*/ + +#include +#include +#include "CNN_BasicKernels_SQ8.h" + #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wextra" #pragma GCC diagnostic ignored "-Wpointer-sign" #pragma GCC diagnostic ignored "-Wsign-compare" -#include -#include -#include "CNN_BasicKernels_SQ8.h" static int CoreCountDynamic = 1; static int ActiveCore = gap_ncore(); diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Bias_Linear_Activation_fp16.c b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Bias_Linear_Activation_fp16.c index 7992ebbfe..30a622ee2 100644 --- a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Bias_Linear_Activation_fp16.c +++ b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Bias_Linear_Activation_fp16.c @@ -14,6 +14,12 @@ * limitations under the License. */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wextra" +#pragma GCC diagnostic ignored "-Wpointer-sign" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + #include "Gap.h" #include "CNN_BasicKernels_fp16.h" #include "CNN_Defines_fp16.h" @@ -679,3 +685,4 @@ void KerParLinearLayerLeakyReLU_fp16(KerLinear_fp16_T *Arg) gap_waitbarrier(0); } +#pragma GCC diagnostic pop diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Conv_BasicKernels_fp16.c b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Conv_BasicKernels_fp16.c index 09a8faf29..86d55c0ea 100644 --- a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Conv_BasicKernels_fp16.c +++ b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Conv_BasicKernels_fp16.c @@ -14,6 +14,12 @@ * limitations under the License. 
*/ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wextra" +#pragma GCC diagnostic ignored "-Wpointer-sign" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + #include "Gap.h" #include "CNN_BasicKernels_fp16.h" @@ -4024,3 +4030,5 @@ void KerConvNxMDxDyStrideSxSy_fp16(KerConv_fp16_T *Arg) } gap_waitbarrier(0); } + +#pragma GCC diagnostic pop diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Conv_DW_BasicKernels_fp16.c b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Conv_DW_BasicKernels_fp16.c index 681de86d8..5d7d4a3b4 100644 --- a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Conv_DW_BasicKernels_fp16.c +++ b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Conv_DW_BasicKernels_fp16.c @@ -14,6 +14,12 @@ * limitations under the License. */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wextra" +#pragma GCC diagnostic ignored "-Wpointer-sign" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + #include "Gap.h" #include "CNN_BasicKernels_fp16.h" @@ -4151,3 +4157,5 @@ void KerConvDWNxMDxDyStrideSxSy_fp16(KerConv_fp16_T *Arg) gap_waitbarrier(0); } + +#pragma GCC diagnostic pop \ No newline at end of file diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatAlgebra_fp16.c b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatAlgebra_fp16.c index ece4f10f6..401728fab 100644 --- a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatAlgebra_fp16.c +++ b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatAlgebra_fp16.c @@ -14,6 +14,12 @@ * limitations under the License. 
*/ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wextra" +#pragma GCC diagnostic ignored "-Wpointer-sign" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + #include #include #include "CNN_BasicKernels_fp16.h" @@ -7038,3 +7044,5 @@ void KerParMatMulSmallFeatLeakyrelu_fp16(KerMatMul_fp16_T *Arg) } gap_waitbarrier(0); } + +#pragma GCC diagnostic pop diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatMul_Conv_fp16.c b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatMul_Conv_fp16.c index 5af1deb3a..29730226e 100644 --- a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatMul_Conv_fp16.c +++ b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatMul_Conv_fp16.c @@ -1,3 +1,10 @@ + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wextra" +#pragma GCC diagnostic ignored "-Wpointer-sign" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + #include #include "CNN_BasicKernels_fp16.h" @@ -1387,3 +1394,5 @@ void KerPar_MM_Conv2D_DxDy_ReLU_fp16( } gap_waitbarrier(0); } + +#pragma GCC diagnostic pop diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Pooling_BasicKernels_fp16.c b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Pooling_BasicKernels_fp16.c index c04650d15..0884c78e3 100644 --- a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Pooling_BasicKernels_fp16.c +++ b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Pooling_BasicKernels_fp16.c @@ -14,6 +14,12 @@ * limitations under the License. 
*/ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wextra" +#pragma GCC diagnostic ignored "-Wpointer-sign" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + #include #include "Gap.h" #include "CNN_BasicKernels_fp16.h" @@ -1524,3 +1530,5 @@ void KerParAvgPoolNxMStrideSxSy_HWC_fp16(Ker_MM_Pool_fp16_T *Arg) } gap_waitbarrier(0); } + +#pragma GCC diagnostic pop diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/RNN_fp16.c b/tools/autotiler_v3/CNN_Libraries_fp16/RNN_fp16.c index 08f52e1a8..f72da190f 100644 --- a/tools/autotiler_v3/CNN_Libraries_fp16/RNN_fp16.c +++ b/tools/autotiler_v3/CNN_Libraries_fp16/RNN_fp16.c @@ -14,6 +14,12 @@ * limitations under the License. */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wextra" +#pragma GCC diagnostic ignored "-Wpointer-sign" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + #include #include "CNN_BasicKernels_fp16.h" @@ -485,4 +491,6 @@ void GRU_ParKer_fp16(KerGRU_fp16_T *Arg) } gap_waitbarrier(0); } -#endif \ No newline at end of file +#endif + +#pragma GCC diagnostic pop diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/SSD_BasicKernels_fp16.c b/tools/autotiler_v3/CNN_Libraries_fp16/SSD_BasicKernels_fp16.c index 1703742f3..03fa3d2af 100644 --- a/tools/autotiler_v3/CNN_Libraries_fp16/SSD_BasicKernels_fp16.c +++ b/tools/autotiler_v3/CNN_Libraries_fp16/SSD_BasicKernels_fp16.c @@ -61,12 +61,12 @@ void Ker_SSD_Init_f16(Ker_SSD_Init_Arg_f16_T *KerArg0) } // The actual code that does the tile addition -void Ker_SSD_Decoder_fp16(Ker_SSD_Decoder_Arg_fp16_T *KerArg0 ) +void Ker_SSD_Decoder_f16(Ker_SSD_Decoder_Arg_f16_T *KerArg0 ) { unsigned int CoreId = gap_coreid(); - unsigned int Chunk = ChunkSize(KerArg0->H); + unsigned int Chunk = ChunkSize(KerArg0->N_Anchors); unsigned int First = CoreId*Chunk; - unsigned int Last = (First+Chunk > KerArg0->H) ? 
(KerArg0->H) : (First+Chunk); + unsigned int Last = (First+Chunk > KerArg0->N_Anchors) ? (KerArg0->N_Anchors) : (First+Chunk); bbox_f16_t * bbox = KerArg0->bbox_buf; F16 * scores = KerArg0->classes_in; int num_classes = KerArg0->N_Classes; @@ -152,7 +152,7 @@ static int16_t KerIoverU(F16 a_x, F16 a_y, F16 a_w, F16 a_h, } -static void KerNonMaxSuppress(bbox_t * boundbxs, float iouThres, int nnbb){ +static void KerNonMaxSuppress(bbox_f16_t * boundbxs, float iouThres, int nnbb){ //BBOX value are in Q14 and non_max_threshold in Q14 int idx, idx_int; //Non-max supression @@ -175,7 +175,7 @@ static void KerNonMaxSuppress(bbox_t * boundbxs, float iouThres, int nnbb){ } } -void Ker_SSD_NMS(Ker_SSD_NMS_ArgT *KerArg0 ) +void Ker_SSD_NMS_f16(Ker_SSD_NMS_Arg_f16_T *KerArg0 ) { short int bbox_idx_max = *(KerArg0->bbox_idx); diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/SSD_BasicKernels_fp16.h b/tools/autotiler_v3/CNN_Libraries_fp16/SSD_BasicKernels_fp16.h index 3bb8ca9cd..513581a17 100644 --- a/tools/autotiler_v3/CNN_Libraries_fp16/SSD_BasicKernels_fp16.h +++ b/tools/autotiler_v3/CNN_Libraries_fp16/SSD_BasicKernels_fp16.h @@ -72,9 +72,9 @@ typedef struct { F16 NMSThr; short int n_max_bb; short int *bbox_idx; -} Ker_SSD_NMS_ArgT; +} Ker_SSD_NMS_Arg_f16_T; -void Ker_SSD_NMS(Ker_SSD_NMS_ArgT *Arg); +void Ker_SSD_NMS_f16(Ker_SSD_NMS_Arg_f16_T *Arg); diff --git a/tools/autotiler_v3/DSP_Generators/DSP_Generators.c b/tools/autotiler_v3/DSP_Generators/DSP_Generators.c index 06a9b3658..df0f64605 100644 --- a/tools/autotiler_v3/DSP_Generators/DSP_Generators.c +++ b/tools/autotiler_v3/DSP_Generators/DSP_Generators.c @@ -234,6 +234,21 @@ void LoadMFCCLibrary() ) ); + LibKernelTemplate("MatMul_DSP_T", + CArgs(10, + TCArg("void * __restrict__", "In1"), + TCArg("void * __restrict__", "In2"), + TCArg("void * __restrict__", "Out"), + TCArg("void *", "BufferColIn2"), + TCArg("unsigned int", "W_In1"), + TCArg("unsigned int", "H_In1"), + TCArg("unsigned int", "W_In2"), + TCArg("unsigned int", 
"W_Out"), + TCArg("unsigned int", "OutFirstCol"), + TCArg("int", "ColFirst") + ) + ); + /* FFT Basic Kernels */ LibKernel("Radix2FFT_DIF_Par_Fix16", CALL_PARALLEL, 0, "FFT_Arg_T", NULL); LibKernel("Radix2FFT_DIF_Par_Fix32", CALL_PARALLEL, 0, "FFT_Arg_T", NULL); @@ -332,6 +347,11 @@ void LoadMFCCLibrary() LibKernel("Conjugate_Fix32_Par", CALL_PARALLEL, CArgs(2, TCArg("int * __restrict__", "Data"), TCArg("int", "Ni")), "SwapSamples_Arg_T", NULL); LibKernel("Conjugate_Float16_Par", CALL_PARALLEL, CArgs(2, TCArg("F16V_DSP * __restrict__", "Data"), TCArg("int", "Ni")), "SwapSamples_Arg_T", NULL); LibKernel("Conjugate_Float32_Par", CALL_PARALLEL, CArgs(2, TCArg("float * __restrict__", "Data"), TCArg("int", "Ni")), "SwapSamples_Arg_T", NULL); + + LibKernel("KerParMatMulDSP_fp16", CALL_PARALLEL, 0, "MatMul_DSP_T", NULL); + LibKernel("KerParMatMulDSPT_fp16", CALL_PARALLEL, 0, "MatMul_DSP_T", NULL); + LibKernel("KerParMatMulDSP_fp32", CALL_PARALLEL, 0, "MatMul_DSP_T", NULL); + LibKernel("KerParMatMulDSPT_fp32", CALL_PARALLEL, 0, "MatMul_DSP_T", NULL); } void PieceWiseGenerator(char *Name, CNN_GenControl_T *Ctrl, char *FunName, int Dim, int DataType, int Inplace) @@ -921,6 +941,104 @@ int MFCC_Generator( return (Kernel!=0); } +int IMel_Generator( + char *Name, + CNN_GenControl_T *Ctrl, + int NFrames, + int Nfft, + int NMelBanks, + int SizeMelCoeff, + int DataType + ) +{ + if (__builtin_popcount(Nfft) != 1) GenTilingError("%s, Incorrect FFTDim: %d, it has to be a a power of 2", Name, Nfft); + if (DataType==FIX32 || DataType==FIX16) GenTilingError("Not supported FIX_32"); + + int MFCC_Coeff_Dyn = 15; + char *PreEmpKernel=0, *InverseMelKer=0, *UserKernType=0, *UserKernPointer=0, InItemSize=2, OutItemSize=2, LUTItemSize=2; + + switch (DataType){ + case FIX16: + InverseMelKer = "MelFilterBank_Fix32"; + UserKernType = "short int"; + UserKernPointer = "short int * __restrict__"; + InItemSize=2; OutItemSize=2, LUTItemSize=2; + break; + case FLOAT16: + InverseMelKer = 
"MelFilterBank_f16"; + UserKernType = "F16_DSP"; + UserKernPointer = "F16_DSP * __restrict__"; + InItemSize=F16_SIZE; OutItemSize=F16_SIZE, LUTItemSize=F16_SIZE; + break; + case FLOAT32: + InverseMelKer = "MelFilterBank_f32"; + UserKernType = "float"; + UserKernPointer = "float * __restrict__"; + InItemSize=4; OutItemSize=4, LUTItemSize=4; + break; + default: + GenTilingError("Data Type %d not known", DataType); + return 0; + } + unsigned int LayerOp = 0; + unsigned int LayerBandwidth = 0; + printf("Inverse Mel:\n"); + printf("\tNb Oper: %d\n", LayerOp); + printf("\tBandwidth: %d\n", LayerBandwidth); + + Kernel_T *Kernel = UserKernel(Name, + NFrames<0? + KernelIterSpace(2, IterFixedSpaceDynBound(D0, -NFrames, "NFrames"), IterTiledSpace(T0)): + KernelIterSpace(2, IterFixedSpace(D0, NFrames), IterTiledSpace(T0)), + TILE_HOR, + CArgs(5, + TCArg(UserKernPointer, "In"), + TCArg(UserKernPointer, "Out"), + TCArg("fbank_type_t *","IMel_FilterBank"), + TCArg(UserKernPointer, "IMel_Coeffs"), + (NFrames<0)? 
+ TCArg("short int", "NFrames"):AT_NO_C_ARG + ), + Calls(1, + Call(InverseMelKer, LOC_LOOP, + Bindings(9, + K_Arg("In", KER_ARG_TILE), + K_Arg("Out" , KER_ARG_TILE), + K_Arg("IMel_Coeffs" , KER_ARG_TILE), + K_Arg("IMel_FilterBank", KER_ARG_TILE), + Imm(NMelBanks), + Imm(MFCC_Coeff_Dyn), + AT_IGNORE_ARG_BINDING, + (DataType==FIX16)?K_Arg("shift_buff", KER_ARG_TILE):AT_IGNORE_ARG_BINDING, + AT_IGNORE_ARG_BINDING + ) + ) + ), + KerArgs(4, + KerArg("In", KerArgSpace(1,D0), OBJ_IN_DB, 1, NMelBanks, InItemSize, 0, 0, 0, "In"), + KerArg("Out", KerArgSpace(1,D0), OBJ_OUT_DB, 1, Nfft*2, OutItemSize, 0, 0, 0, "Out"), + KerArg("IMel_FilterBank", KerArgSpace(1,T0), O_IN|O_BUFF|O_CONST, 1, NMelBanks, 6, /* size of filterbank type */ 0, 0, 0, "IMel_FilterBank"), + KerArg("IMel_Coeffs", KerArgSpace(1,T0), O_IN|O_BUFF|O_CONST, 1, SizeMelCoeff, LUTItemSize, 0, 0, 0, "IMel_Coeffs") + ) + ); + if (Kernel) { + AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0); + AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0); + + if (DataType==FIX32 || DataType==FIX16) { + AddKernelArgDim(Name, "In", 3, Abs(NFrames), NMelBanks, InItemSize); + AddKernelArgDim(Name, "Out", 3, Abs(NFrames), 2*Nfft, OutItemSize); + AddKernelArgDim(Name, "IMel_Coeffs", 2, SizeMelCoeff, LUTItemSize); + } else { + AddKernelFloatArgDim(Name, "In", 3, Abs(NFrames), NMelBanks, InItemSize); + AddKernelFloatArgDim(Name, "Out", 3, Abs(NFrames), 2*Nfft, OutItemSize); + AddKernelFloatArgDim(Name, "IMel_Coeffs", 2, SizeMelCoeff, LUTItemSize); + } + AddKernelArgDim(Name, "IMel_FilterBank", 3, NMelBanks, 3, 2); + } + return (Kernel!=0); +} + int RFFT_2D_Generator( char *Name, CNN_GenControl_T *Ctrl, @@ -1538,3 +1656,126 @@ void STFT_Generator( ) ); } + +int DSP_MatMul_Generator( + char *Name, + + CNN_GenControl_T *Ctrl, + + int ColM1, + int LineM1, + int ColM2, + int LineM2, + + int TransposedIn2, + int DataType +) + +{ + int Log = 1; + Tile_Orientation_T TileOrientation = TILE_HOR; + int F = 0; + unsigned long long int 
LayerOp = 0; + unsigned long long int LayerBandwidth = 0; + int LineO = LineM1, ColO = ColM2; + int Nbuff, ItemSize; + + if (ColM1 != LineM2) GenTilingError("DSP_MatMul_Generator: %s, Incorrect input matrices dimensions for a matrix multiplication: [%d x %d]*[%d x %d]", Name, LineM1, ColM1, LineM2, ColM2); + + char *MatMulKerName=0, *UserKernType=0, *UserKernPointer=0; + switch (DataType){ + case FIX16: + GenTilingError("DSP_MatMul_Generator Not yet implemented in FIX16"); + UserKernType = "short int"; UserKernPointer = "short int * __restrict__"; + ItemSize=2; + break; + case FIX32: + GenTilingError("DSP_MatMul_Generator Not yet implemented in FIX16"); + UserKernType = "int"; UserKernPointer = "int * __restrict__"; + ItemSize=2; + break; + case FLOAT16: + MatMulKerName = TransposedIn2?"KerParMatMulDSPT_fp16":"KerParMatMulDSP_fp16"; + UserKernType = "F16_DSP"; UserKernPointer = "F16_DSP * __restrict__"; + ItemSize=F16_SIZE; F = O_FLOAT; + break; + case FLOAT32: + MatMulKerName = TransposedIn2?"KerParMatMulDSPT_fp32":"KerParMatMulDSP_fp32"; + UserKernType = "float"; UserKernPointer = "float * __restrict__"; + ItemSize=4; F = O_FLOAT; + break; + default: + GenTilingError("Data Type %d not known", DataType); + } + + + int ColFirst = ((LineM1*ColM1)<(LineM2*ColM2)); + Nbuff = 4; + LayerOp += ColM1*ColM2*LineM1; + LayerBandwidth += LineM1*(ColM1*ColM2*(2+2)); + LayerBandwidth += LineM1*ColM2*2; + LayerBandwidth += LineM1*2; + + if (Log) { + printf("CNN_MatMulAct_fp16: %s\n", Name); + printf("In1 => W: %4d, H: %4d\n", ColM1, LineM1); + printf("In2 => W: %4d, H: %4d\n", ColM2, LineM2); + printf("Out => W: %4d, H: %4d => %s\n", ColO, LineO, ColFirst?"Column first":"Line First"); + printf("Total Op: %lld\n", LayerOp); + if (MatMulKerName) printf("%20s: %s\n", "MatMulKerName", MatMulKerName); + } + + int ObjCons = (!TransposedIn2)?OBJ_CONSTRAINTS_TILE_VER:0; + if (TransposedIn2) { + LineM2 = ColM2; ColM2 = ColM1; + } + Kernel_T *Kernel = UserKernel(Name, + KernelIterSpace(2, 
IterTiledSpace(T1), IterTiledSpace(T0)), + TILE_HOR, + CArgs(3, + TCArg(UserKernPointer, "In1"), + TCArg(UserKernPointer, "In2"), + TCArg(UserKernPointer, "Out") + ), + Calls(1, + Call(MatMulKerName, LOC_LOOP, + Bindings(10, + K_Arg("In1", KER_ARG_TILE), + K_Arg("In2", KER_ARG_TILE), + K_Arg("Out", KER_ARG_TILE), + (!TransposedIn2)?K_Arg("KerBuff", KER_ARG_TILE):AT_IGNORE_ARG_BINDING, + K_Arg("In1", KER_ARG_TILE_W), + K_Arg("In1", KER_ARG_TILE_H), + TransposedIn2?K_Arg("In2", KER_ARG_TILE_H):K_Arg("In2", KER_ARG_TILE_W), + K_Arg("Out", KER_ARG_TILE_W), + K_Arg(ColFirst?"In1":"In2", KER_ARG_TILE_BASE), + Imm(ColFirst) + ) + ) + ), + ColFirst? + KerArgs(4, + (!TransposedIn2)? + KerArg("KerBuff",KerArgSpace(1,T1), F|O_BUFF|O_NTILED, Nbuff*ColM1, 1, ItemSize, 0, 0, 0, 0):AT_NO_KER_ARG, + KerArg("In1", KerArgSpace(1,T0), F|O_IN|O_DB|O_CONST, ColM1, LineM1, ItemSize, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), + KerArg("In2", KerArgSpace(1,T1), F|O_IN|O_DB, ColM2, LineM2, ItemSize, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, 2, "In2"), + KerArg("Out", KerArgSpace(1,T1), F|O_OUT|O_DB, ColO, LineO, ItemSize, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, 0, "Out") + ): + KerArgs(4, + (!TransposedIn2)? 
+ KerArg("KerBuff",KerArgSpace(1,T0), F|O_BUFF|O_NTILED, Nbuff*ColM1, 1, ItemSize, 0, 0, 0, 0):AT_NO_KER_ARG, + KerArg("In1", KerArgSpace(1,T1), F|O_IN|O_DB|O_CONST, ColM1, LineM1, ItemSize, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), + KerArg("In2", KerArgSpace(1,T0), F|O_IN|O_DB, ColM2, LineM2, ItemSize, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, 2, "In2"), + KerArg("Out", KerArgSpace(1,T1), F|O_OUT|O_DB, ColO, LineO, ItemSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Out") + ) + ); + if (Kernel) { + AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0); + AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0); + + AddKernelFloatArgDim(Name, "In1", 3, LineM1, ColM1, ItemSize); + AddKernelFloatArgDim(Name, "In2", 3, LineM2, ColM2, ItemSize); + AddKernelFloatArgDim(Name, "Out", 3, LineO, ColO, ItemSize); + } + return (Kernel!=0); +} diff --git a/tools/autotiler_v3/DSP_Generators/DSP_Generators.h b/tools/autotiler_v3/DSP_Generators/DSP_Generators.h index e6ff452ac..74938f019 100644 --- a/tools/autotiler_v3/DSP_Generators/DSP_Generators.h +++ b/tools/autotiler_v3/DSP_Generators/DSP_Generators.h @@ -29,6 +29,30 @@ int MFCC_Generator( int OutFFT /* If output FFT beside mel spect */ ); +int DSP_MatMul_Generator( + char *Name, + + CNN_GenControl_T *Ctrl, + + int ColM1, + int LineM1, + int ColM2, + int LineM2, + + int TransposedIn2, + int DataType +); + +int IMel_Generator( + char *Name, + CNN_GenControl_T *Ctrl, + int NFrames, + int Nfft, + int NMelBanks, + int SizeMelCoeff, + int DataType + ); + int RFFT_2D_Generator( char *Name, CNN_GenControl_T *Ctrl, diff --git a/tools/autotiler_v3/DSP_Libraries/DSP_Lib.h b/tools/autotiler_v3/DSP_Libraries/DSP_Lib.h index c1f642364..292a84998 100644 --- a/tools/autotiler_v3/DSP_Libraries/DSP_Lib.h +++ b/tools/autotiler_v3/DSP_Libraries/DSP_Lib.h @@ -121,9 +121,9 @@ typedef struct { } FFT_InstallArg_T; typedef struct fbank_type_ { - short int Start; - short int Items; - short int Base; + unsigned short int Start; + unsigned short int Items; + unsigned 
short int Base; } fbank_type_t; typedef struct { @@ -231,6 +231,19 @@ typedef struct { int FFT_Dim; } Windowing_T; +typedef struct { + void * __restrict__ In1; + void * __restrict__ In2; + void * __restrict__ Out; + void *BufferColIn2; + unsigned int W_In1; + unsigned int H_In1; + unsigned int W_In2; + unsigned int W_Out; + unsigned int OutFirstCol; + int ColFirst; +} MatMul_DSP_T; + /********************************************************************************************************************************************************************/ /****************** FFT Library ************************************************************************************************************************************/ /********************************************************************************************************************************************************************/ @@ -353,4 +366,9 @@ extern void WindowingReal2Cmplx_PadCenter_f16(Windowing_T *Arg); extern void WindowingReal2Real_f16(Windowing_T *Arg); extern void WindowingReal2Real_PadCenter_f16(Windowing_T *Arg); +extern void KerParMatMulDSP_fp16(MatMul_DSP_T *Arg); +extern void KerParMatMulDSPT_fp16(MatMul_DSP_T *Arg); +extern void KerParMatMulDSP_fp32(MatMul_DSP_T *Arg); +extern void KerParMatMulDSPT_fp32(MatMul_DSP_T *Arg); + #endif //DSP_LIB_H \ No newline at end of file diff --git a/tools/autotiler_v3/DSP_Libraries/FFT_Library.c b/tools/autotiler_v3/DSP_Libraries/FFT_Library.c index 9a6e96b2c..cef2cb0f9 100644 --- a/tools/autotiler_v3/DSP_Libraries/FFT_Library.c +++ b/tools/autotiler_v3/DSP_Libraries/FFT_Library.c @@ -29,8 +29,8 @@ void FFT_InstallTwiddlesAndSwapLUT(FFT_InstallArg_T *Arg, int format) LUTSize = Arg->Nfft*sizeof(short); - AT_L2_COPY(0, (AT_L2_EXT_ADDR_TYPE) Arg->SwapLUT, (AT_L2_INT_ADDR_TYPE) Arg->L1_SwapLUT, LUTSize, 0, &DmaR_Evt2); - AT_L2_COPY(0, (AT_L2_EXT_ADDR_TYPE) Arg->Twiddles, (AT_L2_INT_ADDR_TYPE)Arg->L1_Twiddles, TwidSize, 0, &DmaR_Evt1); + AT_L2_COPY(0, 
(AT_L2_EXT_ADDR_TYPE) Arg->SwapLUT, (AT_L2_INT_ADDR_TYPE) Arg->L1_SwapLUT, LUTSize, 0, &DmaR_Evt1); + AT_L2_COPY(0, (AT_L2_EXT_ADDR_TYPE) Arg->Twiddles, (AT_L2_INT_ADDR_TYPE) Arg->L1_Twiddles, TwidSize, 0, &DmaR_Evt2); AT_L2_WAIT(0, &DmaR_Evt1); AT_L2_WAIT(0, &DmaR_Evt2); @@ -42,8 +42,8 @@ void RFFT_InstallTwiddlesAndSwapLUT(FFT_InstallArg_T *Arg, int format) AT_L2_EVENT DmaR_Evt1, DmaR_Evt2, DmaR_Evt3; int TwidSize, RTwidSize, LUTSize; - if (Arg->Radix == 2) TwidSize = Arg->Nfft * sizeof(short); - else TwidSize = 3 * Arg->Nfft * (sizeof(short)/2); + if (Arg->Radix == 2) TwidSize = (Arg->Nfft>>1) * sizeof(short); + else TwidSize = 3 * (Arg->Nfft>>1) * (sizeof(short)/2); // when floating 32, size is double if (format==1) TwidSize *=2; @@ -52,10 +52,9 @@ void RFFT_InstallTwiddlesAndSwapLUT(FFT_InstallArg_T *Arg, int format) if (format==1) RTwidSize = Arg->Nfft * sizeof(float); else RTwidSize = Arg->Nfft * sizeof(short); - - AT_L2_COPY(0, (AT_L2_EXT_ADDR_TYPE) Arg->SwapLUT, (AT_L2_INT_ADDR_TYPE) Arg->L1_SwapLUT, LUTSize, 0, &DmaR_Evt1); - AT_L2_COPY(0, (AT_L2_EXT_ADDR_TYPE) Arg->Twiddles, (AT_L2_INT_ADDR_TYPE)Arg->L1_Twiddles, TwidSize, 0, &DmaR_Evt2); - AT_L2_COPY(0, (AT_L2_EXT_ADDR_TYPE) Arg->RTwiddles, (AT_L2_INT_ADDR_TYPE)Arg->L1_RTwiddles, RTwidSize, 0, &DmaR_Evt3); + AT_L2_COPY(0, (AT_L2_EXT_ADDR_TYPE) Arg->SwapLUT, (AT_L2_INT_ADDR_TYPE) Arg->L1_SwapLUT, LUTSize, 0, &DmaR_Evt1); + AT_L2_COPY(0, (AT_L2_EXT_ADDR_TYPE) Arg->Twiddles, (AT_L2_INT_ADDR_TYPE) Arg->L1_Twiddles, TwidSize, 0, &DmaR_Evt2); + AT_L2_COPY(0, (AT_L2_EXT_ADDR_TYPE) Arg->RTwiddles, (AT_L2_INT_ADDR_TYPE) Arg->L1_RTwiddles, RTwidSize, 0, &DmaR_Evt3); AT_L2_WAIT(0, &DmaR_Evt1); AT_L2_WAIT(0, &DmaR_Evt2); @@ -2014,9 +2013,10 @@ void IRFFT_DIF_Par_Fix16(RFFT_Arg_T *Arg){ if (CoreId == 0){ xAR = pA[0][0]; xAI = pA[0][1]; + xBR = pA[k+1][0]; - RFFT_Out[0][0] = (xAR + xAI) >> 1; - RFFT_Out[0][1] = (xAR - xAI) >> 1; + RFFT_Out[0][0] = (xAR + xAI + xBR) >> 1; + RFFT_Out[0][1] = (xAR + xAI - xBR) >> 1; } 
Chunk = ChunkSize(k); First = CoreId*Chunk; Last = Min(First+Chunk, k); @@ -2085,9 +2085,10 @@ void IRFFT_DIF_Par_f16(RFFT_Arg_T *Arg){ if (CoreId == 0){ xAR = pA[0][0]; xAI = pA[0][1]; + xBR = pA[k+1][0]; - RFFT_Out[0][0] = 0.5f * ( xAR + xAI ); - RFFT_Out[0][1] = 0.5f * ( xAR - xAI ); + RFFT_Out[0][0] = 0.5f * ( xAR + xAI + xBR); + RFFT_Out[0][1] = 0.5f * ( xAR + xAI - xBR); } Chunk = ChunkSize(k); First = CoreId*Chunk; Last = Min(First+Chunk, k); @@ -2159,9 +2160,10 @@ void IRFFT_DIF_Par_f32(RFFT_Arg_T *Arg){ if (CoreId == 0){ xAR = pA[0]; xAI = pA[1]; + xBR = pA[2*(k+1)]; - RFFT_Out[0] = 0.5f * ( xAR + xAI ); - RFFT_Out[1] = 0.5f * ( xAR - xAI ); + RFFT_Out[0] = 0.5f * ( xAR + xAI + xBR ); + RFFT_Out[1] = 0.5f * ( xAR + xAI - xBR ); } Chunk = ChunkSize(k); First = CoreId*Chunk; Last = Min(First+Chunk, k); diff --git a/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/GenMFCCLUT.py b/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/GenMFCCLUT.py index 947e2d625..5cea41772 100644 --- a/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/GenMFCCLUT.py +++ b/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/GenMFCCLUT.py @@ -26,7 +26,9 @@ def create_parser(): parser.add_argument('--fft_lut_file', required="--params_json" not in sys.argv, help="path to fft lut file") parser.add_argument('--mfcc_bf_lut_file', default=None, - help="path to fft lut file") + help="path to mfcc lut file") + parser.add_argument('--imel_lut_file', default=None, + help="path to inverse mel lut file") parser.add_argument('--sample_rate', default=16000, type=int) parser.add_argument('--name_suffix', default="", type=str) parser.add_argument('--frame_size', required="--params_json" not in sys.argv, type=int, @@ -81,6 +83,7 @@ def main(): fft_lut_file = args.fft_lut_file if not "fft_lut_file" in models_params else models_params["fft_lut_file"] mfcc_bf_lut_file = args.mfcc_bf_lut_file if not "mfcc_bf_lut_file" in models_params else models_params["mfcc_bf_lut_file"] + 
imel_lut_file = args.imel_lut_file if not "imel_lut_file" in models_params else models_params["imel_lut_file"] use_tf_mfcc = args.use_tf_mfcc if not "use_tf_mfcc" in models_params else models_params["use_tf_mfcc"] use_librosa = args.use_librosa if not "use_librosa" in models_params else models_params["use_librosa"] sample_rate = args.sample_rate if not "sample_rate" in models_params else models_params["sample_rate"] @@ -218,10 +221,19 @@ def main(): from SetupLUT import GenMFCC_FB filters = GenMFCC_FB(n_fft, mfcc_bank_cnt, Fmin=fmin, Fmax=fmax, sample_rate=sample_rate, dtype=lut_dtype) - MfccLUT, HeadCoeff = GenMelFilterBanksCode(filters, mfcc_bank_cnt, fmin, fmax, lut_dtype, data_type, name_suffix) + MelLUT, NCoeffMEL = GenMelFilterBanksCode(filters, mfcc_bank_cnt, fmin, fmax, lut_dtype, data_type, name_suffix) with open(mfcc_bf_lut_file, "w") as f: - f.write(MfccLUT) + f.write(MelLUT) + + if imel_lut_file: + # Inverse matrix of filterbank generated with least squares algorithm + # A.T*b = A.T*A*x^ + # x^ = (A.T*A)^-1 * A.T * b + inverse_mel_fb = np.matmul(np.linalg.inv(np.matmul(filters, filters.T)), filters) + ImelLUT = array_to_def_c_file(inverse_mel_fb.flatten(), f"ImelLUT{name_suffix}", data_type, inverse_mel_fb.size, elem_in_rows=inverse_mel_fb.size) + with open(imel_lut_file, "w") as f: + f.write(ImelLUT) if args.save_params_header: with open(args.save_params_header, "w") as f: @@ -230,11 +242,11 @@ def main(): f.write("#define\t{:21}{:>10}\n".format("FRAME_STEP", frame_step)) f.write("#define\t{:21}{:>10}\n".format("N_FFT", n_fft)) f.write("#define\t{:21}{:>10}\n".format("DATA_TYPE", 2 if dtype=="float16" else (3 if dtype=="float32" else (1 if dtype=="fix32_scal" else 0)))) - if mfcc_bf_lut_file: + if mfcc_bf_lut_file or imel_lut_file: f.write("#define\t{:21}{:>10}\n".format("MFCC_BANK_CNT", mfcc_bank_cnt)) f.write("#define\t{:21}{:>10}\n".format("FMIN", fmin)) f.write("#define\t{:21}{:>10}\n".format("FMAX", fmax)) - 
f.write("#define\t{:21}{:>10}\n".format("MFCC_COEFF_CNT", HeadCoeff+1)) + f.write("#define\t{:21}{:>10}\n".format("MFCC_COEFF_CNT", NCoeffMEL+1)) f.write("#define\t{:21}{:>10}\n".format("N_DCT", n_dct)) diff --git a/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/SetupLUT.py b/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/SetupLUT.py index 671c7d190..5460da336 100644 --- a/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/SetupLUT.py +++ b/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/SetupLUT.py @@ -117,7 +117,14 @@ def SetupLiftCoeff(L, N, dtype="int"): def GenMelFilterBanksCode(filters, mfcc_bank_cnt, fmin, fmax, dtype, data_type, name_suffix): HeadCoeff = 0 MFCC_Coeff = [] - for i, filt in enumerate(filters): + if dtype == "int": + quant_filters = FP2FIX(filters, MFCC_COEFF_DYN) + elif dtype == "float16": + quant_filters = filters.astype(np.float16) + else: + quant_filters = filters.astype(np.float32) + + for i, filt in enumerate(quant_filters): if np.all(filt == 0): Start = 0 Stop = 0 @@ -130,22 +137,17 @@ def GenMelFilterBanksCode(filters, mfcc_bank_cnt, fmin, fmax, dtype, data_type, Items = Stop - Start + 1 print("Filter {}: Start: {} Stop: {} Base: {} Items: {}".format(i, Start, Stop, Base, Items)) for j in range(Items): - if dtype == "int": - MFCC_Coeff.append(FP2FIX(filt[Start+j], MFCC_COEFF_DYN)) - elif dtype == "float16": - MFCC_Coeff.append(filt[Start+j].astype(np.float16)) - else: - MFCC_Coeff.append(filt[Start+j]) + MFCC_Coeff.append(filt[Start+j]) HeadCoeff += Items - Out_str = "#define MFCC_COEFF_CNT\t{}\n\n".format(HeadCoeff+1) - Out_str += "/* Filter Bank bands:\n\n" + #Out_str = "#define MFCC_COEFF_CNT\t{}\n\n".format(HeadCoeff+1) + Out_str = "/* Filter Bank bands:\n\n" Out_str += "\tMinimum Frequency: {} Hz\n".format(fmin) Out_str += "\tMaximum Frequency: {} Hz*/\n\n".format(fmax) Out_str += "PI_L2 fbank_type_t MFCC_FilterBank{}[{}] = {{\n".format(name_suffix, mfcc_bank_cnt) HeadCoeff = 0 - for i, filt in 
enumerate(filters): + for i, filt in enumerate(quant_filters): if np.all(filt == 0): Start = 0 Stop = 0 diff --git a/tools/autotiler_v3/DSP_Libraries/MatMulDSP.c b/tools/autotiler_v3/DSP_Libraries/MatMulDSP.c new file mode 100644 index 000000000..94aa3cea5 --- /dev/null +++ b/tools/autotiler_v3/DSP_Libraries/MatMulDSP.c @@ -0,0 +1,514 @@ +#include +#include "FastFloatApprox16.h" +#include "DSP_Lib.h" + +static int CoreCountDynamic = 1; +static int ActiveCore = gap_ncore(); +static inline unsigned int __attribute__((always_inline)) ChunkSize(unsigned int X) + +{ + unsigned int NCore; + unsigned int Log2Core; + unsigned int Chunk; + + if (CoreCountDynamic) NCore = ActiveCore; else NCore = gap_ncore(); + Log2Core = gap_fl1(NCore); + Chunk = (X>>Log2Core) + ((X&(NCore-1))!=0); + return Chunk; +} + +void KerParMatMulDSP_fp16(MatMul_DSP_T *Arg) + +{ + F16_DSP * __restrict__ In1 = (F16_DSP * __restrict__) Arg->In1; + F16_DSP * __restrict__ In2 = (F16_DSP * __restrict__) Arg->In2; + F16_DSP * __restrict__ Out = (F16_DSP * __restrict__) Arg->Out; + F16_DSP *BufferColIn2 = (F16_DSP *) Arg->BufferColIn2; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ + unsigned int W_Out = Arg->W_Out; + unsigned int OutFirstCol = Arg->OutFirstCol; + int ColFirst = Arg->ColFirst; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + F16V_DSP *VBuff1 = (F16V_DSP *) (&BufferColIn2[0]); + F16V_DSP *VBuff2 = (F16V_DSP *) (&BufferColIn2[1*H_In2]); + F16V_DSP *VBuff3 = (F16V_DSP *) (&BufferColIn2[2*H_In2]); + F16V_DSP *VBuff4 = (F16V_DSP *) (&BufferColIn2[3*H_In2]); + + unsigned int CoreId = gap_coreid(); + unsigned int ChunkCell = ChunkSize(H_In1); + unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int Iter = (Last>First)?(Last-First):0; + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); + int OffLine = 0, 
OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + for (Col=0; ColIn1; + F16_DSP * __restrict__ In2 = (F16_DSP * __restrict__) Arg->In2; + F16_DSP * __restrict__ Out = (F16_DSP * __restrict__) Arg->Out; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ + unsigned int W_Out = Arg->W_Out; + unsigned int OutFirstCol = Arg->OutFirstCol; + int ColFirst = Arg->ColFirst; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + + unsigned int CoreId = gap_coreid(); + unsigned int ChunkCell = ChunkSize(H_In1); + unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int Iter = (Last>First)?(Last-First):0; + int OffLine = 0, OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + F16_DSP * pOut = Out + (OffLine+First)*W_Out + OffCol; + for (Line=0; LineIn1; + float * __restrict__ In2 = (float *__restrict__) Arg->In2; + float * __restrict__ Out = (float *__restrict__) Arg->Out; + float *BufferColIn2 = (float *) Arg->BufferColIn2; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ + unsigned int W_Out = Arg->W_Out; + unsigned int OutFirstCol = Arg->OutFirstCol; + int ColFirst = Arg->ColFirst; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + + unsigned int CoreId = gap_coreid(); + unsigned int ChunkCell = ChunkSize(H_In1); + unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int Iter = (Last>First)?(Last-First):0; + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); + int OffLine = 0, OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + for (Col=0; ColIn1; + float * __restrict__ In2 = (float *__restrict__) Arg->In2; + float * __restrict__ Out = (float 
*__restrict__) Arg->Out; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ + unsigned int W_Out = Arg->W_Out; + unsigned int OutFirstCol = Arg->OutFirstCol; + int ColFirst = Arg->ColFirst; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + + unsigned int CoreId = gap_coreid(); + unsigned int ChunkCell = ChunkSize(H_In1); + unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int Iter = (Last>First)?(Last-First):0; + int OffLine = 0, OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + float * pOut = Out + (OffLine+First)*W_Out + OffCol; + for (Line=0; Line size) + length = size; + + fseek(file, ext, SEEK_SET); + if (dir==AT_QSPIFLASH_FS_EXT2LOC) fread(loc, 1, length, file); + else fwrite(loc, 1, length, file); + + loc = ((char *)loc) + length; + ext += stride; + } +} + + +#define AT_OSPIFLASH_FS_CONF_INIT(dev,type,name) + +#define AT_OSPIFLASH_FS_OPEN(file,conf,filename,err) \ + do { *(file) = fopen(filename, "r"); *(err) = *(file) == NULL; } while(0) + +#define AT_OSPIFLASH_FS_OPEN_WRITE(file,conf,filename,err) \ + do { *(file) = fopen(filename, "w"); *(err) = *(file) == NULL; } while(0) + +#define AT_OSPIFLASH_FS_OPEN_SET_SIZE(file, size) + +#define AT_OSPIFLASH_FS_CLOSE(file) \ + fclose(*file) + +#define AT_OSPIFLASH_FS_FC_COPY(file,ext,loc,size,dir,event) \ + __at_ospiflash_fs_copy(*(file), ext, loc, size, dir) + +#define AT_OSPIFLASH_FS_FC_COPY2D(file, dev,ext,loc,size,stride,len,dir,event) \ + __at_ospiflash_fs_copy_2d(*(file), ext, loc, size, stride, len, dir) + +#define AT_OSPIFLASH_FS_FC_WAIT(file,event) + +#define AT_OSPIFLASH_FS_CL_COPY(file,ext,loc,size,dir,event) \ + __at_ospiflash_fs_copy(*(file), ext, loc, size, dir) + +#define AT_OSPIFLASH_FS_CL_COPY2D(file, dev,ext,loc,size,stride,len,dir,event) \ + __at_ospiflash_fs_copy_2d(*(file), ext, loc, size, stride, 
len, dir) + +#define AT_OSPIFLASH_FS_CL_WAIT(file,event) + /* * EMRAMflash diff --git a/tools/autotiler_v3/Emulation/at_api_pmsis.h b/tools/autotiler_v3/Emulation/at_api_pmsis.h index 733992315..584fc99ce 100644 --- a/tools/autotiler_v3/Emulation/at_api_pmsis.h +++ b/tools/autotiler_v3/Emulation/at_api_pmsis.h @@ -18,6 +18,7 @@ #define __AT__AT_API_PMSIS_H__ #include "pmsis.h" +#include #include "bsp/ram/hyperram.h" #include "bsp/ram/spiram.h" #include "bsp/flash/hyperflash.h" @@ -83,6 +84,10 @@ static inline uint32_t gap_cl_readhwtimer() #define AT_QSPIRAM_FREE(dev,ptr,size) pi_ram_free((dev), (ptr), (size)) +#define AT_OSPIRAM_ALLOC(dev,size) ({ uint32_t ptr; int err = pi_ram_alloc((dev), &ptr, (size)); if (!err && ptr == 0) err = pi_ram_alloc((dev), &ptr, (size)); if (err) ptr = 0; ptr; }) + +#define AT_OSPIRAM_FREE(dev,ptr,size) pi_ram_free((dev), (ptr), (size)) + #define AT_L2_ALLOC(dev,size) pmsis_l2_malloc(size) #define AT_L2_FREE(dev,ptr,size) pmsis_l2_malloc_free((ptr), (size)) @@ -328,6 +333,53 @@ typedef char * AT_QSPIRAM_INT_ADDR_TYPE; pi_cl_ram_copy_wait(event) +/* + * OctaSpiram + */ + +#ifdef __GAP9__ +#define AT_OSPIRAM_TYPE 0 + +typedef struct pi_aps25xxxn_conf AT_OSPIRAM_CONF_T; +typedef struct pi_device AT_OSPIRAM_T; +typedef uint32_t AT_OSPIRAM_EXT_ADDR_TYPE; +typedef void * AT_OSPIRAM_LOC_ADDR_TYPE; +typedef pi_task_t AT_OSPIRAM_FC_EVENT; +typedef pi_cl_ram_req_t AT_OSPIRAM_CL_EVENT; +typedef uint32_t AT_OSPIRAM_POINTER; +typedef char * AT_OSPIRAM_INT_ADDR_TYPE; + +#define AT_OSPIRAM_EXT2LOC 0 +#define AT_OSPIRAM_LOC2EXT 1 + +#define AT_OSPIRAM_CONF_INIT(dev,type,name) \ + pi_aps25xxxn_conf_init(dev) + +#define AT_OSPIRAM_OPEN(dev,conf,err) \ + do { pi_open_from_conf((dev), (conf)); *(err) = pi_ram_open(dev); } while(0) + +#define AT_OSPIRAM_CLOSE(dev) \ + pi_ram_close(dev) + +#define AT_OSPIRAM_FC_COPY(dev,ext,loc,size,dir,event) \ + pi_ram_copy_async(dev, (AT_OSPIRAM_EXT_ADDR_TYPE)(ext), (AT_OSPIRAM_LOC_ADDR_TYPE)(loc), (size), !(dir), 
pi_task_block(event)) + +#define AT_OSPIRAM_FC_COPY2D(dev,ext,loc,size,stride,len,dir,event) \ + pi_ram_copy_2d_async(dev, (AT_OSPIRAM_EXT_ADDR_TYPE)(ext), (AT_OSPIRAM_LOC_ADDR_TYPE)(loc), (size), (stride), (len), !(dir), pi_task_block(event)) + +#define AT_OSPIRAM_FC_WAIT(dev,event) \ + pi_task_wait_on(event) + +#define AT_OSPIRAM_CL_COPY(dev,ext,loc,size,dir,event) \ + pi_cl_ram_copy(dev, (AT_OSPIRAM_EXT_ADDR_TYPE)(ext), (AT_OSPIRAM_LOC_ADDR_TYPE)(loc), (size), !(dir), (event)) + +#define AT_OSPIRAM_CL_COPY2D(dev,ext,loc,size,stride,len,dir,event) \ + pi_cl_ram_copy_2d(dev, (AT_OSPIRAM_EXT_ADDR_TYPE)(ext), (AT_OSPIRAM_LOC_ADDR_TYPE)(loc), (size), (stride), (len), !(dir), (event)) + +#define AT_OSPIRAM_CL_WAIT(dev,event) \ + pi_cl_ram_copy_wait(event) +#endif + /* * Spiflash */ @@ -362,6 +414,47 @@ typedef pi_cl_ram_req_t AT_QSPIFLASH_EVENT; #define AT_QSPIFLASH_WAIT(dev,event) +/* + * OctaSpiflash + */ + +#ifdef __GAP9__ +#define AT_OSPIFLASH_TYPE 1 + +#if defined(CONFIG_ATXP032) +typedef struct pi_atxp032_conf AT_OSPIFLASH_CONF_T; +#else +#if defined(CONFIG_MX25U51245G) +typedef struct pi_mx25u51245g_conf AT_OSPIFLASH_CONF_T; +#endif +#endif +typedef struct pi_device AT_OSPIFLASH_T; +typedef uint32_t AT_OSPIFLASH_EXT_ADDR_TYPE; +typedef void * AT_OSPIFLASH_LOC_ADDR_TYPE; +typedef pi_cl_ram_req_t AT_OSPIFLASH_EVENT; + +#define AT_OSPIFLASH_EXT2LOC 0 +#define AT_OSPIFLASH_LOC2EXT 1 + +#define AT_OSPIFLASH_CONF_INIT(dev,type,name) \ + pi_spiflash_conf_init(dev) + +#define AT_OSPIFLASH_OPEN(dev,conf,err) \ + do { pi_open_from_conf((dev), (conf)); *(err) = pi_flash_open(dev); } while(0) + +#define AT_OSPIFLASH_CLOSE(dev) \ + pi_flash_close(dev) + +// TODO not yet supported +#define AT_OSPIFLASH_COPY(dev,ext,loc,size,dir,event) + +// TODO not yet supported +#define AT_OSPIFLASH_COPY2D(dev,ext,loc,size,stride,len,dir,event) + +// TODO not yet supported +#define AT_OSPIFLASH_WAIT(dev,event) +#endif + /* * SPIflash FS @@ -463,6 +556,116 @@ static inline void 
__at_qspiflash_fs_close(AT_QSPIFLASH_FS_T *file) #define AT_QSPIFLASH_FS_CL_WAIT(file,event) \ pi_cl_fs_wait(event) + +/* + * OctoSPIflash FS + */ + +#ifdef __GAP9__ +#define AT_OSPIFLASH_FS_TYPE 1 + +typedef struct pi_fs_conf AT_OSPIFLASH_FS_CONF_T; + +typedef struct +{ + struct pi_device fs; + struct pi_device ospiflash; + pi_fs_file_t *file; +} AT_OSPIFLASH_FS_T; + +typedef unsigned int AT_OSPIFLASH_FS_EXT_ADDR_TYPE; +typedef void *AT_OSPIFLASH_FS_INT_ADDR_TYPE; +typedef pi_task_t AT_OSPIFLASH_FS_FC_EVENT; +typedef pi_cl_fs_req_t AT_OSPIFLASH_FS_CL_EVENT; + +static inline void __at_ospiflash_fs_open(AT_OSPIFLASH_FS_T *file, int is_write, struct pi_fs_conf *conf, const char *filename, int *err) +{ + #if defined(CONFIG_ATXP032) + struct pi_atxp032_conf flash_conf; + pi_atxp032_conf_init(&flash_conf); + #else + #if defined(CONFIG_MX25U51245G) + struct pi_mx25u51245g_conf flash_conf; + pi_mx25u51245g_conf_init(&flash_conf); + #endif + #endif + pi_open_from_conf(&file->ospiflash, &flash_conf); + if (pi_flash_open(&file->ospiflash)) + { + *err = -1; + return; + } + conf->flash = &file->ospiflash; + if (is_write) + conf->type = PI_FS_HOST; + else + conf->type = PI_FS_READ_ONLY; + + pi_open_from_conf(&file->fs, conf); + if (pi_fs_mount(&file->fs)) + { + pi_flash_close(&file->ospiflash); + *err = -1; + return; + } + file->file = pi_fs_open(&file->fs, filename, is_write ? 
PI_FS_FLAGS_WRITE : 0); + if (file->file == NULL) + { + pi_fs_unmount(&file->fs); + pi_flash_close(&file->ospiflash); + *err = -1; + return; + } + *err = 0; + + if (is_write) + file->file->size = 4*1024*1024; +} + +static inline void __at_ospiflash_fs_close(AT_OSPIFLASH_FS_T *file) +{ + pi_fs_close(file->file); + pi_fs_unmount(&file->fs); + pi_flash_close(&file->ospiflash); +} + +#define AT_OSPIFLASH_FS_EXT2LOC 0 +#define AT_OSPIFLASH_FS_LOC2EXT 1 + +#define AT_OSPIFLASH_FS_CONF_INIT(dev,type,name) \ + pi_fs_conf_init(dev) + +#define AT_OSPIFLASH_FS_OPEN(file,conf,filename,err) \ + __at_ospiflash_fs_open(file, 0, conf, filename, err) + +#define AT_OSPIFLASH_FS_OPEN_WRITE(file,conf,filename,err) \ + __at_ospiflash_fs_open(file, 1, conf, filename, err) + +#define AT_OSPIFLASH_FS_OPEN_SET_SIZE(file, size) \ + file->file->size = size + +#define AT_OSPIFLASH_FS_CLOSE(file) \ + __at_ospiflash_fs_close(file) + +#define AT_OSPIFLASH_FS_FC_COPY(fs,ext,loc,size,dir,event) \ + pi_fs_copy_async((fs)->file, ext, loc, size, !(dir), pi_task_block(event)) + +#define AT_OSPIFLASH_FS_FC_COPY2D(file, dev,ext,loc,size,stride,len,dir,event) \ + pi_fs_copy_2d_async(file->file, ext, loc, size, stride, len, !(dir), pi_task_block(event)) + +#define AT_OSPIFLASH_FS_FC_WAIT(file,event) \ + pi_task_wait_on(event) + +#define AT_OSPIFLASH_FS_CL_COPY(fs,ext,loc,size,dir,event) \ + pi_cl_fs_copy((fs)->file, ext, loc, size, !(dir), event) + +#define AT_OSPIFLASH_FS_CL_COPY2D(file, dev,ext,loc,size,stride,len,dir,event) \ + pi_cl_fs_copy_2d(file->file, ext, loc, size, stride, len, !(dir), event) + +#define AT_OSPIFLASH_FS_CL_WAIT(file,event) \ + pi_cl_fs_wait(event) +#endif + #ifdef __GAP9__ /* diff --git a/tools/autotiler_v3/Makefile b/tools/autotiler_v3/Makefile index 0abf4ba65..d099d66b9 100644 --- a/tools/autotiler_v3/Makefile +++ b/tools/autotiler_v3/Makefile @@ -1,4 +1,4 @@ -TILER_VER=4.3.1 +TILER_VER=4.3.2 export TILER_LIB=libtile.${TILER_VER}.a ifdef GAP_SDK_HOME export 
TILER_URL=$(GAP_SDK_HOME)/.tiler_url diff --git a/tools/autotiler_v3/version.cfg b/tools/autotiler_v3/version.cfg index 332f897c0..047a40256 100644 --- a/tools/autotiler_v3/version.cfg +++ b/tools/autotiler_v3/version.cfg @@ -3,7 +3,7 @@ { "version": "autotiler-v3", "magicNum": 718930176, - "git-hash": "de88fbeb3017c0db55f1e86e49cce5a0160ccbe5" + "git-hash": "4be2dc2f29bb4719d481b20c8cd37ae3b68937cf" } ] } \ No newline at end of file diff --git a/tools/jenkins/gap_sdk_version.txt b/tools/jenkins/gap_sdk_version.txt index 59f52fae3..86cc31dbb 100644 --- a/tools/jenkins/gap_sdk_version.txt +++ b/tools/jenkins/gap_sdk_version.txt @@ -1 +1 @@ -9af2d93598d20541f4c18ba45e2124b767be2388 +65d7014bdc0a46fff8f45d826301de74829b89ab diff --git a/tools/nntool/_version.py b/tools/nntool/_version.py index 62227b113..a1297615f 100644 --- a/tools/nntool/_version.py +++ b/tools/nntool/_version.py @@ -13,4 +13,4 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
-__version__ = '3.11' +__version__ = '4.1' diff --git a/tools/nntool/execution/kernels/float/dsp_preprocessing.py b/tools/nntool/execution/kernels/float/dsp_preprocessing.py index 2d094e224..111146cb3 100644 --- a/tools/nntool/execution/kernels/float/dsp_preprocessing.py +++ b/tools/nntool/execution/kernels/float/dsp_preprocessing.py @@ -18,8 +18,6 @@ import numpy as np from graph.types import MFCCPreprocessingParameters, RFFT2DPreprocessingParameters from execution.kernels.kernel_base import KernelBase, params_type, qrec_type -from quantization.multiplicative.mulbias import (apply_multiplicative_bias, - apply_zero_offset_bias) from quantization.new_qrec import QRec from utils.at_norm import at_norm diff --git a/tools/nntool/execution/kernels/float/fast_conv.py b/tools/nntool/execution/kernels/float/fast_conv.py index 0963ae22d..8c20195bd 100644 --- a/tools/nntool/execution/kernels/float/fast_conv.py +++ b/tools/nntool/execution/kernels/float/fast_conv.py @@ -51,7 +51,9 @@ def execute(cls, params, details['max_acc'] = float("-Infinity") details['min_pre_mul_bias'] = float("Infinity") details['max_pre_mul_bias'] = float("-Infinity") - + in_rank = len(in_tensor.shape) + if in_rank != 3: + raise NotImplementedError(f'{params.name} input has input rank of {in_rank} shape {in_tensor.shape} which is not supported by nntool kernels') in_tensor = in_tensor.transpose( in_dims.transpose_to_order(['h', 'w', 'c'])) if params.padding.h + params.padding.w > 0: diff --git a/tools/nntool/execution/kernels/float/tensor_functions.py b/tools/nntool/execution/kernels/float/tensor_functions.py index cae1352af..704afc0a7 100644 --- a/tools/nntool/execution/kernels/float/tensor_functions.py +++ b/tools/nntool/execution/kernels/float/tensor_functions.py @@ -13,20 +13,20 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
-import math from typing import cast as typing_cast -from utils.at_norm import at_norm import numpy as np +from skimage.transform import resize + +from execution.kernels.kernel_base import KernelBase, params_type, qrec_type from graph.types import (ConcatParameters, ConstantInputParameters, CopyParameters, InputParameters, OutputParameters, ReshapeParameters, ReverseParameters, SplitParameters, StridedSliceParameters, TransposeParameters) -from graph.types.others import (ExpandParameters, GatherParameters, NoOPParameters, - QuantizeParameters) -from execution.kernels.kernel_base import KernelBase, params_type, qrec_type +from graph.types.others import (ExpandParameters, GatherParameters, + NoOPParameters, QuantizeParameters) from quantization.new_qrec import AllFloatQRec, QRec -from skimage.transform import resize +from utils.at_norm import at_norm @params_type(InputParameters) diff --git a/tools/nntool/execution/kernels/quant/activations.py b/tools/nntool/execution/kernels/quant/activations.py index b146a8da9..73bec1c78 100644 --- a/tools/nntool/execution/kernels/quant/activations.py +++ b/tools/nntool/execution/kernels/quant/activations.py @@ -196,9 +196,9 @@ def execute(cls, params, return qrec.get_outputs(params, [in_tensor], ktype="symmetric") -@params_type(SigmoidActivationParameters) +@params_type(SigmoidActivationParameters, TanHActivationParameters) @qrec_type('scaled') -class SigmoidScaledSymmetricMult(KernelBase): +class SigmoidTanHScaledSymmetricMult(KernelBase): @classmethod def execute(cls, params, in_tensors, @@ -206,17 +206,19 @@ def execute(cls, params, **kwargs): in_tensor = qrec.prepare_inputs( params, in_tensors, ktype="symmetric")[0] - if in_tensor.dtype == np.int8: + if in_tensor.dtype == np.int8: # Q4 in_tensor = in_tensor.astype(np.int32) << 8 - elif in_tensor.dtype == np.uint8: - in_tensor = in_tensor.astype(np.int32) - qrec.in_qs[0].zero_point + elif in_tensor.dtype == np.uint8: # Q4 sym + in_tensor = in_tensor.astype(np.int32) - (1 << 8) 
in_tensor <<= 8 - elif in_tensor.dtype == np.uint16: - in_tensor = in_tensor.astype(np.int32) - qrec.in_qs[0].zero_point - else: + elif in_tensor.dtype == np.uint16: # Q12 sym + in_tensor = in_tensor.astype(np.int32) - (1 << 16) + else: # Q12 in_tensor = in_tensor.astype(np.int32) - - out_q15 = sigmoid_lut(in_tensor) + if isinstance(params, TanHActivationParameters): + out_q15 = tanh_lut(in_tensor) + else: + out_q15 = sigmoid_lut(in_tensor) scale_mul_biases_q = qrec.cache['scale_mul_biases_q'] outp = scale_mul_biases_q.apply_scales(out_q15) + qrec.cache['zero_point'] output = qrec.out_qs[0].clip(outp) @@ -251,35 +253,35 @@ def execute(cls, params, ktype="symmetric") -@params_type(TanHActivationParameters) -@qrec_type('scaled') -class TanHScaledMult(KernelBase): - @classmethod - def execute(cls, params, - in_tensors, - qrec: QRec, - **kwargs): - in_tensor = qrec.prepare_inputs( - params, in_tensors, ktype="symmetric")[0] - if in_tensor.dtype == np.int8: - in_tensor = in_tensor.astype(np.int32) << 8 - elif in_tensor.dtype == np.uint8: - in_tensor = in_tensor.astype(np.int32) - qrec.cache['zero_point'] - in_tensor <<= 8 - elif in_tensor.dtype == np.uint16: - in_tensor = in_tensor.astype(np.int32) - qrec.cache['zero_point'] - else: - in_tensor = in_tensor.astype(np.int32) - - out_q15 = tanh_lut(in_tensor) - # compute_in_out_scale(qrec, extra_scale=QType.Pow2( - # bits=32, q=7, signed=True).scale/qrec.in_qs[0].scale) - scale_mul_biases_q = qrec.cache['scale_mul_biases_q'] - outp = scale_mul_biases_q.apply_scales(out_q15) + qrec.out_qs[0].zero_point - output = qrec.out_qs[0].clip(outp) - return qrec.get_outputs(params, - [output], - ktype="symmetric") +# @params_type(TanHActivationParameters) +# @qrec_type('scaled') +# class TanHScaledMult(KernelBase): +# @classmethod +# def execute(cls, params, +# in_tensors, +# qrec: QRec, +# **kwargs): +# in_tensor = qrec.prepare_inputs( +# params, in_tensors, ktype="symmetric")[0] +# if in_tensor.dtype == np.int8: # Q4 +# in_tensor = 
in_tensor.astype(np.int32) << 8 +# elif in_tensor.dtype == np.uint8: # Q4 sym +# in_tensor = in_tensor.astype(np.int32) - (1 << 8) +# in_tensor <<= 8 +# elif in_tensor.dtype == np.uint16: # Q12 sym +# in_tensor = in_tensor.astype(np.int32) - (1 << 16) +# else: # Q12 +# in_tensor = in_tensor.astype(np.int32) + +# out_q15 = tanh_lut(in_tensor) +# # compute_in_out_scale(qrec, extra_scale=QType.Pow2( +# # bits=32, q=7, signed=True).scale/qrec.in_qs[0].scale) +# scale_mul_biases_q = qrec.cache['scale_mul_biases_q'] +# outp = scale_mul_biases_q.apply_scales(out_q15) + qrec.cache['zero_point'] +# output = qrec.out_qs[0].clip(outp) +# return qrec.get_outputs(params, +# [output], +# ktype="symmetric") @params_type(TanHActivationParameters) diff --git a/tools/nntool/execution/kernels/quant/fast_conv.py b/tools/nntool/execution/kernels/quant/fast_conv.py index c965f0d81..493c38be1 100644 --- a/tools/nntool/execution/kernels/quant/fast_conv.py +++ b/tools/nntool/execution/kernels/quant/fast_conv.py @@ -18,8 +18,7 @@ import numpy as np from graph.types import Conv2DParameters from execution.kernels.kernel_base import KernelBase, params_type, qrec_type -from quantization.multiplicative.mulbias import (apply_multiplicative_bias, - apply_zero_offset_bias) +from quantization.multiplicative.mulbias import apply_multiplicative_bias from quantization.new_qrec import QRec FORCE_INT64 = False @@ -43,11 +42,10 @@ def execute(cls, params, in_dims, out_dims = params.in_dims[0], params.out_dims[0] prepared_in_tensors = qrec.prepare_inputs(params, in_tensors, ktype="symmetric") - # if zero offset is already applied in biases by constant quantizer this does nothing - prepared_in_tensors = apply_zero_offset_bias(qrec, params, prepared_in_tensors, ktype="symmetric") in_tensor = prepared_in_tensors[0] # expand the weights to apply the zero offset weights = prepared_in_tensors[1].astype(np.int32) - qrec.in_qs[1].zero_point.astype(np.int32) + # if zero offset is already applied in biases by 
constant quantizer this does nothing biases = prepared_in_tensors[2] acc_q = qrec.cache.get('acc_q') or qrec.in_qs[2] @@ -99,7 +97,6 @@ def execute(cls, params, out_h = ((in_h - dillated_filter_h + pad_h)) // params.stride.h + 1 if params.has_bias: - # biases = qrec.prepare_biases(params, params.biases, params.weights, ktype="symmetric") if acc_q != qrec.in_qs[2]: biases = acc_q.expand_from(biases, qrec.in_qs[2]) result = np.broadcast_to(biases.reshape( diff --git a/tools/nntool/execution/kernels/quant/linear.py b/tools/nntool/execution/kernels/quant/linear.py index 318e6e71e..f8a7a4ff4 100644 --- a/tools/nntool/execution/kernels/quant/linear.py +++ b/tools/nntool/execution/kernels/quant/linear.py @@ -18,8 +18,7 @@ import numpy as np from graph.types.linear import FcParameters from execution.kernels.kernel_base import KernelBase, params_type, qrec_type -from quantization.multiplicative.mulbias import (apply_multiplicative_bias, - apply_zero_offset_bias) +from quantization.multiplicative.mulbias import apply_multiplicative_bias from quantization.new_qrec import QRec LOG = logging.getLogger("nntool." 
+ __name__) @@ -38,8 +37,6 @@ def execute(cls, params, in_dims, out_dims = params.in_dims[0], params.out_dims[0] prepared_in_tensors = qrec.prepare_inputs( params, in_tensors, ktype="symmetric") - prepared_in_tensors = apply_zero_offset_bias( - qrec, params, prepared_in_tensors, ktype="symmetric") in_tensor = prepared_in_tensors[0] # expand the weights to apply the zero offset weights = prepared_in_tensors[1].astype(np.int32) - qrec.in_qs[1].zero_point.astype(np.int32) diff --git a/tools/nntool/execution/kernels/quant/matrix_operations.py b/tools/nntool/execution/kernels/quant/matrix_operations.py index ba4c84ed6..162b7e477 100644 --- a/tools/nntool/execution/kernels/quant/matrix_operations.py +++ b/tools/nntool/execution/kernels/quant/matrix_operations.py @@ -164,8 +164,7 @@ def execute(cls, params, in_tensors, qrec: QRec, **kwargs): - in_tensors = [in_tensor.astype(np.int32) for in_tensor in qrec.prepare_inputs( - params, in_tensors, ktype="symmetric")] + in_tensors = qrec.prepare_inputs(params, in_tensors, ktype="symmetric") details = kwargs.get('details') if details is not None: results = {} diff --git a/tools/nntool/execution/kernels/quant/tensor_functions.py b/tools/nntool/execution/kernels/quant/tensor_functions.py index 39a050c9a..9487d3b9e 100644 --- a/tools/nntool/execution/kernels/quant/tensor_functions.py +++ b/tools/nntool/execution/kernels/quant/tensor_functions.py @@ -44,7 +44,9 @@ def execute(cls, params, else: in_tensor = resize(in_tensor, params.dims.shape) # output_tensors = qrec.get_outputs(params, [in_tensor], ktype="symmetric") - return [qrec.out_qs[0].quantize(in_tensor)] + if in_tensor.dtype != qrec.out_qs[0].dtype: + in_tensor = qrec.out_qs[0].quantize(in_tensor) + return [in_tensor] @params_type(OutputParameters) diff --git a/tools/nntool/expressions/symbolic/assignments.py b/tools/nntool/expressions/symbolic/assignments.py index 3cadfaf6f..ba376a030 100644 --- a/tools/nntool/expressions/symbolic/assignments.py +++ 
b/tools/nntool/expressions/symbolic/assignments.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 GreenWaves Technologies, SAS +# Copyright (C) 2022 GreenWaves Technologies, SAS # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -13,265 +13,251 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . +from collections.abc import Sequence as ABCSequence +from copy import deepcopy -from functools import reduce -from typing import Mapping - -import numpy as np +from expressions.symbolic.quantization_base import QuantizationHandlerBase from generation.code_block import CodeBlock - -from .iteration_space import IterationSpace -from .symbol import Constant, Symbol, Variable, copy_props -from .variable_container import VariableContainerAndAssigner - - -@copy_props('var') -class Assignment(VariableContainerAndAssigner, Symbol): - def __init__(self, arg, name="", var=None, **kwargs): - if var is None: - self._var = Variable(name, shape=arg.shape, symbol_binding=arg) - else: - name = var.name - self._var = var - super(Assignment, self).__init__(arg, name=name, **kwargs) +from utils.disjoint_reduction import disjoint_reduction + +from .symbol import Symbol, Variable + + +class Assignments(ABCSequence): + def __init__(self, assignments=None, returns=None, qrecs=None) -> None: + super().__init__() + self._assignments = [] + self._returns = set(returns if returns is not None else []) + self._outputs = None + self._inputs = None + self._inters = None + self._vars = [] + self._qrecs = qrecs + if assignments: + for assignment in assignments: + self.add(*assignment) + self._update() @property - def unbound_variables(self): - return self.contents[0].unbound_variables + def max_shape(self): + return tuple(max(elems) for elems in zip(*Symbol.extend_shapes(*[ass[1].shape for ass in self._assignments]))) @property - def var(self): - return self._var 
- - def find(self, name): - for elem in [self._var, self.contents[0]]: - res = elem.find(name) - if res: - return res - return None - - @property - def var_shapes(self): - shapes = {self.name: self.contents[0].shape} - shapes.update(zip(self.contents[0].unbound_variables, self.contents[0].unbound_shapes)) - return shapes - - def _resolve(self, **kwargs): - """Given a set of substitions for variable in kwargs resolve all variables""" - return self._contents[0].resolve(**kwargs) - - def _resolve_assignment(self, substitute_all=False, **kwargs) -> Mapping[str, Symbol]: - return {self.name: self._contents[0].resolve(**kwargs)} - - def _calculate(self, calculate_ranges=False, **kwargs): - res = self._contents[0].resolve(**kwargs) - if not isinstance(res, Constant): - raise ValueError( - f"unable to calculate {self.name}") - if calculate_ranges: - self.control.add_stat(self, res.value) - return res.value - - def _calculate_assignment(self, **kwargs) -> Mapping[str, np.ndarray]: - return {self.name: self._calculate(**kwargs)} - - def c_block(self, code_block: CodeBlock = None, iteration_space: IterationSpace = None): - if code_block is None: - code_block = CodeBlock() - if iteration_space: - if self.var.name in iteration_space.temporary_variables: - code_block.write( - f"{self.var.c_expr(declare=True, dtype=self.contents[0].dtype)}" - f" = {self.contents[0].c_expr(iteration_space=iteration_space)};") - else: - code_block.write( - f"{self.var.c_expr(dtype=self.contents[0].dtype)}{iteration_space.c_index(self.var.name)}" - f" = {self.contents[0].c_expr(iteration_space=iteration_space)};") - else: - code_block.write(f'{self.var.name} = {self.contents[0].c_expr()};') - return code_block + def unbound_shapes(self): + return tuple(self._vars[name].shape for name in self.unbound_variables) @property - def returned_variables(self): - return [self.name] + def input_names(self): + return self._inputs @property - def shape(self): - return self._contents[0].shape - - def 
_py_expr(self, *args, **kwargs): - return self._contents[0].py_expr(*args, **kwargs) - - def _c_expr(self, *args, **kwargs): - return self._contents[0].c_expr(*args, **kwargs) - - def __repr__(self) -> str: - return f"{{{self.var.name} <- {self.contents[0].__repr__()}}}" - - -@copy_props('preconditions', 'returned_variables') -class Let(VariableContainerAndAssigner, Symbol): - def __init__(self, *args, preconditions=None, returned_variables=None, name="", **kwargs): - args = [Assignment(arg[1], name=arg[0]) if isinstance( - arg, tuple) else arg for arg in args] - super(Let, self).__init__(*args, name=name, **kwargs) - if preconditions is None: - preconditions = [] - else: - preconditions = [Assignment(arg[1], name=arg[0]) if isinstance( - arg, tuple) else arg for arg in preconditions] - self._preconditions = preconditions - self._returned_variables = returned_variables - -# pylint: disable=invalid-name - def In(self, *expressions): - return Let(*expressions, preconditions=[self]) - - def Return(self, *variable_names): - produced = self.produced_variables - if not all(variable in produced for variable in variable_names): - raise ValueError('not all variables are produced') - return Let(*self.contents, preconditions=self.preconditions, name=self.name, returned_variables=variable_names) + def output_names(self): + return self._outputs @property def unbound_variables(self): - resolution = self.resolve_assignment() - _vars = reduce(lambda s, x: s | set( - x.unbound_variables.values()), resolution.values(), set()) - return {var.name: var for var in _vars if var.name not in set(resolution.keys())} + return self._inputs @property - def produced_variables(self): - resolution = self.resolve_assignment() - return set(resolution.keys()) + def intermediate_names(self): + return self._inters @property - def preconditions(self): - return self._preconditions + def variables(self): + return self._vars @property - def returned_variables(self): - return self._returned_variables - - 
@staticmethod - def substitute_variables(assignments): - res = {} - substitutions = {} - for var_name, val in assignments.items(): - if isinstance(val, (Constant, np.ndarray, int, float)): - substitutions[var_name] = val - else: - substitutions[var_name] = Variable(var_name, shape=val.shape) - res[var_name] = val - return res, substitutions - - def find(self, name): - for elem in list(self._preconditions) + list(self.contents): - res = elem.find(name) - if res: - return res - return None - - def _resolve_assignment(self, substitute_all=False, **kwargs) -> Mapping[str, Symbol]: - """Given a set of substitions for variable in kwargs resolve all variables - return a dictionary of variables""" - preconditions = self._resolve_contents( - contents=self._preconditions, substitute_all=substitute_all, **kwargs) - return self._resolve_contents(contents=self.contents, substitute_all=substitute_all, **preconditions) - - def _calculate_assignment(self, **kwargs) -> Mapping[str, np.ndarray]: - preconditions = self._calculate_contents( - contents=self._preconditions, **kwargs) - res = self._calculate_contents(contents=self.contents, **preconditions) - if self.returned_variables: - res = {vname: val for vname, val in res.items( - ) if vname in self.returned_variables} - return res - - @staticmethod - def _resolve_contents(contents=None, substitute_all=False, **kwargs): - if substitute_all: - substitutions = kwargs - res = kwargs - else: - res, substitutions = Let.substitute_variables(kwargs) - for elem in contents: - elem_res = elem.resolve_assignment( - substitute_all=substitute_all, **substitutions) - if substitute_all: - substitutions.update(elem_res) - res.update(elem_res) - else: - elem_res, elem_substitutions = Let.substitute_variables( - elem_res) - res.update(elem_res) - substitutions.update(elem_substitutions) - return res - - @staticmethod - def _calculate_contents(contents=None, **kwargs): - for elem in contents: - kwargs.update(elem.calculate_assignment(**kwargs)) - 
return kwargs - - def _resolve(self, **kwargs): - """Given a set of substitions for variable in kwargs resolve all variables - return a single symbol""" - preconditions = self._resolve_contents( - contents=self._preconditions, substitute_all=True, **kwargs) - resolution = self._resolve_contents( - contents=self.contents, substitute_all=True, **preconditions) - return Assignment(resolution[self.contents[-1].name], name=self.contents[-1].name) - - def _calculate(self, calculate_ranges=False, **kwargs): - res = self._resolve(**kwargs) - if not isinstance(res.contents[0], Constant): - raise ValueError( - f"unable to calculate {self.name}") - if calculate_ranges: - self.control.add_stat(self, res.value) - return res.contents[0].value + def axes(self): + var_shapes = Symbol.extend_shapes(*self.unbound_shapes, max_length=len(self.max_shape)) + axes = disjoint_reduction(set(frozenset(idx for idx, dim in enumerate( + shape) if dim != 1) for shape in var_shapes)) + return tuple(sorted([tuple(x) for x in axes])) @property def var_shapes(self): - shapes = {} - for var_name, elem in self.resolve_assignment().items(): - shapes[var_name] = elem.shape - shapes.update(dict(zip(elem.unbound_variables, elem.unbound_shapes))) - return shapes + return {var.name: var.shape for var in self._vars.values()} @property - def shape(self): - return self._contents[-1].shape + def ops(self): + # TODO: Implement + return 1 - def _py_expr(self, *args, **kwargs): - return self._contents[0].py_expr(*args, **kwargs) + @property + def qrecs(self): + return self._qrecs - def c_block(self, code_block: CodeBlock = None, iteration_space: IterationSpace = None, with_loops=False): + @property + def c_header_set(self): + return set().union(*[assignment[1].c_header_set + for assignment in self._assignments]) + + def variable(self, name): + return self._vars[name] + + def _add_int(self, var, func): + for uname, uvar in func.unbound_variables.items(): + if uname in self._vars: + uvar.shape = 
self._vars[uname].shape + uvar.qrec = self._vars[uname].qrec + if isinstance(var, str): + if var in self._vars: + var = self._vars[var] + else: + var = Variable(var, shape=func.shape, dtype=func.dtype) + self._assignments.append((var, func)) + + def add(self, var, func): + self._add_int(var, func) + self._update() + + def _update(self): + self._vars = {} + free_var_names = set() + for var, func in self._assignments: + self._vars[var.name] = var + for name, uvar in func.unbound_variables.items(): + self._vars[name] = uvar + free_var_names.add(name) + + # these are all the produced variables + prod_var_names = set( + [assignment[0].name for assignment in self._assignments]) + # sort all the variable names to keep a determined order + # the outputs are things produced that are not consumed + self._outputs = sorted( + list((prod_var_names - free_var_names) | self._returns)) + # the inputs are variables that are not produced + self._inputs = sorted(list(free_var_names - prod_var_names)) + # the intermediates are the produced variables that are not in the outputs + self._inters = sorted(list(prod_var_names - set(self._outputs))) + + def c_block(self, code_block: CodeBlock = None, iteration_space: 'IterationSpace' = None, + with_loops=False, with_comment=True, with_fixed=False, tags=None): if code_block is None: code_block = CodeBlock() + # create loops from iteration space if with_loops: assert iteration_space, "must have space" - for idx, _ in enumerate(iteration_space.axis_shape): - if idx in iteration_space.fixed_spaces: + if with_comment: + # write some comments describing the iteration space + code_block.comment( + f"Max shape: {iteration_space.shape} var shapes:") + writer = code_block.start_long_comment() + for shape_comment in [f'{name}: {shape}' + for name, shape in iteration_space.var_shapes.items()]: + writer.write(shape_comment) + writer.end() + code_block.comment( + f'Iteration reduced to spaces {iteration_space.spaces}') + code_block.comment( + f'Fixed 
spaces {iteration_space.fixed_spaces}') + code_block.comment( + f'Parameteric spaces {iteration_space.parametric_spaces}') + code_block.comment( + f'Paralelized space {iteration_space.paralellized_space}') + code_block.comment( + f'Interior spaces {iteration_space.interior_spaces}') + # write the loops + for space in iteration_space.spaces: + if not with_fixed and space in iteration_space.fixed_spaces: continue - code_block.write(f"{iteration_space.c_for(idx)} {{") + code_block.write(f"{iteration_space.c_for(space, with_fixed=with_fixed)} {{") code_block.indent() - for precondition in self.preconditions: - precondition.c_block(code_block=code_block, - iteration_space=iteration_space) - for item in self.contents: - item.c_block(code_block=code_block, - iteration_space=iteration_space) + # write each assignment + for var, func in self._assignments: + this_tags = {} if tags is None else tags.copy() + + # write comment with quantization if present + if with_comment: + uvars = [f'{uvar.name}: {uvar.qrec}' + for uvar in func.unbound_variables.values() + if uvar.qrec] + if uvars: + writer = code_block.start_long_comment() + writer.write('inputs') + for uvar in uvars: + writer.write(uvar) + writer.end() + code_block.comment(f'{var.name} = {repr(func)}') + # if iteration space is present pick up if this is a temporary or an output + # assignment from that + if iteration_space: + if var.name in iteration_space.temporary_names: + this_tags[func] = (var, True) + else: + this_tags[func] = (var, False) + else: + this_tags[func] = (var, var.name in self.intermediate_names) + + # The iteration space will be passed down the symbol structure + func.tag = True + func.c_block(code_block=code_block, + tags=this_tags, + iteration_space=iteration_space, + with_comment=with_comment) + func.tag = False + if with_loops: - for idx, _ in enumerate(iteration_space.axis_shape): - if idx in iteration_space.fixed_spaces: + for space in iteration_space.spaces: + if not with_fixed and space in 
iteration_space.fixed_spaces: continue code_block.deindent() code_block.write("}") return code_block - def _c_expr(self, *args, **kwargs): - return self._contents[0].c_expr(*args, **kwargs) + def quantize(self, quantizer: QuantizationHandlerBase, symbol_control, quantize_inputs=False, qtypes=None): + funcs = [] + out_qrecs = {} + in_qrecs = {} + for var, func in self._assignments: + qfunc, qrec = quantizer.quantize( + func, + symbol_control, + quantize_inputs=quantize_inputs, + prequantized_variables=out_qrecs, + qtypes=qtypes) + qfunc = qfunc.resolve() + in_qrecs.update(qfunc.variable_quantization) + if var.name in self._outputs: + qfunc, qrec = quantizer.quantize_output( + func, + qfunc, + var, + symbol_control, + qrec, + quantize_inputs=quantize_inputs, + prequantized_variables=out_qrecs, + qtypes=qtypes) + qfunc = qfunc.resolve() + var = deepcopy(var) + var.qrec = qrec + funcs.append((var, qfunc(substitute=True))) + out_qrecs[var.name] = qrec + in_qrecs.update(out_qrecs) + return Assignments(funcs, returns=self._returns, qrecs=in_qrecs) + + def __getitem__(self, idx): + return self._assignments[idx] + + def __len__(self) -> int: + return len(self._assignments) + + def __iter__(self): + return iter(self._assignments) + + def __call__(self, quantize_inputs=False, dequantize_outputs=False, **subs): + subs = dict(subs) + if quantize_inputs: + subs = {name: self.qrecs[name].quantize_and_clip(val) if name in self.qrecs else val + for name, val in subs.items()} + for var, func in self._assignments: + subs[var.name] = func( + dequantize_outputs=dequantize_outputs, **subs) + res = dict(filter(lambda elem: elem[0] in self._outputs, subs.items())) + if dequantize_outputs: + if self.qrecs is None: + raise ValueError('assignments are not quantized') + res = {name: self.qrecs[name].dequantize( + val) for name, val in res.items()} + return res - def __repr__(self) -> str: - return (f"Let({','.join([elem.__repr__() for elem in self.preconditions])})" - 
f".In({','.join([elem.__repr__() for elem in self.contents])})") diff --git a/tools/nntool/expressions/symbolic/basic.py b/tools/nntool/expressions/symbolic/basic.py index cdb9f9c2e..862dbb065 100644 --- a/tools/nntool/expressions/symbolic/basic.py +++ b/tools/nntool/expressions/symbolic/basic.py @@ -14,6 +14,7 @@ # along with this program. If not, see . import logging +import math import numpy as np from bfloat16 import bfloat16 @@ -21,8 +22,8 @@ from scipy.special import expit from .function import Function -from .symbol import (Constant, Rational, c_headers, copy_props, environment, - handles, handlesr, nargs) +from .symbol import (Constant, QRecBase, Rational, Symbol, Variable, c_headers, + copy_props, environment, handles, handlesr, nargs) LOG = logging.getLogger('nntool.'+__name__) @@ -33,7 +34,8 @@ class Add(Function): def _impl(self, *args, **kwargs): - return np.add(args[0], args[1], dtype=self.dtype) + res = np.add(args[0], args[1], dtype=self.dtype) + return res def _py_expr(self, *args, **kwargs): return "np.add(%s, %s)" % (args[0], args[1]) @@ -460,6 +462,8 @@ def __init__(self, *args, **kwargs): self._inner_function = self._eval(*args, **kwargs) # self._inner_function.name = self.name self._inner_function.qrec = self.qrec + self._inner_function.tag = self.tag + self._inner_function.comment = self.comment def _collect_globals(self) -> dict: global_dict = self.ENVIRONMENT or {} @@ -478,6 +482,9 @@ def _resolve(self, **kwargs): func = self._inner_function.resolve(**kwargs) # func.name = self.name func.qrec = self.qrec + if isinstance(func, Function): + func.tag = self.tag + func.comment = self.comment return func def _eval(self, *args, **kwargs): @@ -500,6 +507,14 @@ def _py_expr(self, *args, **kwargs): def _c_expr(self, *args, **kwargs): return self._inner_function.c_expr(*args, **kwargs) + def c_block(self, code_block=None, tags=None, **kwargs): + if tags is not None and self._inner_function not in tags: + name = tags.get(self, 
f'{self.SYMBOL_PREFEX}{self.name}') + if isinstance(name, str): + name = (Variable(name, dtype=self.dtype), True) + tags[self._inner_function] = name + return self._inner_function.c_block(code_block=code_block, tags=tags, **kwargs) + @nargs(1) class HTanh(CompoundFunction): @@ -544,32 +559,122 @@ def _eval(self, *args, **kwargs): return args[0] +@nargs(3) +class ClipFloat(CompoundFunction): + + def _eval(self, *args, **kwargs): + return Min(Max(args[0], args[1], dtype=self.dtype), args[2], dtype=self.dtype) + + @nargs(1) @copy_props('_from_qrec', '_to_qrec') -class ConvertFloatScaled(CompoundFunction): - def __init__(self, *args, from_qrec=None, to_qrec=None, **kwargs): +class ConvertQuantization(CompoundFunction): + def __init__(self, *args, from_qrec: QRecBase=None, to_qrec: QRecBase=None, **kwargs): self._from_qrec = from_qrec self._to_qrec = to_qrec super().__init__(*args, **kwargs) @property - def from_qrec(self): + def from_qrec(self) -> QRecBase: return self._from_qrec @property - def to_qrec(self): - return self._to_qrec + def from_is_float(self) -> bool: + return self._from_qrec.dtype in [np.float16, np.float32, bfloat16] - def _eval_float_to_quant(self, *args, **kwargs): - raise NotImplementedError() + @property + def from_is_fix(self) -> bool: + return self._from_qrec.dtype in [np.int8, np.uint8, np.int16, np.uint16, np.int32] - def _eval_quant_to_float(self, *args, **kwargs): - raise NotImplementedError() + @property + def to_is_float(self) -> bool: + return self._to_qrec.dtype in [np.float16, np.float32, bfloat16] - def _eval(self, *args, **kwargs): - if self._from_qrec.dtype == np.int16 or self._from_qrec.dtype == bfloat16: - return self._eval_float_to_quant(*args, **kwargs) - return self._eval_quant_to_float(*args, **kwargs) + @property + def to_is_fix(self) -> bool: + return self._to_qrec.dtype in [np.int8, np.uint8, np.int16, np.uint16, np.int32] + + @property + def to_qrec(self) -> QRecBase: + return self._to_qrec + + def _eval_float_to_fix(self, 
*args, **kwargs) -> Symbol: + to_qrec = self.to_qrec + from_qrec = self.from_qrec + scaled_val = Mul( + args[0], + Constant( + [math.pow(2, to_qrec.q)/to_qrec.scale], + dtype=from_qrec.dtype), + dtype=from_qrec.dtype) + if to_qrec.zero_point != 0: + # need to add zero_point plus rounding + scaled_val = Add( + scaled_val, + Constant([to_qrec.zero_point + 0.5], dtype=from_qrec.dtype), + dtype=from_qrec.dtype) + else: + # Just add rounding + scaled_val = Add( + scaled_val, + Constant([0.5], dtype=from_qrec.dtype), + dtype=from_qrec.dtype) + iinfo = np.iinfo(to_qrec.dtype) + return Cast( + ClipFloat( + scaled_val, + Constant(iinfo.min, dtype=from_qrec.dtype), + Constant(iinfo.max, dtype=from_qrec.dtype), + dtype=from_qrec.dtype), + dtype=to_qrec.dtype, + tag=self.tag, + comment=self.comment) + + def _eval_fix_to_float(self, *args, **kwargs) -> Symbol: + to_qrec = self.to_qrec + from_qrec = self.from_qrec + float_val = Cast(args[0], dtype=to_qrec.dtype) + if from_qrec.zero_point != 0: + float_val = Sub( + float_val, + Constant([from_qrec.zero_point], dtype=to_qrec.dtype), + dtype=to_qrec.dtype) + float_val = Mul( + float_val, + Constant( + [from_qrec.scale/math.pow(2, from_qrec.q)], + dtype=to_qrec.dtype), + dtype=to_qrec.dtype, + tag=self.tag, + comment=self.comment) + return float_val + + def _eval(self, *args, **kwargs) -> Symbol: + if self.from_is_float: + if self.to_is_fix: + return self._eval_float_to_fix(*args, **kwargs) + elif self.to_is_float: + if self.to_qrec.dtype != self.from_qrec.dtype: + return Cast( + *args, + dtype=self.to_qrec.dtype, + **kwargs) + return args[0] + elif self.from_is_fix: + if self.to_is_float: + return self._eval_fix_to_float(*args, **kwargs) + elif self.to_is_fix: + # if self.to_qrec.dtype == self.from_qrec.dtype: + # return args[0] + # sign_change = from_qrec.signed != to_qrec.signed + # growing = from_qrec.size < to_qrec.size + # reducing = from_qrec.size > to_qrec.size + # zeropoint_change = from_qrec.zero_point != 
to_qrec.zero_point + # scale_change = from_qrec.scale != to_qrec.scale + # q_change = from_qrec.q != to_qrec.q + raise NotImplementedError() + + raise ValueError('unsupported conversion') @nargs(2) diff --git a/tools/nntool/expressions/symbolic/common/__init__.py b/tools/nntool/expressions/symbolic/common/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/expressions/symbolic/float_quantization/float_qrec.py b/tools/nntool/expressions/symbolic/float_quantization/float_qrec.py index 5f2d4a2d7..bcb463e6f 100644 --- a/tools/nntool/expressions/symbolic/float_quantization/float_qrec.py +++ b/tools/nntool/expressions/symbolic/float_quantization/float_qrec.py @@ -26,6 +26,9 @@ def __init__(self, dtype: np.dtype, min_val=None, max_val=None) -> None: self._min_val = min_val self._max_val = max_val + def __repr__(self) -> str: + return self._dtype.__name__ + @property def min_val(self): return self._min_val diff --git a/tools/nntool/expressions/symbolic/float_quantization/float_quantization.py b/tools/nntool/expressions/symbolic/float_quantization/float_quantization.py index 00eb69f0c..fb6eae262 100644 --- a/tools/nntool/expressions/symbolic/float_quantization/float_quantization.py +++ b/tools/nntool/expressions/symbolic/float_quantization/float_quantization.py @@ -13,19 +13,17 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
-import math from typing import Tuple import numpy as np -from ..basic import Cast, ConvertFloatScaled +from ..basic import Cast, ConvertQuantization from ..q15_quantization.q15_scale_q_rec import Q15ScaleQRec from ..quantization_base import (QRecBase, QuantizationHandlerBase, handles_scheme) from ..symbol import Symbol, SymbolStats from .float_qrec import FloatQRec - @handles_scheme('Float') class FloatQuantization(QuantizationHandlerBase): @@ -64,28 +62,19 @@ def _quantize_output(cls, qtypes = kwargs.get('qtypes', {}) # first see if this has already been quantized by nntool # note that the qtype will be stored against the name of the output symbol - max_val, out_dtype, out_q = cls._get_scale_dtype_from_qtypes(osym, qtypes) - if max_val is not None: - qrec_out = Q15ScaleQRec(out_dtype, max_val, out_q) - # scale clip and cast to output type - return (ConvertFloatScaled(qsym, from_qrec=qrec, to_qrec=qrec_out), qrec_out) - if not out_dtype: - out_dtype = kwargs.get('out_dtype', np.float32) - # Just cast - return (Cast(qsym, dtype=out_dtype), FloatQRec(dtype=out_dtype, min_val=qrec.min_val, max_val=qrec.max_val)) + if not qtypes or osym.name not in qtypes: + out_dtype = kwargs.get('out_dtype', np.float32) + qrec_out = FloatQRec(out_dtype) + return (Cast(qsym, dtype=out_dtype), qrec_out) - @classmethod - def _get_scale_dtype_from_qtypes(cls, sym, qtypes): - if not qtypes or sym.name not in qtypes: - return None, None, None - qtype = qtypes[sym.name] - if qtype.dtype == np.int8: - if len(qtype.scale) > 1: - return None, None, None - return qtype.scale[0] * math.pow(2, 7), np.int8, 7 - elif qtype.dtype == np.int16: + qtype = qtypes[osym.name] + if qtype.dtype in [np.int8, np.uint8, np.int16, np.uint16]: if len(qtype.scale) > 1: - return None, None, None - return qtype.scale[0] * math.pow(2, 15), np.int16, 15 - else: - return None, qtype.dtype, None + out_dtype = kwargs.get('out_dtype', np.float32) + qrec_out = FloatQRec(out_dtype) + return (Cast(qsym, dtype=out_dtype), 
qrec_out) + max_val, min_val, bitlen = Q15ScaleQRec.dtype_zp_to_min_max(qtype.dtype, qtype.scale[0], qtype.zero_point) + qrec_out = Q15ScaleQRec(qtype.dtype, max_val, bitlen, min_val=min_val, max_val=max_val, zero_point=qtype.zero_point) + return (ConvertQuantization(qsym, from_qrec=qrec, to_qrec=qrec_out), qrec_out) + qrec_out = FloatQRec(dtype=qtype.dtype, max_val=qtype.max_val, min_val=qtype.min_val) + return (Cast(qsym, dtype=qtype.dtype),qrec_out) diff --git a/tools/nntool/expressions/symbolic/float_quantization/handlers.py b/tools/nntool/expressions/symbolic/float_quantization/handlers.py index 298139bcc..0c5c8a5e6 100644 --- a/tools/nntool/expressions/symbolic/float_quantization/handlers.py +++ b/tools/nntool/expressions/symbolic/float_quantization/handlers.py @@ -23,7 +23,7 @@ np_fastpow2, np_fastrsqrt, np_fastsigmoid, np_fasttanh) -from ..basic import (Abs, Add, ATan, Cast, Cos, Div, Exp, HSigmoid, HTanh, Log, +from ..basic import (Abs, Add, ATan, Cast, ConvertQuantization, Cos, Div, Exp, HSigmoid, HTanh, Log, Max, Min, Mul, Pow, RSqrt, Sigmoid, Sin, Sqrt, Sub, TanH) from ..function import Function from ..quantization_base import qhandler @@ -35,7 +35,6 @@ # from utils.sigmoid_tanh_lut import sigmoid_lut_float, tanh_lut_float - @qhandler("Float", Constant, Rational) class BasicConstantQuant(FloatQuantization): @@ -58,7 +57,7 @@ def _quantize(cls, sym_ctrl: SymbolStats, qrec: FloatQRec = None, **kwargs) -> Tuple[Symbol, FloatQRec]: - + # TODO: Needs merging with Q15 version prequantized_variables = kwargs.get('prequantized_variables', {}) qtypes = kwargs.get('qtypes', {}) @@ -75,25 +74,27 @@ def _quantize(cls, qrec = cls.qrec_from_qtype(qtypes[sym.name], max_val) if qrec: sym.qrec = qrec - return (sym, qrec) + if isinstance(qrec, FloatQRec): + return (sym, qrec) + out_dtype = kwargs.get('out_dtype', np.float32) + out_qrec = FloatQRec(dtype=out_dtype, max_val=max_val, min_val=-max_val) + return ( + ConvertQuantization(sym, from_qrec=qrec, to_qrec=out_qrec, 
tag=sym.name), + out_qrec) out_dtype = kwargs.get('out_dtype', np.float32) return sym, FloatQRec(dtype=out_dtype, max_val=max_val, min_val=-max_val) @classmethod def qrec_from_qtype(cls, qtype, max_val): - if qtype.dtype == np.int8 or qtype.dtype == np.int16: - if qtype.dtype == np.int8: - if len(qtype.scale) > 1: - qtype.scale = np.max(qtype.scale) - q = 7 - dtype = np.int8 - elif qtype.dtype == np.int16: - if len(qtype.scale) > 1: - qtype.scale = np.max(qtype.scale) - q = 15 - dtype = np.int16 - return Q15ScaleQRec(dtype, max_val, q, max_val=max_val, min_val=-max_val) + if qtype.dtype in [np.int8, np.uint8, np.int16, np.uint16]: + if len(qtype.scale) > 1: + return None + max_val, min_val, bitlen = Q15ScaleQRec.dtype_zp_to_min_max( + qtype.dtype, qtype.scale[0], qtype.zero_point[0]) + return Q15ScaleQRec(qtype.dtype, max_val, bitlen, + max_val=max_val, min_val=min_val, + zero_point=qtype.zero_point[0]) elif qtype.dtype in [np.float32, np.float16, bfloat16]: return FloatQRec(dtype=qtype.dtype, max_val=max_val, min_val=-max_val) else: @@ -157,6 +158,7 @@ def _c_expr(self, *args, **kwargs): # TODO - Need numpy equivalents of sin and cos # TODO - All of these should return correct function based on output type (i.e. 
bfloat16/ieee16 version) + @nargs(1) @environment({ 'npcos': np.cos, diff --git a/tools/nntool/expressions/symbolic/function.py b/tools/nntool/expressions/symbolic/function.py index e09fabecb..b44404655 100644 --- a/tools/nntool/expressions/symbolic/function.py +++ b/tools/nntool/expressions/symbolic/function.py @@ -16,6 +16,8 @@ import numpy as np +from generation.code_block import CodeBlock + from .symbol import Constant, Symbol, Variable, environment from .variable_container import VariableContainer @@ -81,16 +83,17 @@ def _resolve(self, **kwargs): for elem in self._contents] return self._eval(*contents, **kwargs) - def _calculate(self, calculate_ranges=False, track_results=None, **kwargs): + def _calculate(self, calculate_ranges=False, track_results=None, dequantize_outputs=False, **kwargs): contents = [elem.calculate(calculate_ranges=calculate_ranges, track_results=track_results, + dequantize_outputs=dequantize_outputs, **kwargs) for elem in self._contents] res = self._eval(*contents, **kwargs) if calculate_ranges: self.control.add_stat(self, res.value) if track_results is not None: - if self.qrec is not None: + if self.qrec is not None and dequantize_outputs: track_results[self.name] = self.qrec.dequantize( res.value.copy()) else: @@ -136,6 +139,38 @@ def py_compiled_lambda(self): def c_expr(self, *args, **kwargs) -> str: return self._c_expr(*(arg.c_expr(*args, **kwargs) for arg in self._contents)) + def c_block(self, code_block=None, tags=None, with_comment=False, **kwargs): + if code_block is None: + code_block = CodeBlock() + if tags is not None: + args = [] + for arg in self._contents: + arg.c_block(code_block=code_block, tags=tags, + with_comment=with_comment, **kwargs) + if arg.tag: + if arg in tags: + args.append(tags[arg]) + else: + name = tags.get(arg, f'{self.SYMBOL_PREFEX}{arg.name}') + if isinstance(name, tuple): + name = name[0].c_expr() + args.append(name) + else: + args.append(code_block.lines.pop(-1).strip()) + if self.tag: + if self.comment 
and with_comment: + code_block.write(f'// {self.comment}') + name = tags.get(self, f'{self.ctype} {self.SYMBOL_PREFEX}{self.name}') + if isinstance(name, tuple): + name = name[0].c_expr( + dtype=name[0].dtype, declare=name[1], **kwargs) + code_block.write(f'{name} = {self._c_expr(*args)};') + else: + code_block.write(f'{self._c_expr(*args)}') + else: + code_block.write(self.c_expr(*args, **kwargs)) + return code_block + @property def py_lambda(self) -> str: return "lambda %s: %s" % (",".join("%s=None" % (var) for var in self.unbound_variables), self.py_expr()) diff --git a/tools/nntool/expressions/symbolic/function_collection.py b/tools/nntool/expressions/symbolic/function_collection.py index ff6717ebd..880c3f5b8 100644 --- a/tools/nntool/expressions/symbolic/function_collection.py +++ b/tools/nntool/expressions/symbolic/function_collection.py @@ -31,7 +31,7 @@ class FunctionCollection(): def __init__(self, functions: Sequence[Tuple[Variable, Symbol]], qrecs=None) -> None: self._qrecs = qrecs # save map from produced variable to function - self._functions = {k: v for k, v in functions} + self._functions = {k: v for k, v in functions} # now create a map with producted variable name to free variables in function self._freevars = {var.name: set([name for name in func.unbound_variables.keys()]) for var, func in self._functions.items()} @@ -51,16 +51,18 @@ def __init__(self, functions: Sequence[Tuple[Variable, Symbol]], qrecs=None) -> for name, symbol in func.unbound_variables.items(): if name in self._vars: if self._vars[name] != symbol: - raise ValueError('%s points to more than one variable' % name) + raise ValueError( + '%s points to more than one variable' % name) else: self._vars[name] = symbol if res_symbol.name in self._vars: if self._vars[res_symbol.name] != res_symbol: - raise ValueError('%s points to more than one variable' % res_symbol.name) + raise ValueError( + '%s points to more than one variable' % res_symbol.name) else: self._vars[res_symbol.name] = 
res_symbol self.init_indexes() - + def init_indexes(self): # iterators contains list of iterators self._iterators = None @@ -168,7 +170,8 @@ def _create_indexes(self): key=lambda x: next(i for i in x)) idx_names = ["_".join(["d%s" % idx for idx in sorted(list(idxes))]) for idxes in unique_indexes] - idx_dims = [reduce(lambda x, y: x*max_shape[y], idxes, 1) for idxes in unique_indexes] + idx_dims = [reduce(lambda x, y: x*max_shape[y], idxes, 1) + for idxes in unique_indexes] self._iterators = [Variable(idx_name, shape=tuple([idx_dim]), dtype=np.int32) for idx_name, idx_dim in zip(idx_names, idx_dims)] if not self._iterators: @@ -202,7 +205,8 @@ def get_iterator_vars(self): if depth == 0: iters.extend([('First', 0), ('Last', var.shape[0])]) else: - iters.append((self.iterators[depth].name.upper(), var.shape[0])) + iters.append( + (self.iterators[depth].name.upper(), var.shape[0])) return iters def create_kernel(self, parallel_iterator, fixed_iterators, code_block=None): @@ -255,13 +259,23 @@ def create_kernel(self, parallel_iterator, fixed_iterators, code_block=None): assert produced_idx >= len(execution_order) return code_block - def produce_functions(self, produced_idx, execution_order, index_dependencies, depth, code_block): + def produce_functions(self, produced_idx, execution_order, index_dependencies, depth, code_block, tags=None): while (produced_idx < len(execution_order) and index_dependencies[execution_order[produced_idx].name] == depth): + this_tags = {} if tags is None else tags.copy() var = execution_order[produced_idx] declare = var.name in self.intermediate_names - code_block.write("{} = {};", var.c_expr(declare=declare, dtype=var.dtype), - self._functions[var].c_expr()) + # write comment with quantization if present + uvars = [f'{uvar.name}: {uvar.qrec}' + for uvar in self._functions[var].unbound_variables.values() + if uvar.qrec] + if uvars: + uvars = " ".join(uvars) + code_block.write(f'// inputs {uvars}') + this_tags[self._functions[var]] = (var, 
declare) + self._functions[var].tag = True + self._functions[var].c_block(code_block=code_block, tags=this_tags) + self._functions[var].tag = False produced_idx += 1 return produced_idx diff --git a/tools/nntool/expressions/symbolic/iteration_space.py b/tools/nntool/expressions/symbolic/iteration_space.py index fe07dea7b..c2f8327ee 100644 --- a/tools/nntool/expressions/symbolic/iteration_space.py +++ b/tools/nntool/expressions/symbolic/iteration_space.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 GreenWaves Technologies, SAS +# Copyright (C) 2022 GreenWaves Technologies, SAS # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -13,31 +13,69 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . + from functools import partial, reduce from itertools import groupby from operator import itemgetter from typing import List, Sequence import numpy as np -from quantization.qtype import DTYPE_GAP_CTYPE +from bfloat16 import bfloat16 +from generation.code_block import CodeBlock +from quantization.qtype import DTYPE_GAP_CTYPE, DTYPES +from .assignments import Assignments from .symbol import Symbol -from .variable_container import VariableContainerAndAssigner def split_list(sequence: Sequence): return [list(map(itemgetter(1), g)) for k, g in groupby(enumerate(sequence), lambda x: x[0]-x[1])] +def tabulate(lines): + """Takes a list of lists of strings and lines up lengths to improve formating""" + # get max length of each segment + max_len = tuple(max(elems) for elems + in zip(*[tuple(len(line_elem) for line_elem in line) + for line in lines])) + # add spaces to each line segment + + def reduction(s, x): + s.append( + f'{"".join(elem + " " * (max_len[idx] - len(elem)) for idx, elem in enumerate(x[:-1]))}{x[-1]}') + return s + return reduce(reduction, lines, []) + + +AT_ARG_DATATYPES = { + np.uint8: ('CNN_ArgDataTypeUns', 1, False, 
False), + np.uint16: ('CNN_ArgDataTypeUns', 2, False, False), + np.uint32: ('CNN_ArgDataTypeUns', 4, False, False), + np.uint64: ('CNN_ArgDataTypeUns', 8, False, False), + np.int8: ('CNN_ArgDataType', 1, True, False), + np.int16: ('CNN_ArgDataType', 2, True, False), + np.int32: ('CNN_ArgDataType', 4, True, False), + np.int64: ('CNN_ArgDataType', 8, True, False), + np.float16: ('CNN_ArgDataTypeF', 2, True, True), + bfloat16: ('CNN_ArgDataTypeF', 2, True, True), + np.float32: ('CNN_ArgDataTypeF', 4, True, True), +} + class IterationSpace(): - def __init__(self, assigner: VariableContainerAndAssigner, min_interior_space=1000) -> None: - self._assigner = assigner + def __init__(self, assignments: Assignments, constants=None, min_interior_space=1000, max_interior_space=10000, num_parameteric_spaces=2) -> None: + self._assignments = assignments self._var_shapes = None self._var_axes = None + self._var_is_constant = {} self._spaces = None + self._var_axes_idx = None self._min_interior_space = min_interior_space + self._max_interior_space = max_interior_space self._num_workers = 8 self._var_strides = {} + self._num_parameteric_spaces = num_parameteric_spaces + if constants: + self.vars_are_constant(*constants) @staticmethod def _var_name(idx): @@ -68,10 +106,27 @@ def set_var_stride(self, vname, stride): self._var_axes = None self._spaces = None self._var_strides[vname] = stride + self._var_axes_idx = None + + def is_var_constant(self, var_name): + return self._var_is_constant.get(var_name, False) + + def vars_are_constant(self, *var_names): + for var_name in var_names: + self._var_is_constant[var_name] = True + return self + + @property + def assignments(self): + return self._assignments + + @property + def real_shape(self): + return tuple(dim for dim in self.shape if dim != 1) @property def shape(self): - return max(zip(*Symbol.extend_shapes(*self._assigner.unbound_shapes))) + return self.assignments.max_shape @property def full_rank(self): @@ -96,16 +151,16 @@ def 
extended_strides(self): return tuple(reduce(lambda state, x: state + [frozenset(set.union(*x))], zip(*tuple(vstrides.values())), [])) @property - def produced_variables(self): - return set(self._assigner.returned_variables) + def output_names(self): + return self._assignments.output_names @property - def consumed_variables(self): - return set(self._assigner.unbound_variables) + def input_names(self): + return self._assignments.input_names @property - def temporary_variables(self): - return set(self.variables) - self.produced_variables - self.consumed_variables + def temporary_names(self): + return self._assignments.intermediate_names def space_for_axis(self, axis): return next((axes for axes in self.spaces if axis in axes), None) @@ -123,18 +178,35 @@ def var_axes(self): for vname, shape in self.var_shapes.items()} return self._var_axes + @property + def var_axes_idx(self): + """Map of variable name to index of iteration space used + + Returns: + dict: Map of variable name to index of iteration space used + """ + if self._var_axes_idx is None: + self._var_axes_idx = {vname: tuple(self.spaces.index(dim) for dim in axes) + for vname, axes in self.var_axes.items()} + return self._var_axes_idx + @property def variables(self): """Set of variable names """ return set(self.var_shapes) + @property + def spaces_size(self): + return tuple(int(np.prod([self.shape[idx] for idx in space])) for space in self.spaces) + @property def spaces(self): """Set of disjoint iteration spaces that have the same set of strides """ if self._spaces is None: - spaces = self._assigner.axes + max_shape = self.assignments.max_shape + spaces = self._assignments.axes # here we modify grouped spaces so that continuous spaces have the same stride if self._var_strides: final_spaces = list(spaces) @@ -148,12 +220,14 @@ def spaces(self): def reduction(var_stride, state: List, space): space_strides = {} for dim in space: - space_strides.setdefault(var_stride[dim], []).append(dim) + 
space_strides.setdefault( + var_stride[dim], []).append(dim) for space_group in space_strides.values(): state.extend(split_list(space_group)) return state - final_spaces = reduce(partial(reduction, var_stride), final_spaces, []) + final_spaces = reduce( + partial(reduction, var_stride), final_spaces, []) self._spaces = tuple(sorted(tuple(sorted(space)) for space in final_spaces)) else: @@ -161,57 +235,89 @@ def reduction(var_stride, state: List, space): return self._spaces + @property + def expanded_spaces(self): + res = [] + last = 0 + for space in self.spaces: + res.append(tuple(range(last, min(space))) + space) + if res: + res[-1] = res[-1] + tuple(range(max(res[-1])+1, len(self.shape))) + return tuple(res) + + @property + def space_total_items(self): + return tuple(np.stack(list(self.var_shapes.values())).sum(axis=0)) + + @property + def space_total_bytes(self): + variables = [self.assignments.variables[name] for name in self.var_shapes] + sizes = [1 if var.dtype is None else AT_ARG_DATATYPES[var.dtype][1] for var in variables] + return tuple((np.stack(list(self.var_shapes.values())) * np.array(sizes).reshape((-1, 1))).sum(axis=0)) + @property def var_shapes(self): if self._var_shapes is None: - self._var_shapes = self._assigner.var_shapes.copy() + self._var_shapes = self._assignments.var_shapes.copy() self._var_shapes = dict( zip(self._var_shapes.keys(), Symbol.extend_shapes(*self._var_shapes.values()))) return self._var_shapes @property - def axis_shape(self): - return tuple(np.prod([self.shape[axis] for axis in axis_set]) for axis_set in self.spaces) + def has_scalar_parameters(self): + return any(len(shape) == 1 and shape[0] == 1 for shape in self.var_shapes.values()) + @property def iterator_names(self): return [self._var_name(idx) for idx in range(len(self.spaces))] @property - def interior_space(self): + def interior_spaces(self): """This provides the minimum tile space if it is more than one axis""" + expanded_spaces = list(self.expanded_spaces) + if 
len(expanded_spaces) <= 1: + return tuple() dims = [] - shape = list(self.axis_shape) + bytes = self.space_total_bytes total = 1 - while len(shape) > 1 and total < self._min_interior_space: - dims.append(len(shape) - 1) - total *= shape[-1] - shape = shape[0:-1] - return tuple(reversed(dims)) + while len(expanded_spaces) > 1 and total < self._min_interior_space: + new_size = total * np.prod([bytes[idx] for idx in expanded_spaces[-1]]) + if new_size * 8 > self._max_interior_space: + break + dims.append(len(expanded_spaces) - 1) + total = new_size + expanded_spaces = expanded_spaces[0:-1] + return tuple(self.spaces[idx] for idx in reversed(dims)) @property def interior_shape(self): - shape = list(self.axis_shape) - return tuple(shape[idx] for idx in self.interior_space) + return tuple(self.shape[self.spaces.index(space)] for space in self.interior_spaces) @property def exterior_spaces(self): - return tuple(range(len(self.axis_shape) - len(self.interior_space))) + return tuple(self.spaces[idx] for idx in range(len(self.spaces_size) - len(self.interior_spaces))) @property def exterior_space_names(self): - return tuple(self._par_name(idx) for idx in range(len(self.exterior_spaces))) + return tuple(self._par_name(self.spaces.index(space)) for space in range(len(self.exterior_spaces))) @property def exterior_shape(self): - shape = list(self.axis_shape) + shape = list(self.spaces_size) num_ext_spaces = len(self.exterior_spaces) return tuple(shape[:num_ext_spaces - 1] + [shape[num_ext_spaces - 1] * np.prod(self.interior_shape)]) @property - def parameteric_spaces(self): - return tuple(self.exterior_spaces[-2:]) + def parametric_spaces(self): + return tuple(self.exterior_spaces[-self._num_parameteric_spaces:]) + + @property + def paralellized_space(self): + if self.parametric_spaces: + return max([(space, self.real_shape[self.spaces.index(space)]) for space in self.parametric_spaces], key=lambda x: x[1])[0] + return 0 @property def interior_shape_size(self): @@ -219,7 +325,7 
@@ def interior_shape_size(self): @property def fixed_spaces(self): - return tuple(self.exterior_spaces[:-2]) + return tuple(self.exterior_spaces[:-self._num_parameteric_spaces]) def preferred_divisor(self, space): if space == 0: @@ -232,19 +338,22 @@ def preferred_divisor(self, space): return 1 def c_indexed_var(self, var_name, declare=False, assignment=False): - if var_name in self.temporary_variables: + # if var_name.startswith('_SYMBOL'): + # return var_name + if var_name in self.temporary_names: if declare or assignment: - dtype = self._assigner.find(var_name).dtype + dtype = self._assignments.variable(var_name).dtype return f"{DTYPE_GAP_CTYPE[dtype]} {var_name}" return var_name if declare: - dtype = self._assigner.find(var_name).dtype + dtype = self._assignments.variable(var_name).dtype return f"{DTYPE_GAP_CTYPE[dtype]} *{var_name}" - return f'{var_name}{self.c_index(var_name)}' + c_index = self.c_index(var_name) + if c_index: + return f'{var_name}{c_index}' + return f'*{var_name}' def c_index(self, var_name): - var_spaces = [self.spaces.index(space) - for space in self.var_axes[var_name]] var_ext_shape = self.var_shapes[var_name] var_shape = [np.prod([var_ext_shape[dim] for dim in space]) for space in self.var_axes[var_name]] @@ -256,23 +365,24 @@ def c_index(self, var_name): assert all(var_stride_in_space[-1] == var_stride[dim] for dim in space[1:]) else: - var_stride_in_space = [1] * len(var_spaces) + var_stride_in_space = [1] * len(self.var_axes[var_name]) def reduction(state, x): var_space, space_dim, var_stride = x + var_space_idx = self.spaces.index(var_space) # fixed spaces are iterated by tiler code if var_space in self.fixed_spaces: return state space_size = str( - space_dim) if var_space in self.interior_space else self._var_max_name(var_space) + space_dim) if var_space == self.interior_spaces else self._var_max_name(var_space_idx) assert abs(var_stride) == 1, "non unit strides not supported yet" if var_stride < 0: if var_space == 0: - index = 
f'(Last-1-{self._var_name(var_space)})' + index = f'(Last-1-{self._var_name(var_space_idx)})' else: - index = f'({space_size}-1-{self._var_name(var_space)})' + index = f'({space_size}-1-{self._var_name(var_space_idx)})' else: - index = f'{self._var_name(var_space)}' + index = f'{self._var_name(var_space_idx)}' if state[0]: state[1].insert( 0, f"({index}*{'*'.join(state[0])})") @@ -280,20 +390,313 @@ def reduction(state, x): state[1].insert(0, index) state[0].insert(0, str( - space_dim) if var_space in self.interior_space else self._var_max_name(var_space)) + space_dim) if var_space == self.interior_spaces else self._var_max_name(var_space_idx)) return state - index = reduce(reduction, zip(reversed(var_spaces), + index = reduce(reduction, zip(reversed(self.var_axes[var_name]), reversed(var_shape), reversed(var_stride_in_space)), ([], []))[1] - return f"[{'+'.join(index)}]" + return f"[{'+'.join(index)}]" if index else "" + + def get_iterator_vars(self): + iters = [] + for idx, space in enumerate(self.spaces): + if space in self.interior_spaces: + continue + if space == self.paralellized_space: + iters.extend([('First', 0), ('Last', self.spaces_size[idx]), (self._var_max_name(idx), self.spaces_size[idx])]) + else: + iters.append( + (self._var_max_name(idx), self.spaces_size[idx])) + return iters - def c_for(self, space): - if space in self.fixed_spaces: + def c_for(self, space, with_fixed=False): + if not with_fixed and space == self.fixed_spaces: raise ValueError( - "space is fixed so not iterated and requires no for loop") - var_name = self._var_name(space) - if space in self.interior_space: - return f"for (int {var_name}=0; {var_name}<{self.shape[space]}; {var_name}++)" - if space == 0: + "space is fixed so not iterated inside basic kernel and requires no for loop") + space_index = self.spaces.index(space) + var_name = self._var_name(space_index) + if space in self.interior_spaces: + return f"for (int {var_name}=0; {var_name}<{self.real_shape[space_index]}; 
{var_name}++)" + if space == self.paralellized_space: return f"for (int {var_name}=First; {var_name}= len(self.exterior_spaces): + int_size *= self.spaces_size[space_idx] + continue + if var_stride and var_stride[var_dim_idx] < 0: + iter_space_descrs.append( + f'KER_ITER_D{space_idx}|SPACE_PROP_REVERT') + else: + iter_space_descrs.append(f'KER_ITER_D{space_idx}') + if iter_space_descrs: + argspace = f'KerArgSpace({len(iter_space_descrs)}, {", ".join(iter_space_descrs)})' + else: + argspace = 'KerArgSpace(1, KER_ITER_TILE0)' + if var_name in self.output_names: + constraints = "O_OUT|O_DB" if iter_space_descrs else "O_OUT|O_BUFF|O_NTILED" + elif self.is_var_constant(var_name): + constraints = "O_IN|O_DB|O_CONST" + else: + constraints = "O_IN|O_DB" if iter_space_descrs else "O_IN|O_BUFF|O_NTILED" + kargs.append( + (f'KerArg("{var_name}", ', + f'{argspace}, ', + f'{constraints}, ', + f'1, 1, ', + f'{self.ctype_len(var_name) * int_size}, ', + f'0, 0, 0, "{var_name}")')) + return tabulate(kargs) + + @property + def at_uk_cargs(self): + return ([f'TCArg({self.at_argdatatype(var_name, pointer=True, restrict=True)}, "{var_name}")' + for var_set in [self.input_names, self.output_names] + for var_name in sorted(var_set)]) + + @property + def at_uk_kinfos(self): + cvars = sorted(self.input_names) + pvars = sorted(self.output_names) + in_sizes = [np.prod(self.var_shapes[var_name]) + for var_name in cvars] + out_sizes = [np.prod(self.var_shapes[var_name]) + for var_name in pvars] + bandwidth = sum(in_sizes + out_sizes) + kinfos = [ + f"AddKernelInfos(Name, AT_KERINFO_OPER, {self._assignments.ops * max(in_sizes)}, 0)", + f"AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, {bandwidth}, 0)" + ] + ksize_infos = [] + for var_name in cvars + pvars: + shape = reduce( + lambda s, x: s + [x] if x > 1 or s else s, self.var_shapes[var_name], []) + _, item_size, _, is_float = AT_ARG_DATATYPES[ + self._assignments.variables[var_name].dtype] + add_arg_func = "AddKernelFloatArgDim" if is_float else 
"AddKernelArgDim" + ksize_infos.append( + (f'{add_arg_func}(Name, "{var_name}", ', + f'{len(shape) + 1}, ', + f'{", ".join(str(dim) for dim in shape) if shape else 1}, ', + f'{item_size})')) + return kinfos + tabulate(ksize_infos) + + @property + def kernel_args(self): + return ([(self._var_max_name(self.spaces.index(space)), 'unsigned int') for space in self.exterior_spaces] + + [(var_name, self.ctype(var_name, pointer=True, restrict=True)) + for var_set in [self.input_names, self.output_names] + for var_name in sorted(var_set)]) + + @property + def at_uk_kerbindings(self): + bindings = [ + f"K_ArgPar(\"{max(self.output_names, key=lambda x: self.var_shapes[x][idx])}\", KER_ARG_PARTILE_SIZE, KER_ITER_D{idx})" + for idx in range(len(self.exterior_shape)) + ] + [ + f"K_Arg(\"{var_name}\", KER_ARG_TILE)" + for var_set in [self.input_names, self.output_names] + for var_name in sorted(var_set) + ] + return bindings + + def comment_attrs(self, code, *attrs): + code.comment("".join(f'{name}: {getattr(self, name)} ' if getattr(self, name) else '' + for name in attrs)) + + def gen_kernel_headers(self, code: CodeBlock = None): + if code is None: + code = CodeBlock() + for include in self._assignments.c_header_set: + code.write('#include {}', include) + return code + + def gen_user_kernel(self, ukname: str, kname: str, code: CodeBlock = None): + if code is None: + code = CodeBlock() + code.write(f"int {ukname}(char *Name) {{") + code.indent() + code.write("Kernel_T *Kernel = UserKernel(") + code.indent() + code.write("Name,") + # include some useful parameters as comment + self.comment_attrs(code, + 'shape', + 'spaces') + self.comment_attrs(code, + 'fixed_spaces', + 'parametric_spaces', + 'interior_spaces') + self.comment_attrs(code, + 'exterior_shape', + 'interior_shape') + code.write(f'{self.at_uk_iterspace},') + kargs = self.at_uk_kargs + code.write("TILE_VER,") + cargs = self.at_uk_cargs + code.write(f"CArgs({len(cargs)},") + code.indent() + for carg in cargs[: -1:]: + 
code.write(f"{carg},") + code.write(f"{cargs[-1]}") + code.deindent() + code.write("),") + code.write("Calls(1,") + code.indent() + code.write(f'Call("{kname}", LOC_D{len(self.exterior_shape) - 1},') + code.indent() + bindings = self.at_uk_kerbindings + code.write(f"Bindings({len(bindings)},") + code.indent() + for binding in bindings[: -1:]: + code.write(f"{binding},") + code.write(f"{bindings[-1]}") + code.deindent() + code.write(")") + code.deindent() + code.write(")") + code.deindent() + code.write("),") + for var_name, idxes in self.var_axes_idx.items(): + if var_name in self.temporary_names: + continue + stride = f" stride: {self._var_strides[var_name]}" if var_name in self._var_strides else "" + code.comment(f'var: {var_name} axes: {idxes}{stride}') + code.write("KerArgs({0},", len(kargs)) + code.indent() + for karg in kargs[: -1:]: + code.write("{0},", karg) + code.write("{0}", kargs[-1]) + code.deindent() + code.write(")") + code.deindent() + code.write(");") + code.write("if (Kernel) {") + code.indent() + for kinfo in self.at_uk_kinfos: + code.write("{0};", kinfo) + code.deindent() + code.write("}") + code.write("return (Kernel!=0);") + code.deindent() + code.write("}") + return code + + def gen_function(self, kernel_name: str, kernel_arg_type_name: str, code: CodeBlock = None): + if code is None: + code = CodeBlock() + + code.comment( + f'Output iteration space reduced to {len(self.interior_spaces)} internal ' + f'and {len(self.exterior_spaces)} external iteration spaces') + code.write(f"void {kernel_name}({kernel_arg_type_name} *Args) {{") + code.indent() + comments = [] + for kerarg_name, _ in self.kernel_args: + # TODO - add qrecs for quantized kernels + comments.append([ + f'{self.var_shapes[kerarg_name]} ' if kerarg_name in self.var_shapes else '', + f'{self._assignments.qrecs[kerarg_name]}' if kerarg_name in self._assignments.qrecs else '' + ]) + comments = tabulate(comments) + for idx, (kerarg_name, kerarg_type) in enumerate(self.kernel_args): + # 
TODO - add qrecs for quantized kernels + comment = comments[idx] + if comment.strip(): + comment = f' // {comment}' + else: + comment = '' + code.write( + f'{kerarg_type} {kerarg_name} = Args->{kerarg_name};{comment}') + # paralellize on largest dimension + last_first = self._var_max_name(self.spaces.index(self.paralellized_space)) + code.write('unsigned int CoreId = gap_coreid();') + code.write(f'unsigned int Chunk = ChunkSize({last_first});') + code.write('unsigned int First = Chunk*CoreId;') + code.write(f'unsigned int Last = gap_min(First+Chunk, {last_first});') + self._assignments.c_block(code, iteration_space=self, + with_loops=True, with_comment=True) + code.write('gap_waitbarrier(0);') + code.deindent() + code.write('}') + return code + + def gen_kernel_arg_typedecl(self, type_name, code=None): + if code is None: + code = CodeBlock() + code.write('typedef struct {') + code.indent() + for kerarg_name, kerarg_type in self.kernel_args: + code.write(f'{kerarg_type} {kerarg_name};') + code.deindent() + code.write(f'}} {type_name};') + return code + + def gen_kernel_model(self, kernel_name, kernel_arg_type_name, code=None): + if code is None: + code = CodeBlock() + code.write('LibKernelTemplate(') + code.indent() + code.write(f'"{kernel_arg_type_name}",') + code.write(f'CArgs({len(self.kernel_args)},') + code.indent() + for idx, (kerarg_name, kerarg_type) in enumerate(self.kernel_args): + code.write('TCArg("{}", "{}"){}', + kerarg_type, + kerarg_name, + "," if idx < (len(self.kernel_args) - 1) else '') + code.deindent() + code.write(')') + code.deindent() + code.write(');') + code.write('') + code.write('LibKernel(') + code.indent() + code.write('"{}",', kernel_name) + code.write('CALL_PARALLEL,') + code.write('0,') + code.write('"{}",', kernel_arg_type_name) + code.write('0') + code.deindent() + code.write(');') + + return code diff --git a/tools/nntool/expressions/symbolic/kernel_codegen.py b/tools/nntool/expressions/symbolic/kernel_codegen.py deleted file mode 
100644 index 1e45a6ab3..000000000 --- a/tools/nntool/expressions/symbolic/kernel_codegen.py +++ /dev/null @@ -1,313 +0,0 @@ -# Copyright (C) 2020 GreenWaves Technologies, SAS - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -import logging -from typing import Sequence - -import numpy as np -from generation.code_block import CodeBlock -from quantization.qtype import DTYPES - -from .function_collection import FunctionCollection - -LOG = logging.getLogger("nntool." 
+ __name__) - - -class BasicKernel(): - def __init__(self, func_col: FunctionCollection, constant_input_names: Sequence[str]) -> None: - self._func_col = func_col - self._constant_input_names = constant_input_names - - @property - def func_col(self): - return self._func_col - - @property - def input_names(self): - return self._func_col.input_names - - @property - def input_names_and_ctypes(self): - return [(name, self.func_col.qrecs[name].ctype) for name in self.input_names] - - @property - def output_names(self): - return self._func_col.output_names - - @property - def output_names_and_ctypes(self): - return [(name, self.func_col.qrecs[name].ctype) for name in self.output_names] - - @property - def intermediate_names(self): - return self._func_col.intermediate_names - - @property - def shapes(self): - return self._func_col.var_shapes - - @property - def kernel_dims(self): - return self._func_col.kernel_dims - - @property - def kernel_args(self): - kernel_args = [] - for kiter in self.func_col.iterators: - kernel_args.append((kiter.name.upper(), "unsigned int")) - for input_name, ctype in self.input_names_and_ctypes: - kernel_args.append((input_name, f"{ctype} *")) - for output_name, ctype in self.output_names_and_ctypes: - kernel_args.append((output_name, f"{ctype} *")) - return kernel_args - - def ctype_len(self, sym_name): - dtype = self.func_col.qrecs[sym_name].dtype - if dtype not in DTYPES: - raise ValueError(f"don't know dtype {dtype}") - return DTYPES[dtype][0]//8 - - def gen_kernel_headers(self, code: CodeBlock = None): - if code is None: - code = CodeBlock() - for include in self._func_col.c_header_set: - code.write('#include {}', include) - return code - - def gen_user_kernel(self, ukname: str, kname: str, code: CodeBlock = None): - if code is None: - code = CodeBlock() - code.write("int {0}(char *Name) {{", ukname) - code.indent() - code.write("Kernel_T *Kernel = UserKernel(") - code.indent() - code.write("Name,") - code.write("{0},", 
self.gen_iterspace()) - kargs = self.gen_kargs() - code.write("TILE_HOR,") - cargs = self.gen_cargs() - code.write("CArgs({0},", len(cargs)) - code.indent() - for carg in cargs[:-1:]: - code.write("{0},", carg) - code.write("{0}", cargs[-1]) - code.deindent() - code.write("),") - code.write("Calls(1,") - code.indent() - code.write("Call(\"{0}\", LOC_D{1},", kname, - len(self.func_col.iterators) - 1) - code.indent() - bindings = self.gen_kerbingings() - code.write("Bindings({0},", len(bindings)) - code.indent() - for binding in bindings[:-1:]: - code.write("{0},", binding) - code.write("{0}", bindings[-1]) - code.deindent() - code.write(")") - code.deindent() - code.write(")") - code.deindent() - code.write("),") - code.write("KerArgs({0},", len(cargs)) - code.indent() - for karg in kargs[:-1:]: - code.write("{0},", karg) - code.write("{0}", kargs[-1]) - code.deindent() - code.write(")") - code.deindent() - code.write(");") - code.write("if (Kernel) {") - code.indent() - for kinfo in self.gen_kinfos(): - code.write("{0};", kinfo) - code.deindent() - code.write("}") - code.write("return (Kernel!=0);") - code.deindent() - code.write("}") - return code - - def gen_kinfos(self): - in_sizes = [np.prod(self._func_col.var_shapes[var_name]) - for var_name in self._func_col.input_names] - bandwidth = sum([np.prod(self._func_col.var_shapes[var_name]) - for var_name in self._func_col.output_names]) + sum(in_sizes) - kinfos = [ - "AddKernelInfos(Name, AT_KERINFO_OPER, {0}, 0)".format( - self._func_col.ops * max(in_sizes)), - "AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, {0}, 0)".format( - bandwidth) - ] - for name_type in self.input_names_and_ctypes + self.output_names_and_ctypes: - name = name_type[0] - shape = self.shapes[name] - kinfos.append("{0}(Name, \"{1}\", {2}, {3}, {4})".format( - "AddKernelFloatArgDim" if name_type[1] == 'F16' or name_type[1] == 'float' else "AddKernelArgDim", - name, len(shape) + 1, ", ".join(str(dim) for dim in shape), - self.ctype_len(name))) - 
return kinfos - - def gen_cargs(self): - cargs = [] - for name_type in self.input_names_and_ctypes + self.output_names_and_ctypes: - name = name_type[0] - if name_type[1] == 'F16' or name_type[1] == 'float': - cargs.append("TCArg(CNN_ArgDataTypeF({0},1,1),\"{1}\")".format( - self.ctype_len(name), name)) - else: - cargs.append("TCArg(CNN_ArgDataType({0},1,1),\"{1}\")".format( - self.ctype_len(name), name)) - return cargs - - def gen_kargs(self): - kargs = [] - for input_name in self.input_names: - arg_indexes = self._func_col.variable_indexes[input_name] - argspaces = ", ".join(f'KER_ITER_D{idx}' for idx in arg_indexes) - argspace = f'KerArgSpace({len(arg_indexes)}, {argspaces})' if arg_indexes else f'KerArgSpace(1, KER_ITER_TILE0)' - if input_name in self._constant_input_names: - constraints = "O_IN|O_DB|O_CONST" - else: - constraints = "O_IN|O_DB" if arg_indexes else "O_IN|O_BUFF|O_NTILED" - kargs.append("KerArg(\"{0}\", {1}, {2}, {3}, {4}, {5}, 0, 0, 0, \"{0}\")".format( - input_name, - argspace, - constraints, - 1, - 1, - self.ctype_len(input_name))) - - for output_name in self.output_names: - arg_indexes = self._func_col.variable_indexes[output_name] - argspaces = ", ".join(f'KER_ITER_D{idx}' for idx in arg_indexes) - argspace = f'KerArgSpace({len(arg_indexes)}, {argspaces})' if arg_indexes else 'KerArgSpace(1, KER_ITER_TILE0)' - name = output_name - constraints = "O_OUT|O_DB" if arg_indexes else "O_OUT|O_BUFF|O_NTILED" - kargs.append("KerArg(\"{0}\", {1}, {2}, {3}, {4}, {5}, 0, 0, 0, \"{0}\")".format( - name, argspace, constraints, 1, 1, - self.ctype_len(output_name))) - return kargs - - def gen_iterspace(self): - # All iterators are in parametric spaces. 
The iterator we will - # parallelize on has its preferred div set to 8 - # since only 3 tiled spaces are allowed including the dummy TILE0 space if there are scalars - # we check for that and only tile the first 3 spaces - tiled_iterators = self.tiled_iterators - iterators = [ - f'IterFixedSpace(KER_ITER_D{idx}, {iterator.shape[0]})' - if iterator not in tiled_iterators else - f'IterParSpace(KER_ITER_D{idx}, {iterator.shape[0]}, ' - f'{min(8, iterator.shape[0]) if iterator == self.parallel_iterator else 1})' - for idx, iterator in enumerate(self._func_col.iterators)] - # append dummy TILE0 space to put scalars into if there are scalar inputs (which is unlikely) - if self.has_scalar_parameters: - iterators.append('IterTiledSpace(KER_ITER_TILE0)') - return f'KernelIterSpace({len(iterators)}, {",".join(iterators)})' - - - def gen_kerbingings(self): - max_dim_var = max(self.output_names, key=lambda x: len(self.shapes[x])) - bindings = [ - f"K_ArgPar(\"{max_dim_var}\", KER_ARG_PARTILE_SIZE, KER_ITER_D{idx})" - for idx in range(len(self._func_col.iterators)) - ] + [ - f"K_Arg(\"{name}\", KER_ARG_TILE)" - for name in self.input_names + self.output_names - ] - return bindings - - @property - def parallel_iterator(self): - return max(self.func_col.iterators, key=lambda x: x.shape[0]) - - @property - def tiled_iterators(self): - return sorted(self.func_col.iterators, key=lambda x: x.shape[0])[-2::] - - @property - def fixed_iterators(self): - tiled_iterators = self.tiled_iterators - return [iterator for iterator in self.func_col.iterators if iterator not in tiled_iterators] - - @property - def has_scalar_parameters(self): - return any(not self._func_col.variable_indexes[input_name] - for input_name in self.input_names + self.output_names) - - def gen_function(self, kernel_name, kernel_arg_type_name, code=None): - if code is None: - code = CodeBlock() - - code.comment( - "Output iteration space reduced to %s iteration spaces" % (self.kernel_dims)) - code.write(f"void 
{kernel_name}({kernel_arg_type_name} *Args) {{") - code.indent() - for kerarg_name, kerarg_type in self.kernel_args: - code.write('{0} {1} = Args->{1};', kerarg_type, kerarg_name) - # paralellize on largest dimension - last_first = self.parallel_iterator.name.upper() - code.write('unsigned int CoreId = gap_coreid();') - code.write('unsigned int Chunk = ChunkSize({});', last_first) - code.write('unsigned int First = Chunk*CoreId;') - code.write('unsigned int Last = gap_min(First+Chunk, {});', last_first) - self._func_col.create_kernel(self.parallel_iterator, self.fixed_iterators, code) - code.write('gap_waitbarrier(0);') - code.deindent() - code.write('}') - return code - - def kernel_arg_type_codegen(self, type_name, code=None): - if code is None: - code = CodeBlock() - code.write('typedef struct {') - code.indent() - for kerarg_name, kerarg_type in self.kernel_args: - code.write('{} {};', kerarg_type, kerarg_name) - code.deindent() - code.write('}} {};', type_name) - return code - - def gen_kernel_model(self, kernel_name, kernel_arg_type_name, code=None): - if code is None: - code = CodeBlock() - code.write('LibKernelTemplate(') - code.indent() - code.write('"{}",', kernel_arg_type_name) - code.write('CArgs({},', len(self.kernel_args)) - code.indent() - for idx, (kerarg_name, kerarg_type) in enumerate(self.kernel_args): - code.write('TCArg("{}", "{}"){}', - kerarg_type, - kerarg_name, - "," if idx < (len(self.kernel_args) - 1) else '') - code.deindent() - code.write(')') - code.deindent() - code.write(');') - code.write('LibKernel(') - code.indent() - code.write('"{}",', kernel_name) - code.write('CALL_PARALLEL,') - code.write('0,') - code.write('"{}",', kernel_arg_type_name) - code.write('0') - code.deindent() - code.write(');') - - return code diff --git a/tools/nntool/expressions/symbolic/q15_quantization/handlers.py b/tools/nntool/expressions/symbolic/q15_quantization/handlers.py index 4cd591872..eab7394a4 100644 --- 
a/tools/nntool/expressions/symbolic/q15_quantization/handlers.py +++ b/tools/nntool/expressions/symbolic/q15_quantization/handlers.py @@ -17,24 +17,26 @@ from typing import Tuple import numpy as np +from bfloat16 import bfloat16 +from expressions.symbolic.float_quantization.float_qrec import FloatQRec from utils.exp_17_15 import exp_fp_17_15 -from utils.pow_sqrt import (arctan_17_15alt, logn_17_15, pow_17_15, rsqrt_16_16, sqrt_17_15, - square_17_15) +from utils.pow_sqrt import (arctan_17_15alt, logn_17_15, pow_17_15, + rsqrt_16_16, sqrt_17_15, square_17_15) from utils.sigmoid_tanh_lut import sigmoid_lut, tanh_lut from utils.sin_cos import fpcos, fpsin from ..basic import (Abs, Add, ATan, Cast, Cos, Div, Exp, GapAbs, GapMax, - GapMin, Log, LShift, Max, Min, Mul, Pow, RSqrt, Sigmoid, Sin, - Sqrt, Sub, TanH) + GapMin, Log, LShift, Max, Min, Mul, Pow, RSqrt, Sigmoid, + Sin, Sqrt, Sub, TanH) from ..function import Function from ..quantization_base import qhandler -from ..symbol import (Constant, Rational, Symbol, SymbolStats, Variable, - c_headers, environment, nargs) +from ..symbol import (Constant, QuantizedConstant, QuantizedValue, Rational, + Symbol, SymbolStats, Variable, c_headers, environment, + nargs) from .clip_norm import Clip, Norm from .q15_scale_float import Q15ScaleFloat from .q15_scale_q_rec import Q15ScaleQRec from .q15_scaled_quantization import Q15ScaledQuantization -from .quantized_constant import QuantizedConstant, QuantizedValue from .scale_quantized import ScaleQuantized @@ -91,10 +93,22 @@ def _quantize(cls, # see if an nntool quantizer qtype is available if not qrec and qtypes and sym.name in qtypes: - sym, qrec = cls.qrec_from_qtype(sym, qtypes[sym.name]) + in_range = sym_ctrl.get_range(sym) + qtype = qtypes[sym.name] + if in_range is None: + if qtype.max_val is not None and qtype.min_val is None: + in_range = (qtype.min_val, qtype.max_val) + else: + in_range = (qtype.min, qtype.max) + osym, qrec = cls.qrec_from_qtype(sym, qtypes[sym.name], 
in_range) if qrec: - sym.qrec = qrec - return (sym, qrec) + osym.qrec = qrec + if isinstance(qrec, Q15ScaleQRec) and qrec.zero_point != 0: + osym = Sub(Cast(osym, dtype=np.int32), QuantizedConstant( + qrec.zero_point), dtype=np.int32, tag=True) + qrec = Q15ScaleQRec.override( + qrec, dtype=np.int32, zero_point=0) + return (osym, qrec) # figure out the quantization from the maximum value recorded max_val = sym_ctrl.get_max(sym) @@ -110,41 +124,41 @@ def _quantize(cls, return (sym, qrec) @classmethod - def qrec_from_qtype(cls, sym, qtype): - if qtype.dtype == np.int8: + def qrec_from_qtype(cls, sym, qtype, in_range): + if qtype.dtype in [np.int8, np.uint8, np.int16, np.uint16]: if len(qtype.scale) > 1: return sym, None - max_val = qtype.scale[0] * (math.pow(2, 7) - qtype.zero_point[0]) - min_val = -qtype.scale[0] * (math.pow(2, 7) + qtype.zero_point[0]) - return sym, Q15ScaleQRec(np.int8, max_val, 7, - max_val=max_val, min_val=min_val, - zero_point=qtype.zero_point[0]) - elif qtype.dtype == np.int16: - if len(qtype.scale) > 1: - return sym, None - max_val = qtype.scale[0] * (math.pow(2, 15) - qtype.zero_point[0]) - min_val = -qtype.scale[0] * (math.pow(2, 15) + qtype.zero_point[0]) - return sym, Q15ScaleQRec(np.int16, max_val, 15, - max_val=max_val, min_val=min_val, - zero_point=qtype.zero_point[0]) - elif qtype.dtype == np.uint8: - if len(qtype.scale) > 1: - return sym, None - max_val = qtype.scale[0] * (math.pow(2, 8) - qtype.zero_point[0]) - min_val = qtype.scale[0] * -qtype.zero_point[0] - return sym, Q15ScaleQRec(np.uint8, max_val, 8, - max_val=max_val, min_val=min_val, - zero_point=qtype.zero_point[0]) - elif qtype.dtype == np.uint16: - if len(qtype.scale) > 1: - return sym, None - max_val = qtype.scale[0] * (math.pow(2, 16) - qtype.zero_point[0]) - min_val = qtype.scale[0] * -qtype.zero_point[0] - return sym, Q15ScaleQRec(np.uint16, max_val, 16, - max_val=max_val, min_val=min_val, - zero_point=qtype.zero_point[0]) - else: - return None + max_val, min_val, 
bitlen = Q15ScaleQRec.dtype_zp_to_min_max( + qtype.dtype, qtype.scale[0], qtype.zero_point[0]) + qrec = Q15ScaleQRec(qtype.dtype, max_val, bitlen, + max_val=max_val, min_val=min_val, + zero_point=qtype.zero_point[0]) + sym.qrec = qrec + return sym, qrec + if qtype.dtype in [np.float16, bfloat16, np.float32]: + qrec = FloatQRec(dtype=qtype.dtype) + sym.qrec = qrec + max_val = np.max(np.maximum( + np.abs(in_range[0]), np.abs(in_range[1]))) + return ( + Cast( + Add( + Mul( + sym, + Constant( + np.atleast_1d(math.pow(2, 15) / + max_val).astype(qtype.dtype), + dtype=qtype.dtype), + dtype=qtype.dtype), + Constant([0.5], dtype=qtype.dtype), + dtype=qtype.dtype + ), + dtype=np.int32), + Q15ScaleQRec(np.int32, max_val, 15, + max_val=max_val, min_val=-max_val, + zero_point=0)) + raise NotImplementedError( + "don't know how to convert input type to Q15 quantization") @qhandler("Q15Scale", QuantizedValue) @@ -184,7 +198,8 @@ def cast_symbols(in_syms, qrecs, dtype=np.int32): def find_range(sym, qrecs): - assert all(qrec.min_val is not None and qrec.max_val is not None for qrec in qrecs), 'all values must be set' + assert all( + qrec.min_val is not None and qrec.max_val is not None for qrec in qrecs), 'all values must be set' val_range = np.array([ sym.call_with_constants(qrecs[0].min_val, qrecs[1].min_val), sym.call_with_constants(qrecs[0].max_val, qrecs[1].min_val), @@ -240,12 +255,21 @@ def _quantize(cls, if scale_to == 0: in_syms = [ in_syms[0], - ScaleQuantized(in_syms[1], from_qrec=in_qrecs[1], to_qrec=in_qrecs[0])] + ScaleQuantized( + in_syms[1], + from_qrec=in_qrecs[1], + to_qrec=in_qrecs[0], + tag=True, + comment=f"{sym.name} scale arg 1 to 0 - {in_qrecs[1]} -> {in_qrecs[0]}")] calc_qrec = Q15ScaleQRec.override(in_qrecs[0], dtype=np.int32) elif scale_to == 1: in_syms = [ ScaleQuantized( - in_syms[0], from_qrec=in_qrecs[0], to_qrec=in_qrecs[1]), + in_syms[0], + from_qrec=in_qrecs[0], + to_qrec=in_qrecs[1], + tag=True, + comment=f"{sym.name} scale arg 0 to 1 - 
{in_qrecs[0]} -> {in_qrecs[1]}"), in_syms[1]] calc_qrec = Q15ScaleQRec.override(in_qrecs[1], dtype=np.int32) else: @@ -258,11 +282,16 @@ def _quantize(cls, # Try not to scale if we are still in bounds # This creates more error -> if np.abs(calc_qrec.quantize(max_val)) < max_short or - if calc_qrec == out_qrec: - return (sym_cls(*in_syms), calc_qrec) + # if calc_qrec == out_qrec: + return (sym_cls(*in_syms), calc_qrec) - return (ScaleQuantized(sym_cls(*in_syms, dtype=out_qrec.dtype), - from_qrec=calc_qrec, to_qrec=out_qrec), out_qrec) + # return ( + # ScaleQuantized( + # sym_cls(*in_syms, dtype=out_qrec.dtype), + # from_qrec=calc_qrec, + # to_qrec=out_qrec, + # tag=True, + # comment=f'{sym.name} scale result to output - {calc_qrec} -> {out_qrec}'), out_qrec) @qhandler("Q15Scale", Mul) @@ -294,9 +323,13 @@ def _quantize(cls, out_qrec = Q15ScaleQRec(np.int32, prod_scale, min( prod_q, 15), max_val=prod_scale, min_val=-prod_scale) if prod_q > 15: - qsym = Norm(sym_cls(*in_syms, dtype=np.int32), - QuantizedConstant(prod_q - 15), - dtype=np.int32) + qsym = Norm( + sym_cls(*in_syms, dtype=np.int32), + QuantizedConstant(prod_q - 15), + dtype=np.int32, + tag=True, + comment=f'normalize input to Q15 - {prod_q - 15}' + ) else: qsym = sym_cls(*in_syms) return (qsym, out_qrec) @@ -400,7 +433,7 @@ def _quantize(cls, in_qrec.q - 15), dtype=in_sym.dtype) out_qrec = Q15ScaleQRec(np.int32, new_scale, 15) - return (Cast(Sqrt1715(in_sym, dtype=np.uint32), dtype=np.int32), out_qrec) + return (Cast(Sqrt1715(in_sym, dtype=np.uint32), dtype=np.int32, tag=True), out_qrec) @nargs(1) @@ -451,7 +484,7 @@ def _quantize(cls, in_sym = in_syms[0] out_qrec = Q15ScaleQRec(np.int32, new_scale, 15) - return (Cast(Norm(RSqrt1616(in_sym, dtype=np.uint32), QuantizedConstant(norm), dtype=np.uint32), dtype=np.int32), out_qrec) + return (Cast(Norm(RSqrt1616(in_sym, dtype=np.uint32), QuantizedConstant(norm), dtype=np.uint32), dtype=np.int32, tag=True), out_qrec) @nargs(1) @@ -504,7 +537,7 @@ def 
_quantize(cls, max_bits = math.ceil(math.log2(math.fabs(-340695 + qlog_off))) + 2 return ( ScaleQuantized(Add(Log1715(in_sym, dtype=np.int32), QuantizedConstant( - qlog_off), dtype=np.int32), from_qrec=calc_qrec, to_qrec=out_qrec, num_bits=31-max_bits), + qlog_off), dtype=np.int32), from_qrec=calc_qrec, to_qrec=out_qrec, num_bits=31-max_bits, tag=True), out_qrec) @@ -571,7 +604,7 @@ def _quantize(cls, if val == 2: out_qrec = Q15ScaleQRec(np.int32, np.power(lhs_qrec.scale, 2), 15) - return (Cast(Square1715(lhs, dtype=np.int32), dtype=np.int32), out_qrec) + return (Cast(Square1715(lhs, dtype=np.int32), dtype=np.int32, tag=True), out_qrec) if val == -2: out_qrec = Q15ScaleQRec( np.int32, 1/np.power(lhs_qrec.scale, 2), 15) @@ -583,10 +616,20 @@ def _quantize(cls, out_qrec = Q15ScaleQRec(np.int32, 1, 0) return (QuantizedConstant(1), out_qrec) if val > 0 and val < 1: + out_scale = np.power(lhs_qrec.scale, val).item() out_qrec = Q15ScaleQRec( - np.uint32, np.power(lhs_qrec.scale, val), 15) + np.uint32, out_scale, 15) qval = int(math.floor(val * math.pow(2, 15) + 0.5)) - return (Cast(Pow1715(lhs, QuantizedConstant(qval), dtype=np.int32), dtype=np.int32), out_qrec) + return ( + Cast( + Pow1715( + lhs, + QuantizedConstant(qval), + dtype=np.int32), + dtype=np.int32, + comment=f'{sym.name} POW on scale {lhs_qrec.scale:.3f} -> {out_scale:.3f}', + tag=True), + out_qrec) raise NotImplementedError( "power is currently only supported with fractional constants, 2, 1, & 0") @@ -631,7 +674,7 @@ def _quantize(cls, return (ScaleQuantized(Cast(Exp1715(arg, dtype=np.uint32), dtype=np.int32), from_qrec=calc_qrec, - to_qrec=out_qrec), + to_qrec=out_qrec, tag=True), out_qrec) @@ -669,7 +712,7 @@ def _quantize(cls, in_syms, in_qrecs = cls.cast_symbols(in_syms, in_qrecs) lhs = ScaleQuantized( in_syms[0], from_qrec=in_qrecs[0], to_qrec=calc_qrec) - return (Arctan1715(lhs), calc_qrec) + return (Arctan1715(lhs, tag=True), calc_qrec) @nargs(1) @@ -723,8 +766,17 @@ def _quantize(cls, # output is 
Q12 * 1 out_qrec = Q15ScaleQRec(np.int16, 1, 12) in_syms, in_qrecs = cls.cast_symbols(in_syms, in_qrecs) - lhs = Cast(Clip(ScaleQuantized( - in_syms[0], from_qrec=in_qrecs[0], to_qrec=calc_qrec), dtype=calc_qrec.dtype, clip_dtype=np.int16), dtype=np.int16) + lhs = Cast( + Clip( + ScaleQuantized( + in_syms[0], + from_qrec=in_qrecs[0], + to_qrec=calc_qrec), + dtype=calc_qrec.dtype, + clip_dtype=np.int16), + dtype=np.int16, + tag=True, + comment=f'{sym.name} scale and clip input - {in_qrecs[0]} -> {calc_qrec}') if isinstance(sym, Cos): qsym = Cos_Q15 else: @@ -782,7 +834,8 @@ def _quantize(cls, func = 'tanh' if isinstance(sym, TanH) else 'sigmoid' calc_qrec = Q15ScaleQRec(np.int32, 1, 12) # output is Q15 * 1 - out_qrec = Q15ScaleQRec(np.int32, 1, 15, min_val=-1.0 if func == 'tanh' else 0.0, max_val=1.0) + out_qrec = Q15ScaleQRec( + np.int32, 1, 15, min_val=-1.0 if func == 'tanh' else 0.0, max_val=1.0) in_syms, in_qrecs = cls.cast_symbols(in_syms, in_qrecs) lhs = ScaleQuantized( in_syms[0], from_qrec=in_qrecs[0], to_qrec=calc_qrec) diff --git a/tools/nntool/expressions/symbolic/q15_quantization/q15_scale_q_rec.py b/tools/nntool/expressions/symbolic/q15_quantization/q15_scale_q_rec.py index 5e43157ba..0cb6e4d7f 100644 --- a/tools/nntool/expressions/symbolic/q15_quantization/q15_scale_q_rec.py +++ b/tools/nntool/expressions/symbolic/q15_quantization/q15_scale_q_rec.py @@ -18,12 +18,16 @@ import numpy as np +from quantization.qtype import DTYPES + from ..quantization_base import QRecBase class Q15ScaleQRec(QRecBase): def __init__(self, dtype: np.dtype, scale: float, q: int, min_val=None, max_val=None, zero_point=0) -> None: super(Q15ScaleQRec, self).__init__(dtype) + if isinstance(scale, np.ndarray): + scale = scale.item() self._scale = scale self._q = q self._min_val = min_val @@ -31,7 +35,7 @@ def __init__(self, dtype: np.dtype, scale: float, q: int, min_val=None, max_val= self._zero_point = zero_point def __repr__(self) -> str: - return f"{self._dtype.__name__} 
{self.scale} Q{self._q}" + return f"{self._dtype.__name__} {self.scale:.3f} Q{self._q}" @classmethod def inherit(cls, rec, dtype: np.dtype = None, scale: float = None, q: int = None, max_val=None, min_val=None, zero_point=None): @@ -157,3 +161,14 @@ def __str__(self) -> str: def __eq__(self, o: object) -> bool: return self.q == o.q and self.scale == o.scale and self.dtype == o.dtype and self.zero_point == o.zero_point + + @staticmethod + def dtype_zp_to_min_max(dtype, scale, zero_point): + bitlen, signed = DTYPES[dtype] + maxquns = math.pow(2, bitlen) + zpoff = math.pow(2, bitlen - 1) if signed else 0 + maxq_range = maxquns - (zero_point + zpoff) + minq_range = maxquns - maxq_range + max_val = maxq_range * scale + min_val = minq_range * scale + return max_val, min_val, bitlen - (1 if signed or zero_point != 0 else 0) diff --git a/tools/nntool/expressions/symbolic/q15_quantization/q15_scaled_quantization.py b/tools/nntool/expressions/symbolic/q15_quantization/q15_scaled_quantization.py index 5f5f81185..1f7488c90 100644 --- a/tools/nntool/expressions/symbolic/q15_quantization/q15_scaled_quantization.py +++ b/tools/nntool/expressions/symbolic/q15_quantization/q15_scaled_quantization.py @@ -13,12 +13,14 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
-from typing import Tuple, cast +import math +from typing import Tuple import numpy as np -import math +from bfloat16 import bfloat16 +from expressions.symbolic.float_quantization.float_qrec import FloatQRec -from ..basic import Cast +from ..basic import Cast, ConvertQuantization from ..quantization_base import (QRecBase, QuantizationHandlerBase, handles_scheme) from ..symbol import Symbol, SymbolStats @@ -63,6 +65,55 @@ def _dequantize_py_expr(cls, py_expr: str, qrec: Q15ScaleQRec, **kwargs) -> np.n def _dequantize_c_expr(cls, c_expr: str, qrec: Q15ScaleQRec, **kwargs) -> np.ndarray: return qrec.dequantize_c_expr(c_expr) + # @classmethod + # def _quantize_output(cls, + # sym: Symbol, + # qsym: Symbol, + # osym: Symbol, + # sym_ctrl: SymbolStats, + # qrec: QRecBase, + # **kwargs) -> Tuple[Symbol, QRecBase]: + # from_qrec = qrec + # qtypes = kwargs.get('qtypes', {}) + # # first see if this has already been quantized by nntool + # # note that the qtype will be stored against the name of the output symbol + # res = cls._get_scale_dtype_from_qtypes( + # osym, qtypes) + # if res is None: + # max_val = math.fabs(sym_ctrl.get_max(sym)) + # min_val = -max_val + # out_dtype = np.int8 + # out_q = 7 + # zero_point = 0 + # else: + # max_val, min_val, out_dtype, out_q, zero_point = res + + # qrec_scale = Q15ScaleQRec(np.int32, max_val, out_q, min_val=min_val, max_val=max_val, zero_point=zero_point) + # qrec_out = Q15ScaleQRec(out_dtype, max_val, out_q, min_val=min_val, max_val=max_val, zero_point=zero_point) + # # scale clip and cast to output type + # return ( + # Cast( + # Clip( + # ScaleQuantized(qsym, + # from_qrec=from_qrec, + # to_qrec=qrec_scale), + # clip_dtype=out_dtype, + # dtype=qrec_scale.dtype), + # dtype=qrec.dtype), qrec_out) + + # @classmethod + # def _get_scale_dtype_from_qtypes(cls, sym, qtypes): + # if not qtypes or sym.name not in qtypes: + # return None + # qtype = qtypes[sym.name] + # if qtype.dtype in [np.int8, np.uint8, np.int16, np.uint16]: + # if 
len(qtype.scale) > 1: + # return None + # max_val, min_val, bitlen = Q15ScaleQRec.dtype_zp_to_min_max(qtype.dtype, qtype.scale[0], qtype.zero_point) + # return max_val, min_val, qtype.dtype, bitlen, qtype.zero_point + # else: + # return None + @classmethod def _quantize_output(cls, sym: Symbol, @@ -75,49 +126,56 @@ def _quantize_output(cls, qtypes = kwargs.get('qtypes', {}) # first see if this has already been quantized by nntool # note that the qtype will be stored against the name of the output symbol - max_val, out_dtype, out_q, zero_point = cls._get_scale_dtype_from_qtypes( - osym, qtypes) - if max_val is None: + + if qtypes and osym.name in qtypes: + qtype = qtypes[osym.name] + if qtype.dtype in [np.int8, np.uint8, np.int16, np.uint16]: + max_val, min_val, out_q = Q15ScaleQRec.dtype_zp_to_min_max( + qtype.dtype, qtype.scale[0], qtype.zero_point) + out_dtype = qtype.dtype + zero_point = qtype.zero_point + elif qtype.dtype in [np.float16, bfloat16, np.float32]: + min_val = qtype.min_val + max_val = qtype.max_val + out_dtype = qtype.dtype + else: + raise ValueError(f"don't know how to output {qtype.dtype}") + else: + out_dtype = kwargs.get('out_dtype', np.int8) + assert out_dtype in [np.int8, np.int16] max_val = math.fabs(sym_ctrl.get_max(sym)) - out_dtype = np.int8 - out_q = 7 + min_val = -max_val + out_dtype = out_dtype + out_q = 7 if out_dtype == np.int8 else 15 zero_point = 0 -#pylint: disable=invalid-unary-operand-type - min_val = -max_val - qrec_scale = Q15ScaleQRec(np.int32, max_val, out_q, min_val=min_val, max_val=max_val, zero_point=zero_point) - qrec_out = Q15ScaleQRec(out_dtype, max_val, out_q, min_val=min_val, max_val=max_val, zero_point=zero_point) - # scale clip and cast to output type + if out_dtype in [np.float16, bfloat16, np.float32]: + qrec_out = FloatQRec( + dtype=out_dtype, max_val=max_val, min_val=min_val) + return ( + ConvertQuantization( + qsym, + from_qrec=from_qrec, + to_qrec=qrec_out, + comment=f'convert quantization - {from_qrec} -> 
{qrec_out}' + ), qrec_out) + + qrec_scale = Q15ScaleQRec( + np.int32, max_val, out_q, min_val=min_val, max_val=max_val, zero_point=zero_point) + qrec_out = Q15ScaleQRec( + out_dtype, max_val, out_q, min_val=min_val, max_val=max_val, zero_point=zero_point) return ( Cast( Clip( - ScaleQuantized(qsym, - from_qrec=from_qrec, - to_qrec=qrec_scale), + ScaleQuantized( + qsym, + from_qrec=from_qrec, + to_qrec=qrec_scale + ), clip_dtype=out_dtype, - dtype=qrec_scale.dtype), - dtype=qrec.dtype), qrec_out) - - @classmethod - def _get_scale_dtype_from_qtypes(cls, sym, qtypes): - if not qtypes or sym.name not in qtypes: - return None, None, None, None - qtype = qtypes[sym.name] - if qtype.dtype == np.int8: - if len(qtype.scale) > 1: - return None, None, None, None - return qtype.scale[0] * math.pow(2, 7), np.int8, 7, qtype.zero_point - if qtype.dtype == np.uint8: - if len(qtype.scale) > 1: - return None, None, None, None - return qtype.scale[0] * math.pow(2, 8), np.uint8, 8, qtype.zero_point - elif qtype.dtype == np.int16: - if len(qtype.scale) > 1: - return None, None, None - return qtype.scale[0] * math.pow(2, 15), np.int16, 15, qtype.zero_point - if qtype.dtype == np.uint16: - if len(qtype.scale) > 1: - return None, None, None, None - return qtype.scale[0] * math.pow(2, 16), np.uint16, 16, qtype.zero_point - else: - return None, None, None, None + dtype=qrec_scale.dtype + ), + dtype=qrec_out.dtype, + comment=f'scale clip and cast - {from_qrec} -> {qrec_out}' + ), + qrec_out) diff --git a/tools/nntool/expressions/symbolic/q15_quantization/quantized_constant.py b/tools/nntool/expressions/symbolic/q15_quantization/quantized_constant.py deleted file mode 100644 index 258036854..000000000 --- a/tools/nntool/expressions/symbolic/q15_quantization/quantized_constant.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (C) 2020 GreenWaves Technologies, SAS - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public 
License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - - -import numpy as np - -from ..symbol import Constant, Symbol - - -class QuantizedConstant(Constant): - def __init__(self, *args, dtype=np.int32, **kwargs): - super().__init__(*args, dtype=dtype, **kwargs) - -class QuantizedValue(Symbol): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def _calculate(self, calculate_ranges=False, **kwargs): - raise ValueError('wrapper class for quantization purposes - not designed to be evaluated') - - def _impl(self, *args, **kwargs): - raise ValueError('wrapper class for quantization purposes - not designed to be evaluated') diff --git a/tools/nntool/expressions/symbolic/q15_quantization/scale_quantized.py b/tools/nntool/expressions/symbolic/q15_quantization/scale_quantized.py index 30d812ee4..90eb9c791 100644 --- a/tools/nntool/expressions/symbolic/q15_quantization/scale_quantized.py +++ b/tools/nntool/expressions/symbolic/q15_quantization/scale_quantized.py @@ -14,16 +14,15 @@ # along with this program. If not, see . 
import math +from xml.etree.ElementTree import Comment import numpy as np -from numpy.core.getlimits import iinfo from expressions.symbolic.function import Function from ..basic import Add, Cast, CompoundFunction, LShift, Mul, Sub, copy_props -from ..symbol import c_headers, nargs +from ..symbol import QuantizedConstant, c_headers, nargs from .clip_norm import Norm from .q15_scale_q_rec import Q15ScaleQRec -from .quantized_constant import QuantizedConstant @nargs(2) @@ -87,7 +86,13 @@ def _eval(self, *args, **kwargs): # this should be safe as we never go much above Q15 and the scaling step # is also a Q15 if isinstance(sym, ScaleQuantized): - return ScaleQuantized(*sym.contents, from_qrec=sym.from_qrec, to_qrec=self.to_qrec, num_bits=min(self._num_bits, sym.num_bits)) + return ScaleQuantized( + *sym.contents, + from_qrec=sym.from_qrec, + to_qrec=self.to_qrec, + num_bits=min(self._num_bits, sym.num_bits), + tag=self.tag, + comment=self.comment) # Check if we do nothing if self._from_qrec == self._to_qrec: return sym @@ -159,6 +164,8 @@ def _eval(self, *args, **kwargs): if self._to_qrec.dtype != np.int32: sym = Cast(sym, dtype=self._to_qrec.dtype) + sym.tag = self.tag + sym.comment=self.comment return sym def __repr__(self) -> str: diff --git a/tools/nntool/expressions/symbolic/quantization_base.py b/tools/nntool/expressions/symbolic/quantization_base.py index 48221469c..c66d6f346 100644 --- a/tools/nntool/expressions/symbolic/quantization_base.py +++ b/tools/nntool/expressions/symbolic/quantization_base.py @@ -1,8 +1,24 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + from typing import Tuple import numpy as np from expressions.symbolic.basic import CompoundFunction +from expressions.symbolic.function import Function from .symbol import Symbol, SymbolStats, Variable, QRecBase @@ -123,6 +139,8 @@ def quantize(cls, if not isinstance(sym, Variable): qsym.name = sym.name qsym.qrec = qrec + if isinstance(sym, Function): + qsym.tag = True return (qsym, qrec) @classmethod diff --git a/tools/nntool/expressions/symbolic/symbol.py b/tools/nntool/expressions/symbolic/symbol.py index 0480073bc..31bf41972 100644 --- a/tools/nntool/expressions/symbolic/symbol.py +++ b/tools/nntool/expressions/symbolic/symbol.py @@ -19,7 +19,7 @@ import numpy as np from bfloat16 import bfloat16 from generation.code_block import CodeBlock -from quantization.qtype import DTYPE_GAP_CTYPE +from quantization.qtype import DTYPE_GAP_CTYPE, DTYPES class SymbolStats(): @@ -64,17 +64,21 @@ def reset_ranges(self): class QRecBase(): DTYPE_TO_CTYPE = { - np.int8: 'int8_t', - np.int16: 'int16_t', - np.int32: 'int32_t', - np.uint8: 'uint8_t', - np.uint16: 'uint16_t', - np.uint32: 'uint32_t', + np.int8: 'signed char', + np.int16: 'short', + np.int32: 'int', + np.uint8: 'unsigned char', + np.uint16: 'unsigned short', + np.uint32: 'unsigned int', np.float32: 'float', bfloat16: 'F16', np.float16: 'F16' } def __init__(self, dtype=None) -> None: + if isinstance(dtype, np.dtype): + dtype = dtype.type + if dtype is not None and dtype not in self.DTYPE_TO_CTYPE: + raise ValueError('unknown dtype') self._dtype = dtype @property @@ -89,6 +93,14 @@ def dtype(self, val): def ctype(self): return 
self.DTYPE_TO_CTYPE[self.dtype] + @property + def signed(self): + return DTYPES[self.dtype][1] + + @property + def size(self): + return DTYPES[self.dtype][0] + class Symbol(): NARGS = None CURRENT_CONTROL = SymbolStats() @@ -96,9 +108,10 @@ class Symbol(): COUNTS = {} C_HEADERS = [] COPY_PROPS = tuple() + SYMBOL_PREFEX = '_SYMBOL_' #pylint: disable=unused-argument - def __init__(self, *args, name="", shape=None, dtype=np.float32, qrec: QRecBase = None, **kwargs): + def __init__(self, *args, name="", shape=None, dtype=np.float32, qrec: QRecBase = None, tag=None, comment: str=None, **kwargs): super(Symbol, self).__init__(**kwargs) if self.NARGS is not None and len(args) != self.NARGS: raise ValueError("wrong number of arguments to Symbol %s"%self.__class__.__name__) @@ -107,6 +120,8 @@ def __init__(self, *args, name="", shape=None, dtype=np.float32, qrec: QRecBase self._dtype = dtype self._shape = shape self._qrec = qrec + self._tag = tag + self._comment = comment @classmethod def get_name(cls, cls_to_name): @@ -114,6 +129,22 @@ def get_name(cls, cls_to_name): cls.COUNTS[cls_to_name] += 1 return name + @property + def tag(self): + return self._tag + + @tag.setter + def tag(self, val): + self._tag = val + + @property + def comment(self): + return self._comment + + @comment.setter + def comment(self, val): + self._comment = val + @property def qrec(self): return self._qrec @@ -153,6 +184,10 @@ def dtype(self): return self._qrec.dtype return self._dtype + @property + def ctype(self): + return QRecBase.DTYPE_TO_CTYPE[self.dtype] + @property def name(self): return self._name @@ -182,10 +217,11 @@ def set_default_control(cls, control): cls.CURRENT_CONTROL = control @staticmethod - def extend_shapes(*shapes): + def extend_shapes(*shapes, max_length=None): if len(shapes) == 1: return list(shapes) - max_length = max(len(x) for x in shapes) + if max_length is None: + max_length = max(len(x) for x in shapes) return [tuple([1] * (max_length - len(x)) + list(x)) for x in shapes] 
@staticmethod @@ -226,7 +262,10 @@ def resolve(self, **kwargs): def calculate(self, calculate_ranges=False, **kwargs): """Given a set of substitions for variable in kwargs calculate a result""" - return self._calculate(calculate_ranges=calculate_ranges, **kwargs) + val = self._calculate(calculate_ranges=calculate_ranges, **kwargs) + if self.tag and 'details' in kwargs: + kwargs['details'][self.tag[0]] = val + return val def collect_globals(self) -> dict: """Returns a dict of globals necessary to execute a lambda of this symbol. Globals @@ -330,10 +369,19 @@ def py_expr(self, *args, **kwargs): def c_expr(self, *args, **kwargs): return self._c_expr([], **kwargs) - def c_block(self, code_block=None, **kwargs): + def c_block(self, code_block=None, with_comment=False, tags=None, **kwargs): if code_block is None: - code_block = CodeBlock - code_block.write(self.c_expr) + code_block = CodeBlock() + if tags is not None and self.tag: + if self.comment and with_comment: + code_block.write(f'// {self.comment}') + name = tags.get(self, f'{self.ctype} {self.SYMBOL_PREFEX}{self.name}') + if isinstance(name, tuple): + name = name[0].c_expr(dtype=name[0].dtype, declare=name[1], **kwargs) + code_block.write(f'{name} = {self._c_expr([], **kwargs)};') + else: + code_block.write(self._c_expr([], **kwargs)) + return code_block def _equivalent(self, other) -> bool: pass @@ -466,7 +514,7 @@ def _c_expr(self, *args, **kwargs): return f"(F16){print_float_constant(val)}" elif self.dtype == np.float32: return print_float_constant(val) - return val + return str(val) def __repr__(self) -> str: return str(self._value) @@ -524,6 +572,7 @@ def __init__(self, var_name, shape=None, symbol_binding=None, name="", **kwargs) self._shape = shape self._index_vars = None self._ispointer = False + self._cindex = None @property def shape(self): @@ -560,6 +609,14 @@ def value(self): def unbound_variables(self): return {self._name: self} + @property + def cindex(self): + return self._cindex + + @cindex.setter 
+ def cindex(self, val): + self._cindex = val + @property def index_vars(self): return self._index_vars @@ -586,11 +643,11 @@ def _impl(self, *args, **kwargs): val = np.array(kwargs[self.name]) if self.shape is not None: val = np.reshape(val, self.shape) - quantize_inputs = kwargs.get('quantize_inputs', False) - if quantize_inputs is True or isinstance(quantize_inputs, Iterable) and self.name in quantize_inputs: - if self.qrec is None: - raise ValueError("can't quantize %s. no quantization record is set."%self.name) - val = self.qrec.quantize_and_clip(val) + # quantize_inputs = kwargs.get('quantize_inputs', False) + # if quantize_inputs is True or isinstance(quantize_inputs, Iterable) and self.name in quantize_inputs: + # if self.qrec is None: + # raise ValueError("can't quantize %s. no quantization record is set."%self.name) + # val = self.qrec.quantize_and_clip(val) if calculate_ranges: self.control.add_stat(self, val) return val @@ -645,8 +702,8 @@ def gen_index(index_vars): #pylint: disable=arguments-differ def _c_expr(self, *args, declare=False, dtype=None, pointer=None, iteration_space=None, **kwargs): - if iteration_space: - return iteration_space.c_indexed_var(self.name) + if iteration_space and not self.name.startswith(self.SYMBOL_PREFEX): + return iteration_space.c_indexed_var(self.name, declare=declare) if pointer is None: pointer = self._ispointer if declare: @@ -666,3 +723,17 @@ def _c_expr(self, *args, declare=False, dtype=None, pointer=None, iteration_spac def __repr__(self) -> str: return f'{self.name}' + +class QuantizedConstant(Constant): + def __init__(self, *args, dtype=np.int32, **kwargs): + super().__init__(*args, dtype=dtype, **kwargs) + +class QuantizedValue(Symbol): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def _calculate(self, calculate_ranges=False, **kwargs): + raise ValueError('wrapper class for quantization purposes - not designed to be evaluated') + + def _impl(self, *args, **kwargs): + raise 
ValueError('wrapper class for quantization purposes - not designed to be evaluated') diff --git a/tools/nntool/expressions/symbolic/variable_container.py b/tools/nntool/expressions/symbolic/variable_container.py index 3647509d2..89399bc32 100644 --- a/tools/nntool/expressions/symbolic/variable_container.py +++ b/tools/nntool/expressions/symbolic/variable_container.py @@ -13,8 +13,8 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -from abc import ABC, abstractmethod, abstractproperty -from typing import Mapping +from collections import Counter +from itertools import chain import numpy as np from utils.disjoint_reduction import disjoint_reduction @@ -22,54 +22,41 @@ from .symbol import Symbol, Variable -class VariableAssigner(ABC): - @abstractmethod - def _resolve_assignment(self, substitute_all=False, **kwargs) -> Mapping[str, Symbol]: - """ Resolves an container that is one or more assigments substituting values contained in - **kwargs into unresolved variables - - Args: - substitute_all (bool, optional): If False only expressions that resolve to constants will be substituted. - Defaults to False. 
- - Returns: - Mapping[str, Symbol]: A map of the variable names and their values (Symbols) - """ - - def resolve_assignment(self, substitute_all=False, **kwargs) -> Mapping[str, Symbol]: - return self._resolve_assignment(substitute_all=substitute_all, **kwargs) - - @abstractmethod - def _calculate_assignment(self, **kwargs) -> Mapping[str, np.ndarray]: - """ Attempts to resolve a series of assignments to a map of values - - Returns: - Mapping[str, np.ndarray]: Map of resolved values - """ - - def calculate_assignment(self, **kwargs) -> Mapping[str, np.ndarray]: - return self._calculate_assignment(**kwargs) - - @abstractproperty - def returned_variables(self): - pass - - @abstractproperty - def var_shapes(self): - pass +def search_variables(elem): + if isinstance(elem, Variable): + return [elem] + if type(elem) == int or type(elem) == float or isinstance(elem, np.ndarray): + return [] + return chain(*[search_variables(sub_elem) for sub_elem in elem.contents]) class VariableContainer(): def __init__(self, *args, **kwargs): + args = list(args) + # if string variable names are provided match existing variables or create a new one + if any(isinstance(arg, str) for arg in args): + all_vars = {var.name: var + for var in chain(*[search_variables(arg) + for arg in args if not isinstance(arg, str)])} + for idx, arg in enumerate(args): + if isinstance(arg, str): + if arg in all_vars: + args[idx] = all_vars[arg] + else: + all_vars[arg] = Variable(arg) + args[idx] = all_vars[arg] super().__init__(*args, **kwargs) - self._unbound_variables = self._init_unbound_variables(*args) + # variables with same name must be the same variable instance + names = list( + {object.__hash__(var): var.name for var in search_variables(self)}.values()) + if len(set(names)) != len(names): + bad_vars = [item[0] for item in filter( + lambda x: x[1] > 1, Counter(names).items())] + raise ValueError( + f'duplicate variable names detected: {" ".join(bad_vars)}') @property def unbound_variables(self): - 
return self._unbound_variables - - @unbound_variables.setter - def unbound_variables(self, val): - self._unbound_variables = val + return {var.name: var for var in search_variables(self)} @property def unbound_shapes(self): @@ -83,12 +70,10 @@ def extended_unbound_var_shapes(self): return {vname: tuple(([1] * (max_length - len(var.shape))) + list(var.shape)) for vname, var in self.unbound_variables.items()} - @staticmethod def adjust(axes, adjust): return tuple(tuple(dim+adjust for dim in axes_group) for axes_group in axes) - @property def axes(self): var_shapes = Symbol.extend_shapes(*self.unbound_shapes) @@ -96,28 +81,6 @@ def axes(self): shape) if dim != 1) for shape in var_shapes)) return tuple(sorted([tuple(x) for x in axes])) - @staticmethod - def _init_unbound_variables(*args): - unbound_variables = {} - for arg in args: - if isinstance(arg, Variable): - if arg.name in unbound_variables: - if unbound_variables[arg.name].shape != arg.shape: - raise ValueError( - 'there is more than one variable called %s with different shapes' % arg.name) - else: - unbound_variables[arg.name] = arg - elif isinstance(arg, VariableContainer): - unbound_variables.update(arg.unbound_variables) - elif isinstance(arg, str): - if arg in unbound_variables: - raise ValueError( - 'there is more than one variable called %s' % arg) - else: - unbound_variables[arg] = Variable(arg) - - return unbound_variables - def vars_to_axes(self, axes=None): if axes is None: axes = self.axes @@ -129,20 +92,3 @@ def axes_sizes(self, axes=None): axes = self.axes shape = Symbol.broadcast(*self.unbound_shapes) return {axis: int(np.prod([shape[x] for x in axis])) for axis in axes} - - -class VariableContainerAndAssigner(VariableContainer, VariableAssigner): - @property - def var_axes(self): - elems = self.resolve_assignment(substitute_all=True) - max_axis_groups = np.array( - [max(max(x) for x in elem.axes) for elem in elems.values()]) - max_axis = np.max(max_axis_groups) - axis_adjust = max_axis - 
max_axis_groups - - axes = {} - for elem_idx, (elem_name, elem) in enumerate(elems.items()): - axes[elem_name] = self.adjust(elem.axes, axis_adjust[elem_idx]) - for vname, vaxes in elem.vars_to_axes().items(): - axes[vname] = self.adjust(vaxes, axis_adjust[elem_idx]) - return axes diff --git a/tools/nntool/generation/code_block.py b/tools/nntool/generation/code_block.py index e615c6844..659b61847 100644 --- a/tools/nntool/generation/code_block.py +++ b/tools/nntool/generation/code_block.py @@ -15,12 +15,39 @@ QUOTE = lambda s: '"'+s+'"' + class CodeBlock(): + + class CommentWriter(): + def __init__(self, cb, max_len) -> None: + self._cb = cb + self._max_len = max_len + self.reset() + + def write(self, comment): + for elem in comment.split(' '): + if self._cur_len + len(elem) + 1 > self._max_len: + self.end() + self._cur_line.append(elem) + self._cur_len += len(elem) + 1 + + def end(self): + self._cb.write(f'// {" ".join(self._cur_line)}') + self.reset() + + def reset(self): + self._cur_line = [] + self._cur_len = len(self._cb.get_indent()) + 3 + def __init__(self, starting_indent=0, indent_char=" "): self._indent = starting_indent self._indent_char = indent_char self._lines = [] + @property + def lines(self): + return self._lines + def indent(self): self._indent += 1 return self @@ -60,6 +87,9 @@ def write_start(self, fmt, *args): self._lines.insert(0, fmt.format(*args)) return self + def start_long_comment(self, max_len=80): + return CodeBlock.CommentWriter(self, max_len) + def comment(self, fmt, *args): fmt = self.get_indent() + '// ' + fmt if args: diff --git a/tools/nntool/generation/code_generator.py b/tools/nntool/generation/code_generator.py index 3a243be1c..093eff2fe 100644 --- a/tools/nntool/generation/code_generator.py +++ b/tools/nntool/generation/code_generator.py @@ -17,14 +17,14 @@ import numpy as np from bfloat16 import bfloat16 -from expressions.symbolic.kernel_codegen import BasicKernel +from expressions.symbolic.iteration_space import 
IterationSpace from graph.manipulations.dimensions import add_dimensions from graph.types import (ConcatParameters, ConstantInputParameters, - InputParameters, OutputParameters, ReshapeParameters, - SplitParameters, TransposeParameters) + InputParameters, OutputParameters, SplitParameters, + TransposeParameters) from graph.types.base import NNEdge from graph.types.fusions import FusionBase -from graph.types.others import CopyParameters, NoOPParameters, QuantizeParameters +from graph.types.others import CopyParameters, QuantizeParameters from graph.types.rnn import RNNBaseParameters from utils.node_id import NodeId @@ -534,7 +534,8 @@ def add_checksum_binding(self, cname, name, step_idx, eparams, before): FunctionBindingList(cname, checksum_func(self.hidden_graph, name), Imm(step_idx), - Imm(calc_value_checksum(self.hidden_graph, name)), + Imm(calc_value_checksum( + self.hidden_graph, name)), GArgEdge(eparams[0]), Imm(size), before=before) @@ -609,8 +610,8 @@ def expressions_foreach_basic_kernel(self): basic_kernel = self.expressions_kernel_cache.get(node) if not basic_kernel: qrec = self.G.quantization[NodeId(node)] - basic_kernel = BasicKernel(qrec.cache['qfunc_col'], - [inp.name for inp in node.constant_inputs]) + basic_kernel = IterationSpace(qrec.cache['qfunc_col'], + constants=[inp.name for inp in node.constant_inputs]) self.expressions_kernel_cache[node] = basic_kernel yield node, basic_kernel @@ -628,12 +629,12 @@ def expressions_kernel_types_generator(self): code_block = CodeBlock(starting_indent=0) for node, basic_kernel in self.expressions_foreach_basic_kernel(): _, arg_name = self.expressions_get_names(node) - basic_kernel.kernel_arg_type_codegen(arg_name, code=code_block) + basic_kernel.gen_kernel_arg_typedecl(arg_name, code=code_block) return str(code_block) def expressions_kernel_includes_generator(self): code_block = CodeBlock(starting_indent=0) - includes = set.union(*[basic_kernel.func_col.c_header_set for node, + includes = 
set.union(*[basic_kernel.assignments.c_header_set for node, basic_kernel in self.expressions_foreach_basic_kernel()]) for include in includes: code_block.write('#include {}', include) @@ -733,26 +734,25 @@ def gen_inout_list(self): def generate_output_check(self, tol=0.0, indent=0): code = CodeBlock(starting_indent=indent) code.write('int errors;') - for out_node in self.output_nodes: + for idx, out_node in enumerate(self.output_nodes): out_sz = out_node.out_dims[0].size() nodeq = self.G.quantization[NodeId(out_node, None)].out_qs[0] dtype = "%f" if nodeq.is_floating else "%d" code.write('errors = 0;') - if tol: - code.write(f"{dtype2ctype(nodeq)} max_diff = 0;") + code.write(f"{'float' if nodeq.is_floating else 'int'} max_diff_{idx} = 0;") code.write(f'for (int j=0; j<{out_sz}; j++) {{') code.indent() + code.write( + f"{'float' if nodeq.is_floating else 'int'} diff = {out_node.name.capitalize()}[j] - " + f"{out_node.name.capitalize()}_gt[j];") + code.write("diff = (diff>0)?diff:(-diff);") + code.write(f"if (diff > max_diff_{idx}) max_diff_{idx} = diff;") if tol: - code.write( - f"{dtype2ctype(nodeq)} diff = {out_node.name.capitalize()}[j] - " - f"{out_node.name.capitalize()}_gt[j];") - code.write("diff = (diff>0)?diff:(-diff);") - code.write("if (diff > max_diff) max_diff = diff;") code.write( f'if (diff > {nodeq.quantize(np.array(tol)).item()}) {{') else: code.write( - f'if ({out_node.name.capitalize()}[j] != {out_node.name.capitalize()}_gt[j]) {{') + f'if (diff > 0) {{') code.indent() code.write('errors++;') code.write(f'printf("Error @ %d: {dtype} instead of {dtype}\\n", j, ' @@ -763,6 +763,5 @@ def generate_output_check(self, tol=0.0, indent=0): code.write('}') code.write( f'printf("{out_node.name.capitalize()}: %d/{out_sz} errors\\n", errors);') - if tol: - code.write(f'printf("Max error: {dtype}\\n", max_diff);') + code.write(f'printf("Max error: {dtype}\\n", max_diff_{idx});') return str(code) diff --git 
a/tools/nntool/generation/default_appl_main_template.py b/tools/nntool/generation/default_appl_main_template.py index f0a3a5e5f..d6b20ecf2 100644 --- a/tools/nntool/generation/default_appl_main_template.py +++ b/tools/nntool/generation/default_appl_main_template.py @@ -57,7 +57,6 @@ def generate_main_appl_template(G, gen, test_inputs=None, test_outputs=None, tol * Put here Your input settings * <--------------- */ - #ifndef __EMUL__ /* Configure And open cluster. */ @@ -70,22 +69,19 @@ def generate_main_appl_template(G, gen, test_inputs=None, test_outputs=None, tol printf("Cluster open failed !\\n"); pmsis_exit(-4); } - int cur_fc_freq = pi_freq_set(PI_FREQ_DOMAIN_FC, ${gen.opts['fc_freq']}); - if (cur_fc_freq == -1) + + /* Frequency Settings: defined in the Makefile */ + int cur_fc_freq = pi_freq_set(PI_FREQ_DOMAIN_FC, FREQ_FC*1000*1000); + int cur_cl_freq = pi_freq_set(PI_FREQ_DOMAIN_CL, FREQ_CL*1000*1000); + int cur_pe_freq = pi_freq_set(PI_FREQ_DOMAIN_PERIPH, FREQ_PE*1000*1000); + if (cur_fc_freq == -1 || cur_cl_freq == -1 || cur_pe_freq == -1) { printf("Error changing frequency !\\nTest failed...\\n"); pmsis_exit(-4); } + printf("FC Frequency as %d Hz, CL Frequency = %d Hz, PERIIPH Frequency = %d Hz\\n", + pi_freq_get(PI_FREQ_DOMAIN_FC), pi_freq_get(PI_FREQ_DOMAIN_CL), pi_freq_get(PI_FREQ_DOMAIN_PERIPH)); - int cur_cl_freq = pi_freq_set(PI_FREQ_DOMAIN_CL, ${gen.opts['cl_freq']}); - if (cur_cl_freq == -1) - { - printf("Error changing frequency !\\nTest failed...\\n"); - pmsis_exit(-5); - } -#ifdef __GAP9__ - pi_freq_set(PI_FREQ_DOMAIN_PERIPH, 250000000); -#endif #endif // IMPORTANT - MUST BE CALLED AFTER THE CLUSTER IS SWITCHED ON!!!! 
printf("Constructor\\n"); @@ -202,6 +198,15 @@ def generate_main_appl_make(G, gen, quantized, open_args=""): CLUSTER_SLAVE_STACK_SIZE=${gen.opts['cluster_slave_stack_size']} CLUSTER_NUM_CORES=${gen.opts['cluster_num_cores']} +# FLASH and RAM type +FLASH_TYPE = ${"MRAM" if gen.opts['l3_flash_device'] == 'AT_MEM_L3_MRAMFLASH' else \ + "QSPI" if gen.opts['l3_flash_device'] == 'AT_MEM_L3_QSPIFLASH' else \ + "OSPI" if gen.opts['l3_flash_device'] == 'AT_MEM_L3_OSPIFLASH' else \ + "HYPER"} +RAM_TYPE = ${"QSPI" if gen.opts['l3_ram_device'] == 'AT_MEM_L3_QSPIRAM' else \ + "OSPI" if gen.opts['l3_ram_device'] == 'AT_MEM_L3_OSPIRAM' else \ + "HYPER"} + NNTOOL_SCRIPT = nntool_script ${"APP_CFLAGS += -DSTD_FLOAT" if any(qrec[1].out_qs[0].dtype == np.float16 for qrec in G.quantization.sorted_iterator(G)) else ""} ${"APP_LDFLAGS += -lm" if gen.G.has_expressions and "FLOAT" in gen.G.quantization.schemes_present else ""} diff --git a/tools/nntool/generation/new_generators/general/transpose.py b/tools/nntool/generation/new_generators/general/transpose.py index 9b5c3dbae..6e513dc31 100644 --- a/tools/nntool/generation/new_generators/general/transpose.py +++ b/tools/nntool/generation/new_generators/general/transpose.py @@ -72,6 +72,10 @@ def __init__(self, cname, params, in_shape, real_transpose, qrec, perm_op=None, if qrec.out_qs[0].is_floating: gen_ctrl.float_dump = 1 + datasize = qrec.out_qs[0].dtype_bits//8 + if not qrec.out_qs[0].signed: + datasize = -datasize + attrs = { 'in_dim': params.in_dims[0], 'out_dim': params.out_dims[0], @@ -81,7 +85,7 @@ def __init__(self, cname, params, in_shape, real_transpose, qrec, perm_op=None, 'height': in_shape[1], 'width': in_shape[2], 'perm_op': perm_op, - 'datasize': (qrec.out_qs[0].dtype_bits//8) + 'datasize': datasize } # other attributes diff --git a/tools/nntool/generation/new_generators/mult8/pool_mult8.py b/tools/nntool/generation/new_generators/mult8/pool_mult8.py index 2294ea81c..f98a9e881 100644 --- 
a/tools/nntool/generation/new_generators/mult8/pool_mult8.py +++ b/tools/nntool/generation/new_generators/mult8/pool_mult8.py @@ -202,6 +202,11 @@ def __init__(self, node_name, cname, pool_params, pool_q, act_params, act_q, for LOG.debug("%s: generating pad control block", node_name) self.gen_ctrl.PadType = at_pad_ctrl + if not out_q.signed: + gen_ctrl.output_datasize = -out_q.dtype_bits//8 + if not in_q.signed: + gen_ctrl.input_datasize = -in_q.dtype_bits//8 + attrs = { 'in_size': in_q.dtype_bits//8 if in_q.signed else -in_q.dtype_bits//8, 'out_size': out_q.dtype_bits//8 if out_q.signed else -out_q.dtype_bits//8, diff --git a/tools/nntool/generation/project_template/Makefile b/tools/nntool/generation/project_template/Makefile index 71960b115..c1f528ea8 100644 --- a/tools/nntool/generation/project_template/Makefile +++ b/tools/nntool/generation/project_template/Makefile @@ -11,22 +11,46 @@ endif include common.mk include $(RULES_DIR)/at_common_decl.mk -io=stdout +io?=host -RAM_FLASH_TYPE ?= HYPER +FLASH_TYPE ?= HYPER +RAM_TYPE ?= HYPER #PMSIS_OS=freertos -ifeq '$(RAM_FLASH_TYPE)' 'HYPER' -APP_CFLAGS += -DUSE_HYPER -MODEL_L3_EXEC=hram -MODEL_L3_CONST=hflash -else -APP_CFLAGS += -DUSE_SPI -CONFIG_SPIRAM = 1 -MODEL_L3_EXEC=qspiram -MODEL_L3_CONST=qpsiflash +ifeq '$(FLASH_TYPE)' 'HYPER' + MODEL_L3_CONST=AT_MEM_L3_HFLASH +else ifeq '$(FLASH_TYPE)' 'MRAM' + MODEL_L3_CONST=AT_MEM_L3_MRAMFLASH + READFS_FLASH = target/chip/soc/mram +else ifeq '$(FLASH_TYPE)' 'QSPI' + MODEL_L3_CONST=AT_MEM_L3_QSPIFLASH + READFS_FLASH = target/board/devices/spiflash +else ifeq '$(FLASH_TYPE)' 'OSPI' + MODEL_L3_CONST=AT_MEM_L3_OSPIFLASH + READFS_FLASH = target/board/devices/ospiflash +endif + +ifeq '$(RAM_TYPE)' 'HYPER' + MODEL_L3_EXEC=AT_MEM_L3_HRAM +else ifeq '$(RAM_TYPE)' 'QSPI' + MODEL_L3_EXEC=AT_MEM_L3_QSPIRAM +else ifeq '$(RAM_TYPE)' 'OSPI' + MODEL_L3_EXEC=AT_MEM_L3_OSPIRAM endif +ifeq '$(TARGET_CHIP_FAMILY)' 'GAP9' +FREQ_CL?=370 +FREQ_FC?=370 +FREQ_PE?=370 +else +ifeq '$(TARGET_CHIP)' 
'GAP8_V3' +FREQ_CL?=175 +else +FREQ_CL?=50 +endif +FREQ_FC?=250 +FREQ_PE?=250 +endif $(info Building NNTOOL model) NNTOOL_EXTRA_FLAGS ?= @@ -43,7 +67,7 @@ APP_CFLAGS += -g -O3 -mno-memcpy -fno-tree-loop-distribute-patterns APP_CFLAGS += -I. -I$(MODEL_COMMON_INC) -I$(TILER_EMU_INC) -I$(TILER_INC) $(CNN_LIB_INCLUDE) -I$(MODEL_BUILD) APP_CFLAGS += -DPERF -DAT_MODEL_PREFIX=$(MODEL_PREFIX) $(MODEL_SIZE_CFLAGS) APP_CFLAGS += -DSTACK_SIZE=$(CLUSTER_STACK_SIZE) -DSLAVE_STACK_SIZE=$(CLUSTER_SLAVE_STACK_SIZE) -APP_CFLAGS += -DAT_IMAGE=$(IMAGE) +APP_CFLAGS += -DAT_IMAGE=$(IMAGE) -DFREQ_FC=$(FREQ_FC) -DFREQ_CL=$(FREQ_CL) -DFREQ_PE=$(FREQ_PE) READFS_FILES=$(abspath $(MODEL_TENSORS)) diff --git a/tools/nntool/generation/project_template/common/model_decl.mk b/tools/nntool/generation/project_template/common/model_decl.mk index 1d0ba1b20..4f72f28e4 100644 --- a/tools/nntool/generation/project_template/common/model_decl.mk +++ b/tools/nntool/generation/project_template/common/model_decl.mk @@ -48,11 +48,21 @@ RM=rm -f NNTOOL?=nntool -TOTAL_STACK_SIZE=$(shell expr $(CLUSTER_STACK_SIZE) \+ $(CLUSTER_SLAVE_STACK_SIZE) \* 7) +ifeq '$(TARGET_CHIP_FAMILY)' 'GAP9' +CLUSTER_SLAVE_PE=8 +else ifeq '$(TARGET_CHIP_FAMILY)' 'GAP8' +CLUSTER_SLAVE_PE=7 +else + $(error TARGE_CHIP_FAMILY not found in env or not correct) +endif + +TOTAL_STACK_SIZE=$(shell expr $(CLUSTER_STACK_SIZE) \+ $(CLUSTER_SLAVE_STACK_SIZE) \* $(CLUSTER_SLAVE_PE)) MODEL_L1_MEMORY=$(shell expr $(TARGET_L1_SIZE) \- $(TOTAL_STACK_SIZE)) MODEL_L2_MEMORY=$(TARGET_L2_SIZE) MODEL_L3_MEMORY=$(TARGET_L3_SIZE) + + # hram - HyperBus RAM # qspiram - Quad SPI RA # hflash - HyperBus Flash diff --git a/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py b/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py index e1f13ee60..986119294 100644 --- a/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py +++ 
b/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py @@ -513,7 +513,7 @@ def continue_down(G, node, exclude_nodes, visited_nodes, cur_visited_nodes, cur_ if check_continue(visited_nodes, cur_visited_nodes, exclude_nodes, edge.to_node, 'down', edge.to_idx): continue new_actions, visited_down_nodes = search_down( - G, edge.to_node, exclude_nodes, visited_nodes | cur_visited_nodes, edge, transpose_history) + G, edge.to_node, exclude_nodes, visited_nodes | cur_visited_nodes, edge, transpose_history.copy()) cur_visited_nodes |= visited_down_nodes cur_actions += new_actions return cur_actions, cur_visited_nodes diff --git a/tools/nntool/graph/manipulations/formatter.py b/tools/nntool/graph/manipulations/formatter.py new file mode 100644 index 000000000..d3ff09531 --- /dev/null +++ b/tools/nntool/graph/manipulations/formatter.py @@ -0,0 +1,122 @@ +# Copyright (C) 2022 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +from copy import deepcopy + +from graph.types import ImageFormatParameters, NNEdge, TransposeParameters +from quantization.qtype import QType +from utils.node_id import NodeId + + +def insert_formatter(G, input_node, formatter, normalizer): + format_node = ImageFormatParameters(input_node.name + "_formatter", + norm_func=normalizer.upper(), + format_change=formatter.upper()) + out_edges = G.out_edges(input_node.name) + + # dims updated to reflect formatter + if format_node.output_channels is not None and format_node.input_channels is not None: + out_dim = input_node.get_output_size(None)[0] + if formatter.upper() in ("BW8", "BW16"): + assert format_node.input_channels == 1 + in_dim = out_dim.clone() + format_node.out_dims_hint = input_node.out_dims_hint + format_node.in_dims_hint = input_node.out_dims_hint + input_node.dims = in_dim + for out_edge in out_edges: + G.remove_edge(out_edge) + else: + if not out_dim.is_named or out_dim.c != format_node.output_channels: + raise ValueError( + "current graph input is not named or does not match formatter output channels") + if formatter.upper() in ("RGB16", "BW16") and normalizer.upper() != "OUT_INT16": + raise ValueError( + "rgb16 and bw16 formatters must have out_int16 as normalization function") + in_dim = out_dim.clone() + in_dim.c = format_node.input_channels + in_dim.impose_order(("h", "w", "c")) + format_node.in_dims_hint = [["h", "w", "c"]] + input_node.dims = in_dim + if input_node.fixed_order: + new_out_edges = [] + for out_edge in out_edges: + if isinstance(out_edge.to_node, TransposeParameters): + trans_node = out_edge.to_node + transpose_edges = G.out_edges(trans_node.name) + new_out_edges.extend(transpose_edges) + G.remove(trans_node) + if G.quantization: + nid = NodeId(trans_node) + if nid in G.quantization: + del G.quantization[NodeId(trans_node)] + else: + new_out_edges.append(out_edge) + out_edges = new_out_edges + else: + input_node.fixed_order = True + for out_edge in out_edges: + 
G.remove_edge(out_edge) + format_node.out_dims_hint = [["c", "h", "w"]] * len(out_edges) + input_node.out_dims_hint = [["h", "w", "c"]] + G.node_options[NodeId(input_node)] = input_node.at_options + # qrec updated to reflect formatter + input_qrec = G.quantization and G.quantization.get(NodeId(input_node)) + if input_qrec and format_node.input_dtype and format_node.output_dtype: + formatter_qrec = G.quantization.get(NodeId(format_node)) + if not formatter_qrec: + if input_qrec.out_qs[0].dtype != format_node.output_dtype: + raise ValueError( + "current graph input output quantization does not match formatter output") + formatter_qrec = deepcopy(input_qrec) + formatter_qrec.out_qs[0] = deepcopy(formatter_qrec.out_qs[0]) + if formatter_qrec.ktype.startswith('scaled'): + formatter_in_q = QType( + scale=1, zero_point=0, dtype=format_node.input_dtype) + elif formatter_qrec.ktype.startswith('symmetric'): + formatter_in_q = QType(q=0, dtype=format_node.input_dtype) + else: + raise NotImplementedError("quantization has unknown type") + if len(formatter_qrec.in_qs) > 0: + formatter_qrec.in_qs[0] = formatter_in_q + input_qrec.in_qs[0] = formatter_in_q + else: + formatter_qrec.in_qs.append(formatter_in_q) + input_qrec.in_qs.append(formatter_in_q) + input_qrec.out_qs[0] = formatter_in_q + G.quantization[NodeId(format_node)] = formatter_qrec + + G.add_node(format_node) + G.add_edge(NNEdge(input_node, format_node)) + for out_edge in out_edges: + G.add_edge(NNEdge(format_node, out_edge.to_node, to_idx=out_edge.to_idx)) + + +def remove_formatter(G, fmt_node): + input_edges = G.in_edges(fmt_node.name) + assert len(input_edges) == 1, "formatter node should only have one input" + input_node = input_edges[0].from_node + fmt_edges = G.out_edges(fmt_node.name) + fmt_qrec = G.quantization and G.quantization.get(NodeId(fmt_node)) + G.remove(fmt_node) + + input_node.dims = fmt_node.out_dims[0] + input_node.out_dims_hint = fmt_node.out_dims_hint + for fmt_edge in fmt_edges: + 
G.add_edge(NNEdge(input_node, fmt_edge.to_node, to_idx=fmt_edge.to_idx)) + if fmt_qrec: + input_qrec = G.quantization[NodeId(input_node)] + input_qrec.out_qs = fmt_qrec.out_qs + input_qrec.in_qs = fmt_qrec.out_qs + G.quantization.remove_node(fmt_node) diff --git a/tools/nntool/graph/matches/matchers/batchnorm_to_discrete_ops.py b/tools/nntool/graph/matches/matchers/batchnorm_to_discrete_ops.py new file mode 100644 index 000000000..9b7e9ae39 --- /dev/null +++ b/tools/nntool/graph/matches/matchers/batchnorm_to_discrete_ops.py @@ -0,0 +1,70 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +import numpy as np +from graph.dim import Dim +from graph.types import (ConstantInputParameters, MatrixSubParameters, + NNEdge) +from graph.types.conv2d import BatchNormalizationParameters +from graph.types.tensor_arithmetic import MatrixMulParameters +from utils.graph import GraphView + +from ..matcher import Matcher, match_name, description, groups, run_qtune_on_match + +LOG = logging.getLogger("nntool." 
+ __name__) + + +@match_name('batchnorm_to_discrete_ops') +@description('Convert BatchNormParameters into a set of broadcasted operations') +@groups('scaled', 'symmetric') +class FuseBatchnorm(Matcher): + + def _match(self, G: GraphView, set_identity: bool = True, **kwargs): + + has_modified_graph = False + for bn_node in G.nodes(node_classes=BatchNormalizationParameters): + w_bn = bn_node.scale / \ + np.sqrt(bn_node.epsilon + bn_node.running_variance) + b_bn = bn_node.bias - bn_node.running_mean * bn_node.scale / \ + np.sqrt(bn_node.running_variance + bn_node.epsilon) + + mul_params = MatrixMulParameters( + G.unique_name(f"{bn_node.name}_mul")) + add_params = MatrixSubParameters( + G.unique_name(f"{bn_node.name}_add")) + broadcasted_shape = [1 if i != bn_node.axis else dim for i, dim in enumerate( + bn_node.in_dims[0].shape)] + scale_node = ConstantInputParameters(G.unique_name(f"{bn_node.name}_scale"), value=w_bn.reshape( + broadcasted_shape), dims=Dim.unnamed(broadcasted_shape)) + bias_node = ConstantInputParameters(G.unique_name(f"{bn_node.name}_bias"), value=b_bn.reshape( + broadcasted_shape), dims=Dim.unnamed(broadcasted_shape)) + + from_node = G.in_edges(bn_node)[0].from_node + to_node = G.out_edges(bn_node)[0].to_node + G.remove(bn_node) + G.add_edge(NNEdge(from_node, mul_params)) + G.add_edge(NNEdge(scale_node, mul_params, to_idx=1)) + G.add_edge(NNEdge(mul_params, add_params)) + G.add_edge(NNEdge(bias_node, add_params, to_idx=1)) + G.add_edge(NNEdge(add_params, to_node)) + + has_modified_graph = True + + if set_identity: + self.set_identity(G) + + return has_modified_graph diff --git a/tools/nntool/graph/matches/matchers/concat_slice.py b/tools/nntool/graph/matches/matchers/concat_slice.py index ef6fb82b6..3fa4f2c44 100644 --- a/tools/nntool/graph/matches/matchers/concat_slice.py +++ b/tools/nntool/graph/matches/matchers/concat_slice.py @@ -163,7 +163,7 @@ def eliminate_slice(self, G, concat, slice_node, remove_nodes, concat_in_idx, re elif 
slice_node.changes_shape: reshape = ReshapeParameters( G.unique_name(f'{slice_node.name}_reshape'), - old_shape=slice_node.post_slice_shape, + old_shape=slice_node.slice_shape, shape=slice_node.out_shape) else: reshape = None diff --git a/tools/nntool/graph/matches/matchers/concat_split.py b/tools/nntool/graph/matches/matchers/concat_split.py index e6a362b0c..d253489a2 100644 --- a/tools/nntool/graph/matches/matchers/concat_split.py +++ b/tools/nntool/graph/matches/matchers/concat_split.py @@ -14,11 +14,13 @@ # along with this program. If not, see . import logging +from graph.matches.match_utils import search_up from graph.types import ConcatParameters, NNEdge, SplitParameters +from graph.types.others import CopyParameters from utils.graph import GraphView -from ..matcher import Matcher, description, groups, match_name +from ..matcher import Matcher, description, groups, match_name, run_before LOG = logging.getLogger("nntool." + __name__) @@ -26,6 +28,7 @@ @groups('*') @match_name("concat_split") @description("removes concat/split pair where all in edges on the concat match the out edges on the split") +@run_before('insert_copies') class ConcatSplitMatch(Matcher): def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool: @@ -35,11 +38,11 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool: if len(in_edges) > 1: continue in_edge = in_edges[0] - if not isinstance(in_edge.from_node, ConcatParameters): - continue - concat_node = in_edge.from_node - if len(G.out_edges(concat_node.name)) > 1: + edges = search_up(G, in_edge, ConcatParameters, can_pass=(CopyParameters,), multi_on_target=False) + if not edges: continue + nodes = [split_node] + [edge.from_node for edge in edges] + concat_node = nodes[-1] if concat_node.axis != split_node.axis: continue axis = concat_node.axis @@ -54,8 +57,7 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool: concat_node.name, split_node.name) concat_in_edges = 
G.indexed_in_edges(concat_node.name) split_out_edges = G.indexed_out_edges(split_node.name) - G.remove(split_node) - G.remove(concat_node) + G.remove_all(nodes) for idx, in_edge in enumerate(concat_in_edges): for out_edge in split_out_edges[idx]: G.add_edge(NNEdge(from_node=in_edge.from_node, from_idx=in_edge.from_idx, diff --git a/tools/nntool/graph/matches/matchers/expand_to_reshape.py b/tools/nntool/graph/matches/matchers/expand_to_reshape.py new file mode 100644 index 000000000..547f0852b --- /dev/null +++ b/tools/nntool/graph/matches/matchers/expand_to_reshape.py @@ -0,0 +1,50 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from graph.manipulations.eliminate_transposes.transpose_helpers import strip_ones +from graph.types.others import ExpandParameters, TransposeParameters +import logging + +from graph.dim import Dim +from graph.types import NNEdge, ReshapeParameters +from utils.graph import GraphView +from utils.node_id import NodeId + +from ..matcher import Matcher, match_name, description, run_before, groups, needs_valid_dimension + +LOG = logging.getLogger("nntool." 
+ __name__) + +@match_name("expand_to_reshape") +@description("remove expands that are really just reshapes") +@run_before('*') +@groups('*') +@needs_valid_dimension(True) +class ExpandToReshape(Matcher): + + def _match(self, G: GraphView, set_identity: bool = True, **kwargs): + modified_graph = False + for node in G.nodes(node_classes=ExpandParameters): + in_shape = node.in_dims[0].shape + out_shape = node.out_dims[0].shape + if strip_ones(in_shape) != strip_ones(out_shape): + continue + LOG.info(f'replacing expand {node.name} with a reshape') + reshape = ReshapeParameters(G.unique_name(f'{node.name}_reshape'), old_shape=in_shape, shape=out_shape) + G.replace_node(node, reshape) + modified_graph = True + + if set_identity: + self.set_identity(G) + + return modified_graph diff --git a/tools/nntool/graph/matches/matchers/fuse_batchnorm.py b/tools/nntool/graph/matches/matchers/fuse_batchnorm.py new file mode 100644 index 000000000..57722664f --- /dev/null +++ b/tools/nntool/graph/matches/matchers/fuse_batchnorm.py @@ -0,0 +1,87 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +import logging + +import numpy as np +from graph.types import (ConstantInputParameters, MatMulOpParameters, MatMulTransposedParameters, + NNEdge) +from graph.types.conv2d import BatchNormalizationParameters +from utils.graph import GraphView + +from ..matcher import Matcher, match_name, description, groups, run_qtune_on_match + +LOG = logging.getLogger("nntool." + __name__) + + +@match_name('fuse_batchnorm') +@description('Fuse batch normalization into MatMul') +@groups('scaled', 'symmetric') +@run_qtune_on_match +class FuseBatchnorm(Matcher): + + def _match(self, G: GraphView, set_identity: bool = True, **kwargs): + + has_modified_graph = False + nodes = [] + for node in G.nodes(node_classes=BatchNormalizationParameters): + in_node = G.indexed_in_edges(node)[0].from_node + if isinstance(in_node, MatMulOpParameters): + nodes.append((node, in_node)) + + for bn_node, filt_node in nodes: + filt_in_edges = G.indexed_in_edges(filt_node.name) + weights_node = filt_in_edges[1].from_node + biases_node = filt_in_edges[2].from_node if len( + filt_in_edges) > 2 else None + w_bn = bn_node.scale / np.sqrt(bn_node.epsilon + bn_node.running_variance) + if not isinstance(weights_node, ConstantInputParameters): + continue + weights = weights_node.dqvalue + if len(w_bn) > 1: + if not isinstance(filt_node, MatMulTransposedParameters): + weights = np.swapaxes(weights.copy(), -2, -1) + if weights.shape[-2] != len(w_bn): + LOG.info(f'{filt_node.name} - weights shape does not match batch norm') + continue + if biases_node is None: + biases = np.zeros((weights.shape[-1],)) + biases_node = ConstantInputParameters( + G.unique_name(f'{filt_node.name}_biases'), value=biases) + G.add_edge(NNEdge(from_node=biases, + to_node=filt_node, to_idx=2)) + elif not isinstance(biases_node, ConstantInputParameters): + continue + else: + biases = biases_node.dqvalue + # fold batch norm into conv weights and biases + if len(w_bn) > 1: + w_bn = np.diag(w_bn) + weights = np.matmul(w_bn, weights) + else: + 
weights = weights * w_bn + biases = bn_node.bias + ((biases - bn_node.running_mean) * + bn_node.scale / np.sqrt(bn_node.running_variance + bn_node.epsilon)) + if len(w_bn) > 1 and not isinstance(filt_node, MatMulTransposedParameters): + weights = np.swapaxes(weights, -2, -1) + weights_node.value = weights + biases_node.value = biases + G.remove_and_reconnect(bn_node, edge_class=NNEdge) + has_modified_graph = True + + if set_identity: + self.set_identity(G) + + return has_modified_graph diff --git a/tools/nntool/graph/matches/matchers/fuse_gap_convs.py b/tools/nntool/graph/matches/matchers/fuse_gap_convs.py index 1b4c368cd..beecee70c 100644 --- a/tools/nntool/graph/matches/matchers/fuse_gap_convs.py +++ b/tools/nntool/graph/matches/matchers/fuse_gap_convs.py @@ -25,10 +25,9 @@ TanHActivationParameters) from graph.types.base import NNNodeRef from graph.types.fusions import FusionInputParameters, FusionOutputParameters -from utils.graph import GraphView, NodeRef +from utils.graph import GraphView -from ..matcher import (Matcher, description, groups, match_name, - run_adjust_on_match, run_qtune_on_match) +from ..matcher import (Matcher, description, groups, match_name, run_qtune_on_match) LOG = logging.getLogger("nntool." 
+ __name__) @@ -47,6 +46,8 @@ 'conv_max_active', 'conv_average_active', 'conv_active_max', + 'conv_max', + 'conv_average', ) VALID_ACTIVATIONS_POW2 = ( @@ -62,6 +63,8 @@ 'conv_max_active', 'conv_average_active', 'conv_active_max', + 'conv_max', + 'conv_average', ) @@ -120,8 +123,8 @@ def add_node(self, params, in_fusion=False): try: for cnode in params.contained_nodes(): self.add_node(cnode, in_fusion=True) - except MergeStopError: # @IgnoreException - raise MergeAbortError() + except MergeStopError: + raise MergeAbortError() # @IgnoreException elif isinstance(params, Conv2DParameters): if self.conv or not self.can_add(params): raise MergeStopError() # @IgnoreException @@ -201,7 +204,9 @@ def fusion_type(self): @groups('*') @match_name("fuse_gap_convs") @run_qtune_on_match -@description('Fuse convolutions, pools and activations to match GAP AutoTiler operations') +@description( + 'Fuse convolutions, pools and activations to match GAP AutoTiler operations. Pooling and activation nodes' + ' are also fused into existing convolution fusions.') class MatchAllGapConv(Matcher): def _match(self, G: GraphView, set_identity: bool = True, **kwargs): has_modified_graph = False diff --git a/tools/nntool/graph/matches/matchers/gather_to_split.py b/tools/nntool/graph/matches/matchers/gather_to_split.py index 05aae2550..3acd9dc47 100644 --- a/tools/nntool/graph/matches/matchers/gather_to_split.py +++ b/tools/nntool/graph/matches/matchers/gather_to_split.py @@ -37,6 +37,8 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool: group = gathers_by_origin.setdefault((in_edge.from_node, in_edge.from_idx), []) group.append(gather) for in_edge, gathers in gathers_by_origin.items(): + if len(gathers[0].indices.shape) > 1: + continue # This is too difficult to handle if there are multiple slices axis = gathers[0].axis if not all(gather.axis == axis and len(gather.indices.shape) <= 1 diff --git a/tools/nntool/graph/matches/matchers/move_node_up.py 
b/tools/nntool/graph/matches/matchers/move_node_up.py index 9522f7dcf..36c815b66 100644 --- a/tools/nntool/graph/matches/matchers/move_node_up.py +++ b/tools/nntool/graph/matches/matchers/move_node_up.py @@ -161,7 +161,7 @@ class MoveActivationsMatcherScale8(MoveNodeUpMatcher): @run_before('fuse_gap_convs', 'fuse_gap_linear', 'fuse_gap_pool', 'fuse_op_activation_scale8') class MoveMaxPoolMatcherScale8(MoveNodeUpMatcher): - ValidNodesToPass = (ReluActivationParameters,) + ValidNodesToPass = (ReluActivationParameters, ConcatParameters) ValidFusions = (Conv2DParameters, FcParameters) ValidNodes = (lambda node: isinstance( node, PoolingParameters) and node.pool_type == "max",) diff --git a/tools/nntool/graph/matches/matchers/rnn_unpack.py b/tools/nntool/graph/matches/matchers/rnn_unpack.py index cbbd746d6..bbf2d3316 100644 --- a/tools/nntool/graph/matches/matchers/rnn_unpack.py +++ b/tools/nntool/graph/matches/matchers/rnn_unpack.py @@ -291,7 +291,7 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs): if changes_shape: reshape = ReshapeParameters(unpack_node.name + '_reshape', old_shape=Dim.unnamed( - unpack_node.post_slice_shape), + unpack_node.slice_shape), shape=Dim.unnamed(unpack_node.out_shape)) G.add_edge(NNEdge(from_node=in_edge.from_node, to_node=reshape, from_idx=in_edge.from_idx)) diff --git a/tools/nntool/graph/matches/matchers/slice_to_split.py b/tools/nntool/graph/matches/matchers/slice_to_split.py index 53ab42b5a..74a577db7 100644 --- a/tools/nntool/graph/matches/matchers/slice_to_split.py +++ b/tools/nntool/graph/matches/matchers/slice_to_split.py @@ -140,9 +140,11 @@ def slice_to_split(G, slice_nodes, slices): axis_dim = in_dims[axis] outs = [] splits = [] + two_unused = axis_slice[0] > 0 and axis_slice[1] < axis_dim if axis_slice[0] > 0: + two_unused = True splits.append(axis_slice[0]) - oparams = OutputParameters(G.unique_name('unused')) + oparams = OutputParameters(G.unique_name(f'{slice_node.name}_unused{0 if two_unused else 
""}')) oparams.at_options.allocate = 1 outs.append( ((oparams, 0),)) @@ -151,7 +153,7 @@ def slice_to_split(G, slice_nodes, slices): for edge in G.out_edges(slice_node.name)]) if axis_slice[1] < axis_dim: splits.append(axis_dim - axis_slice[1]) - oparams = OutputParameters(G.unique_name('unused')) + oparams = OutputParameters(G.unique_name(f'{slice_node.name}_unused{1 if two_unused else ""}')) oparams.at_options.allocate = 1 outs.append( ((oparams, 0),)) diff --git a/tools/nntool/graph/nngraph.py b/tools/nntool/graph/nngraph.py index 4448d913b..f7d83da0e 100644 --- a/tools/nntool/graph/nngraph.py +++ b/tools/nntool/graph/nngraph.py @@ -16,13 +16,19 @@ import logging import os import re -from typing import Callable, Generator, Sequence, Tuple, Union +from typing import Any, Callable, Generator, Mapping, Sequence, Tuple, Union import numpy as np +from execution.graph_executer import GraphExecuter +from execution.quantization_mode import QuantizationMode +from interpreter.commands.qtune import SCHEME_NAME_MAPPINGS from quantization.quantization_set import QuantizationSet +from quantization.quantizer.new_quantizer import NewQuantizer from reports.graph_reporter import GraphReporter +from stats.activation_ranges_collector import ActivationRangesCollector from utils.graph import Graph, Node from utils.node_id import NodeId +from utils.stats_funcs import cos_similarity, qsnr from utils.tabular import TextTableRenderer from graph.dim import Dim @@ -33,6 +39,7 @@ from graph.manipulations.dimensions import add_dimensions from graph.manipulations.liveness import calculate_liveness from graph.matches.fusions import fusions +from graph.matches.matches import get_fusions from graph.types import (ConstantInputParameters, InputBaseParameters, InputParameters, MultiplicativeBiasParameters, OutputParameters, ResizerParameters, @@ -383,10 +390,10 @@ def add_input(self, dim: Union[Dim, Tuple[int]], name: str = None, **kwargs) -> def add_constant(self, dim: Union[Dim, Tuple[int]] = 
None, name: str = None, value: np.ndarray = None, - adjust_transpose: Sequence[int]=None, + adjust_transpose: Sequence[int] = None, is_mutated=False, is_intermediate=False, - short_name: str=None) -> NNNodeRef: + short_name: str = None) -> NNNodeRef: """Creates a constant node Args: @@ -401,7 +408,8 @@ def add_constant(self, dim: Union[Dim, Tuple[int]] = None, Returns: NNNodeRef: A reference to the Node in the Graph """ - node_name = name if name else self.unique_name(f"constant_{self.num_constants}") + node_name = name if name else self.unique_name( + f"constant_{self.num_constants}") node = ConstantInputParameters(node_name, dims=dim, value=value, adjust_transpose=adjust_transpose, @@ -445,7 +453,14 @@ def nodes_iterator(self, yield_fusions=True): yield (step_idx, node, fusion_idx, fnode) yield (step_idx, node, None, None) - def adjust_order(self, reshape_weights=True, no_postprocess=False, debug_function: Callable=None, steps: int=None, single_step=False): + def adjust_order( + self, + reshape_weights=True, + no_postprocess=False, + debug_function: Callable = None, + steps: int = None, + single_step=False + ): """Adjusts tensor order to match selected kernels Args: @@ -461,6 +476,15 @@ def adjust_order(self, reshape_weights=True, no_postprocess=False, debug_functio LOG.info("adjusted order") self.graph_identity.is_adjusted = True + @staticmethod + def get_fusions(): + """Returns a dictionary of all the fusion/graph optimization pass names and descriptions + + Returns: + Dict[str, str]: Names and descriptions of graph optimisation passes + """ + return get_fusions() + def fusions(self, *match_names, no_postprocess: bool = False): """Run matchers on the graph @@ -470,7 +494,10 @@ def fusions(self, *match_names, no_postprocess: bool = False): """ fusions(self, *match_names, no_postprocess=no_postprocess) - def add_dimensions(self, quiet=False): + def add_dimensions( + self, + quiet=False + ): """Add dimensions to the graph and calculate execution order and liveness 
Args: @@ -485,7 +512,121 @@ def add_dimensions(self, quiet=False): self, self.graph_state.steps) - def balance_filters(self, step_idx: int=None, precision_threshold=0.20): + def collect_statistics( + self, + input_tensors_iterator: Union[Sequence[Sequence[np.ndarray]], Sequence[np.ndarray]] + ) -> Mapping[Union[str, Tuple[str, str]], Mapping]: + """Collect tensor statistics for quantization + + Args: + input_tensors_iterator (Union[Sequence[Sequence[np.ndarray]], Sequence[np.ndarray]]): + If the graph has a single input this can just be an iterator over numpy arrays. If the graph has + multiple inputs then it should be an iterator over sequences of numpy arrays. + + Returns: + Mapping[Union[str, Tuple[str, str]], Mapping]: Mapping of statistics for each node's inputs and outputs + """ + stats_collector = ActivationRangesCollector() + for input_tensors in input_tensors_iterator: + if isinstance(input_tensors, np.ndarray): + input_tensors = [input_tensors] + stats_collector.collect_stats(self, input_tensors) + return {k.key: v for k, v in stats_collector.stats.items()} + + @staticmethod + def qsnrs(tensors1, tensors2, idx=0): + return tuple([qsnr(t1[idx], t2[idx]) if len(t1) > idx and len(t2) > idx else None for t1, t2 in zip(tensors1, tensors2)]) + + @staticmethod + def cos_sim(tensors1, tensors2, idx=0): + return tuple([cos_similarity(t1[idx], t2[idx]) if len(t1) > idx and len(t2) > idx else None for t1, t2 in zip(tensors1, tensors2)]) + + def quantize( + self, + statistics: Mapping[Union[str, Tuple[str, str]], Mapping] = None, + schemes: Sequence[str] = None, + graph_options: Mapping[str, Any] = None, + node_options: Mapping[Union[str, Tuple[str, str]], + Mapping[str, Any]] = None, + read_existing_options = True + ) -> None: + """Quantize the graph + + Args: + statistics (Mapping[Union[str, Tuple[str, str]], Mapping], optional): Statistics collected by the NNGraph.collect_statistics + method. 
+ schemes (Sequence[], optional): Sequence of schemes "scaled", "pow2", or "float" to use in priority order. If None use scaled. Defaults to None. + graph_options (Mapping[str, Any], optional): Quantization options to set for the whole graph. Defaults to None. + node_options (Mapping[Union[str, Tuple[str, str]], Mapping[str, Any]], optional): + Quantization options to set for specific nodes. The map key should be the node name or if the node is inside a fusion + then a tuple of the fusion name and the node name. Defaults to None. + read_existing_options (bool, optional): Incorporate existing quantization options and schemes in the graph. Leaving this as + True and just supplying graph_option, node_options and/or schemes is the equivalent of the nntool qtune command + """ + quantizer = NewQuantizer(self) + if schemes: + for scheme in schemes: + scheme = scheme.lower() + if scheme not in SCHEME_NAME_MAPPINGS: + raise ValueError(f'invalid scheme name {scheme}') + quantizer.schemes.append(SCHEME_NAME_MAPPINGS[scheme]) + elif 'SQ8' not in quantizer.schemes: + quantizer.schemes.append('SQ8') + options = {} + if graph_options: + options.update(graph_options) + if node_options: + options.update({NodeId(name) if isinstance(name, str) else NodeId(*name): v + for name, v in node_options.items()}) + quantizer.set_stats(statistics) + quantizer.update_options(options) + quantizer.quantize() + + def execute( + self, + input_tensors: Union[np.ndarray, Sequence[np.ndarray]], + quantize=False, + dequantize=False, + output_fusion_tensors=False + ) -> Sequence[Sequence[np.ndarray]]: + """Runs inference on the graph + + Args: + input_tensors (Union[np.ndarray, Sequence[np.ndarray]]): + Numpy arrays containing inputs (which should be normalized and in float) + If there is only one input it can be specified without a sequence. + quantize (bool, optional): Run the graph using quantization parameters. Defaults to False. + dequantize (bool, optional): Dequantize outputs. Implies quantize. 
Defaults to False. + output_fusion_tensors (bool, optional): Output outputs from nodes that have been fused. Defaults to False. + + Raises: + ValueError: Incorrect parameters + + Returns: + Sequence[Sequence[np.ndarray]]: List of lists of outputs of each node in the graph. If output_fusion_tensors + is True this will also include the output of nodes contained inside fusions (except fused expressions) + """ + if dequantize: + quantize = True + if quantize: + if self.quantization is None or not self.quantization.verify_quantization(self): + raise ValueError('graph is not quantized') + if dequantize: + qmode = QuantizationMode.all_dequantize() + else: + qmode = QuantizationMode.all() + else: + qmode = QuantizationMode.none() + if isinstance(input_tensors, np.ndarray): + input_tensors = [input_tensors] + executer = GraphExecuter(self, self.quantization) + return executer.execute(input_tensors, qmode=qmode, append_fusion_output=output_fusion_tensors) + + def balance_filters( + self, + step_idx: int = None, + precision_threshold=0.20 + ): """Experimental filter balancing routines Args: diff --git a/tools/nntool/graph/types/base.py b/tools/nntool/graph/types/base.py index e80eb5223..a75113604 100644 --- a/tools/nntool/graph/types/base.py +++ b/tools/nntool/graph/types/base.py @@ -28,14 +28,6 @@ LOG = logging.getLogger("nntool." 
+ __name__) -class ParameterError(Exception): - pass - - -class CantPromoteQError(ParameterError): - pass - - class NodeOptions(OptionList): def __init__(self, *args, **kwargs): super(NodeOptions, self).__init__(*args, **kwargs) @@ -253,13 +245,6 @@ def value(self): def value(self, val): self._value = val - @property - def can_promoteq(self): - return False - - def promoteq(self): - raise CantPromoteQError() - @property def in_dims(self): return self._in_dims diff --git a/tools/nntool/graph/types/conv2d.py b/tools/nntool/graph/types/conv2d.py index b9444314f..3e7b7bed2 100644 --- a/tools/nntool/graph/types/conv2d.py +++ b/tools/nntool/graph/types/conv2d.py @@ -27,7 +27,7 @@ class BatchNormalizationParameters(NoSizeChangeParameters, SingleInputAndOutput, SensitiveToOrder): #pylint: disable-msg=too-many-arguments - def __init__(self, name, scale=None, bias=None, running_mean=None, + def __init__(self, name, scale=None, bias=None, running_mean=None, axis=0, running_variance=None, spatial=None, momentum=None, epsilon=None, **kwargs): super(BatchNormalizationParameters, self).__init__(name, **kwargs) self.scale = scale @@ -37,6 +37,7 @@ def __init__(self, name, scale=None, bias=None, running_mean=None, self.spatial = spatial self.momentum = momentum self.epsilon = epsilon + self.axis = axis @property def can_equalize(self): diff --git a/tools/nntool/graph/types/expression_fusion.py b/tools/nntool/graph/types/expression_fusion.py index 43afedf3b..554027cc2 100644 --- a/tools/nntool/graph/types/expression_fusion.py +++ b/tools/nntool/graph/types/expression_fusion.py @@ -18,6 +18,7 @@ from collections import Counter from expressions.symbolic.function_collection import FunctionCollection +from expressions.symbolic.iteration_space import Assignments from expressions.symbolic.symbol import Constant, Variable from utils.node_id import NodeId @@ -138,9 +139,9 @@ def details_collector(self, stats, stat, details): def is_same_operation_as(self, G, other): if not isinstance(other, 
ExpressionFusionParameters): return False - if len(self.func_col.functions) != 1 or len(other.func_col.functions) != 1: + if len(self.func_col) != 1 or len(other.func_col) != 1: return False - if next(iter(self.func_col.functions.values())).equivalent(next(iter(other.func_col.functions.values()))): + if self.func_col[0][1].equivalent(other.func_col[0][1]): return True return False @@ -156,7 +157,7 @@ def decompose(self, qrecs=None): LOG.info("expression decomposed into %s intermediate and %s output expressions", len(intermediates), len(outputs)) - expressions = [] + expressions = Assignments() inter_vars = {node: Variable( node.name, shape=node.dims.shape) for node in inputs} # TODO - Intermediates are not sorted here so there may be interdependences @@ -172,38 +173,37 @@ def decompose(self, qrecs=None): variable=variable, qrecs=qrecs) inter_vars[node] = variable - expressions.append(expr) + expressions.add(*expr) for node in outputs: expr = self.compose_expression( self.subgraph, node, inter_vars, qrecs=qrecs) - expressions.append(expr) + expressions.add(*expr) # sort the inputs by idx inputs = sorted([node for node in inputs], key=lambda x: x.idx) outputs = sorted([node for node in outputs], key=lambda x: x.idx) - func_col = FunctionCollection(expressions) - return [node.name for node in inputs], [node.name for node in outputs], func_col + return [node.name for node in inputs], [node.name for node in outputs], expressions def get_output_size(self, in_dims): # the input shapes may have changed so the expression variables shapes could have # changed and the iterators will need to be recalculated - dim_change = False + # dim_change = False in_vars = [self.func_col.variables[name] for name in self.input_symbols] for idx, dim in enumerate(in_dims): shape = tuple(dim.shape) if tuple(in_vars[idx].shape) != shape: in_vars[idx].shape = shape - dim_change = True - if dim_change: - self.func_col.set_var_shapes() + # dim_change = True + # if dim_change: + # 
self.func_col.set_var_shapes() out_dims = super().get_output_size(in_dims) - if dim_change: # if the input shapes haven't changed then the output shapes have not changed - out_vars = [self.func_col.variables[name] for name in self.output_symbols] - for idx, dim in enumerate(out_dims): - out_vars[idx].shape = tuple(dim.shape) - self.func_col.init_indexes() # recalculate the iterators + # if dim_change: # if the input shapes haven't changed then the output shapes have not changed + # out_vars = [self.func_col.variables[name] for name in self.output_symbols] + # for idx, dim in enumerate(out_dims): + # out_vars[idx].shape = tuple(dim.shape) + # self.func_col.init_indexes() # recalculate the iterators return out_dims def __str__(self): diff --git a/tools/nntool/graph/types/others.py b/tools/nntool/graph/types/others.py index 70bacc828..f22874cc6 100644 --- a/tools/nntool/graph/types/others.py +++ b/tools/nntool/graph/types/others.py @@ -142,7 +142,7 @@ def __str__(self): @cls_op_name('expand') -class ExpandParameters(Parameters, InsensitiveToQuantization): +class ExpandParameters(Parameters, SensitiveToOrder, InsensitiveToQuantization): def __init__(self, *args, shape=None, **kwargs): super(ExpandParameters, self).__init__(*args, **kwargs) self.shape = shape @@ -178,6 +178,26 @@ def get_output_size(self, in_dims): def __str__(self): return f"{self.shape}" +@cls_op_name('scatternd') +class ScatterNdParameters(Parameters, SensitiveToOrder): + def __init__(self, *args, indices=None, updates=None, reduction=None, **kwargs): + super(ScatterNdParameters, self).__init__(*args, **kwargs) + self.indices = indices + self.updates = updates + self.reduction = reduction + + def get_parameter_size(self): + return 0 + + @property + def can_equalize(self): + return False + + def get_output_size(self, in_dims): + return [Dim.unnamed(in_dims[0].shape)] + + def __str__(self): + return "" @cls_op_name('quantize') class QuantizeParameters(Parameters, ComparableParameters): @@ -429,8 +449,6 
@@ def __init__(self, *args, super(StridedSliceParameters, self).__init__(*args, **kwargs) self.act_slice = act_slice - self.slice_shape = tuple( - int(abs(math.ceil((sl[1] - sl[0])/sl[2]))) for sl in self.act_slice) self.out_shape = tuple(out_shape) @property @@ -443,27 +461,17 @@ def graph_anon_label(self): @property def slice_shape(self): - return self._slice_shape - - @slice_shape.setter - def slice_shape(self, val): - self._slice_shape = tuple(val) + return tuple( + int(abs(math.ceil((max(sl[1], -1) - max(sl[0], -1))/sl[2]))) for sl in self.act_slice) @property def slices_axes(self): in_shape = self.in_dims[0].shape - return tuple(idx for idx, shapes in enumerate(zip(self.post_slice_shape, in_shape)) if shapes[0] != shapes[1]) - - @property - def post_slice_shape(self): - old_settings = np.seterr(all='raise') - res = tuple(abs(((sl[1] if sl[1] >= -1 else -1) - sl[0])//sl[2]) for sl in self.act_slice) - np.seterr(**old_settings) - return res + return tuple(idx for idx, shapes in enumerate(zip(self.slice_shape, in_shape)) if shapes[0] != shapes[1]) @property def changes_shape(self): - return self.post_slice_shape != self.out_shape + return self.slice_shape != self.out_shape @property def can_equalize(self): @@ -509,7 +517,7 @@ def does_nothing(self) -> bool: def no_model_code(self) -> bool: if not self.in_dims: return False - return self.post_slice_shape == tuple(self.in_dims[0].shape) + return self.slice_shape == tuple(self.in_dims[0].shape) def get_parameter_size(self): return 0 diff --git a/tools/nntool/importer/common/constant_mixin.py b/tools/nntool/importer/common/constant_mixin.py index ddaae71d0..8cc879542 100644 --- a/tools/nntool/importer/common/constant_mixin.py +++ b/tools/nntool/importer/common/constant_mixin.py @@ -47,3 +47,12 @@ def record_constant_qrec(cls, inp, cnode, **kwargs): if qrecs is None: return qrecs[NodeId(cnode)] = QRec.scaled(out_qs=[qtype]) + + @classmethod + def move_stat(cls, inp, new_name, **kwargs): + cnid = NodeId(new_name) + 
onid = NodeId(inp[0]) + qopts = kwargs.get('qopts', {}) + if onid in qopts: + qopts[cnid] = qopts[onid] + del qopts[onid] diff --git a/tools/nntool/importer/onnx/common/__init__.py b/tools/nntool/importer/onnx/common/__init__.py index 834daa668..e238b949c 100644 --- a/tools/nntool/importer/onnx/common/__init__.py +++ b/tools/nntool/importer/onnx/common/__init__.py @@ -4,7 +4,20 @@ from onnx import TensorProto, mapping, helper -logger = logging.getLogger('nntool.' + __name__) +_logger = logging.getLogger('nntool.' + __name__) + +class logger: + @staticmethod + def info(*args, **kwargs): + _logger.info(*args, **kwargs) + + @staticmethod + def warning(*args, **kwargs): + _logger.warning(*args, **kwargs) + + @staticmethod + def debug(*args, **kwargs): + _logger.debug(*args, **kwargs) def get_unique_suffix(): """ Get unique suffix by using first 8 chars from uuid.uuid4 diff --git a/tools/nntool/importer/onnx/common/handler_helper.py b/tools/nntool/importer/onnx/common/handler_helper.py index e6d32a8bf..6d4723a46 100644 --- a/tools/nntool/importer/onnx/common/handler_helper.py +++ b/tools/nntool/importer/onnx/common/handler_helper.py @@ -87,6 +87,7 @@ def get_all_backend_handlers(opset_dict): return handlers + def get_backend_coverage(): """ Get backend coverage for document. 
diff --git a/tools/nntool/importer/onnx/handlers/backend/add.py b/tools/nntool/importer/onnx/handlers/backend/add.py index 7b5dcff3a..fa67f4db9 100644 --- a/tools/nntool/importer/onnx/handlers/backend/add.py +++ b/tools/nntool/importer/onnx/handlers/backend/add.py @@ -37,3 +37,7 @@ def version_7(cls, node, **kwargs): @classmethod def version_13(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/batch_normalization.py b/tools/nntool/importer/onnx/handlers/backend/batch_normalization.py index 0f01ef96a..645c72b30 100644 --- a/tools/nntool/importer/onnx/handlers/backend/batch_normalization.py +++ b/tools/nntool/importer/onnx/handlers/backend/batch_normalization.py @@ -95,7 +95,8 @@ def _common(cls, node, **kwargs): params = BatchNormalizationParameters(valid_name, scale=bn_scale, bias=bn_bias, running_mean=running_mean, running_variance=running_variance, spatial=spatial, - momentum=momentum, epsilon=epsilon) + momentum=momentum, epsilon=epsilon, + axis=0) G.add_edge(NNEdge(from_node=x[0], to_node=params, from_idx=x[1], to_idx=0)) all_nodes[node.output[0]] = (params, 0, deepcopy(x[2]), None) return params @@ -115,3 +116,11 @@ def version_7(cls, node, **kwargs): @classmethod def version_9(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) + + @classmethod + def version_15(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/concat_mixin.py b/tools/nntool/importer/onnx/handlers/backend/concat_mixin.py index a5a658467..0663e06c4 100644 --- a/tools/nntool/importer/onnx/handlers/backend/concat_mixin.py +++ b/tools/nntool/importer/onnx/handlers/backend/concat_mixin.py @@ -32,7 +32,7 @@ def gen_concat(cls, node, inputs, axis, **kwargs): all_nodes = 
kwargs['all_nodes'] G = kwargs['G'] valid_name = kwargs['valid_name'] - inputs = [all_nodes[inp] for inp in node.input] + inputs = [all_nodes[inp] for inp in node.input if all_nodes[inp][2].shape] input_shapes = [inp[2].shape for inp in inputs] axis_sum = sum(shape[axis] for shape in input_shapes) axis = axis if axis >= 0 else len(input_shapes[0]) + axis diff --git a/tools/nntool/importer/onnx/handlers/backend/conv_mixin.py b/tools/nntool/importer/onnx/handlers/backend/conv_mixin.py index a672c75a5..920394d11 100644 --- a/tools/nntool/importer/onnx/handlers/backend/conv_mixin.py +++ b/tools/nntool/importer/onnx/handlers/backend/conv_mixin.py @@ -17,6 +17,7 @@ from copy import deepcopy import numpy as np + from graph.dim import Conv2DFilterDim, DilationDim, Dim, StrideDim from graph.types import (ConstantInputParameters, Conv2DParameters, NNEdge, ReshapeParameters) @@ -86,7 +87,9 @@ def conv(cls, node, quantized=False, **kwargs): # M x C/group x kH x kW weights_idx = 3 if quantized else 1 weights_node = inputs[weights_idx][0] - weights_node.name = f'{valid_name}_weights' + new_name = f'{valid_name}_weights' + cls.move_stat(inputs[weights_idx], new_name, **kwargs) + weights_node.name = new_name weights = cls.get_constant(inputs[weights_idx]) out_c = weights.shape[0] group = node.attrs.get("group", 1) @@ -203,6 +206,11 @@ def conv(cls, node, quantized=False, **kwargs): # check if input needs a reshape if conv_in_shape != real_in_shape: + # if batch is present add it back + if batch is not None: + conv_in_shape = (batch,) + conv_in_shape + if np.prod(real_in_shape) != np.prod(conv_in_shape): + raise ValueError(f'shape inference issue {valid_name} filter indicates {conv_in_shape} but has an input of {real_in_shape}') r1_params = ReshapeParameters(f'{valid_name}_reshape_in', old_shape=Dim.unnamed(real_in_shape), shape=Dim.unnamed(conv_in_shape)) diff --git a/tools/nntool/importer/onnx/handlers/backend/conv_transpose.py
b/tools/nntool/importer/onnx/handlers/backend/conv_transpose.py index cddd73fb2..17bd0f764 100644 --- a/tools/nntool/importer/onnx/handlers/backend/conv_transpose.py +++ b/tools/nntool/importer/onnx/handlers/backend/conv_transpose.py @@ -120,7 +120,7 @@ def _common(cls, node, **kwargs): dims=Dim.unnamed( biases.shape)) - padding, dilations, strides, output_padding = cls.calc_shapes(node, spatial_size, Dim2D((h, w)), Dim2D((filt_h, filt_w))) + padding, dilations, strides, output_padding = cls.calc_shapes(node, spatial_size, Dim2D(h, w), Dim2D(filt_h, filt_w)) params = TransposeConv2DParameters(valid_name, filt=filt_dim, diff --git a/tools/nntool/importer/onnx/handlers/backend/div.py b/tools/nntool/importer/onnx/handlers/backend/div.py index 39c9bc963..c668dd043 100644 --- a/tools/nntool/importer/onnx/handlers/backend/div.py +++ b/tools/nntool/importer/onnx/handlers/backend/div.py @@ -38,3 +38,7 @@ def version_7(cls, node, **kwargs): @classmethod def version_13(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/expand.py b/tools/nntool/importer/onnx/handlers/backend/expand.py index dd83c73c4..fa7026a8f 100644 --- a/tools/nntool/importer/onnx/handlers/backend/expand.py +++ b/tools/nntool/importer/onnx/handlers/backend/expand.py @@ -17,10 +17,11 @@ from graph.types import ConstantInputParameters, ExpandParameters from graph.types.base import NNEdge from importer.common.constant_mixin import ConstantMixin +from importer.common.provisional_dim import ProvisionalDim from importer.onnx.common import logger from ..backend_handler import BackendHandler -from ..handler import onnx_op, constant_only +from ..handler import constant_only, onnx_op from .broadcast_mixin import BroadcastMixin @@ -38,7 +39,6 @@ def _common(cls, node, **kwargs): y = inputs[1] shape = cls.get_constant(y) - pshape = cls.broadcast_to(x, shape) if 
cls.is_constant(x): logger.info("reducing %s to a constant", valid_name) x_val = cls.get_constant(x) @@ -47,7 +47,7 @@ def _common(cls, node, **kwargs): params = ExpandParameters(valid_name, shape=shape) G.add_edge(NNEdge(x[0], params, from_idx=x[1])) - all_nodes[node.output[0]] = (params, 0, pshape, x[3]) + all_nodes[node.output[0]] = (params, 0, ProvisionalDim(shape), x[3]) return params @classmethod diff --git a/tools/nntool/importer/onnx/handlers/backend/gather.py b/tools/nntool/importer/onnx/handlers/backend/gather.py index 911396330..b86f44828 100644 --- a/tools/nntool/importer/onnx/handlers/backend/gather.py +++ b/tools/nntool/importer/onnx/handlers/backend/gather.py @@ -38,11 +38,16 @@ def _common(cls, node, **kwargs): x = inputs[0] x_shape = x[2].shape y = inputs[1] + y_shape = y[2].shape indices = cls.get_constant(y) axis = node.attrs.get('axis', 0) - pshape = ProvisionalDim( - x_shape[:axis:] + list(indices.shape) + x_shape[axis + 1:]) + if not y_shape: + pshape = ProvisionalDim( + x_shape[:axis:] + x_shape[axis + 1:]) + else: + pshape = ProvisionalDim( + x_shape[:axis:] + list(indices.shape) + x_shape[axis + 1:]) if cls.is_constant(x): x_val = cls.get_constant(x) logger.info( @@ -57,7 +62,10 @@ def _common(cls, node, **kwargs): out_shape = pshape.known_shape.copy() params = StridedSliceParameters( valid_name, act_slice=act_slice, out_shape=out_shape) - if params.post_slice_shape == tuple(x[2].known_shape): + if params.slice_shape == tuple(x[2].known_shape): + if np.ndim(indices) == 0 and pshape.shape[idx] is not None: + del out_shape[idx] + pshape = ProvisionalDim(out_shape) params = ReshapeParameters(valid_name, old_shape=tuple( x[2].known_shape), shape=out_shape) else: diff --git a/tools/nntool/importer/onnx/handlers/backend/gru.py b/tools/nntool/importer/onnx/handlers/backend/gru.py index 6e6bb8c2d..72625db7b 100644 --- a/tools/nntool/importer/onnx/handlers/backend/gru.py +++ b/tools/nntool/importer/onnx/handlers/backend/gru.py @@ -94,3 +94,7 @@ def 
version_3(cls, node, **kwargs): @classmethod def version_7(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/lstm.py b/tools/nntool/importer/onnx/handlers/backend/lstm.py index c043fd030..230aa0835 100644 --- a/tools/nntool/importer/onnx/handlers/backend/lstm.py +++ b/tools/nntool/importer/onnx/handlers/backend/lstm.py @@ -81,3 +81,7 @@ def version_1(cls, node, **kwargs): @classmethod def version_7(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/mul.py b/tools/nntool/importer/onnx/handlers/backend/mul.py index da171f978..6c8f7f1d4 100644 --- a/tools/nntool/importer/onnx/handlers/backend/mul.py +++ b/tools/nntool/importer/onnx/handlers/backend/mul.py @@ -36,3 +36,7 @@ def version_7(cls, node, **kwargs): @classmethod def version_13(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/nncf_fake_quantize.py b/tools/nntool/importer/onnx/handlers/backend/nncf_fake_quantize.py index ff395a6c3..9d322f005 100644 --- a/tools/nntool/importer/onnx/handlers/backend/nncf_fake_quantize.py +++ b/tools/nntool/importer/onnx/handlers/backend/nncf_fake_quantize.py @@ -18,6 +18,7 @@ import numpy as np from importer.common.constant_mixin import ConstantMixin from quantization.qtype import QType +from utils.node_id import NodeId from ..backend_handler import BackendHandler from ..handler import domain, onnx_op @@ -38,8 +39,7 @@ def _common(cls, node, **kwargs): if auto_broadcast != 'numpy': raise ValueError(f'{valid_name} - only numpy is supported for auto_broadcast') - qstats = kwargs.get('quant_stats', {}) - qopts = 
kwargs.get('quant_opts', {}) + qopts = kwargs.get('qopts', {}) x = inputs[0] # input_low = inputs[1] # input_high = inputs[2] @@ -54,6 +54,7 @@ def _common(cls, node, **kwargs): raise ValueError(f"{valid_name} - don't know how to handle more than {math.pow(2, 16)} levels") bits = int(math.log2(levels)) + qopts.setdefault(NodeId(x[0]), {'output_size': [None] * (x[1] + 1)})['output_size'][x[1]] = bits low_shape = output_low.shape high_shape = output_high.shape bc_dims_low = sum(1 for dim in high_shape if dim > 1) diff --git a/tools/nntool/importer/onnx/handlers/backend/range.py b/tools/nntool/importer/onnx/handlers/backend/range.py new file mode 100644 index 000000000..242544e0f --- /dev/null +++ b/tools/nntool/importer/onnx/handlers/backend/range.py @@ -0,0 +1,47 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +import numpy as np +from graph.types import ConstantInputParameters +from importer.common.constant_mixin import ConstantMixin +from importer.common.provisional_dim import ProvisionalDim + +from ..backend_handler import BackendHandler +from ..handler import constant_only, onnx_op + + +@onnx_op("Range") +@constant_only(True) +class Range(BackendHandler, ConstantMixin): + + @classmethod + def _common(cls, node, **kwargs): + all_nodes = kwargs['all_nodes'] + G = kwargs['G'] + valid_name = kwargs['valid_name'] + value = node.attrs.get('value', 0) + inputs = [all_nodes[inp] if inp else None for inp in node.input] + if len(inputs) != 3: + raise ValueError(f'Range {valid_name} does not have 3 inputs') + start, limit, delta = [cls.get_constant(x) for x in inputs] + value = np.arange(start, limit, delta, dtype=start.dtype) + params = ConstantInputParameters(valid_name, + value=value) + all_nodes[node.output[0]] = (params, 0, ProvisionalDim(value.shape), None) + return params + + @classmethod + def version_11(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/reducer_mixin.py b/tools/nntool/importer/onnx/handlers/backend/reducer_mixin.py index 1d126bda0..945fed447 100644 --- a/tools/nntool/importer/onnx/handlers/backend/reducer_mixin.py +++ b/tools/nntool/importer/onnx/handlers/backend/reducer_mixin.py @@ -13,6 +13,8 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
+from functools import reduce + from graph.dim import Dim from graph.types import (ConstantInputParameters, GlobalPoolingParameters, NNEdge, NoOPParameters) @@ -20,6 +22,15 @@ from importer.common.provisional_dim import ProvisionalDim from importer.onnx.common import logger +def axis_reduction(shape, axes): + def reduction(state, idx_dim): + idx, dim = idx_dim + if dim is None: + return state[0], state[1] + if idx in axes: + return state[0] + 1, state[1] + [state[0]] + return state[0] + 1, state[1] + return tuple(reduce(reduction, enumerate(shape), (0,[]))[1]) class ReducerMixin(ConstantMixin): @classmethod @@ -43,8 +54,7 @@ def _common(cls, node, copy_qtype=False, constant_operation=None, **kwargs): x_rank for axis in axes), "axis out of bounds" keep_dims = node.attrs.get('keepdims', 1) - stripped_axes = [axis for axis in axes if x_shape[axis] is not None] - + stripped_axes = axis_reduction(x_shape, axes) if not stripped_axes: params = NoOPParameters(valid_name) pout_shape = x_shape.copy() @@ -57,22 +67,16 @@ def _common(cls, node, copy_qtype=False, constant_operation=None, **kwargs): else: pout_shape = [dim for idx, dim in enumerate( x_shape) if idx not in axes] - # if all(dim is None for dim in pout_shape): - # pout_shape.append(1) - # subtract 1 from axis for all None's preceeding it and remove - # axes that are not defined - axes = [ax - sum([1 if dim is None else 0 for dim in x_shape[:ax:]]) - for ax in stripped_axes] if cls.is_constant(x) and constant_operation: - val = constant_operation(cls.get_constant(x), axis=tuple(axes), keepdims=keep_dims) + val = constant_operation(cls.get_constant(x), axis=stripped_axes, keepdims=keep_dims) if val.size < 10: logger.info("reducing %s to a constant %s", valid_name, val) else: logger.info("reducing %s to a constant", valid_name) params = ConstantInputParameters(valid_name, value=val, dims=Dim.unnamed(val.shape)) else: - params = GlobalPoolingParameters(valid_name, pool_type=reduce_type, axis=tuple(axes), + params = 
GlobalPoolingParameters(valid_name, pool_type=reduce_type, axis=stripped_axes, keep_dims=keep_dims) G.add_edge( diff --git a/tools/nntool/importer/onnx/handlers/backend/relu.py b/tools/nntool/importer/onnx/handlers/backend/relu.py index 432208acc..626460a54 100644 --- a/tools/nntool/importer/onnx/handlers/backend/relu.py +++ b/tools/nntool/importer/onnx/handlers/backend/relu.py @@ -42,3 +42,7 @@ def version_6(cls, node, **kwargs): @classmethod def version_13(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/reshape.py b/tools/nntool/importer/onnx/handlers/backend/reshape.py index d5f0dda54..ab930b296 100644 --- a/tools/nntool/importer/onnx/handlers/backend/reshape.py +++ b/tools/nntool/importer/onnx/handlers/backend/reshape.py @@ -111,3 +111,7 @@ def version_5(cls, node, **kwargs): @classmethod def version_13(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/resize.py b/tools/nntool/importer/onnx/handlers/backend/resize.py index f5545d46a..0b3d42eda 100644 --- a/tools/nntool/importer/onnx/handlers/backend/resize.py +++ b/tools/nntool/importer/onnx/handlers/backend/resize.py @@ -14,8 +14,10 @@ # along with this program. If not, see . 
import numpy as np +from pytest import param from graph.dim import Dim from graph.types import NNEdge, ReshapeParameters +from graph.types.constant_input import ConstantInputParameters from graph.types.resizers import (BilinearResizerParameters, NearestNeighborResizerParameters) from importer.common.constant_mixin import ConstantMixin @@ -51,6 +53,13 @@ def _common(cls, node, scales, sizes, nearest_mode='round_prefer_ceil', **kwargs else: sizes = [None if x_shape[idx] is None else dim for idx, dim in enumerate(sizes)] + + if np.prod([sz for sz in sizes if sz is not None]) == 0: + logger.warn(f'{valid_name} has null output shape') + params = ConstantInputParameters(valid_name, value=np.array([])) + all_nodes[node.output[0]] = (params, 0, ProvisionalDim([]), x[3]) + return params + if spatial_size == 1: sizes.insert(-1, 1) diff --git a/tools/nntool/importer/onnx/handlers/backend/rnn.py b/tools/nntool/importer/onnx/handlers/backend/rnn.py index ce22a6340..a871f2719 100644 --- a/tools/nntool/importer/onnx/handlers/backend/rnn.py +++ b/tools/nntool/importer/onnx/handlers/backend/rnn.py @@ -80,3 +80,7 @@ def version_1(cls, node, **kwargs): @classmethod def version_7(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/rnn_mixin.py b/tools/nntool/importer/onnx/handlers/backend/rnn_mixin.py index 6f8a17329..eac9090bc 100644 --- a/tools/nntool/importer/onnx/handlers/backend/rnn_mixin.py +++ b/tools/nntool/importer/onnx/handlers/backend/rnn_mixin.py @@ -121,6 +121,13 @@ def attach_rnn(G, x, rnn_params_class, extra_args, valid_name, tensors, t = tensors['forward' if i == 0 else 'backward'] for idx, name in enumerate(rnn_params.INPUT_NAMES): if name == 'input': + # x_shape = x[2].shape + # new_shape = [x_shape[0] if x_shape[0] is not None else 1, x_shape[-1]] + # reshape_param = ReshapeParameters(f"{valid_name}_reshape", 
old_shape=x_shape, shape=new_shape) + # G.add_edge( + # NNEdge(from_node=x[0], to_node=reshape_param, from_idx=x[1], to_idx=0)) + # G.add_edge( + # NNEdge(from_node=reshape_param, to_node=rnn_params, from_idx=0, to_idx=0)) G.add_edge( NNEdge(from_node=x[0], to_node=rnn_params, from_idx=x[1], to_idx=0)) continue diff --git a/tools/nntool/importer/onnx/handlers/backend/scatternd.py b/tools/nntool/importer/onnx/handlers/backend/scatternd.py new file mode 100644 index 000000000..9e3b6b3a0 --- /dev/null +++ b/tools/nntool/importer/onnx/handlers/backend/scatternd.py @@ -0,0 +1,83 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +import numpy as np +from graph.types import ConstantInputParameters, NNEdge +from graph.types.others import ScatterNdParameters +from importer.common.constant_mixin import ConstantMixin +from importer.common.provisional_dim import ProvisionalDim +from importer.onnx.common import logger + +from ..backend_handler import BackendHandler +from ..handler import onnx_op, partial_support, ps_description + +def scatter_nd_impl(data, indices, updates, reduction='none'): + # Check tensor shapes + assert indices.shape[-1] <= len(data.shape) + assert updates.shape == indices.shape[:-1] + data.shape[indices.shape[-1]:] + + # Compute output + output = np.copy(data) + for i in np.ndindex(indices.shape[:-1]): + if reduction == 'add': + output[indices[i]] += updates[i] + elif reduction == 'mul': + output[indices[i]] *= updates[i] + else: + output[indices[i]] = updates[i] + return output + +@onnx_op("ScatterND") +@partial_support(True) +@ps_description('ScatterND is only supported at input and is not supported by nntool or autotiler kernels') +class ScatterND(ConstantMixin, BackendHandler): + + @classmethod + def _common(cls, node, **kwargs): + all_nodes = kwargs['all_nodes'] + G = kwargs['G'] + valid_name = kwargs['valid_name'] + inputs = [all_nodes[inp] for inp in node.input] + x = inputs[0] + x_shape = x[2].shape + indices = cls.get_constant(inputs[1]) + updates = inputs[2] + reduction = node.attrs.get('reduction', None) + + pshape = ProvisionalDim(x_shape) + if cls.is_constant(x) and cls.is_constant(updates): + logger.info("reducing %s to a constant", valid_name) + x_val = cls.get_constant(x) + updates_val = cls.get_constant(updates) + params = ConstantInputParameters(valid_name, value=scatter_nd_impl(x_val, indices, updates_val, reduction=reduction)) + else: + logger.warning(f'{valid_name} ScatterND is not currently supported in the nntool or Autotiler kernels') + params = ScatterNdParameters(valid_name, indices=indices, updates=updates, reduction=reduction) + 
G.add_edge(NNEdge(from_node=x[0], to_node=params, from_idx=x[1], to_idx=0)) + G.add_edge(NNEdge(from_node=updates[0], to_node=params, from_idx=updates[1], to_idx=1)) + all_nodes[node.output[0]] = (params, 0, pshape, x[3]) + return params + + @classmethod + def version_11(cls, node, **kwargs): + return cls._common(node, **kwargs) + + @classmethod + def version_13(cls, node, **kwargs): + return cls._common(node, **kwargs) + + @classmethod + def version_16(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/sub.py b/tools/nntool/importer/onnx/handlers/backend/sub.py index 866db635c..a290997c3 100644 --- a/tools/nntool/importer/onnx/handlers/backend/sub.py +++ b/tools/nntool/importer/onnx/handlers/backend/sub.py @@ -35,3 +35,7 @@ def version_7(cls, node, **kwargs): @classmethod def version_13(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/thresholded_relu.py b/tools/nntool/importer/onnx/handlers/backend/thresholded_relu.py new file mode 100644 index 000000000..b23f6f5f5 --- /dev/null +++ b/tools/nntool/importer/onnx/handlers/backend/thresholded_relu.py @@ -0,0 +1,40 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +# TODO - This is not mappable onto our current kernels. To add if needed by a customer + +# import numpy as np +# from graph.types.activations import ReluActivationParameters +# from importer.onnx.handlers.backend.math_mixin import BasicMathMixin + +# from ..backend_handler import BackendHandler +# from ..handler import onnx_op + + +# @onnx_op("ThresholdedRelu") +# class ThresholdedRelu(BasicMathMixin, BackendHandler): + +# @classmethod +# def _common(cls, node, **kwargs): +# alpha = node.attrs.get('alpha', 1.0) +# return super(ThresholdedRelu, cls)._common(node, +# params_class=ReluActivationParameters, +# constant_operation=lambda x: np.clip(x, alpha, np.inf), +# params_args={'lower_limit': alpha}, +# **kwargs) + +# @classmethod +# def version_10(cls, node, **kwargs): +# return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/unsqueeze.py b/tools/nntool/importer/onnx/handlers/backend/unsqueeze.py index 3314062eb..9cbc10f71 100644 --- a/tools/nntool/importer/onnx/handlers/backend/unsqueeze.py +++ b/tools/nntool/importer/onnx/handlers/backend/unsqueeze.py @@ -36,7 +36,7 @@ def _common(cls, node, **kwargs): out_rank = len(x_shape) + len(kwargs['axes']) axes = cls._resolve_negative_ranks(kwargs['axes'], out_rank) - old_shape = x_shape.copy() + old_shape = list(x_shape) new_shape = [1 if new_idx in axes else old_shape.pop(0) for new_idx in range(out_rank)] diff --git a/tools/nntool/importer/onnx/handlers/backend/upsample.py b/tools/nntool/importer/onnx/handlers/backend/upsample.py new file mode 100644 index 000000000..048a620c4 --- /dev/null +++ b/tools/nntool/importer/onnx/handlers/backend/upsample.py @@ -0,0 +1,117 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import numpy as np +from graph.dim import Dim +from graph.types import NNEdge, ReshapeParameters +from graph.types.constant_input import ConstantInputParameters +from graph.types.resizers import (BilinearResizerParameters, + NearestNeighborResizerParameters) +from importer.common.constant_mixin import ConstantMixin +from importer.common.provisional_dim import ProvisionalDim +from importer.onnx.common import logger +from pytest import param + +from ..backend_handler import BackendHandler +from ..handler import onnx_op + + +@onnx_op("Upsample") +class Upsample(ConstantMixin, BackendHandler): + + @classmethod + def _common(cls, node, inputs, scales, **kwargs): + all_nodes = kwargs['all_nodes'] + G = kwargs['G'] + valid_name = kwargs['valid_name'] + x = inputs[0] + x_shape = x[2].shape + x_rank = len(x_shape) + + mode = node.attrs.get('mode', 'nearest') + + spatial_size = x_rank - 2 + in_c = x_shape[1] + in_w = x_shape[-1] + sizes = [int(shape * scale) if shape is not None else None + for shape, scale in zip(x_shape, scales)] + + if np.prod([sz for sz in sizes if sz is not None]) == 0: + logger.warn(f'{valid_name} has null output shape') + params = ConstantInputParameters(valid_name, value=np.array([])) + all_nodes[node.output[0]] = (params, 0, ProvisionalDim([]), x[3]) + return params + + if spatial_size == 1: + sizes.insert(-1, 1) + + if spatial_size != 2 and spatial_size != 1: + raise ValueError('resize only supports 4D tensor in NCHW mode or 3D tensor in NCF mode' + f' - input shape is {x_shape} sizes is {sizes}') + + if not all(x_dim == size_dim for x_dim, size_dim in zip(x_shape[:2:], 
sizes[:2:])): + raise ValueError('resize only supports 4D tensor in NCHW mode or 3D tensor in NCF mode' + f' - input shape is {x_shape} sizes is {sizes}') + + params_class = BilinearResizerParameters if mode == 'linear' else NearestNeighborResizerParameters + + params = params_class(valid_name, + new_shape=tuple(sizes[2::]), + align_corners=False, + halfpixel_centers=False, + in_dims_hint=[['c', 'h', 'w']], + out_dims_hint=[['c', 'h', 'w']]) + + if spatial_size == 1: + r1_params = ReshapeParameters(f'{valid_name}_reshape2d', + old_shape=Dim.unnamed([in_c, in_w]), + shape=Dim.unnamed([in_c, 1, in_w])) + r2_params = ReshapeParameters(f'{valid_name}_reshape1d', + old_shape=Dim.unnamed( + [in_c, 1, sizes[-1]]), + shape=Dim.unnamed([in_c, sizes[-1]])) + G.add_edge( + NNEdge(from_node=x[0], to_node=r1_params, from_idx=x[1], to_idx=0)) + G.add_edge(NNEdge(from_node=r1_params, + to_node=params, from_idx=0, to_idx=0)) + G.add_edge(NNEdge(from_node=params, + to_node=r2_params, from_idx=0, to_idx=0)) + pout_dims = ProvisionalDim(sizes[:-2:] + sizes[-1::]) + params = r2_params + else: + pout_dims = ProvisionalDim(sizes) + G.add_edge( + NNEdge(from_node=x[0], to_node=params, from_idx=x[1], to_idx=0)) + + all_nodes[node.output[0]] = (params, 0, pout_dims, x[3]) + return params + + @classmethod + def version_7(cls, node, **kwargs): + all_nodes = kwargs['all_nodes'] + inputs = [all_nodes[inp] if inp else None for inp in node.input] + scales = node.attrs['scales'] + return cls._common(node, inputs, scales, **kwargs) + + @classmethod + def version_9(cls, node, **kwargs): + all_nodes = kwargs['all_nodes'] + inputs = [all_nodes[inp] if inp else None for inp in node.input] + scales = cls.get_constant(inputs[1]) + return cls._common(node, inputs, scales, **kwargs) + + @classmethod + def version_10(cls, node, **kwargs): + return cls.version_9(node, **kwargs) diff --git a/tools/nntool/importer/onnx/onnx.py b/tools/nntool/importer/onnx/onnx.py index a89f4d71f..13032d210 100644 --- 
a/tools/nntool/importer/onnx/onnx.py +++ b/tools/nntool/importer/onnx/onnx.py @@ -70,7 +70,8 @@ def create_graph(self, filename, opts) -> NNGraph: opset_import = model.opset_import G = NNGraph(filename=filename, name=opts.get('name')) - G, qrecs = self._import_onnx_model(G, model.graph, opset_import, opts) + G, qrecs, qopts = self._import_onnx_model( + G, model.graph, opset_import, opts) G.add_dimensions(quiet=True) if qrecs: propagate_qrecs(G, qrecs) @@ -78,6 +79,7 @@ def create_graph(self, filename, opts) -> NNGraph: qset.update(qrecs) qset.scheme_priority = ['SQ8'] qset.schemes_present = {'SQ8'} + qset.options = qopts G.quantization = qset try: quantizer = NewQuantizer(G) @@ -88,9 +90,10 @@ def create_graph(self, filename, opts) -> NNGraph: clean_dangling_nodes(G) MatchDuplicateConstants().match(G) + G.add_dimensions(quiet=True) return G - def _update_qrecs(self, G, qrecs, all_nodes, ranges_dict): + def _update_qrecs(self, G, qrecs, all_nodes, ranges_dict, qopts): for node, idx, _, qtype in all_nodes.values(): if qtype is None and node.name not in ranges_dict.keys(): continue @@ -107,8 +110,11 @@ def _update_qrecs(self, G, qrecs, all_nodes, ranges_dict): if node.name in ranges_dict.keys(): out_min, out_max = ranges_dict[node.name]["range"] dtype = ranges_dict[node.name].get("dtype", np.int8) - bits = ranges_dict[node.name].get("n_bits", 8) + bits = ranges_dict[node.name].get("bits", 8) channel = ranges_dict[node.name].get("per_channel", None) + qopt = qopts.setdefault( + nid, {'output_size': [None] * len(G.indexed_out_edges(node))}) + qopt['output_size'][idx] = bits qtype = QType.from_min_max_sq( out_min, out_max, dtype=dtype, bits=bits, quantized_dimension=channel) qrec.out_qs[idx] = qtype @@ -127,14 +133,16 @@ def _import_onnx_model(self, G, graph, opset, opts): input_shapes=opts.get('input_shapes', {})) all_nodes.update(inputs) qrecs = {} + qopts = {} outputs = self._get_output_nodes( G, graph.output, substitutions=opts.get('substitutions', None)) shapes = 
{elem.name: elem.type for elem in graph.value_info} self._import_nodes( G, graph, self._handlers, all_nodes, outputs, - opts=opts, qrecs=qrecs, shapes=shapes) - self._update_qrecs(G, qrecs, all_nodes, opts.get('ranges_dict', {})) - return G, qrecs + opts=opts, qrecs=qrecs, shapes=shapes, qopts=qopts) + self._update_qrecs(G, qrecs, all_nodes, + opts.get('ranges_dict', {}), qopts) + return G, qrecs, qopts def import_subgraph(self, G, graph, opts, all_nodes=None): if all_nodes is None: @@ -153,7 +161,7 @@ def import_subgraph(self, G, graph, opts, all_nodes=None): self._import_nodes( G, graph, self._handlers, all_nodes, outputs, opts=opts, qrecs=qrecs) - self._update_qrecs(G, qrecs, all_nodes, {}) + self._update_qrecs(G, qrecs, all_nodes, {}, {}) return G, qrecs @staticmethod @@ -331,9 +339,14 @@ def _import_nodes(self, G, graph, handlers, all_nodes, outputs, **kwargs): continue handler = handlers[node.domain].get( node.op_type, None) if node.domain in handlers else None - if not handler or (handler.CONSTANT_ONLY and - not all(isinstance(all_nodes[inp_name][0], ConstantInputParameters) - for inp_name in node.input)): + if (handler and handler.CONSTANT_ONLY and + not all(isinstance(all_nodes[inp_name][0], ConstantInputParameters) + for inp_name in node.input)): + logger.warning( + f'{node.name} uses ONNX operator "{node.op_type}" domain ' + f'"{node.domain}" which is not currently supported in the Autotiler kernels. 
' + 'It may be eliminated by graph optimisations') + if not handler: handler = handlers['__extensions'].get(node.op_type, None) if not handler: logger.warning( @@ -360,7 +373,7 @@ def _import_nodes(self, G, graph, handlers, all_nodes, outputs, **kwargs): x = inputs[0] x_shape = x[2].shape name = hasattr(node, 'name') and getattr(node, 'name') - x=0 + x = 0 params = handler.handle(onode, all_nodes=all_nodes, vars_dict=vars_dict, G=G, valid_name=self._node_name(node), used_tensors=used_tensors, importer=self, **kwargs) diff --git a/tools/nntool/interpreter/commands/dump.py b/tools/nntool/interpreter/commands/dump.py index ec05f2acc..d6fa2498a 100644 --- a/tools/nntool/interpreter/commands/dump.py +++ b/tools/nntool/interpreter/commands/dump.py @@ -83,6 +83,8 @@ class DumpCommand(NNToolShellBase): action='store_true', help='dequantize result') parser_dump.add_argument('--quantize_and_dequantize', action='store_true', help='quantize and dequantize float results') + parser_dump.add_argument('--append_fusion_output', + action='store_true', help='quantize and dequantize float results') parser_dump_group = parser_dump.add_mutually_exclusive_group( required=False) parser_dump_group.add_argument('-q', '--quantize', action='store_true', @@ -155,7 +157,7 @@ def do_dump(self, args: argparse.Namespace): qrecs = None if qmode.is_none else self.G.quantization executer = GraphExecuter(self.G, qrecs=qrecs) outputs = executer.execute(data, step_idx_limit=step, - qmode=qmode) + qmode=qmode, append_fusion_output=args.append_fusion_output) if args.pickle or self._in_py or args.save: pickles.append(outputs) diff --git a/tools/nntool/interpreter/commands/imageformat.py b/tools/nntool/interpreter/commands/imageformat.py index 35f7755d9..f0468dc01 100644 --- a/tools/nntool/interpreter/commands/imageformat.py +++ b/tools/nntool/interpreter/commands/imageformat.py @@ -23,7 +23,7 @@ from graph.types import ImageFormatParameters, NNEdge, TransposeParameters - +from graph.manipulations.formatter 
import insert_formatter, remove_formatter class ImageFormatCommand(NNToolShellBase): def inputs_choices(self): if self.G is None: @@ -74,103 +74,3 @@ def do_imageformat(self, args: argparse.Namespace): f'format {args.image_formatter} and normalization {args.image_normalizer}') -def insert_formatter(G, input_node, formatter, normalizer): - format_node = ImageFormatParameters(input_node.name + "_formatter", - norm_func=normalizer.upper(), - format_change=formatter.upper()) - out_edges = G.out_edges(input_node.name) - - # dims updated to reflect formatter - if format_node.output_channels is not None and format_node.input_channels is not None: - out_dim = input_node.get_output_size(None)[0] - if formatter.upper() in ("BW8", "BW16"): - assert format_node.input_channels == 1 - in_dim = out_dim.clone() - format_node.out_dims_hint = input_node.out_dims_hint - format_node.in_dims_hint = input_node.out_dims_hint - input_node.dims = in_dim - for out_edge in out_edges: - G.remove_edge(out_edge) - else: - if not out_dim.is_named or out_dim.c != format_node.output_channels: - raise ValueError( - "current graph input is not named or does not match formatter output channels") - if formatter.upper() in ("RGB16", "BW16") and normalizer.upper() != "OUT_INT16": - raise ValueError( - "rgb16 and bw16 formatters must have out_int16 as normalization function") - in_dim = out_dim.clone() - in_dim.c = format_node.input_channels - in_dim.impose_order(("h", "w", "c")) - format_node.in_dims_hint = [["h", "w", "c"]] - input_node.dims = in_dim - if input_node.fixed_order: - new_out_edges = [] - for out_edge in out_edges: - if isinstance(out_edge.to_node, TransposeParameters): - trans_node = out_edge.to_node - transpose_edges = G.out_edges(trans_node.name) - new_out_edges.extend(transpose_edges) - G.remove(trans_node) - if G.quantization: - nid = NodeId(trans_node) - if nid in G.quantization: - del G.quantization[NodeId(trans_node)] - else: - new_out_edges.append(out_edge) - out_edges = 
new_out_edges - else: - input_node.fixed_order = True - for out_edge in out_edges: - G.remove_edge(out_edge) - format_node.out_dims_hint = [["c", "h", "w"]] * len(out_edges) - input_node.out_dims_hint = [["h", "w", "c"]] - G.node_options[NodeId(input_node)] = input_node.at_options - # qrec updated to reflect formatter - input_qrec = G.quantization and G.quantization.get(NodeId(input_node)) - if input_qrec and format_node.input_dtype and format_node.output_dtype: - formatter_qrec = G.quantization.get(NodeId(format_node)) - if not formatter_qrec: - if input_qrec.out_qs[0].dtype != format_node.output_dtype: - raise ValueError( - "current graph input output quantization does not match formatter output") - formatter_qrec = deepcopy(input_qrec) - formatter_qrec.out_qs[0] = deepcopy(formatter_qrec.out_qs[0]) - if formatter_qrec.ktype.startswith('scaled'): - formatter_in_q = QType( - scale=1, zero_point=0, dtype=format_node.input_dtype) - elif formatter_qrec.ktype.startswith('symmetric'): - formatter_in_q = QType(q=0, dtype=format_node.input_dtype) - else: - raise NotImplementedError("quantization has unknown type") - if len(formatter_qrec.in_qs) > 0: - formatter_qrec.in_qs[0] = formatter_in_q - input_qrec.in_qs[0] = formatter_in_q - else: - formatter_qrec.in_qs.append(formatter_in_q) - input_qrec.in_qs.append(formatter_in_q) - input_qrec.out_qs[0] = formatter_in_q - G.quantization[NodeId(format_node)] = formatter_qrec - - G.add_node(format_node) - G.add_edge(NNEdge(input_node, format_node)) - for out_edge in out_edges: - G.add_edge(NNEdge(format_node, out_edge.to_node, to_idx=out_edge.to_idx)) - - -def remove_formatter(G, fmt_node): - input_edges = G.in_edges(fmt_node.name) - assert len(input_edges) == 1, "formatter node should only have one input" - input_node = input_edges[0].from_node - fmt_edges = G.out_edges(fmt_node.name) - fmt_qrec = G.quantization and G.quantization.get(NodeId(fmt_node)) - G.remove(fmt_node) - - input_node.dims = fmt_node.out_dims[0] - 
input_node.out_dims_hint = fmt_node.out_dims_hint - for fmt_edge in fmt_edges: - G.add_edge(NNEdge(input_node, fmt_edge.to_node, to_idx=fmt_edge.to_idx)) - if fmt_qrec: - input_qrec = G.quantization[NodeId(input_node)] - input_qrec.out_qs = fmt_qrec.out_qs - input_qrec.in_qs = fmt_qrec.out_qs - G.quantization.remove_node(fmt_node) diff --git a/tools/nntool/interpreter/commands/qtune.py b/tools/nntool/interpreter/commands/qtune.py index e7b60a90f..f687fc392 100644 --- a/tools/nntool/interpreter/commands/qtune.py +++ b/tools/nntool/interpreter/commands/qtune.py @@ -110,7 +110,7 @@ def reduction(state, x): options = reduce(reduction, args.step, options) quantizer = NewQuantizer(self.G) - quantizer.options.update(options) + quantizer.update_options(options) quantizer.quantize() self.pfeedback('quantization options set') diff --git a/tools/nntool/interpreter/commands/remove.py b/tools/nntool/interpreter/commands/remove.py index 3283f59c8..17f51d84a 100644 --- a/tools/nntool/interpreter/commands/remove.py +++ b/tools/nntool/interpreter/commands/remove.py @@ -15,12 +15,15 @@ import argparse from functools import reduce +from itertools import chain, groupby from cmd2 import Cmd2ArgumentParser, with_argparser from interpreter.nntool_shell_base import NNToolShellBase from graph.types import ReshapeParameters, InputParameters, OutputParameters, ConstantInputParameters from graph.types.base import NNEdge +from quantization.new_qrec import QRec +from utils.node_id import NodeId class RemoveCommand(NNToolShellBase): @@ -49,27 +52,32 @@ def nodes_choices(self): def do_remove(self, args: argparse.Namespace): """Removes all the edges and nodes between two node. 
Will only work if nodes do not affect shape of tensor.""" self._check_graph() - if any(node not in self.G for node in args.nodes): - self.perror("node not found in graph") - return + for node in args.nodes: + if node not in self.G: + self.perror(f"node {node} not found in graph") + return node_from = self.G[args.nodes[0]] if len(args.nodes) == 1: if args.up: nodes_above = set(self.G.nodes_above(node_from)) if args.leave: remove_nodes = nodes_above - inputs_on = [] - dims = node_from.in_dims + # remove constant inputs on the node left as targets for removal for in_edge in self.G.indexed_in_edges(node_from): if isinstance(in_edge.from_node, ConstantInputParameters): nodes_above.remove(in_edge.from_node) - else: - inputs_on.append([in_edge]) else: - dims = node_from.out_dims remove_nodes = nodes_above | {node_from} - inputs_on = self.G.indexed_out_edges(node_from) - + # check for deleted nodes that have edges to left nodes. These need to be the new inputs. + # group them by source so common edges have one input + inputs_on = [ + list(edges) for _, edges in + groupby( + [edge for node in remove_nodes for edge in self.G.out_edges(node) + if edge.to_node not in remove_nodes], + key=lambda x: (x.from_node, x.from_idx))] + dims = [edges[0].to_node.in_dims[edges[0].to_idx] + for edges in inputs_on] input_names = sorted( [node.name for node in remove_nodes if isinstance(node, InputParameters)]) self.G.remove_all(remove_nodes) @@ -82,6 +90,13 @@ def do_remove(self, args: argparse.Namespace): self.G.add_edge(NNEdge(from_node=in_node, to_idx=edge.to_idx, to_node=edge.to_node)) + if self.G.quantization and edge_group: + edge = edge_group[0] + fnid = NodeId(edge.to_node) + if fnid in self.G.quantization: + qrec = self.G.quantization[fnid] + self.G.quantization[NodeId(in_node)] = QRec.copy_ktype( + qrec, out_qs=[qrec.in_qs[edge.to_idx]]) else: nodes_below = set(self.G.nodes_below(node_from)) if self.G.is_vertex_cut(nodes_below): @@ -107,6 +122,12 @@ def do_remove(self, args: 
argparse.Namespace): self.pfeedback(f'adding output {out_node.name}') self.G.add_edge(NNEdge(from_node=edge.from_node, from_idx=edge.from_idx, to_node=out_node)) + if self.G.quantization: + fnid = NodeId(edge.from_node) + if fnid in self.G.quantization: + qrec = self.G.quantization[fnid] + self.G.quantization[NodeId(out_node)] = QRec.copy_ktype( + qrec, in_qs=[qrec.out_qs[edge.from_idx]]) else: node_to = self.G[args.nodes[1]] nodes_between = self.G.nodes_between(node_from, node_to) @@ -121,7 +142,8 @@ def do_remove(self, args: argparse.Namespace): edges_from = set(self.G.out_edges(node_from)) edges_to = set(self.G.in_edges(node_to.name)) - between_edges = reduce(lambda s, x: s|set(self.G.edges(x)), nodes_between, set()) + between_edges = reduce(lambda s, x: s | set( + self.G.edges(x)), nodes_between, set()) edges_from = edges_from.intersection(between_edges) edges_to = edges_to.intersection(between_edges) if len(edges_from) != len(edges_to): diff --git a/tools/nntool/interpreter/commands/tflite.py b/tools/nntool/interpreter/commands/tflite.py index 71babab37..b8a729a11 100644 --- a/tools/nntool/interpreter/commands/tflite.py +++ b/tools/nntool/interpreter/commands/tflite.py @@ -15,14 +15,19 @@ from interpreter.nntool_shell_base import NNToolShellBase from importer.tflite2.common.handler_helper import get_backend_coverage, get_backend_partial_support_detail - +import texttable class HelpTFLiteCommand(NNToolShellBase): def help_tflite(self): ops_dict = get_backend_coverage()[0] bc_dict = get_backend_partial_support_detail() self.pfeedback("Supported operators and versions") + + table = texttable.Texttable() + table.set_cols_align(['l', 'l', 'l']) + table.set_max_width(120) + table.set_cols_width([30, 15, 60]) for op in ops_dict: - self.pfeedback("%s (%s)"%(op, ",".join(str(ver) for ver in ops_dict[op]))) - if op in bc_dict: - self.pfeedback(bc_dict[op]) + table.add_row([op, ",".join(str(ver) for ver in ops_dict[op]), bc_dict.get(op, "")]) + self.pfeedback("Supported 
operators and versions") + self.pfeedback(table.draw()+'\n') \ No newline at end of file diff --git a/tools/nntool/quantization/float/float_quantization_handler.py b/tools/nntool/quantization/float/float_quantization_handler.py index 0fe9824be..776054095 100644 --- a/tools/nntool/quantization/float/float_quantization_handler.py +++ b/tools/nntool/quantization/float/float_quantization_handler.py @@ -65,3 +65,10 @@ def _get_in_qs_from_stats(cls, params, stats, in_qs, **kwargs): for idx, dim in enumerate(params.in_dims)] return [QType(dtype=dtype) if dim is not None else None for idx, dim in enumerate(params.in_dims)] + + @classmethod + def get_min_max(cls, stats, idx=0, direction='out'): + if stats: + return (stats[f'range_{direction}'][idx]['min'], + stats[f'range_{direction}'][idx]['max']) + return None, None diff --git a/tools/nntool/quantization/float/quantizers/expression_fusion_float.py b/tools/nntool/quantization/float/quantizers/expression_fusion_float.py index ee45039e5..62fe5989e 100644 --- a/tools/nntool/quantization/float/quantizers/expression_fusion_float.py +++ b/tools/nntool/quantization/float/quantizers/expression_fusion_float.py @@ -30,8 +30,8 @@ # Fusion handler attribute not set since expressions are handled only by this handler @params_type(ExpressionFusionParameters) -@in_qs_constraint(MatchAll({'dtype': set([np.float32, np.float16, bfloat16])})) -@out_qs_constraint(MatchAll({'dtype': set([np.float32, np.float16, bfloat16])})) +@in_qs_constraint(MatchAll({'dtype': set([np.float32, np.float16, bfloat16, np.uint16, np.int16, np.uint8, np.int8])})) +@out_qs_constraint(MatchAll({'dtype': set([np.float32, np.float16, bfloat16, np.uint16, np.int16, np.uint8, np.int8])})) class ExpressionFusionFloat(FloatQuantizionHandler): @classmethod def _quantize(cls, params, in_qs, stats, **kwargs): diff --git a/tools/nntool/quantization/float/quantizers/input_float.py b/tools/nntool/quantization/float/quantizers/input_float.py new file mode 100644 index 
000000000..79c064ad9 --- /dev/null +++ b/tools/nntool/quantization/float/quantizers/input_float.py @@ -0,0 +1,54 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + + +from copy import deepcopy + +import numpy as np +from bfloat16 import bfloat16 +from graph.types import OutputParameters +from graph.types.input_output import InputParameters +from quantization.float.float_quantization_handler import \ + FloatQuantizionHandler +from quantization.new_qrec import QRec +from quantization.qtype import QType +from quantization.qtype_constraint import MatchAll +from quantization.quantizer_options import QTYPE_IND_OPTION +from quantization.unified_quantization_handler import (options, + out_qs_constraint, + params_type) + + +@params_type(InputParameters) +@out_qs_constraint(MatchAll({'dtype': set([np.float32, np.float16, bfloat16])})) +@options(QTYPE_IND_OPTION) +class FloatInput(FloatQuantizionHandler): + @classmethod + def _quantize(cls, params, in_qs, stats, **kwargs): + force_out_qs, dtype = cls.get_float_opts(**kwargs) + force_out_q = force_out_qs and force_out_qs[0] + opts = kwargs['opts'] + i_q_ind = opts.get('qtype_ind') + if force_out_q: + if force_out_q.dtype != dtype: + return None + i_q = deepcopy(force_out_q) + elif i_q_ind: + i_q = deepcopy(i_q_ind) + else: + min_val, max_val = cls.get_min_max(stats) + i_q = QType(dtype=dtype, 
min_val=min_val, max_val=max_val) + return QRec.float(out_qs=[i_q], + float_dtype=i_q.dtype) diff --git a/tools/nntool/quantization/float/quantizers/output_float.py b/tools/nntool/quantization/float/quantizers/output_float.py index 57e2daba0..2d506f890 100644 --- a/tools/nntool/quantization/float/quantizers/output_float.py +++ b/tools/nntool/quantization/float/quantizers/output_float.py @@ -14,6 +14,8 @@ # along with this program. If not, see . +from copy import deepcopy + import numpy as np from bfloat16 import bfloat16 from graph.types import OutputParameters @@ -22,7 +24,9 @@ from quantization.new_qrec import QRec from quantization.qtype import QType from quantization.qtype_constraint import MatchAll +from quantization.quantizer_options import QTYPE_IND_OPTION from quantization.unified_quantization_handler import (in_qs_constraint, + options, out_qs_constraint, params_type) @@ -30,12 +34,20 @@ @params_type(OutputParameters) @in_qs_constraint(MatchAll({'dtype': set([np.float32, np.float16, bfloat16])})) @out_qs_constraint(MatchAll({'dtype': set([np.float32, np.float16, bfloat16])})) +@options(QTYPE_IND_OPTION) class FloatOutput(FloatQuantizionHandler): @classmethod def _quantize(cls, params, in_qs, stats, **kwargs): force_out_qs, dtype = cls.get_float_opts(**kwargs) if force_out_qs and any(qtype.dtype != dtype for qtype in force_out_qs if qtype is not None): return None - return QRec.float(in_qs=[QType(dtype=dtype)], - out_qs=[QType(dtype=dtype)], - float_dtype=dtype) + opts = kwargs['opts'] + o_q_ind = opts.get('qtype_ind') + if o_q_ind: + o_q = deepcopy(o_q_ind) + else: + min_val, max_val = cls.get_min_max(stats, direction='in') + o_q = QType(dtype=dtype, min_val=min_val, max_val=max_val) + return QRec.float(in_qs=[o_q], + out_qs=[o_q], + float_dtype=o_q.dtype) diff --git a/tools/nntool/quantization/multiplicative/quantizers/activation_mult.py b/tools/nntool/quantization/multiplicative/quantizers/activation_mult.py index 56c24ed63..0467048c0 100644 --- 
a/tools/nntool/quantization/multiplicative/quantizers/activation_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/activation_mult.py @@ -31,7 +31,7 @@ from quantization.qtype import QType from quantization.unified_quantization_handler import (in_qs_constraint, out_qs_constraint,option_constraint, - params_type, options) + params_type, options, priority) from ..mult_quantization_handler import MultQuantizionHandler from quantization.quantizer_options import * @@ -104,6 +104,11 @@ def _quantize_sw(cls, params, in_qs, stats, in_dtype, out_dtype, out_asym, **kwa 8, dtype=in_dtype, forced=True) + elif in_dtype in [np.uint8, np.uint16]: + in_q = QType( + dtype=in_dtype, + scale=pow(2, -12), + zero_point=1<<(8 if in_dtype == np.uint8 else 16)) else: in_q = QType( dtype=in_dtype, @@ -133,7 +138,7 @@ def _quantize_sw(cls, params, in_qs, stats, in_dtype, out_dtype, out_asym, **kwa o_q = QType.from_min_max_sq(0, max_val, dtype=out_dtype, - asymmetric=(in_q.zero_point != 0)) + asymmetric=(in_q.zero_point != 0) or out_dtype in [np.uint8, np.uint16]) in_q = deepcopy(o_q) elif isinstance(params, TanHActivationParameters): o_q = QType.from_min_max_sq( @@ -225,6 +230,7 @@ def _quantize(cls, params, in_qs, stats, **kwargs): @in_qs_constraint({'dtype': {np.int8, np.int16, np.int32}}) @out_qs_constraint({'dtype': np.uint8}) @option_constraint(force_output_size={8, None}) +@priority(2) class ActivationMultSW_HSwish_I_U8(ActivationMultSWBase): @classmethod def _get_in_qs_from_stats(cls, params, stats, in_qs, **kwargs): diff --git a/tools/nntool/quantization/multiplicative/quantizers/add_sub_mult.py b/tools/nntool/quantization/multiplicative/quantizers/add_sub_mult.py index 68c39acd3..8f9d5832b 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/add_sub_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/add_sub_mult.py @@ -83,7 +83,7 @@ def _quantize_sw(cls, params, in_qs, stats, inout_dtype, asym=False, **kwargs): scale_in_mul_biases_q.scale 
= qrec.in_qs[scaled_idx].scale / \ qrec.in_qs[not_scaled_idx].scale - if qrec.in_qs[0].asymmetric: + if qrec.in_qs[0].zero_point or qrec.in_qs[1].zero_point or qrec.out_qs[0].zero_point: # (C - Zc)*Sc = (A - Za)*Sa + (B - Zb)*Sb = # C = Sa/Sc*(A + B*Sb/Sa - Za - Zb*Sb/Sa) + Zc = # = Sa/Sc*(A + B*Sb/Sa) + (Zc - Sa/Sc*(Za + Zb*Sb/Sa)) diff --git a/tools/nntool/quantization/multiplicative/quantizers/constant_input_mult.py b/tools/nntool/quantization/multiplicative/quantizers/constant_input_mult.py index 0ad4f0e97..467bea16b 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/constant_input_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/constant_input_mult.py @@ -17,15 +17,18 @@ from graph.types import ConstantInputParameters from quantization.new_qrec import QRec -from quantization.qtype import QType -from quantization.quantizer_options import QTYPE_IND_OPTION +from quantization.qtype import QType, DTYPES +from quantization.quantizer_options import QTYPE_IND_OPTION, OUTPUT_SIZE_OPTION from quantization.unified_quantization_handler import (needs_stats, options, params_type) from ..mult_quantization_handler import MultQuantizionHandler -@options(QTYPE_IND_OPTION) +@options( + QTYPE_IND_OPTION, + OUTPUT_SIZE_OPTION +) @params_type(ConstantInputParameters) @needs_stats(False) class ConstantInputMult(MultQuantizionHandler): @@ -42,8 +45,12 @@ def _quantize(cls, params, in_qs, stats, **kwargs): # derive quantization from statistics else: opts = kwargs.get('opts', {}) - o_q = opts.get('qtype_ind') - if not o_q: - o_q = QType.from_array_sq(params.value, dtype=out_dtype) + output_size = opts.get('output_size') + if output_size and output_size[0]: + cur_bits = DTYPES[out_dtype][0] + bits = min(output_size[0], cur_bits) + else: + bits = None + o_q = QType.from_array_sq(params.value, dtype=out_dtype, bits=bits) o_q.is_constant = True return QRec.scaled(in_qs=[o_q], out_qs=[o_q]) diff --git 
a/tools/nntool/quantization/multiplicative/quantizers/expression_fusion_mult.py b/tools/nntool/quantization/multiplicative/quantizers/expression_fusion_mult.py index cd4be879e..6e4b47eed 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/expression_fusion_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/expression_fusion_mult.py @@ -26,14 +26,14 @@ from quantization.unified_quantization_handler import (in_qs_constraint, out_qs_constraint, params_type) - +from bfloat16 import bfloat16 from ..mult_quantization_handler import MultQuantizionHandler LOG = logging.getLogger('nntool.' + __name__) @params_type(ExpressionFusionParameters) -@in_qs_constraint(MatchAll({'dtype': {np.int8, np.uint8, np.int16, np.uint16}})) -@out_qs_constraint(MatchAll({'dtype': {np.int8, np.uint8, np.int16, np.uint16}})) +@in_qs_constraint(MatchAll({'dtype': {np.int8, np.uint8, np.int16, np.uint16, np.float16, bfloat16}})) +@out_qs_constraint(MatchAll({'dtype': {np.int8, np.uint8, np.int16, np.uint16, np.float16, bfloat16}})) class ExpressionFusionMult(MultQuantizionHandler): @classmethod def _quantize(cls, params, in_qs, stats, **kwargs): diff --git a/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py b/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py index 57f1fb857..9b49b245c 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py @@ -311,22 +311,26 @@ def _quantize_ne16(cls, params, in_qs, stats, input_dtype, **kwargs): G = kwargs['G'] weights_node = cls.get_weights_node(G, fusion if fusion else params) min_val, max_val = None, None + wbits = (min(in_qs[1].bits, opts['weight_bits']) + if 'weight_bits' not in opts['set_on_node'] else opts['weight_bits']) weights_q = QType.from_array_sq(arr=weights_node.dqvalue, quantized_dimension=cls.get_quantized_dimension( params, opts), dtype=np.uint8, narrow_range=opts['narrow_weights'], - 
bit_pack=opts['weight_bits'], + bit_pack=wbits, no_compression=True, - bits=opts['weight_bits']) + bits=wbits) in_q = in_qs[0] - in_q = limit_input_precision( - params, input_bits, in_q, params.filter.sz, - opts['narrow_weights'], opts['weight_bits'], - opts.get('max_precision_limit', MAX_PRECISION_LIMIT_OPTION['default']), - out_ranges=stats.get('range_out'), - w_qs=[weights_q]) + if input_bits > 8: + in_q = limit_input_precision( + params, input_bits, in_q, params.filter.sz, + opts['narrow_weights'], wbits, + opts.get('max_precision_limit', + MAX_PRECISION_LIMIT_OPTION['default']), + out_ranges=stats.get('range_out'), + w_qs=[weights_q]) assert in_q.dtype == input_dtype diff --git a/tools/nntool/quantization/multiplicative/quantizers/global_pooling_mult.py b/tools/nntool/quantization/multiplicative/quantizers/global_pooling_mult.py index d32c2009b..38fb1492f 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/global_pooling_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/global_pooling_mult.py @@ -68,12 +68,16 @@ def _quantize(cls, params, in_qs, stats, **kwargs): params.name, o_q.min, o_q.max, "asymmetric" if o_q.asymmetric else "symmetric") elif isinstance(params, GlobalAveragePoolParameters) or isinstance(params, GlobalSumPoolParameters): cls.check_valid_ranges(params, stats, idx=0, dirs='in') + in_qs = cls.force_symmetric(in_qs) + if in_qs is None: + return None + in_q = in_qs[0] # scaling needs to be based on stats and zero point o_q = QType.from_min_max_sq(stats['range_out'][0]['min'], stats['range_out'][0]['max'], dtype=out_dtype, - asymmetric=(stats['range_out'][0]['min'] == 0 and in_q.zero_point == -128)) + asymmetric=False) else: o_q = deepcopy(in_q) diff --git a/tools/nntool/quantization/multiplicative/quantizers/input_mult.py b/tools/nntool/quantization/multiplicative/quantizers/input_mult.py index dddbc5893..9a35a16ce 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/input_mult.py +++ 
b/tools/nntool/quantization/multiplicative/quantizers/input_mult.py @@ -19,7 +19,7 @@ from graph.types import InputParameters from quantization.new_qrec import QRec from quantization.qtype import QType -from quantization.quantizer_options import ALLOW_ASYMMETRIC_OPTION +from quantization.quantizer_options import ALLOW_ASYMMETRIC_OPTION, QTYPE_IND_OPTION from quantization.unified_quantization_handler import (options, out_qs_constraint, params_type) @@ -28,7 +28,8 @@ @options( - ALLOW_ASYMMETRIC_OPTION + ALLOW_ASYMMETRIC_OPTION, + QTYPE_IND_OPTION ) @params_type(InputParameters) @out_qs_constraint({'dtype': set([np.int8, np.uint8, np.int16, np.uint16])}) @@ -40,8 +41,11 @@ def _quantize(cls, params, in_qs, stats, **kwargs): out_dtype = in_qs[0].dtype force_out_q = force_out_qs and force_out_qs[0] opts = kwargs['opts'] + o_q_ind = opts.get('qtype_ind') if force_out_q: o_q = deepcopy(force_out_q) + elif o_q_ind: + o_q = deepcopy(o_q_ind) else: cls.check_valid_ranges(params, stats, idx=0, dirs='out') o_q = QType.from_min_max_sq(stats['range_out'][0]['min'], diff --git a/tools/nntool/quantization/multiplicative/quantizers/matmult_mult.py b/tools/nntool/quantization/multiplicative/quantizers/matmult_mult.py index a09de7233..8b400b01d 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/matmult_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/matmult_mult.py @@ -22,6 +22,7 @@ from graph.types.activations import (HSwishActivationParameters, TanHActivationParameters) from graph.types.base import NNEdge +from graph.types.tensor_arithmetic import MatMulTransposedParameters from quantization.multiplicative.quantizers.filter_mult import \ check_filter_options from quantization.multiplicative.quantizers.rnn_mult_ne16 import ( @@ -150,7 +151,8 @@ def _quantize(cls, params, in_qs, stats, **kwargs): kwargs['graph_update']['requires_adjust'] = True in_q2 = QType.from_array_sq( arr=in2_node.dqvalue, - quantized_dimension=len(in2_node.dqvalue.shape) - 2, + 
quantized_dimension=(len(in2_node.dqvalue.shape) - + (2 if isinstance(params, MatMulTransposedParameters) else 1)), dtype=np.int8, narrow_range=True, bits=8) @@ -235,7 +237,7 @@ def _quantize_ne16(cls, params, in_qs, stats, input_dtype, **kwargs): in_q1 = limit_input_precision( params, input_bits, in_q1, w1, False, opts['weight_bits'], opts.get('max_precision_limit', - MAX_PRECISION_LIMIT_OPTION['default']), + MAX_PRECISION_LIMIT_OPTION['default']), out_ranges=stats.get('range_out'), w_qs=[in_q2]) diff --git a/tools/nntool/quantization/multiplicative/quantizers/output_mult.py b/tools/nntool/quantization/multiplicative/quantizers/output_mult.py index 47a9c2367..4b1a0f67d 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/output_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/output_mult.py @@ -18,7 +18,8 @@ import numpy as np from graph.types import OutputParameters from quantization.new_qrec import QRec -from quantization.unified_quantization_handler import (out_qs_constraint, +from quantization.quantizer_options import QTYPE_IND_OPTION +from quantization.unified_quantization_handler import (out_qs_constraint, options, params_type, needs_stats) from ..mult_quantization_handler import MultQuantizionHandler @@ -27,7 +28,14 @@ @params_type(OutputParameters) @out_qs_constraint({'dtype': set([np.int8, np.uint8, np.int16])}) @needs_stats(False) +@options(QTYPE_IND_OPTION) class OutputMult(MultQuantizionHandler): @classmethod def _quantize(cls, params, in_qs, stats, **kwargs): - return QRec.scaled(in_qs=deepcopy(in_qs), out_qs=deepcopy(in_qs)) + opts = kwargs['opts'] + in_q_ind = opts.get('qtype_ind') + if in_q_ind: + in_q = deepcopy(in_q_ind) + else: + in_q = deepcopy(in_qs[0]) + return QRec.scaled(in_qs=[in_q], out_qs=[in_q]) diff --git a/tools/nntool/quantization/multiplicative/quantizers/softmax_tanh_mult.py b/tools/nntool/quantization/multiplicative/quantizers/softmax_tanh_mult.py index b2fbff55b..16ee1aad0 100644 --- 
a/tools/nntool/quantization/multiplicative/quantizers/softmax_tanh_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/softmax_tanh_mult.py @@ -18,7 +18,7 @@ from graph.types.activations import HTanHActivationParameters from quantization.new_qrec import QRec from quantization.qtype import QType -from quantization.quantizer_options import SOFTMAX_OUT_8BITS_OPTION +from quantization.quantizer_options import SOFTMAX_OUT_8BITS_OPTION, OUTPUT_SIZE_OPTION from quantization.unified_quantization_handler import (in_qs_constraint, out_qs_constraint, params_type, options) @@ -39,7 +39,7 @@ def _quantize(cls, params, in_qs, stats, **kwargs): force_out_q = force_out_qs and force_out_qs[0] opts = kwargs['opts'] if force_out_q: - if force_out_q.forced_scale or force_out_q.forced_zero_point: + if force_out_q.forced_scale or (force_out_q.forced_zero_point and not np.all(in_qs[0].zero_point == 0)): return None if in_qs[0].dtype == np.int8: dtypes = [np.int8, np.int16] diff --git a/tools/nntool/quantization/new_qrec.py b/tools/nntool/quantization/new_qrec.py index 7438a5f69..4f0e6badb 100644 --- a/tools/nntool/quantization/new_qrec.py +++ b/tools/nntool/quantization/new_qrec.py @@ -158,7 +158,7 @@ def get_outputs(self, del params if ktype == "symmetric": if self._auto_dequantize_outputs: - return [self.out_qs[idx].dequantize(output_tensor) for idx, output_tensor in enumerate(output_tensors)] + return [self.out_qs[idx].dequantize(x) for idx, x in enumerate(output_tensors)] output_tensors = [self.out_qs[idx].clip(output_tensor) for idx, output_tensor in enumerate(output_tensors)] return output_tensors diff --git a/tools/nntool/quantization/qtype.py b/tools/nntool/quantization/qtype.py index 7d409ba36..45324622d 100644 --- a/tools/nntool/quantization/qtype.py +++ b/tools/nntool/quantization/qtype.py @@ -658,7 +658,9 @@ def calculate_scale(rmin, rmax, qmin, qmax, dtype, asymmetric=False, scale = np.maximum( divide_ignore(rpos_range, qpos_range), divide_ignore(rneg_range, 
qneg_range)) - return np.atleast_1d(scale), np.atleast_1d(zero_point) + scale = np.atleast_1d(scale) + scale[scale == 0] = 1 + return scale, np.atleast_1d(zero_point) elif asymmetric: if narrow_range: raise ValueError( @@ -705,7 +707,9 @@ def calculate_scale(rmin, rmax, qmin, qmax, dtype, asymmetric=False, nudged_zero_point = qmax else: nudged_zero_point = np.round(zero_point).astype(dtype) - return np.atleast_1d(scale), np.atleast_1d(nudged_zero_point) + scale = np.atleast_1d(scale) + scale[scale == 0] = 1 + return scale, np.atleast_1d(nudged_zero_point) else: scale = QType.calculate_symmetric_scales( qrange, rmin, rmax, narrow_range=narrow_range) @@ -718,7 +722,9 @@ def calculate_scale(rmin, rmax, qmin, qmax, dtype, asymmetric=False, else: zero_point = np.atleast_1d( np.ceil(qrange/2) + qmin).astype(dtype) - return np.atleast_1d(scale), zero_point + scale = np.atleast_1d(scale) + scale[scale == 0] = 1 + return scale, zero_point def recalculate_scale(self, min_val, max_val, narrow_range=None): if narrow_range is None: diff --git a/tools/nntool/quantization/quantizer/new_quantizer.py b/tools/nntool/quantization/quantizer/new_quantizer.py index f34a497c0..51a353c09 100644 --- a/tools/nntool/quantization/quantizer/new_quantizer.py +++ b/tools/nntool/quantization/quantizer/new_quantizer.py @@ -96,6 +96,14 @@ def options(self, val): def set_options(self, **kwargs): self._options.update(kwargs) + def update_options(self, new_options): + for k, v in new_options.items(): + old_v = self._options.get(k, None) + if isinstance(old_v, dict) and isinstance(v, dict): + old_v.update(v) + else: + self._options[k] = v + @property def schemes(self): return self._schemes @@ -369,6 +377,7 @@ def get_options(self, nid, handler=None): node_options = self._options.get(nid, {}) opts.update({k: v for k, v in node_options.items() if k in opts}) + opts['set_on_node'] = list(node_options.keys()) else: opts = {k: v for k, v in self._options.items() if not isinstance(k, NodeId)} diff --git 
a/tools/nntool/quantization/quantizer_options.py b/tools/nntool/quantization/quantizer_options.py index 48f114c18..59e56015f 100644 --- a/tools/nntool/quantization/quantizer_options.py +++ b/tools/nntool/quantization/quantizer_options.py @@ -89,6 +89,12 @@ 'default': 8 } +OUTPUT_SIZE_OPTION = { + 'name': 'output_size', + 'type': None, + 'default': None +} + FORCE_EXTERNAL_SIZE_OPTION = { 'name': 'force_external_size', 'type': int, diff --git a/tools/nntool/reports/draw_graph_reporter.py b/tools/nntool/reports/draw_graph_reporter.py index da7b40b5c..812299e88 100644 --- a/tools/nntool/reports/draw_graph_reporter.py +++ b/tools/nntool/reports/draw_graph_reporter.py @@ -380,7 +380,7 @@ def report_expression(self, dot: Digraph, G: NNGraph, 'labelloc': 't', 'labeljust': 'l'}, node_attr={'style': 'solid(dashed)'}) as sub: - for var, func in func_col.functions.items(): + for var, func in func_col: node_id, shape = self.report_symbol( sub, func, intermediates, anonymise=anonymise) var_name = self.get_next('Var') if anonymise else var.name diff --git a/tools/nntool/reports/quantization_reporter.py b/tools/nntool/reports/quantization_reporter.py index 8b8673fc6..09ce00a52 100644 --- a/tools/nntool/reports/quantization_reporter.py +++ b/tools/nntool/reports/quantization_reporter.py @@ -14,6 +14,7 @@ # along with this program. If not, see . 
from graph.types import ConstantInputParameters +from graph.types.activations import ActivationParameters from graph.types.base import FilterParameters from utils.node_id import NodeId from utils.tabular import Tabular, TabularColumn @@ -92,6 +93,8 @@ def report(self, G, stats, nodes=None): row.append(self.emit_qs([qrec.cache[key]])) else: row.append("") + elif "scale_mul_biases_q" in qrec.cache: + row += ["", "", self.emit_qs([qrec.cache["scale_mul_biases_q"]]), "", ""] else: row += ["", "", "", "", ""] else: diff --git a/tools/nntool/utils/node_id.py b/tools/nntool/utils/node_id.py index 2703db6b5..b44e6105e 100644 --- a/tools/nntool/utils/node_id.py +++ b/tools/nntool/utils/node_id.py @@ -25,6 +25,12 @@ def __init__(self, node, fnode=None): fnode_name = None if fnode is None else fnode if isinstance(fnode, str) else fnode.name self._id = [node.name, "" if fnode is None else fnode_name] + @property + def key(self): + if self._id[1]: + return self._id + return self._id[0] + @property def id(self): return self._id diff --git a/tools/nntool/utils/random_iter.py b/tools/nntool/utils/random_iter.py new file mode 100644 index 000000000..8007aeade --- /dev/null +++ b/tools/nntool/utils/random_iter.py @@ -0,0 +1,42 @@ +# Copyright (C) 2022 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +import numpy as np + + +class RandomIter(): + def __init__(self, count, shapes, ranges, gen=None) -> None: + self._vars = list(zip(shapes, ranges)) + self._gen = gen + self._count = count + self._cur = count + if self._gen is None: + self._gen = np.random.default_rng() + + def __iter__(self): + self._cur = self._count + return self + + def __next__(self): + if self._cur <= 0: + raise StopIteration() + self._cur -= 1 + return self.val() + + def val(self): + res = [] + for shape, (minv, maxv) in self._vars: + res.append((self._gen.random(shape) * (maxv - minv)) + minv) + return res diff --git a/utils/gaptest/gaptest b/utils/gaptest/gaptest index e6d506785..ce077a9fa 100755 --- a/utils/gaptest/gaptest +++ b/utils/gaptest/gaptest @@ -77,6 +77,7 @@ sub exc_cmd_make { my $os = shift; my $platform = shift; my $flags = shift; + my $compile_only = shift; my $tags = shift; my $pre = shift; my $post = shift; @@ -85,18 +86,24 @@ sub exc_cmd_make { my $exec_dir = shift; my $res = 0; my $make_path = $basedir."/".$exec_dir; - system ("make -C ".$make_path." PMSIS_OS=".$os." build_dir_ext=".$target_name." clean"); + system ("make -C ".$make_path." ".$flags." PMSIS_OS=".$os." build_dir_ext=".$target_name." clean"); if($pre == 1) { `make prepare`; } + my $targets = "all"; + if (not $compile_only) + { + $targets = "${targets} run"; + } + chdir $exec_dir; - say "make -C ".$make_path." ".$flags." PMSIS_OS=".$os." platform=".$platform." build_dir_ext=".$target_name." all run"; my ($seconds_before, $seconds_after); timeout $timeout => sub { $seconds_before = time(); - $res = system ("make -C ".$make_path." ".$flags." PMSIS_OS=".$os." platform=".$platform." build_dir_ext=".$target_name." all run"); + say "make -C ".$make_path." ".$flags." PMSIS_OS=".$os." platform=".$platform." build_dir_ext=".$target_name." $targets"; + $res = system ("make -C ".$make_path." ".$flags." PMSIS_OS=".$os." platform=".$platform." build_dir_ext=".$target_name." 
$targets"); $seconds_after = time(); $res = $res >>=8; say $target_name." : Result is: ".$res; @@ -122,6 +129,7 @@ sub exc_cmd_cmake { my $os = shift; my $platform = shift; my $flags = shift; + my $compile_only = shift; my $tags = shift; my $pre = shift; my $post = shift; @@ -150,15 +158,19 @@ sub exc_cmd_cmake { say "CMAKE_GENERATOR=Ninja cmake -S $make_path -B $make_path/build".-$target_name." -DCONFIG_GAP_SDK_HOME=$sdk_root_path -DCMAKE_MODULE_PATH=$sdk_root_path/utils/cmake -DCONFIG_CHIP=$chip -DCONFIG_CHIP_VERSION=$chip_version -DBOARD=$ENV{'BOARD_NAME'} $cmake_flags"; $res = system ("CMAKE_GENERATOR=Ninja cmake -S $make_path -B $make_path/build".-$target_name." -DCONFIG_GAP_SDK_HOME=$sdk_root_path -DCMAKE_MODULE_PATH=$sdk_root_path/utils/cmake -DCONFIG_CHIP=$chip -DCONFIG_CHIP_VERSION=$chip_version -DBOARD=$ENV{'BOARD_NAME'} $cmake_flags"); say "cmake --build $make_path/build"; - $res = system("cmake --build $make_path/build".-$target_name); - say "cmake --build $make_path/build --target run"; - $res = system("cmake --build $make_path/build".-$target_name." --target run"); - $seconds_after = time(); - $res = $res >>=8; - say $target_name." : Result is: ".$res; - if($post == 1) + + if (not $compile_only) { - $res = system("cmake --build build --target postrun"); + $res = system("cmake --build $make_path/build".-$target_name); + say "cmake --build $make_path/build --target run"; + $res = system("cmake --build $make_path/build".-$target_name." --target run"); + $seconds_after = time(); + $res = $res >>=8; + say $target_name." 
: Result is: ".$res; + if($post == 1) + { + $res = system("cmake --build build --target postrun"); + } } }; my $seconds = $seconds_after - $seconds_before; @@ -256,6 +268,7 @@ sub process_yml { my @arg = ($config_os, $config_platform, $test_variant->{flags}, + $test_variant->{compile_only}, '', 0, 0, diff --git a/utils/gaptest/lib/gaptest/Loader.pm b/utils/gaptest/lib/gaptest/Loader.pm index 12e0a835a..441816565 100644 --- a/utils/gaptest/lib/gaptest/Loader.pm +++ b/utils/gaptest/lib/gaptest/Loader.pm @@ -144,6 +144,15 @@ sub load_test_variant { $test_variant{flags} = ""; } + $test_variant{compile_only} = 0; + if (defined $section->{compile_only}) + { + if ($section->{compile_only} eq "true") + { + $test_variant{compile_only} = 1; + } + } + if(defined $section->{duration}) { $test_variant{duration} = $section->{duration}; diff --git a/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk-fpga.elf b/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk-fpga.elf deleted file mode 100755 index 05fe0de49..000000000 Binary files a/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk-fpga.elf and /dev/null differ diff --git a/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk-mram.elf b/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk-mram.elf deleted file mode 100755 index 78e5c9b04..000000000 Binary files a/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk-mram.elf and /dev/null differ diff --git a/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk.elf b/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk.elf deleted file mode 100755 index 63d1e132b..000000000 Binary files a/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk.elf and /dev/null differ diff --git a/utils/openocd_tools/gap_bins/gap_flasher-gap9_v2.elf b/utils/openocd_tools/gap_bins/gap_flasher-gap9_v2.elf deleted file mode 100755 index 29564a524..000000000 Binary files a/utils/openocd_tools/gap_bins/gap_flasher-gap9_v2.elf and /dev/null differ diff --git a/utils/openocd_tools/src/fuser/gap9-efuse-gen 
b/utils/openocd_tools/src/fuser/gap9-efuse-gen deleted file mode 100755 index ef1a4462a..000000000 --- a/utils/openocd_tools/src/fuser/gap9-efuse-gen +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env python3 - -# -# Copyright (C) 2019 GreenWaves Technologies -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import argparse -import runner.chips.gap9_v2_efuse as efuse - -parser = argparse.ArgumentParser(description='Generate gap9 efuse map') - -parser.add_argument("--usecase", dest="usecase", default=None, help="specify the usecase") -parser.add_argument("--output", dest="output", default=None, help="specify the output file path") -parser.add_argument("--name", dest="name", default=None, help="specify the structure name") - -args = parser.parse_args() - -efuse_map = efuse.Efuse_map() - -efuse_map.get_efuse('info_1').get_field('icache_enabled').set(1) - -# By default, only activate fast clock and fed other blocks like timer at 24Mhz/16 -fast_osc_freq_div = 24576062.0 / 16 -efuse_map.get_efuse('info_1').get_field('osc_ctrl_setup').set(1) -efuse_map.get_efuse('info_1').get_field('osc_ctrl').set(1) -efuse_map.get_efuse('info_1').get_field('fast_clk_div_pow2_setup').set(1) -efuse_map.get_efuse('fast_clk_div_pow2').set(4 | (1<<3)) -efuse_map.get_efuse('info_2').get_field('wake_osc_ctrl_setup').set(1) -efuse_map.get_efuse('info_2').get_field('wake_osc_ctrl').set(1) -efuse_map.get_efuse('info_2').get_field('wake_fast_clk_div_pow2_setup').set(1) 
-efuse_map.get_efuse('wake_fast_clk_div_pow2').set(4 | (1<<3)) - -# Lock FLL soc and periph -efuse_map.get_efuse('info_1').get_field('fll_global_setup').set(1) -efuse_map.get_efuse('info_1').get_field('fll_dco0_setup').set(1) -# FLL DRR (DCO min | DCO max) -efuse_map.get_efuse('fll_drr').set((0 << 0) | (0x1ff << 16)) -# Pre-lock FLL CCR1 (CLK0 DIV | CLK1 DIV) -efuse_map.get_efuse('fll_ccr1_pre_lock').set((0 << 0) | (0 << 8)) -# Post-lock FLL CCR1 (CLK0 DIV | CLK1 DIV) -efuse_map.get_efuse('fll_ccr1_post_lock').set((0 << 0) | (3 << 8)) -# FLL CCR2 (CLK0 SEL | CLK1 SEL | CLK2_SEL | CLK3_SEL | CKG0) -efuse_map.get_efuse('fll_ccr2').set((0x1 << 0) | (0x1 << 4) | (0x1 << 8) | (0x2 << 12) | (1 << 16)) -# DCO0 CR1 (DCO EN | CLOSE LOOP | LOOP GAIN | LOCK TOL | ITG | ASSERT CYCLES) -efuse_map.get_efuse('fll_f0cr1').set((1 << 0) | (1 << 1) | (4 << 4) | (10 << 8) | (24 << 16) | (6 << 26)) -# DCO0 CR2 (MFI | DCO CODE) -efuse_map.get_efuse('fll_f0cr2').set((166 << 0) | (0x1A << 16)) - -# FLL DRR (DCO min | DCO max) -efuse_map.get_efuse('wakeup_fll_drr').set((0 << 0) | (0x1ff << 16)) -# Pre-lock FLL CCR1 (CLK0 DIV | CLK1 DIV) -efuse_map.get_efuse('wakeup_fll_ccr1_pre_lock').set((0 << 0) | (0 << 8)) -# Post-lock FLL CCR1 (CLK0 DIV | CLK1 DIV) -efuse_map.get_efuse('wakeup_fll_ccr1_post_lock').set((0 << 0) | (1 << 8)) -# FLL CCR2 (CLK0 SEL | CLK1 SEL | CLK2_SEL | CLK3_SEL | CKG0) -efuse_map.get_efuse('wakeup_fll_ccr2').set((0x1 << 0) | (0x1 << 4) | (0x1 << 8) | (0x2 << 12) | (1 << 16)) -# DCO0 CR1 (DCO EN | CLOSE LOOP | LOOP GAIN | LOCK TOL | ITG | ASSERT CYCLES) -efuse_map.get_efuse('wakeup_fll_f0cr1').set((1 << 0) | (1 << 1) | (4 << 4) | (10 << 8) | (24 << 16) | (6 << 26)) -# DCO0 CR2 (MFI | DCO CODE) -efuse_map.get_efuse('wakeup_fll_f0cr2').set((166 << 0) | (0x1A << 16)) - - -if args.usecase == 'mram': - efuse_map.get_efuse('info_1').get_field('bootmode').set(3) - efuse_map.get_efuse('info_1').get_field('mram_reset_wait').set(1) - 
efuse_map.get_efuse('info_2').get_field('wake_mram_reset_wait').set(1) - efuse_map.get_efuse('mram_reset_wait_cycles').set(math.ceil(0.000003*fast_osc_freq_div)) - efuse_map.get_efuse('wake_mram_reset_wait_cycles').set(math.ceil(0.000003*fast_osc_freq_div)) - efuse_map.get_efuse('info_2').get_field('clkdiv_setup').set(1) - efuse_map.get_efuse('info_2').get_field('clkdiv').set(5) - efuse_map.get_efuse('info_3').get_field('flash_wait').set(1) - efuse_map.get_efuse('flash_wait').set(math.ceil(0.00002*fast_osc_freq_div)) - - - - -if args.output is not None: - with open(args.output, 'w') as output_file: - efuse_map.gen_c_struct(args.name, output_file) diff --git a/utils/openocd_tools/tcl/gap9reva.tcl b/utils/openocd_tools/tcl/gap9reva.tcl deleted file mode 100644 index 4a2da6f0a..000000000 --- a/utils/openocd_tools/tcl/gap9reva.tcl +++ /dev/null @@ -1,81 +0,0 @@ -adapter_khz 5000 -transport select jtag -# Channel 1 is taken by Xilinx JTAG -#reset_config srst_pulls_trst -reset_config trst_and_srst -#adapter_nsrst_assert_width 1000 -#adapter_nsrst_delay 1000 -#ftdi_tdo_sample_edge falling -set _CHIPNAME riscv -jtag newtap $_CHIPNAME cpu -irlen 5 -expected-id 0x00000001 -jtag newtap $_CHIPNAME unknown0 -irlen 4 -expected-id 0x10102001 -foreach t [jtag names] { - puts [format "TAP: %s\n" $t] -} -set _TARGETNAME $_CHIPNAME.cpu - -target create $_TARGETNAME riscv -chain-position $_TARGETNAME -coreid 0x3e0 -gdb_report_data_abort enable -gdb_report_register_access_error enable - -riscv set_reset_timeout_sec 120 -riscv set_command_timeout_sec 120 -# prefer to use sba for system bus access -riscv set_prefer_sba on -proc jtag_init {} { - puts "----------- jtag init" - # ensure chip reset done: this might not always be what we want - jtag_reset 0 0 - sleep 1 - jtag_reset 0 1 - sleep 10 - jtag_reset 0 0 - sleep 1 - # ensure jtag reset is done - pathmove RESET - pathmove IDLE - # "going to examine" - #riscv.cpu arp_examine - # "examination done" - puts "----------- jtag init done" -} 
-proc init_reset {mode} { - puts "----------- init reset" - # ensure chip reset done: this might not always be what we want - # ensure jtag reset is done - jtag_reset 0 0 - sleep 1 - jtag_reset 0 1 - sleep 10 - jtag_reset 0 0 - sleep 20 - pathmove RESET - pathmove IDLE - # "going to examine" - #riscv.cpu arp_examine - # "examination done" - #if { $mode == 0x1} { - # riscv.cpu arp_halt - #} -} -proc load_and_start_binary { elf_file pc_entry } { - puts "----------- load and start bin" - # first ensure we are rest and halt so that pc is accessible - riscv.cpu mww 0x1A100008 0x0fff1907 - riscv.cpu mww 0x1A100018 0x0fff1907 - riscv.cpu mww 0x1A100028 0x0fff1907 - riscv.cpu mww 0x1A100004 0xd0885f5e - riscv.cpu mww 0x1A100014 0xd0885f5e - riscv.cpu mww 0x1A100024 0xd0885f5e - #reset halt - load_image ${elf_file} 0x0 elf - reg pc ${pc_entry} - resume -} -# dump jtag chain -#scan_chain -#telnet_port 6666 -init -reset halt -riscv.cpu arm semihosting enable -echo "Ready for Remote Connections" diff --git a/utils/openocd_tools/tcl/gap9revb-bootmode.tcl b/utils/openocd_tools/tcl/gap9revb-bootmode.tcl deleted file mode 100644 index 4068cc609..000000000 --- a/utils/openocd_tools/tcl/gap9revb-bootmode.tcl +++ /dev/null @@ -1,134 +0,0 @@ -adapter_khz 500 - -adapter driver remote_bitbang -remote_bitbang_port 9999 -remote_bitbang_host localhost - -reset_config srst_only srst_nogate - -set _CHIPNAME gap9 - -jtag newtap $_CHIPNAME riscv -irlen 5 -expected-id 0x20020bcb -jtag newtap $_CHIPNAME pulp -irlen 4 -expected-id 0x20021bcb - -foreach t [jtag names] { - puts [format "TAP: %s\n" $t] -} - - -set _TAP_RISCV $_CHIPNAME.riscv -set _TAP_PULP $_CHIPNAME.pulp -set _CL0 $_CHIPNAME.cl0 -set _CL1 $_CHIPNAME.cl1 -set _CL2 $_CHIPNAME.cl2 -set _CL3 $_CHIPNAME.cl3 -set _CL4 $_CHIPNAME.cl4 -set _CL5 $_CHIPNAME.cl5 -set _CL6 $_CHIPNAME.cl6 -set _CL7 $_CHIPNAME.cl7 -set _CL8 $_CHIPNAME.cl8 -set _FC $_CHIPNAME.fc - -target create $_FC riscv -chain-position $_TAP_RISCV -coreid 0x9 -#target create 
$_CL0 riscv -chain-position $_TARGETNAME -coreid 0x0 -defer-examine -#target create $_CL1 riscv -chain-position $_TARGETNAME -coreid 0x1 -defer-examine -#target create $_CL2 riscv -chain-position $_TARGETNAME -coreid 0x2 -defer-examine -#target create $_CL3 riscv -chain-position $_TARGETNAME -coreid 0x3 -defer-examine -#target create $_CL4 riscv -chain-position $_TARGETNAME -coreid 0x4 -defer-examine -#target create $_CL5 riscv -chain-position $_TARGETNAME -coreid 0x5 -defer-examine -target create $_CL6 riscv -chain-position $_TAP_RISCV -coreid 0x6 -defer-examine -target create $_CL7 riscv -chain-position $_TAP_RISCV -coreid 0x7 -defer-examine -#target create $_CL8 riscv -chain-position $_TARGETNAME -coreid 0x8 -defer-examine -target smp $_CL6 $_CL7 -#target create $_CL8 riscv -chain-position $_TARGETNAME -coreid 0x8 -gdb-port 6666 -defer-examine -#target create $_CL8 riscv -chain-position $_TARGETNAME -coreid 0x8 -gdb-port 6666 - - -$_CL6 configure -rtos hwthread -$_CL7 configure -rtos hwthread - -proc cl6_attach_proc { } { - $::_CL6 arp_examine - $::_CL7 arp_examine - # since smp, this will halt all concerned code - $::_CL6 arp_halt - #$::_CL7 arp_halt - $::_CL6 arm semihosting enable - $::_CL7 arm semihosting enable -} -$_CL6 configure -event gdb-attach cl6_attach_proc - -gdb_report_data_abort enable -gdb_report_register_access_error enable - -riscv set_reset_timeout_sec 1440 -riscv set_command_timeout_sec 1440 - -# prefer to use sba for system bus access -riscv set_prefer_sba on - - -proc poll_confreg { value } { - irscan $::_TAP_PULP 0x6 - # size then value - set ret [eval drscan $::_TAP_PULP 0x8 $value] - puts "ret=$ret" - while { !$ret } { - irscan $::_TAP_PULP 0x6 - # size then value - set ret [eval drscan $::_TAP_PULP 0x8 $value] - puts "ret=$ret" - } -} - -proc jtag_init {} { - puts "jtag init" - targets $::_FC - jtag_reset 0 1 - sleep 1 - jtag_reset 0 0 - sleep 1 - # wait for jtag ready - poll_confreg 0x1 - echo "confreg polling done" - jtag 
arp_init-reset -} - -proc init_reset {mode} { - puts "hello" - targets $::_FC - jtag_reset 0 1 - sleep 1 - jtag_reset 0 0 - sleep 1 - # wait for jtag ready - poll_confreg 0x1 - echo "confreg polling done" - jtag arp_init-reset -} - -proc load_and_start_binary { elf_file pc_entry } { - targets $::_FC - # first ensure we are rest and halt so that pc is accessible - #$::_FC arp_reset assert 1 - reset halt - load_image ${elf_file} 0x0 elf - reg pc ${pc_entry} - resume -} - - -# dump jtag chain -#scan_chain - -targets $_FC -init - - -#targets -#ftdi_set_signal nSRST 1 -halt - -$::_FC arm semihosting enable - -echo "Ready for Remote Connections" diff --git a/utils/openocd_tools/tcl/gap9revb.tcl b/utils/openocd_tools/tcl/gap9revb.tcl deleted file mode 100644 index c3246be19..000000000 --- a/utils/openocd_tools/tcl/gap9revb.tcl +++ /dev/null @@ -1,167 +0,0 @@ -adapter_khz 5000 - -#interface jlink -#transport select jtag -# Channel 1 is taken by Xilinx JTAG -#reset_config srst_pulls_trst - -#adapter driver remote_bitbang -#remote_bitbang_port 9999 -#remote_bitbang_host localhost - -reset_config srst_only srst_nogate - -set _CHIPNAME gap9 - -jtag newtap $_CHIPNAME riscv -irlen 5 -expected-id 0x20020bcb -jtag newtap $_CHIPNAME pulp -irlen 4 -expected-id 0x20021bcb - -foreach t [jtag names] { - puts [format "TAP: %s\n" $t] -} - - -set _TAP_RISCV $_CHIPNAME.riscv -set _TAP_PULP $_CHIPNAME.pulp -set _CL0 $_CHIPNAME.cl0 -set _CL1 $_CHIPNAME.cl1 -set _CL2 $_CHIPNAME.cl2 -set _CL3 $_CHIPNAME.cl3 -set _CL4 $_CHIPNAME.cl4 -set _CL5 $_CHIPNAME.cl5 -set _CL6 $_CHIPNAME.cl6 -set _CL7 $_CHIPNAME.cl7 -set _CL8 $_CHIPNAME.cl8 -set _FC $_CHIPNAME.fc - -target create $_FC riscv -chain-position $_TAP_RISCV -coreid 0x9 -#target create $_CL0 riscv -chain-position $_TAP_RISCV -coreid 0x0 -defer-examine -#target create $_CL1 riscv -chain-position $_TAP_RISCV -coreid 0x1 -defer-examine -#target create $_CL2 riscv -chain-position $_TAP_RISCV -coreid 0x2 -defer-examine -#target create $_CL3 riscv 
-chain-position $_TAP_RISCV -coreid 0x3 -defer-examine -#target create $_CL4 riscv -chain-position $_TAP_RISCV -coreid 0x4 -defer-examine -#target create $_CL5 riscv -chain-position $_TAP_RISCV -coreid 0x5 -defer-examine -#target create $_CL6 riscv -chain-position $_TAP_RISCV -coreid 0x6 -defer-examine -#target create $_CL7 riscv -chain-position $_TAP_RISCV -coreid 0x7 -defer-examine -#target create $_CL8 riscv -chain-position $_TAP_RISCV -coreid 0x8 -defer-examine -#target smp $_FC $_CL0 $_CL1 $_CL2 $_CL3 $_CL4 $_CL5 $_CL6 $_CL7 $_CL8 -#target create $_CL8 riscv -chain-position $_TARGETNAME -coreid 0x8 -gdb-port 6666 -defer-examine -#target create $_CL8 riscv -chain-position $_TARGETNAME -coreid 0x8 -gdb-port 6666 - - -#$_CL0 configure -rtos hwthread -#$_CL1 configure -rtos hwthread -#$_CL2 configure -rtos hwthread -#$_CL3 configure -rtos hwthread -#$_CL4 configure -rtos hwthread -#$_CL5 configure -rtos hwthread -#$_CL6 configure -rtos hwthread -#$_CL7 configure -rtos hwthread -#$_CL8 configure -rtos hwthread -#$_FC configure -rtos hwthread - -#proc cl6_attach_proc { } { -# $::_CL6 arp_examine -# $::_CL7 arp_examine - # since smp, this will halt all concerned code -# $::_CL6 arp_halt - #$::_CL7 arp_halt -# $::_CL6 arm semihosting enable -# $::_CL7 arm semihosting enable -#} -#$_CL6 configure -event gdb-attach cl6_attach_proc - -gdb_report_data_abort enable -gdb_report_register_access_error enable - -riscv set_reset_timeout_sec 1440 -riscv set_command_timeout_sec 1440 - -# prefer to use sba for system bus access -riscv set_prefer_sba on - - -proc poll_confreg { value } { - irscan $::_TAP_PULP 0x6 - # size then value - set ret [eval drscan $::_TAP_PULP 0x8 $value] - puts "ret=$ret" - while { $ret != 0x3 } { - irscan $::_TAP_PULP 0x6 - # size then value - set ret [eval drscan $::_TAP_PULP 0x8 $value] - puts "ret=$ret" - } -} - -proc jtag_init {} { - puts "jtag init" - targets $::_FC - jtag_reset 0 1 - sleep 1 - jtag_reset 0 0 - sleep 1 - # wait for jtag ready - 
poll_confreg 0x1 - echo "confreg polling done" - #$::_CL0 arp_examine - #$::_CL1 arp_examine - #$::_CL2 arp_examine - #$::_CL3 arp_examine - #$::_CL4 arp_examine - #$::_CL5 arp_examine - #$::_CL6 arp_examine - #$::_CL7 arp_examine - #$::_CL8 arp_examine - $::_FC arp_examine - #$::_CL0 arp_halt - #$::_CL1 arp_halt - #$::_CL2 arp_halt - #$::_CL3 arp_halt - #$::_CL4 arp_halt - #$::_CL5 arp_halt - #$::_CL6 arp_halt - #$::_CL7 arp_halt - #$::_CL8 arp_halt - #$::_FC arp_halt - echo "examine done" - jtag arp_init -} - -proc init_reset {mode} { - puts "hello" - #targets $::_FC - jtag_reset 0 1 - sleep 1 - jtag_reset 0 0 - sleep 1 - # wait for jtag ready - poll_confreg 0x1 - echo "confreg polling done" - jtag arp_init -} - -proc load_and_start_binary { elf_file pc_entry } { - targets $::_FC - # first ensure we are rest and halt so that pc is accessible - #$::_FC arp_reset assert 1 - reset halt - load_image ${elf_file} 0x0 elf - reg pc ${pc_entry} - resume -} - - -# dump jtag chain -#scan_chain - -init - - -#targets -#ftdi_set_signal nSRST 1 -halt - -$::_FC arm semihosting enable - -echo "Ready for Remote Connections" diff --git a/utils/openocd_tools/tcl/gap9revb_gdb.tcl b/utils/openocd_tools/tcl/gap9revb_gdb.tcl deleted file mode 100644 index c21b8fc38..000000000 --- a/utils/openocd_tools/tcl/gap9revb_gdb.tcl +++ /dev/null @@ -1,245 +0,0 @@ -adapter_khz 500 - -#interface jlink -#transport select jtag -# Channel 1 is taken by Xilinx JTAG -#reset_config srst_pulls_trst - -#adapter driver remote_bitbang -#remote_bitbang_port 9999 -#remote_bitbang_host localhost - -reset_config srst_only srst_nogate - -set _CHIPNAME gap9 - -jtag newtap $_CHIPNAME riscv -irlen 5 -expected-id 0x20020bcb -jtag newtap $_CHIPNAME pulp -irlen 4 -expected-id 0x20021bcb - -foreach t [jtag names] { - puts [format "TAP: %s\n" $t] -} - - -set _TAP_RISCV $_CHIPNAME.riscv -set _TAP_PULP $_CHIPNAME.pulp -set _CL0 $_CHIPNAME.cl0 -set _CL1 $_CHIPNAME.cl1 -set _CL2 $_CHIPNAME.cl2 -set _CL3 $_CHIPNAME.cl3 -set 
_CL4 $_CHIPNAME.cl4 -set _CL5 $_CHIPNAME.cl5 -set _CL6 $_CHIPNAME.cl6 -set _CL7 $_CHIPNAME.cl7 -set _CL8 $_CHIPNAME.cl8 -set _FC $_CHIPNAME.fc - -target create $_FC riscv -chain-position $_TAP_RISCV -coreid 0x9 -target create $_CL0 riscv -chain-position $_TAP_RISCV -coreid 0x0 -defer-examine -target create $_CL1 riscv -chain-position $_TAP_RISCV -coreid 0x1 -defer-examine -target create $_CL2 riscv -chain-position $_TAP_RISCV -coreid 0x2 -defer-examine -target create $_CL3 riscv -chain-position $_TAP_RISCV -coreid 0x3 -defer-examine -target create $_CL4 riscv -chain-position $_TAP_RISCV -coreid 0x4 -defer-examine -target create $_CL5 riscv -chain-position $_TAP_RISCV -coreid 0x5 -defer-examine -target create $_CL6 riscv -chain-position $_TAP_RISCV -coreid 0x6 -defer-examine -target create $_CL7 riscv -chain-position $_TAP_RISCV -coreid 0x7 -defer-examine -target create $_CL8 riscv -chain-position $_TAP_RISCV -coreid 0x8 -defer-examine -#target create $_CL8 riscv -chain-position $_TARGETNAME -coreid 0x8 -gdb-port 6666 -defer-examine -#target create $_CL8 riscv -chain-position $_TARGETNAME -coreid 0x8 -gdb-port 6666 -target smp $_FC $_CL0 $_CL1 $_CL2 $_CL3 $_CL4 $_CL5 $_CL6 $_CL7 $_CL8 -#target smp $_FC $_CL8 - -$_CL0 configure -rtos hwthread -$_CL1 configure -rtos hwthread -$_CL2 configure -rtos hwthread -$_CL3 configure -rtos hwthread -$_CL4 configure -rtos hwthread -$_CL5 configure -rtos hwthread -$_CL6 configure -rtos hwthread -$_CL7 configure -rtos hwthread -$_CL8 configure -rtos hwthread -$_FC configure -rtos hwthread - -#proc cl6_attach_proc { } { -# $::_CL6 arp_examine -# $::_CL7 arp_examine - # since smp, this will halt all concerned code -# $::_CL6 arp_halt - #$::_CL7 arp_halt -# $::_CL6 arm semihosting enable -# $::_CL7 arm semihosting enable -#} -#$_CL6 configure -event gdb-attach cl6_attach_proc - -gdb_report_data_abort enable -gdb_report_register_access_error enable - -riscv set_reset_timeout_sec 36000 -riscv set_command_timeout_sec 36000 - -# prefer to 
use sba for system bus access -riscv set_prefer_sba on - - -proc poll_confreg { value } { - irscan $::_TAP_PULP 0x6 - # size then value - set ret [eval drscan $::_TAP_PULP 0x8 $value] - puts "ret=$ret" - while { $ret != 0x3 } { - irscan $::_TAP_PULP 0x6 - # size then value - set ret [eval drscan $::_TAP_PULP 0x8 $value] - puts "ret=$ret" - } -} - -proc cluster_reset { addr } { - # first reset the cluster - - poll off - $::_FC mww 0x10200008 0x0 - $::_FC mww 0x1a1040e4 0x200 - # SOC CTRL + 0x170 - $::_FC mww 0x1a104170 0x0 - sleep 1 - $::_FC mww 0x1a104170 0x1 - - # CLUSTER Ctrl: 0x10000000 + 0x00200000 - # addr: +0x40 - $::_FC mww 0x10200040 $addr 9 - # fetch en: +0x8 - $::_FC mww 0x10200008 0x3ff - # available: + 0xe4 - $::_FC mww 0x1a1040e4 0xffffffff - $::_CL0 arp_halt - $::_CL1 arp_halt - $::_CL2 arp_halt - $::_CL3 arp_halt - $::_CL4 arp_halt - $::_CL5 arp_halt - $::_CL6 arp_halt - $::_CL7 arp_halt - $::_CL8 arp_halt - $::_CL0 riscv set_ebreakm on - $::_CL1 riscv set_ebreakm on - $::_CL2 riscv set_ebreakm on - $::_CL3 riscv set_ebreakm on - $::_CL4 riscv set_ebreakm on - $::_CL5 riscv set_ebreakm on - $::_CL6 riscv set_ebreakm on - $::_CL7 riscv set_ebreakm on - $::_CL8 riscv set_ebreakm on - poll on -} - -proc jtag_init {} { - puts "jtag init" - targets $::_FC - jtag_reset 0 1 - sleep 1 - jtag_reset 0 0 - sleep 1 - # wait for jtag ready - poll_confreg 0xb - echo "confreg polling done" - - #$::_FC arm semihosting_fileio enable - #$::_FC arm semihosting_resexit enable - # APB SOC CTRL: 0x1A100000 + 0x00004000 - # cl isolate: + 0xC - #$::_FC mww 0x1a10400c 0 - # CLUSTER Ctrl: 0x10000000 + 0x00200000 - # addr +0x40 - #mww 0x10200040 0x1a00010c 9 - # fetch en: +0x8 - #$::_FC mww 0x10200008 0x3ff - # available: + 0xe4 - #$::_FC mww 0x1a1040e4 0xffffffff - $::_CL0 arp_examine - $::_CL1 arp_examine - $::_CL2 arp_examine - $::_CL3 arp_examine - $::_CL4 arp_examine - $::_CL5 arp_examine - $::_CL6 arp_examine - $::_CL7 arp_examine - $::_CL8 arp_examine - - # halt all - 
#$::_CL0 arp_halt - #$::_CL1 arp_halt - #$::_CL2 arp_halt - #$::_CL3 arp_halt - #$::_CL4 arp_halt - #$::_CL5 arp_halt - #$::_CL6 arp_halt - #$::_CL7 arp_halt - #$::_CL8 arp_halt - #set ebreakm - #$::_FC riscv set_ebreakm on - #$::_CL0 riscv set_ebreakm on - #$::_CL1 riscv set_ebreakm on - #$::_CL2 riscv set_ebreakm on - #$::_CL3 riscv set_ebreakm on - #$::_CL4 riscv set_ebreakm on - #$::_CL5 riscv set_ebreakm on - #$::_CL6 riscv set_ebreakm on - #$::_CL7 riscv set_ebreakm on - #$::_CL8 riscv set_ebreakm on - - $::_FC arp_examine - $::_FC arp_halt - $::_FC arm semihosting enable - #$::_CL0 arp_halt - #$::_CL1 arp_halt - #$::_CL2 arp_halt - #$::_CL3 arp_halt - #$::_CL4 arp_halt - #$::_CL5 arp_halt - #$::_CL6 arp_halt - #$::_CL7 arp_halt - #$::_CL8 arp_halt - echo "examine done" - jtag arp_init -} - -proc init_reset {mode} { - puts "hello" - #targets $::_FC - jtag_reset 0 1 - sleep 1 - jtag_reset 0 0 - sleep 1 - # wait for jtag ready - poll_confreg 0xb - echo "confreg polling done" - jtag arp_init -} - -proc load_and_start_binary { elf_file pc_entry } { - targets $::_FC - # first ensure we are rest and halt so that pc is accessible - #$::_FC arp_reset assert 1 - #reset halt - halt - load_image ${elf_file} 0x0 elf - reg pc ${pc_entry} - resume -} - - -# dump jtag chain -#scan_chain - -init - - -#targets $::_FC -#ftdi_set_signal nSRST 1 -halt - -#target smp $_FC $_CL8 -#$::_FC arm semihosting enable - -echo "Ready for Remote Connections"