diff --git a/configs/openocd.sh b/configs/openocd.sh index ff88497ee..61566bc8d 100644 --- a/configs/openocd.sh +++ b/configs/openocd.sh @@ -9,3 +9,7 @@ else fi export PATH=$GAP_SDK_HOME/install/workstation/openocd/bin:$PATH + +# Path to openocd scripts +export OPENOCD_SCRIPTS=$GAP_SDK_HOME/utils/openocd_tools + diff --git a/examples/pmsis/bsp/ble/ble_nina_b112/gaptest.yml b/examples/pmsis/bsp/ble/ble_nina_b112/gaptest.yml new file mode 100644 index 000000000..d822d0849 --- /dev/null +++ b/examples/pmsis/bsp/ble/ble_nina_b112/gaptest.yml @@ -0,0 +1,17 @@ +name: ble_nina_b112 +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/ble/ble_read_test/gaptest.yml b/examples/pmsis/bsp/ble/ble_read_test/gaptest.yml new file mode 100644 index 000000000..42f5e834a --- /dev/null +++ b/examples/pmsis/bsp/ble/ble_read_test/gaptest.yml @@ -0,0 +1,17 @@ +name: ble_read_test +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/blink_led/gaptest.yml b/examples/pmsis/bsp/blink_led/gaptest.yml new file mode 100644 index 000000000..0c9b4f4fd --- /dev/null +++ b/examples/pmsis/bsp/blink_led/gaptest.yml @@ -0,0 +1,17 @@ +name: blink_led +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/cameras/camera_ir_thermeye/always_on/gaptest.yml b/examples/pmsis/bsp/cameras/camera_ir_thermeye/always_on/gaptest.yml new file mode 100644 index 000000000..650f15570 --- /dev/null +++ b/examples/pmsis/bsp/cameras/camera_ir_thermeye/always_on/gaptest.yml @@ -0,0 +1,25 @@ +name: 
camera_ir_thermeye_always_on +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/cameras/camera_ir_thermeye/lower_power_mode/gaptest.yml b/examples/pmsis/bsp/cameras/camera_ir_thermeye/lower_power_mode/gaptest.yml new file mode 100644 index 000000000..42151ac50 --- /dev/null +++ b/examples/pmsis/bsp/cameras/camera_ir_thermeye/lower_power_mode/gaptest.yml @@ -0,0 +1,25 @@ +name: camera_ir_thermeye_low_power_mode +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/cameras/test_camera_gc0308/gaptest.yml b/examples/pmsis/bsp/cameras/test_camera_gc0308/gaptest.yml new file mode 100644 index 000000000..4392330e0 --- /dev/null +++ b/examples/pmsis/bsp/cameras/test_camera_gc0308/gaptest.yml @@ -0,0 +1,17 @@ +name: camera_gc0308 +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/cameras/test_camera_io/gaptest.yml b/examples/pmsis/bsp/cameras/test_camera_io/gaptest.yml new file mode 100644 index 000000000..15922f463 --- /dev/null +++ b/examples/pmsis/bsp/cameras/test_camera_io/gaptest.yml @@ -0,0 +1,25 @@ +name: camera_io +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + 
tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/cameras/test_camera_lcd/gaptest.yml b/examples/pmsis/bsp/cameras/test_camera_lcd/gaptest.yml new file mode 100644 index 000000000..6c0f08da0 --- /dev/null +++ b/examples/pmsis/bsp/cameras/test_camera_lcd/gaptest.yml @@ -0,0 +1,25 @@ +name: camera_lcd +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/cameras/test_camera_ov5640/gaptest.yml b/examples/pmsis/bsp/cameras/test_camera_ov5640/gaptest.yml new file mode 100644 index 000000000..93049af03 --- /dev/null +++ b/examples/pmsis/bsp/cameras/test_camera_ov5640/gaptest.yml @@ -0,0 +1,17 @@ +name: camera_ov5640 +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/cameras/test_camera_ov7670/gaptest.yml b/examples/pmsis/bsp/cameras/test_camera_ov7670/gaptest.yml new file mode 100644 index 000000000..35f06b52a --- /dev/null +++ b/examples/pmsis/bsp/cameras/test_camera_ov7670/gaptest.yml @@ -0,0 +1,17 @@ +name: camera_ov7670 +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/cameras/test_camera_ov7670/ov7670_config.h b/examples/pmsis/bsp/cameras/test_camera_ov7670/ov7670_config.h old mode 100755 new mode 100644 diff --git a/examples/pmsis/bsp/cameras/test_camera_pixart/gaptest.yml b/examples/pmsis/bsp/cameras/test_camera_pixart/gaptest.yml new file mode 100644 index 
000000000..d496534f3 --- /dev/null +++ b/examples/pmsis/bsp/cameras/test_camera_pixart/gaptest.yml @@ -0,0 +1,25 @@ +name: camera_pixart +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/cameras/test_camera_stream/gaptest.yml b/examples/pmsis/bsp/cameras/test_camera_stream/gaptest.yml new file mode 100644 index 000000000..fea31f653 --- /dev/null +++ b/examples/pmsis/bsp/cameras/test_camera_stream/gaptest.yml @@ -0,0 +1,17 @@ +name: camera_stream +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/filesystem/fs_to_l3_copy/gaptest.yml b/examples/pmsis/bsp/filesystem/fs_to_l3_copy/gaptest.yml new file mode 100644 index 000000000..10d486d39 --- /dev/null +++ b/examples/pmsis/bsp/filesystem/fs_to_l3_copy/gaptest.yml @@ -0,0 +1,18 @@ +name: fs_to_l3_copy +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/filesystem/littlefs/native_lfs/gaptest.yml b/examples/pmsis/bsp/filesystem/littlefs/native_lfs/gaptest.yml new file mode 100644 index 000000000..aba58f9e7 --- /dev/null +++ b/examples/pmsis/bsp/filesystem/littlefs/native_lfs/gaptest.yml @@ -0,0 +1,18 @@ +name: littlefs_native_lfs +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/filesystem/readfs/gaptest.yml 
b/examples/pmsis/bsp/filesystem/readfs/gaptest.yml new file mode 100644 index 000000000..64fe4a473 --- /dev/null +++ b/examples/pmsis/bsp/filesystem/readfs/gaptest.yml @@ -0,0 +1,18 @@ +name: readfs +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/flash/hyper_flash/gaptest.yml b/examples/pmsis/bsp/flash/hyper_flash/gaptest.yml new file mode 100644 index 000000000..b2eb78659 --- /dev/null +++ b/examples/pmsis/bsp/flash/hyper_flash/gaptest.yml @@ -0,0 +1,26 @@ +name: hyper_flash +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/flash/hyper_flash_multi_thread/gaptest.yml b/examples/pmsis/bsp/flash/hyper_flash_multi_thread/gaptest.yml new file mode 100644 index 000000000..81e854c32 --- /dev/null +++ b/examples/pmsis/bsp/flash/hyper_flash_multi_thread/gaptest.yml @@ -0,0 +1,25 @@ +name: hyper_flash_multi_thread +platforms: + - gvsoc +os: + - freertos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/lcd/gapuino_himax_with_lcd/gaptest.yml b/examples/pmsis/bsp/lcd/gapuino_himax_with_lcd/gaptest.yml new file mode 100644 index 000000000..98de5fade --- /dev/null +++ b/examples/pmsis/bsp/lcd/gapuino_himax_with_lcd/gaptest.yml @@ -0,0 +1,17 @@ +name: gapuino_himax_with_lcd +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + 
std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/microphones/vesper/vm3011_wakeup/gaptest.yml b/examples/pmsis/bsp/microphones/vesper/vm3011_wakeup/gaptest.yml new file mode 100644 index 000000000..edfb6290f --- /dev/null +++ b/examples/pmsis/bsp/microphones/vesper/vm3011_wakeup/gaptest.yml @@ -0,0 +1,17 @@ +name: microphones_vesper_vm3011_wakeup +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/bsp/ram/hyper_ram/gaptest.yml b/examples/pmsis/bsp/ram/hyper_ram/gaptest.yml new file mode 100644 index 000000000..2539ec88b --- /dev/null +++ b/examples/pmsis/bsp/ram/hyper_ram/gaptest.yml @@ -0,0 +1,26 @@ +name: hyper_ram +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/ram/hyper_ram_flash/gaptest.yml b/examples/pmsis/bsp/ram/hyper_ram_flash/gaptest.yml new file mode 100644 index 000000000..387122e40 --- /dev/null +++ b/examples/pmsis/bsp/ram/hyper_ram_flash/gaptest.yml @@ -0,0 +1,26 @@ +name: hyper_ram_flash +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/ram/hyper_ram_multi_thread/gaptest.yml b/examples/pmsis/bsp/ram/hyper_ram_multi_thread/gaptest.yml new file mode 100644 index 000000000..f0c34f889 --- /dev/null +++ 
b/examples/pmsis/bsp/ram/hyper_ram_multi_thread/gaptest.yml @@ -0,0 +1,25 @@ +name: hyper_ram_multi_thread +platforms: + - gvsoc +os: + - freertos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/bsp/wifi/nina_b112_example/gaptest.yml b/examples/pmsis/bsp/wifi/nina_b112_example/gaptest.yml new file mode 100644 index 000000000..67a08ccad --- /dev/null +++ b/examples/pmsis/bsp/wifi/nina_b112_example/gaptest.yml @@ -0,0 +1,17 @@ +name: wifi_nina_b112 +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/features/aes128_sw/gaptest.yml b/examples/pmsis/features/aes128_sw/gaptest.yml new file mode 100644 index 000000000..ec524aa63 --- /dev/null +++ b/examples/pmsis/features/aes128_sw/gaptest.yml @@ -0,0 +1,17 @@ +name: aes128_sw +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ diff --git a/examples/pmsis/features/aes128_sw/main.c b/examples/pmsis/features/aes128_sw/main.c index 8514d59c0..6f12a2188 100644 --- a/examples/pmsis/features/aes128_sw/main.c +++ b/examples/pmsis/features/aes128_sw/main.c @@ -1,16 +1,12 @@ #include "pmsis.h" #include "AesLib.h" -#define TEST_BUFF_SIZE (40600) +#define TEST_BUFF_SIZE (40600) #define TEST_KEY_HI (0x1122334455667788) #define TEST_KEY_LO (0x9900AABBCCDDEEFF) #define TEST_IV (0x1122334455667788) -#if defined (__PULP_OS__) -RT_FC_DATA aes_data_t aes_data; -#else -GAP_FC_DATA aes_data_t aes_data; -#endif +PI_FC_L1 aes_data_t aes_data; static void load_key(unsigned char * key, unsigned char * iv) { @@ 
-58,7 +54,7 @@ void aes128() pi_perf_start(); cycles[0] = pi_perf_read(PI_PERF_CYCLES); - + AesBuildLUT(&aes_data); pi_perf_stop(); @@ -77,7 +73,7 @@ void aes128() cycles[3] = pi_perf_read(PI_PERF_CYCLES); for(int i=0 ; i - - -static int entry() -{ - printf("Entering main controller\n"); - -#ifdef EFUSE_WRITE - printf("Writing efuse 50 with value 0x12\n"); - - // Before writing the efuse, we must activate the program operation - // Once activated, we can wrote as many efuses as we want - plp_efuse_startProgram(); - - plp_efuse_writeByte(80, 0x12); - - // Close the current operation once done - plp_efuse_sleep(); -#else - printf("Efuse has not been written, recompile with make clean all run EFUSE_WRITE=1, be careful that this is a permanent operation !!!\n"); -#endif - - - // Before reading the efuse, we must activate the read operation - // Once activated, we can wrote as many efuses as we want - plp_efuse_startRead(); - - int value = plp_efuse_readWord(80); - - // Close the current operation once done - plp_efuse_sleep(); - - printf("Read efuse 50: 0x%x\n", value); - - return 0; -} - - -static void pmsis_wrapper(void) -{ - int retval = entry(); - pmsis_exit(retval); -} - - -int main(void) -{ - return pmsis_kickoff((void *)pmsis_wrapper); -} - diff --git a/examples/pmsis/features/helloworld_cxx/gaptest.yml b/examples/pmsis/features/helloworld_cxx/gaptest.yml new file mode 100644 index 000000000..7d9864113 --- /dev/null +++ b/examples/pmsis/features/helloworld_cxx/gaptest.yml @@ -0,0 +1,16 @@ +name: helloworld_cxx +platforms: + - gvsoc +os: + - freertos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ diff --git a/examples/pmsis/features/hyper_ram_delegate/gaptest.yml b/examples/pmsis/features/hyper_ram_delegate/gaptest.yml new file mode 100644 index 000000000..811611f76 --- /dev/null +++ b/examples/pmsis/features/hyper_ram_delegate/gaptest.yml @@ -0,0 +1,26 @@ +name: hyper_ram_delegate 
+platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/features/test_malloc/gaptest.yml b/examples/pmsis/features/test_malloc/gaptest.yml new file mode 100644 index 000000000..85043c25c --- /dev/null +++ b/examples/pmsis/features/test_malloc/gaptest.yml @@ -0,0 +1,17 @@ +name: test_malloc +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ diff --git a/examples/pmsis/features/uart_delegate/gaptest.yml b/examples/pmsis/features/uart_delegate/gaptest.yml new file mode 100644 index 000000000..008a9e7a5 --- /dev/null +++ b/examples/pmsis/features/uart_delegate/gaptest.yml @@ -0,0 +1,18 @@ +name: uart_delegate +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/helloworld/gaptest.yml b/examples/pmsis/helloworld/gaptest.yml new file mode 100644 index 000000000..6bedf8220 --- /dev/null +++ b/examples/pmsis/helloworld/gaptest.yml @@ -0,0 +1,18 @@ +name: helloworld +platforms: + - gvsoc + - board +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ diff --git a/examples/pmsis/periph/dmacpy/gaptest.yml b/examples/pmsis/periph/dmacpy/gaptest.yml new file mode 100644 index 000000000..67cd62962 --- /dev/null +++ b/examples/pmsis/periph/dmacpy/gaptest.yml @@ -0,0 +1,26 @@ +name: dmacpy +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: 
standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true + async: + name: async + tags: + - integration + - release + duration: standard + flags: ASYNC=1 + compile_only: true diff --git a/examples/pmsis/periph/gpio/gpio_input/gaptest.yml b/examples/pmsis/periph/gpio/gpio_input/gaptest.yml new file mode 100644 index 000000000..0cb0c762c --- /dev/null +++ b/examples/pmsis/periph/gpio/gpio_input/gaptest.yml @@ -0,0 +1,18 @@ +name: gpio_input +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/gpio/gpio_input/gpio.c b/examples/pmsis/periph/gpio/gpio_input/gpio.c index 0c3005e75..80f4de0ae 100644 --- a/examples/pmsis/periph/gpio/gpio_input/gpio.c +++ b/examples/pmsis/periph/gpio/gpio_input/gpio.c @@ -6,6 +6,15 @@ /* PMSIS includes */ #include "pmsis.h" +/* Defines */ +#if defined(__GAP8__) +#define GPIO_PIN (PI_GPIO_A0_PAD_12_A3) +#elif defined(__GAP9__) +#define GPIO_PIN (PI_GPIO_A68) +#else +#error "Unknown chip" +#endif + /* Variables used. 
*/ struct pi_device gpio; @@ -34,7 +43,7 @@ void test_gpio(void) } pi_task_t cb_gpio; - pi_gpio_e gpio_in = PI_GPIO_A0_PAD_12_A3; + pi_gpio_e gpio_in = GPIO_PIN; pi_gpio_notif_e irq_type = PI_GPIO_NOTIF_RISE; pi_gpio_flags_e cfg_flags = PI_GPIO_INPUT|PI_GPIO_PULL_DISABLE|PI_GPIO_DRIVE_STRENGTH_LOW; diff --git a/examples/pmsis/periph/gpio/gpio_irq_cb/gaptest.yml b/examples/pmsis/periph/gpio/gpio_irq_cb/gaptest.yml new file mode 100644 index 000000000..ead916f96 --- /dev/null +++ b/examples/pmsis/periph/gpio/gpio_irq_cb/gaptest.yml @@ -0,0 +1,18 @@ +name: gpio_irq_cb +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/gpio/gpio_irq_cb/gpio.c b/examples/pmsis/periph/gpio/gpio_irq_cb/gpio.c index 1b6a64a26..851226b49 100644 --- a/examples/pmsis/periph/gpio/gpio_irq_cb/gpio.c +++ b/examples/pmsis/periph/gpio/gpio_irq_cb/gpio.c @@ -6,6 +6,15 @@ /* PMSIS includes */ #include "pmsis.h" +/* Defines */ +#if defined(__GAP8__) +#define GPIO_PIN (PI_GPIO_A0_PAD_12_A3) +#elif defined(__GAP9__) +#define GPIO_PIN (PI_GPIO_A68) +#else +#error "Unknown chip" +#endif + /* Variables used. 
*/ struct pi_device gpio; @@ -51,7 +60,7 @@ void test_gpio(void) pmsis_exit(errors); } - pi_gpio_e gpio_in = PI_GPIO_A0_PAD_12_A3; + pi_gpio_e gpio_in = GPIO_PIN; pi_gpio_notif_e irq_type = PI_GPIO_NOTIF_RISE; pi_gpio_flags_e cfg_flags = PI_GPIO_INPUT|PI_GPIO_PULL_DISABLE|PI_GPIO_DRIVE_STRENGTH_LOW; diff --git a/examples/pmsis/periph/gpio/gpio_output/gaptest.yml b/examples/pmsis/periph/gpio/gpio_output/gaptest.yml new file mode 100644 index 000000000..8bc059d75 --- /dev/null +++ b/examples/pmsis/periph/gpio/gpio_output/gaptest.yml @@ -0,0 +1,18 @@ +name: gpio_output +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/gpio/gpio_output/gpio.c b/examples/pmsis/periph/gpio/gpio_output/gpio.c index 5aeb07ffd..850f22ef2 100644 --- a/examples/pmsis/periph/gpio/gpio_output/gpio.c +++ b/examples/pmsis/periph/gpio/gpio_output/gpio.c @@ -6,6 +6,26 @@ /* PMSIS includes */ #include "pmsis.h" +/* Defines */ +#if defined(__GAP8__) + +#define GPIO_PAD1 (PI_PAD_12_A3_RF_PACTRL0) +#define GPIO_PIN1 (PI_GPIO_A0_PAD_12_A3) + +#define GPIO_PAD2 (PI_PAD_15_B1_RF_PACTRL3) +#define GPIO_PIN2 (PI_GPIO_A3_PAD_15_B1) + +#elif defined(__GAP9__) +#define GPIO_PAD1 (PI_PAD_068) +#define GPIO_PIN1 (PI_GPIO_A68) + +#define GPIO_PAD2 (PI_PAD_086) +#define GPIO_PIN2 (PI_GPIO_A86) + +#else +#error "Unknown chip" +#endif + #define DELAY_MS 500 /* Variables used. 
*/ @@ -19,12 +39,12 @@ void test_gpio(void) uint32_t value = 0; //Setting pad to alternate 1 //GPIO A1 - pi_pad_set_function(PI_PAD_12_A3_RF_PACTRL0, PI_PAD_12_A3_GPIO_A0_FUNC1); + pi_pad_set_function(GPIO_PAD1, PI_PAD_FUNC1); //GPIO LED (A3) - pi_pad_set_function(PI_PAD_15_B1_RF_PACTRL3, PI_PAD_FUNC1); - - pi_gpio_e gpio_out_a1 = PI_GPIO_A0_PAD_12_A3; - pi_gpio_e gpio_out_led = PI_GPIO_A3_PAD_15_B1; + pi_pad_set_function(GPIO_PAD2, PI_PAD_FUNC1); + + pi_gpio_e gpio_out_a1 = GPIO_PIN1; + pi_gpio_e gpio_out_led = GPIO_PIN2; /* Configure gpio output. */ pi_gpio_flags_e cfg_flags = PI_GPIO_OUTPUT; diff --git a/examples/pmsis/periph/i2c/i2c_bmp280/gaptest.yml b/examples/pmsis/periph/i2c/i2c_bmp280/gaptest.yml new file mode 100644 index 000000000..964cb67c7 --- /dev/null +++ b/examples/pmsis/periph/i2c/i2c_bmp280/gaptest.yml @@ -0,0 +1,18 @@ +name: i2c_bmp280 +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2c/i2c_detect/gaptest.yml b/examples/pmsis/periph/i2c/i2c_detect/gaptest.yml new file mode 100644 index 000000000..88c99bd11 --- /dev/null +++ b/examples/pmsis/periph/i2c/i2c_detect/gaptest.yml @@ -0,0 +1,17 @@ +name: i2c_detect +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2c/i2c_eeprom_pulp_fmc/gaptest.yml b/examples/pmsis/periph/i2c/i2c_eeprom_pulp_fmc/gaptest.yml new file mode 100644 index 000000000..2acc38e5e --- /dev/null +++ b/examples/pmsis/periph/i2c/i2c_eeprom_pulp_fmc/gaptest.yml @@ -0,0 +1,17 @@ +name: i2c_eeprom_pulp_fmc +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + 
compile_only: true diff --git a/examples/pmsis/periph/i2c/i2c_scan/gaptest.yml b/examples/pmsis/periph/i2c/i2c_scan/gaptest.yml new file mode 100644 index 000000000..208865a38 --- /dev/null +++ b/examples/pmsis/periph/i2c/i2c_scan/gaptest.yml @@ -0,0 +1,17 @@ +name: i2c_scan +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2c/i2c_slave/gaptest.yml b/examples/pmsis/periph/i2c/i2c_slave/gaptest.yml new file mode 100644 index 000000000..251436da8 --- /dev/null +++ b/examples/pmsis/periph/i2c/i2c_slave/gaptest.yml @@ -0,0 +1,17 @@ +name: i2c_slave_loopback +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2s/pcm/gaptest.yml b/examples/pmsis/periph/i2s/pcm/gaptest.yml new file mode 100644 index 000000000..bbdd53305 --- /dev/null +++ b/examples/pmsis/periph/i2s/pcm/gaptest.yml @@ -0,0 +1,18 @@ +name: i2s_pcm +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2s/pdm/gaptest.yml b/examples/pmsis/periph/i2s/pdm/gaptest.yml new file mode 100644 index 000000000..4cb332ece --- /dev/null +++ b/examples/pmsis/periph/i2s/pdm/gaptest.yml @@ -0,0 +1,18 @@ +name: i2s_pdm +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2s/pdm_4mic/gaptest.yml b/examples/pmsis/periph/i2s/pdm_4mic/gaptest.yml new file mode 100644 index 000000000..cd88be255 --- /dev/null +++ 
b/examples/pmsis/periph/i2s/pdm_4mic/gaptest.yml @@ -0,0 +1,18 @@ +name: i2s_pdm_4mic +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2s/pdm_async/gaptest.yml b/examples/pmsis/periph/i2s/pdm_async/gaptest.yml new file mode 100644 index 000000000..3d99bb3f0 --- /dev/null +++ b/examples/pmsis/periph/i2s/pdm_async/gaptest.yml @@ -0,0 +1,18 @@ +name: i2s_pdm_async +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2s/wav_out/gaptest.yml b/examples/pmsis/periph/i2s/wav_out/gaptest.yml new file mode 100644 index 000000000..e91dfe000 --- /dev/null +++ b/examples/pmsis/periph/i2s/wav_out/gaptest.yml @@ -0,0 +1,18 @@ +name: i2s_wav_out +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2s/wav_out_long/gaptest.yml b/examples/pmsis/periph/i2s/wav_out_long/gaptest.yml new file mode 100644 index 000000000..d9ddabd7a --- /dev/null +++ b/examples/pmsis/periph/i2s/wav_out_long/gaptest.yml @@ -0,0 +1,18 @@ +name: i2s_wav_out_long +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/i2s/wav_out_one_shot/gaptest.yml b/examples/pmsis/periph/i2s/wav_out_one_shot/gaptest.yml new file mode 100644 index 000000000..6cc8975f6 --- /dev/null +++ b/examples/pmsis/periph/i2s/wav_out_one_shot/gaptest.yml @@ -0,0 +1,18 @@ +name: i2s_wav_out_one_shot +platforms: + - 
gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/perf/gaptest.yml b/examples/pmsis/periph/perf/gaptest.yml new file mode 100644 index 000000000..d7a791e20 --- /dev/null +++ b/examples/pmsis/periph/perf/gaptest.yml @@ -0,0 +1,17 @@ +name: perf +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ diff --git a/examples/pmsis/periph/pwm/gaptest.yml b/examples/pmsis/periph/pwm/gaptest.yml new file mode 100644 index 000000000..fa57415f0 --- /dev/null +++ b/examples/pmsis/periph/pwm/gaptest.yml @@ -0,0 +1,18 @@ +name: pwm +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/rtc/rtc_alarm/gaptest.yml b/examples/pmsis/periph/rtc/rtc_alarm/gaptest.yml new file mode 100644 index 000000000..44cb70c4c --- /dev/null +++ b/examples/pmsis/periph/rtc/rtc_alarm/gaptest.yml @@ -0,0 +1,18 @@ +name: rtc_alarm +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/rtc/rtc_calendar/gaptest.yml b/examples/pmsis/periph/rtc/rtc_calendar/gaptest.yml new file mode 100644 index 000000000..7c35557ff --- /dev/null +++ b/examples/pmsis/periph/rtc/rtc_calendar/gaptest.yml @@ -0,0 +1,18 @@ +name: rtc_calendar +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git 
a/examples/pmsis/periph/rtc/rtc_counter/gaptest.yml b/examples/pmsis/periph/rtc/rtc_counter/gaptest.yml new file mode 100644 index 000000000..ae74bd85d --- /dev/null +++ b/examples/pmsis/periph/rtc/rtc_counter/gaptest.yml @@ -0,0 +1,18 @@ +name: rtc_counter +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/spi/spi_master/gaptest.yml b/examples/pmsis/periph/spi/spi_master/gaptest.yml new file mode 100644 index 000000000..c3ef7ba4f --- /dev/null +++ b/examples/pmsis/periph/spi/spi_master/gaptest.yml @@ -0,0 +1,18 @@ +name: spi_master +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/uart/uart_helloworld/gaptest.yml b/examples/pmsis/periph/uart/uart_helloworld/gaptest.yml new file mode 100644 index 000000000..67a176fa8 --- /dev/null +++ b/examples/pmsis/periph/uart/uart_helloworld/gaptest.yml @@ -0,0 +1,18 @@ +name: uart_helloworld +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/uart/uart_helloworld_timeout/gaptest.yml b/examples/pmsis/periph/uart/uart_helloworld_timeout/gaptest.yml new file mode 100644 index 000000000..df9d7562b --- /dev/null +++ b/examples/pmsis/periph/uart/uart_helloworld_timeout/gaptest.yml @@ -0,0 +1,16 @@ +name: uart_helloworld_timeout +platforms: + - gvsoc +os: + - freertos +chips: + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/uart/uart_input/gaptest.yml 
b/examples/pmsis/periph/uart/uart_input/gaptest.yml new file mode 100644 index 000000000..a7e409203 --- /dev/null +++ b/examples/pmsis/periph/uart/uart_input/gaptest.yml @@ -0,0 +1,18 @@ +name: uart_input +platforms: + - gvsoc +os: + - freertos + - pulpos +chips: + - gap8 + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/examples/pmsis/periph/uart/uart_input_timeout/gaptest.yml b/examples/pmsis/periph/uart/uart_input_timeout/gaptest.yml new file mode 100644 index 000000000..c49ab3ee9 --- /dev/null +++ b/examples/pmsis/periph/uart/uart_input_timeout/gaptest.yml @@ -0,0 +1,16 @@ +name: uart_input_timeout +platforms: + - gvsoc +os: + - freertos +chips: + - gap9 +variants: + std: + name: standard + tags: + - integration + - release + duration: standard + flags: ~ + compile_only: true diff --git a/gvsoc/gvsoc/models/cpu/iss/include/pulp_v2.hpp b/gvsoc/gvsoc/models/cpu/iss/include/pulp_v2.hpp index 778ab4d58..4d23e13f6 100644 --- a/gvsoc/gvsoc/models/cpu/iss/include/pulp_v2.hpp +++ b/gvsoc/gvsoc/models/cpu/iss/include/pulp_v2.hpp @@ -26,13 +26,13 @@ #define PULPV2_HWLOOP_LPEND0 1 #define PULPV2_HWLOOP_LPCOUNT0 2 -#define PULPV2_HWLOOP_LPSTART1 3 -#define PULPV2_HWLOOP_LPEND1 4 -#define PULPV2_HWLOOP_LPCOUNT1 5 +#define PULPV2_HWLOOP_LPSTART1 4 +#define PULPV2_HWLOOP_LPEND1 5 +#define PULPV2_HWLOOP_LPCOUNT1 6 -#define PULPV2_HWLOOP_LPSTART(x) (PULPV2_HWLOOP_LPSTART0 + (x)*3) -#define PULPV2_HWLOOP_LPEND(x) (PULPV2_HWLOOP_LPEND0 + (x)*3) -#define PULPV2_HWLOOP_LPCOUNT(x) (PULPV2_HWLOOP_LPCOUNT0 + (x)*3) +#define PULPV2_HWLOOP_LPSTART(x) (PULPV2_HWLOOP_LPSTART0 + (x)*4) +#define PULPV2_HWLOOP_LPEND(x) (PULPV2_HWLOOP_LPEND0 + (x)*4) +#define PULPV2_HWLOOP_LPCOUNT(x) (PULPV2_HWLOOP_LPCOUNT0 + (x)*4) static inline iss_insn_t *LB_RR_exec_fast(iss_t *iss, iss_insn_t *insn) { @@ -595,7 +595,7 @@ static inline iss_insn_t *hwloop_check_exec(iss_t *iss, iss_insn_t *insn) static inline 
void hwloop_set_start(iss_t *iss, iss_insn_t *insn, int index, iss_reg_t start) { iss->cpu.pulpv2.hwloop_regs[PULPV2_HWLOOP_LPSTART(index)] = start; - iss->cpu.state.hwloop_start_insn[index] = insn_cache_get(iss, start); + iss->cpu.state.hwloop_start_insn[index] = insn_cache_get(iss, start); } static inline void hwloop_set_end(iss_t *iss, iss_insn_t *insn, int index, iss_reg_t end) diff --git a/gvsoc/gvsoc/models/cpu/iss/src/csr.cpp b/gvsoc/gvsoc/models/cpu/iss/src/csr.cpp index 8258e80f0..84f4ca5d2 100644 --- a/gvsoc/gvsoc/models/cpu/iss/src/csr.cpp +++ b/gvsoc/gvsoc/models/cpu/iss/src/csr.cpp @@ -940,6 +940,18 @@ static bool hwloop_read(iss_t *iss, int reg, iss_reg_t *value) { static bool hwloop_write(iss_t *iss, int reg, unsigned int value) { iss->cpu.pulpv2.hwloop_regs[reg] = value; + + // Since the HW loop is using decode instruction for the HW loop start to jump faster + // we need to recompute it when it is modified. + if (reg == 0) + { + iss->cpu.state.hwloop_start_insn[0] = insn_cache_get(iss, value); + } + else if (reg == 4) + { + iss->cpu.state.hwloop_start_insn[1] = insn_cache_get(iss, value); + } + return false; } diff --git a/rtos/freeRTOS/freertos_kernel/include/FreeRTOS.h b/rtos/freeRTOS/freertos_kernel/include/FreeRTOS.h index 5e443a629..05f007802 100644 --- a/rtos/freeRTOS/freertos_kernel/include/FreeRTOS.h +++ b/rtos/freeRTOS/freertos_kernel/include/FreeRTOS.h @@ -871,7 +871,7 @@ #endif #ifndef configTASK_NOTIFICATION_ARRAY_ENTRIES - #define configTASK_NOTIFICATION_ARRAY_ENTRIES 1 + #define configTASK_NOTIFICATION_ARRAY_ENTRIES 2 #endif #if configTASK_NOTIFICATION_ARRAY_ENTRIES < 1 diff --git a/rtos/freeRTOS/vendors/gwt/gap8/src/device/system_gap8.c b/rtos/freeRTOS/vendors/gwt/gap8/src/device/system_gap8.c index e2be80db9..b9d56565e 100644 --- a/rtos/freeRTOS/vendors/gwt/gap8/src/device/system_gap8.c +++ b/rtos/freeRTOS/vendors/gwt/gap8/src/device/system_gap8.c @@ -33,7 +33,7 @@ /* PMSIS includes. 
*/ #include "pmsis.h" -#include "../driver/semihost.h" +#include "semihost.h" /* FC & L2 heaps. */ extern char __heapfcram_start; diff --git a/rtos/freeRTOS/vendors/gwt/gap8/include/driver/semihost.h b/rtos/freeRTOS/vendors/gwt/libs/include/semihost.h similarity index 63% rename from rtos/freeRTOS/vendors/gwt/gap8/include/driver/semihost.h rename to rtos/freeRTOS/vendors/gwt/libs/include/semihost.h index fba61196c..81ba11c44 100644 --- a/rtos/freeRTOS/vendors/gwt/gap8/include/driver/semihost.h +++ b/rtos/freeRTOS/vendors/gwt/libs/include/semihost.h @@ -20,6 +20,10 @@ #include #include +#ifdef __cplusplus +extern "C" { +#endif + enum semihosting_operation_numbers { /* * ARM/openocd semihosting operations. @@ -64,7 +68,7 @@ enum semihosting_operation_numbers { #define SEMIHOST_EXIT_SUCCESS 0x20026 #define SEMIHOST_EXIT_ERROR 0x20023 -extern long __syscall_error(long); +//extern long __syscall_error(long); /* riscv semihosting standard: * IN: a0 holds syscall number @@ -101,20 +105,52 @@ __internal_semihost(long n, long _a1) // roughly this is the last stage of printf: // print a string until '\0' -void semihost_write0(const char *print_string); - -int semihost_open(const char *name, int mode); +static inline void semihost_write0(const char *print_string) +{ + __internal_semihost(SEMIHOSTING_SYS_WRITE0, (long) print_string); +} -int semihost_close(int fd); +static inline int semihost_open(const char *name, int mode) +{ + uint32_t len = strlen(name); + volatile uint32_t args[3] = {(uint32_t)name,mode,len}; + return __internal_semihost(SEMIHOSTING_SYS_OPEN, (long) args); +} -int semihost_read(int fd, uint8_t *buffer, int len); +static inline int semihost_close(int fd) +{ + //uint32_t args[3] = {name,mode,len}; + return __internal_semihost(SEMIHOSTING_SYS_CLOSE, (long) fd); +} -int semihost_write(int fd, uint8_t *buffer, int len); +static inline int semihost_read(int fd, uint8_t *buffer, int len) +{ + volatile uint32_t args[3] = 
{(uint32_t)fd,(uint32_t)buffer,(uint32_t)len}; + return __internal_semihost(SEMIHOSTING_SYS_READ, (long) args); +} -int semihost_seek(int fd, uint32_t pos); +static inline int semihost_write(int fd, uint8_t *buffer, int len) +{ + volatile uint32_t args[3] = {(uint32_t)fd,(uint32_t)buffer,(uint32_t)len}; + return __internal_semihost(SEMIHOSTING_SYS_WRITE, (long) args); +} -int semihost_flen(int fd); +static inline int semihost_seek(int fd, uint32_t pos) +{ + volatile uint32_t args[2] = {(uint32_t)fd,pos}; + return __internal_semihost(SEMIHOSTING_SYS_SEEK, (long) args); +} -int semihost_exit(int code); +static inline int semihost_flen(int fd) +{ + return __internal_semihost(SEMIHOSTING_SYS_FLEN, (long) fd); +} +static inline int semihost_exit(int code) +{ + return __internal_semihost(SEMIHOSTING_SYS_EXIT, (long) code); +} +#ifdef __cplusplus +} +#endif #endif diff --git a/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis/backend/pmsis_backend_native_task_api.h b/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis/backend/pmsis_backend_native_task_api.h index 3fc2f7763..7ce9bfd03 100644 --- a/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis/backend/pmsis_backend_native_task_api.h +++ b/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis/backend/pmsis_backend_native_task_api.h @@ -154,7 +154,7 @@ static inline int __os_native_api_sync_obj_deinit(void *sync_obj) static inline void __os_native_api_sync_obj_take(void *sync_obj) { - ulTaskNotifyTake(pdTRUE, portMAX_DELAY); + ulTaskNotifyTakeIndexed(1, pdTRUE, portMAX_DELAY); } static inline void __os_native_api_sync_obj_release(void *sync_obj) @@ -162,7 +162,7 @@ static inline void __os_native_api_sync_obj_release(void *sync_obj) uint32_t irq = __disable_irq(); BaseType_t higher_priority_task_woken = pdFALSE; TaskHandle_t task_handler = (TaskHandle_t) sync_obj; - vTaskNotifyGiveFromISR(task_handler, &higher_priority_task_woken); + vTaskNotifyGiveIndexedFromISR(task_handler, 1, &higher_priority_task_woken); 
portYIELD_FROM_ISR(higher_priority_task_woken); __restore_irq(irq); } diff --git a/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk b/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk index c2225d9c6..e2e16f5aa 100644 --- a/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk +++ b/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk @@ -413,7 +413,7 @@ flash: $(BIN) flash_noforce: $(BIN) gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --flash --binary=$(BIN) $(runner_args) -flash_fs: $(BIN) +flash_fs: $(BIN) image gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --flash --binary=$(BIN) $(runner_args) $(WSL_ENV) image: $(BIN) diff --git a/rtos/pmsis/pmsis_bsp/CMakeLists.txt b/rtos/pmsis/pmsis_bsp/CMakeLists.txt index 77c951b92..b2abc381a 100644 --- a/rtos/pmsis/pmsis_bsp/CMakeLists.txt +++ b/rtos/pmsis/pmsis_bsp/CMakeLists.txt @@ -1,5 +1,5 @@ set(BSP_READFS_SRC fs/read_fs/read_fs.c) -set(BSP_HOSTFS_SRC fs/host_fs/semihost.c fs/host_fs/host_fs.c) +set(BSP_HOSTFS_SRC fs/host_fs/host_fs.c) set(BSP_LFS_SRC fs/lfs/lfs.c fs/lfs/lfs_util.c fs/lfs/pi_lfs.c) set(BSP_FS_SRC fs/fs.c) set(BSP_FLASH_SRC diff --git a/rtos/pmsis/pmsis_bsp/flash/mram/mram-v2.c b/rtos/pmsis/pmsis_bsp/flash/mram/mram-v2.c index 6fb99e928..3cddb5001 100644 --- a/rtos/pmsis/pmsis_bsp/flash/mram/mram-v2.c +++ b/rtos/pmsis/pmsis_bsp/flash/mram/mram-v2.c @@ -338,7 +338,9 @@ static int mram_open(struct pi_device *device) // In XIP mode, we need to lock XIP refills to avoid having a read while the flash is doing the program operation. 
udma_mram_trans_mode_set(base, UDMA_MRAM_TRANS_MODE_AUTO_ENA(1) | UDMA_MRAM_TRANS_MODE_XIP_EN(1) | UDMA_MRAM_TRANS_MODE_XIP_AUTO_HALTED(1)); #else - udma_mram_trans_mode_set(base, UDMA_MRAM_TRANS_MODE_AUTO_ENA(1)); + udma_mram_trans_mode_set(base, UDMA_MRAM_TRANS_MODE_AUTO_ENA(1) + | UDMA_MRAM_TRANS_MODE_XIP_EN(conf->xip_en) + | UDMA_MRAM_TRANS_MODE_XIP_AUTO_HALTED(conf->xip_en)); #endif #ifndef CONFIG_XIP_MRAM @@ -896,4 +898,5 @@ void pi_mram_conf_init(struct pi_mram_conf *conf) conf->flash.api = &mram_api; conf->itf = 0; conf->baudrate = 15000000; + conf->xip_en = 0; } diff --git a/rtos/pmsis/pmsis_bsp/fs/host_fs/semihost.c b/rtos/pmsis/pmsis_bsp/fs/host_fs/semihost.c deleted file mode 100644 index 016237eee..000000000 --- a/rtos/pmsis/pmsis_bsp/fs/host_fs/semihost.c +++ /dev/null @@ -1,50 +0,0 @@ -#include "semihost.h" -#include "string.h" - -// roughly this is the last stage of printf: -// print a string until '\0' -void semihost_write0(const char *print_string) -{ - __internal_semihost(SEMIHOSTING_SYS_WRITE0, (long) print_string); -} - -int semihost_open(const char *name, int mode) -{ - uint32_t len = strlen(name); - volatile uint32_t args[3] = {(uint32_t)name,mode,len}; - return __internal_semihost(SEMIHOSTING_SYS_OPEN, (long) args); -} - -int semihost_close(int fd) -{ - //uint32_t args[3] = {name,mode,len}; - return __internal_semihost(SEMIHOSTING_SYS_CLOSE, (long) fd); -} - -int semihost_read(int fd, uint8_t *buffer, int len) -{ - volatile uint32_t args[3] = {(uint32_t)fd,(uint32_t)buffer,(uint32_t)len}; - return __internal_semihost(SEMIHOSTING_SYS_READ, (long) args); -} - -int semihost_write(int fd, uint8_t *buffer, int len) -{ - volatile uint32_t args[3] = {(uint32_t)fd,(uint32_t)buffer,(uint32_t)len}; - return __internal_semihost(SEMIHOSTING_SYS_WRITE, (long) args); -} - -int semihost_seek(int fd, uint32_t pos) -{ - volatile uint32_t args[2] = {(uint32_t)fd,pos}; - return __internal_semihost(SEMIHOSTING_SYS_SEEK, (long) args); -} - -int semihost_flen(int 
fd) -{ - return __internal_semihost(SEMIHOSTING_SYS_FLEN, (long) fd); -} - -int semihost_exit(int code) -{ - return __internal_semihost(SEMIHOSTING_SYS_EXIT, (long) code); -} diff --git a/rtos/pmsis/pmsis_bsp/fs/host_fs/semihost.h b/rtos/pmsis/pmsis_bsp/fs/host_fs/semihost.h index fba61196c..4d58a542e 100644 --- a/rtos/pmsis/pmsis_bsp/fs/host_fs/semihost.h +++ b/rtos/pmsis/pmsis_bsp/fs/host_fs/semihost.h @@ -98,23 +98,52 @@ __internal_semihost(long n, long _a1) #endif } - // roughly this is the last stage of printf: // print a string until '\0' -void semihost_write0(const char *print_string); +static inline void semihost_write0(const char *print_string) +{ + __internal_semihost(SEMIHOSTING_SYS_WRITE0, (long) print_string); +} -int semihost_open(const char *name, int mode); +static inline int semihost_open(const char *name, int mode) +{ + uint32_t len = strlen(name); + volatile uint32_t args[3] = {(uint32_t)name,mode,len}; + return __internal_semihost(SEMIHOSTING_SYS_OPEN, (long) args); +} -int semihost_close(int fd); +static inline int semihost_close(int fd) +{ + //uint32_t args[3] = {name,mode,len}; + return __internal_semihost(SEMIHOSTING_SYS_CLOSE, (long) fd); +} -int semihost_read(int fd, uint8_t *buffer, int len); +static inline int semihost_read(int fd, uint8_t *buffer, int len) +{ + volatile uint32_t args[3] = {(uint32_t)fd,(uint32_t)buffer,(uint32_t)len}; + return __internal_semihost(SEMIHOSTING_SYS_READ, (long) args); +} -int semihost_write(int fd, uint8_t *buffer, int len); +static inline int semihost_write(int fd, uint8_t *buffer, int len) +{ + volatile uint32_t args[3] = {(uint32_t)fd,(uint32_t)buffer,(uint32_t)len}; + return __internal_semihost(SEMIHOSTING_SYS_WRITE, (long) args); +} -int semihost_seek(int fd, uint32_t pos); +static inline int semihost_seek(int fd, uint32_t pos) +{ + volatile uint32_t args[2] = {(uint32_t)fd,pos}; + return __internal_semihost(SEMIHOSTING_SYS_SEEK, (long) args); +} -int semihost_flen(int fd); +static inline int 
semihost_flen(int fd) +{ + return __internal_semihost(SEMIHOSTING_SYS_FLEN, (long) fd); +} -int semihost_exit(int code); +static inline int semihost_exit(int code) +{ + return __internal_semihost(SEMIHOSTING_SYS_EXIT, (long) code); +} #endif diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/flash/mram.h b/rtos/pmsis/pmsis_bsp/include/bsp/flash/mram.h index 4e57bc943..face1328e 100644 --- a/rtos/pmsis/pmsis_bsp/include/bsp/flash/mram.h +++ b/rtos/pmsis/pmsis_bsp/include/bsp/flash/mram.h @@ -49,6 +49,7 @@ struct pi_mram_conf int itf; /*!< Mram interface where the flash is connected. */ int baudrate; /*!< Baudrate in byte/s. */ + int xip_en; }; /** \brief Initialize an Mram configuration with default values. diff --git a/rtos/pmsis/pmsis_bsp/src.mk b/rtos/pmsis/pmsis_bsp/src.mk index 505fe287f..326ee544c 100644 --- a/rtos/pmsis/pmsis_bsp/src.mk +++ b/rtos/pmsis/pmsis_bsp/src.mk @@ -1,5 +1,5 @@ BSP_READFS_SRC = fs/read_fs/read_fs.c -BSP_HOSTFS_SRC = fs/host_fs/semihost.c fs/host_fs/host_fs.c +BSP_HOSTFS_SRC = fs/host_fs/host_fs.c BSP_LFS_SRC = fs/lfs/lfs.c fs/lfs/lfs_util.c fs/lfs/pi_lfs.c BSP_FS_SRC = fs/fs.c BSP_FLASH_SRC = flash/flash.c partition/partition.c partition/flash_partition.c \ diff --git a/rtos/pmsis/pmsis_bsp/zephyr/CMakeLists.txt b/rtos/pmsis/pmsis_bsp/zephyr/CMakeLists.txt index 69a891dcc..dcf7d9e70 100644 --- a/rtos/pmsis/pmsis_bsp/zephyr/CMakeLists.txt +++ b/rtos/pmsis/pmsis_bsp/zephyr/CMakeLists.txt @@ -8,7 +8,6 @@ zephyr_sources( ../fs/read_fs/read_fs.c ../fs/fs.c ../fs/host_fs/host_fs.c - ../fs/host_fs/semihost.c ../flash/flash.c ../flash/hyperflash/hyperflash.c ../ram/ram.c @@ -20,4 +19,4 @@ zephyr_compile_options( -DCONFIG_GAPUINO ) -zephyr_include_directories(../include) \ No newline at end of file +zephyr_include_directories(../include) diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c.c b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c.c index 566c8c84a..7bad66e79 100644 --- a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c.c 
+++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c.c @@ -342,7 +342,7 @@ static int __pi_i2c_prepare_write_read_buf(i2c_slave_data_t *slave_data, buffer[index++] = I2C_CMD_LEAD_START(1); buffer[index++] = I2C_CMD_LEAD_SEND_IMM(slave_data->slave_addrh|1); } - buffer[index++] = I2C_CMD_RPT(size1); + buffer[index++] = I2C_CMD_RPT(size1-1); // receive -1 byte because there is a "last" buffer[index++] = I2C_CMD_MISC_RECEIVE(1); buffer[index++] = I2C_CMD_MISC_RECEIVE_LAST(1); diff --git a/tools/autotiler_v3/Autotiler/AutoTilerLibTypes.h b/tools/autotiler_v3/Autotiler/AutoTilerLibTypes.h index e3d807233..4f4395b0d 100644 --- a/tools/autotiler_v3/Autotiler/AutoTilerLibTypes.h +++ b/tools/autotiler_v3/Autotiler/AutoTilerLibTypes.h @@ -49,6 +49,7 @@ typedef enum { KOP_DP_REDUCT_NOSCALE, KOP_DP_REDUCT_CHW2HWC, KOP_DP_REDUCT_IO, + KOP_DP_REDUCT_IO_NOSCALE, KOP_DP_REDUCT_MULBIAS, KOP_DP_REDUCT_IO_MULBIAS, KOP_DP_REDUCT_MULBIAS_SCALAR, diff --git a/tools/autotiler_v3/CNN_Generators/CNN_Copy_Generators.c b/tools/autotiler_v3/CNN_Generators/CNN_Copy_Generators.c index a6030494d..0f69bba89 100644 --- a/tools/autotiler_v3/CNN_Generators/CNN_Copy_Generators.c +++ b/tools/autotiler_v3/CNN_Generators/CNN_Copy_Generators.c @@ -721,6 +721,8 @@ static int CNN_MatTranspose_Internal( add_kernel_arg_func_t AddKArgDimFunc = AddKernelArgDim; cnn_kernel_arg_datatype_func_t CNN_ArgDtype = CNN_ArgDataType; + if (Size < 0) CNN_ArgDtype = CNN_ArgDataTypeUns; + if (Ctrl) { if (Ctrl->TileOrientation != -1) TileOrientation = (Ctrl->TileOrientation==0)?TILE_HOR:TILE_VER; if (Ctrl->ParallelFeatures != -1) ParFeat = Ctrl->ParallelFeatures; @@ -731,6 +733,7 @@ static int CNN_MatTranspose_Internal( if (HWC) { return CNN_3DTensorPermute(Name, Ctrl, Feat, Size, Width, Height, KOP_MATPERM_HWC2WHC); } + if (Size < 0) Size = -Size; unsigned long long int LayerOp = Width*Height*Feat*Size; unsigned long long int LayerBandwidth = 0; @@ -890,6 +893,11 @@ int CNN_3DTensorPermute( add_kernel_arg_func_t 
AddKArgDimFunc = AddKernelArgDim; cnn_kernel_arg_datatype_func_t CNN_ArgDtype = CNN_ArgDataType; + if (Size < 0) { + CNN_ArgDtype = CNN_ArgDataTypeUns; + Size = -Size; + } + if (Ctrl) { if (Ctrl->HWC != -1) HWC = Ctrl->HWC; if (Ctrl->FloatDump != -1&&Ctrl->FloatDump) AddKArgDimFunc = AddKernelFloatArgDim; diff --git a/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c b/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c index ef93bf8b1..0edf74202 100644 --- a/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c +++ b/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c @@ -316,8 +316,6 @@ Kernel_T *CNN_MM_ConvolutionNE16( char *ConvKerName=0, *PoolKerName=0, *ActKerName=0, *SetBiasKerName=0, *DPReductionKerName=0; int NeedFcx, NeedFcy, NeedDcx, NeedDcy, NeedScx, NeedScy, NeedFpx, NeedFpy, NeedDpx, NeedDpy, NeedSpx, NeedSpy; int UsedWidth, UsedHeight, UsedWc, UsedHc; - - unsigned int InTileCons = 16; int OutTileCons = 32; int StandAloneAct = (ActOper!=KOP_NONE); unsigned long long int LayerOp = 0; @@ -331,11 +329,18 @@ Kernel_T *CNN_MM_ConvolutionNE16( if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU || ActOper == KOP_SIGMOID || ActOper == KOP_TANH)) GenTilingError("CNN_MM_ConvolutionNE16 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_HSIGMOID, KOP_HSWISH, KOP_LEAKYRELU, KOP_SIGMOID or KOP_TANH", Name); - Wa |= O_NE16_LIN | O_LINEAR; + Wa |= O_NE16_LIN | O_LINEAR; + int Mode16 = (Abs(In_DataSize) == 2); + if (Mode16) { + Wa |= O_NE16_MODE16; + } + + unsigned int InTileCons = Mode16?8:16; + int NeedSetBias = Mode16; /* When there is a special activation (not supported by the accelerator itself), you need to streamout 32bits and do the act in the cluster but the ((*S) >> N) is done in the accelerator (KOP_DP_REDUCT_NOSCALE) */ int NeedReductNoScale = !(ActOper == KOP_RELU || 
ActOper == KOP_NONE); /* Also when in/out are 16bits you need to streamout 32bits but here the reduction step will be done in the cluster (KOP_DP_REDUCT) */ - int NeedReductScale = Abs(In_DataSize) == 2; + int NeedReductScale = Mode16; int NeedReduct = NeedReductNoScale || NeedReductScale; CNN_LayerOutputDim(Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, @@ -376,26 +381,38 @@ Kernel_T *CNN_MM_ConvolutionNE16( ConvKerName = CNN_FindMatchingKernelAttr(KOP_MM_CONV, KOP_NONE, ParFeat, CALL_NE16_KER, Abs(In_DataSize), Abs(Out_DataSize), Bias_DataSize, 0, 4, Fcx, Fcy, Dcx, Dcy, Scx, Scy, &NeedFcx, &NeedFcy, &NeedDcx, &NeedDcy, &NeedScx, &NeedScy, 0); - if (ConvKerName==0) GenTilingError("CNN_MM_ConvolutionNE16 Kernel: %s, Can't find a matching Convolution basic kernel", Name); - if (PoolOper==KOP_MAXPOOL) { - PoolKerName = CNN_FindMatchingKernelAttr(PoolOper, KOP_NONE, 1, CALL_HWC_KER, In_DataSize, 0, 0, 0, Out_DataSize, Fpx, Fpy, Dpx, Dpy, Spx, Spy, + if (ConvKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Can't find a matching Convolution basic kernel", Name); + + if (PoolOper==KOP_MAXPOOL || PoolOper==KOP_AVGPOOL) { + PoolKerName = CNN_FindMatchingKernelAttr(PoolOper, NeedReduct?KOP_NONE:ActOper, 1, CALL_HWC_KER, In_DataSize, 0, 0, 0, Out_DataSize, Fpx, Fpy, Dpx, Dpy, Spx, Spy, &NeedFpx, &NeedFpy, &NeedDpx, &NeedDpy, &NeedSpx, &NeedSpy, 0); - if (PoolKerName==0) GenTilingError("CNN_MM_ConvolutionNE16 Kernel: %s, Can't find a matching Pooling basic kernel", Name); + if (PoolKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Can't find a matching Pooling basic kernel", Name); + if (NeedReduct) { + DPReductionKerName = CNN_FindMatchingKernelAttr(NeedReductScale?KOP_DP_REDUCT_IO:KOP_DP_REDUCT_IO_NOSCALE, ActOper, 1, CALL_HWC_KER, + 4, 0, 0, 0, Out_DataSize, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + if (DPReductionKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 
Kernel: %s, Can't find a matching Reduction basic kernel %d %s", Name, Out_DataSize, NeedReductNoScale?"NoScale":"Scale"); + } + + } else if (NeedReduct) { + DPReductionKerName = CNN_FindMatchingKernelAttr(NeedReductScale?KOP_DP_REDUCT:KOP_DP_REDUCT_NOSCALE, ActOper, 1, CALL_HWC_KER, + 4, 0, 0, 0, Out_DataSize, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + if (DPReductionKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Can't find a matching Reduction basic kernel %d %s", Name, Out_DataSize, NeedReductNoScale?"NoScale":"Scale"); } - if (NeedReduct) { - DPReductionKerName = CNN_FindMatchingKernelAttr(NeedReductNoScale?KOP_DP_REDUCT_NOSCALE:KOP_DP_REDUCT, ActOper, 1, CALL_HWC_KER, 4, 0, 0, 0, Out_DataSize, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - if (DPReductionKerName==0) GenTilingError("CNN_MM_ConvolutionNE16 Kernel: %s, Can't find a matching Reduction basic kernel %d", Name, Out_DataSize); + if (NeedSetBias) { + SetBiasKerName = CNN_FindMatchingKernelAttr(KOP_SETBIAS, KOP_NONE, ParFeat, CALL_HWC_KER, Bias_DataSize,0,0,0,4, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); + if (SetBiasKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Can't find a matching SetBias basic kernel", Name); } + // If pooling you need an extra buffer for convout but reduction can be done in the accelerator - int NeedConvout = NeedReduct || PoolKerName; + int NeedConvout = NeedReduct || NeedSetBias || PoolKerName; unsigned int Cos = NeedConvout?4:1; if (Log) { printf("InFeat: %d%s, OutFeat: %d, InFeatCons: %d\n", InFeat, " Im2Col", OutFeat, InTileCons); printf("Conv => W: %4d, Pad:[%d,%d] PadT:[%d,%d] => Wc: %d, Filter:[%d,%d]x%d Bits\n", Width, PadInc[0], PadInc[1], PadIncT[0], PadIncT[1], Wc, Fcx, Fcy, Filter_DataSizeBits); printf(" => H: %4d, Pad:[%d,%d] PadT:[%d,%d] => Hc: %d\n", Height, PadInc[2], PadInc[3], PadIncT[2], PadIncT[3], Hc); - printf(" ConvOut_DataSize: %d\n", Cos); - printf("Pool => Wc: %4d, Pad:[%d,%d] => Wo: %d, Filter:[%d,%d]\n", UsedWc, PadInp[0], 
PadInp[1], Wo, Fpx, Fpy); + printf("%s -- >ConvOut_DataSize: %d\n", NeedConvout?"NeedConvOut":"NoConvOut", Cos); + printf("Pool => Wc: %4d, Pad:[%d,%d] => Wo: %d, Filter:[%d,%d] %d\n", UsedWc, PadInp[0], PadInp[1], Wo, Fpx, Fpy, Mode16); printf(" => Hc: %4d, Pad:[%d,%d] => Ho: %d\n", UsedHc, PadInp[2], PadInp[3], Ho); printf("OverlapC: %d\n", OverlapC); printf("OverlapP: %d\n", OverlapP); @@ -406,15 +423,14 @@ Kernel_T *CNN_MM_ConvolutionNE16( if (DPReductionKerName) printf("%20s: %s\n", "DPReductionKerName", DPReductionKerName); if (PoolKerName) printf("%20s: %s\n", "PoolKerName", PoolKerName); printf("Nb Oper : %lld\n", LayerOp); - printf("NeedConvout: %d\n", NeedConvout); + } /* User kernel C arguments */ CKernel_Arg_T **KCArgs = AllocateCArgs(7); Kernel_T *Kernel; - int StreamoutMode = 1; // Streamout = apply *Scale >> ScaleN - int Mode16 = (Abs(In_DataSize) == 2); - int Streamin = 0; // Streamin initialized at 0, set to 1 in the basic kernel if multiple chin tile + int StreamoutMode = !Mode16; // Streamout = apply *Scale >> ScaleN + int Streamin = Mode16; // Streamin initialized at 0, set to 1 in the basic kernel if multiple chin tile int FilterMode = 3; int LinearMode = 1; int StridedMode = 0; @@ -424,7 +440,7 @@ Kernel_T *CNN_MM_ConvolutionNE16( int QuantBits = (NeedReduct)?2:(Abs(Out_DataSize)==2?1:0); // 00: 8bit, 01: 16bit, 10: 32bit --> If tiling the channel input dimension you need to streamin (need 32 bits output) int QuantNoRect = (NeedReduct || (Out_DataSize>0))?1:0; int NormShift = 1; - int NormBias = 1; + int NormBias = !Mode16; unsigned int DEFAULT_NE16_JOB_CFG = NE16_DefaultConfig(Filter_DataSizeBits, Mode16, StreamoutMode, FilterMode, LinearMode, StridedMode, NormBits, Streamin, \ WOffsetCfg, QuantRightShift, QuantBits, QuantNoRect, NormShift, NormBias); @@ -440,9 +456,19 @@ Kernel_T *CNN_MM_ConvolutionNE16( TCArg(CNN_ArgDataType(1, 1,1), "ScaleN"), TCArg(CNN_ArgDataType(1, 1,1), "Infos") ), - Calls(6, + Calls(7, Call("NE16_Enable", 
LOC_D1_PROLOG, Bindings(0)), Call("NE16_SoftReset", LOC_D0, Bindings(0)), + SetBiasKerName?Call(SetBiasKerName, LOC_D0, + Bindings(6, + K_Arg("ConvOut", KER_ARG_TILE), /* SetBias output tile */ + K_Arg("ConvOut", KER_ARG_TILE_W), /* SetBias output tile width */ + K_Arg("ConvOut", KER_ARG_TILE_H), /* SetBias output tile height */ + ParFeat?K_ArgPar("ConvOut", KER_ARG_PARTILE_SIZE, D1):Imm(1), /* Number of output features in this tile */ + K_Arg("Bias", KER_ARG_TILE), /* SetBias Bias tile */ + K_TileOper("Infos", "char *", '@', AT_INF_BIASN) /* Bias Norm */ + ) + ):AT_NO_CALL, Call(ConvKerName, LOC_D0, Bindings(28, K_Arg("In", KER_ARG_TILE), /* Conv input tile */ @@ -452,11 +478,11 @@ Kernel_T *CNN_MM_ConvolutionNE16( K_Arg(NeedConvout?"ConvOut":"Out", KER_ARG_TILE), /* Conv output */ K_Arg("Scale", KER_ARG_TILE), /* Per channel scale tile */ K_Arg("ScaleN", KER_ARG_TILE), /* Per channel scale normalization tile */ - K_ArgPar("In", KER_ARG_PARTILE_SIZE, D0), /* Number of input features in this tile */ - K_ArgPar("In", KER_ARG_LOADEDPARTILE_SIZE, D0), /* Total Number of loaded input features in case of promotion */ + K_ArgPar("In", KER_ARG_PARTILE_SIZE, D0), /* Number of input features in this tile */ + K_ArgPar("In", KER_ARG_LOADEDPARTILE_SIZE, D0), /* Total Number of loaded input features in case of promotion */ K_Arg("In", KER_ARG_TILE_H), /* Conv input tile height */ K_Arg("In", KER_ARG_TILE_W), /* Conv input tile width */ - K_ArgPar(NeedConvout?"ConvOut":"Out", KER_ARG_PARTILE_SIZE, D1), /* Number of output features in this tile */ + K_ArgPar(NeedConvout?"ConvOut":"Out", KER_ARG_PARTILE_SIZE, D1), /* Number of output features in this tile */ K_Arg(NeedConvout?"ConvOut":"Out", KER_ARG_TILE_H), K_Arg(NeedConvout?"ConvOut":"Out", KER_ARG_TILE_W), Imm(PadValue), @@ -490,7 +516,7 @@ Kernel_T *CNN_MM_ConvolutionNE16( ), (PoolKerName==0)?AT_NO_CALL: Call(PoolKerName, LOC_D0_EPILOG, - Bindings(13, + Bindings(14, K_Arg("ConvOut", KER_ARG_TILE), K_Arg("ConvOut", 
KER_ARG_TILE_W), K_Arg("ConvOut", KER_ARG_TILE_H), @@ -503,22 +529,23 @@ Kernel_T *CNN_MM_ConvolutionNE16( K_Arg("Out", KER_ARG_TILE), /* Pooling output tile */ K_ArgPar("ConvOut", KER_ARG_PARTILE_SIZE, D1), /* In Features */ K_Arg("Out", KER_ARG_TILE_W), /* Output tile width */ - K_Arg("Out", KER_ARG_TILE_H) /* Output tile height */ + K_Arg("Out", KER_ARG_TILE_H), /* Output tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ ) ), Call("NE16_Disable", LOC_D1_EPILOG, Bindings(0)) ), KerArgs(9, - KerArgPV("In", KerArgSpace(2,T0,D0), O_IN|O_DB|O_HWC, Width, Height, UsedWidth, UsedHeight, PadIncT, PadInc, PadValue, Abs(In_DataSize), OverlapC, 0, 0, "In"), - KerArg ("ColBuff",KerArgSpace(1,T0), O_BUFF|O_NTILED, BuffS, 1, 1, 0, 0, 0, 0), - KerArg ("Bias", KerArgSpace(1,D1), O_IN|O_DB|O_CONST, 1, 1, Bs, 0, 0, 0, "Bias"), - KerArg ("Scale", KerArgSpace(1,D1), O_IN|O_DB|O_CONST, 1, 1, 1, 0, 0, 0, "Scale"), - KerArg ("ScaleN", KerArgSpace(1,D1), O_IN|O_DB|O_CONST, 1, 1, 1, 0, 0, 0, "ScaleN"), - KerArg ("Filter", KerArgSpace(1,D1), O_IN|O_DB|O_CONST|Wa, 1, WBuffSize, Ws, 0, 0, 0, "Filter"), + KerArgPV("In", KerArgSpace(2,T0,D0), O_IN|O_DB|O_HWC, Width, Height, UsedWidth, UsedHeight, PadIncT, PadInc, PadValue, Abs(In_DataSize), OverlapC, 0, 0, "In"), + KerArg ("ColBuff",KerArgSpace(1,T0), O_BUFF|O_NTILED, BuffS, 1, 1, 0, 0, 0, 0), + KerArg ("Bias", KerArgSpace(1,D1), O_IN|O_DB|O_CONST, 1, 1, Bs, 0, 0, 0, "Bias"), + KerArg ("Scale", KerArgSpace(1,D1), O_IN|O_DB|O_CONST, 1, 1, 1, 0, 0, 0, "Scale"), + KerArg ("ScaleN", KerArgSpace(1,D1), O_IN|O_DB|O_CONST, 1, 1, 1, 0, 0, 0, "ScaleN"), + KerArg ("Filter", KerArgSpace(1,D1), O_IN|O_DB|O_CONST|Wa, 1, WBuffSize, Ws, 0, 0, 0, "Filter"), NeedConvout? 
- KerArgP("ConvOut",KerArgSpace(2,T0,D1), O_BUFF|O_ONETILE|O_HWC,Wc, Hc, UsedWc, UsedHc, PadInp, PadInp, Cos, OverlapP, 0, 0, ""):AT_NO_KER_ARG, - KerArg ("Out", KerArgSpace(2,T0,D1), O_OUT|O_DB|O_HWC, Wo, Ho, Abs(Out_DataSize), 0, 0, 0, "Out"), - KerArg ("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|O_CONST, AT_INF_NE16_DIM, 1, 1, 0, 0, 0, "Infos") + KerArgP("ConvOut",KerArgSpace(2,T0,D1), O_BUFF|O_ONETILE|O_HWC,Wc, Hc, UsedWc, UsedHc, PadInp, PadInp, Cos, OverlapP, 0, 0, ""):AT_NO_KER_ARG, + KerArg ("Out", KerArgSpace(2,T0,D1), O_OUT|O_DB|O_HWC, Wo, Ho, Abs(Out_DataSize), 0, 0, 0, "Out"), + KerArg ("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|O_CONST, AT_INF_NE16_DIM,1, 1, 0, 0, 0, "Infos") ) ); if (Kernel) { @@ -680,14 +707,18 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( ConvKerName = CNN_FindMatchingKernelAttr(ConvOper, KOP_NONE, ParFeat, CALL_NE16_KER, Abs(In_DataSize), Abs(Out_DataSize), Bias_DataSize, 0, 4, Fcx, Fcy, Dcx, Dcy, Scx, Scy, &NeedFcx, &NeedFcy, &NeedDcx, &NeedDcy, &NeedScx, &NeedScy, 0); if (ConvKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Can't find a matching Convolution basic kernel", Name); - int BasicNE16Mode = 0; - // if ((Fcx==1 && Fcy==1) || (Fcx==3 && Fcy==3 && Scx==1 && Scy==1)) {BasicNE16Mode = 1; printf("BASIC MODE\n");} + if (PoolOper==KOP_MAXPOOL || PoolOper==KOP_AVGPOOL) { - PoolKerName = CNN_FindMatchingKernelAttr(PoolOper, KOP_NONE, 1, CALL_HWC_KER, In_DataSize, 0, 0, 0, Out_DataSize, Fpx, Fpy, Dpx, Dpy, Spx, Spy, + PoolKerName = CNN_FindMatchingKernelAttr(PoolOper, NeedReduct?KOP_NONE:ActOper, 1, CALL_HWC_KER, In_DataSize, 0, 0, 0, Out_DataSize, Fpx, Fpy, Dpx, Dpy, Spx, Spy, &NeedFpx, &NeedFpy, &NeedDpx, &NeedDpy, &NeedSpx, &NeedSpy, 0); if (PoolKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Can't find a matching Pooling basic kernel", Name); - } - if (NeedReduct) { + if (NeedReduct) { + DPReductionKerName = 
CNN_FindMatchingKernelAttr(NeedReductScale?KOP_DP_REDUCT_IO:KOP_DP_REDUCT_IO_NOSCALE, ActOper, 1, CALL_HWC_KER, + 4, 0, 0, 0, Out_DataSize, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + if (DPReductionKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Can't find a matching Reduction basic kernel %d %s", Name, Out_DataSize, NeedReductNoScale?"NoScale":"Scale"); + } + + } else if (NeedReduct) { DPReductionKerName = CNN_FindMatchingKernelAttr(NeedReductScale?KOP_DP_REDUCT:KOP_DP_REDUCT_NOSCALE, ActOper, 1, CALL_HWC_KER, 4, 0, 0, 0, Out_DataSize, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); if (DPReductionKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Can't find a matching Reduction basic kernel %d %s", Name, Out_DataSize, NeedReductNoScale?"NoScale":"Scale"); @@ -698,7 +729,7 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( } // If pooling you need an extra buffer for convout but reduction can be done in the accelerator - int NeedConvout = NeedReduct || PoolKerName || NeedSetBias; + int NeedConvout = NeedReduct || NeedSetBias || PoolKerName; unsigned int Cos = NeedConvout?4:1; if (Log) { @@ -750,7 +781,7 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( KCArgs[Ca++] = TCArg(CNN_ArgDataType(1, 1,1), "Infos"); /* User kernel kernel arguments */ - Object_T **KArgs = AllocateKerArgs(NeedConvout?(PoolKerName?9:8):7); + Object_T **KArgs = AllocateKerArgs(NeedConvout?8:7); int Ka=0; KArgs[Ka++] = KerArgPV("In", KerArgSpace(2,T0,D0), O_IN|O_DB|O_HWC, Width, Height, UsedWidth, UsedHeight, PadIncT, PadInc, PadValue, Abs(In_DataSize), OverlapC, 0, TileCons, "In"); if (MinTileDim && (MinTileDim > TileCons)) SetKerArgMinTileSize(KArgs[Ka-1], MinTileDim); @@ -764,8 +795,6 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( } if (NeedConvout) KArgs[Ka++] = KerArgP("ConvOut",KerArgSpace(2,T0,Os), O_BUFF|O_ONETILE|O_HWC, Wc, Hc, UsedWc, UsedHc, PadInp, PadInp, Cos, OverlapP, 0, 0, ""); - if (NeedConvout && PoolKerName) - KArgs[Ka++] = KerArgP("ActOut", 
KerArgSpace(2,T0,Os), O_BUFF|O_ONETILE|O_HWC, Wc, Hc, UsedWc, UsedHc, PadInp, PadInp, 1, OverlapP, 0, 0, ""); KArgs[Ka++] = KerArg ("Out", KerArgSpace(2,T0,Os), O_OUT|O_DB|O_HWC, Wo, Ho, Abs(Out_DataSize),0,0, 0, "Out"); KArgs[Ka++] = KerArg ("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|O_CONST, AT_INF_NE16_DIM, 1, 1, 0, 0, 0, "Infos"); @@ -776,34 +805,9 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( KernelIterSpace(3, IterParSpace(D1, OutFeat, OutTileCons), IterTiledSpace(T0), IterParSpace(D0|InFeatProp, InFeat, InTileCons))), TileOrientation|TILE_HWC, KCArgs, - Calls(10, + Calls(8, Call("NE16_Enable", DWConv?LOC_D0_PROLOG:LOC_D1_PROLOG, Bindings(0)), - BasicNE16Mode?Call("NE16_SoftReset", DWConv?LOC_D0_PROLOG:LOC_D1_PROLOG, Bindings(0)):AT_NO_CALL, - BasicNE16Mode?Call("NE16_PrepareJob", DWConv?LOC_D0_PROLOG:LOC_D1_PROLOG, - Bindings(21, - K_Arg("In", KER_ARG_FIRST_TILE), - K_Arg("In", KER_ARG_FIRST_TILE_W), - K_Arg("In", KER_ARG_FIRST_TILE_H), - K_Arg("In", KER_ARG_FIRST_TILE_PAD), - K_Arg("Filter", KER_ARG_FIRST_TILE), - K_Arg("Bias", KER_ARG_FIRST_TILE), - K_Arg("Out", KER_ARG_FIRST_TILE), - K_Arg("Scale", KER_ARG_FIRST_TILE), - K_Arg("ScaleN", KER_ARG_FIRST_TILE), - K_Arg("Out", KER_ARG_FIRST_TILE_W), - K_Arg("Out", KER_ARG_FIRST_TILE_H), - K_ArgPar("In", KER_ARG_PARTILE_SIZE, D0), - K_ArgPar(NeedConvout?"ConvOut":"Out", KER_ARG_FIRST_PARTILE_SIZE, Os), - Imm(Filter_DataSizeBits), - Imm(DEFAULT_NE16_JOB_CFG), - K_TileOper("Infos", "int *", '@', AT_INF_NE16_WOFFSET/4), - Imm(PadValue), - Imm(1), - K_ArgParOper("In", KER_ARG_PARTILE_DIM, D0, '=', 1), - Imm(0), - Imm(0) - ) - ):AT_NO_CALL, + Call("NE16_SoftReset", DWConv?LOC_D0_PROLOG:LOC_D1_PROLOG, Bindings(0)), SetBiasKerName?Call(SetBiasKerName, DWConv?LOC_LOOP:LOC_D0, Bindings(6, K_Arg("ConvOut", KER_ARG_TILE), /* SetBias output tile */ @@ -815,8 +819,8 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( ) ):AT_NO_CALL, Call("NE16_SoftReset", DWConv?LOC_LOOP:LOC_D0, Bindings(0)), - 
Call(BasicNE16Mode?"NE16_FireJob":ConvKerName, DWConv?LOC_LOOP:LOC_D0, - Bindings(BasicNE16Mode?0:26, + Call(ConvKerName, DWConv?LOC_LOOP:LOC_D0, + Bindings(26, K_Arg("In", KER_ARG_TILE), /* Conv input tile */ K_Arg("Filter", KER_ARG_TILE), /* Conv filter */ K_Arg("Bias", KER_ARG_TILE), /* Conv Bias when depth wise conv*/ @@ -827,7 +831,7 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( K_ArgPar("Filter", KER_ARG_LOADEDPARTILE_SIZE, D0), /* Total Number of loaded input features in case of promotion */ K_Arg("In", KER_ARG_TILE_H), /* Conv input tile height */ K_Arg("In", KER_ARG_TILE_W), /* Conv input tile width */ - K_ArgPar(NeedConvout?"ConvOut":"Out", KER_ARG_PARTILE_SIZE, Os), /* Number of output features in this tile */ + K_ArgPar(NeedConvout?"ConvOut":"Out", KER_ARG_PARTILE_SIZE, Os), /* Number of output features in this tile */ K_Arg(NeedConvout?"ConvOut":"Out", KER_ARG_TILE_H), K_Arg(NeedConvout?"ConvOut":"Out", KER_ARG_TILE_W), Imm(PadValue), @@ -845,36 +849,11 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( NeedDcy?Imm(Dcy):AT_IGNORE_ARG_BINDING /* Pooling Dy */ ) ), - BasicNE16Mode?Call("NE16_PrepareJob", DWConv?LOC_LOOP:LOC_D0, - Bindings(21, - K_Arg("In", KER_ARG_NEXT_TILE), - K_Arg("In", KER_ARG_NEXT_TILE_W), - K_Arg("In", KER_ARG_NEXT_TILE_H), - K_Arg("In", KER_ARG_NEXT_TILE_PAD), - K_Arg("Filter", KER_ARG_NEXT_TILE), - K_Arg("Bias", KER_ARG_NEXT_TILE), - K_Arg("Out", KER_ARG_NEXT_TILE), - K_Arg("Scale", KER_ARG_NEXT_TILE), - K_Arg("ScaleN", KER_ARG_NEXT_TILE), - K_Arg("Out", KER_ARG_NEXT_TILE_W), - K_Arg("Out", KER_ARG_NEXT_TILE_H), - K_ArgPar("In", KER_ARG_NEXT_PARTILE_SIZE, D0), - K_ArgPar(NeedConvout?"ConvOut":"Out", KER_ARG_NEXT_PARTILE_SIZE, Os), - Imm(Filter_DataSizeBits), - Imm(DEFAULT_NE16_JOB_CFG), - K_TileOper("Infos", "int *", '@', AT_INF_NE16_WOFFSET/4), - Imm(PadValue), - K_ArgPred("In", KER_ARG_TILEFIRST, D0), - K_ArgPred("In", KER_ARG_NEXT_TILELAST, D0), - K_ArgPred("In", KER_ARG_NEXT_TILELAST, T0), - Imm(0) - ) - ):AT_NO_CALL, 
(NeedReduct==0)?AT_NO_CALL: - Call(DPReductionKerName, DWConv?LOC_LOOP_EPILOG:LOC_D0_EPILOG, /* DP Reduction also take care of optional activation */ + Call(DPReductionKerName, DWConv?LOC_LOOP_EPILOG:LOC_D0_EPILOG, /* DPReduction also take care of optional activation */ Bindings(8, K_Arg("ConvOut", KER_ARG_TILE), /* Double precision input tile */ - K_Arg(PoolOper?"ActOut":"Out", KER_ARG_TILE), /* Single precision output tile, warning use IO kernel when In=Out */ + K_Arg(PoolOper?"ConvOut":"Out", KER_ARG_TILE), /* Single precision output tile, warning use IO kernel when In=Out */ ParFeat?K_ArgPar("ConvOut", KER_ARG_PARTILE_SIZE, Os):Imm(1), /* Input tile Number of features */ K_Arg("ConvOut", KER_ARG_TILE_W), /* Input tile width */ K_Arg("ConvOut", KER_ARG_TILE_H), /* Input tile height */ @@ -885,20 +864,21 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( ), (PoolKerName==0)?AT_NO_CALL: Call(PoolKerName, DWConv?LOC_LOOP:LOC_D0_EPILOG, - Bindings(13, - K_Arg(NeedReduct?"ActOut":"ConvOut", KER_ARG_TILE), - K_Arg(NeedReduct?"ActOut":"ConvOut", KER_ARG_TILE_W), - K_Arg(NeedReduct?"ActOut":"ConvOut", KER_ARG_TILE_H), + Bindings(14, + K_Arg("ConvOut", KER_ARG_TILE), + K_Arg("ConvOut", KER_ARG_TILE_W), + K_Arg("ConvOut", KER_ARG_TILE_H), NeedFpx?Imm(Fpx):AT_IGNORE_ARG_BINDING, /* Pool Fx */ NeedFpy?Imm(Fpy):AT_IGNORE_ARG_BINDING, /* Pool Fy */ NeedSpx?Imm(Spx):AT_IGNORE_ARG_BINDING, /* Pool Stridex */ NeedSpy?Imm(Spy):AT_IGNORE_ARG_BINDING, /* Pool Stridey */ - K_ArgPred(NeedReduct?"ActOut":"ConvOut", KER_ARG_TILEFIRST, T0), /* First Tile */ - K_Arg(NeedReduct?"ActOut":"ConvOut", KER_ARG_TILE_PAD), /* Pool Padding */ + K_ArgPred("ConvOut", KER_ARG_TILEFIRST, T0), /* First Tile */ + K_Arg("ConvOut", KER_ARG_TILE_PAD), /* Pool Padding */ K_Arg("Out", KER_ARG_TILE), /* Pooling output tile */ - K_ArgPar(NeedReduct?"ActOut":"ConvOut", KER_ARG_PARTILE_SIZE, D1), /* In Features */ + K_ArgPar("ConvOut", KER_ARG_PARTILE_SIZE, D1), /* In Features */ K_Arg("Out", KER_ARG_TILE_W), /* 
Output tile width */ - K_Arg("Out", KER_ARG_TILE_H) /* Output tile height */ + K_Arg("Out", KER_ARG_TILE_H), /* Output tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ ) ), Call("NE16_Disable", DWConv?LOC_D0_EPILOG:LOC_D1_EPILOG, Bindings(0)) @@ -971,6 +951,10 @@ int CNN_ConvolutionNE16( CNN_LinearAct_NE16(Name, Ctrl, In_DataSize, Out_DataSize, Bias_DataSize, Scale_DataSize, Filter_DataSizeBits, InFeat, OutFeat, KOP_LINEAR, ActOper); return 1; } + int HWC = 0; + if (Ctrl) { + if (Ctrl->HWC != -1) HWC = Ctrl->HWC; + } unsigned int MinTile; unsigned int InTileCons; if (PoolOper==KOP_NONE) { @@ -987,6 +971,14 @@ int CNN_ConvolutionNE16( unsigned int Sol1TileCons = TileCons, Sol2TileCons = TileCons; AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); + if (HWC) { + printf("---------------------------------------------------------- CNN_ConvolutionNE16 MM ---------------------------------------------------------------------------\n"); + Ker = CNN_MM_ConvolutionNE16(Name, Ctrl, + In_DataSize, Out_DataSize, Bias_DataSize, Scale_DataSize, Filter_DataSizeBits, InFeat, OutFeat, Width, Height, + ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PadValue, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, ActOper); + if (Ker) return 1; + else printf("---------------------------------------------------------- MM NO SOLUTION FOUND ---------------------------------------------------------------------------\n"); + } printf("----------------------------------------------------------CNN_ConvolutionNE16------------------------------------------------------------------------------\n"); Ker = CNN_ConvolutionNE16_Internal(Name, Ctrl, In_DataSize, Out_DataSize, Bias_DataSize, Scale_DataSize, Filter_DataSizeBits, InFeat, OutFeat, Width, Height, diff --git a/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c b/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c index 44b3c2891..2298074f7 100644 --- a/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c +++ 
b/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c @@ -104,7 +104,7 @@ void LoadCNN_SQ8_Library() ); LibKernelTemplate("Ker_MM_Pool_SQ8_T", - CArgs(13, + CArgs(14, TCArg("signed char * __restrict__", "In"), TCArg("unsigned short int", "W"), TCArg("unsigned short int", "H"), @@ -117,11 +117,12 @@ void LoadCNN_SQ8_Library() TCArg("signed char * __restrict__", "Out"), TCArg("unsigned short int", "Feat"), TCArg("unsigned short int", "Wo"), - TCArg("unsigned short int", "Ho") + TCArg("unsigned short int", "Ho"), + TCArg("signed char * __restrict__", "Infos") ) ); LibKernelTemplate("Ker_MM_Pool_USQ8_T", - CArgs(13, + CArgs(14, TCArg("unsigned char * __restrict__", "In"), TCArg("unsigned short int", "W"), TCArg("unsigned short int", "H"), @@ -134,7 +135,8 @@ void LoadCNN_SQ8_Library() TCArg("unsigned char * __restrict__", "Out"), TCArg("unsigned short int", "Feat"), TCArg("unsigned short int", "Wo"), - TCArg("unsigned short int", "Ho") + TCArg("unsigned short int", "Ho"), + TCArg("signed char * __restrict__", "Infos") ) ); LibKernelTemplate("KerConvLinReduct_SQ8_T", @@ -558,12 +560,49 @@ void LoadCNN_SQ8_Library() LibKernel("KerParPoolNxMStrideSxSy_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); - LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", - CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_NONE), 1, - CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); - LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", - CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_NONE), 1, - CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + 
LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_ReLU_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_ReLUN_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_ReLUM_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_ReLUMN_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_LeakyReLU_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_HSwish_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_HSigmoid_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_Sigmoid_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_Tanh_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + + 
LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_ReLU_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_ReLUN_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_ReLUM_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_ReLUMN_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_LeakyReLU_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_HSwish_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_HSigmoid_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_Sigmoid_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + 
LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_Tanh_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); + + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_ReLU_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_ReLUN_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_ReLUM_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_ReLUMN_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_LeakyReLU_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_HSwish_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_HSigmoid_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(-1,0,0,0,-1), 
-1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_Sigmoid_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_Tanh_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_MAXPOOL), CNN_OperList(1, KOP_TANH), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_ReLU_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_ReLUN_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_ReLUM_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_ReLUMN_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_LeakyReLU_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_HSwish_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_HSWISH), 1, 
CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_HSigmoid_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_Sigmoid_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); + LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_Tanh_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", CNN_Match(CNN_OperList(1, KOP_AVGPOOL), CNN_OperList(1, KOP_TANH), 1, CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); /* Global Pooling (Max or Avg) with tensor centric scaling and optional ReLU or ReLUN activation */ LibKernel("KerParGlobalMaxPoolFullFeat_SQ8", CALL_PARALLEL, 0, "KerGlobalPool_SQ8_T", CNN_Match(CNN_OperList(1, KOP_GLOBAL_MAXPOOL), CNN_OperList(1, KOP_NONE), 1, @@ -1076,47 +1115,38 @@ void LoadCNN_SQ8_Library() LibKernel("KerConvDWNxMDxDyStrideSxSyB32_SQ8", CALL_PARALLEL, 0, "KerConv_SQ8_T",CNN_Match(CNN_OperList(1, KOP_CONV_DW), 0, 0, CNN_Type(1,1,4,0,4), -1,-1,-1,-1,-1,-1)); /* Convolution, Linear output reduction with per channel scaling and optional activation. 
Out != In and In Place (IO) */ - LibKernel("KerReduct_CC_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_NONE), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_ReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELU), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_ReLUN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUN), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_ReLUM_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUM), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUMN), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_HSIGMOID), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_HSwish_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_HSWISH), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_LEAKYRELU), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_SIGMOID), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_Tanh_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_TANH), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - - 
LibKernel("KerReductIO_CC_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_NONE), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReductIO_CC_ReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELU), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReductIO_CC_ReLUN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUN), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReductIO_CC_ReLUM_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUM), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReductIO_CC_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUMN), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReductIO_CC_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_HSIGMOID), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReductIO_CC_HSwish_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_HSWISH), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReductIO_CC_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_LEAKYRELU), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReductIO_CC_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_SIGMOID), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("KerReductIO_CC_Tanh_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_TANH), 0, - CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + 
LibKernel("KerReduct_CC_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_NONE), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_ReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELU), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_ReLUN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUN), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_ReLUM_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUM), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUMN), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_HSIGMOID), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_HSwish_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_HSWISH), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_LEAKYRELU), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_SIGMOID), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_Tanh_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_TANH), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + + LibKernel("KerReductIO_CC_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", 
CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_NONE), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_ReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELU), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_ReLUN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUN), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_ReLUM_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUM), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUMN), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_HSIGMOID), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HSwish_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_HSWISH), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_LEAKYRELU), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_SIGMOID), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_Tanh_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_TANH), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + + LibKernel("KerReductIO_CC_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", 
CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_NONE), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HWC_ReLU_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELU), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HWC_ReLUN_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUN), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HWC_ReLUM_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUM), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HWC_ReLUMN_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_RELUMN), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HWC_HSigmoid_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_HSIGMOID), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HWC_HSwish_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_HSWISH), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HWC_LeakyReLU_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_LEAKYRELU), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HWC_Sigmoid_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, KOP_SIGMOID), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_HWC_Tanh_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO), CNN_OperList(1, 
KOP_TANH), 0, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); /* Activation and reduct for CHW input and HWC output Layer Layout */ LibKernel("KerParReduct_CC_CHW2HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_CHW2HWC), CNN_OperList(1, KOP_NONE), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); @@ -1221,6 +1251,54 @@ void LoadCNN_SQ8_Library() LibKernel("KerReduct_CC_NoScale_Sigmoid_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); LibKernel("KerReduct_CC_NoScale_Tanh_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_TANH), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + + /* Activation and Reduct without PerChannel Scaling */ + LibKernel("KerReductIO_CC_NoScale_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_NONE), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLU_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELU), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUN_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUM_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUMN_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(4,0,0,0,1), 
0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_HSigmoid_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_HSwish_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_LeakyReLU_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_Sigmoid_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_Tanh_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_TANH), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); + + LibKernel("KerReductIO_CC_NoScale_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_NONE), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLU_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELU), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUN_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUM_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), 
CNN_OperList(1, KOP_RELUM), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUMN_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_HSigmoid_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_HSwish_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_LeakyReLU_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_Sigmoid_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_Tanh_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_TANH), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); + + /* Unsigned */ + LibKernel("KerReductIO_CC_NoScale_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_NONE), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLU_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELU), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUN_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, 
"KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUM_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUMN_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_HSigmoid_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_HSwish_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_LeakyReLU_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_Sigmoid_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_Tanh_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_TANH), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + + LibKernel("KerReductIO_CC_NoScale_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_NONE), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + 
LibKernel("KerReductIO_CC_NoScale_ReLU_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELU), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUN_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUM_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_ReLUMN_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_HSigmoid_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_HSwish_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_LeakyReLU_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_Sigmoid_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReductIO_CC_NoScale_Tanh_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, 
KOP_DP_REDUCT_IO_NOSCALE), CNN_OperList(1, KOP_TANH), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + + /* Activations with tensor centric scaling */ LibKernel("Ker_ActNone_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_ACT_NONE), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); LibKernel("Ker_ReLU_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_RELU), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); @@ -1255,13 +1333,6 @@ void LoadCNN_SQ8_Library() LibKernel("KerPoolNxMStrideSxSy_ReLUN_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_RELUN), 0, CNN_Type(1,0,0,0,1), -1,-1,1,1,-1,-1)); - /* Unsigned int8 input/output functions for NE16 */ - LibKernel("KerParMaxPoolNxMStrideSxSy_HWC_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", - CNN_Match(CNN_OperList(1, KOP_MAXPOOL), 0, 1, - CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); - LibKernel("KerParAvgPoolNxMStrideSxSy_HWC_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Pool_USQ8_T", - CNN_Match(CNN_OperList(1, KOP_AVGPOOL), 0, 1, - CNN_Type(-1,0,0,0,-1), -1,-1,1,1,-1,-1)); LoadCNN_Copy_Library(); } @@ -1549,7 +1620,7 @@ static Kernel_T *CNN_MM_ConvolutionPoolAct_SQ8_Internal( Imm((ActOper==KOP_NONE)), /* Scaling when no activation */ K_Arg("Infos", KER_ARG_TILE) /* Infos */ ): - Bindings(13, + Bindings(14, K_Arg("ConvOut", KER_ARG_TILE), /* Input tile */ K_Arg("ConvOut", KER_ARG_TILE_W), /* Input tile width */ K_Arg("ConvOut", KER_ARG_TILE_H), /* Input tile height */ @@ -1562,7 +1633,8 @@ static Kernel_T *CNN_MM_ConvolutionPoolAct_SQ8_Internal( K_Arg("Out", KER_ARG_TILE), /* Pooling output tile */ K_ArgPar("ConvOut", KER_ARG_PARTILE_SIZE, Os), /* In Features */ K_Arg("Out", KER_ARG_TILE_W), /* Output tile width */ - K_Arg("Out", KER_ARG_TILE_H) /* Output tile height */ + K_Arg("Out", KER_ARG_TILE_H), /* Output tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ ) ), (ActKerName==0)?AT_NO_CALL: @@ -1859,7 
+1931,7 @@ static Kernel_T *CNN_HWC_DWConvolutionPoolAct_SQ8_Internal( ), (PoolKerName==0)?AT_NO_CALL: Call(PoolKerName, LOC_D0, - Bindings(13, + Bindings(14, K_Arg("ConvOut", KER_ARG_TILE), /* Input tile */ K_Arg("ConvOut", KER_ARG_TILE_W), /* Input tile width */ K_Arg("ConvOut", KER_ARG_TILE_H), /* Input tile height */ @@ -1872,7 +1944,8 @@ static Kernel_T *CNN_HWC_DWConvolutionPoolAct_SQ8_Internal( K_Arg("Out", KER_ARG_TILE), /* Pooling output tile */ K_ArgPar("ConvOut", KER_ARG_PARTILE_SIZE, D0), /* In Features */ K_Arg("Out", KER_ARG_TILE_W), /* Output tile width */ - K_Arg("Out", KER_ARG_TILE_H) /* Output tile height */ + K_Arg("Out", KER_ARG_TILE_H), /* Output tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ ) ), (ActKerName==0)?AT_NO_CALL: @@ -2651,7 +2724,7 @@ Kernel_T * CNN_PoolAct_SQ8_Internal( Imm((ActOper==KOP_NONE)), /* Scaling when no activation */ K_Arg("Infos", KER_ARG_TILE) /* Infos */ ): - Bindings(13, + Bindings(14, K_Arg("In", KER_ARG_TILE), /* Input tile */ K_Arg("In", KER_ARG_TILE_W), /* Input tile width */ K_Arg("In", KER_ARG_TILE_H), /* Input tile height */ @@ -2664,7 +2737,8 @@ Kernel_T * CNN_PoolAct_SQ8_Internal( K_Arg("Out", KER_ARG_TILE), /* Pooling output tile */ ParFeat?K_ArgPar("In", KER_ARG_PARTILE_SIZE, D0):Imm(1), /* In Features */ K_Arg("Out", KER_ARG_TILE_W), /* Output tile width */ - K_Arg("Out", KER_ARG_TILE_H) /* Output tile height */ + K_Arg("Out", KER_ARG_TILE_H), /* Output tile height */ + K_Arg("Infos", KER_ARG_TILE) /* Infos */ ) ), @@ -3378,13 +3452,13 @@ static Kernel_T * CNN_SoftMax2D_SQ8_Internal( ), (HWC==0)? 
KerArgs(3, - KerArg("In", KerArgSpace(2,D0,T0), OBJ_BUFFER_IN, 1, Dim, 1, 0, 0, 8, "In"), - KerArg("Out", KerArgSpace(2,D0,T0), OBJ_BUFFER_OUT, 1, Dim, OutBytes, 0, 0, 0, "Out"), + KerArg("In", KerArgSpace(2,D0,T0), OBJ_IN_DB, 1, Dim, 1, 0, 0, 8, "In"), + KerArg("Out", KerArgSpace(2,D0,T0), OBJ_OUT_DB, 1, Dim, OutBytes, 0, 0, 0, "Out"), KerArg("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|O_CONST, AT_INF_DIM, 1, 1, 0, 0, 0, "Infos") ): KerArgs(3, - KerArg("In", KerArgSpace(2,T0,D0), OBJ_BUFFER_IN, 1, Dim, 1, 0, 0, 8, "In"), - KerArg("Out", KerArgSpace(2,T0,D0), OBJ_BUFFER_OUT, 1, Dim, OutBytes, 0, 0, 0, "Out"), + KerArg("In", KerArgSpace(2,T0,D0), OBJ_IN_DB, 1, Dim, 1, 0, 0, 8, "In"), + KerArg("Out", KerArgSpace(2,T0,D0), OBJ_OUT_DB, 1, Dim, OutBytes, 0, 0, 0, "Out"), KerArg("Infos", KerArgSpace(1,T0), O_IN|O_BUFF|O_NTILED|O_CONST, AT_INF_DIM, 1, 1, 0, 0, 0, "Infos") ) ); @@ -4007,44 +4081,24 @@ Kernel_T *CNN_MatMulAct_SQ8_Internal( ), ColFirst? KerArgs(8, - !Transposed? - KerArg("KerBuff",KerArgSpace(1, T1), O_BUFF|O_NTILED, Nbuff*ColM1, 1, 1, 0, 0, 0, 0):AT_NO_KER_ARG, - (NBatches>1)? - KerArg("In1", KerArgSpace(2,D0,T0), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"): - KerArg("In1", KerArgSpace(1, T0), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), - (NBatches>1)? - KerArg("In2", KerArgSpace(2,D0,T1), O_IN|O_DB, ColM2, LineM2, 1, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"): - KerArg("In2", KerArgSpace(1, T1), O_IN|O_DB, ColM2, LineM2, 1, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), - !NoBias? - KerArg("Bias", KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST, 1, SAxis, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"):AT_NO_KER_ARG, - (NBatches>1)? 
- KerArg("Out", KerArgSpace(2,D0,T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, 0, "Out"): - KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, 0, "Out"), - !ScaleScalar? - KerArg("Scale", KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "Scale"):AT_NO_KER_ARG, - !ScaleScalar? - KerArg("ScaleN", KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "ScaleN"):AT_NO_KER_ARG, - KerArg("Infos", KerArgSpace(1, T1), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos") + !Transposed?KerArg("KerBuff",KerArgSpace(1, T1), O_BUFF|O_NTILED, Nbuff*ColM1, 1, 1, 0, 0, 0, 0):AT_NO_KER_ARG, + KerArg("In1", KerArgSpace(1, T0), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), + KerArg("In2", KerArgSpace(1, T1), O_IN|O_DB, ColM2, LineM2, 1, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), + !NoBias?KerArg("Bias", KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST, 1, SAxis, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"):AT_NO_KER_ARG, + KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, 0, "Out"), + !ScaleScalar?KerArg("Scale", KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "Scale"):AT_NO_KER_ARG, + !ScaleScalar?KerArg("ScaleN", KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "ScaleN"):AT_NO_KER_ARG, + KerArg("Infos", KerArgSpace(1, T1), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos") ): KerArgs(8, - !Transposed? - KerArg("KerBuff",KerArgSpace(1, T0), O_BUFF|O_NTILED, Nbuff*ColM1, 1, 1, 0, 0, 0, 0):AT_NO_KER_ARG, - (NBatches>1)? - KerArg("In1", KerArgSpace(2,D0,T1), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"): - KerArg("In1", KerArgSpace(1, T1), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), - (NBatches>1)? 
- KerArg("In2", KerArgSpace(2,D0,T0), O_IN|O_DB, ColM2, LineM2, 1, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"): - KerArg("In2", KerArgSpace(1, T0), O_IN|O_DB, ColM2, LineM2, 1, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), - !NoBias? - KerArg("Bias", KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST, 1, SAxis, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"):AT_NO_KER_ARG, - (NBatches>1)? - KerArg("Out", KerArgSpace(2,D0,T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Out"): - KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Out"), - !ScaleScalar? - KerArg("Scale", KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "Scale"):AT_NO_KER_ARG, - !ScaleScalar? - KerArg("ScaleN", KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "ScaleN"):AT_NO_KER_ARG, - KerArg("Infos", KerArgSpace(1, T0), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos") + !Transposed?KerArg("KerBuff",KerArgSpace(1, T0), O_BUFF|O_NTILED, Nbuff*ColM1, 1, 1, 0, 0, 0, 0):AT_NO_KER_ARG, + KerArg("In1", KerArgSpace(1, T1), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), + KerArg("In2", KerArgSpace(1, T0), O_IN|O_DB, ColM2, LineM2, 1, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), + !NoBias?KerArg("Bias", KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST, 1, SAxis, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"):AT_NO_KER_ARG, + KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Out"), + !ScaleScalar?KerArg("Scale", KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "Scale"):AT_NO_KER_ARG, + !ScaleScalar?KerArg("ScaleN", KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "ScaleN"):AT_NO_KER_ARG, + KerArg("Infos", KerArgSpace(1, T0), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos") ) ); if (Kernel) { diff --git a/tools/autotiler_v3/CNN_Generators_SQ8/RNN_Generators_SQ8.c 
b/tools/autotiler_v3/CNN_Generators_SQ8/RNN_Generators_SQ8.c index ca31fbf57..527c10588 100644 --- a/tools/autotiler_v3/CNN_Generators_SQ8/RNN_Generators_SQ8.c +++ b/tools/autotiler_v3/CNN_Generators_SQ8/RNN_Generators_SQ8.c @@ -132,7 +132,7 @@ int RNN_Sequence(int Nc, int K0, int K1, int *n1, int *n2, int *n3, int *n2_io) return ((N1!=0) + (N2!=0) + (N3!=0)); } -static Kernel_T *RNN_Stack_Seq_SQ8( +static Kernel_T *RNN_Stack_Seq_SQ8_Internal( char *Name, CNN_GenControl_T *Ctrl, char *RNNKerName, @@ -257,6 +257,45 @@ static Kernel_T *RNN_Stack_Seq_SQ8( return Kernel; } +static Kernel_T *RNN_Stack_Seq_SQ8( + char *Name, + CNN_GenControl_T *Ctrl, + char *RNNKerName, + + int BiasDataSize, + int FeatDataSize, + + int AlwaysReset, + int NCells, + int DimState, + int DimIn, + int UseIn, + int ExposeSequence, + int Buffer, + int FirstSeq, + int LastSeq, + int Revert, + int Dynamic + ) +{ + Kernel_T *Ker = 0; + + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); + Ker = RNN_Stack_Seq_SQ8_Internal(Name, Ctrl, RNNKerName, BiasDataSize, FeatDataSize, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, Buffer, FirstSeq, LastSeq, Revert, Dynamic); + if (Ker) return Ker; + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON); + + printf("\n\n=============================== Solution not found for %s: Trying PARALLELFEATURES=0 ===============================\n\n", Name); + /* If solution not found try with ParallelFeature = 0 */ + CNN_GenControl_T InternalCtrl; + if (!Ctrl) CNN_InitGenCtrl(&InternalCtrl); + else InternalCtrl = *Ctrl; + CNN_SetGenCtrl(&InternalCtrl, "PARALLELFEATURES", AT_OPT_VAL(0)); + Ker = RNN_Stack_Seq_SQ8_Internal(Name, &InternalCtrl, RNNKerName, BiasDataSize, FeatDataSize, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, Buffer, FirstSeq, LastSeq, Revert, Dynamic); + return Ker; +} + + int RNN_Stack_SQ8( char *Name, CNN_GenControl_T *Ctrl, @@ -485,7 +524,7 @@ int RNN_Stack_SQ8( } -static int LSTM_Stack_Seq_SQ8( +static int 
LSTM_Stack_Seq_SQ8_Internal( char *Name, CNN_GenControl_T *Ctrl, char *LSTMKerName, @@ -660,6 +699,44 @@ static int LSTM_Stack_Seq_SQ8( return (Kernel!=0); } +static int LSTM_Stack_Seq_SQ8( + char *Name, + CNN_GenControl_T *Ctrl, + char *LSTMKerName, + + int BiasDataSize, + int FeatDataSize, + + int AlwaysReset, + int NCells, + int DimState, + int DimIn, + int UseIn, + int ExposeSequence, + int FirstSeq, + int LastSeq, + int Revert, + int Dynamic + ) +{ + int Ker = 0; + + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); + Ker = LSTM_Stack_Seq_SQ8_Internal(Name, Ctrl, LSTMKerName, BiasDataSize, FeatDataSize, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, FirstSeq, LastSeq, Revert, Dynamic); + if (Ker) return 1; + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON); + + printf("\n\n=============================== Solution not found for %s: Trying PARALLELFEATURES=0 ===============================\n\n", Name); + /* If solution not found try with ParallelFeature = 0 */ + CNN_GenControl_T InternalCtrl; + if (!Ctrl) CNN_InitGenCtrl(&InternalCtrl); + else InternalCtrl = *Ctrl; + CNN_SetGenCtrl(&InternalCtrl, "PARALLELFEATURES", AT_OPT_VAL(0)); + Ker = LSTM_Stack_Seq_SQ8_Internal(Name, &InternalCtrl, LSTMKerName, BiasDataSize, FeatDataSize, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, FirstSeq, LastSeq, Revert, Dynamic); + return Ker; +} + + int LSTM_Stack_SQ8( char *Name, CNN_GenControl_T *Ctrl, @@ -904,7 +981,7 @@ int LSTM_Stack_SQ8( } -static int GRU_Stack_Seq_SQ8( +static int GRU_Stack_Seq_SQ8_Internal( char *Name, CNN_GenControl_T *Ctrl, char *GRUKerName, @@ -1064,6 +1141,43 @@ static int GRU_Stack_Seq_SQ8( return (Kernel!=0); } +static int GRU_Stack_Seq_SQ8( + char *Name, + CNN_GenControl_T *Ctrl, + char *GRUKerName, + + int BiasDataSize, + int FeatDataSize, + + int AlwaysReset, + int NCells, + int DimState, + int DimIn, + int UseIn, + int ExposeSequence, + int FirstSeq, + int LastSeq, + int Revert, + int Dynamic + ) +{ + 
int Ker = 0; + + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); + Ker = GRU_Stack_Seq_SQ8_Internal(Name, Ctrl, GRUKerName, BiasDataSize, FeatDataSize, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, FirstSeq, LastSeq, Revert, Dynamic); + if (Ker) return 1; + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON); + + printf("\n\n=============================== Solution not found for %s: Trying PARALLELFEATURES=0 ===============================\n\n", Name); + /* If solution not found try with ParallelFeature = 0 */ + CNN_GenControl_T InternalCtrl; + if (!Ctrl) CNN_InitGenCtrl(&InternalCtrl); + else InternalCtrl = *Ctrl; + CNN_SetGenCtrl(&InternalCtrl, "PARALLELFEATURES", AT_OPT_VAL(0)); + Ker = GRU_Stack_Seq_SQ8_Internal(Name, &InternalCtrl, GRUKerName, BiasDataSize, FeatDataSize, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, FirstSeq, LastSeq, Revert, Dynamic); + return Ker; +} + int GRU_Stack_SQ8( char *Name, CNN_GenControl_T *Ctrl, diff --git a/tools/autotiler_v3/CNN_Generators_fp16/RNN_Generators_fp16.c b/tools/autotiler_v3/CNN_Generators_fp16/RNN_Generators_fp16.c index 5a6ca45dc..435e965e6 100644 --- a/tools/autotiler_v3/CNN_Generators_fp16/RNN_Generators_fp16.c +++ b/tools/autotiler_v3/CNN_Generators_fp16/RNN_Generators_fp16.c @@ -106,7 +106,7 @@ int RNN_Sequence_fp16(int Nc, int K0, int K1, int *n1, int *n2, int *n3, int *n2 return ((N1!=0) + (N2!=0) + (N3!=0)); } -static int RNN_Stack_Seq_fp16( +static int RNN_Stack_Seq_fp16_Internal( char *Name, CNN_GenControl_T *Ctrl, char *RNNKerName, @@ -222,6 +222,40 @@ static int RNN_Stack_Seq_fp16( return (Kernel!=0); } +static int RNN_Stack_Seq_fp16( + char *Name, + CNN_GenControl_T *Ctrl, + char *RNNKerName, + + int AlwaysReset, + int NCells, + int DimState, + int DimIn, + int UseIn, + int ExposeSequence, + int Buffer, + int FirstSeq, + int LastSeq, + int Revert, + int Dynamic) +{ + int Ker = 0; + + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); + Ker = 
RNN_Stack_Seq_fp16_Internal(Name, Ctrl, RNNKerName, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, Buffer, FirstSeq, LastSeq, Revert, Dynamic); + if (Ker) return 1; + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON); + + printf("\n\n=============================== Solution not found for %s: Trying PARALLELFEATURES=0 ===============================\n\n", Name); + /* If solution not found try with ParallelFeature = 0 */ + CNN_GenControl_T InternalCtrl; + if (!Ctrl) CNN_InitGenCtrl(&InternalCtrl); + else InternalCtrl = *Ctrl; + CNN_SetGenCtrl(&InternalCtrl, "PARALLELFEATURES", AT_OPT_VAL(0)); + Ker = RNN_Stack_Seq_fp16_Internal(Name, &InternalCtrl, RNNKerName, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, Buffer, FirstSeq, LastSeq, Revert, Dynamic); + return Ker; +} + int RNN_Stack_fp16( char *Name, CNN_GenControl_T *Ctrl, @@ -400,7 +434,7 @@ int RNN_Stack_fp16( } -static int LSTM_Stack_Seq_fp16( +static int LSTM_Stack_Seq_fp16_Internal( char *Name, CNN_GenControl_T *Ctrl, char *LSTMKerName, @@ -568,6 +602,39 @@ static int LSTM_Stack_Seq_fp16( return (Kernel!=0); } +static int LSTM_Stack_Seq_fp16( + char *Name, + CNN_GenControl_T *Ctrl, + char *LSTMKerName, + + int AlwaysReset, + int NCells, + int DimState, + int DimIn, + int UseIn, + int ExposeSequence, + int FirstSeq, + int LastSeq, + int Revert, + int Dynamic) +{ + int Ker = 0; + + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); + Ker = LSTM_Stack_Seq_fp16_Internal(Name, Ctrl, LSTMKerName, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, FirstSeq, LastSeq, Revert, Dynamic); + if (Ker) return 1; + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON); + + printf("\n\n=============================== Solution not found for %s: Trying PARALLELFEATURES=0 ===============================\n\n", Name); + /* If solution not found try with ParallelFeature = 0 */ + CNN_GenControl_T InternalCtrl; + if (!Ctrl) CNN_InitGenCtrl(&InternalCtrl); + else InternalCtrl = *Ctrl; 
+ CNN_SetGenCtrl(&InternalCtrl, "PARALLELFEATURES", AT_OPT_VAL(0)); + Ker = LSTM_Stack_Seq_fp16_Internal(Name, &InternalCtrl, LSTMKerName, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, FirstSeq, LastSeq, Revert, Dynamic); + return Ker; +} + int LSTM_Stack_fp16( char *Name, CNN_GenControl_T *Ctrl, @@ -791,7 +858,7 @@ int LSTM_Stack_fp16( } -static int GRU_Stack_Seq_fp16( +static int GRU_Stack_Seq_fp16_Internal( char *Name, CNN_GenControl_T *Ctrl, char *GRUKerName, @@ -943,6 +1010,40 @@ static int GRU_Stack_Seq_fp16( return (Kernel!=0); } +static int GRU_Stack_Seq_fp16( + char *Name, + CNN_GenControl_T *Ctrl, + char *GRUKerName, + + int AlwaysReset, + int NCells, + int DimState, + int DimIn, + int UseIn, + int ExposeSequence, + int FirstSeq, + int LastSeq, + int Revert, + int Dynamic) +{ + int Ker = 0; + + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_OFF); + Ker = GRU_Stack_Seq_fp16_Internal(Name, Ctrl, GRUKerName, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, FirstSeq, LastSeq, Revert, Dynamic); + if (Ker) return 1; + AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON); + + printf("\n\n=============================== Solution not found for %s: Trying PARALLELFEATURES=0 ===============================\n\n", Name); + /* If solution not found try with ParallelFeature = 0 */ + CNN_GenControl_T InternalCtrl; + if (!Ctrl) CNN_InitGenCtrl(&InternalCtrl); + else InternalCtrl = *Ctrl; + CNN_SetGenCtrl(&InternalCtrl, "PARALLELFEATURES", AT_OPT_VAL(0)); + Ker = GRU_Stack_Seq_fp16_Internal(Name, &InternalCtrl, GRUKerName, AlwaysReset, NCells, DimState, DimIn, UseIn, ExposeSequence, FirstSeq, LastSeq, Revert, Dynamic); + return Ker; +} + + int GRU_Stack_fp16( char *Name, CNN_GenControl_T *Ctrl, diff --git a/tools/autotiler_v3/CNN_Libraries/SSD_BasicKernels.c b/tools/autotiler_v3/CNN_Libraries/SSD_BasicKernels.c index b1a395965..09e5e814f 100644 --- a/tools/autotiler_v3/CNN_Libraries/SSD_BasicKernels.c +++ 
b/tools/autotiler_v3/CNN_Libraries/SSD_BasicKernels.c @@ -1,7 +1,3 @@ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wextra" -#pragma GCC diagnostic ignored "-Wpointer-sign" -#pragma GCC diagnostic ignored "-Wsign-compare" /* * Copyright (C) 2020 GreenWaves Technologies * All rights reserved. @@ -10,6 +6,9 @@ * of the BSD license. See the LICENSE file for details. * */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-compare" + #include #include "Gap.h" #include "CNN_BasicKernels.h" diff --git a/tools/autotiler_v3/CNN_Libraries_NE16/CNN_BasicKernels_NE16.c b/tools/autotiler_v3/CNN_Libraries_NE16/CNN_BasicKernels_NE16.c index 1c302417a..e0bd7445e 100644 --- a/tools/autotiler_v3/CNN_Libraries_NE16/CNN_BasicKernels_NE16.c +++ b/tools/autotiler_v3/CNN_Libraries_NE16/CNN_BasicKernels_NE16.c @@ -1275,7 +1275,7 @@ void KerConvDW3x3Stride2_NE16(KerConv_NE16_T *Arg) SetNE16_ScaleNPointer (ScaleN); SetNE16_Strides (Tile_InFeat, Tile_InFeat * Tile_InW, 0, // In_D0, In_D1, In_D2 - unused Out_Stride0, OutBytes * Tile_OutFeat / 2, OutBytes * Tile_OutFeat * Tile_OutW / 2, // Out_D0, Out_D1, Out_D2 div 2 to take into account strideness - 2*3*3, 2*3*3*Arg->Qw*Nb_KI, 0); // Weights_D0, Weights_D1, Weights_D2 + 2*3*3, 0, 0); // Weights_D0, Weights_D1, Weights_D2 SetNE16_Dim (Nb_KI, Nb_KO, Nb_WO, Nb_HO); // Assume first subtile no need for right/bottom pad SetNE16_ConfigPad ((v4s) {PadL, IsLastSubtileW?PadR:0, PadT, IsLastSubtileH?PadB:0}, Arg->Pad_Val); @@ -1343,7 +1343,7 @@ void KerConvDW3x3Stride2_NE16(KerConv_NE16_T *Arg) SetNE16_ScaleNPointer (ScaleN); SetNE16_Strides (Tile_InFeat, Tile_InFeat * Tile_InW, 0, // In_D0, In_D1, In_D2 - unused Out_Stride0, OutBytes * Tile_OutFeat / 2, OutBytes * Tile_OutFeat * Tile_OutW / 2, // Out_D0, Out_D1, Out_D2 div 2 to take into account strideness - 2*3*3, 2*3*3*Arg->Qw*Nb_KI, 0); // Weights_D0, Weights_D1, Weights_D2 + 2*3*3, 0, 0); // Weights_D0, Weights_D1, Weights_D2 SetNE16_Dim (Nb_KI, Nb_KO, Nb_WO, 
Nb_HO); // Moving to next spatial subtile means consider less padding (2 because of the stride) PadL = Max(0, TilePadL-2*subtile_j_major); diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_AT_Misc.c b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_AT_Misc.c index 187fd3ab3..dd82478c9 100644 --- a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_AT_Misc.c +++ b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_AT_Misc.c @@ -25,6 +25,7 @@ #include "CNN_AT_Misc.h" + #ifdef __pulp__ #define Abs(a) __builtin_pulp_abs((a)) #define Min(a, b) __builtin_pulp_minsi((a), (b)) diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_SQ8.c b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_SQ8.c index ef6918d19..233b637bc 100644 --- a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_SQ8.c +++ b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_SQ8.c @@ -14,6 +14,9 @@ * limitations under the License. */ +#include "Gap.h" +#include "CNN_BasicKernels_SQ8.h" + #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wextra" #pragma GCC diagnostic ignored "-Wpointer-sign" @@ -21,9 +24,6 @@ #pragma GCC diagnostic ignored "-Wswitch" #pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "Gap.h" -#include "CNN_BasicKernels_SQ8.h" - static int CoreCountDynamic = 1; static int ActiveCore = gap_ncore(); @@ -135,7 +135,7 @@ int TanhTable(int x, unsigned short * table){ #endif } -#define KER_ACT(Activation, in_d_type, out_d_type, p_type, n_bits, is_unsigned) \ +#define KER_ACT(Activation, in_d_type, out_d_type, p_type, in_n_bits, out_n_bits, is_unsigned) \ do { \ unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); \ decl(in_d_type * __restrict__, In) = decl((in_d_type *__restrict__), Arg->In); \ @@ -147,12 +147,33 @@ do { \ \ for (unsigned int i=First; iW*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); \ + int * 
__restrict__ InOut = (int *__restrict__) Arg->In; \ + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \ + unsigned int Size = Max(0, Last-First); \ + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \ + int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \ +\ + for (unsigned int i=0; iFeat; \ unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); \ @@ -169,14 +190,14 @@ do { \ for (unsigned int c=First; cFeat; \ unsigned S = Arg->W*Arg->H; \ @@ -194,14 +215,14 @@ do { \ for (unsigned int c=0; cFeat; \ unsigned int Size = Arg->W*Arg->H; \ @@ -218,8 +239,8 @@ do { \ for (unsigned int c=First; cFeat); \ } while(0); -#define KER_REDUCT_IO_ACT_CHW(Activation, d_type, p_type, n_bits, is_unsigned) \ +#define KER_REDUCT_IO_ACT_CHW(Activation, d_type, p_type, in_n_bits, out_n_bits, is_unsigned) \ do { \ unsigned int Feat = Arg->Feat; \ unsigned int S = Arg->W*Arg->H; \ @@ -246,15 +267,15 @@ do { \ d_type *Out = (d_type *) (InOut+S*c+First); \ for (unsigned int i=0; iFeat; \ unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat); \ @@ -271,14 +292,14 @@ do { \ for (unsigned int c=First; cFeat; \ unsigned S = Arg->W*Arg->H; \ @@ -296,14 +317,14 @@ do { \ for (unsigned int c=0; cFeat; \ unsigned S = Arg->W*Arg->H; \ @@ -320,14 +341,14 @@ do { \ for (unsigned int i=First; iFeat; \ unsigned S = Arg->W*Arg->H; \ @@ -344,8 +365,8 @@ do { \ for (unsigned int i=First; iInfos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = 
*((unsigned char *) &Infos[AT_INF_C0]); @@ -347,7 +348,7 @@ static inline void __attribute__((always_inline)) KerParLinearLayerFullFeatB8_SQ } if (InDim&0x4) Acc = gap_sumdotp4(VectIn[InDim/4-1], W[InDim/4-1], Acc); for (int j=4*(InDim/4); jInfos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -426,7 +427,7 @@ static inline void __attribute__((always_inline)) KerParLinearLayerFullFeatB16_S } if (InDim&0x4) Acc = gap_sumdotp4(VectIn[InDim/4-1], W[InDim/4-1], Acc); for (int j=4*(InDim/4); jScale; unsigned char *ScaleN = Arg->ScaleN; signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -504,12 +505,63 @@ static inline void __attribute__((always_inline)) KerParLinearLayerFullFeatB32_S } if (InDim&0x4) Acc = gap_sumdotp4(VectIn[InDim/4-1], W[InDim/4-1], Acc); for (int j=4*(InDim/4); j +#include "CNN_BasicKernels_SQ8.h" + #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wextra" #pragma GCC diagnostic ignored "-Wpointer-sign" #pragma GCC diagnostic ignored "-Wsign-compare" #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include -#include "CNN_BasicKernels_SQ8.h" static int CoreCountDynamic = 1; static int ActiveCore = gap_ncore(); @@ -707,10 +708,10 @@ static inline void __attribute__((always_inline)) KerParMatMulB8_SQ8_act( S3 += V0 * BufferColIn2[i+3*H_In2]; } unsigned int Sc = Scale[Line], ScN 
= ScaleN[Line]; - S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S1 = AT_SCALE(S1, Sc, ScN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S2 = AT_SCALE(S2, Sc, ScN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S3 = AT_SCALE(S3, Sc, ScN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S1 = AT_SCALE(S1, Sc, ScN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S2 = AT_SCALE(S2, Sc, ScN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S3 = AT_SCALE(S3, Sc, ScN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); v4s R = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); *((v4s *) (Out+(Line+OffLine)*W_Out+4*Col+0+OffCol)) = R; } @@ -739,8 +740,8 @@ static inline void __attribute__((always_inline)) KerParMatMulB8_SQ8_act( S1 += V0 * BufferColIn2[i+1*H_In2]; } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; - S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S1 = AT_SCALE(S1, Sc, ScN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S1 = AT_SCALE(S1, Sc, ScN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(S0, 7); Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(S1, 7); } @@ -765,7 +766,7 @@ static inline void __attribute__((always_inline)) KerParMatMulB8_SQ8_act( S0 += V0 * BufferColIn2[i]; } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; - S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 
0); Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(S0, 7); } gap_waitbarrier(0); @@ -872,7 +873,7 @@ static inline void __attribute__((always_inline)) KerParMatMulSxSyB8_SQ8_act( if (W_In1&0x4) S = gap_sumdotp4(VIn1[W_In1/4-1], VBuff[W_In1/4-1], S); for (i=(W_In1/4)*4; i +#include "CNN_BasicKernels_SQ8.h" + #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wsign-compare" #pragma GCC diagnostic ignored "-Wswitch" - -#include -#include "CNN_BasicKernels_SQ8.h" +#pragma GCC diagnostic ignored "-Wpointer-sign" static int CoreCountDynamic = 1; static int ActiveCore = gap_ncore(); @@ -80,7 +81,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_SQ8_act( signed char * __restrict__ ColBuff = Arg->ColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -159,7 +160,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_SQ8_act( S0 = gap_sumdotp4(V1, C1, S0); } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; - S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); Out[Line*Wo*Ho + l*Wo + c] = gap_clip(S0, 7); } gap_waitbarrier(0); @@ -235,7 +236,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1x1_HWC_SQ8_act( unsigned char * __restrict__ ScaleN = Arg->ScaleN; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = 
*((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -281,23 +282,23 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1x1_HWC_SQ8_act( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S00, 7); pOut0++; *pOut1 = gap_clip(S01, 7); pOut1++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S11 = AT_SCALE(S11, Sc, ScN); ACT_SWITCH(S11, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S11 = AT_SCALE(S11, Sc, ScN); ACT_SWITCH(S11, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S10, 7); pOut0++; *pOut1 = gap_clip(S11, 7); pOut1++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S21 = AT_SCALE(S21, Sc, ScN); ACT_SWITCH(S21, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S21 = AT_SCALE(S21, Sc, ScN); ACT_SWITCH(S21, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S20, 7); pOut0++; *pOut1 = gap_clip(S21, 7); pOut1++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S31 = AT_SCALE(S31, Sc, ScN); ACT_SWITCH(S31, Activation, 
ActScale, ActScaleN, A0, B0, C0, 8, 0); + S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S31 = AT_SCALE(S31, Sc, ScN); ACT_SWITCH(S31, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S30, 7); pOut0++; *pOut1 = gap_clip(S31, 7); pOut1++; } @@ -316,8 +317,8 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1x1_HWC_SQ8_act( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S00, 7); pOut0++; *pOut1 = gap_clip(S01, 7); pOut1++; } @@ -358,16 +359,16 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1x1_HWC_SQ8_act( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S00, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S10, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S20, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, 
Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S30, 7); pOut0++; } for (int i=4*(IterOut/4); iScaleN; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -507,23 +508,23 @@ static inline void __attribute__((always_inline)) Ker_MM_Conv1x1_HWC_SQ8_act( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S00, 7); pOut0++; *pOut1 = gap_clip(S01, 7); pOut1++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S11 = AT_SCALE(S11, Sc, ScN); ACT_SWITCH(S11, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S11 = AT_SCALE(S11, Sc, ScN); ACT_SWITCH(S11, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S10, 7); pOut0++; *pOut1 = gap_clip(S11, 7); pOut1++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S21 = AT_SCALE(S21, Sc, ScN); ACT_SWITCH(S21, Activation, ActScale, ActScaleN, A0, B0, C0, 
8, 0); + S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S21 = AT_SCALE(S21, Sc, ScN); ACT_SWITCH(S21, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S20, 7); pOut0++; *pOut1 = gap_clip(S21, 7); pOut1++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S31 = AT_SCALE(S31, Sc, ScN); ACT_SWITCH(S31, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S31 = AT_SCALE(S31, Sc, ScN); ACT_SWITCH(S31, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S30, 7); pOut0++; *pOut1 = gap_clip(S31, 7); pOut1++; } @@ -542,8 +543,8 @@ static inline void __attribute__((always_inline)) Ker_MM_Conv1x1_HWC_SQ8_act( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S00, 7); pOut0++; *pOut1 = gap_clip(S01, 7); pOut1++; } @@ -584,16 +585,16 @@ static inline void __attribute__((always_inline)) Ker_MM_Conv1x1_HWC_SQ8_act( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S00, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S10 = AT_SCALE(S10, Sc, ScN); 
ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S10, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S20, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S30, 7); pOut0++; } for (int i=4*(OutFeat/4); iColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -784,14 +785,14 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_HWC_SQ8_act( S3 += V0*C3; S7 += V1*C3; pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++; } - S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S4 = AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S6 = AT_SCALE(S6, 
pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S4 = AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7)); *((v4s *) (pOut0+4*Line)) = R1; @@ -811,8 +812,8 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_HWC_SQ8_act( S0 += V0*C0; S4 += V1*C0; pIn++; pIn1++; pC++; } - S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *(pOut0+i) = gap_clip(S0, 7); *(pOut1+i) = gap_clip(S4, 
7); } @@ -868,10 +869,10 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_HWC_SQ8_act( S3 += V0*C3; pIn++; pC0++; pC1++; pC2++; pC3++; } - S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); *((v4s *) (pOut0+4*Line)) = R1; } @@ -888,7 +889,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_HWC_SQ8_act( S0 += V0*C0; pIn++; pC++; } - S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *(pOut0+i) = gap_clip(S0, 7); } gap_waitbarrier(0); @@ -963,7 +964,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_DxDy_SQ8_act( signed char * __restrict__ ColBuff = Arg->ColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char 
*)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -1010,7 +1011,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_DxDy_SQ8_act( S0 = gap_sumdotp4(V1, C1, S0); } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; - S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); Out[Line*Wo*Ho + l*Wo + c] = gap_clip(S0, 7); } gap_waitbarrier(0); @@ -1089,7 +1090,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_DxDy_HWC_SQ8_ int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -1210,14 +1211,14 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_DxDy_HWC_SQ8_ S3 += V0*C3; S7 += V1*C3; pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++; } - S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S4 = AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, 
C0, 8, 0); - S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S4 = AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7)); *((v4s *) (pOut0+4*Line)) = R1; @@ -1237,8 +1238,8 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_DxDy_HWC_SQ8_ S0 += V0*C0; S4 += V1*C0; pIn++; pIn1++; pC++; } - S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *(pOut0+i) = 
gap_clip(S0, 7); *(pOut1+i) = gap_clip(S4, 7); } @@ -1305,10 +1306,10 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_DxDy_HWC_SQ8_ S3 += V0*C3; pIn++; pC0++; pC1++; pC2++; pC3++; } - S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); *((v4s *) (pOut0+4*Line)) = R1; } @@ -1325,7 +1326,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_DxDy_HWC_SQ8_ S0 += V0*C0; pIn++; pC++; } - S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *(pOut0+i) = gap_clip(S0, 7); } gap_waitbarrier(0); @@ -1397,7 +1398,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_SQ8_act( signed char * __restrict__ ColBuff = Arg->ColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char 
*)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -1504,7 +1505,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_SQ8_act( } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; // printf("Out[F:%d, H:%d, W:%d] = (%d * %d) >> %d = %d\n", Line, l, c, S0, Sc, ScN, gap_clip(AT_SCALE(S0, Sc, ScN), 7)); - S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); Out[Line*Wo*Ho + l*Wo + c] = gap_clip(S0, 7); } gap_waitbarrier(0); @@ -1580,7 +1581,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_HWC_SQ8_act( unsigned char * __restrict__ ScaleN = Arg->ScaleN; signed char * __restrict__ ColBuff = Arg->ColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -1703,14 +1704,14 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_HWC_SQ8_act( S3 += V0*C3; S7 += V1*C3; pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++; } - S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, 
B0, C0, 8, 0); - S4 = AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S4 = AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7)); *((v4s *) (pOut0+4*Line)) = R1; @@ -1730,8 +1731,8 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_HWC_SQ8_act( S0 += V0*C0; S4 += V1*C0; pIn++; pIn1++; pC++; } - S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 
8, 0); + S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *(pOut0+i) = gap_clip(S0, 7); *(pOut1+i) = gap_clip(S4, 7); } @@ -1804,10 +1805,10 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_HWC_SQ8_act( S3 += V0*C3; pIn++; pC0++; pC1++; pC2++; pC3++; } - S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); *((v4s *) (pOut0+4*Line)) = R1; } @@ -1824,7 +1825,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_HWC_SQ8_act( S0 += V0*C0; pIn++; pC++; } - S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *(pOut0+i) = gap_clip(S0, 7); } gap_waitbarrier(0); @@ -1902,7 +1903,7 @@ static inline void __attribute__((always_inline)) Ker_MM_Conv2D_HWC_SQ8_act( 
signed char * __restrict__ ColBuff = Arg->ColBuff; signed char * __restrict__ ColBuff1; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -2021,23 +2022,23 @@ static inline void __attribute__((always_inline)) Ker_MM_Conv2D_HWC_SQ8_act( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S00, 7); pOut0++; *pOut1 = gap_clip(S01, 7); pOut1++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S11 = AT_SCALE(S11, Sc, ScN); ACT_SWITCH(S11, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S11 = AT_SCALE(S11, Sc, ScN); ACT_SWITCH(S11, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S10, 7); pOut0++; *pOut1 = gap_clip(S11, 7); pOut1++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S21 = AT_SCALE(S21, Sc, ScN); ACT_SWITCH(S21, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S21 = AT_SCALE(S21, Sc, 
ScN); ACT_SWITCH(S21, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S20, 7); pOut0++; *pOut1 = gap_clip(S21, 7); pOut1++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S31 = AT_SCALE(S31, Sc, ScN); ACT_SWITCH(S31, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S31 = AT_SCALE(S31, Sc, ScN); ACT_SWITCH(S31, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S30, 7); pOut0++; *pOut1 = gap_clip(S31, 7); pOut1++; } @@ -2056,8 +2057,8 @@ static inline void __attribute__((always_inline)) Ker_MM_Conv2D_HWC_SQ8_act( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S00, 7); pOut0++; *pOut1 = gap_clip(S01, 7); pOut1++; } @@ -2138,16 +2139,16 @@ static inline void __attribute__((always_inline)) Ker_MM_Conv2D_HWC_SQ8_act( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S00, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S10, 7); pOut0++; Sc = *pSc; ScN = *pScN; 
pSc++; pScN++; - S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S20, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S30, 7); pOut0++; } for (int i=4*(IterOut/4); iScaleN; signed char * __restrict__ ColBuff = Arg->ColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -2292,7 +2293,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_DxDy_SQ8_act( S0 = gap_sumdotp4(V1, C1, S0); } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; - S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); Out[Line*Wo*Ho + l*Wo + c] = gap_clip(S0, 7); } gap_waitbarrier(0); @@ -2370,7 +2371,7 @@ static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_DxDy_HWC_SQ8_ unsigned char * __restrict__ ScaleN = Arg->ScaleN; signed char * __restrict__ ColBuff = Arg->ColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; - unsigned char * Infos = Arg->Infos; + unsigned char * Infos = (unsigned char *) Arg->Infos; unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) 
&Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); @@ -2490,14 +2491,14 @@ This part is more efficient but NOT WORKING ???? TOCHECK S3 += V0*C3; S7 += V1*C3; pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++; } - S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S4 = AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S4 = AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + 
S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7)); *((v4s *) (pOut0+4*Line)) = R1; @@ -2517,8 +2518,8 @@ This part is more efficient but NOT WORKING ???? TOCHECK S0 += V0*C0; S4 += V1*C0; pIn++; pIn1++; pC++; } - S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); - S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); + S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *(pOut0+i) = gap_clip(S0, 7); *(pOut1+i) = gap_clip(S4, 7); } @@ -2570,16 +2571,16 @@ This part is more efficient but NOT WORKING ???? 
TOCHECK } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S0, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S1 = AT_SCALE(S1, Sc, ScN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S1 = AT_SCALE(S1, Sc, ScN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S1, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S2 = AT_SCALE(S2, Sc, ScN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S2 = AT_SCALE(S2, Sc, ScN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S2, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - S3 = AT_SCALE(S3, Sc, ScN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S3 = AT_SCALE(S3, Sc, ScN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 0, 0); *pOut0 = gap_clip(S3, 7); pOut0++; } for (int i=4*(IterOut/4); i +#include "Gap.h" +#include "CNN_BasicKernels_SQ8.h" + #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wextra" #pragma GCC diagnostic ignored "-Wpointer-sign" @@ -21,10 +25,6 @@ #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #pragma GCC diagnostic ignored "-Wswitch" -#include -#include "Gap.h" -#include "CNN_BasicKernels_SQ8.h" - static int CoreCountDynamic = 1; static int ActiveCore = gap_ncore(); @@ -3209,11 +3209,28 @@ void KerPoolNxMStrideSxSy_ReLUMN_SQ8(KerPool_SQ8_T *Arg) } +/* HWC Version */ +#define KER_POOL_ACT(Activation, p_type, n_bits, is_unsigned) \ +do { \ + int Size = Wo*Ho*Feat; \ + int CoreId = gap_coreid(), ChunkCell = ChunkSize(Size), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Size); \ + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \ + unsigned int ActScale = ((unsigned 
char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \ + int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \ +\ + for (int i=First; iIn; @@ -3262,11 +3279,55 @@ void KerParMaxPoolNxMStrideSxSy_HWC_SQ8(Ker_MM_Pool_SQ8_T *Arg) PosL += Sy; } gap_waitbarrier(0); - // KerParPoolActivation(Out, Wo, Ho, First, Last, Infos, Arg->Activation); - // gap_waitbarrier(0); + if (Activation != ACT_NONE) { + KER_POOL_ACT(Activation, unsigned char, 8, 0); + } +} + +void KerParMaxPoolNxMStrideSxSy_HWC_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_NONE); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLU_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_RELU); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUN_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_RELUN); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUM_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_RELUM); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUMN_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_RELUMN); } -void KerParMaxPoolNxMStrideSxSy_HWC_USQ8(Ker_MM_Pool_USQ8_T *Arg) +void KerParMaxPoolNxMStrideSxSy_HWC_HSigmoid_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_HSIGMOID); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_HSwish_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_HSWISH); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_LeakyReLU_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_LEAKYRELU); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_Sigmoid_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_SIGMOID); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_Tanh_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ8_act(Arg, 
ACT_TANH); +} + +static inline void __attribute__((always_inline)) KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act( + Ker_MM_Pool_USQ8_T *Arg, + CNN_ActivationOper_T Activation +) { unsigned char *__restrict__ In = Arg->In; @@ -3282,7 +3343,7 @@ void KerParMaxPoolNxMStrideSxSy_HWC_USQ8(Ker_MM_Pool_USQ8_T *Arg) unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(Feat), First = CoreId*ChunkCell, Last = Min(Feat, First+ChunkCell); int PosL = Arg->FirstTile?(-PadT):0; - int Iter = Last-First; + int Iter = Max(0, Last-First); for (int l=0; lActivation); - // gap_waitbarrier(0); + if (Activation != ACT_NONE) { + KER_POOL_ACT(Activation, unsigned char, 8, 1); + } +} + +void KerParMaxPoolNxMStrideSxSy_HWC_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_NONE); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLU_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_RELU); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUN_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_RELUN); } -void KerParAvgPoolNxMStrideSxSy_HWC_SQ8(Ker_MM_Pool_SQ8_T *Arg) +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUM_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_RELUM); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUMN_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_RELUMN); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_HSigmoid_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_HSIGMOID); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_HSwish_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_HSWISH); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_LeakyReLU_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_LEAKYRELU); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_Sigmoid_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_SIGMOID); +} + +void 
KerParMaxPoolNxMStrideSxSy_HWC_Tanh_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_TANH); +} + +static inline void __attribute__((always_inline)) KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act( + Ker_MM_Pool_SQ8_T *Arg, + CNN_ActivationOper_T Activation +) { signed char *__restrict__ In = Arg->In; @@ -3388,11 +3493,55 @@ void KerParAvgPoolNxMStrideSxSy_HWC_SQ8(Ker_MM_Pool_SQ8_T *Arg) PosL += Sy; } gap_waitbarrier(0); - // KerParPoolActivation(Out, Wo, Ho, First, Last, Infos, Arg->Activation); - // gap_waitbarrier(0); + if (Activation != ACT_NONE) { + KER_POOL_ACT(Activation, unsigned char, 8, 0); + } +} + +void KerParAvgPoolNxMStrideSxSy_HWC_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_NONE); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLU_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_RELU); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUN_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_RELUN); } -void KerParAvgPoolNxMStrideSxSy_HWC_USQ8(Ker_MM_Pool_USQ8_T *Arg) +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUM_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_RELUM); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUMN_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_RELUMN); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_HSigmoid_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_HSIGMOID); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_HSwish_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_HSWISH); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_LeakyReLU_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_LEAKYRELU); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_Sigmoid_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_SIGMOID); +} + +void 
KerParAvgPoolNxMStrideSxSy_HWC_Tanh_SQ8(Ker_MM_Pool_SQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ8_act(Arg, ACT_TANH); +} + +static inline void __attribute__((always_inline)) KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act( + Ker_MM_Pool_USQ8_T *Arg, + CNN_ActivationOper_T Activation +) { unsigned char *__restrict__ In = Arg->In; @@ -3461,14 +3610,56 @@ void KerParAvgPoolNxMStrideSxSy_HWC_USQ8(Ker_MM_Pool_USQ8_T *Arg) PosL += Sy; } gap_waitbarrier(0); - // KerParPoolActivation(Out, Wo, Ho, First, Last, Infos, Arg->Activation); - // gap_waitbarrier(0); + if (Activation != ACT_NONE) { + KER_POOL_ACT(Activation, unsigned char, 8, 1); + } } +void KerParAvgPoolNxMStrideSxSy_HWC_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_NONE); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLU_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_RELU); +} +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUN_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_RELUN); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUM_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_RELUM); +} -void KerParMaxPoolNxMStrideSxSy_HWC_SQ16(Ker_MM_Pool_SQ16_T *Arg) +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUMN_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_RELUMN); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_HSigmoid_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_HSIGMOID); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_HSwish_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_HSWISH); +} +void KerParAvgPoolNxMStrideSxSy_HWC_LeakyReLU_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_LEAKYRELU); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_Sigmoid_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_SIGMOID); +} + +void 
KerParAvgPoolNxMStrideSxSy_HWC_Tanh_USQ8(Ker_MM_Pool_USQ8_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ8_act(Arg, ACT_TANH); +} + + +static inline void __attribute__((always_inline)) KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act( + Ker_MM_Pool_SQ16_T *Arg, + CNN_ActivationOper_T Activation +) { short int *__restrict__ In = Arg->In; int W = Arg->W, H = Arg->H; @@ -3509,21 +3700,64 @@ void KerParMaxPoolNxMStrideSxSy_HWC_SQ16(Ker_MM_Pool_SQ16_T *Arg) PosL += Sy; } gap_waitbarrier(0); - // KerParPoolActivation(Out, Wo, Ho, First, Last, Infos, Arg->Activation); - // gap_waitbarrier(0); + if (Activation != ACT_NONE) { + KER_POOL_ACT(Activation, unsigned char, 8, 1); + } +} + +void KerParMaxPoolNxMStrideSxSy_HWC_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_NONE); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLU_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_RELU); } +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUN_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_RELUN); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUM_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_RELUM); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUMN_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_RELUMN); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_HSigmoid_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_HSIGMOID); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_HSwish_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_HSWISH); +} -void KerParMaxPoolNxMStrideSxSy_HWC_USQ16(Ker_MM_Pool_SQ16_T *Arg) +void KerParMaxPoolNxMStrideSxSy_HWC_LeakyReLU_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_LEAKYRELU); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_Sigmoid_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_SIGMOID); +} 
+void KerParMaxPoolNxMStrideSxSy_HWC_Tanh_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_TANH); +} + + +static inline void __attribute__((always_inline)) KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act( + Ker_MM_Pool_USQ16_T *Arg, + CNN_ActivationOper_T Activation +) { - short int *__restrict__ In = Arg->In; + unsigned short int *__restrict__ In = Arg->In; int W = Arg->W, H = Arg->H; int Fx = Arg->Fx, Sx = Arg->Sx; int Fy = Arg->Fy, Sy = Arg->Sy; int PadL = Arg->Pad[0], PadT = Arg->Pad[2]; int Feat = Arg->Feat; - short int * __restrict__ Out = Arg->Out; + unsigned short int * __restrict__ Out = Arg->Out; int Wo = Arg->Wo, Ho = Arg->Ho; v2u M_Init = (v2u) {-32767,-32767}; @@ -3556,13 +3790,56 @@ void KerParMaxPoolNxMStrideSxSy_HWC_USQ16(Ker_MM_Pool_SQ16_T *Arg) PosL += Sy; } gap_waitbarrier(0); - // KerParPoolActivation(Out, Wo, Ho, First, Last, Infos, Arg->Activation); - // gap_waitbarrier(0); + if (Activation != ACT_NONE) { + KER_POOL_ACT(Activation, unsigned char, 8, 1); + } } +void KerParMaxPoolNxMStrideSxSy_HWC_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_NONE); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLU_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_RELU); +} -void KerParAvgPoolNxMStrideSxSy_HWC_SQ16(Ker_MM_Pool_SQ16_T *Arg) +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUN_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_RELUN); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUM_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_RELUM); +} +void KerParMaxPoolNxMStrideSxSy_HWC_ReLUMN_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_RELUMN); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_HSigmoid_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_HSIGMOID); +} + +void 
KerParMaxPoolNxMStrideSxSy_HWC_HSwish_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_HSWISH); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_LeakyReLU_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_LEAKYRELU); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_Sigmoid_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_SIGMOID); +} + +void KerParMaxPoolNxMStrideSxSy_HWC_Tanh_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParMaxPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_TANH); +} + + +static inline void __attribute__((always_inline)) KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act( + Ker_MM_Pool_SQ16_T *Arg, + CNN_ActivationOper_T Activation +) { signed short *__restrict__ In = Arg->In; int W = Arg->W, H = Arg->H; @@ -3611,12 +3888,56 @@ void KerParAvgPoolNxMStrideSxSy_HWC_SQ16(Ker_MM_Pool_SQ16_T *Arg) PosL += Sy; } gap_waitbarrier(0); - // KerParPoolActivation(Out, Wo, Ho, First, Last, Infos, Arg->Activation); - // gap_waitbarrier(0); + if (Activation != ACT_NONE) { + KER_POOL_ACT(Activation, unsigned char, 8, 1); + } +} + +void KerParAvgPoolNxMStrideSxSy_HWC_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_NONE); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLU_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_RELU); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUN_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_RELUN); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUM_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_RELUM); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUMN_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_RELUMN); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_HSigmoid_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_HSIGMOID); +} + +void 
KerParAvgPoolNxMStrideSxSy_HWC_HSwish_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_HSWISH); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_LeakyReLU_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_LEAKYRELU); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_Sigmoid_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_SIGMOID); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_Tanh_SQ16(Ker_MM_Pool_SQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_SQ16_act(Arg, ACT_TANH); } -void KerParAvgPoolNxMStrideSxSy_HWC_USQ16(Ker_MM_Pool_SQ16_T *Arg) +static inline void __attribute__((always_inline)) KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act( + Ker_MM_Pool_USQ16_T *Arg, + CNN_ActivationOper_T Activation +) { unsigned short *__restrict__ In = Arg->In; int W = Arg->W, H = Arg->H; @@ -3665,6 +3986,49 @@ void KerParAvgPoolNxMStrideSxSy_HWC_USQ16(Ker_MM_Pool_SQ16_T *Arg) PosL += Sy; } gap_waitbarrier(0); - // KerParPoolActivation(Out, Wo, Ho, First, Last, Infos, Arg->Activation); - // gap_waitbarrier(0); -} \ No newline at end of file + if (Activation != ACT_NONE) { + KER_POOL_ACT(Activation, unsigned char, 8, 1); + } +} + +void KerParAvgPoolNxMStrideSxSy_HWC_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_NONE); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLU_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_RELU); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUN_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_RELUN); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUM_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_RELUM); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_ReLUMN_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_RELUMN); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_HSigmoid_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + 
KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_HSIGMOID); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_HSwish_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_HSWISH); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_LeakyReLU_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_LEAKYRELU); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_Sigmoid_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_SIGMOID); +} + +void KerParAvgPoolNxMStrideSxSy_HWC_Tanh_USQ16(Ker_MM_Pool_USQ16_T *Arg) { + KerParAvgPoolNxMStrideSxSy_HWC_USQ16_act(Arg, ACT_TANH); +} + +#pragma GCC diagnostic pop diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_SoftMax_SQ8.c b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_SoftMax_SQ8.c index 889d7cfd4..51ae1a98b 100644 --- a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_SoftMax_SQ8.c +++ b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_SoftMax_SQ8.c @@ -14,13 +14,14 @@ * limitations under the License. */ +#include +#include +#include "CNN_BasicKernels_SQ8.h" + #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wextra" #pragma GCC diagnostic ignored "-Wpointer-sign" #pragma GCC diagnostic ignored "-Wsign-compare" -#include -#include -#include "CNN_BasicKernels_SQ8.h" static int CoreCountDynamic = 1; static int ActiveCore = gap_ncore(); diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/RNN_SQ8.c b/tools/autotiler_v3/CNN_Libraries_SQ8/RNN_SQ8.c index 72a17aa83..5c134b67b 100644 --- a/tools/autotiler_v3/CNN_Libraries_SQ8/RNN_SQ8.c +++ b/tools/autotiler_v3/CNN_Libraries_SQ8/RNN_SQ8.c @@ -14,13 +14,15 @@ * limitations under the License. 
*/ + +#include +#include +#include "CNN_BasicKernels_SQ8.h" + #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wextra" #pragma GCC diagnostic ignored "-Wpointer-sign" #pragma GCC diagnostic ignored "-Wsign-compare" -#include -#include -#include "CNN_BasicKernels_SQ8.h" static int CoreCountDynamic = 1; static int ActiveCore = gap_ncore(); diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Bias_Linear_Activation_fp16.c b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Bias_Linear_Activation_fp16.c index 7992ebbfe..30a622ee2 100644 --- a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Bias_Linear_Activation_fp16.c +++ b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Bias_Linear_Activation_fp16.c @@ -14,6 +14,12 @@ * limitations under the License. */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wextra" +#pragma GCC diagnostic ignored "-Wpointer-sign" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + #include "Gap.h" #include "CNN_BasicKernels_fp16.h" #include "CNN_Defines_fp16.h" @@ -679,3 +685,4 @@ void KerParLinearLayerLeakyReLU_fp16(KerLinear_fp16_T *Arg) gap_waitbarrier(0); } +#pragma GCC diagnostic pop diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Conv_BasicKernels_fp16.c b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Conv_BasicKernels_fp16.c index 09a8faf29..86d55c0ea 100644 --- a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Conv_BasicKernels_fp16.c +++ b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Conv_BasicKernels_fp16.c @@ -14,6 +14,12 @@ * limitations under the License. 
*/ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wextra" +#pragma GCC diagnostic ignored "-Wpointer-sign" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + #include "Gap.h" #include "CNN_BasicKernels_fp16.h" @@ -4024,3 +4030,5 @@ void KerConvNxMDxDyStrideSxSy_fp16(KerConv_fp16_T *Arg) } gap_waitbarrier(0); } + +#pragma GCC diagnostic pop diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Conv_DW_BasicKernels_fp16.c b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Conv_DW_BasicKernels_fp16.c index 681de86d8..5d7d4a3b4 100644 --- a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Conv_DW_BasicKernels_fp16.c +++ b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Conv_DW_BasicKernels_fp16.c @@ -14,6 +14,12 @@ * limitations under the License. */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wextra" +#pragma GCC diagnostic ignored "-Wpointer-sign" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + #include "Gap.h" #include "CNN_BasicKernels_fp16.h" @@ -4151,3 +4157,5 @@ void KerConvDWNxMDxDyStrideSxSy_fp16(KerConv_fp16_T *Arg) gap_waitbarrier(0); } + +#pragma GCC diagnostic pop \ No newline at end of file diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatAlgebra_fp16.c b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatAlgebra_fp16.c index ece4f10f6..401728fab 100644 --- a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatAlgebra_fp16.c +++ b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatAlgebra_fp16.c @@ -14,6 +14,12 @@ * limitations under the License. 
*/ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wextra" +#pragma GCC diagnostic ignored "-Wpointer-sign" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + #include #include #include "CNN_BasicKernels_fp16.h" @@ -7038,3 +7044,5 @@ void KerParMatMulSmallFeatLeakyrelu_fp16(KerMatMul_fp16_T *Arg) } gap_waitbarrier(0); } + +#pragma GCC diagnostic pop diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatMul_Conv_fp16.c b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatMul_Conv_fp16.c index 5af1deb3a..29730226e 100644 --- a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatMul_Conv_fp16.c +++ b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_MatMul_Conv_fp16.c @@ -1,3 +1,10 @@ + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wextra" +#pragma GCC diagnostic ignored "-Wpointer-sign" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + #include #include "CNN_BasicKernels_fp16.h" @@ -1387,3 +1394,5 @@ void KerPar_MM_Conv2D_DxDy_ReLU_fp16( } gap_waitbarrier(0); } + +#pragma GCC diagnostic pop diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Pooling_BasicKernels_fp16.c b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Pooling_BasicKernels_fp16.c index c04650d15..0884c78e3 100644 --- a/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Pooling_BasicKernels_fp16.c +++ b/tools/autotiler_v3/CNN_Libraries_fp16/CNN_Pooling_BasicKernels_fp16.c @@ -14,6 +14,12 @@ * limitations under the License. 
*/ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wextra" +#pragma GCC diagnostic ignored "-Wpointer-sign" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + #include #include "Gap.h" #include "CNN_BasicKernels_fp16.h" @@ -1524,3 +1530,5 @@ void KerParAvgPoolNxMStrideSxSy_HWC_fp16(Ker_MM_Pool_fp16_T *Arg) } gap_waitbarrier(0); } + +#pragma GCC diagnostic pop diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/RNN_fp16.c b/tools/autotiler_v3/CNN_Libraries_fp16/RNN_fp16.c index 08f52e1a8..f72da190f 100644 --- a/tools/autotiler_v3/CNN_Libraries_fp16/RNN_fp16.c +++ b/tools/autotiler_v3/CNN_Libraries_fp16/RNN_fp16.c @@ -14,6 +14,12 @@ * limitations under the License. */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wextra" +#pragma GCC diagnostic ignored "-Wpointer-sign" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + #include #include "CNN_BasicKernels_fp16.h" @@ -485,4 +491,6 @@ void GRU_ParKer_fp16(KerGRU_fp16_T *Arg) } gap_waitbarrier(0); } -#endif \ No newline at end of file +#endif + +#pragma GCC diagnostic pop diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/SSD_BasicKernels_fp16.c b/tools/autotiler_v3/CNN_Libraries_fp16/SSD_BasicKernels_fp16.c index 1703742f3..03fa3d2af 100644 --- a/tools/autotiler_v3/CNN_Libraries_fp16/SSD_BasicKernels_fp16.c +++ b/tools/autotiler_v3/CNN_Libraries_fp16/SSD_BasicKernels_fp16.c @@ -61,12 +61,12 @@ void Ker_SSD_Init_f16(Ker_SSD_Init_Arg_f16_T *KerArg0) } // The actual code that does the tile addition -void Ker_SSD_Decoder_fp16(Ker_SSD_Decoder_Arg_fp16_T *KerArg0 ) +void Ker_SSD_Decoder_f16(Ker_SSD_Decoder_Arg_f16_T *KerArg0 ) { unsigned int CoreId = gap_coreid(); - unsigned int Chunk = ChunkSize(KerArg0->H); + unsigned int Chunk = ChunkSize(KerArg0->N_Anchors); unsigned int First = CoreId*Chunk; - unsigned int Last = (First+Chunk > KerArg0->H) ? 
(KerArg0->H) : (First+Chunk); + unsigned int Last = (First+Chunk > KerArg0->N_Anchors) ? (KerArg0->N_Anchors) : (First+Chunk); bbox_f16_t * bbox = KerArg0->bbox_buf; F16 * scores = KerArg0->classes_in; int num_classes = KerArg0->N_Classes; @@ -152,7 +152,7 @@ static int16_t KerIoverU(F16 a_x, F16 a_y, F16 a_w, F16 a_h, } -static void KerNonMaxSuppress(bbox_t * boundbxs, float iouThres, int nnbb){ +static void KerNonMaxSuppress(bbox_f16_t * boundbxs, float iouThres, int nnbb){ //BBOX value are in Q14 and non_max_threshold in Q14 int idx, idx_int; //Non-max supression @@ -175,7 +175,7 @@ static void KerNonMaxSuppress(bbox_t * boundbxs, float iouThres, int nnbb){ } } -void Ker_SSD_NMS(Ker_SSD_NMS_ArgT *KerArg0 ) +void Ker_SSD_NMS_f16(Ker_SSD_NMS_Arg_f16_T *KerArg0 ) { short int bbox_idx_max = *(KerArg0->bbox_idx); diff --git a/tools/autotiler_v3/CNN_Libraries_fp16/SSD_BasicKernels_fp16.h b/tools/autotiler_v3/CNN_Libraries_fp16/SSD_BasicKernels_fp16.h index 3bb8ca9cd..513581a17 100644 --- a/tools/autotiler_v3/CNN_Libraries_fp16/SSD_BasicKernels_fp16.h +++ b/tools/autotiler_v3/CNN_Libraries_fp16/SSD_BasicKernels_fp16.h @@ -72,9 +72,9 @@ typedef struct { F16 NMSThr; short int n_max_bb; short int *bbox_idx; -} Ker_SSD_NMS_ArgT; +} Ker_SSD_NMS_Arg_f16_T; -void Ker_SSD_NMS(Ker_SSD_NMS_ArgT *Arg); +void Ker_SSD_NMS_f16(Ker_SSD_NMS_Arg_f16_T *Arg); diff --git a/tools/autotiler_v3/DSP_Generators/DSP_Generators.c b/tools/autotiler_v3/DSP_Generators/DSP_Generators.c index 06a9b3658..df0f64605 100644 --- a/tools/autotiler_v3/DSP_Generators/DSP_Generators.c +++ b/tools/autotiler_v3/DSP_Generators/DSP_Generators.c @@ -234,6 +234,21 @@ void LoadMFCCLibrary() ) ); + LibKernelTemplate("MatMul_DSP_T", + CArgs(10, + TCArg("void * __restrict__", "In1"), + TCArg("void * __restrict__", "In2"), + TCArg("void * __restrict__", "Out"), + TCArg("void *", "BufferColIn2"), + TCArg("unsigned int", "W_In1"), + TCArg("unsigned int", "H_In1"), + TCArg("unsigned int", "W_In2"), + TCArg("unsigned int", 
"W_Out"), + TCArg("unsigned int", "OutFirstCol"), + TCArg("int", "ColFirst") + ) + ); + /* FFT Basic Kernels */ LibKernel("Radix2FFT_DIF_Par_Fix16", CALL_PARALLEL, 0, "FFT_Arg_T", NULL); LibKernel("Radix2FFT_DIF_Par_Fix32", CALL_PARALLEL, 0, "FFT_Arg_T", NULL); @@ -332,6 +347,11 @@ void LoadMFCCLibrary() LibKernel("Conjugate_Fix32_Par", CALL_PARALLEL, CArgs(2, TCArg("int * __restrict__", "Data"), TCArg("int", "Ni")), "SwapSamples_Arg_T", NULL); LibKernel("Conjugate_Float16_Par", CALL_PARALLEL, CArgs(2, TCArg("F16V_DSP * __restrict__", "Data"), TCArg("int", "Ni")), "SwapSamples_Arg_T", NULL); LibKernel("Conjugate_Float32_Par", CALL_PARALLEL, CArgs(2, TCArg("float * __restrict__", "Data"), TCArg("int", "Ni")), "SwapSamples_Arg_T", NULL); + + LibKernel("KerParMatMulDSP_fp16", CALL_PARALLEL, 0, "MatMul_DSP_T", NULL); + LibKernel("KerParMatMulDSPT_fp16", CALL_PARALLEL, 0, "MatMul_DSP_T", NULL); + LibKernel("KerParMatMulDSP_fp32", CALL_PARALLEL, 0, "MatMul_DSP_T", NULL); + LibKernel("KerParMatMulDSPT_fp32", CALL_PARALLEL, 0, "MatMul_DSP_T", NULL); } void PieceWiseGenerator(char *Name, CNN_GenControl_T *Ctrl, char *FunName, int Dim, int DataType, int Inplace) @@ -921,6 +941,104 @@ int MFCC_Generator( return (Kernel!=0); } +int IMel_Generator( + char *Name, + CNN_GenControl_T *Ctrl, + int NFrames, + int Nfft, + int NMelBanks, + int SizeMelCoeff, + int DataType + ) +{ + if (__builtin_popcount(Nfft) != 1) GenTilingError("%s, Incorrect FFTDim: %d, it has to be a a power of 2", Name, Nfft); + if (DataType==FIX32 || DataType==FIX16) GenTilingError("Not supported FIX_32"); + + int MFCC_Coeff_Dyn = 15; + char *PreEmpKernel=0, *InverseMelKer=0, *UserKernType=0, *UserKernPointer=0, InItemSize=2, OutItemSize=2, LUTItemSize=2; + + switch (DataType){ + case FIX16: + InverseMelKer = "MelFilterBank_Fix32"; + UserKernType = "short int"; + UserKernPointer = "short int * __restrict__"; + InItemSize=2; OutItemSize=2, LUTItemSize=2; + break; + case FLOAT16: + InverseMelKer = 
"MelFilterBank_f16"; + UserKernType = "F16_DSP"; + UserKernPointer = "F16_DSP * __restrict__"; + InItemSize=F16_SIZE; OutItemSize=F16_SIZE, LUTItemSize=F16_SIZE; + break; + case FLOAT32: + InverseMelKer = "MelFilterBank_f32"; + UserKernType = "float"; + UserKernPointer = "float * __restrict__"; + InItemSize=4; OutItemSize=4, LUTItemSize=4; + break; + default: + GenTilingError("Data Type %d not known", DataType); + return 0; + } + unsigned int LayerOp = 0; + unsigned int LayerBandwidth = 0; + printf("Inverse Mel:\n"); + printf("\tNb Oper: %d\n", LayerOp); + printf("\tBandwidth: %d\n", LayerBandwidth); + + Kernel_T *Kernel = UserKernel(Name, + NFrames<0? + KernelIterSpace(2, IterFixedSpaceDynBound(D0, -NFrames, "NFrames"), IterTiledSpace(T0)): + KernelIterSpace(2, IterFixedSpace(D0, NFrames), IterTiledSpace(T0)), + TILE_HOR, + CArgs(5, + TCArg(UserKernPointer, "In"), + TCArg(UserKernPointer, "Out"), + TCArg("fbank_type_t *","IMel_FilterBank"), + TCArg(UserKernPointer, "IMel_Coeffs"), + (NFrames<0)? 
+ TCArg("short int", "NFrames"):AT_NO_C_ARG + ), + Calls(1, + Call(InverseMelKer, LOC_LOOP, + Bindings(9, + K_Arg("In", KER_ARG_TILE), + K_Arg("Out" , KER_ARG_TILE), + K_Arg("IMel_Coeffs" , KER_ARG_TILE), + K_Arg("IMel_FilterBank", KER_ARG_TILE), + Imm(NMelBanks), + Imm(MFCC_Coeff_Dyn), + AT_IGNORE_ARG_BINDING, + (DataType==FIX16)?K_Arg("shift_buff", KER_ARG_TILE):AT_IGNORE_ARG_BINDING, + AT_IGNORE_ARG_BINDING + ) + ) + ), + KerArgs(4, + KerArg("In", KerArgSpace(1,D0), OBJ_IN_DB, 1, NMelBanks, InItemSize, 0, 0, 0, "In"), + KerArg("Out", KerArgSpace(1,D0), OBJ_OUT_DB, 1, Nfft*2, OutItemSize, 0, 0, 0, "Out"), + KerArg("IMel_FilterBank", KerArgSpace(1,T0), O_IN|O_BUFF|O_CONST, 1, NMelBanks, 6, /* size of filterbank type */ 0, 0, 0, "IMel_FilterBank"), + KerArg("IMel_Coeffs", KerArgSpace(1,T0), O_IN|O_BUFF|O_CONST, 1, SizeMelCoeff, LUTItemSize, 0, 0, 0, "IMel_Coeffs") + ) + ); + if (Kernel) { + AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0); + AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0); + + if (DataType==FIX32 || DataType==FIX16) { + AddKernelArgDim(Name, "In", 3, Abs(NFrames), NMelBanks, InItemSize); + AddKernelArgDim(Name, "Out", 3, Abs(NFrames), 2*Nfft, OutItemSize); + AddKernelArgDim(Name, "IMel_Coeffs", 2, SizeMelCoeff, LUTItemSize); + } else { + AddKernelFloatArgDim(Name, "In", 3, Abs(NFrames), NMelBanks, InItemSize); + AddKernelFloatArgDim(Name, "Out", 3, Abs(NFrames), 2*Nfft, OutItemSize); + AddKernelFloatArgDim(Name, "IMel_Coeffs", 2, SizeMelCoeff, LUTItemSize); + } + AddKernelArgDim(Name, "IMel_FilterBank", 3, NMelBanks, 3, 2); + } + return (Kernel!=0); +} + int RFFT_2D_Generator( char *Name, CNN_GenControl_T *Ctrl, @@ -1538,3 +1656,126 @@ void STFT_Generator( ) ); } + +int DSP_MatMul_Generator( + char *Name, + + CNN_GenControl_T *Ctrl, + + int ColM1, + int LineM1, + int ColM2, + int LineM2, + + int TransposedIn2, + int DataType +) + +{ + int Log = 1; + Tile_Orientation_T TileOrientation = TILE_HOR; + int F = 0; + unsigned long long int 
LayerOp = 0; + unsigned long long int LayerBandwidth = 0; + int LineO = LineM1, ColO = ColM2; + int Nbuff, ItemSize; + + if (ColM1 != LineM2) GenTilingError("DSP_MatMul_Generator: %s, Incorrect input matrices dimensions for a matrix multiplication: [%d x %d]*[%d x %d]", Name, LineM1, ColM1, LineM2, ColM2); + + char *MatMulKerName=0, *UserKernType=0, *UserKernPointer=0; + switch (DataType){ + case FIX16: + GenTilingError("DSP_MatMul_Generator Not yet implemented in FIX16"); + UserKernType = "short int"; UserKernPointer = "short int * __restrict__"; + ItemSize=2; + break; + case FIX32: + GenTilingError("DSP_MatMul_Generator Not yet implemented in FIX16"); + UserKernType = "int"; UserKernPointer = "int * __restrict__"; + ItemSize=2; + break; + case FLOAT16: + MatMulKerName = TransposedIn2?"KerParMatMulDSPT_fp16":"KerParMatMulDSP_fp16"; + UserKernType = "F16_DSP"; UserKernPointer = "F16_DSP * __restrict__"; + ItemSize=F16_SIZE; F = O_FLOAT; + break; + case FLOAT32: + MatMulKerName = TransposedIn2?"KerParMatMulDSPT_fp32":"KerParMatMulDSP_fp32"; + UserKernType = "float"; UserKernPointer = "float * __restrict__"; + ItemSize=4; F = O_FLOAT; + break; + default: + GenTilingError("Data Type %d not known", DataType); + } + + + int ColFirst = ((LineM1*ColM1)<(LineM2*ColM2)); + Nbuff = 4; + LayerOp += ColM1*ColM2*LineM1; + LayerBandwidth += LineM1*(ColM1*ColM2*(2+2)); + LayerBandwidth += LineM1*ColM2*2; + LayerBandwidth += LineM1*2; + + if (Log) { + printf("CNN_MatMulAct_fp16: %s\n", Name); + printf("In1 => W: %4d, H: %4d\n", ColM1, LineM1); + printf("In2 => W: %4d, H: %4d\n", ColM2, LineM2); + printf("Out => W: %4d, H: %4d => %s\n", ColO, LineO, ColFirst?"Column first":"Line First"); + printf("Total Op: %lld\n", LayerOp); + if (MatMulKerName) printf("%20s: %s\n", "MatMulKerName", MatMulKerName); + } + + int ObjCons = (!TransposedIn2)?OBJ_CONSTRAINTS_TILE_VER:0; + if (TransposedIn2) { + LineM2 = ColM2; ColM2 = ColM1; + } + Kernel_T *Kernel = UserKernel(Name, + KernelIterSpace(2, 
IterTiledSpace(T1), IterTiledSpace(T0)), + TILE_HOR, + CArgs(3, + TCArg(UserKernPointer, "In1"), + TCArg(UserKernPointer, "In2"), + TCArg(UserKernPointer, "Out") + ), + Calls(1, + Call(MatMulKerName, LOC_LOOP, + Bindings(10, + K_Arg("In1", KER_ARG_TILE), + K_Arg("In2", KER_ARG_TILE), + K_Arg("Out", KER_ARG_TILE), + (!TransposedIn2)?K_Arg("KerBuff", KER_ARG_TILE):AT_IGNORE_ARG_BINDING, + K_Arg("In1", KER_ARG_TILE_W), + K_Arg("In1", KER_ARG_TILE_H), + TransposedIn2?K_Arg("In2", KER_ARG_TILE_H):K_Arg("In2", KER_ARG_TILE_W), + K_Arg("Out", KER_ARG_TILE_W), + K_Arg(ColFirst?"In1":"In2", KER_ARG_TILE_BASE), + Imm(ColFirst) + ) + ) + ), + ColFirst? + KerArgs(4, + (!TransposedIn2)? + KerArg("KerBuff",KerArgSpace(1,T1), F|O_BUFF|O_NTILED, Nbuff*ColM1, 1, ItemSize, 0, 0, 0, 0):AT_NO_KER_ARG, + KerArg("In1", KerArgSpace(1,T0), F|O_IN|O_DB|O_CONST, ColM1, LineM1, ItemSize, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), + KerArg("In2", KerArgSpace(1,T1), F|O_IN|O_DB, ColM2, LineM2, ItemSize, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, 2, "In2"), + KerArg("Out", KerArgSpace(1,T1), F|O_OUT|O_DB, ColO, LineO, ItemSize, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, 0, "Out") + ): + KerArgs(4, + (!TransposedIn2)? 
+ KerArg("KerBuff",KerArgSpace(1,T0), F|O_BUFF|O_NTILED, Nbuff*ColM1, 1, ItemSize, 0, 0, 0, 0):AT_NO_KER_ARG, + KerArg("In1", KerArgSpace(1,T1), F|O_IN|O_DB|O_CONST, ColM1, LineM1, ItemSize, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), + KerArg("In2", KerArgSpace(1,T0), F|O_IN|O_DB, ColM2, LineM2, ItemSize, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, 2, "In2"), + KerArg("Out", KerArgSpace(1,T1), F|O_OUT|O_DB, ColO, LineO, ItemSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Out") + ) + ); + if (Kernel) { + AddKernelInfos(Name, AT_KERINFO_OPER, LayerOp, 0); + AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, LayerBandwidth, 0); + + AddKernelFloatArgDim(Name, "In1", 3, LineM1, ColM1, ItemSize); + AddKernelFloatArgDim(Name, "In2", 3, LineM2, ColM2, ItemSize); + AddKernelFloatArgDim(Name, "Out", 3, LineO, ColO, ItemSize); + } + return (Kernel!=0); +} diff --git a/tools/autotiler_v3/DSP_Generators/DSP_Generators.h b/tools/autotiler_v3/DSP_Generators/DSP_Generators.h index e6ff452ac..74938f019 100644 --- a/tools/autotiler_v3/DSP_Generators/DSP_Generators.h +++ b/tools/autotiler_v3/DSP_Generators/DSP_Generators.h @@ -29,6 +29,30 @@ int MFCC_Generator( int OutFFT /* If output FFT beside mel spect */ ); +int DSP_MatMul_Generator( + char *Name, + + CNN_GenControl_T *Ctrl, + + int ColM1, + int LineM1, + int ColM2, + int LineM2, + + int TransposedIn2, + int DataType +); + +int IMel_Generator( + char *Name, + CNN_GenControl_T *Ctrl, + int NFrames, + int Nfft, + int NMelBanks, + int SizeMelCoeff, + int DataType + ); + int RFFT_2D_Generator( char *Name, CNN_GenControl_T *Ctrl, diff --git a/tools/autotiler_v3/DSP_Libraries/DSP_Lib.h b/tools/autotiler_v3/DSP_Libraries/DSP_Lib.h index c1f642364..292a84998 100644 --- a/tools/autotiler_v3/DSP_Libraries/DSP_Lib.h +++ b/tools/autotiler_v3/DSP_Libraries/DSP_Lib.h @@ -121,9 +121,9 @@ typedef struct { } FFT_InstallArg_T; typedef struct fbank_type_ { - short int Start; - short int Items; - short int Base; + unsigned short int Start; + unsigned short int Items; + unsigned 
short int Base; } fbank_type_t; typedef struct { @@ -231,6 +231,19 @@ typedef struct { int FFT_Dim; } Windowing_T; +typedef struct { + void * __restrict__ In1; + void * __restrict__ In2; + void * __restrict__ Out; + void *BufferColIn2; + unsigned int W_In1; + unsigned int H_In1; + unsigned int W_In2; + unsigned int W_Out; + unsigned int OutFirstCol; + int ColFirst; +} MatMul_DSP_T; + /********************************************************************************************************************************************************************/ /****************** FFT Library ************************************************************************************************************************************/ /********************************************************************************************************************************************************************/ @@ -353,4 +366,9 @@ extern void WindowingReal2Cmplx_PadCenter_f16(Windowing_T *Arg); extern void WindowingReal2Real_f16(Windowing_T *Arg); extern void WindowingReal2Real_PadCenter_f16(Windowing_T *Arg); +extern void KerParMatMulDSP_fp16(MatMul_DSP_T *Arg); +extern void KerParMatMulDSPT_fp16(MatMul_DSP_T *Arg); +extern void KerParMatMulDSP_fp32(MatMul_DSP_T *Arg); +extern void KerParMatMulDSPT_fp32(MatMul_DSP_T *Arg); + #endif //DSP_LIB_H \ No newline at end of file diff --git a/tools/autotiler_v3/DSP_Libraries/FFT_Library.c b/tools/autotiler_v3/DSP_Libraries/FFT_Library.c index 9a6e96b2c..cef2cb0f9 100644 --- a/tools/autotiler_v3/DSP_Libraries/FFT_Library.c +++ b/tools/autotiler_v3/DSP_Libraries/FFT_Library.c @@ -29,8 +29,8 @@ void FFT_InstallTwiddlesAndSwapLUT(FFT_InstallArg_T *Arg, int format) LUTSize = Arg->Nfft*sizeof(short); - AT_L2_COPY(0, (AT_L2_EXT_ADDR_TYPE) Arg->SwapLUT, (AT_L2_INT_ADDR_TYPE) Arg->L1_SwapLUT, LUTSize, 0, &DmaR_Evt2); - AT_L2_COPY(0, (AT_L2_EXT_ADDR_TYPE) Arg->Twiddles, (AT_L2_INT_ADDR_TYPE)Arg->L1_Twiddles, TwidSize, 0, &DmaR_Evt1); + AT_L2_COPY(0, 
(AT_L2_EXT_ADDR_TYPE) Arg->SwapLUT, (AT_L2_INT_ADDR_TYPE) Arg->L1_SwapLUT, LUTSize, 0, &DmaR_Evt1); + AT_L2_COPY(0, (AT_L2_EXT_ADDR_TYPE) Arg->Twiddles, (AT_L2_INT_ADDR_TYPE) Arg->L1_Twiddles, TwidSize, 0, &DmaR_Evt2); AT_L2_WAIT(0, &DmaR_Evt1); AT_L2_WAIT(0, &DmaR_Evt2); @@ -42,8 +42,8 @@ void RFFT_InstallTwiddlesAndSwapLUT(FFT_InstallArg_T *Arg, int format) AT_L2_EVENT DmaR_Evt1, DmaR_Evt2, DmaR_Evt3; int TwidSize, RTwidSize, LUTSize; - if (Arg->Radix == 2) TwidSize = Arg->Nfft * sizeof(short); - else TwidSize = 3 * Arg->Nfft * (sizeof(short)/2); + if (Arg->Radix == 2) TwidSize = (Arg->Nfft>>1) * sizeof(short); + else TwidSize = 3 * (Arg->Nfft>>1) * (sizeof(short)/2); // when floating 32, size is double if (format==1) TwidSize *=2; @@ -52,10 +52,9 @@ void RFFT_InstallTwiddlesAndSwapLUT(FFT_InstallArg_T *Arg, int format) if (format==1) RTwidSize = Arg->Nfft * sizeof(float); else RTwidSize = Arg->Nfft * sizeof(short); - - AT_L2_COPY(0, (AT_L2_EXT_ADDR_TYPE) Arg->SwapLUT, (AT_L2_INT_ADDR_TYPE) Arg->L1_SwapLUT, LUTSize, 0, &DmaR_Evt1); - AT_L2_COPY(0, (AT_L2_EXT_ADDR_TYPE) Arg->Twiddles, (AT_L2_INT_ADDR_TYPE)Arg->L1_Twiddles, TwidSize, 0, &DmaR_Evt2); - AT_L2_COPY(0, (AT_L2_EXT_ADDR_TYPE) Arg->RTwiddles, (AT_L2_INT_ADDR_TYPE)Arg->L1_RTwiddles, RTwidSize, 0, &DmaR_Evt3); + AT_L2_COPY(0, (AT_L2_EXT_ADDR_TYPE) Arg->SwapLUT, (AT_L2_INT_ADDR_TYPE) Arg->L1_SwapLUT, LUTSize, 0, &DmaR_Evt1); + AT_L2_COPY(0, (AT_L2_EXT_ADDR_TYPE) Arg->Twiddles, (AT_L2_INT_ADDR_TYPE) Arg->L1_Twiddles, TwidSize, 0, &DmaR_Evt2); + AT_L2_COPY(0, (AT_L2_EXT_ADDR_TYPE) Arg->RTwiddles, (AT_L2_INT_ADDR_TYPE) Arg->L1_RTwiddles, RTwidSize, 0, &DmaR_Evt3); AT_L2_WAIT(0, &DmaR_Evt1); AT_L2_WAIT(0, &DmaR_Evt2); @@ -2014,9 +2013,10 @@ void IRFFT_DIF_Par_Fix16(RFFT_Arg_T *Arg){ if (CoreId == 0){ xAR = pA[0][0]; xAI = pA[0][1]; + xBR = pA[k+1][0]; - RFFT_Out[0][0] = (xAR + xAI) >> 1; - RFFT_Out[0][1] = (xAR - xAI) >> 1; + RFFT_Out[0][0] = (xAR + xAI + xBR) >> 1; + RFFT_Out[0][1] = (xAR + xAI - xBR) >> 1; } 
Chunk = ChunkSize(k); First = CoreId*Chunk; Last = Min(First+Chunk, k); @@ -2085,9 +2085,10 @@ void IRFFT_DIF_Par_f16(RFFT_Arg_T *Arg){ if (CoreId == 0){ xAR = pA[0][0]; xAI = pA[0][1]; + xBR = pA[k+1][0]; - RFFT_Out[0][0] = 0.5f * ( xAR + xAI ); - RFFT_Out[0][1] = 0.5f * ( xAR - xAI ); + RFFT_Out[0][0] = 0.5f * ( xAR + xAI + xBR); + RFFT_Out[0][1] = 0.5f * ( xAR + xAI - xBR); } Chunk = ChunkSize(k); First = CoreId*Chunk; Last = Min(First+Chunk, k); @@ -2159,9 +2160,10 @@ void IRFFT_DIF_Par_f32(RFFT_Arg_T *Arg){ if (CoreId == 0){ xAR = pA[0]; xAI = pA[1]; + xBR = pA[2*(k+1)]; - RFFT_Out[0] = 0.5f * ( xAR + xAI ); - RFFT_Out[1] = 0.5f * ( xAR - xAI ); + RFFT_Out[0] = 0.5f * ( xAR + xAI + xBR ); + RFFT_Out[1] = 0.5f * ( xAR + xAI - xBR ); } Chunk = ChunkSize(k); First = CoreId*Chunk; Last = Min(First+Chunk, k); diff --git a/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/GenMFCCLUT.py b/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/GenMFCCLUT.py index 947e2d625..5cea41772 100644 --- a/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/GenMFCCLUT.py +++ b/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/GenMFCCLUT.py @@ -26,7 +26,9 @@ def create_parser(): parser.add_argument('--fft_lut_file', required="--params_json" not in sys.argv, help="path to fft lut file") parser.add_argument('--mfcc_bf_lut_file', default=None, - help="path to fft lut file") + help="path to mfcc lut file") + parser.add_argument('--imel_lut_file', default=None, + help="path to inverse mel lut file") parser.add_argument('--sample_rate', default=16000, type=int) parser.add_argument('--name_suffix', default="", type=str) parser.add_argument('--frame_size', required="--params_json" not in sys.argv, type=int, @@ -81,6 +83,7 @@ def main(): fft_lut_file = args.fft_lut_file if not "fft_lut_file" in models_params else models_params["fft_lut_file"] mfcc_bf_lut_file = args.mfcc_bf_lut_file if not "mfcc_bf_lut_file" in models_params else models_params["mfcc_bf_lut_file"] + 
imel_lut_file = args.imel_lut_file if not "imel_lut_file" in models_params else models_params["imel_lut_file"] use_tf_mfcc = args.use_tf_mfcc if not "use_tf_mfcc" in models_params else models_params["use_tf_mfcc"] use_librosa = args.use_librosa if not "use_librosa" in models_params else models_params["use_librosa"] sample_rate = args.sample_rate if not "sample_rate" in models_params else models_params["sample_rate"] @@ -218,10 +221,19 @@ def main(): from SetupLUT import GenMFCC_FB filters = GenMFCC_FB(n_fft, mfcc_bank_cnt, Fmin=fmin, Fmax=fmax, sample_rate=sample_rate, dtype=lut_dtype) - MfccLUT, HeadCoeff = GenMelFilterBanksCode(filters, mfcc_bank_cnt, fmin, fmax, lut_dtype, data_type, name_suffix) + MelLUT, NCoeffMEL = GenMelFilterBanksCode(filters, mfcc_bank_cnt, fmin, fmax, lut_dtype, data_type, name_suffix) with open(mfcc_bf_lut_file, "w") as f: - f.write(MfccLUT) + f.write(MelLUT) + + if imel_lut_file: + # Inverse matrix of filterbank generated with least squares algorithm + # A.T*b = A.T*A*x^ + # x^ = (A.T*A)^-1 * A.T * b + inverse_mel_fb = np.matmul(np.linalg.inv(np.matmul(filters, filters.T)), filters) + ImelLUT = array_to_def_c_file(inverse_mel_fb.flatten(), f"ImelLUT{name_suffix}", data_type, inverse_mel_fb.size, elem_in_rows=inverse_mel_fb.size) + with open(imel_lut_file, "w") as f: + f.write(ImelLUT) if args.save_params_header: with open(args.save_params_header, "w") as f: @@ -230,11 +242,11 @@ def main(): f.write("#define\t{:21}{:>10}\n".format("FRAME_STEP", frame_step)) f.write("#define\t{:21}{:>10}\n".format("N_FFT", n_fft)) f.write("#define\t{:21}{:>10}\n".format("DATA_TYPE", 2 if dtype=="float16" else (3 if dtype=="float32" else (1 if dtype=="fix32_scal" else 0)))) - if mfcc_bf_lut_file: + if mfcc_bf_lut_file or imel_lut_file: f.write("#define\t{:21}{:>10}\n".format("MFCC_BANK_CNT", mfcc_bank_cnt)) f.write("#define\t{:21}{:>10}\n".format("FMIN", fmin)) f.write("#define\t{:21}{:>10}\n".format("FMAX", fmax)) - 
f.write("#define\t{:21}{:>10}\n".format("MFCC_COEFF_CNT", HeadCoeff+1)) + f.write("#define\t{:21}{:>10}\n".format("MFCC_COEFF_CNT", NCoeffMEL+1)) f.write("#define\t{:21}{:>10}\n".format("N_DCT", n_dct)) diff --git a/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/SetupLUT.py b/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/SetupLUT.py index 671c7d190..5460da336 100644 --- a/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/SetupLUT.py +++ b/tools/autotiler_v3/DSP_Libraries/LUT_Tables/gen_scripts/SetupLUT.py @@ -117,7 +117,14 @@ def SetupLiftCoeff(L, N, dtype="int"): def GenMelFilterBanksCode(filters, mfcc_bank_cnt, fmin, fmax, dtype, data_type, name_suffix): HeadCoeff = 0 MFCC_Coeff = [] - for i, filt in enumerate(filters): + if dtype == "int": + quant_filters = FP2FIX(filters, MFCC_COEFF_DYN) + elif dtype == "float16": + quant_filters = filters.astype(np.float16) + else: + quant_filters = filters.astype(np.float32) + + for i, filt in enumerate(quant_filters): if np.all(filt == 0): Start = 0 Stop = 0 @@ -130,22 +137,17 @@ def GenMelFilterBanksCode(filters, mfcc_bank_cnt, fmin, fmax, dtype, data_type, Items = Stop - Start + 1 print("Filter {}: Start: {} Stop: {} Base: {} Items: {}".format(i, Start, Stop, Base, Items)) for j in range(Items): - if dtype == "int": - MFCC_Coeff.append(FP2FIX(filt[Start+j], MFCC_COEFF_DYN)) - elif dtype == "float16": - MFCC_Coeff.append(filt[Start+j].astype(np.float16)) - else: - MFCC_Coeff.append(filt[Start+j]) + MFCC_Coeff.append(filt[Start+j]) HeadCoeff += Items - Out_str = "#define MFCC_COEFF_CNT\t{}\n\n".format(HeadCoeff+1) - Out_str += "/* Filter Bank bands:\n\n" + #Out_str = "#define MFCC_COEFF_CNT\t{}\n\n".format(HeadCoeff+1) + Out_str = "/* Filter Bank bands:\n\n" Out_str += "\tMinimum Frequency: {} Hz\n".format(fmin) Out_str += "\tMaximum Frequency: {} Hz*/\n\n".format(fmax) Out_str += "PI_L2 fbank_type_t MFCC_FilterBank{}[{}] = {{\n".format(name_suffix, mfcc_bank_cnt) HeadCoeff = 0 - for i, filt in 
enumerate(filters): + for i, filt in enumerate(quant_filters): if np.all(filt == 0): Start = 0 Stop = 0 diff --git a/tools/autotiler_v3/DSP_Libraries/MatMulDSP.c b/tools/autotiler_v3/DSP_Libraries/MatMulDSP.c new file mode 100644 index 000000000..94aa3cea5 --- /dev/null +++ b/tools/autotiler_v3/DSP_Libraries/MatMulDSP.c @@ -0,0 +1,514 @@ +#include +#include "FastFloatApprox16.h" +#include "DSP_Lib.h" + +static int CoreCountDynamic = 1; +static int ActiveCore = gap_ncore(); +static inline unsigned int __attribute__((always_inline)) ChunkSize(unsigned int X) + +{ + unsigned int NCore; + unsigned int Log2Core; + unsigned int Chunk; + + if (CoreCountDynamic) NCore = ActiveCore; else NCore = gap_ncore(); + Log2Core = gap_fl1(NCore); + Chunk = (X>>Log2Core) + ((X&(NCore-1))!=0); + return Chunk; +} + +void KerParMatMulDSP_fp16(MatMul_DSP_T *Arg) + +{ + F16_DSP * __restrict__ In1 = (F16_DSP * __restrict__) Arg->In1; + F16_DSP * __restrict__ In2 = (F16_DSP * __restrict__) Arg->In2; + F16_DSP * __restrict__ Out = (F16_DSP * __restrict__) Arg->Out; + F16_DSP *BufferColIn2 = (F16_DSP *) Arg->BufferColIn2; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ + unsigned int W_Out = Arg->W_Out; + unsigned int OutFirstCol = Arg->OutFirstCol; + int ColFirst = Arg->ColFirst; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + F16V_DSP *VBuff1 = (F16V_DSP *) (&BufferColIn2[0]); + F16V_DSP *VBuff2 = (F16V_DSP *) (&BufferColIn2[1*H_In2]); + F16V_DSP *VBuff3 = (F16V_DSP *) (&BufferColIn2[2*H_In2]); + F16V_DSP *VBuff4 = (F16V_DSP *) (&BufferColIn2[3*H_In2]); + + unsigned int CoreId = gap_coreid(); + unsigned int ChunkCell = ChunkSize(H_In1); + unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int Iter = (Last>First)?(Last-First):0; + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); + int OffLine = 0, 
OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + for (Col=0; ColIn1; + F16_DSP * __restrict__ In2 = (F16_DSP * __restrict__) Arg->In2; + F16_DSP * __restrict__ Out = (F16_DSP * __restrict__) Arg->Out; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ + unsigned int W_Out = Arg->W_Out; + unsigned int OutFirstCol = Arg->OutFirstCol; + int ColFirst = Arg->ColFirst; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + + unsigned int CoreId = gap_coreid(); + unsigned int ChunkCell = ChunkSize(H_In1); + unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int Iter = (Last>First)?(Last-First):0; + int OffLine = 0, OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + F16_DSP * pOut = Out + (OffLine+First)*W_Out + OffCol; + for (Line=0; LineIn1; + float * __restrict__ In2 = (float *__restrict__) Arg->In2; + float * __restrict__ Out = (float *__restrict__) Arg->Out; + float *BufferColIn2 = (float *) Arg->BufferColIn2; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ + unsigned int W_Out = Arg->W_Out; + unsigned int OutFirstCol = Arg->OutFirstCol; + int ColFirst = Arg->ColFirst; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + + unsigned int CoreId = gap_coreid(); + unsigned int ChunkCell = ChunkSize(H_In1); + unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int Iter = (Last>First)?(Last-First):0; + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); + int OffLine = 0, OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + for (Col=0; ColIn1; + float * __restrict__ In2 = (float *__restrict__) Arg->In2; + float * __restrict__ Out = (float 
*__restrict__) Arg->Out; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + unsigned int W_In2 = Arg->W_In2; /* H_In2 = W_In1 by construction */ + unsigned int W_Out = Arg->W_Out; + unsigned int OutFirstCol = Arg->OutFirstCol; + int ColFirst = Arg->ColFirst; + + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; + unsigned int Line, Col, i; + + unsigned int CoreId = gap_coreid(); + unsigned int ChunkCell = ChunkSize(H_In1); + unsigned int First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int Iter = (Last>First)?(Last-First):0; + int OffLine = 0, OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + float * pOut = Out + (OffLine+First)*W_Out + OffCol; + for (Line=0; Line size) + length = size; + + fseek(file, ext, SEEK_SET); + if (dir==AT_QSPIFLASH_FS_EXT2LOC) fread(loc, 1, length, file); + else fwrite(loc, 1, length, file); + + loc = ((char *)loc) + length; + ext += stride; + } +} + + +#define AT_OSPIFLASH_FS_CONF_INIT(dev,type,name) + +#define AT_OSPIFLASH_FS_OPEN(file,conf,filename,err) \ + do { *(file) = fopen(filename, "r"); *(err) = *(file) == NULL; } while(0) + +#define AT_OSPIFLASH_FS_OPEN_WRITE(file,conf,filename,err) \ + do { *(file) = fopen(filename, "w"); *(err) = *(file) == NULL; } while(0) + +#define AT_OSPIFLASH_FS_OPEN_SET_SIZE(file, size) + +#define AT_OSPIFLASH_FS_CLOSE(file) \ + fclose(*file) + +#define AT_OSPIFLASH_FS_FC_COPY(file,ext,loc,size,dir,event) \ + __at_ospiflash_fs_copy(*(file), ext, loc, size, dir) + +#define AT_OSPIFLASH_FS_FC_COPY2D(file, dev,ext,loc,size,stride,len,dir,event) \ + __at_ospiflash_fs_copy_2d(*(file), ext, loc, size, stride, len, dir) + +#define AT_OSPIFLASH_FS_FC_WAIT(file,event) + +#define AT_OSPIFLASH_FS_CL_COPY(file,ext,loc,size,dir,event) \ + __at_ospiflash_fs_copy(*(file), ext, loc, size, dir) + +#define AT_OSPIFLASH_FS_CL_COPY2D(file, dev,ext,loc,size,stride,len,dir,event) \ + __at_ospiflash_fs_copy_2d(*(file), ext, loc, size, stride, 
len, dir) + +#define AT_OSPIFLASH_FS_CL_WAIT(file,event) + /* * EMRAMflash diff --git a/tools/autotiler_v3/Emulation/at_api_pmsis.h b/tools/autotiler_v3/Emulation/at_api_pmsis.h index 733992315..584fc99ce 100644 --- a/tools/autotiler_v3/Emulation/at_api_pmsis.h +++ b/tools/autotiler_v3/Emulation/at_api_pmsis.h @@ -18,6 +18,7 @@ #define __AT__AT_API_PMSIS_H__ #include "pmsis.h" +#include #include "bsp/ram/hyperram.h" #include "bsp/ram/spiram.h" #include "bsp/flash/hyperflash.h" @@ -83,6 +84,10 @@ static inline uint32_t gap_cl_readhwtimer() #define AT_QSPIRAM_FREE(dev,ptr,size) pi_ram_free((dev), (ptr), (size)) +#define AT_OSPIRAM_ALLOC(dev,size) ({ uint32_t ptr; int err = pi_ram_alloc((dev), &ptr, (size)); if (!err && ptr == 0) err = pi_ram_alloc((dev), &ptr, (size)); if (err) ptr = 0; ptr; }) + +#define AT_OSPIRAM_FREE(dev,ptr,size) pi_ram_free((dev), (ptr), (size)) + #define AT_L2_ALLOC(dev,size) pmsis_l2_malloc(size) #define AT_L2_FREE(dev,ptr,size) pmsis_l2_malloc_free((ptr), (size)) @@ -328,6 +333,53 @@ typedef char * AT_QSPIRAM_INT_ADDR_TYPE; pi_cl_ram_copy_wait(event) +/* + * OctaSpiram + */ + +#ifdef __GAP9__ +#define AT_OSPIRAM_TYPE 0 + +typedef struct pi_aps25xxxn_conf AT_OSPIRAM_CONF_T; +typedef struct pi_device AT_OSPIRAM_T; +typedef uint32_t AT_OSPIRAM_EXT_ADDR_TYPE; +typedef void * AT_OSPIRAM_LOC_ADDR_TYPE; +typedef pi_task_t AT_OSPIRAM_FC_EVENT; +typedef pi_cl_ram_req_t AT_OSPIRAM_CL_EVENT; +typedef uint32_t AT_OSPIRAM_POINTER; +typedef char * AT_OSPIRAM_INT_ADDR_TYPE; + +#define AT_OSPIRAM_EXT2LOC 0 +#define AT_OSPIRAM_LOC2EXT 1 + +#define AT_OSPIRAM_CONF_INIT(dev,type,name) \ + pi_aps25xxxn_conf_init(dev) + +#define AT_OSPIRAM_OPEN(dev,conf,err) \ + do { pi_open_from_conf((dev), (conf)); *(err) = pi_ram_open(dev); } while(0) + +#define AT_OSPIRAM_CLOSE(dev) \ + pi_ram_close(dev) + +#define AT_OSPIRAM_FC_COPY(dev,ext,loc,size,dir,event) \ + pi_ram_copy_async(dev, (AT_OSPIRAM_EXT_ADDR_TYPE)(ext), (AT_OSPIRAM_LOC_ADDR_TYPE)(loc), (size), !(dir), 
pi_task_block(event)) + +#define AT_OSPIRAM_FC_COPY2D(dev,ext,loc,size,stride,len,dir,event) \ + pi_ram_copy_2d_async(dev, (AT_OSPIRAM_EXT_ADDR_TYPE)(ext), (AT_OSPIRAM_LOC_ADDR_TYPE)(loc), (size), (stride), (len), !(dir), pi_task_block(event)) + +#define AT_OSPIRAM_FC_WAIT(dev,event) \ + pi_task_wait_on(event) + +#define AT_OSPIRAM_CL_COPY(dev,ext,loc,size,dir,event) \ + pi_cl_ram_copy(dev, (AT_OSPIRAM_EXT_ADDR_TYPE)(ext), (AT_OSPIRAM_LOC_ADDR_TYPE)(loc), (size), !(dir), (event)) + +#define AT_OSPIRAM_CL_COPY2D(dev,ext,loc,size,stride,len,dir,event) \ + pi_cl_ram_copy_2d(dev, (AT_OSPIRAM_EXT_ADDR_TYPE)(ext), (AT_OSPIRAM_LOC_ADDR_TYPE)(loc), (size), (stride), (len), !(dir), (event)) + +#define AT_OSPIRAM_CL_WAIT(dev,event) \ + pi_cl_ram_copy_wait(event) +#endif + /* * Spiflash */ @@ -362,6 +414,47 @@ typedef pi_cl_ram_req_t AT_QSPIFLASH_EVENT; #define AT_QSPIFLASH_WAIT(dev,event) +/* + * OctaSpiflash + */ + +#ifdef __GAP9__ +#define AT_OSPIFLASH_TYPE 1 + +#if defined(CONFIG_ATXP032) +typedef struct pi_atxp032_conf AT_OSPIFLASH_CONF_T; +#else +#if defined(CONFIG_MX25U51245G) +typedef struct pi_mx25u51245g_conf AT_OSPIFLASH_CONF_T; +#endif +#endif +typedef struct pi_device AT_OSPIFLASH_T; +typedef uint32_t AT_OSPIFLASH_EXT_ADDR_TYPE; +typedef void * AT_OSPIFLASH_LOC_ADDR_TYPE; +typedef pi_cl_ram_req_t AT_OSPIFLASH_EVENT; + +#define AT_OSPIFLASH_EXT2LOC 0 +#define AT_OSPIFLASH_LOC2EXT 1 + +#define AT_OSPIFLASH_CONF_INIT(dev,type,name) \ + pi_spiflash_conf_init(dev) + +#define AT_OSPIFLASH_OPEN(dev,conf,err) \ + do { pi_open_from_conf((dev), (conf)); *(err) = pi_flash_open(dev); } while(0) + +#define AT_OSPIFLASH_CLOSE(dev) \ + pi_flash_close(dev) + +// TODO not yet supported +#define AT_OSPIFLASH_COPY(dev,ext,loc,size,dir,event) + +// TODO not yet supported +#define AT_OSPIFLASH_COPY2D(dev,ext,loc,size,stride,len,dir,event) + +// TODO not yet supported +#define AT_OSPIFLASH_WAIT(dev,event) +#endif + /* * SPIflash FS @@ -463,6 +556,116 @@ static inline void 
__at_qspiflash_fs_close(AT_QSPIFLASH_FS_T *file) #define AT_QSPIFLASH_FS_CL_WAIT(file,event) \ pi_cl_fs_wait(event) + +/* + * OctoSPIflash FS + */ + +#ifdef __GAP9__ +#define AT_OSPIFLASH_FS_TYPE 1 + +typedef struct pi_fs_conf AT_OSPIFLASH_FS_CONF_T; + +typedef struct +{ + struct pi_device fs; + struct pi_device ospiflash; + pi_fs_file_t *file; +} AT_OSPIFLASH_FS_T; + +typedef unsigned int AT_OSPIFLASH_FS_EXT_ADDR_TYPE; +typedef void *AT_OSPIFLASH_FS_INT_ADDR_TYPE; +typedef pi_task_t AT_OSPIFLASH_FS_FC_EVENT; +typedef pi_cl_fs_req_t AT_OSPIFLASH_FS_CL_EVENT; + +static inline void __at_ospiflash_fs_open(AT_OSPIFLASH_FS_T *file, int is_write, struct pi_fs_conf *conf, const char *filename, int *err) +{ + #if defined(CONFIG_ATXP032) + struct pi_atxp032_conf flash_conf; + pi_atxp032_conf_init(&flash_conf); + #else + #if defined(CONFIG_MX25U51245G) + struct pi_mx25u51245g_conf flash_conf; + pi_mx25u51245g_conf_init(&flash_conf); + #endif + #endif + pi_open_from_conf(&file->ospiflash, &flash_conf); + if (pi_flash_open(&file->ospiflash)) + { + *err = -1; + return; + } + conf->flash = &file->ospiflash; + if (is_write) + conf->type = PI_FS_HOST; + else + conf->type = PI_FS_READ_ONLY; + + pi_open_from_conf(&file->fs, conf); + if (pi_fs_mount(&file->fs)) + { + pi_flash_close(&file->ospiflash); + *err = -1; + return; + } + file->file = pi_fs_open(&file->fs, filename, is_write ? 
PI_FS_FLAGS_WRITE : 0); + if (file->file == NULL) + { + pi_fs_unmount(&file->fs); + pi_flash_close(&file->ospiflash); + *err = -1; + return; + } + *err = 0; + + if (is_write) + file->file->size = 4*1024*1024; +} + +static inline void __at_ospiflash_fs_close(AT_OSPIFLASH_FS_T *file) +{ + pi_fs_close(file->file); + pi_fs_unmount(&file->fs); + pi_flash_close(&file->ospiflash); +} + +#define AT_OSPIFLASH_FS_EXT2LOC 0 +#define AT_OSPIFLASH_FS_LOC2EXT 1 + +#define AT_OSPIFLASH_FS_CONF_INIT(dev,type,name) \ + pi_fs_conf_init(dev) + +#define AT_OSPIFLASH_FS_OPEN(file,conf,filename,err) \ + __at_ospiflash_fs_open(file, 0, conf, filename, err) + +#define AT_OSPIFLASH_FS_OPEN_WRITE(file,conf,filename,err) \ + __at_ospiflash_fs_open(file, 1, conf, filename, err) + +#define AT_OSPIFLASH_FS_OPEN_SET_SIZE(file, size) \ + file->file->size = size + +#define AT_OSPIFLASH_FS_CLOSE(file) \ + __at_ospiflash_fs_close(file) + +#define AT_OSPIFLASH_FS_FC_COPY(fs,ext,loc,size,dir,event) \ + pi_fs_copy_async((fs)->file, ext, loc, size, !(dir), pi_task_block(event)) + +#define AT_OSPIFLASH_FS_FC_COPY2D(file, dev,ext,loc,size,stride,len,dir,event) \ + pi_fs_copy_2d_async(file->file, ext, loc, size, stride, len, !(dir), pi_task_block(event)) + +#define AT_OSPIFLASH_FS_FC_WAIT(file,event) \ + pi_task_wait_on(event) + +#define AT_OSPIFLASH_FS_CL_COPY(fs,ext,loc,size,dir,event) \ + pi_cl_fs_copy((fs)->file, ext, loc, size, !(dir), event) + +#define AT_OSPIFLASH_FS_CL_COPY2D(file, dev,ext,loc,size,stride,len,dir,event) \ + pi_cl_fs_copy_2d(file->file, ext, loc, size, stride, len, !(dir), event) + +#define AT_OSPIFLASH_FS_CL_WAIT(file,event) \ + pi_cl_fs_wait(event) +#endif + #ifdef __GAP9__ /* diff --git a/tools/autotiler_v3/Makefile b/tools/autotiler_v3/Makefile index 0abf4ba65..d099d66b9 100644 --- a/tools/autotiler_v3/Makefile +++ b/tools/autotiler_v3/Makefile @@ -1,4 +1,4 @@ -TILER_VER=4.3.1 +TILER_VER=4.3.2 export TILER_LIB=libtile.${TILER_VER}.a ifdef GAP_SDK_HOME export 
TILER_URL=$(GAP_SDK_HOME)/.tiler_url diff --git a/tools/autotiler_v3/version.cfg b/tools/autotiler_v3/version.cfg index 332f897c0..047a40256 100644 --- a/tools/autotiler_v3/version.cfg +++ b/tools/autotiler_v3/version.cfg @@ -3,7 +3,7 @@ { "version": "autotiler-v3", "magicNum": 718930176, - "git-hash": "de88fbeb3017c0db55f1e86e49cce5a0160ccbe5" + "git-hash": "4be2dc2f29bb4719d481b20c8cd37ae3b68937cf" } ] } \ No newline at end of file diff --git a/tools/jenkins/gap_sdk_version.txt b/tools/jenkins/gap_sdk_version.txt index 59f52fae3..86cc31dbb 100644 --- a/tools/jenkins/gap_sdk_version.txt +++ b/tools/jenkins/gap_sdk_version.txt @@ -1 +1 @@ -9af2d93598d20541f4c18ba45e2124b767be2388 +65d7014bdc0a46fff8f45d826301de74829b89ab diff --git a/tools/nntool/_version.py b/tools/nntool/_version.py index 62227b113..a1297615f 100644 --- a/tools/nntool/_version.py +++ b/tools/nntool/_version.py @@ -13,4 +13,4 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
-__version__ = '3.11' +__version__ = '4.1' diff --git a/tools/nntool/execution/kernels/float/dsp_preprocessing.py b/tools/nntool/execution/kernels/float/dsp_preprocessing.py index 2d094e224..111146cb3 100644 --- a/tools/nntool/execution/kernels/float/dsp_preprocessing.py +++ b/tools/nntool/execution/kernels/float/dsp_preprocessing.py @@ -18,8 +18,6 @@ import numpy as np from graph.types import MFCCPreprocessingParameters, RFFT2DPreprocessingParameters from execution.kernels.kernel_base import KernelBase, params_type, qrec_type -from quantization.multiplicative.mulbias import (apply_multiplicative_bias, - apply_zero_offset_bias) from quantization.new_qrec import QRec from utils.at_norm import at_norm diff --git a/tools/nntool/execution/kernels/float/fast_conv.py b/tools/nntool/execution/kernels/float/fast_conv.py index 0963ae22d..8c20195bd 100644 --- a/tools/nntool/execution/kernels/float/fast_conv.py +++ b/tools/nntool/execution/kernels/float/fast_conv.py @@ -51,7 +51,9 @@ def execute(cls, params, details['max_acc'] = float("-Infinity") details['min_pre_mul_bias'] = float("Infinity") details['max_pre_mul_bias'] = float("-Infinity") - + in_rank = len(in_tensor.shape) + if in_rank != 3: + raise NotImplementedError(f'{params.name} input has input rank of {in_rank} shape {in_tensor.shape} which is not supported by nntool kernels') in_tensor = in_tensor.transpose( in_dims.transpose_to_order(['h', 'w', 'c'])) if params.padding.h + params.padding.w > 0: diff --git a/tools/nntool/execution/kernels/float/tensor_functions.py b/tools/nntool/execution/kernels/float/tensor_functions.py index cae1352af..704afc0a7 100644 --- a/tools/nntool/execution/kernels/float/tensor_functions.py +++ b/tools/nntool/execution/kernels/float/tensor_functions.py @@ -13,20 +13,20 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
-import math from typing import cast as typing_cast -from utils.at_norm import at_norm import numpy as np +from skimage.transform import resize + +from execution.kernels.kernel_base import KernelBase, params_type, qrec_type from graph.types import (ConcatParameters, ConstantInputParameters, CopyParameters, InputParameters, OutputParameters, ReshapeParameters, ReverseParameters, SplitParameters, StridedSliceParameters, TransposeParameters) -from graph.types.others import (ExpandParameters, GatherParameters, NoOPParameters, - QuantizeParameters) -from execution.kernels.kernel_base import KernelBase, params_type, qrec_type +from graph.types.others import (ExpandParameters, GatherParameters, + NoOPParameters, QuantizeParameters) from quantization.new_qrec import AllFloatQRec, QRec -from skimage.transform import resize +from utils.at_norm import at_norm @params_type(InputParameters) diff --git a/tools/nntool/execution/kernels/quant/activations.py b/tools/nntool/execution/kernels/quant/activations.py index b146a8da9..73bec1c78 100644 --- a/tools/nntool/execution/kernels/quant/activations.py +++ b/tools/nntool/execution/kernels/quant/activations.py @@ -196,9 +196,9 @@ def execute(cls, params, return qrec.get_outputs(params, [in_tensor], ktype="symmetric") -@params_type(SigmoidActivationParameters) +@params_type(SigmoidActivationParameters, TanHActivationParameters) @qrec_type('scaled') -class SigmoidScaledSymmetricMult(KernelBase): +class SigmoidTanHScaledSymmetricMult(KernelBase): @classmethod def execute(cls, params, in_tensors, @@ -206,17 +206,19 @@ def execute(cls, params, **kwargs): in_tensor = qrec.prepare_inputs( params, in_tensors, ktype="symmetric")[0] - if in_tensor.dtype == np.int8: + if in_tensor.dtype == np.int8: # Q4 in_tensor = in_tensor.astype(np.int32) << 8 - elif in_tensor.dtype == np.uint8: - in_tensor = in_tensor.astype(np.int32) - qrec.in_qs[0].zero_point + elif in_tensor.dtype == np.uint8: # Q4 sym + in_tensor = in_tensor.astype(np.int32) - (1 << 8) 
in_tensor <<= 8 - elif in_tensor.dtype == np.uint16: - in_tensor = in_tensor.astype(np.int32) - qrec.in_qs[0].zero_point - else: + elif in_tensor.dtype == np.uint16: # Q12 sym + in_tensor = in_tensor.astype(np.int32) - (1 << 16) + else: # Q12 in_tensor = in_tensor.astype(np.int32) - - out_q15 = sigmoid_lut(in_tensor) + if isinstance(params, TanHActivationParameters): + out_q15 = tanh_lut(in_tensor) + else: + out_q15 = sigmoid_lut(in_tensor) scale_mul_biases_q = qrec.cache['scale_mul_biases_q'] outp = scale_mul_biases_q.apply_scales(out_q15) + qrec.cache['zero_point'] output = qrec.out_qs[0].clip(outp) @@ -251,35 +253,35 @@ def execute(cls, params, ktype="symmetric") -@params_type(TanHActivationParameters) -@qrec_type('scaled') -class TanHScaledMult(KernelBase): - @classmethod - def execute(cls, params, - in_tensors, - qrec: QRec, - **kwargs): - in_tensor = qrec.prepare_inputs( - params, in_tensors, ktype="symmetric")[0] - if in_tensor.dtype == np.int8: - in_tensor = in_tensor.astype(np.int32) << 8 - elif in_tensor.dtype == np.uint8: - in_tensor = in_tensor.astype(np.int32) - qrec.cache['zero_point'] - in_tensor <<= 8 - elif in_tensor.dtype == np.uint16: - in_tensor = in_tensor.astype(np.int32) - qrec.cache['zero_point'] - else: - in_tensor = in_tensor.astype(np.int32) - - out_q15 = tanh_lut(in_tensor) - # compute_in_out_scale(qrec, extra_scale=QType.Pow2( - # bits=32, q=7, signed=True).scale/qrec.in_qs[0].scale) - scale_mul_biases_q = qrec.cache['scale_mul_biases_q'] - outp = scale_mul_biases_q.apply_scales(out_q15) + qrec.out_qs[0].zero_point - output = qrec.out_qs[0].clip(outp) - return qrec.get_outputs(params, - [output], - ktype="symmetric") +# @params_type(TanHActivationParameters) +# @qrec_type('scaled') +# class TanHScaledMult(KernelBase): +# @classmethod +# def execute(cls, params, +# in_tensors, +# qrec: QRec, +# **kwargs): +# in_tensor = qrec.prepare_inputs( +# params, in_tensors, ktype="symmetric")[0] +# if in_tensor.dtype == np.int8: # Q4 +# in_tensor = 
in_tensor.astype(np.int32) << 8 +# elif in_tensor.dtype == np.uint8: # Q4 sym +# in_tensor = in_tensor.astype(np.int32) - (1 << 8) +# in_tensor <<= 8 +# elif in_tensor.dtype == np.uint16: # Q12 sym +# in_tensor = in_tensor.astype(np.int32) - (1 << 16) +# else: # Q12 +# in_tensor = in_tensor.astype(np.int32) + +# out_q15 = tanh_lut(in_tensor) +# # compute_in_out_scale(qrec, extra_scale=QType.Pow2( +# # bits=32, q=7, signed=True).scale/qrec.in_qs[0].scale) +# scale_mul_biases_q = qrec.cache['scale_mul_biases_q'] +# outp = scale_mul_biases_q.apply_scales(out_q15) + qrec.cache['zero_point'] +# output = qrec.out_qs[0].clip(outp) +# return qrec.get_outputs(params, +# [output], +# ktype="symmetric") @params_type(TanHActivationParameters) diff --git a/tools/nntool/execution/kernels/quant/fast_conv.py b/tools/nntool/execution/kernels/quant/fast_conv.py index c965f0d81..493c38be1 100644 --- a/tools/nntool/execution/kernels/quant/fast_conv.py +++ b/tools/nntool/execution/kernels/quant/fast_conv.py @@ -18,8 +18,7 @@ import numpy as np from graph.types import Conv2DParameters from execution.kernels.kernel_base import KernelBase, params_type, qrec_type -from quantization.multiplicative.mulbias import (apply_multiplicative_bias, - apply_zero_offset_bias) +from quantization.multiplicative.mulbias import apply_multiplicative_bias from quantization.new_qrec import QRec FORCE_INT64 = False @@ -43,11 +42,10 @@ def execute(cls, params, in_dims, out_dims = params.in_dims[0], params.out_dims[0] prepared_in_tensors = qrec.prepare_inputs(params, in_tensors, ktype="symmetric") - # if zero offset is already applied in biases by constant quantizer this does nothing - prepared_in_tensors = apply_zero_offset_bias(qrec, params, prepared_in_tensors, ktype="symmetric") in_tensor = prepared_in_tensors[0] # expand the weights to apply the zero offset weights = prepared_in_tensors[1].astype(np.int32) - qrec.in_qs[1].zero_point.astype(np.int32) + # if zero offset is already applied in biases by 
constant quantizer this does nothing biases = prepared_in_tensors[2] acc_q = qrec.cache.get('acc_q') or qrec.in_qs[2] @@ -99,7 +97,6 @@ def execute(cls, params, out_h = ((in_h - dillated_filter_h + pad_h)) // params.stride.h + 1 if params.has_bias: - # biases = qrec.prepare_biases(params, params.biases, params.weights, ktype="symmetric") if acc_q != qrec.in_qs[2]: biases = acc_q.expand_from(biases, qrec.in_qs[2]) result = np.broadcast_to(biases.reshape( diff --git a/tools/nntool/execution/kernels/quant/linear.py b/tools/nntool/execution/kernels/quant/linear.py index 318e6e71e..f8a7a4ff4 100644 --- a/tools/nntool/execution/kernels/quant/linear.py +++ b/tools/nntool/execution/kernels/quant/linear.py @@ -18,8 +18,7 @@ import numpy as np from graph.types.linear import FcParameters from execution.kernels.kernel_base import KernelBase, params_type, qrec_type -from quantization.multiplicative.mulbias import (apply_multiplicative_bias, - apply_zero_offset_bias) +from quantization.multiplicative.mulbias import apply_multiplicative_bias from quantization.new_qrec import QRec LOG = logging.getLogger("nntool." 
+ __name__) @@ -38,8 +37,6 @@ def execute(cls, params, in_dims, out_dims = params.in_dims[0], params.out_dims[0] prepared_in_tensors = qrec.prepare_inputs( params, in_tensors, ktype="symmetric") - prepared_in_tensors = apply_zero_offset_bias( - qrec, params, prepared_in_tensors, ktype="symmetric") in_tensor = prepared_in_tensors[0] # expand the weights to apply the zero offset weights = prepared_in_tensors[1].astype(np.int32) - qrec.in_qs[1].zero_point.astype(np.int32) diff --git a/tools/nntool/execution/kernels/quant/matrix_operations.py b/tools/nntool/execution/kernels/quant/matrix_operations.py index ba4c84ed6..162b7e477 100644 --- a/tools/nntool/execution/kernels/quant/matrix_operations.py +++ b/tools/nntool/execution/kernels/quant/matrix_operations.py @@ -164,8 +164,7 @@ def execute(cls, params, in_tensors, qrec: QRec, **kwargs): - in_tensors = [in_tensor.astype(np.int32) for in_tensor in qrec.prepare_inputs( - params, in_tensors, ktype="symmetric")] + in_tensors = qrec.prepare_inputs(params, in_tensors, ktype="symmetric") details = kwargs.get('details') if details is not None: results = {} diff --git a/tools/nntool/execution/kernels/quant/tensor_functions.py b/tools/nntool/execution/kernels/quant/tensor_functions.py index 39a050c9a..9487d3b9e 100644 --- a/tools/nntool/execution/kernels/quant/tensor_functions.py +++ b/tools/nntool/execution/kernels/quant/tensor_functions.py @@ -44,7 +44,9 @@ def execute(cls, params, else: in_tensor = resize(in_tensor, params.dims.shape) # output_tensors = qrec.get_outputs(params, [in_tensor], ktype="symmetric") - return [qrec.out_qs[0].quantize(in_tensor)] + if in_tensor.dtype != qrec.out_qs[0].dtype: + in_tensor = qrec.out_qs[0].quantize(in_tensor) + return [in_tensor] @params_type(OutputParameters) diff --git a/tools/nntool/expressions/symbolic/assignments.py b/tools/nntool/expressions/symbolic/assignments.py index 3cadfaf6f..ba376a030 100644 --- a/tools/nntool/expressions/symbolic/assignments.py +++ 
b/tools/nntool/expressions/symbolic/assignments.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 GreenWaves Technologies, SAS +# Copyright (C) 2022 GreenWaves Technologies, SAS # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -13,265 +13,251 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . +from collections.abc import Sequence as ABCSequence +from copy import deepcopy -from functools import reduce -from typing import Mapping - -import numpy as np +from expressions.symbolic.quantization_base import QuantizationHandlerBase from generation.code_block import CodeBlock - -from .iteration_space import IterationSpace -from .symbol import Constant, Symbol, Variable, copy_props -from .variable_container import VariableContainerAndAssigner - - -@copy_props('var') -class Assignment(VariableContainerAndAssigner, Symbol): - def __init__(self, arg, name="", var=None, **kwargs): - if var is None: - self._var = Variable(name, shape=arg.shape, symbol_binding=arg) - else: - name = var.name - self._var = var - super(Assignment, self).__init__(arg, name=name, **kwargs) +from utils.disjoint_reduction import disjoint_reduction + +from .symbol import Symbol, Variable + + +class Assignments(ABCSequence): + def __init__(self, assignments=None, returns=None, qrecs=None) -> None: + super().__init__() + self._assignments = [] + self._returns = set(returns if returns is not None else []) + self._outputs = None + self._inputs = None + self._inters = None + self._vars = [] + self._qrecs = qrecs + if assignments: + for assignment in assignments: + self.add(*assignment) + self._update() @property - def unbound_variables(self): - return self.contents[0].unbound_variables + def max_shape(self): + return tuple(max(elems) for elems in zip(*Symbol.extend_shapes(*[ass[1].shape for ass in self._assignments]))) @property - def var(self): - return self._var 
- - def find(self, name): - for elem in [self._var, self.contents[0]]: - res = elem.find(name) - if res: - return res - return None - - @property - def var_shapes(self): - shapes = {self.name: self.contents[0].shape} - shapes.update(zip(self.contents[0].unbound_variables, self.contents[0].unbound_shapes)) - return shapes - - def _resolve(self, **kwargs): - """Given a set of substitions for variable in kwargs resolve all variables""" - return self._contents[0].resolve(**kwargs) - - def _resolve_assignment(self, substitute_all=False, **kwargs) -> Mapping[str, Symbol]: - return {self.name: self._contents[0].resolve(**kwargs)} - - def _calculate(self, calculate_ranges=False, **kwargs): - res = self._contents[0].resolve(**kwargs) - if not isinstance(res, Constant): - raise ValueError( - f"unable to calculate {self.name}") - if calculate_ranges: - self.control.add_stat(self, res.value) - return res.value - - def _calculate_assignment(self, **kwargs) -> Mapping[str, np.ndarray]: - return {self.name: self._calculate(**kwargs)} - - def c_block(self, code_block: CodeBlock = None, iteration_space: IterationSpace = None): - if code_block is None: - code_block = CodeBlock() - if iteration_space: - if self.var.name in iteration_space.temporary_variables: - code_block.write( - f"{self.var.c_expr(declare=True, dtype=self.contents[0].dtype)}" - f" = {self.contents[0].c_expr(iteration_space=iteration_space)};") - else: - code_block.write( - f"{self.var.c_expr(dtype=self.contents[0].dtype)}{iteration_space.c_index(self.var.name)}" - f" = {self.contents[0].c_expr(iteration_space=iteration_space)};") - else: - code_block.write(f'{self.var.name} = {self.contents[0].c_expr()};') - return code_block + def unbound_shapes(self): + return tuple(self._vars[name].shape for name in self.unbound_variables) @property - def returned_variables(self): - return [self.name] + def input_names(self): + return self._inputs @property - def shape(self): - return self._contents[0].shape - - def 
_py_expr(self, *args, **kwargs): - return self._contents[0].py_expr(*args, **kwargs) - - def _c_expr(self, *args, **kwargs): - return self._contents[0].c_expr(*args, **kwargs) - - def __repr__(self) -> str: - return f"{{{self.var.name} <- {self.contents[0].__repr__()}}}" - - -@copy_props('preconditions', 'returned_variables') -class Let(VariableContainerAndAssigner, Symbol): - def __init__(self, *args, preconditions=None, returned_variables=None, name="", **kwargs): - args = [Assignment(arg[1], name=arg[0]) if isinstance( - arg, tuple) else arg for arg in args] - super(Let, self).__init__(*args, name=name, **kwargs) - if preconditions is None: - preconditions = [] - else: - preconditions = [Assignment(arg[1], name=arg[0]) if isinstance( - arg, tuple) else arg for arg in preconditions] - self._preconditions = preconditions - self._returned_variables = returned_variables - -# pylint: disable=invalid-name - def In(self, *expressions): - return Let(*expressions, preconditions=[self]) - - def Return(self, *variable_names): - produced = self.produced_variables - if not all(variable in produced for variable in variable_names): - raise ValueError('not all variables are produced') - return Let(*self.contents, preconditions=self.preconditions, name=self.name, returned_variables=variable_names) + def output_names(self): + return self._outputs @property def unbound_variables(self): - resolution = self.resolve_assignment() - _vars = reduce(lambda s, x: s | set( - x.unbound_variables.values()), resolution.values(), set()) - return {var.name: var for var in _vars if var.name not in set(resolution.keys())} + return self._inputs @property - def produced_variables(self): - resolution = self.resolve_assignment() - return set(resolution.keys()) + def intermediate_names(self): + return self._inters @property - def preconditions(self): - return self._preconditions + def variables(self): + return self._vars @property - def returned_variables(self): - return self._returned_variables - - 
@staticmethod - def substitute_variables(assignments): - res = {} - substitutions = {} - for var_name, val in assignments.items(): - if isinstance(val, (Constant, np.ndarray, int, float)): - substitutions[var_name] = val - else: - substitutions[var_name] = Variable(var_name, shape=val.shape) - res[var_name] = val - return res, substitutions - - def find(self, name): - for elem in list(self._preconditions) + list(self.contents): - res = elem.find(name) - if res: - return res - return None - - def _resolve_assignment(self, substitute_all=False, **kwargs) -> Mapping[str, Symbol]: - """Given a set of substitions for variable in kwargs resolve all variables - return a dictionary of variables""" - preconditions = self._resolve_contents( - contents=self._preconditions, substitute_all=substitute_all, **kwargs) - return self._resolve_contents(contents=self.contents, substitute_all=substitute_all, **preconditions) - - def _calculate_assignment(self, **kwargs) -> Mapping[str, np.ndarray]: - preconditions = self._calculate_contents( - contents=self._preconditions, **kwargs) - res = self._calculate_contents(contents=self.contents, **preconditions) - if self.returned_variables: - res = {vname: val for vname, val in res.items( - ) if vname in self.returned_variables} - return res - - @staticmethod - def _resolve_contents(contents=None, substitute_all=False, **kwargs): - if substitute_all: - substitutions = kwargs - res = kwargs - else: - res, substitutions = Let.substitute_variables(kwargs) - for elem in contents: - elem_res = elem.resolve_assignment( - substitute_all=substitute_all, **substitutions) - if substitute_all: - substitutions.update(elem_res) - res.update(elem_res) - else: - elem_res, elem_substitutions = Let.substitute_variables( - elem_res) - res.update(elem_res) - substitutions.update(elem_substitutions) - return res - - @staticmethod - def _calculate_contents(contents=None, **kwargs): - for elem in contents: - kwargs.update(elem.calculate_assignment(**kwargs)) - 
return kwargs - - def _resolve(self, **kwargs): - """Given a set of substitions for variable in kwargs resolve all variables - return a single symbol""" - preconditions = self._resolve_contents( - contents=self._preconditions, substitute_all=True, **kwargs) - resolution = self._resolve_contents( - contents=self.contents, substitute_all=True, **preconditions) - return Assignment(resolution[self.contents[-1].name], name=self.contents[-1].name) - - def _calculate(self, calculate_ranges=False, **kwargs): - res = self._resolve(**kwargs) - if not isinstance(res.contents[0], Constant): - raise ValueError( - f"unable to calculate {self.name}") - if calculate_ranges: - self.control.add_stat(self, res.value) - return res.contents[0].value + def axes(self): + var_shapes = Symbol.extend_shapes(*self.unbound_shapes, max_length=len(self.max_shape)) + axes = disjoint_reduction(set(frozenset(idx for idx, dim in enumerate( + shape) if dim != 1) for shape in var_shapes)) + return tuple(sorted([tuple(x) for x in axes])) @property def var_shapes(self): - shapes = {} - for var_name, elem in self.resolve_assignment().items(): - shapes[var_name] = elem.shape - shapes.update(dict(zip(elem.unbound_variables, elem.unbound_shapes))) - return shapes + return {var.name: var.shape for var in self._vars.values()} @property - def shape(self): - return self._contents[-1].shape + def ops(self): + # TODO: Implement + return 1 - def _py_expr(self, *args, **kwargs): - return self._contents[0].py_expr(*args, **kwargs) + @property + def qrecs(self): + return self._qrecs - def c_block(self, code_block: CodeBlock = None, iteration_space: IterationSpace = None, with_loops=False): + @property + def c_header_set(self): + return set().union(*[assignment[1].c_header_set + for assignment in self._assignments]) + + def variable(self, name): + return self._vars[name] + + def _add_int(self, var, func): + for uname, uvar in func.unbound_variables.items(): + if uname in self._vars: + uvar.shape = 
self._vars[uname].shape + uvar.qrec = self._vars[uname].qrec + if isinstance(var, str): + if var in self._vars: + var = self._vars[var] + else: + var = Variable(var, shape=func.shape, dtype=func.dtype) + self._assignments.append((var, func)) + + def add(self, var, func): + self._add_int(var, func) + self._update() + + def _update(self): + self._vars = {} + free_var_names = set() + for var, func in self._assignments: + self._vars[var.name] = var + for name, uvar in func.unbound_variables.items(): + self._vars[name] = uvar + free_var_names.add(name) + + # these are all the produced variables + prod_var_names = set( + [assignment[0].name for assignment in self._assignments]) + # sort all the variable names to keep a determined order + # the outputs are things produced that are not consumed + self._outputs = sorted( + list((prod_var_names - free_var_names) | self._returns)) + # the inputs are variables that are not produced + self._inputs = sorted(list(free_var_names - prod_var_names)) + # the intermediates are the produced variables that are not in the outputs + self._inters = sorted(list(prod_var_names - set(self._outputs))) + + def c_block(self, code_block: CodeBlock = None, iteration_space: 'IterationSpace' = None, + with_loops=False, with_comment=True, with_fixed=False, tags=None): if code_block is None: code_block = CodeBlock() + # create loops from iteration space if with_loops: assert iteration_space, "must have space" - for idx, _ in enumerate(iteration_space.axis_shape): - if idx in iteration_space.fixed_spaces: + if with_comment: + # write some comments describing the iteration space + code_block.comment( + f"Max shape: {iteration_space.shape} var shapes:") + writer = code_block.start_long_comment() + for shape_comment in [f'{name}: {shape}' + for name, shape in iteration_space.var_shapes.items()]: + writer.write(shape_comment) + writer.end() + code_block.comment( + f'Iteration reduced to spaces {iteration_space.spaces}') + code_block.comment( + f'Fixed 
spaces {iteration_space.fixed_spaces}') + code_block.comment( + f'Parameteric spaces {iteration_space.parametric_spaces}') + code_block.comment( + f'Paralelized space {iteration_space.paralellized_space}') + code_block.comment( + f'Interior spaces {iteration_space.interior_spaces}') + # write the loops + for space in iteration_space.spaces: + if not with_fixed and space in iteration_space.fixed_spaces: continue - code_block.write(f"{iteration_space.c_for(idx)} {{") + code_block.write(f"{iteration_space.c_for(space, with_fixed=with_fixed)} {{") code_block.indent() - for precondition in self.preconditions: - precondition.c_block(code_block=code_block, - iteration_space=iteration_space) - for item in self.contents: - item.c_block(code_block=code_block, - iteration_space=iteration_space) + # write each assignment + for var, func in self._assignments: + this_tags = {} if tags is None else tags.copy() + + # write comment with quantization if present + if with_comment: + uvars = [f'{uvar.name}: {uvar.qrec}' + for uvar in func.unbound_variables.values() + if uvar.qrec] + if uvars: + writer = code_block.start_long_comment() + writer.write('inputs') + for uvar in uvars: + writer.write(uvar) + writer.end() + code_block.comment(f'{var.name} = {repr(func)}') + # if iteration space is present pick up if this is a temporary or an output + # assignment from that + if iteration_space: + if var.name in iteration_space.temporary_names: + this_tags[func] = (var, True) + else: + this_tags[func] = (var, False) + else: + this_tags[func] = (var, var.name in self.intermediate_names) + + # The iteration space will be passed down the symbol structure + func.tag = True + func.c_block(code_block=code_block, + tags=this_tags, + iteration_space=iteration_space, + with_comment=with_comment) + func.tag = False + if with_loops: - for idx, _ in enumerate(iteration_space.axis_shape): - if idx in iteration_space.fixed_spaces: + for space in iteration_space.spaces: + if not with_fixed and space in 
iteration_space.fixed_spaces: continue code_block.deindent() code_block.write("}") return code_block - def _c_expr(self, *args, **kwargs): - return self._contents[0].c_expr(*args, **kwargs) + def quantize(self, quantizer: QuantizationHandlerBase, symbol_control, quantize_inputs=False, qtypes=None): + funcs = [] + out_qrecs = {} + in_qrecs = {} + for var, func in self._assignments: + qfunc, qrec = quantizer.quantize( + func, + symbol_control, + quantize_inputs=quantize_inputs, + prequantized_variables=out_qrecs, + qtypes=qtypes) + qfunc = qfunc.resolve() + in_qrecs.update(qfunc.variable_quantization) + if var.name in self._outputs: + qfunc, qrec = quantizer.quantize_output( + func, + qfunc, + var, + symbol_control, + qrec, + quantize_inputs=quantize_inputs, + prequantized_variables=out_qrecs, + qtypes=qtypes) + qfunc = qfunc.resolve() + var = deepcopy(var) + var.qrec = qrec + funcs.append((var, qfunc(substitute=True))) + out_qrecs[var.name] = qrec + in_qrecs.update(out_qrecs) + return Assignments(funcs, returns=self._returns, qrecs=in_qrecs) + + def __getitem__(self, idx): + return self._assignments[idx] + + def __len__(self) -> int: + return len(self._assignments) + + def __iter__(self): + return iter(self._assignments) + + def __call__(self, quantize_inputs=False, dequantize_outputs=False, **subs): + subs = dict(subs) + if quantize_inputs: + subs = {name: self.qrecs[name].quantize_and_clip(val) if name in self.qrecs else val + for name, val in subs.items()} + for var, func in self._assignments: + subs[var.name] = func( + dequantize_outputs=dequantize_outputs, **subs) + res = dict(filter(lambda elem: elem[0] in self._outputs, subs.items())) + if dequantize_outputs: + if self.qrecs is None: + raise ValueError('assignments are not quantized') + res = {name: self.qrecs[name].dequantize( + val) for name, val in res.items()} + return res - def __repr__(self) -> str: - return (f"Let({','.join([elem.__repr__() for elem in self.preconditions])})" - 
f".In({','.join([elem.__repr__() for elem in self.contents])})") diff --git a/tools/nntool/expressions/symbolic/basic.py b/tools/nntool/expressions/symbolic/basic.py index cdb9f9c2e..862dbb065 100644 --- a/tools/nntool/expressions/symbolic/basic.py +++ b/tools/nntool/expressions/symbolic/basic.py @@ -14,6 +14,7 @@ # along with this program. If not, see . import logging +import math import numpy as np from bfloat16 import bfloat16 @@ -21,8 +22,8 @@ from scipy.special import expit from .function import Function -from .symbol import (Constant, Rational, c_headers, copy_props, environment, - handles, handlesr, nargs) +from .symbol import (Constant, QRecBase, Rational, Symbol, Variable, c_headers, + copy_props, environment, handles, handlesr, nargs) LOG = logging.getLogger('nntool.'+__name__) @@ -33,7 +34,8 @@ class Add(Function): def _impl(self, *args, **kwargs): - return np.add(args[0], args[1], dtype=self.dtype) + res = np.add(args[0], args[1], dtype=self.dtype) + return res def _py_expr(self, *args, **kwargs): return "np.add(%s, %s)" % (args[0], args[1]) @@ -460,6 +462,8 @@ def __init__(self, *args, **kwargs): self._inner_function = self._eval(*args, **kwargs) # self._inner_function.name = self.name self._inner_function.qrec = self.qrec + self._inner_function.tag = self.tag + self._inner_function.comment = self.comment def _collect_globals(self) -> dict: global_dict = self.ENVIRONMENT or {} @@ -478,6 +482,9 @@ def _resolve(self, **kwargs): func = self._inner_function.resolve(**kwargs) # func.name = self.name func.qrec = self.qrec + if isinstance(func, Function): + func.tag = self.tag + func.comment = self.comment return func def _eval(self, *args, **kwargs): @@ -500,6 +507,14 @@ def _py_expr(self, *args, **kwargs): def _c_expr(self, *args, **kwargs): return self._inner_function.c_expr(*args, **kwargs) + def c_block(self, code_block=None, tags=None, **kwargs): + if tags is not None and self._inner_function not in tags: + name = tags.get(self, 
f'{self.SYMBOL_PREFEX}{self.name}') + if isinstance(name, str): + name = (Variable(name, dtype=self.dtype), True) + tags[self._inner_function] = name + return self._inner_function.c_block(code_block=code_block, tags=tags, **kwargs) + @nargs(1) class HTanh(CompoundFunction): @@ -544,32 +559,122 @@ def _eval(self, *args, **kwargs): return args[0] +@nargs(3) +class ClipFloat(CompoundFunction): + + def _eval(self, *args, **kwargs): + return Min(Max(args[0], args[1], dtype=self.dtype), args[2], dtype=self.dtype) + + @nargs(1) @copy_props('_from_qrec', '_to_qrec') -class ConvertFloatScaled(CompoundFunction): - def __init__(self, *args, from_qrec=None, to_qrec=None, **kwargs): +class ConvertQuantization(CompoundFunction): + def __init__(self, *args, from_qrec: QRecBase=None, to_qrec: QRecBase=None, **kwargs): self._from_qrec = from_qrec self._to_qrec = to_qrec super().__init__(*args, **kwargs) @property - def from_qrec(self): + def from_qrec(self) -> QRecBase: return self._from_qrec @property - def to_qrec(self): - return self._to_qrec + def from_is_float(self) -> bool: + return self._from_qrec.dtype in [np.float16, np.float32, bfloat16] - def _eval_float_to_quant(self, *args, **kwargs): - raise NotImplementedError() + @property + def from_is_fix(self) -> bool: + return self._from_qrec.dtype in [np.int8, np.uint8, np.int16, np.uint16, np.int32] - def _eval_quant_to_float(self, *args, **kwargs): - raise NotImplementedError() + @property + def to_is_float(self) -> bool: + return self._to_qrec.dtype in [np.float16, np.float32, bfloat16] - def _eval(self, *args, **kwargs): - if self._from_qrec.dtype == np.int16 or self._from_qrec.dtype == bfloat16: - return self._eval_float_to_quant(*args, **kwargs) - return self._eval_quant_to_float(*args, **kwargs) + @property + def to_is_fix(self) -> bool: + return self._to_qrec.dtype in [np.int8, np.uint8, np.int16, np.uint16, np.int32] + + @property + def to_qrec(self) -> QRecBase: + return self._to_qrec + + def _eval_float_to_fix(self, 
*args, **kwargs) -> Symbol: + to_qrec = self.to_qrec + from_qrec = self.from_qrec + scaled_val = Mul( + args[0], + Constant( + [math.pow(2, to_qrec.q)/to_qrec.scale], + dtype=from_qrec.dtype), + dtype=from_qrec.dtype) + if to_qrec.zero_point != 0: + # need to add zero_point plus rounding + scaled_val = Add( + scaled_val, + Constant([to_qrec.zero_point + 0.5], dtype=from_qrec.dtype), + dtype=from_qrec.dtype) + else: + # Just add rounding + scaled_val = Add( + scaled_val, + Constant([0.5], dtype=from_qrec.dtype), + dtype=from_qrec.dtype) + iinfo = np.iinfo(to_qrec.dtype) + return Cast( + ClipFloat( + scaled_val, + Constant(iinfo.min, dtype=from_qrec.dtype), + Constant(iinfo.max, dtype=from_qrec.dtype), + dtype=from_qrec.dtype), + dtype=to_qrec.dtype, + tag=self.tag, + comment=self.comment) + + def _eval_fix_to_float(self, *args, **kwargs) -> Symbol: + to_qrec = self.to_qrec + from_qrec = self.from_qrec + float_val = Cast(args[0], dtype=to_qrec.dtype) + if from_qrec.zero_point != 0: + float_val = Sub( + float_val, + Constant([from_qrec.zero_point], dtype=to_qrec.dtype), + dtype=to_qrec.dtype) + float_val = Mul( + float_val, + Constant( + [from_qrec.scale/math.pow(2, from_qrec.q)], + dtype=to_qrec.dtype), + dtype=to_qrec.dtype, + tag=self.tag, + comment=self.comment) + return float_val + + def _eval(self, *args, **kwargs) -> Symbol: + if self.from_is_float: + if self.to_is_fix: + return self._eval_float_to_fix(*args, **kwargs) + elif self.to_is_float: + if self.to_qrec.dtype != self.from_qrec.dtype: + return Cast( + *args, + dtype=self.to_qrec.dtype, + **kwargs) + return args[0] + elif self.from_is_fix: + if self.to_is_float: + return self._eval_fix_to_float(*args, **kwargs) + elif self.to_is_fix: + # if self.to_qrec.dtype == self.from_qrec.dtype: + # return args[0] + # sign_change = from_qrec.signed != to_qrec.signed + # growing = from_qrec.size < to_qrec.size + # reducing = from_qrec.size > to_qrec.size + # zeropoint_change = from_qrec.zero_point != 
to_qrec.zero_point + # scale_change = from_qrec.scale != to_qrec.scale + # q_change = from_qrec.q != to_qrec.q + raise NotImplementedError() + + raise ValueError('unsupported conversion') @nargs(2) diff --git a/tools/nntool/expressions/symbolic/common/__init__.py b/tools/nntool/expressions/symbolic/common/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/nntool/expressions/symbolic/float_quantization/float_qrec.py b/tools/nntool/expressions/symbolic/float_quantization/float_qrec.py index 5f2d4a2d7..bcb463e6f 100644 --- a/tools/nntool/expressions/symbolic/float_quantization/float_qrec.py +++ b/tools/nntool/expressions/symbolic/float_quantization/float_qrec.py @@ -26,6 +26,9 @@ def __init__(self, dtype: np.dtype, min_val=None, max_val=None) -> None: self._min_val = min_val self._max_val = max_val + def __repr__(self) -> str: + return self._dtype.__name__ + @property def min_val(self): return self._min_val diff --git a/tools/nntool/expressions/symbolic/float_quantization/float_quantization.py b/tools/nntool/expressions/symbolic/float_quantization/float_quantization.py index 00eb69f0c..fb6eae262 100644 --- a/tools/nntool/expressions/symbolic/float_quantization/float_quantization.py +++ b/tools/nntool/expressions/symbolic/float_quantization/float_quantization.py @@ -13,19 +13,17 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
-import math from typing import Tuple import numpy as np -from ..basic import Cast, ConvertFloatScaled +from ..basic import Cast, ConvertQuantization from ..q15_quantization.q15_scale_q_rec import Q15ScaleQRec from ..quantization_base import (QRecBase, QuantizationHandlerBase, handles_scheme) from ..symbol import Symbol, SymbolStats from .float_qrec import FloatQRec - @handles_scheme('Float') class FloatQuantization(QuantizationHandlerBase): @@ -64,28 +62,19 @@ def _quantize_output(cls, qtypes = kwargs.get('qtypes', {}) # first see if this has already been quantized by nntool # note that the qtype will be stored against the name of the output symbol - max_val, out_dtype, out_q = cls._get_scale_dtype_from_qtypes(osym, qtypes) - if max_val is not None: - qrec_out = Q15ScaleQRec(out_dtype, max_val, out_q) - # scale clip and cast to output type - return (ConvertFloatScaled(qsym, from_qrec=qrec, to_qrec=qrec_out), qrec_out) - if not out_dtype: - out_dtype = kwargs.get('out_dtype', np.float32) - # Just cast - return (Cast(qsym, dtype=out_dtype), FloatQRec(dtype=out_dtype, min_val=qrec.min_val, max_val=qrec.max_val)) + if not qtypes or osym.name not in qtypes: + out_dtype = kwargs.get('out_dtype', np.float32) + qrec_out = FloatQRec(out_dtype) + return (Cast(qsym, dtype=out_dtype), qrec_out) - @classmethod - def _get_scale_dtype_from_qtypes(cls, sym, qtypes): - if not qtypes or sym.name not in qtypes: - return None, None, None - qtype = qtypes[sym.name] - if qtype.dtype == np.int8: - if len(qtype.scale) > 1: - return None, None, None - return qtype.scale[0] * math.pow(2, 7), np.int8, 7 - elif qtype.dtype == np.int16: + qtype = qtypes[osym.name] + if qtype.dtype in [np.int8, np.uint8, np.int16, np.uint16]: if len(qtype.scale) > 1: - return None, None, None - return qtype.scale[0] * math.pow(2, 15), np.int16, 15 - else: - return None, qtype.dtype, None + out_dtype = kwargs.get('out_dtype', np.float32) + qrec_out = FloatQRec(out_dtype) + return (Cast(qsym, dtype=out_dtype), 
qrec_out) + max_val, min_val, bitlen = Q15ScaleQRec.dtype_zp_to_min_max(qtype.dtype, qtype.scale[0], qtype.zero_point) + qrec_out = Q15ScaleQRec(qtype.dtype, max_val, bitlen, min_val=min_val, max_val=max_val, zero_point=qtype.zero_point) + return (ConvertQuantization(qsym, from_qrec=qrec, to_qrec=qrec_out), qrec_out) + qrec_out = FloatQRec(dtype=qtype.dtype, max_val=qtype.max_val, min_val=qtype.min_val) + return (Cast(qsym, dtype=qtype.dtype),qrec_out) diff --git a/tools/nntool/expressions/symbolic/float_quantization/handlers.py b/tools/nntool/expressions/symbolic/float_quantization/handlers.py index 298139bcc..0c5c8a5e6 100644 --- a/tools/nntool/expressions/symbolic/float_quantization/handlers.py +++ b/tools/nntool/expressions/symbolic/float_quantization/handlers.py @@ -23,7 +23,7 @@ np_fastpow2, np_fastrsqrt, np_fastsigmoid, np_fasttanh) -from ..basic import (Abs, Add, ATan, Cast, Cos, Div, Exp, HSigmoid, HTanh, Log, +from ..basic import (Abs, Add, ATan, Cast, ConvertQuantization, Cos, Div, Exp, HSigmoid, HTanh, Log, Max, Min, Mul, Pow, RSqrt, Sigmoid, Sin, Sqrt, Sub, TanH) from ..function import Function from ..quantization_base import qhandler @@ -35,7 +35,6 @@ # from utils.sigmoid_tanh_lut import sigmoid_lut_float, tanh_lut_float - @qhandler("Float", Constant, Rational) class BasicConstantQuant(FloatQuantization): @@ -58,7 +57,7 @@ def _quantize(cls, sym_ctrl: SymbolStats, qrec: FloatQRec = None, **kwargs) -> Tuple[Symbol, FloatQRec]: - + # TODO: Needs merging with Q15 version prequantized_variables = kwargs.get('prequantized_variables', {}) qtypes = kwargs.get('qtypes', {}) @@ -75,25 +74,27 @@ def _quantize(cls, qrec = cls.qrec_from_qtype(qtypes[sym.name], max_val) if qrec: sym.qrec = qrec - return (sym, qrec) + if isinstance(qrec, FloatQRec): + return (sym, qrec) + out_dtype = kwargs.get('out_dtype', np.float32) + out_qrec = FloatQRec(dtype=out_dtype, max_val=max_val, min_val=-max_val) + return ( + ConvertQuantization(sym, from_qrec=qrec, to_qrec=out_qrec, 
tag=sym.name), + out_qrec) out_dtype = kwargs.get('out_dtype', np.float32) return sym, FloatQRec(dtype=out_dtype, max_val=max_val, min_val=-max_val) @classmethod def qrec_from_qtype(cls, qtype, max_val): - if qtype.dtype == np.int8 or qtype.dtype == np.int16: - if qtype.dtype == np.int8: - if len(qtype.scale) > 1: - qtype.scale = np.max(qtype.scale) - q = 7 - dtype = np.int8 - elif qtype.dtype == np.int16: - if len(qtype.scale) > 1: - qtype.scale = np.max(qtype.scale) - q = 15 - dtype = np.int16 - return Q15ScaleQRec(dtype, max_val, q, max_val=max_val, min_val=-max_val) + if qtype.dtype in [np.int8, np.uint8, np.int16, np.uint16]: + if len(qtype.scale) > 1: + return None + max_val, min_val, bitlen = Q15ScaleQRec.dtype_zp_to_min_max( + qtype.dtype, qtype.scale[0], qtype.zero_point[0]) + return Q15ScaleQRec(qtype.dtype, max_val, bitlen, + max_val=max_val, min_val=min_val, + zero_point=qtype.zero_point[0]) elif qtype.dtype in [np.float32, np.float16, bfloat16]: return FloatQRec(dtype=qtype.dtype, max_val=max_val, min_val=-max_val) else: @@ -157,6 +158,7 @@ def _c_expr(self, *args, **kwargs): # TODO - Need numpy equivalents of sin and cos # TODO - All of these should return correct function based on output type (i.e. 
bfloat16/ieee16 version) + @nargs(1) @environment({ 'npcos': np.cos, diff --git a/tools/nntool/expressions/symbolic/function.py b/tools/nntool/expressions/symbolic/function.py index e09fabecb..b44404655 100644 --- a/tools/nntool/expressions/symbolic/function.py +++ b/tools/nntool/expressions/symbolic/function.py @@ -16,6 +16,8 @@ import numpy as np +from generation.code_block import CodeBlock + from .symbol import Constant, Symbol, Variable, environment from .variable_container import VariableContainer @@ -81,16 +83,17 @@ def _resolve(self, **kwargs): for elem in self._contents] return self._eval(*contents, **kwargs) - def _calculate(self, calculate_ranges=False, track_results=None, **kwargs): + def _calculate(self, calculate_ranges=False, track_results=None, dequantize_outputs=False, **kwargs): contents = [elem.calculate(calculate_ranges=calculate_ranges, track_results=track_results, + dequantize_outputs=dequantize_outputs, **kwargs) for elem in self._contents] res = self._eval(*contents, **kwargs) if calculate_ranges: self.control.add_stat(self, res.value) if track_results is not None: - if self.qrec is not None: + if self.qrec is not None and dequantize_outputs: track_results[self.name] = self.qrec.dequantize( res.value.copy()) else: @@ -136,6 +139,38 @@ def py_compiled_lambda(self): def c_expr(self, *args, **kwargs) -> str: return self._c_expr(*(arg.c_expr(*args, **kwargs) for arg in self._contents)) + def c_block(self, code_block=None, tags=None, with_comment=False, **kwargs): + if code_block is None: + code_block = CodeBlock() + if tags is not None: + args = [] + for arg in self._contents: + arg.c_block(code_block=code_block, tags=tags, + with_comment=with_comment, **kwargs) + if arg.tag: + if arg in tags: + args.append(tags[arg]) + else: + name = tags.get(arg, f'{self.SYMBOL_PREFEX}{arg.name}') + if isinstance(name, tuple): + name = name[0].c_expr() + args.append(name) + else: + args.append(code_block.lines.pop(-1).strip()) + if self.tag: + if self.comment 
and with_comment: + code_block.write(f'// {self.comment}') + name = tags.get(self, f'{self.ctype} {self.SYMBOL_PREFEX}{self.name}') + if isinstance(name, tuple): + name = name[0].c_expr( + dtype=name[0].dtype, declare=name[1], **kwargs) + code_block.write(f'{name} = {self._c_expr(*args)};') + else: + code_block.write(f'{self._c_expr(*args)}') + else: + code_block.write(self.c_expr(*args, **kwargs)) + return code_block + @property def py_lambda(self) -> str: return "lambda %s: %s" % (",".join("%s=None" % (var) for var in self.unbound_variables), self.py_expr()) diff --git a/tools/nntool/expressions/symbolic/function_collection.py b/tools/nntool/expressions/symbolic/function_collection.py index ff6717ebd..880c3f5b8 100644 --- a/tools/nntool/expressions/symbolic/function_collection.py +++ b/tools/nntool/expressions/symbolic/function_collection.py @@ -31,7 +31,7 @@ class FunctionCollection(): def __init__(self, functions: Sequence[Tuple[Variable, Symbol]], qrecs=None) -> None: self._qrecs = qrecs # save map from produced variable to function - self._functions = {k: v for k, v in functions} + self._functions = {k: v for k, v in functions} # now create a map with producted variable name to free variables in function self._freevars = {var.name: set([name for name in func.unbound_variables.keys()]) for var, func in self._functions.items()} @@ -51,16 +51,18 @@ def __init__(self, functions: Sequence[Tuple[Variable, Symbol]], qrecs=None) -> for name, symbol in func.unbound_variables.items(): if name in self._vars: if self._vars[name] != symbol: - raise ValueError('%s points to more than one variable' % name) + raise ValueError( + '%s points to more than one variable' % name) else: self._vars[name] = symbol if res_symbol.name in self._vars: if self._vars[res_symbol.name] != res_symbol: - raise ValueError('%s points to more than one variable' % res_symbol.name) + raise ValueError( + '%s points to more than one variable' % res_symbol.name) else: self._vars[res_symbol.name] = 
res_symbol self.init_indexes() - + def init_indexes(self): # iterators contains list of iterators self._iterators = None @@ -168,7 +170,8 @@ def _create_indexes(self): key=lambda x: next(i for i in x)) idx_names = ["_".join(["d%s" % idx for idx in sorted(list(idxes))]) for idxes in unique_indexes] - idx_dims = [reduce(lambda x, y: x*max_shape[y], idxes, 1) for idxes in unique_indexes] + idx_dims = [reduce(lambda x, y: x*max_shape[y], idxes, 1) + for idxes in unique_indexes] self._iterators = [Variable(idx_name, shape=tuple([idx_dim]), dtype=np.int32) for idx_name, idx_dim in zip(idx_names, idx_dims)] if not self._iterators: @@ -202,7 +205,8 @@ def get_iterator_vars(self): if depth == 0: iters.extend([('First', 0), ('Last', var.shape[0])]) else: - iters.append((self.iterators[depth].name.upper(), var.shape[0])) + iters.append( + (self.iterators[depth].name.upper(), var.shape[0])) return iters def create_kernel(self, parallel_iterator, fixed_iterators, code_block=None): @@ -255,13 +259,23 @@ def create_kernel(self, parallel_iterator, fixed_iterators, code_block=None): assert produced_idx >= len(execution_order) return code_block - def produce_functions(self, produced_idx, execution_order, index_dependencies, depth, code_block): + def produce_functions(self, produced_idx, execution_order, index_dependencies, depth, code_block, tags=None): while (produced_idx < len(execution_order) and index_dependencies[execution_order[produced_idx].name] == depth): + this_tags = {} if tags is None else tags.copy() var = execution_order[produced_idx] declare = var.name in self.intermediate_names - code_block.write("{} = {};", var.c_expr(declare=declare, dtype=var.dtype), - self._functions[var].c_expr()) + # write comment with quantization if present + uvars = [f'{uvar.name}: {uvar.qrec}' + for uvar in self._functions[var].unbound_variables.values() + if uvar.qrec] + if uvars: + uvars = " ".join(uvars) + code_block.write(f'// inputs {uvars}') + this_tags[self._functions[var]] = (var, 
declare) + self._functions[var].tag = True + self._functions[var].c_block(code_block=code_block, tags=this_tags) + self._functions[var].tag = False produced_idx += 1 return produced_idx diff --git a/tools/nntool/expressions/symbolic/iteration_space.py b/tools/nntool/expressions/symbolic/iteration_space.py index fe07dea7b..c2f8327ee 100644 --- a/tools/nntool/expressions/symbolic/iteration_space.py +++ b/tools/nntool/expressions/symbolic/iteration_space.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 GreenWaves Technologies, SAS +# Copyright (C) 2022 GreenWaves Technologies, SAS # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -13,31 +13,69 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . + from functools import partial, reduce from itertools import groupby from operator import itemgetter from typing import List, Sequence import numpy as np -from quantization.qtype import DTYPE_GAP_CTYPE +from bfloat16 import bfloat16 +from generation.code_block import CodeBlock +from quantization.qtype import DTYPE_GAP_CTYPE, DTYPES +from .assignments import Assignments from .symbol import Symbol -from .variable_container import VariableContainerAndAssigner def split_list(sequence: Sequence): return [list(map(itemgetter(1), g)) for k, g in groupby(enumerate(sequence), lambda x: x[0]-x[1])] +def tabulate(lines): + """Takes a list of lists of strings and lines up lengths to improve formating""" + # get max length of each segment + max_len = tuple(max(elems) for elems + in zip(*[tuple(len(line_elem) for line_elem in line) + for line in lines])) + # add spaces to each line segment + + def reduction(s, x): + s.append( + f'{"".join(elem + " " * (max_len[idx] - len(elem)) for idx, elem in enumerate(x[:-1]))}{x[-1]}') + return s + return reduce(reduction, lines, []) + + +AT_ARG_DATATYPES = { + np.uint8: ('CNN_ArgDataTypeUns', 1, False, 
False), + np.uint16: ('CNN_ArgDataTypeUns', 2, False, False), + np.uint32: ('CNN_ArgDataTypeUns', 4, False, False), + np.uint64: ('CNN_ArgDataTypeUns', 8, False, False), + np.int8: ('CNN_ArgDataType', 1, True, False), + np.int16: ('CNN_ArgDataType', 2, True, False), + np.int32: ('CNN_ArgDataType', 4, True, False), + np.int64: ('CNN_ArgDataType', 8, True, False), + np.float16: ('CNN_ArgDataTypeF', 2, True, True), + bfloat16: ('CNN_ArgDataTypeF', 2, True, True), + np.float32: ('CNN_ArgDataTypeF', 4, True, True), +} + class IterationSpace(): - def __init__(self, assigner: VariableContainerAndAssigner, min_interior_space=1000) -> None: - self._assigner = assigner + def __init__(self, assignments: Assignments, constants=None, min_interior_space=1000, max_interior_space=10000, num_parameteric_spaces=2) -> None: + self._assignments = assignments self._var_shapes = None self._var_axes = None + self._var_is_constant = {} self._spaces = None + self._var_axes_idx = None self._min_interior_space = min_interior_space + self._max_interior_space = max_interior_space self._num_workers = 8 self._var_strides = {} + self._num_parameteric_spaces = num_parameteric_spaces + if constants: + self.vars_are_constant(*constants) @staticmethod def _var_name(idx): @@ -68,10 +106,27 @@ def set_var_stride(self, vname, stride): self._var_axes = None self._spaces = None self._var_strides[vname] = stride + self._var_axes_idx = None + + def is_var_constant(self, var_name): + return self._var_is_constant.get(var_name, False) + + def vars_are_constant(self, *var_names): + for var_name in var_names: + self._var_is_constant[var_name] = True + return self + + @property + def assignments(self): + return self._assignments + + @property + def real_shape(self): + return tuple(dim for dim in self.shape if dim != 1) @property def shape(self): - return max(zip(*Symbol.extend_shapes(*self._assigner.unbound_shapes))) + return self.assignments.max_shape @property def full_rank(self): @@ -96,16 +151,16 @@ def 
extended_strides(self): return tuple(reduce(lambda state, x: state + [frozenset(set.union(*x))], zip(*tuple(vstrides.values())), [])) @property - def produced_variables(self): - return set(self._assigner.returned_variables) + def output_names(self): + return self._assignments.output_names @property - def consumed_variables(self): - return set(self._assigner.unbound_variables) + def input_names(self): + return self._assignments.input_names @property - def temporary_variables(self): - return set(self.variables) - self.produced_variables - self.consumed_variables + def temporary_names(self): + return self._assignments.intermediate_names def space_for_axis(self, axis): return next((axes for axes in self.spaces if axis in axes), None) @@ -123,18 +178,35 @@ def var_axes(self): for vname, shape in self.var_shapes.items()} return self._var_axes + @property + def var_axes_idx(self): + """Map of variable name to index of iteration space used + + Returns: + dict: Map of variable name to index of iteration space used + """ + if self._var_axes_idx is None: + self._var_axes_idx = {vname: tuple(self.spaces.index(dim) for dim in axes) + for vname, axes in self.var_axes.items()} + return self._var_axes_idx + @property def variables(self): """Set of variable names """ return set(self.var_shapes) + @property + def spaces_size(self): + return tuple(int(np.prod([self.shape[idx] for idx in space])) for space in self.spaces) + @property def spaces(self): """Set of disjoint iteration spaces that have the same set of strides """ if self._spaces is None: - spaces = self._assigner.axes + max_shape = self.assignments.max_shape + spaces = self._assignments.axes # here we modify grouped spaces so that continuous spaces have the same stride if self._var_strides: final_spaces = list(spaces) @@ -148,12 +220,14 @@ def spaces(self): def reduction(var_stride, state: List, space): space_strides = {} for dim in space: - space_strides.setdefault(var_stride[dim], []).append(dim) + 
space_strides.setdefault( + var_stride[dim], []).append(dim) for space_group in space_strides.values(): state.extend(split_list(space_group)) return state - final_spaces = reduce(partial(reduction, var_stride), final_spaces, []) + final_spaces = reduce( + partial(reduction, var_stride), final_spaces, []) self._spaces = tuple(sorted(tuple(sorted(space)) for space in final_spaces)) else: @@ -161,57 +235,89 @@ def reduction(var_stride, state: List, space): return self._spaces + @property + def expanded_spaces(self): + res = [] + last = 0 + for space in self.spaces: + res.append(tuple(range(last, min(space))) + space) + if res: + res[-1] = res[-1] + tuple(range(max(res[-1])+1, len(self.shape))) + return tuple(res) + + @property + def space_total_items(self): + return tuple(np.stack(list(self.var_shapes.values())).sum(axis=0)) + + @property + def space_total_bytes(self): + variables = [self.assignments.variables[name] for name in self.var_shapes] + sizes = [1 if var.dtype is None else AT_ARG_DATATYPES[var.dtype][1] for var in variables] + return tuple((np.stack(list(self.var_shapes.values())) * np.array(sizes).reshape((-1, 1))).sum(axis=0)) + @property def var_shapes(self): if self._var_shapes is None: - self._var_shapes = self._assigner.var_shapes.copy() + self._var_shapes = self._assignments.var_shapes.copy() self._var_shapes = dict( zip(self._var_shapes.keys(), Symbol.extend_shapes(*self._var_shapes.values()))) return self._var_shapes @property - def axis_shape(self): - return tuple(np.prod([self.shape[axis] for axis in axis_set]) for axis_set in self.spaces) + def has_scalar_parameters(self): + return any(len(shape) == 1 and shape[0] == 1 for shape in self.var_shapes.values()) + @property def iterator_names(self): return [self._var_name(idx) for idx in range(len(self.spaces))] @property - def interior_space(self): + def interior_spaces(self): """This provides the minimum tile space if it is more than one axis""" + expanded_spaces = list(self.expanded_spaces) + if 
len(expanded_spaces) <= 1: + return tuple() dims = [] - shape = list(self.axis_shape) + bytes = self.space_total_bytes total = 1 - while len(shape) > 1 and total < self._min_interior_space: - dims.append(len(shape) - 1) - total *= shape[-1] - shape = shape[0:-1] - return tuple(reversed(dims)) + while len(expanded_spaces) > 1 and total < self._min_interior_space: + new_size = total * np.prod([bytes[idx] for idx in expanded_spaces[-1]]) + if new_size * 8 > self._max_interior_space: + break + dims.append(len(expanded_spaces) - 1) + total = new_size + expanded_spaces = expanded_spaces[0:-1] + return tuple(self.spaces[idx] for idx in reversed(dims)) @property def interior_shape(self): - shape = list(self.axis_shape) - return tuple(shape[idx] for idx in self.interior_space) + return tuple(self.shape[self.spaces.index(space)] for space in self.interior_spaces) @property def exterior_spaces(self): - return tuple(range(len(self.axis_shape) - len(self.interior_space))) + return tuple(self.spaces[idx] for idx in range(len(self.spaces_size) - len(self.interior_spaces))) @property def exterior_space_names(self): - return tuple(self._par_name(idx) for idx in range(len(self.exterior_spaces))) + return tuple(self._par_name(self.spaces.index(space)) for space in range(len(self.exterior_spaces))) @property def exterior_shape(self): - shape = list(self.axis_shape) + shape = list(self.spaces_size) num_ext_spaces = len(self.exterior_spaces) return tuple(shape[:num_ext_spaces - 1] + [shape[num_ext_spaces - 1] * np.prod(self.interior_shape)]) @property - def parameteric_spaces(self): - return tuple(self.exterior_spaces[-2:]) + def parametric_spaces(self): + return tuple(self.exterior_spaces[-self._num_parameteric_spaces:]) + + @property + def paralellized_space(self): + if self.parametric_spaces: + return max([(space, self.real_shape[self.spaces.index(space)]) for space in self.parametric_spaces], key=lambda x: x[1])[0] + return 0 @property def interior_shape_size(self): @@ -219,7 +325,7 
@@ def interior_shape_size(self): @property def fixed_spaces(self): - return tuple(self.exterior_spaces[:-2]) + return tuple(self.exterior_spaces[:-self._num_parameteric_spaces]) def preferred_divisor(self, space): if space == 0: @@ -232,19 +338,22 @@ def preferred_divisor(self, space): return 1 def c_indexed_var(self, var_name, declare=False, assignment=False): - if var_name in self.temporary_variables: + # if var_name.startswith('_SYMBOL'): + # return var_name + if var_name in self.temporary_names: if declare or assignment: - dtype = self._assigner.find(var_name).dtype + dtype = self._assignments.variable(var_name).dtype return f"{DTYPE_GAP_CTYPE[dtype]} {var_name}" return var_name if declare: - dtype = self._assigner.find(var_name).dtype + dtype = self._assignments.variable(var_name).dtype return f"{DTYPE_GAP_CTYPE[dtype]} *{var_name}" - return f'{var_name}{self.c_index(var_name)}' + c_index = self.c_index(var_name) + if c_index: + return f'{var_name}{c_index}' + return f'*{var_name}' def c_index(self, var_name): - var_spaces = [self.spaces.index(space) - for space in self.var_axes[var_name]] var_ext_shape = self.var_shapes[var_name] var_shape = [np.prod([var_ext_shape[dim] for dim in space]) for space in self.var_axes[var_name]] @@ -256,23 +365,24 @@ def c_index(self, var_name): assert all(var_stride_in_space[-1] == var_stride[dim] for dim in space[1:]) else: - var_stride_in_space = [1] * len(var_spaces) + var_stride_in_space = [1] * len(self.var_axes[var_name]) def reduction(state, x): var_space, space_dim, var_stride = x + var_space_idx = self.spaces.index(var_space) # fixed spaces are iterated by tiler code if var_space in self.fixed_spaces: return state space_size = str( - space_dim) if var_space in self.interior_space else self._var_max_name(var_space) + space_dim) if var_space == self.interior_spaces else self._var_max_name(var_space_idx) assert abs(var_stride) == 1, "non unit strides not supported yet" if var_stride < 0: if var_space == 0: - index = 
f'(Last-1-{self._var_name(var_space)})' + index = f'(Last-1-{self._var_name(var_space_idx)})' else: - index = f'({space_size}-1-{self._var_name(var_space)})' + index = f'({space_size}-1-{self._var_name(var_space_idx)})' else: - index = f'{self._var_name(var_space)}' + index = f'{self._var_name(var_space_idx)}' if state[0]: state[1].insert( 0, f"({index}*{'*'.join(state[0])})") @@ -280,20 +390,313 @@ def reduction(state, x): state[1].insert(0, index) state[0].insert(0, str( - space_dim) if var_space in self.interior_space else self._var_max_name(var_space)) + space_dim) if var_space == self.interior_spaces else self._var_max_name(var_space_idx)) return state - index = reduce(reduction, zip(reversed(var_spaces), + index = reduce(reduction, zip(reversed(self.var_axes[var_name]), reversed(var_shape), reversed(var_stride_in_space)), ([], []))[1] - return f"[{'+'.join(index)}]" + return f"[{'+'.join(index)}]" if index else "" + + def get_iterator_vars(self): + iters = [] + for idx, space in enumerate(self.spaces): + if space in self.interior_spaces: + continue + if space == self.paralellized_space: + iters.extend([('First', 0), ('Last', self.spaces_size[idx]), (self._var_max_name(idx), self.spaces_size[idx])]) + else: + iters.append( + (self._var_max_name(idx), self.spaces_size[idx])) + return iters - def c_for(self, space): - if space in self.fixed_spaces: + def c_for(self, space, with_fixed=False): + if not with_fixed and space == self.fixed_spaces: raise ValueError( - "space is fixed so not iterated and requires no for loop") - var_name = self._var_name(space) - if space in self.interior_space: - return f"for (int {var_name}=0; {var_name}<{self.shape[space]}; {var_name}++)" - if space == 0: + "space is fixed so not iterated inside basic kernel and requires no for loop") + space_index = self.spaces.index(space) + var_name = self._var_name(space_index) + if space in self.interior_spaces: + return f"for (int {var_name}=0; {var_name}<{self.real_shape[space_index]}; 
{var_name}++)" + if space == self.paralellized_space: return f"for (int {var_name}=First; {var_name}= len(self.exterior_spaces): + int_size *= self.spaces_size[space_idx] + continue + if var_stride and var_stride[var_dim_idx] < 0: + iter_space_descrs.append( + f'KER_ITER_D{space_idx}|SPACE_PROP_REVERT') + else: + iter_space_descrs.append(f'KER_ITER_D{space_idx}') + if iter_space_descrs: + argspace = f'KerArgSpace({len(iter_space_descrs)}, {", ".join(iter_space_descrs)})' + else: + argspace = 'KerArgSpace(1, KER_ITER_TILE0)' + if var_name in self.output_names: + constraints = "O_OUT|O_DB" if iter_space_descrs else "O_OUT|O_BUFF|O_NTILED" + elif self.is_var_constant(var_name): + constraints = "O_IN|O_DB|O_CONST" + else: + constraints = "O_IN|O_DB" if iter_space_descrs else "O_IN|O_BUFF|O_NTILED" + kargs.append( + (f'KerArg("{var_name}", ', + f'{argspace}, ', + f'{constraints}, ', + f'1, 1, ', + f'{self.ctype_len(var_name) * int_size}, ', + f'0, 0, 0, "{var_name}")')) + return tabulate(kargs) + + @property + def at_uk_cargs(self): + return ([f'TCArg({self.at_argdatatype(var_name, pointer=True, restrict=True)}, "{var_name}")' + for var_set in [self.input_names, self.output_names] + for var_name in sorted(var_set)]) + + @property + def at_uk_kinfos(self): + cvars = sorted(self.input_names) + pvars = sorted(self.output_names) + in_sizes = [np.prod(self.var_shapes[var_name]) + for var_name in cvars] + out_sizes = [np.prod(self.var_shapes[var_name]) + for var_name in pvars] + bandwidth = sum(in_sizes + out_sizes) + kinfos = [ + f"AddKernelInfos(Name, AT_KERINFO_OPER, {self._assignments.ops * max(in_sizes)}, 0)", + f"AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, {bandwidth}, 0)" + ] + ksize_infos = [] + for var_name in cvars + pvars: + shape = reduce( + lambda s, x: s + [x] if x > 1 or s else s, self.var_shapes[var_name], []) + _, item_size, _, is_float = AT_ARG_DATATYPES[ + self._assignments.variables[var_name].dtype] + add_arg_func = "AddKernelFloatArgDim" if is_float else 
"AddKernelArgDim" + ksize_infos.append( + (f'{add_arg_func}(Name, "{var_name}", ', + f'{len(shape) + 1}, ', + f'{", ".join(str(dim) for dim in shape) if shape else 1}, ', + f'{item_size})')) + return kinfos + tabulate(ksize_infos) + + @property + def kernel_args(self): + return ([(self._var_max_name(self.spaces.index(space)), 'unsigned int') for space in self.exterior_spaces] + + [(var_name, self.ctype(var_name, pointer=True, restrict=True)) + for var_set in [self.input_names, self.output_names] + for var_name in sorted(var_set)]) + + @property + def at_uk_kerbindings(self): + bindings = [ + f"K_ArgPar(\"{max(self.output_names, key=lambda x: self.var_shapes[x][idx])}\", KER_ARG_PARTILE_SIZE, KER_ITER_D{idx})" + for idx in range(len(self.exterior_shape)) + ] + [ + f"K_Arg(\"{var_name}\", KER_ARG_TILE)" + for var_set in [self.input_names, self.output_names] + for var_name in sorted(var_set) + ] + return bindings + + def comment_attrs(self, code, *attrs): + code.comment("".join(f'{name}: {getattr(self, name)} ' if getattr(self, name) else '' + for name in attrs)) + + def gen_kernel_headers(self, code: CodeBlock = None): + if code is None: + code = CodeBlock() + for include in self._assignments.c_header_set: + code.write('#include {}', include) + return code + + def gen_user_kernel(self, ukname: str, kname: str, code: CodeBlock = None): + if code is None: + code = CodeBlock() + code.write(f"int {ukname}(char *Name) {{") + code.indent() + code.write("Kernel_T *Kernel = UserKernel(") + code.indent() + code.write("Name,") + # include some useful parameters as comment + self.comment_attrs(code, + 'shape', + 'spaces') + self.comment_attrs(code, + 'fixed_spaces', + 'parametric_spaces', + 'interior_spaces') + self.comment_attrs(code, + 'exterior_shape', + 'interior_shape') + code.write(f'{self.at_uk_iterspace},') + kargs = self.at_uk_kargs + code.write("TILE_VER,") + cargs = self.at_uk_cargs + code.write(f"CArgs({len(cargs)},") + code.indent() + for carg in cargs[: -1:]: + 
code.write(f"{carg},") + code.write(f"{cargs[-1]}") + code.deindent() + code.write("),") + code.write("Calls(1,") + code.indent() + code.write(f'Call("{kname}", LOC_D{len(self.exterior_shape) - 1},') + code.indent() + bindings = self.at_uk_kerbindings + code.write(f"Bindings({len(bindings)},") + code.indent() + for binding in bindings[: -1:]: + code.write(f"{binding},") + code.write(f"{bindings[-1]}") + code.deindent() + code.write(")") + code.deindent() + code.write(")") + code.deindent() + code.write("),") + for var_name, idxes in self.var_axes_idx.items(): + if var_name in self.temporary_names: + continue + stride = f" stride: {self._var_strides[var_name]}" if var_name in self._var_strides else "" + code.comment(f'var: {var_name} axes: {idxes}{stride}') + code.write("KerArgs({0},", len(kargs)) + code.indent() + for karg in kargs[: -1:]: + code.write("{0},", karg) + code.write("{0}", kargs[-1]) + code.deindent() + code.write(")") + code.deindent() + code.write(");") + code.write("if (Kernel) {") + code.indent() + for kinfo in self.at_uk_kinfos: + code.write("{0};", kinfo) + code.deindent() + code.write("}") + code.write("return (Kernel!=0);") + code.deindent() + code.write("}") + return code + + def gen_function(self, kernel_name: str, kernel_arg_type_name: str, code: CodeBlock = None): + if code is None: + code = CodeBlock() + + code.comment( + f'Output iteration space reduced to {len(self.interior_spaces)} internal ' + f'and {len(self.exterior_spaces)} external iteration spaces') + code.write(f"void {kernel_name}({kernel_arg_type_name} *Args) {{") + code.indent() + comments = [] + for kerarg_name, _ in self.kernel_args: + # TODO - add qrecs for quantized kernels + comments.append([ + f'{self.var_shapes[kerarg_name]} ' if kerarg_name in self.var_shapes else '', + f'{self._assignments.qrecs[kerarg_name]}' if kerarg_name in self._assignments.qrecs else '' + ]) + comments = tabulate(comments) + for idx, (kerarg_name, kerarg_type) in enumerate(self.kernel_args): + # 
TODO - add qrecs for quantized kernels + comment = comments[idx] + if comment.strip(): + comment = f' // {comment}' + else: + comment = '' + code.write( + f'{kerarg_type} {kerarg_name} = Args->{kerarg_name};{comment}') + # paralellize on largest dimension + last_first = self._var_max_name(self.spaces.index(self.paralellized_space)) + code.write('unsigned int CoreId = gap_coreid();') + code.write(f'unsigned int Chunk = ChunkSize({last_first});') + code.write('unsigned int First = Chunk*CoreId;') + code.write(f'unsigned int Last = gap_min(First+Chunk, {last_first});') + self._assignments.c_block(code, iteration_space=self, + with_loops=True, with_comment=True) + code.write('gap_waitbarrier(0);') + code.deindent() + code.write('}') + return code + + def gen_kernel_arg_typedecl(self, type_name, code=None): + if code is None: + code = CodeBlock() + code.write('typedef struct {') + code.indent() + for kerarg_name, kerarg_type in self.kernel_args: + code.write(f'{kerarg_type} {kerarg_name};') + code.deindent() + code.write(f'}} {type_name};') + return code + + def gen_kernel_model(self, kernel_name, kernel_arg_type_name, code=None): + if code is None: + code = CodeBlock() + code.write('LibKernelTemplate(') + code.indent() + code.write(f'"{kernel_arg_type_name}",') + code.write(f'CArgs({len(self.kernel_args)},') + code.indent() + for idx, (kerarg_name, kerarg_type) in enumerate(self.kernel_args): + code.write('TCArg("{}", "{}"){}', + kerarg_type, + kerarg_name, + "," if idx < (len(self.kernel_args) - 1) else '') + code.deindent() + code.write(')') + code.deindent() + code.write(');') + code.write('') + code.write('LibKernel(') + code.indent() + code.write('"{}",', kernel_name) + code.write('CALL_PARALLEL,') + code.write('0,') + code.write('"{}",', kernel_arg_type_name) + code.write('0') + code.deindent() + code.write(');') + + return code diff --git a/tools/nntool/expressions/symbolic/kernel_codegen.py b/tools/nntool/expressions/symbolic/kernel_codegen.py deleted file mode 
100644 index 1e45a6ab3..000000000 --- a/tools/nntool/expressions/symbolic/kernel_codegen.py +++ /dev/null @@ -1,313 +0,0 @@ -# Copyright (C) 2020 GreenWaves Technologies, SAS - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -import logging -from typing import Sequence - -import numpy as np -from generation.code_block import CodeBlock -from quantization.qtype import DTYPES - -from .function_collection import FunctionCollection - -LOG = logging.getLogger("nntool." 
+ __name__) - - -class BasicKernel(): - def __init__(self, func_col: FunctionCollection, constant_input_names: Sequence[str]) -> None: - self._func_col = func_col - self._constant_input_names = constant_input_names - - @property - def func_col(self): - return self._func_col - - @property - def input_names(self): - return self._func_col.input_names - - @property - def input_names_and_ctypes(self): - return [(name, self.func_col.qrecs[name].ctype) for name in self.input_names] - - @property - def output_names(self): - return self._func_col.output_names - - @property - def output_names_and_ctypes(self): - return [(name, self.func_col.qrecs[name].ctype) for name in self.output_names] - - @property - def intermediate_names(self): - return self._func_col.intermediate_names - - @property - def shapes(self): - return self._func_col.var_shapes - - @property - def kernel_dims(self): - return self._func_col.kernel_dims - - @property - def kernel_args(self): - kernel_args = [] - for kiter in self.func_col.iterators: - kernel_args.append((kiter.name.upper(), "unsigned int")) - for input_name, ctype in self.input_names_and_ctypes: - kernel_args.append((input_name, f"{ctype} *")) - for output_name, ctype in self.output_names_and_ctypes: - kernel_args.append((output_name, f"{ctype} *")) - return kernel_args - - def ctype_len(self, sym_name): - dtype = self.func_col.qrecs[sym_name].dtype - if dtype not in DTYPES: - raise ValueError(f"don't know dtype {dtype}") - return DTYPES[dtype][0]//8 - - def gen_kernel_headers(self, code: CodeBlock = None): - if code is None: - code = CodeBlock() - for include in self._func_col.c_header_set: - code.write('#include {}', include) - return code - - def gen_user_kernel(self, ukname: str, kname: str, code: CodeBlock = None): - if code is None: - code = CodeBlock() - code.write("int {0}(char *Name) {{", ukname) - code.indent() - code.write("Kernel_T *Kernel = UserKernel(") - code.indent() - code.write("Name,") - code.write("{0},", 
self.gen_iterspace()) - kargs = self.gen_kargs() - code.write("TILE_HOR,") - cargs = self.gen_cargs() - code.write("CArgs({0},", len(cargs)) - code.indent() - for carg in cargs[:-1:]: - code.write("{0},", carg) - code.write("{0}", cargs[-1]) - code.deindent() - code.write("),") - code.write("Calls(1,") - code.indent() - code.write("Call(\"{0}\", LOC_D{1},", kname, - len(self.func_col.iterators) - 1) - code.indent() - bindings = self.gen_kerbingings() - code.write("Bindings({0},", len(bindings)) - code.indent() - for binding in bindings[:-1:]: - code.write("{0},", binding) - code.write("{0}", bindings[-1]) - code.deindent() - code.write(")") - code.deindent() - code.write(")") - code.deindent() - code.write("),") - code.write("KerArgs({0},", len(cargs)) - code.indent() - for karg in kargs[:-1:]: - code.write("{0},", karg) - code.write("{0}", kargs[-1]) - code.deindent() - code.write(")") - code.deindent() - code.write(");") - code.write("if (Kernel) {") - code.indent() - for kinfo in self.gen_kinfos(): - code.write("{0};", kinfo) - code.deindent() - code.write("}") - code.write("return (Kernel!=0);") - code.deindent() - code.write("}") - return code - - def gen_kinfos(self): - in_sizes = [np.prod(self._func_col.var_shapes[var_name]) - for var_name in self._func_col.input_names] - bandwidth = sum([np.prod(self._func_col.var_shapes[var_name]) - for var_name in self._func_col.output_names]) + sum(in_sizes) - kinfos = [ - "AddKernelInfos(Name, AT_KERINFO_OPER, {0}, 0)".format( - self._func_col.ops * max(in_sizes)), - "AddKernelInfos(Name, AT_KERINFO_BANDWIDTH, {0}, 0)".format( - bandwidth) - ] - for name_type in self.input_names_and_ctypes + self.output_names_and_ctypes: - name = name_type[0] - shape = self.shapes[name] - kinfos.append("{0}(Name, \"{1}\", {2}, {3}, {4})".format( - "AddKernelFloatArgDim" if name_type[1] == 'F16' or name_type[1] == 'float' else "AddKernelArgDim", - name, len(shape) + 1, ", ".join(str(dim) for dim in shape), - self.ctype_len(name))) - 
return kinfos - - def gen_cargs(self): - cargs = [] - for name_type in self.input_names_and_ctypes + self.output_names_and_ctypes: - name = name_type[0] - if name_type[1] == 'F16' or name_type[1] == 'float': - cargs.append("TCArg(CNN_ArgDataTypeF({0},1,1),\"{1}\")".format( - self.ctype_len(name), name)) - else: - cargs.append("TCArg(CNN_ArgDataType({0},1,1),\"{1}\")".format( - self.ctype_len(name), name)) - return cargs - - def gen_kargs(self): - kargs = [] - for input_name in self.input_names: - arg_indexes = self._func_col.variable_indexes[input_name] - argspaces = ", ".join(f'KER_ITER_D{idx}' for idx in arg_indexes) - argspace = f'KerArgSpace({len(arg_indexes)}, {argspaces})' if arg_indexes else f'KerArgSpace(1, KER_ITER_TILE0)' - if input_name in self._constant_input_names: - constraints = "O_IN|O_DB|O_CONST" - else: - constraints = "O_IN|O_DB" if arg_indexes else "O_IN|O_BUFF|O_NTILED" - kargs.append("KerArg(\"{0}\", {1}, {2}, {3}, {4}, {5}, 0, 0, 0, \"{0}\")".format( - input_name, - argspace, - constraints, - 1, - 1, - self.ctype_len(input_name))) - - for output_name in self.output_names: - arg_indexes = self._func_col.variable_indexes[output_name] - argspaces = ", ".join(f'KER_ITER_D{idx}' for idx in arg_indexes) - argspace = f'KerArgSpace({len(arg_indexes)}, {argspaces})' if arg_indexes else 'KerArgSpace(1, KER_ITER_TILE0)' - name = output_name - constraints = "O_OUT|O_DB" if arg_indexes else "O_OUT|O_BUFF|O_NTILED" - kargs.append("KerArg(\"{0}\", {1}, {2}, {3}, {4}, {5}, 0, 0, 0, \"{0}\")".format( - name, argspace, constraints, 1, 1, - self.ctype_len(output_name))) - return kargs - - def gen_iterspace(self): - # All iterators are in parametric spaces. 
The iterator we will - # parallelize on has its preferred div set to 8 - # since only 3 tiled spaces are allowed including the dummy TILE0 space if there are scalars - # we check for that and only tile the first 3 spaces - tiled_iterators = self.tiled_iterators - iterators = [ - f'IterFixedSpace(KER_ITER_D{idx}, {iterator.shape[0]})' - if iterator not in tiled_iterators else - f'IterParSpace(KER_ITER_D{idx}, {iterator.shape[0]}, ' - f'{min(8, iterator.shape[0]) if iterator == self.parallel_iterator else 1})' - for idx, iterator in enumerate(self._func_col.iterators)] - # append dummy TILE0 space to put scalars into if there are scalar inputs (which is unlikely) - if self.has_scalar_parameters: - iterators.append('IterTiledSpace(KER_ITER_TILE0)') - return f'KernelIterSpace({len(iterators)}, {",".join(iterators)})' - - - def gen_kerbingings(self): - max_dim_var = max(self.output_names, key=lambda x: len(self.shapes[x])) - bindings = [ - f"K_ArgPar(\"{max_dim_var}\", KER_ARG_PARTILE_SIZE, KER_ITER_D{idx})" - for idx in range(len(self._func_col.iterators)) - ] + [ - f"K_Arg(\"{name}\", KER_ARG_TILE)" - for name in self.input_names + self.output_names - ] - return bindings - - @property - def parallel_iterator(self): - return max(self.func_col.iterators, key=lambda x: x.shape[0]) - - @property - def tiled_iterators(self): - return sorted(self.func_col.iterators, key=lambda x: x.shape[0])[-2::] - - @property - def fixed_iterators(self): - tiled_iterators = self.tiled_iterators - return [iterator for iterator in self.func_col.iterators if iterator not in tiled_iterators] - - @property - def has_scalar_parameters(self): - return any(not self._func_col.variable_indexes[input_name] - for input_name in self.input_names + self.output_names) - - def gen_function(self, kernel_name, kernel_arg_type_name, code=None): - if code is None: - code = CodeBlock() - - code.comment( - "Output iteration space reduced to %s iteration spaces" % (self.kernel_dims)) - code.write(f"void 
{kernel_name}({kernel_arg_type_name} *Args) {{") - code.indent() - for kerarg_name, kerarg_type in self.kernel_args: - code.write('{0} {1} = Args->{1};', kerarg_type, kerarg_name) - # paralellize on largest dimension - last_first = self.parallel_iterator.name.upper() - code.write('unsigned int CoreId = gap_coreid();') - code.write('unsigned int Chunk = ChunkSize({});', last_first) - code.write('unsigned int First = Chunk*CoreId;') - code.write('unsigned int Last = gap_min(First+Chunk, {});', last_first) - self._func_col.create_kernel(self.parallel_iterator, self.fixed_iterators, code) - code.write('gap_waitbarrier(0);') - code.deindent() - code.write('}') - return code - - def kernel_arg_type_codegen(self, type_name, code=None): - if code is None: - code = CodeBlock() - code.write('typedef struct {') - code.indent() - for kerarg_name, kerarg_type in self.kernel_args: - code.write('{} {};', kerarg_type, kerarg_name) - code.deindent() - code.write('}} {};', type_name) - return code - - def gen_kernel_model(self, kernel_name, kernel_arg_type_name, code=None): - if code is None: - code = CodeBlock() - code.write('LibKernelTemplate(') - code.indent() - code.write('"{}",', kernel_arg_type_name) - code.write('CArgs({},', len(self.kernel_args)) - code.indent() - for idx, (kerarg_name, kerarg_type) in enumerate(self.kernel_args): - code.write('TCArg("{}", "{}"){}', - kerarg_type, - kerarg_name, - "," if idx < (len(self.kernel_args) - 1) else '') - code.deindent() - code.write(')') - code.deindent() - code.write(');') - code.write('LibKernel(') - code.indent() - code.write('"{}",', kernel_name) - code.write('CALL_PARALLEL,') - code.write('0,') - code.write('"{}",', kernel_arg_type_name) - code.write('0') - code.deindent() - code.write(');') - - return code diff --git a/tools/nntool/expressions/symbolic/q15_quantization/handlers.py b/tools/nntool/expressions/symbolic/q15_quantization/handlers.py index 4cd591872..eab7394a4 100644 --- 
a/tools/nntool/expressions/symbolic/q15_quantization/handlers.py +++ b/tools/nntool/expressions/symbolic/q15_quantization/handlers.py @@ -17,24 +17,26 @@ from typing import Tuple import numpy as np +from bfloat16 import bfloat16 +from expressions.symbolic.float_quantization.float_qrec import FloatQRec from utils.exp_17_15 import exp_fp_17_15 -from utils.pow_sqrt import (arctan_17_15alt, logn_17_15, pow_17_15, rsqrt_16_16, sqrt_17_15, - square_17_15) +from utils.pow_sqrt import (arctan_17_15alt, logn_17_15, pow_17_15, + rsqrt_16_16, sqrt_17_15, square_17_15) from utils.sigmoid_tanh_lut import sigmoid_lut, tanh_lut from utils.sin_cos import fpcos, fpsin from ..basic import (Abs, Add, ATan, Cast, Cos, Div, Exp, GapAbs, GapMax, - GapMin, Log, LShift, Max, Min, Mul, Pow, RSqrt, Sigmoid, Sin, - Sqrt, Sub, TanH) + GapMin, Log, LShift, Max, Min, Mul, Pow, RSqrt, Sigmoid, + Sin, Sqrt, Sub, TanH) from ..function import Function from ..quantization_base import qhandler -from ..symbol import (Constant, Rational, Symbol, SymbolStats, Variable, - c_headers, environment, nargs) +from ..symbol import (Constant, QuantizedConstant, QuantizedValue, Rational, + Symbol, SymbolStats, Variable, c_headers, environment, + nargs) from .clip_norm import Clip, Norm from .q15_scale_float import Q15ScaleFloat from .q15_scale_q_rec import Q15ScaleQRec from .q15_scaled_quantization import Q15ScaledQuantization -from .quantized_constant import QuantizedConstant, QuantizedValue from .scale_quantized import ScaleQuantized @@ -91,10 +93,22 @@ def _quantize(cls, # see if an nntool quantizer qtype is available if not qrec and qtypes and sym.name in qtypes: - sym, qrec = cls.qrec_from_qtype(sym, qtypes[sym.name]) + in_range = sym_ctrl.get_range(sym) + qtype = qtypes[sym.name] + if in_range is None: + if qtype.max_val is not None and qtype.min_val is None: + in_range = (qtype.min_val, qtype.max_val) + else: + in_range = (qtype.min, qtype.max) + osym, qrec = cls.qrec_from_qtype(sym, qtypes[sym.name], 
in_range) if qrec: - sym.qrec = qrec - return (sym, qrec) + osym.qrec = qrec + if isinstance(qrec, Q15ScaleQRec) and qrec.zero_point != 0: + osym = Sub(Cast(osym, dtype=np.int32), QuantizedConstant( + qrec.zero_point), dtype=np.int32, tag=True) + qrec = Q15ScaleQRec.override( + qrec, dtype=np.int32, zero_point=0) + return (osym, qrec) # figure out the quantization from the maximum value recorded max_val = sym_ctrl.get_max(sym) @@ -110,41 +124,41 @@ def _quantize(cls, return (sym, qrec) @classmethod - def qrec_from_qtype(cls, sym, qtype): - if qtype.dtype == np.int8: + def qrec_from_qtype(cls, sym, qtype, in_range): + if qtype.dtype in [np.int8, np.uint8, np.int16, np.uint16]: if len(qtype.scale) > 1: return sym, None - max_val = qtype.scale[0] * (math.pow(2, 7) - qtype.zero_point[0]) - min_val = -qtype.scale[0] * (math.pow(2, 7) + qtype.zero_point[0]) - return sym, Q15ScaleQRec(np.int8, max_val, 7, - max_val=max_val, min_val=min_val, - zero_point=qtype.zero_point[0]) - elif qtype.dtype == np.int16: - if len(qtype.scale) > 1: - return sym, None - max_val = qtype.scale[0] * (math.pow(2, 15) - qtype.zero_point[0]) - min_val = -qtype.scale[0] * (math.pow(2, 15) + qtype.zero_point[0]) - return sym, Q15ScaleQRec(np.int16, max_val, 15, - max_val=max_val, min_val=min_val, - zero_point=qtype.zero_point[0]) - elif qtype.dtype == np.uint8: - if len(qtype.scale) > 1: - return sym, None - max_val = qtype.scale[0] * (math.pow(2, 8) - qtype.zero_point[0]) - min_val = qtype.scale[0] * -qtype.zero_point[0] - return sym, Q15ScaleQRec(np.uint8, max_val, 8, - max_val=max_val, min_val=min_val, - zero_point=qtype.zero_point[0]) - elif qtype.dtype == np.uint16: - if len(qtype.scale) > 1: - return sym, None - max_val = qtype.scale[0] * (math.pow(2, 16) - qtype.zero_point[0]) - min_val = qtype.scale[0] * -qtype.zero_point[0] - return sym, Q15ScaleQRec(np.uint16, max_val, 16, - max_val=max_val, min_val=min_val, - zero_point=qtype.zero_point[0]) - else: - return None + max_val, min_val, 
bitlen = Q15ScaleQRec.dtype_zp_to_min_max( + qtype.dtype, qtype.scale[0], qtype.zero_point[0]) + qrec = Q15ScaleQRec(qtype.dtype, max_val, bitlen, + max_val=max_val, min_val=min_val, + zero_point=qtype.zero_point[0]) + sym.qrec = qrec + return sym, qrec + if qtype.dtype in [np.float16, bfloat16, np.float32]: + qrec = FloatQRec(dtype=qtype.dtype) + sym.qrec = qrec + max_val = np.max(np.maximum( + np.abs(in_range[0]), np.abs(in_range[1]))) + return ( + Cast( + Add( + Mul( + sym, + Constant( + np.atleast_1d(math.pow(2, 15) / + max_val).astype(qtype.dtype), + dtype=qtype.dtype), + dtype=qtype.dtype), + Constant([0.5], dtype=qtype.dtype), + dtype=qtype.dtype + ), + dtype=np.int32), + Q15ScaleQRec(np.int32, max_val, 15, + max_val=max_val, min_val=-max_val, + zero_point=0)) + raise NotImplementedError( + "don't know how to convert input type to Q15 quantization") @qhandler("Q15Scale", QuantizedValue) @@ -184,7 +198,8 @@ def cast_symbols(in_syms, qrecs, dtype=np.int32): def find_range(sym, qrecs): - assert all(qrec.min_val is not None and qrec.max_val is not None for qrec in qrecs), 'all values must be set' + assert all( + qrec.min_val is not None and qrec.max_val is not None for qrec in qrecs), 'all values must be set' val_range = np.array([ sym.call_with_constants(qrecs[0].min_val, qrecs[1].min_val), sym.call_with_constants(qrecs[0].max_val, qrecs[1].min_val), @@ -240,12 +255,21 @@ def _quantize(cls, if scale_to == 0: in_syms = [ in_syms[0], - ScaleQuantized(in_syms[1], from_qrec=in_qrecs[1], to_qrec=in_qrecs[0])] + ScaleQuantized( + in_syms[1], + from_qrec=in_qrecs[1], + to_qrec=in_qrecs[0], + tag=True, + comment=f"{sym.name} scale arg 1 to 0 - {in_qrecs[1]} -> {in_qrecs[0]}")] calc_qrec = Q15ScaleQRec.override(in_qrecs[0], dtype=np.int32) elif scale_to == 1: in_syms = [ ScaleQuantized( - in_syms[0], from_qrec=in_qrecs[0], to_qrec=in_qrecs[1]), + in_syms[0], + from_qrec=in_qrecs[0], + to_qrec=in_qrecs[1], + tag=True, + comment=f"{sym.name} scale arg 0 to 1 - 
{in_qrecs[0]} -> {in_qrecs[1]}"), in_syms[1]] calc_qrec = Q15ScaleQRec.override(in_qrecs[1], dtype=np.int32) else: @@ -258,11 +282,16 @@ def _quantize(cls, # Try not to scale if we are still in bounds # This creates more error -> if np.abs(calc_qrec.quantize(max_val)) < max_short or - if calc_qrec == out_qrec: - return (sym_cls(*in_syms), calc_qrec) + # if calc_qrec == out_qrec: + return (sym_cls(*in_syms), calc_qrec) - return (ScaleQuantized(sym_cls(*in_syms, dtype=out_qrec.dtype), - from_qrec=calc_qrec, to_qrec=out_qrec), out_qrec) + # return ( + # ScaleQuantized( + # sym_cls(*in_syms, dtype=out_qrec.dtype), + # from_qrec=calc_qrec, + # to_qrec=out_qrec, + # tag=True, + # comment=f'{sym.name} scale result to output - {calc_qrec} -> {out_qrec}'), out_qrec) @qhandler("Q15Scale", Mul) @@ -294,9 +323,13 @@ def _quantize(cls, out_qrec = Q15ScaleQRec(np.int32, prod_scale, min( prod_q, 15), max_val=prod_scale, min_val=-prod_scale) if prod_q > 15: - qsym = Norm(sym_cls(*in_syms, dtype=np.int32), - QuantizedConstant(prod_q - 15), - dtype=np.int32) + qsym = Norm( + sym_cls(*in_syms, dtype=np.int32), + QuantizedConstant(prod_q - 15), + dtype=np.int32, + tag=True, + comment=f'normalize input to Q15 - {prod_q - 15}' + ) else: qsym = sym_cls(*in_syms) return (qsym, out_qrec) @@ -400,7 +433,7 @@ def _quantize(cls, in_qrec.q - 15), dtype=in_sym.dtype) out_qrec = Q15ScaleQRec(np.int32, new_scale, 15) - return (Cast(Sqrt1715(in_sym, dtype=np.uint32), dtype=np.int32), out_qrec) + return (Cast(Sqrt1715(in_sym, dtype=np.uint32), dtype=np.int32, tag=True), out_qrec) @nargs(1) @@ -451,7 +484,7 @@ def _quantize(cls, in_sym = in_syms[0] out_qrec = Q15ScaleQRec(np.int32, new_scale, 15) - return (Cast(Norm(RSqrt1616(in_sym, dtype=np.uint32), QuantizedConstant(norm), dtype=np.uint32), dtype=np.int32), out_qrec) + return (Cast(Norm(RSqrt1616(in_sym, dtype=np.uint32), QuantizedConstant(norm), dtype=np.uint32), dtype=np.int32, tag=True), out_qrec) @nargs(1) @@ -504,7 +537,7 @@ def 
_quantize(cls, max_bits = math.ceil(math.log2(math.fabs(-340695 + qlog_off))) + 2 return ( ScaleQuantized(Add(Log1715(in_sym, dtype=np.int32), QuantizedConstant( - qlog_off), dtype=np.int32), from_qrec=calc_qrec, to_qrec=out_qrec, num_bits=31-max_bits), + qlog_off), dtype=np.int32), from_qrec=calc_qrec, to_qrec=out_qrec, num_bits=31-max_bits, tag=True), out_qrec) @@ -571,7 +604,7 @@ def _quantize(cls, if val == 2: out_qrec = Q15ScaleQRec(np.int32, np.power(lhs_qrec.scale, 2), 15) - return (Cast(Square1715(lhs, dtype=np.int32), dtype=np.int32), out_qrec) + return (Cast(Square1715(lhs, dtype=np.int32), dtype=np.int32, tag=True), out_qrec) if val == -2: out_qrec = Q15ScaleQRec( np.int32, 1/np.power(lhs_qrec.scale, 2), 15) @@ -583,10 +616,20 @@ def _quantize(cls, out_qrec = Q15ScaleQRec(np.int32, 1, 0) return (QuantizedConstant(1), out_qrec) if val > 0 and val < 1: + out_scale = np.power(lhs_qrec.scale, val).item() out_qrec = Q15ScaleQRec( - np.uint32, np.power(lhs_qrec.scale, val), 15) + np.uint32, out_scale, 15) qval = int(math.floor(val * math.pow(2, 15) + 0.5)) - return (Cast(Pow1715(lhs, QuantizedConstant(qval), dtype=np.int32), dtype=np.int32), out_qrec) + return ( + Cast( + Pow1715( + lhs, + QuantizedConstant(qval), + dtype=np.int32), + dtype=np.int32, + comment=f'{sym.name} POW on scale {lhs_qrec.scale:.3f} -> {out_scale:.3f}', + tag=True), + out_qrec) raise NotImplementedError( "power is currently only supported with fractional constants, 2, 1, & 0") @@ -631,7 +674,7 @@ def _quantize(cls, return (ScaleQuantized(Cast(Exp1715(arg, dtype=np.uint32), dtype=np.int32), from_qrec=calc_qrec, - to_qrec=out_qrec), + to_qrec=out_qrec, tag=True), out_qrec) @@ -669,7 +712,7 @@ def _quantize(cls, in_syms, in_qrecs = cls.cast_symbols(in_syms, in_qrecs) lhs = ScaleQuantized( in_syms[0], from_qrec=in_qrecs[0], to_qrec=calc_qrec) - return (Arctan1715(lhs), calc_qrec) + return (Arctan1715(lhs, tag=True), calc_qrec) @nargs(1) @@ -723,8 +766,17 @@ def _quantize(cls, # output is 
Q12 * 1 out_qrec = Q15ScaleQRec(np.int16, 1, 12) in_syms, in_qrecs = cls.cast_symbols(in_syms, in_qrecs) - lhs = Cast(Clip(ScaleQuantized( - in_syms[0], from_qrec=in_qrecs[0], to_qrec=calc_qrec), dtype=calc_qrec.dtype, clip_dtype=np.int16), dtype=np.int16) + lhs = Cast( + Clip( + ScaleQuantized( + in_syms[0], + from_qrec=in_qrecs[0], + to_qrec=calc_qrec), + dtype=calc_qrec.dtype, + clip_dtype=np.int16), + dtype=np.int16, + tag=True, + comment=f'{sym.name} scale and clip input - {in_qrecs[0]} -> {calc_qrec}') if isinstance(sym, Cos): qsym = Cos_Q15 else: @@ -782,7 +834,8 @@ def _quantize(cls, func = 'tanh' if isinstance(sym, TanH) else 'sigmoid' calc_qrec = Q15ScaleQRec(np.int32, 1, 12) # output is Q15 * 1 - out_qrec = Q15ScaleQRec(np.int32, 1, 15, min_val=-1.0 if func == 'tanh' else 0.0, max_val=1.0) + out_qrec = Q15ScaleQRec( + np.int32, 1, 15, min_val=-1.0 if func == 'tanh' else 0.0, max_val=1.0) in_syms, in_qrecs = cls.cast_symbols(in_syms, in_qrecs) lhs = ScaleQuantized( in_syms[0], from_qrec=in_qrecs[0], to_qrec=calc_qrec) diff --git a/tools/nntool/expressions/symbolic/q15_quantization/q15_scale_q_rec.py b/tools/nntool/expressions/symbolic/q15_quantization/q15_scale_q_rec.py index 5e43157ba..0cb6e4d7f 100644 --- a/tools/nntool/expressions/symbolic/q15_quantization/q15_scale_q_rec.py +++ b/tools/nntool/expressions/symbolic/q15_quantization/q15_scale_q_rec.py @@ -18,12 +18,16 @@ import numpy as np +from quantization.qtype import DTYPES + from ..quantization_base import QRecBase class Q15ScaleQRec(QRecBase): def __init__(self, dtype: np.dtype, scale: float, q: int, min_val=None, max_val=None, zero_point=0) -> None: super(Q15ScaleQRec, self).__init__(dtype) + if isinstance(scale, np.ndarray): + scale = scale.item() self._scale = scale self._q = q self._min_val = min_val @@ -31,7 +35,7 @@ def __init__(self, dtype: np.dtype, scale: float, q: int, min_val=None, max_val= self._zero_point = zero_point def __repr__(self) -> str: - return f"{self._dtype.__name__} 
{self.scale} Q{self._q}" + return f"{self._dtype.__name__} {self.scale:.3f} Q{self._q}" @classmethod def inherit(cls, rec, dtype: np.dtype = None, scale: float = None, q: int = None, max_val=None, min_val=None, zero_point=None): @@ -157,3 +161,14 @@ def __str__(self) -> str: def __eq__(self, o: object) -> bool: return self.q == o.q and self.scale == o.scale and self.dtype == o.dtype and self.zero_point == o.zero_point + + @staticmethod + def dtype_zp_to_min_max(dtype, scale, zero_point): + bitlen, signed = DTYPES[dtype] + maxquns = math.pow(2, bitlen) + zpoff = math.pow(2, bitlen - 1) if signed else 0 + maxq_range = maxquns - (zero_point + zpoff) + minq_range = maxquns - maxq_range + max_val = maxq_range * scale + min_val = minq_range * scale + return max_val, min_val, bitlen - (1 if signed or zero_point != 0 else 0) diff --git a/tools/nntool/expressions/symbolic/q15_quantization/q15_scaled_quantization.py b/tools/nntool/expressions/symbolic/q15_quantization/q15_scaled_quantization.py index 5f5f81185..1f7488c90 100644 --- a/tools/nntool/expressions/symbolic/q15_quantization/q15_scaled_quantization.py +++ b/tools/nntool/expressions/symbolic/q15_quantization/q15_scaled_quantization.py @@ -13,12 +13,14 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
-from typing import Tuple, cast +import math +from typing import Tuple import numpy as np -import math +from bfloat16 import bfloat16 +from expressions.symbolic.float_quantization.float_qrec import FloatQRec -from ..basic import Cast +from ..basic import Cast, ConvertQuantization from ..quantization_base import (QRecBase, QuantizationHandlerBase, handles_scheme) from ..symbol import Symbol, SymbolStats @@ -63,6 +65,55 @@ def _dequantize_py_expr(cls, py_expr: str, qrec: Q15ScaleQRec, **kwargs) -> np.n def _dequantize_c_expr(cls, c_expr: str, qrec: Q15ScaleQRec, **kwargs) -> np.ndarray: return qrec.dequantize_c_expr(c_expr) + # @classmethod + # def _quantize_output(cls, + # sym: Symbol, + # qsym: Symbol, + # osym: Symbol, + # sym_ctrl: SymbolStats, + # qrec: QRecBase, + # **kwargs) -> Tuple[Symbol, QRecBase]: + # from_qrec = qrec + # qtypes = kwargs.get('qtypes', {}) + # # first see if this has already been quantized by nntool + # # note that the qtype will be stored against the name of the output symbol + # res = cls._get_scale_dtype_from_qtypes( + # osym, qtypes) + # if res is None: + # max_val = math.fabs(sym_ctrl.get_max(sym)) + # min_val = -max_val + # out_dtype = np.int8 + # out_q = 7 + # zero_point = 0 + # else: + # max_val, min_val, out_dtype, out_q, zero_point = res + + # qrec_scale = Q15ScaleQRec(np.int32, max_val, out_q, min_val=min_val, max_val=max_val, zero_point=zero_point) + # qrec_out = Q15ScaleQRec(out_dtype, max_val, out_q, min_val=min_val, max_val=max_val, zero_point=zero_point) + # # scale clip and cast to output type + # return ( + # Cast( + # Clip( + # ScaleQuantized(qsym, + # from_qrec=from_qrec, + # to_qrec=qrec_scale), + # clip_dtype=out_dtype, + # dtype=qrec_scale.dtype), + # dtype=qrec.dtype), qrec_out) + + # @classmethod + # def _get_scale_dtype_from_qtypes(cls, sym, qtypes): + # if not qtypes or sym.name not in qtypes: + # return None + # qtype = qtypes[sym.name] + # if qtype.dtype in [np.int8, np.uint8, np.int16, np.uint16]: + # if 
len(qtype.scale) > 1: + # return None + # max_val, min_val, bitlen = Q15ScaleQRec.dtype_zp_to_min_max(qtype.dtype, qtype.scale[0], qtype.zero_point) + # return max_val, min_val, qtype.dtype, bitlen, qtype.zero_point + # else: + # return None + @classmethod def _quantize_output(cls, sym: Symbol, @@ -75,49 +126,56 @@ def _quantize_output(cls, qtypes = kwargs.get('qtypes', {}) # first see if this has already been quantized by nntool # note that the qtype will be stored against the name of the output symbol - max_val, out_dtype, out_q, zero_point = cls._get_scale_dtype_from_qtypes( - osym, qtypes) - if max_val is None: + + if qtypes and osym.name in qtypes: + qtype = qtypes[osym.name] + if qtype.dtype in [np.int8, np.uint8, np.int16, np.uint16]: + max_val, min_val, out_q = Q15ScaleQRec.dtype_zp_to_min_max( + qtype.dtype, qtype.scale[0], qtype.zero_point) + out_dtype = qtype.dtype + zero_point = qtype.zero_point + elif qtype.dtype in [np.float16, bfloat16, np.float32]: + min_val = qtype.min_val + max_val = qtype.max_val + out_dtype = qtype.dtype + else: + raise ValueError(f"don't know how to output {qtype.dtype}") + else: + out_dtype = kwargs.get('out_dtype', np.int8) + assert out_dtype in [np.int8, np.int16] max_val = math.fabs(sym_ctrl.get_max(sym)) - out_dtype = np.int8 - out_q = 7 + min_val = -max_val + out_dtype = out_dtype + out_q = 7 if out_dtype == np.int8 else 15 zero_point = 0 -#pylint: disable=invalid-unary-operand-type - min_val = -max_val - qrec_scale = Q15ScaleQRec(np.int32, max_val, out_q, min_val=min_val, max_val=max_val, zero_point=zero_point) - qrec_out = Q15ScaleQRec(out_dtype, max_val, out_q, min_val=min_val, max_val=max_val, zero_point=zero_point) - # scale clip and cast to output type + if out_dtype in [np.float16, bfloat16, np.float32]: + qrec_out = FloatQRec( + dtype=out_dtype, max_val=max_val, min_val=min_val) + return ( + ConvertQuantization( + qsym, + from_qrec=from_qrec, + to_qrec=qrec_out, + comment=f'convert quantization - {from_qrec} -> 
{qrec_out}' + ), qrec_out) + + qrec_scale = Q15ScaleQRec( + np.int32, max_val, out_q, min_val=min_val, max_val=max_val, zero_point=zero_point) + qrec_out = Q15ScaleQRec( + out_dtype, max_val, out_q, min_val=min_val, max_val=max_val, zero_point=zero_point) return ( Cast( Clip( - ScaleQuantized(qsym, - from_qrec=from_qrec, - to_qrec=qrec_scale), + ScaleQuantized( + qsym, + from_qrec=from_qrec, + to_qrec=qrec_scale + ), clip_dtype=out_dtype, - dtype=qrec_scale.dtype), - dtype=qrec.dtype), qrec_out) - - @classmethod - def _get_scale_dtype_from_qtypes(cls, sym, qtypes): - if not qtypes or sym.name not in qtypes: - return None, None, None, None - qtype = qtypes[sym.name] - if qtype.dtype == np.int8: - if len(qtype.scale) > 1: - return None, None, None, None - return qtype.scale[0] * math.pow(2, 7), np.int8, 7, qtype.zero_point - if qtype.dtype == np.uint8: - if len(qtype.scale) > 1: - return None, None, None, None - return qtype.scale[0] * math.pow(2, 8), np.uint8, 8, qtype.zero_point - elif qtype.dtype == np.int16: - if len(qtype.scale) > 1: - return None, None, None - return qtype.scale[0] * math.pow(2, 15), np.int16, 15, qtype.zero_point - if qtype.dtype == np.uint16: - if len(qtype.scale) > 1: - return None, None, None, None - return qtype.scale[0] * math.pow(2, 16), np.uint16, 16, qtype.zero_point - else: - return None, None, None, None + dtype=qrec_scale.dtype + ), + dtype=qrec_out.dtype, + comment=f'scale clip and cast - {from_qrec} -> {qrec_out}' + ), + qrec_out) diff --git a/tools/nntool/expressions/symbolic/q15_quantization/quantized_constant.py b/tools/nntool/expressions/symbolic/q15_quantization/quantized_constant.py deleted file mode 100644 index 258036854..000000000 --- a/tools/nntool/expressions/symbolic/q15_quantization/quantized_constant.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (C) 2020 GreenWaves Technologies, SAS - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public 
License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - - -import numpy as np - -from ..symbol import Constant, Symbol - - -class QuantizedConstant(Constant): - def __init__(self, *args, dtype=np.int32, **kwargs): - super().__init__(*args, dtype=dtype, **kwargs) - -class QuantizedValue(Symbol): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def _calculate(self, calculate_ranges=False, **kwargs): - raise ValueError('wrapper class for quantization purposes - not designed to be evaluated') - - def _impl(self, *args, **kwargs): - raise ValueError('wrapper class for quantization purposes - not designed to be evaluated') diff --git a/tools/nntool/expressions/symbolic/q15_quantization/scale_quantized.py b/tools/nntool/expressions/symbolic/q15_quantization/scale_quantized.py index 30d812ee4..90eb9c791 100644 --- a/tools/nntool/expressions/symbolic/q15_quantization/scale_quantized.py +++ b/tools/nntool/expressions/symbolic/q15_quantization/scale_quantized.py @@ -14,16 +14,15 @@ # along with this program. If not, see . 
import math +from xml.etree.ElementTree import Comment import numpy as np -from numpy.core.getlimits import iinfo from expressions.symbolic.function import Function from ..basic import Add, Cast, CompoundFunction, LShift, Mul, Sub, copy_props -from ..symbol import c_headers, nargs +from ..symbol import QuantizedConstant, c_headers, nargs from .clip_norm import Norm from .q15_scale_q_rec import Q15ScaleQRec -from .quantized_constant import QuantizedConstant @nargs(2) @@ -87,7 +86,13 @@ def _eval(self, *args, **kwargs): # this should be safe as we never go much above Q15 and the scaling step # is also a Q15 if isinstance(sym, ScaleQuantized): - return ScaleQuantized(*sym.contents, from_qrec=sym.from_qrec, to_qrec=self.to_qrec, num_bits=min(self._num_bits, sym.num_bits)) + return ScaleQuantized( + *sym.contents, + from_qrec=sym.from_qrec, + to_qrec=self.to_qrec, + num_bits=min(self._num_bits, sym.num_bits), + tag=self.tag, + comment=self.comment) # Check if we do nothing if self._from_qrec == self._to_qrec: return sym @@ -159,6 +164,8 @@ def _eval(self, *args, **kwargs): if self._to_qrec.dtype != np.int32: sym = Cast(sym, dtype=self._to_qrec.dtype) + sym.tag = self.tag + sym.comment=self.comment return sym def __repr__(self) -> str: diff --git a/tools/nntool/expressions/symbolic/quantization_base.py b/tools/nntool/expressions/symbolic/quantization_base.py index 48221469c..c66d6f346 100644 --- a/tools/nntool/expressions/symbolic/quantization_base.py +++ b/tools/nntool/expressions/symbolic/quantization_base.py @@ -1,8 +1,24 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + from typing import Tuple import numpy as np from expressions.symbolic.basic import CompoundFunction +from expressions.symbolic.function import Function from .symbol import Symbol, SymbolStats, Variable, QRecBase @@ -123,6 +139,8 @@ def quantize(cls, if not isinstance(sym, Variable): qsym.name = sym.name qsym.qrec = qrec + if isinstance(sym, Function): + qsym.tag = True return (qsym, qrec) @classmethod diff --git a/tools/nntool/expressions/symbolic/symbol.py b/tools/nntool/expressions/symbolic/symbol.py index 0480073bc..31bf41972 100644 --- a/tools/nntool/expressions/symbolic/symbol.py +++ b/tools/nntool/expressions/symbolic/symbol.py @@ -19,7 +19,7 @@ import numpy as np from bfloat16 import bfloat16 from generation.code_block import CodeBlock -from quantization.qtype import DTYPE_GAP_CTYPE +from quantization.qtype import DTYPE_GAP_CTYPE, DTYPES class SymbolStats(): @@ -64,17 +64,21 @@ def reset_ranges(self): class QRecBase(): DTYPE_TO_CTYPE = { - np.int8: 'int8_t', - np.int16: 'int16_t', - np.int32: 'int32_t', - np.uint8: 'uint8_t', - np.uint16: 'uint16_t', - np.uint32: 'uint32_t', + np.int8: 'signed char', + np.int16: 'short', + np.int32: 'int', + np.uint8: 'unsigned char', + np.uint16: 'unsigned short', + np.uint32: 'unsigned int', np.float32: 'float', bfloat16: 'F16', np.float16: 'F16' } def __init__(self, dtype=None) -> None: + if isinstance(dtype, np.dtype): + dtype = dtype.type + if dtype is not None and dtype not in self.DTYPE_TO_CTYPE: + raise ValueError('unknown dtype') self._dtype = dtype @property @@ -89,6 +93,14 @@ def dtype(self, val): def ctype(self): return 
self.DTYPE_TO_CTYPE[self.dtype] + @property + def signed(self): + return DTYPES[self.dtype][1] + + @property + def size(self): + return DTYPES[self.dtype][0] + class Symbol(): NARGS = None CURRENT_CONTROL = SymbolStats() @@ -96,9 +108,10 @@ class Symbol(): COUNTS = {} C_HEADERS = [] COPY_PROPS = tuple() + SYMBOL_PREFEX = '_SYMBOL_' #pylint: disable=unused-argument - def __init__(self, *args, name="", shape=None, dtype=np.float32, qrec: QRecBase = None, **kwargs): + def __init__(self, *args, name="", shape=None, dtype=np.float32, qrec: QRecBase = None, tag=None, comment: str=None, **kwargs): super(Symbol, self).__init__(**kwargs) if self.NARGS is not None and len(args) != self.NARGS: raise ValueError("wrong number of arguments to Symbol %s"%self.__class__.__name__) @@ -107,6 +120,8 @@ def __init__(self, *args, name="", shape=None, dtype=np.float32, qrec: QRecBase self._dtype = dtype self._shape = shape self._qrec = qrec + self._tag = tag + self._comment = comment @classmethod def get_name(cls, cls_to_name): @@ -114,6 +129,22 @@ def get_name(cls, cls_to_name): cls.COUNTS[cls_to_name] += 1 return name + @property + def tag(self): + return self._tag + + @tag.setter + def tag(self, val): + self._tag = val + + @property + def comment(self): + return self._comment + + @comment.setter + def comment(self, val): + self._comment = val + @property def qrec(self): return self._qrec @@ -153,6 +184,10 @@ def dtype(self): return self._qrec.dtype return self._dtype + @property + def ctype(self): + return QRecBase.DTYPE_TO_CTYPE[self.dtype] + @property def name(self): return self._name @@ -182,10 +217,11 @@ def set_default_control(cls, control): cls.CURRENT_CONTROL = control @staticmethod - def extend_shapes(*shapes): + def extend_shapes(*shapes, max_length=None): if len(shapes) == 1: return list(shapes) - max_length = max(len(x) for x in shapes) + if max_length is None: + max_length = max(len(x) for x in shapes) return [tuple([1] * (max_length - len(x)) + list(x)) for x in shapes] 
@staticmethod @@ -226,7 +262,10 @@ def resolve(self, **kwargs): def calculate(self, calculate_ranges=False, **kwargs): """Given a set of substitions for variable in kwargs calculate a result""" - return self._calculate(calculate_ranges=calculate_ranges, **kwargs) + val = self._calculate(calculate_ranges=calculate_ranges, **kwargs) + if self.tag and 'details' in kwargs: + kwargs['details'][self.tag[0]] = val + return val def collect_globals(self) -> dict: """Returns a dict of globals necessary to execute a lambda of this symbol. Globals @@ -330,10 +369,19 @@ def py_expr(self, *args, **kwargs): def c_expr(self, *args, **kwargs): return self._c_expr([], **kwargs) - def c_block(self, code_block=None, **kwargs): + def c_block(self, code_block=None, with_comment=False, tags=None, **kwargs): if code_block is None: - code_block = CodeBlock - code_block.write(self.c_expr) + code_block = CodeBlock() + if tags is not None and self.tag: + if self.comment and with_comment: + code_block.write(f'// {self.comment}') + name = tags.get(self, f'{self.ctype} {self.SYMBOL_PREFEX}{self.name}') + if isinstance(name, tuple): + name = name[0].c_expr(dtype=name[0].dtype, declare=name[1], **kwargs) + code_block.write(f'{name} = {self._c_expr([], **kwargs)};') + else: + code_block.write(self._c_expr([], **kwargs)) + return code_block def _equivalent(self, other) -> bool: pass @@ -466,7 +514,7 @@ def _c_expr(self, *args, **kwargs): return f"(F16){print_float_constant(val)}" elif self.dtype == np.float32: return print_float_constant(val) - return val + return str(val) def __repr__(self) -> str: return str(self._value) @@ -524,6 +572,7 @@ def __init__(self, var_name, shape=None, symbol_binding=None, name="", **kwargs) self._shape = shape self._index_vars = None self._ispointer = False + self._cindex = None @property def shape(self): @@ -560,6 +609,14 @@ def value(self): def unbound_variables(self): return {self._name: self} + @property + def cindex(self): + return self._cindex + + @cindex.setter 
+ def cindex(self, val): + self._cindex = val + @property def index_vars(self): return self._index_vars @@ -586,11 +643,11 @@ def _impl(self, *args, **kwargs): val = np.array(kwargs[self.name]) if self.shape is not None: val = np.reshape(val, self.shape) - quantize_inputs = kwargs.get('quantize_inputs', False) - if quantize_inputs is True or isinstance(quantize_inputs, Iterable) and self.name in quantize_inputs: - if self.qrec is None: - raise ValueError("can't quantize %s. no quantization record is set."%self.name) - val = self.qrec.quantize_and_clip(val) + # quantize_inputs = kwargs.get('quantize_inputs', False) + # if quantize_inputs is True or isinstance(quantize_inputs, Iterable) and self.name in quantize_inputs: + # if self.qrec is None: + # raise ValueError("can't quantize %s. no quantization record is set."%self.name) + # val = self.qrec.quantize_and_clip(val) if calculate_ranges: self.control.add_stat(self, val) return val @@ -645,8 +702,8 @@ def gen_index(index_vars): #pylint: disable=arguments-differ def _c_expr(self, *args, declare=False, dtype=None, pointer=None, iteration_space=None, **kwargs): - if iteration_space: - return iteration_space.c_indexed_var(self.name) + if iteration_space and not self.name.startswith(self.SYMBOL_PREFEX): + return iteration_space.c_indexed_var(self.name, declare=declare) if pointer is None: pointer = self._ispointer if declare: @@ -666,3 +723,17 @@ def _c_expr(self, *args, declare=False, dtype=None, pointer=None, iteration_spac def __repr__(self) -> str: return f'{self.name}' + +class QuantizedConstant(Constant): + def __init__(self, *args, dtype=np.int32, **kwargs): + super().__init__(*args, dtype=dtype, **kwargs) + +class QuantizedValue(Symbol): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def _calculate(self, calculate_ranges=False, **kwargs): + raise ValueError('wrapper class for quantization purposes - not designed to be evaluated') + + def _impl(self, *args, **kwargs): + raise 
ValueError('wrapper class for quantization purposes - not designed to be evaluated') diff --git a/tools/nntool/expressions/symbolic/variable_container.py b/tools/nntool/expressions/symbolic/variable_container.py index 3647509d2..89399bc32 100644 --- a/tools/nntool/expressions/symbolic/variable_container.py +++ b/tools/nntool/expressions/symbolic/variable_container.py @@ -13,8 +13,8 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -from abc import ABC, abstractmethod, abstractproperty -from typing import Mapping +from collections import Counter +from itertools import chain import numpy as np from utils.disjoint_reduction import disjoint_reduction @@ -22,54 +22,41 @@ from .symbol import Symbol, Variable -class VariableAssigner(ABC): - @abstractmethod - def _resolve_assignment(self, substitute_all=False, **kwargs) -> Mapping[str, Symbol]: - """ Resolves an container that is one or more assigments substituting values contained in - **kwargs into unresolved variables - - Args: - substitute_all (bool, optional): If False only expressions that resolve to constants will be substituted. - Defaults to False. 
- - Returns: - Mapping[str, Symbol]: A map of the variable names and their values (Symbols) - """ - - def resolve_assignment(self, substitute_all=False, **kwargs) -> Mapping[str, Symbol]: - return self._resolve_assignment(substitute_all=substitute_all, **kwargs) - - @abstractmethod - def _calculate_assignment(self, **kwargs) -> Mapping[str, np.ndarray]: - """ Attempts to resolve a series of assignments to a map of values - - Returns: - Mapping[str, np.ndarray]: Map of resolved values - """ - - def calculate_assignment(self, **kwargs) -> Mapping[str, np.ndarray]: - return self._calculate_assignment(**kwargs) - - @abstractproperty - def returned_variables(self): - pass - - @abstractproperty - def var_shapes(self): - pass +def search_variables(elem): + if isinstance(elem, Variable): + return [elem] + if type(elem) == int or type(elem) == float or isinstance(elem, np.ndarray): + return [] + return chain(*[search_variables(sub_elem) for sub_elem in elem.contents]) class VariableContainer(): def __init__(self, *args, **kwargs): + args = list(args) + # if string variable names are provided match existing variables or create a new one + if any(isinstance(arg, str) for arg in args): + all_vars = {var.name: var + for var in chain(*[search_variables(arg) + for arg in args if not isinstance(arg, str)])} + for idx, arg in enumerate(args): + if isinstance(arg, str): + if arg in all_vars: + args[idx] = all_vars[arg] + else: + all_vars[arg] = Variable(arg) + args[idx] = all_vars[arg] super().__init__(*args, **kwargs) - self._unbound_variables = self._init_unbound_variables(*args) + # variables with same name must be the same variable instance + names = list( + {object.__hash__(var): var.name for var in search_variables(self)}.values()) + if len(set(names)) != len(names): + bad_vars = [item[0] for item in filter( + lambda x: x[1] > 1, Counter(names).items())] + raise ValueError( + f'duplicate variable names detected: {" ".join(bad_vars)}') @property def unbound_variables(self): - 
return self._unbound_variables - - @unbound_variables.setter - def unbound_variables(self, val): - self._unbound_variables = val + return {var.name: var for var in search_variables(self)} @property def unbound_shapes(self): @@ -83,12 +70,10 @@ def extended_unbound_var_shapes(self): return {vname: tuple(([1] * (max_length - len(var.shape))) + list(var.shape)) for vname, var in self.unbound_variables.items()} - @staticmethod def adjust(axes, adjust): return tuple(tuple(dim+adjust for dim in axes_group) for axes_group in axes) - @property def axes(self): var_shapes = Symbol.extend_shapes(*self.unbound_shapes) @@ -96,28 +81,6 @@ def axes(self): shape) if dim != 1) for shape in var_shapes)) return tuple(sorted([tuple(x) for x in axes])) - @staticmethod - def _init_unbound_variables(*args): - unbound_variables = {} - for arg in args: - if isinstance(arg, Variable): - if arg.name in unbound_variables: - if unbound_variables[arg.name].shape != arg.shape: - raise ValueError( - 'there is more than one variable called %s with different shapes' % arg.name) - else: - unbound_variables[arg.name] = arg - elif isinstance(arg, VariableContainer): - unbound_variables.update(arg.unbound_variables) - elif isinstance(arg, str): - if arg in unbound_variables: - raise ValueError( - 'there is more than one variable called %s' % arg) - else: - unbound_variables[arg] = Variable(arg) - - return unbound_variables - def vars_to_axes(self, axes=None): if axes is None: axes = self.axes @@ -129,20 +92,3 @@ def axes_sizes(self, axes=None): axes = self.axes shape = Symbol.broadcast(*self.unbound_shapes) return {axis: int(np.prod([shape[x] for x in axis])) for axis in axes} - - -class VariableContainerAndAssigner(VariableContainer, VariableAssigner): - @property - def var_axes(self): - elems = self.resolve_assignment(substitute_all=True) - max_axis_groups = np.array( - [max(max(x) for x in elem.axes) for elem in elems.values()]) - max_axis = np.max(max_axis_groups) - axis_adjust = max_axis - 
max_axis_groups - - axes = {} - for elem_idx, (elem_name, elem) in enumerate(elems.items()): - axes[elem_name] = self.adjust(elem.axes, axis_adjust[elem_idx]) - for vname, vaxes in elem.vars_to_axes().items(): - axes[vname] = self.adjust(vaxes, axis_adjust[elem_idx]) - return axes diff --git a/tools/nntool/generation/code_block.py b/tools/nntool/generation/code_block.py index e615c6844..659b61847 100644 --- a/tools/nntool/generation/code_block.py +++ b/tools/nntool/generation/code_block.py @@ -15,12 +15,39 @@ QUOTE = lambda s: '"'+s+'"' + class CodeBlock(): + + class CommentWriter(): + def __init__(self, cb, max_len) -> None: + self._cb = cb + self._max_len = max_len + self.reset() + + def write(self, comment): + for elem in comment.split(' '): + if self._cur_len + len(elem) + 1 > self._max_len: + self.end() + self._cur_line.append(elem) + self._cur_len += len(elem) + 1 + + def end(self): + self._cb.write(f'// {" ".join(self._cur_line)}') + self.reset() + + def reset(self): + self._cur_line = [] + self._cur_len = len(self._cb.get_indent()) + 3 + def __init__(self, starting_indent=0, indent_char=" "): self._indent = starting_indent self._indent_char = indent_char self._lines = [] + @property + def lines(self): + return self._lines + def indent(self): self._indent += 1 return self @@ -60,6 +87,9 @@ def write_start(self, fmt, *args): self._lines.insert(0, fmt.format(*args)) return self + def start_long_comment(self, max_len=80): + return CodeBlock.CommentWriter(self, max_len) + def comment(self, fmt, *args): fmt = self.get_indent() + '// ' + fmt if args: diff --git a/tools/nntool/generation/code_generator.py b/tools/nntool/generation/code_generator.py index 3a243be1c..093eff2fe 100644 --- a/tools/nntool/generation/code_generator.py +++ b/tools/nntool/generation/code_generator.py @@ -17,14 +17,14 @@ import numpy as np from bfloat16 import bfloat16 -from expressions.symbolic.kernel_codegen import BasicKernel +from expressions.symbolic.iteration_space import 
IterationSpace from graph.manipulations.dimensions import add_dimensions from graph.types import (ConcatParameters, ConstantInputParameters, - InputParameters, OutputParameters, ReshapeParameters, - SplitParameters, TransposeParameters) + InputParameters, OutputParameters, SplitParameters, + TransposeParameters) from graph.types.base import NNEdge from graph.types.fusions import FusionBase -from graph.types.others import CopyParameters, NoOPParameters, QuantizeParameters +from graph.types.others import CopyParameters, QuantizeParameters from graph.types.rnn import RNNBaseParameters from utils.node_id import NodeId @@ -534,7 +534,8 @@ def add_checksum_binding(self, cname, name, step_idx, eparams, before): FunctionBindingList(cname, checksum_func(self.hidden_graph, name), Imm(step_idx), - Imm(calc_value_checksum(self.hidden_graph, name)), + Imm(calc_value_checksum( + self.hidden_graph, name)), GArgEdge(eparams[0]), Imm(size), before=before) @@ -609,8 +610,8 @@ def expressions_foreach_basic_kernel(self): basic_kernel = self.expressions_kernel_cache.get(node) if not basic_kernel: qrec = self.G.quantization[NodeId(node)] - basic_kernel = BasicKernel(qrec.cache['qfunc_col'], - [inp.name for inp in node.constant_inputs]) + basic_kernel = IterationSpace(qrec.cache['qfunc_col'], + constants=[inp.name for inp in node.constant_inputs]) self.expressions_kernel_cache[node] = basic_kernel yield node, basic_kernel @@ -628,12 +629,12 @@ def expressions_kernel_types_generator(self): code_block = CodeBlock(starting_indent=0) for node, basic_kernel in self.expressions_foreach_basic_kernel(): _, arg_name = self.expressions_get_names(node) - basic_kernel.kernel_arg_type_codegen(arg_name, code=code_block) + basic_kernel.gen_kernel_arg_typedecl(arg_name, code=code_block) return str(code_block) def expressions_kernel_includes_generator(self): code_block = CodeBlock(starting_indent=0) - includes = set.union(*[basic_kernel.func_col.c_header_set for node, + includes = 
set.union(*[basic_kernel.assignments.c_header_set for node, basic_kernel in self.expressions_foreach_basic_kernel()]) for include in includes: code_block.write('#include {}', include) @@ -733,26 +734,25 @@ def gen_inout_list(self): def generate_output_check(self, tol=0.0, indent=0): code = CodeBlock(starting_indent=indent) code.write('int errors;') - for out_node in self.output_nodes: + for idx, out_node in enumerate(self.output_nodes): out_sz = out_node.out_dims[0].size() nodeq = self.G.quantization[NodeId(out_node, None)].out_qs[0] dtype = "%f" if nodeq.is_floating else "%d" code.write('errors = 0;') - if tol: - code.write(f"{dtype2ctype(nodeq)} max_diff = 0;") + code.write(f"{'float' if nodeq.is_floating else 'int'} max_diff_{idx} = 0;") code.write(f'for (int j=0; j<{out_sz}; j++) {{') code.indent() + code.write( + f"{'float' if nodeq.is_floating else 'int'} diff = {out_node.name.capitalize()}[j] - " + f"{out_node.name.capitalize()}_gt[j];") + code.write("diff = (diff>0)?diff:(-diff);") + code.write(f"if (diff > max_diff_{idx}) max_diff_{idx} = diff;") if tol: - code.write( - f"{dtype2ctype(nodeq)} diff = {out_node.name.capitalize()}[j] - " - f"{out_node.name.capitalize()}_gt[j];") - code.write("diff = (diff>0)?diff:(-diff);") - code.write("if (diff > max_diff) max_diff = diff;") code.write( f'if (diff > {nodeq.quantize(np.array(tol)).item()}) {{') else: code.write( - f'if ({out_node.name.capitalize()}[j] != {out_node.name.capitalize()}_gt[j]) {{') + f'if (diff > 0) {{') code.indent() code.write('errors++;') code.write(f'printf("Error @ %d: {dtype} instead of {dtype}\\n", j, ' @@ -763,6 +763,5 @@ def generate_output_check(self, tol=0.0, indent=0): code.write('}') code.write( f'printf("{out_node.name.capitalize()}: %d/{out_sz} errors\\n", errors);') - if tol: - code.write(f'printf("Max error: {dtype}\\n", max_diff);') + code.write(f'printf("Max error: {dtype}\\n", max_diff_{idx});') return str(code) diff --git 
a/tools/nntool/generation/default_appl_main_template.py b/tools/nntool/generation/default_appl_main_template.py index f0a3a5e5f..d6b20ecf2 100644 --- a/tools/nntool/generation/default_appl_main_template.py +++ b/tools/nntool/generation/default_appl_main_template.py @@ -57,7 +57,6 @@ def generate_main_appl_template(G, gen, test_inputs=None, test_outputs=None, tol * Put here Your input settings * <--------------- */ - #ifndef __EMUL__ /* Configure And open cluster. */ @@ -70,22 +69,19 @@ def generate_main_appl_template(G, gen, test_inputs=None, test_outputs=None, tol printf("Cluster open failed !\\n"); pmsis_exit(-4); } - int cur_fc_freq = pi_freq_set(PI_FREQ_DOMAIN_FC, ${gen.opts['fc_freq']}); - if (cur_fc_freq == -1) + + /* Frequency Settings: defined in the Makefile */ + int cur_fc_freq = pi_freq_set(PI_FREQ_DOMAIN_FC, FREQ_FC*1000*1000); + int cur_cl_freq = pi_freq_set(PI_FREQ_DOMAIN_CL, FREQ_CL*1000*1000); + int cur_pe_freq = pi_freq_set(PI_FREQ_DOMAIN_PERIPH, FREQ_PE*1000*1000); + if (cur_fc_freq == -1 || cur_cl_freq == -1 || cur_pe_freq == -1) { printf("Error changing frequency !\\nTest failed...\\n"); pmsis_exit(-4); } + printf("FC Frequency as %d Hz, CL Frequency = %d Hz, PERIIPH Frequency = %d Hz\\n", + pi_freq_get(PI_FREQ_DOMAIN_FC), pi_freq_get(PI_FREQ_DOMAIN_CL), pi_freq_get(PI_FREQ_DOMAIN_PERIPH)); - int cur_cl_freq = pi_freq_set(PI_FREQ_DOMAIN_CL, ${gen.opts['cl_freq']}); - if (cur_cl_freq == -1) - { - printf("Error changing frequency !\\nTest failed...\\n"); - pmsis_exit(-5); - } -#ifdef __GAP9__ - pi_freq_set(PI_FREQ_DOMAIN_PERIPH, 250000000); -#endif #endif // IMPORTANT - MUST BE CALLED AFTER THE CLUSTER IS SWITCHED ON!!!! 
printf("Constructor\\n"); @@ -202,6 +198,15 @@ def generate_main_appl_make(G, gen, quantized, open_args=""): CLUSTER_SLAVE_STACK_SIZE=${gen.opts['cluster_slave_stack_size']} CLUSTER_NUM_CORES=${gen.opts['cluster_num_cores']} +# FLASH and RAM type +FLASH_TYPE = ${"MRAM" if gen.opts['l3_flash_device'] == 'AT_MEM_L3_MRAMFLASH' else \ + "QSPI" if gen.opts['l3_flash_device'] == 'AT_MEM_L3_QSPIFLASH' else \ + "OSPI" if gen.opts['l3_flash_device'] == 'AT_MEM_L3_OSPIFLASH' else \ + "HYPER"} +RAM_TYPE = ${"QSPI" if gen.opts['l3_ram_device'] == 'AT_MEM_L3_QSPIRAM' else \ + "OSPI" if gen.opts['l3_ram_device'] == 'AT_MEM_L3_OSPIRAM' else \ + "HYPER"} + NNTOOL_SCRIPT = nntool_script ${"APP_CFLAGS += -DSTD_FLOAT" if any(qrec[1].out_qs[0].dtype == np.float16 for qrec in G.quantization.sorted_iterator(G)) else ""} ${"APP_LDFLAGS += -lm" if gen.G.has_expressions and "FLOAT" in gen.G.quantization.schemes_present else ""} diff --git a/tools/nntool/generation/new_generators/general/transpose.py b/tools/nntool/generation/new_generators/general/transpose.py index 9b5c3dbae..6e513dc31 100644 --- a/tools/nntool/generation/new_generators/general/transpose.py +++ b/tools/nntool/generation/new_generators/general/transpose.py @@ -72,6 +72,10 @@ def __init__(self, cname, params, in_shape, real_transpose, qrec, perm_op=None, if qrec.out_qs[0].is_floating: gen_ctrl.float_dump = 1 + datasize = qrec.out_qs[0].dtype_bits//8 + if not qrec.out_qs[0].signed: + datasize = -datasize + attrs = { 'in_dim': params.in_dims[0], 'out_dim': params.out_dims[0], @@ -81,7 +85,7 @@ def __init__(self, cname, params, in_shape, real_transpose, qrec, perm_op=None, 'height': in_shape[1], 'width': in_shape[2], 'perm_op': perm_op, - 'datasize': (qrec.out_qs[0].dtype_bits//8) + 'datasize': datasize } # other attributes diff --git a/tools/nntool/generation/new_generators/mult8/pool_mult8.py b/tools/nntool/generation/new_generators/mult8/pool_mult8.py index 2294ea81c..f98a9e881 100644 --- 
a/tools/nntool/generation/new_generators/mult8/pool_mult8.py +++ b/tools/nntool/generation/new_generators/mult8/pool_mult8.py @@ -202,6 +202,11 @@ def __init__(self, node_name, cname, pool_params, pool_q, act_params, act_q, for LOG.debug("%s: generating pad control block", node_name) self.gen_ctrl.PadType = at_pad_ctrl + if not out_q.signed: + gen_ctrl.output_datasize = -out_q.dtype_bits//8 + if not in_q.signed: + gen_ctrl.input_datasize = -in_q.dtype_bits//8 + attrs = { 'in_size': in_q.dtype_bits//8 if in_q.signed else -in_q.dtype_bits//8, 'out_size': out_q.dtype_bits//8 if out_q.signed else -out_q.dtype_bits//8, diff --git a/tools/nntool/generation/project_template/Makefile b/tools/nntool/generation/project_template/Makefile index 71960b115..c1f528ea8 100644 --- a/tools/nntool/generation/project_template/Makefile +++ b/tools/nntool/generation/project_template/Makefile @@ -11,22 +11,46 @@ endif include common.mk include $(RULES_DIR)/at_common_decl.mk -io=stdout +io?=host -RAM_FLASH_TYPE ?= HYPER +FLASH_TYPE ?= HYPER +RAM_TYPE ?= HYPER #PMSIS_OS=freertos -ifeq '$(RAM_FLASH_TYPE)' 'HYPER' -APP_CFLAGS += -DUSE_HYPER -MODEL_L3_EXEC=hram -MODEL_L3_CONST=hflash -else -APP_CFLAGS += -DUSE_SPI -CONFIG_SPIRAM = 1 -MODEL_L3_EXEC=qspiram -MODEL_L3_CONST=qpsiflash +ifeq '$(FLASH_TYPE)' 'HYPER' + MODEL_L3_CONST=AT_MEM_L3_HFLASH +else ifeq '$(FLASH_TYPE)' 'MRAM' + MODEL_L3_CONST=AT_MEM_L3_MRAMFLASH + READFS_FLASH = target/chip/soc/mram +else ifeq '$(FLASH_TYPE)' 'QSPI' + MODEL_L3_CONST=AT_MEM_L3_QSPIFLASH + READFS_FLASH = target/board/devices/spiflash +else ifeq '$(FLASH_TYPE)' 'OSPI' + MODEL_L3_CONST=AT_MEM_L3_OSPIFLASH + READFS_FLASH = target/board/devices/ospiflash +endif + +ifeq '$(RAM_TYPE)' 'HYPER' + MODEL_L3_EXEC=AT_MEM_L3_HRAM +else ifeq '$(RAM_TYPE)' 'QSPI' + MODEL_L3_EXEC=AT_MEM_L3_QSPIRAM +else ifeq '$(RAM_TYPE)' 'OSPI' + MODEL_L3_EXEC=AT_MEM_L3_OSPIRAM endif +ifeq '$(TARGET_CHIP_FAMILY)' 'GAP9' +FREQ_CL?=370 +FREQ_FC?=370 +FREQ_PE?=370 +else +ifeq '$(TARGET_CHIP)' 
'GAP8_V3' +FREQ_CL?=175 +else +FREQ_CL?=50 +endif +FREQ_FC?=250 +FREQ_PE?=250 +endif $(info Building NNTOOL model) NNTOOL_EXTRA_FLAGS ?= @@ -43,7 +67,7 @@ APP_CFLAGS += -g -O3 -mno-memcpy -fno-tree-loop-distribute-patterns APP_CFLAGS += -I. -I$(MODEL_COMMON_INC) -I$(TILER_EMU_INC) -I$(TILER_INC) $(CNN_LIB_INCLUDE) -I$(MODEL_BUILD) APP_CFLAGS += -DPERF -DAT_MODEL_PREFIX=$(MODEL_PREFIX) $(MODEL_SIZE_CFLAGS) APP_CFLAGS += -DSTACK_SIZE=$(CLUSTER_STACK_SIZE) -DSLAVE_STACK_SIZE=$(CLUSTER_SLAVE_STACK_SIZE) -APP_CFLAGS += -DAT_IMAGE=$(IMAGE) +APP_CFLAGS += -DAT_IMAGE=$(IMAGE) -DFREQ_FC=$(FREQ_FC) -DFREQ_CL=$(FREQ_CL) -DFREQ_PE=$(FREQ_PE) READFS_FILES=$(abspath $(MODEL_TENSORS)) diff --git a/tools/nntool/generation/project_template/common/model_decl.mk b/tools/nntool/generation/project_template/common/model_decl.mk index 1d0ba1b20..4f72f28e4 100644 --- a/tools/nntool/generation/project_template/common/model_decl.mk +++ b/tools/nntool/generation/project_template/common/model_decl.mk @@ -48,11 +48,21 @@ RM=rm -f NNTOOL?=nntool -TOTAL_STACK_SIZE=$(shell expr $(CLUSTER_STACK_SIZE) \+ $(CLUSTER_SLAVE_STACK_SIZE) \* 7) +ifeq '$(TARGET_CHIP_FAMILY)' 'GAP9' +CLUSTER_SLAVE_PE=8 +else ifeq '$(TARGET_CHIP_FAMILY)' 'GAP8' +CLUSTER_SLAVE_PE=7 +else + $(error TARGE_CHIP_FAMILY not found in env or not correct) +endif + +TOTAL_STACK_SIZE=$(shell expr $(CLUSTER_STACK_SIZE) \+ $(CLUSTER_SLAVE_STACK_SIZE) \* $(CLUSTER_SLAVE_PE)) MODEL_L1_MEMORY=$(shell expr $(TARGET_L1_SIZE) \- $(TOTAL_STACK_SIZE)) MODEL_L2_MEMORY=$(TARGET_L2_SIZE) MODEL_L3_MEMORY=$(TARGET_L3_SIZE) + + # hram - HyperBus RAM # qspiram - Quad SPI RA # hflash - HyperBus Flash diff --git a/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py b/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py index e1f13ee60..986119294 100644 --- a/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py +++ 
b/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py @@ -513,7 +513,7 @@ def continue_down(G, node, exclude_nodes, visited_nodes, cur_visited_nodes, cur_ if check_continue(visited_nodes, cur_visited_nodes, exclude_nodes, edge.to_node, 'down', edge.to_idx): continue new_actions, visited_down_nodes = search_down( - G, edge.to_node, exclude_nodes, visited_nodes | cur_visited_nodes, edge, transpose_history) + G, edge.to_node, exclude_nodes, visited_nodes | cur_visited_nodes, edge, transpose_history.copy()) cur_visited_nodes |= visited_down_nodes cur_actions += new_actions return cur_actions, cur_visited_nodes diff --git a/tools/nntool/graph/manipulations/formatter.py b/tools/nntool/graph/manipulations/formatter.py new file mode 100644 index 000000000..d3ff09531 --- /dev/null +++ b/tools/nntool/graph/manipulations/formatter.py @@ -0,0 +1,122 @@ +# Copyright (C) 2022 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +from copy import deepcopy + +from graph.types import ImageFormatParameters, NNEdge, TransposeParameters +from quantization.qtype import QType +from utils.node_id import NodeId + + +def insert_formatter(G, input_node, formatter, normalizer): + format_node = ImageFormatParameters(input_node.name + "_formatter", + norm_func=normalizer.upper(), + format_change=formatter.upper()) + out_edges = G.out_edges(input_node.name) + + # dims updated to reflect formatter + if format_node.output_channels is not None and format_node.input_channels is not None: + out_dim = input_node.get_output_size(None)[0] + if formatter.upper() in ("BW8", "BW16"): + assert format_node.input_channels == 1 + in_dim = out_dim.clone() + format_node.out_dims_hint = input_node.out_dims_hint + format_node.in_dims_hint = input_node.out_dims_hint + input_node.dims = in_dim + for out_edge in out_edges: + G.remove_edge(out_edge) + else: + if not out_dim.is_named or out_dim.c != format_node.output_channels: + raise ValueError( + "current graph input is not named or does not match formatter output channels") + if formatter.upper() in ("RGB16", "BW16") and normalizer.upper() != "OUT_INT16": + raise ValueError( + "rgb16 and bw16 formatters must have out_int16 as normalization function") + in_dim = out_dim.clone() + in_dim.c = format_node.input_channels + in_dim.impose_order(("h", "w", "c")) + format_node.in_dims_hint = [["h", "w", "c"]] + input_node.dims = in_dim + if input_node.fixed_order: + new_out_edges = [] + for out_edge in out_edges: + if isinstance(out_edge.to_node, TransposeParameters): + trans_node = out_edge.to_node + transpose_edges = G.out_edges(trans_node.name) + new_out_edges.extend(transpose_edges) + G.remove(trans_node) + if G.quantization: + nid = NodeId(trans_node) + if nid in G.quantization: + del G.quantization[NodeId(trans_node)] + else: + new_out_edges.append(out_edge) + out_edges = new_out_edges + else: + input_node.fixed_order = True + for out_edge in out_edges: + 
G.remove_edge(out_edge) + format_node.out_dims_hint = [["c", "h", "w"]] * len(out_edges) + input_node.out_dims_hint = [["h", "w", "c"]] + G.node_options[NodeId(input_node)] = input_node.at_options + # qrec updated to reflect formatter + input_qrec = G.quantization and G.quantization.get(NodeId(input_node)) + if input_qrec and format_node.input_dtype and format_node.output_dtype: + formatter_qrec = G.quantization.get(NodeId(format_node)) + if not formatter_qrec: + if input_qrec.out_qs[0].dtype != format_node.output_dtype: + raise ValueError( + "current graph input output quantization does not match formatter output") + formatter_qrec = deepcopy(input_qrec) + formatter_qrec.out_qs[0] = deepcopy(formatter_qrec.out_qs[0]) + if formatter_qrec.ktype.startswith('scaled'): + formatter_in_q = QType( + scale=1, zero_point=0, dtype=format_node.input_dtype) + elif formatter_qrec.ktype.startswith('symmetric'): + formatter_in_q = QType(q=0, dtype=format_node.input_dtype) + else: + raise NotImplementedError("quantization has unknown type") + if len(formatter_qrec.in_qs) > 0: + formatter_qrec.in_qs[0] = formatter_in_q + input_qrec.in_qs[0] = formatter_in_q + else: + formatter_qrec.in_qs.append(formatter_in_q) + input_qrec.in_qs.append(formatter_in_q) + input_qrec.out_qs[0] = formatter_in_q + G.quantization[NodeId(format_node)] = formatter_qrec + + G.add_node(format_node) + G.add_edge(NNEdge(input_node, format_node)) + for out_edge in out_edges: + G.add_edge(NNEdge(format_node, out_edge.to_node, to_idx=out_edge.to_idx)) + + +def remove_formatter(G, fmt_node): + input_edges = G.in_edges(fmt_node.name) + assert len(input_edges) == 1, "formatter node should only have one input" + input_node = input_edges[0].from_node + fmt_edges = G.out_edges(fmt_node.name) + fmt_qrec = G.quantization and G.quantization.get(NodeId(fmt_node)) + G.remove(fmt_node) + + input_node.dims = fmt_node.out_dims[0] + input_node.out_dims_hint = fmt_node.out_dims_hint + for fmt_edge in fmt_edges: + 
G.add_edge(NNEdge(input_node, fmt_edge.to_node, to_idx=fmt_edge.to_idx)) + if fmt_qrec: + input_qrec = G.quantization[NodeId(input_node)] + input_qrec.out_qs = fmt_qrec.out_qs + input_qrec.in_qs = fmt_qrec.out_qs + G.quantization.remove_node(fmt_node) diff --git a/tools/nntool/graph/matches/matchers/batchnorm_to_discrete_ops.py b/tools/nntool/graph/matches/matchers/batchnorm_to_discrete_ops.py new file mode 100644 index 000000000..9b7e9ae39 --- /dev/null +++ b/tools/nntool/graph/matches/matchers/batchnorm_to_discrete_ops.py @@ -0,0 +1,70 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +import numpy as np +from graph.dim import Dim +from graph.types import (ConstantInputParameters, MatrixSubParameters, + NNEdge) +from graph.types.conv2d import BatchNormalizationParameters +from graph.types.tensor_arithmetic import MatrixMulParameters +from utils.graph import GraphView + +from ..matcher import Matcher, match_name, description, groups, run_qtune_on_match + +LOG = logging.getLogger("nntool." 
+ __name__) + + +@match_name('batchnorm_to_discrete_ops') +@description('Convert BatchNormParameters into a set of broadcasted operations') +@groups('scaled', 'symmetric') +class FuseBatchnorm(Matcher): + + def _match(self, G: GraphView, set_identity: bool = True, **kwargs): + + has_modified_graph = False + for bn_node in G.nodes(node_classes=BatchNormalizationParameters): + w_bn = bn_node.scale / \ + np.sqrt(bn_node.epsilon + bn_node.running_variance) + b_bn = bn_node.bias - bn_node.running_mean * bn_node.scale / \ + np.sqrt(bn_node.running_variance + bn_node.epsilon) + + mul_params = MatrixMulParameters( + G.unique_name(f"{bn_node.name}_mul")) + add_params = MatrixSubParameters( + G.unique_name(f"{bn_node.name}_add")) + broadcasted_shape = [1 if i != bn_node.axis else dim for i, dim in enumerate( + bn_node.in_dims[0].shape)] + scale_node = ConstantInputParameters(G.unique_name(f"{bn_node.name}_scale"), value=w_bn.reshape( + broadcasted_shape), dims=Dim.unnamed(broadcasted_shape)) + bias_node = ConstantInputParameters(G.unique_name(f"{bn_node.name}_bias"), value=b_bn.reshape( + broadcasted_shape), dims=Dim.unnamed(broadcasted_shape)) + + from_node = G.in_edges(bn_node)[0].from_node + to_node = G.out_edges(bn_node)[0].to_node + G.remove(bn_node) + G.add_edge(NNEdge(from_node, mul_params)) + G.add_edge(NNEdge(scale_node, mul_params, to_idx=1)) + G.add_edge(NNEdge(mul_params, add_params)) + G.add_edge(NNEdge(bias_node, add_params, to_idx=1)) + G.add_edge(NNEdge(add_params, to_node)) + + has_modified_graph = True + + if set_identity: + self.set_identity(G) + + return has_modified_graph diff --git a/tools/nntool/graph/matches/matchers/concat_slice.py b/tools/nntool/graph/matches/matchers/concat_slice.py index ef6fb82b6..3fa4f2c44 100644 --- a/tools/nntool/graph/matches/matchers/concat_slice.py +++ b/tools/nntool/graph/matches/matchers/concat_slice.py @@ -163,7 +163,7 @@ def eliminate_slice(self, G, concat, slice_node, remove_nodes, concat_in_idx, re elif 
slice_node.changes_shape: reshape = ReshapeParameters( G.unique_name(f'{slice_node.name}_reshape'), - old_shape=slice_node.post_slice_shape, + old_shape=slice_node.slice_shape, shape=slice_node.out_shape) else: reshape = None diff --git a/tools/nntool/graph/matches/matchers/concat_split.py b/tools/nntool/graph/matches/matchers/concat_split.py index e6a362b0c..d253489a2 100644 --- a/tools/nntool/graph/matches/matchers/concat_split.py +++ b/tools/nntool/graph/matches/matchers/concat_split.py @@ -14,11 +14,13 @@ # along with this program. If not, see . import logging +from graph.matches.match_utils import search_up from graph.types import ConcatParameters, NNEdge, SplitParameters +from graph.types.others import CopyParameters from utils.graph import GraphView -from ..matcher import Matcher, description, groups, match_name +from ..matcher import Matcher, description, groups, match_name, run_before LOG = logging.getLogger("nntool." + __name__) @@ -26,6 +28,7 @@ @groups('*') @match_name("concat_split") @description("removes concat/split pair where all in edges on the concat match the out edges on the split") +@run_before('insert_copies') class ConcatSplitMatch(Matcher): def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool: @@ -35,11 +38,11 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool: if len(in_edges) > 1: continue in_edge = in_edges[0] - if not isinstance(in_edge.from_node, ConcatParameters): - continue - concat_node = in_edge.from_node - if len(G.out_edges(concat_node.name)) > 1: + edges = search_up(G, in_edge, ConcatParameters, can_pass=(CopyParameters,), multi_on_target=False) + if not edges: continue + nodes = [split_node] + [edge.from_node for edge in edges] + concat_node = nodes[-1] if concat_node.axis != split_node.axis: continue axis = concat_node.axis @@ -54,8 +57,7 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool: concat_node.name, split_node.name) concat_in_edges = 
G.indexed_in_edges(concat_node.name) split_out_edges = G.indexed_out_edges(split_node.name) - G.remove(split_node) - G.remove(concat_node) + G.remove_all(nodes) for idx, in_edge in enumerate(concat_in_edges): for out_edge in split_out_edges[idx]: G.add_edge(NNEdge(from_node=in_edge.from_node, from_idx=in_edge.from_idx, diff --git a/tools/nntool/graph/matches/matchers/expand_to_reshape.py b/tools/nntool/graph/matches/matchers/expand_to_reshape.py new file mode 100644 index 000000000..547f0852b --- /dev/null +++ b/tools/nntool/graph/matches/matchers/expand_to_reshape.py @@ -0,0 +1,50 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from graph.manipulations.eliminate_transposes.transpose_helpers import strip_ones +from graph.types.others import ExpandParameters, TransposeParameters +import logging + +from graph.dim import Dim +from graph.types import NNEdge, ReshapeParameters +from utils.graph import GraphView +from utils.node_id import NodeId + +from ..matcher import Matcher, match_name, description, run_before, groups, needs_valid_dimension + +LOG = logging.getLogger("nntool." 
+ __name__) + +@match_name("expand_to_reshape") +@description("remove expands that are really just reshapes") +@run_before('*') +@groups('*') +@needs_valid_dimension(True) +class ExpandToReshape(Matcher): + + def _match(self, G: GraphView, set_identity: bool = True, **kwargs): + modified_graph = False + for node in G.nodes(node_classes=ExpandParameters): + in_shape = node.in_dims[0].shape + out_shape = node.out_dims[0].shape + if strip_ones(in_shape) != strip_ones(out_shape): + continue + LOG.info(f'replacing expand {node.name} with a reshape') + reshape = ReshapeParameters(G.unique_name(f'{node.name}_reshape'), old_shape=in_shape, shape=out_shape) + G.replace_node(node, reshape) + modified_graph = True + + if set_identity: + self.set_identity(G) + + return modified_graph diff --git a/tools/nntool/graph/matches/matchers/fuse_batchnorm.py b/tools/nntool/graph/matches/matchers/fuse_batchnorm.py new file mode 100644 index 000000000..57722664f --- /dev/null +++ b/tools/nntool/graph/matches/matchers/fuse_batchnorm.py @@ -0,0 +1,87 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +import logging + +import numpy as np +from graph.types import (ConstantInputParameters, MatMulOpParameters, MatMulTransposedParameters, + NNEdge) +from graph.types.conv2d import BatchNormalizationParameters +from utils.graph import GraphView + +from ..matcher import Matcher, match_name, description, groups, run_qtune_on_match + +LOG = logging.getLogger("nntool." + __name__) + + +@match_name('fuse_batchnorm') +@description('Fuse batch normalization into MatMul') +@groups('scaled', 'symmetric') +@run_qtune_on_match +class FuseBatchnorm(Matcher): + + def _match(self, G: GraphView, set_identity: bool = True, **kwargs): + + has_modified_graph = False + nodes = [] + for node in G.nodes(node_classes=BatchNormalizationParameters): + in_node = G.indexed_in_edges(node)[0].from_node + if isinstance(in_node, MatMulOpParameters): + nodes.append((node, in_node)) + + for bn_node, filt_node in nodes: + filt_in_edges = G.indexed_in_edges(filt_node.name) + weights_node = filt_in_edges[1].from_node + biases_node = filt_in_edges[2].from_node if len( + filt_in_edges) > 2 else None + w_bn = bn_node.scale / np.sqrt(bn_node.epsilon + bn_node.running_variance) + if not isinstance(weights_node, ConstantInputParameters): + continue + weights = weights_node.dqvalue + if len(w_bn) > 1: + if not isinstance(filt_node, MatMulTransposedParameters): + weights = np.swapaxes(weights.copy(), -2, -1) + if weights.shape[-2] != len(w_bn): + LOG.info(f'{filt_node.name} - weights shape does not match batch norm') + continue + if biases_node is None: + biases = np.zeros((weights.shape[-1],)) + biases_node = ConstantInputParameters( + G.unique_name(f'{filt_node.name}_biases'), value=biases) + G.add_edge(NNEdge(from_node=biases, + to_node=filt_node, to_idx=2)) + elif not isinstance(biases_node, ConstantInputParameters): + continue + else: + biases = biases_node.dqvalue + # fold batch norm into conv weights and biases + if len(w_bn) > 1: + w_bn = np.diag(w_bn) + weights = np.matmul(w_bn, weights) + else: + 
weights = weights * w_bn + biases = bn_node.bias + ((biases - bn_node.running_mean) * + bn_node.scale / np.sqrt(bn_node.running_variance + bn_node.epsilon)) + if len(w_bn) > 1 and not isinstance(filt_node, MatMulTransposedParameters): + weights = np.swapaxes(weights, -2, -1) + weights_node.value = weights + biases_node.value = biases + G.remove_and_reconnect(bn_node, edge_class=NNEdge) + has_modified_graph = True + + if set_identity: + self.set_identity(G) + + return has_modified_graph diff --git a/tools/nntool/graph/matches/matchers/fuse_gap_convs.py b/tools/nntool/graph/matches/matchers/fuse_gap_convs.py index 1b4c368cd..beecee70c 100644 --- a/tools/nntool/graph/matches/matchers/fuse_gap_convs.py +++ b/tools/nntool/graph/matches/matchers/fuse_gap_convs.py @@ -25,10 +25,9 @@ TanHActivationParameters) from graph.types.base import NNNodeRef from graph.types.fusions import FusionInputParameters, FusionOutputParameters -from utils.graph import GraphView, NodeRef +from utils.graph import GraphView -from ..matcher import (Matcher, description, groups, match_name, - run_adjust_on_match, run_qtune_on_match) +from ..matcher import (Matcher, description, groups, match_name, run_qtune_on_match) LOG = logging.getLogger("nntool." 
+ __name__) @@ -47,6 +46,8 @@ 'conv_max_active', 'conv_average_active', 'conv_active_max', + 'conv_max', + 'conv_average', ) VALID_ACTIVATIONS_POW2 = ( @@ -62,6 +63,8 @@ 'conv_max_active', 'conv_average_active', 'conv_active_max', + 'conv_max', + 'conv_average', ) @@ -120,8 +123,8 @@ def add_node(self, params, in_fusion=False): try: for cnode in params.contained_nodes(): self.add_node(cnode, in_fusion=True) - except MergeStopError: # @IgnoreException - raise MergeAbortError() + except MergeStopError: + raise MergeAbortError() # @IgnoreException elif isinstance(params, Conv2DParameters): if self.conv or not self.can_add(params): raise MergeStopError() # @IgnoreException @@ -201,7 +204,9 @@ def fusion_type(self): @groups('*') @match_name("fuse_gap_convs") @run_qtune_on_match -@description('Fuse convolutions, pools and activations to match GAP AutoTiler operations') +@description( + 'Fuse convolutions, pools and activations to match GAP AutoTiler operations. Pooling and activation nodes' + ' are also fused into existing convolution fusions.') class MatchAllGapConv(Matcher): def _match(self, G: GraphView, set_identity: bool = True, **kwargs): has_modified_graph = False diff --git a/tools/nntool/graph/matches/matchers/gather_to_split.py b/tools/nntool/graph/matches/matchers/gather_to_split.py index 05aae2550..3acd9dc47 100644 --- a/tools/nntool/graph/matches/matchers/gather_to_split.py +++ b/tools/nntool/graph/matches/matchers/gather_to_split.py @@ -37,6 +37,8 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool: group = gathers_by_origin.setdefault((in_edge.from_node, in_edge.from_idx), []) group.append(gather) for in_edge, gathers in gathers_by_origin.items(): + if len(gathers[0].indices.shape) > 1: + continue # This is too difficult to handle if there are multiple slices axis = gathers[0].axis if not all(gather.axis == axis and len(gather.indices.shape) <= 1 diff --git a/tools/nntool/graph/matches/matchers/move_node_up.py 
b/tools/nntool/graph/matches/matchers/move_node_up.py index 9522f7dcf..36c815b66 100644 --- a/tools/nntool/graph/matches/matchers/move_node_up.py +++ b/tools/nntool/graph/matches/matchers/move_node_up.py @@ -161,7 +161,7 @@ class MoveActivationsMatcherScale8(MoveNodeUpMatcher): @run_before('fuse_gap_convs', 'fuse_gap_linear', 'fuse_gap_pool', 'fuse_op_activation_scale8') class MoveMaxPoolMatcherScale8(MoveNodeUpMatcher): - ValidNodesToPass = (ReluActivationParameters,) + ValidNodesToPass = (ReluActivationParameters, ConcatParameters) ValidFusions = (Conv2DParameters, FcParameters) ValidNodes = (lambda node: isinstance( node, PoolingParameters) and node.pool_type == "max",) diff --git a/tools/nntool/graph/matches/matchers/rnn_unpack.py b/tools/nntool/graph/matches/matchers/rnn_unpack.py index cbbd746d6..bbf2d3316 100644 --- a/tools/nntool/graph/matches/matchers/rnn_unpack.py +++ b/tools/nntool/graph/matches/matchers/rnn_unpack.py @@ -291,7 +291,7 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs): if changes_shape: reshape = ReshapeParameters(unpack_node.name + '_reshape', old_shape=Dim.unnamed( - unpack_node.post_slice_shape), + unpack_node.slice_shape), shape=Dim.unnamed(unpack_node.out_shape)) G.add_edge(NNEdge(from_node=in_edge.from_node, to_node=reshape, from_idx=in_edge.from_idx)) diff --git a/tools/nntool/graph/matches/matchers/slice_to_split.py b/tools/nntool/graph/matches/matchers/slice_to_split.py index 53ab42b5a..74a577db7 100644 --- a/tools/nntool/graph/matches/matchers/slice_to_split.py +++ b/tools/nntool/graph/matches/matchers/slice_to_split.py @@ -140,9 +140,11 @@ def slice_to_split(G, slice_nodes, slices): axis_dim = in_dims[axis] outs = [] splits = [] + two_unused = axis_slice[0] > 0 and axis_slice[1] < axis_dim if axis_slice[0] > 0: + two_unused = True splits.append(axis_slice[0]) - oparams = OutputParameters(G.unique_name('unused')) + oparams = OutputParameters(G.unique_name(f'{slice_node.name}_unused{0 if two_unused else 
""}')) oparams.at_options.allocate = 1 outs.append( ((oparams, 0),)) @@ -151,7 +153,7 @@ def slice_to_split(G, slice_nodes, slices): for edge in G.out_edges(slice_node.name)]) if axis_slice[1] < axis_dim: splits.append(axis_dim - axis_slice[1]) - oparams = OutputParameters(G.unique_name('unused')) + oparams = OutputParameters(G.unique_name(f'{slice_node.name}_unused{1 if two_unused else ""}')) oparams.at_options.allocate = 1 outs.append( ((oparams, 0),)) diff --git a/tools/nntool/graph/nngraph.py b/tools/nntool/graph/nngraph.py index 4448d913b..f7d83da0e 100644 --- a/tools/nntool/graph/nngraph.py +++ b/tools/nntool/graph/nngraph.py @@ -16,13 +16,19 @@ import logging import os import re -from typing import Callable, Generator, Sequence, Tuple, Union +from typing import Any, Callable, Generator, Mapping, Sequence, Tuple, Union import numpy as np +from execution.graph_executer import GraphExecuter +from execution.quantization_mode import QuantizationMode +from interpreter.commands.qtune import SCHEME_NAME_MAPPINGS from quantization.quantization_set import QuantizationSet +from quantization.quantizer.new_quantizer import NewQuantizer from reports.graph_reporter import GraphReporter +from stats.activation_ranges_collector import ActivationRangesCollector from utils.graph import Graph, Node from utils.node_id import NodeId +from utils.stats_funcs import cos_similarity, qsnr from utils.tabular import TextTableRenderer from graph.dim import Dim @@ -33,6 +39,7 @@ from graph.manipulations.dimensions import add_dimensions from graph.manipulations.liveness import calculate_liveness from graph.matches.fusions import fusions +from graph.matches.matches import get_fusions from graph.types import (ConstantInputParameters, InputBaseParameters, InputParameters, MultiplicativeBiasParameters, OutputParameters, ResizerParameters, @@ -383,10 +390,10 @@ def add_input(self, dim: Union[Dim, Tuple[int]], name: str = None, **kwargs) -> def add_constant(self, dim: Union[Dim, Tuple[int]] = 
None, name: str = None, value: np.ndarray = None, - adjust_transpose: Sequence[int]=None, + adjust_transpose: Sequence[int] = None, is_mutated=False, is_intermediate=False, - short_name: str=None) -> NNNodeRef: + short_name: str = None) -> NNNodeRef: """Creates a constant node Args: @@ -401,7 +408,8 @@ def add_constant(self, dim: Union[Dim, Tuple[int]] = None, Returns: NNNodeRef: A reference to the Node in the Graph """ - node_name = name if name else self.unique_name(f"constant_{self.num_constants}") + node_name = name if name else self.unique_name( + f"constant_{self.num_constants}") node = ConstantInputParameters(node_name, dims=dim, value=value, adjust_transpose=adjust_transpose, @@ -445,7 +453,14 @@ def nodes_iterator(self, yield_fusions=True): yield (step_idx, node, fusion_idx, fnode) yield (step_idx, node, None, None) - def adjust_order(self, reshape_weights=True, no_postprocess=False, debug_function: Callable=None, steps: int=None, single_step=False): + def adjust_order( + self, + reshape_weights=True, + no_postprocess=False, + debug_function: Callable = None, + steps: int = None, + single_step=False + ): """Adjusts tensor order to match selected kernels Args: @@ -461,6 +476,15 @@ def adjust_order(self, reshape_weights=True, no_postprocess=False, debug_functio LOG.info("adjusted order") self.graph_identity.is_adjusted = True + @staticmethod + def get_fusions(): + """Returns a dictionary of all the fusion/graph optimization pass names and descriptions + + Returns: + Dict[str, str]: Names and descriptions of graph optimisation passes + """ + return get_fusions() + def fusions(self, *match_names, no_postprocess: bool = False): """Run matchers on the graph @@ -470,7 +494,10 @@ def fusions(self, *match_names, no_postprocess: bool = False): """ fusions(self, *match_names, no_postprocess=no_postprocess) - def add_dimensions(self, quiet=False): + def add_dimensions( + self, + quiet=False + ): """Add dimensions to the graph and calculate execution order and liveness 
Args: @@ -485,7 +512,121 @@ def add_dimensions(self, quiet=False): self, self.graph_state.steps) - def balance_filters(self, step_idx: int=None, precision_threshold=0.20): + def collect_statistics( + self, + input_tensors_iterator: Union[Sequence[Sequence[np.ndarray]], Sequence[np.ndarray]] + ) -> Mapping[Union[str, Tuple[str, str]], Mapping]: + """Collect tensor statistics for quantization + + Args: + input_tensors_iterator (Union[Sequence[Sequence[np.ndarray]], Sequence[np.ndarray]]): + If the graph has a single input this can just be an iterator over numpy arrays. If the graph has + multiple inputs then it should be an iterator over sequences of numpy arrays. + + Returns: + Mapping[Union[str, Tuple[str, str]], Mapping]: Mapping of statistics for each node's inputs and outputs + """ + stats_collector = ActivationRangesCollector() + for input_tensors in input_tensors_iterator: + if isinstance(input_tensors, np.ndarray): + input_tensors = [input_tensors] + stats_collector.collect_stats(self, input_tensors) + return {k.key: v for k, v in stats_collector.stats.items()} + + @staticmethod + def qsnrs(tensors1, tensors2, idx=0): + return tuple([qsnr(t1[idx], t2[idx]) if len(t1) > idx and len(t2) > idx else None for t1, t2 in zip(tensors1, tensors2)]) + + @staticmethod + def cos_sim(tensors1, tensors2, idx=0): + return tuple([cos_similarity(t1[idx], t2[idx]) if len(t1) > idx and len(t2) > idx else None for t1, t2 in zip(tensors1, tensors2)]) + + def quantize( + self, + statistics: Mapping[Union[str, Tuple[str, str]], Mapping] = None, + schemes: Sequence[str] = None, + graph_options: Mapping[str, Any] = None, + node_options: Mapping[Union[str, Tuple[str, str]], + Mapping[str, Any]] = None, + read_existing_options = True + ) -> None: + """Quantize the graph + + Args: + statistics (Mapping[Union[str, Tuple[str, str]], Mapping], optional): Statistics collected by the NNGraph.collect_statistics + method. 
+ schemes (Sequence[], optional): Sequence of schemes "scaled", "pow2", or "float" to use in priority order. If None use scaled. Defaults to None. + graph_options (Mapping[str, Any], optional): Quantization options to set for the whole graph. Defaults to None. + node_options (Mapping[Union[str, Tuple[str, str]], Mapping[str, Any]], optional): + Quantization options to set for specific nodes. The map key should be the node name or if the node is inside a fusion + then a tuple of the fusion name and the node name. Defaults to None. + read_existing_options (bool, optional): Incorporate existing quantization options and schemes in the graph. Leaving this as + True and just supplying graph_option, node_options and/or schemes is the equivalent of the nntool qtune command + """ + quantizer = NewQuantizer(self) + if schemes: + for scheme in schemes: + scheme = scheme.lower() + if scheme not in SCHEME_NAME_MAPPINGS: + raise ValueError(f'invalid scheme name {scheme}') + quantizer.schemes.append(SCHEME_NAME_MAPPINGS[scheme]) + elif 'SQ8' not in quantizer.schemes: + quantizer.schemes.append('SQ8') + options = {} + if graph_options: + options.update(graph_options) + if node_options: + options.update({NodeId(name) if isinstance(name, str) else NodeId(*name): v + for name, v in node_options.items()}) + quantizer.set_stats(statistics) + quantizer.update_options(options) + quantizer.quantize() + + def execute( + self, + input_tensors: Union[np.ndarray, Sequence[np.ndarray]], + quantize=False, + dequantize=False, + output_fusion_tensors=False + ) -> Sequence[Sequence[np.ndarray]]: + """Runs inference on the graph + + Args: + input_tensors (Union[np.ndarray, Sequence[np.ndarray]]): + Numpy arrays containing inputs (which should be normalized and in float) + If there is only one input it can be specified without a sequence. + quantize (bool, optional): Run the graph using quantization parameters. Defaults to False. + dequantize (bool, optional): Dequantize outputs. Implies quantize. 
Defaults to False. + output_fusion_tensors (bool, optional): Output outputs from nodes that have been fused. Defaults to False. + + Raises: + ValueError: Incorrect parameters + + Returns: + Sequence[Sequence[np.ndarray]]: List of lists of outputs of each node in the graph. If output_fusion_tensors + is True this will also include the output of nodes contained inside fusions (except fused expressions) + """ + if dequantize: + quantize = True + if quantize: + if self.quantization is None or not self.quantization.verify_quantization(self): + raise ValueError('graph is not quantized') + if dequantize: + qmode = QuantizationMode.all_dequantize() + else: + qmode = QuantizationMode.all() + else: + qmode = QuantizationMode.none() + if isinstance(input_tensors, np.ndarray): + input_tensors = [input_tensors] + executer = GraphExecuter(self, self.quantization) + return executer.execute(input_tensors, qmode=qmode, append_fusion_output=output_fusion_tensors) + + def balance_filters( + self, + step_idx: int = None, + precision_threshold=0.20 + ): """Experimental filter balancing routines Args: diff --git a/tools/nntool/graph/types/base.py b/tools/nntool/graph/types/base.py index e80eb5223..a75113604 100644 --- a/tools/nntool/graph/types/base.py +++ b/tools/nntool/graph/types/base.py @@ -28,14 +28,6 @@ LOG = logging.getLogger("nntool." 
+ __name__) -class ParameterError(Exception): - pass - - -class CantPromoteQError(ParameterError): - pass - - class NodeOptions(OptionList): def __init__(self, *args, **kwargs): super(NodeOptions, self).__init__(*args, **kwargs) @@ -253,13 +245,6 @@ def value(self): def value(self, val): self._value = val - @property - def can_promoteq(self): - return False - - def promoteq(self): - raise CantPromoteQError() - @property def in_dims(self): return self._in_dims diff --git a/tools/nntool/graph/types/conv2d.py b/tools/nntool/graph/types/conv2d.py index b9444314f..3e7b7bed2 100644 --- a/tools/nntool/graph/types/conv2d.py +++ b/tools/nntool/graph/types/conv2d.py @@ -27,7 +27,7 @@ class BatchNormalizationParameters(NoSizeChangeParameters, SingleInputAndOutput, SensitiveToOrder): #pylint: disable-msg=too-many-arguments - def __init__(self, name, scale=None, bias=None, running_mean=None, + def __init__(self, name, scale=None, bias=None, running_mean=None, axis=0, running_variance=None, spatial=None, momentum=None, epsilon=None, **kwargs): super(BatchNormalizationParameters, self).__init__(name, **kwargs) self.scale = scale @@ -37,6 +37,7 @@ def __init__(self, name, scale=None, bias=None, running_mean=None, self.spatial = spatial self.momentum = momentum self.epsilon = epsilon + self.axis = axis @property def can_equalize(self): diff --git a/tools/nntool/graph/types/expression_fusion.py b/tools/nntool/graph/types/expression_fusion.py index 43afedf3b..554027cc2 100644 --- a/tools/nntool/graph/types/expression_fusion.py +++ b/tools/nntool/graph/types/expression_fusion.py @@ -18,6 +18,7 @@ from collections import Counter from expressions.symbolic.function_collection import FunctionCollection +from expressions.symbolic.iteration_space import Assignments from expressions.symbolic.symbol import Constant, Variable from utils.node_id import NodeId @@ -138,9 +139,9 @@ def details_collector(self, stats, stat, details): def is_same_operation_as(self, G, other): if not isinstance(other, 
ExpressionFusionParameters): return False - if len(self.func_col.functions) != 1 or len(other.func_col.functions) != 1: + if len(self.func_col) != 1 or len(other.func_col) != 1: return False - if next(iter(self.func_col.functions.values())).equivalent(next(iter(other.func_col.functions.values()))): + if self.func_col[0][1].equivalent(other.func_col[0][1]): return True return False @@ -156,7 +157,7 @@ def decompose(self, qrecs=None): LOG.info("expression decomposed into %s intermediate and %s output expressions", len(intermediates), len(outputs)) - expressions = [] + expressions = Assignments() inter_vars = {node: Variable( node.name, shape=node.dims.shape) for node in inputs} # TODO - Intermediates are not sorted here so there may be interdependences @@ -172,38 +173,37 @@ def decompose(self, qrecs=None): variable=variable, qrecs=qrecs) inter_vars[node] = variable - expressions.append(expr) + expressions.add(*expr) for node in outputs: expr = self.compose_expression( self.subgraph, node, inter_vars, qrecs=qrecs) - expressions.append(expr) + expressions.add(*expr) # sort the inputs by idx inputs = sorted([node for node in inputs], key=lambda x: x.idx) outputs = sorted([node for node in outputs], key=lambda x: x.idx) - func_col = FunctionCollection(expressions) - return [node.name for node in inputs], [node.name for node in outputs], func_col + return [node.name for node in inputs], [node.name for node in outputs], expressions def get_output_size(self, in_dims): # the input shapes may have changed so the expression variables shapes could have # changed and the iterators will need to be recalculated - dim_change = False + # dim_change = False in_vars = [self.func_col.variables[name] for name in self.input_symbols] for idx, dim in enumerate(in_dims): shape = tuple(dim.shape) if tuple(in_vars[idx].shape) != shape: in_vars[idx].shape = shape - dim_change = True - if dim_change: - self.func_col.set_var_shapes() + # dim_change = True + # if dim_change: + # 
self.func_col.set_var_shapes() out_dims = super().get_output_size(in_dims) - if dim_change: # if the input shapes haven't changed then the output shapes have not changed - out_vars = [self.func_col.variables[name] for name in self.output_symbols] - for idx, dim in enumerate(out_dims): - out_vars[idx].shape = tuple(dim.shape) - self.func_col.init_indexes() # recalculate the iterators + # if dim_change: # if the input shapes haven't changed then the output shapes have not changed + # out_vars = [self.func_col.variables[name] for name in self.output_symbols] + # for idx, dim in enumerate(out_dims): + # out_vars[idx].shape = tuple(dim.shape) + # self.func_col.init_indexes() # recalculate the iterators return out_dims def __str__(self): diff --git a/tools/nntool/graph/types/others.py b/tools/nntool/graph/types/others.py index 70bacc828..f22874cc6 100644 --- a/tools/nntool/graph/types/others.py +++ b/tools/nntool/graph/types/others.py @@ -142,7 +142,7 @@ def __str__(self): @cls_op_name('expand') -class ExpandParameters(Parameters, InsensitiveToQuantization): +class ExpandParameters(Parameters, SensitiveToOrder, InsensitiveToQuantization): def __init__(self, *args, shape=None, **kwargs): super(ExpandParameters, self).__init__(*args, **kwargs) self.shape = shape @@ -178,6 +178,26 @@ def get_output_size(self, in_dims): def __str__(self): return f"{self.shape}" +@cls_op_name('scatternd') +class ScatterNdParameters(Parameters, SensitiveToOrder): + def __init__(self, *args, indices=None, updates=None, reduction=None, **kwargs): + super(ScatterNdParameters, self).__init__(*args, **kwargs) + self.indices = indices + self.updates = updates + self.reduction = reduction + + def get_parameter_size(self): + return 0 + + @property + def can_equalize(self): + return False + + def get_output_size(self, in_dims): + return [Dim.unnamed(in_dims[0].shape)] + + def __str__(self): + return "" @cls_op_name('quantize') class QuantizeParameters(Parameters, ComparableParameters): @@ -429,8 +449,6 
@@ def __init__(self, *args, super(StridedSliceParameters, self).__init__(*args, **kwargs) self.act_slice = act_slice - self.slice_shape = tuple( - int(abs(math.ceil((sl[1] - sl[0])/sl[2]))) for sl in self.act_slice) self.out_shape = tuple(out_shape) @property @@ -443,27 +461,17 @@ def graph_anon_label(self): @property def slice_shape(self): - return self._slice_shape - - @slice_shape.setter - def slice_shape(self, val): - self._slice_shape = tuple(val) + return tuple( + int(abs(math.ceil((max(sl[1], -1) - max(sl[0], -1))/sl[2]))) for sl in self.act_slice) @property def slices_axes(self): in_shape = self.in_dims[0].shape - return tuple(idx for idx, shapes in enumerate(zip(self.post_slice_shape, in_shape)) if shapes[0] != shapes[1]) - - @property - def post_slice_shape(self): - old_settings = np.seterr(all='raise') - res = tuple(abs(((sl[1] if sl[1] >= -1 else -1) - sl[0])//sl[2]) for sl in self.act_slice) - np.seterr(**old_settings) - return res + return tuple(idx for idx, shapes in enumerate(zip(self.slice_shape, in_shape)) if shapes[0] != shapes[1]) @property def changes_shape(self): - return self.post_slice_shape != self.out_shape + return self.slice_shape != self.out_shape @property def can_equalize(self): @@ -509,7 +517,7 @@ def does_nothing(self) -> bool: def no_model_code(self) -> bool: if not self.in_dims: return False - return self.post_slice_shape == tuple(self.in_dims[0].shape) + return self.slice_shape == tuple(self.in_dims[0].shape) def get_parameter_size(self): return 0 diff --git a/tools/nntool/importer/common/constant_mixin.py b/tools/nntool/importer/common/constant_mixin.py index ddaae71d0..8cc879542 100644 --- a/tools/nntool/importer/common/constant_mixin.py +++ b/tools/nntool/importer/common/constant_mixin.py @@ -47,3 +47,12 @@ def record_constant_qrec(cls, inp, cnode, **kwargs): if qrecs is None: return qrecs[NodeId(cnode)] = QRec.scaled(out_qs=[qtype]) + + @classmethod + def move_stat(cls, inp, new_name, **kwargs): + cnid = NodeId(new_name) + 
onid = NodeId(inp[0]) + qopts = kwargs.get('qopts', {}) + if onid in qopts: + qopts[cnid] = qopts[onid] + del qopts[onid] diff --git a/tools/nntool/importer/onnx/common/__init__.py b/tools/nntool/importer/onnx/common/__init__.py index 834daa668..e238b949c 100644 --- a/tools/nntool/importer/onnx/common/__init__.py +++ b/tools/nntool/importer/onnx/common/__init__.py @@ -4,7 +4,20 @@ from onnx import TensorProto, mapping, helper -logger = logging.getLogger('nntool.' + __name__) +_logger = logging.getLogger('nntool.' + __name__) + +class logger: + @staticmethod + def info(*args, **kwargs): + _logger.info(*args, **kwargs) + + @staticmethod + def warning(*args, **kwargs): + _logger.warning(*args, **kwargs) + + @staticmethod + def debug(*args, **kwargs): + _logger.debug(*args, **kwargs) def get_unique_suffix(): """ Get unique suffix by using first 8 chars from uuid.uuid4 diff --git a/tools/nntool/importer/onnx/common/handler_helper.py b/tools/nntool/importer/onnx/common/handler_helper.py index e6d32a8bf..6d4723a46 100644 --- a/tools/nntool/importer/onnx/common/handler_helper.py +++ b/tools/nntool/importer/onnx/common/handler_helper.py @@ -87,6 +87,7 @@ def get_all_backend_handlers(opset_dict): return handlers + def get_backend_coverage(): """ Get backend coverage for document. 
diff --git a/tools/nntool/importer/onnx/handlers/backend/add.py b/tools/nntool/importer/onnx/handlers/backend/add.py index 7b5dcff3a..fa67f4db9 100644 --- a/tools/nntool/importer/onnx/handlers/backend/add.py +++ b/tools/nntool/importer/onnx/handlers/backend/add.py @@ -37,3 +37,7 @@ def version_7(cls, node, **kwargs): @classmethod def version_13(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/batch_normalization.py b/tools/nntool/importer/onnx/handlers/backend/batch_normalization.py index 0f01ef96a..645c72b30 100644 --- a/tools/nntool/importer/onnx/handlers/backend/batch_normalization.py +++ b/tools/nntool/importer/onnx/handlers/backend/batch_normalization.py @@ -95,7 +95,8 @@ def _common(cls, node, **kwargs): params = BatchNormalizationParameters(valid_name, scale=bn_scale, bias=bn_bias, running_mean=running_mean, running_variance=running_variance, spatial=spatial, - momentum=momentum, epsilon=epsilon) + momentum=momentum, epsilon=epsilon, + axis=0) G.add_edge(NNEdge(from_node=x[0], to_node=params, from_idx=x[1], to_idx=0)) all_nodes[node.output[0]] = (params, 0, deepcopy(x[2]), None) return params @@ -115,3 +116,11 @@ def version_7(cls, node, **kwargs): @classmethod def version_9(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) + + @classmethod + def version_15(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/concat_mixin.py b/tools/nntool/importer/onnx/handlers/backend/concat_mixin.py index a5a658467..0663e06c4 100644 --- a/tools/nntool/importer/onnx/handlers/backend/concat_mixin.py +++ b/tools/nntool/importer/onnx/handlers/backend/concat_mixin.py @@ -32,7 +32,7 @@ def gen_concat(cls, node, inputs, axis, **kwargs): all_nodes = 
kwargs['all_nodes'] G = kwargs['G'] valid_name = kwargs['valid_name'] - inputs = [all_nodes[inp] for inp in node.input] + inputs = [all_nodes[inp] for inp in node.input if all_nodes[inp][2].shape] input_shapes = [inp[2].shape for inp in inputs] axis_sum = sum(shape[axis] for shape in input_shapes) axis = axis if axis >= 0 else len(input_shapes[0]) + axis diff --git a/tools/nntool/importer/onnx/handlers/backend/conv_mixin.py b/tools/nntool/importer/onnx/handlers/backend/conv_mixin.py index a672c75a5..920394d11 100644 --- a/tools/nntool/importer/onnx/handlers/backend/conv_mixin.py +++ b/tools/nntool/importer/onnx/handlers/backend/conv_mixin.py @@ -17,6 +17,7 @@ from copy import deepcopy import numpy as np + from graph.dim import Conv2DFilterDim, DilationDim, Dim, StrideDim from graph.types import (ConstantInputParameters, Conv2DParameters, NNEdge, ReshapeParameters) @@ -86,7 +87,9 @@ def conv(cls, node, quantized=False, **kwargs): # M x C/group x kH x kW weights_idx = 3 if quantized else 1 weights_node = inputs[weights_idx][0] - weights_node.name = f'{valid_name}_weights' + new_name = f'{valid_name}_weights' + cls.move_stat(inputs[weights_idx], new_name, **kwargs) + weights_node.name = new_name weights = cls.get_constant(inputs[weights_idx]) out_c = weights.shape[0] group = node.attrs.get("group", 1) @@ -203,6 +206,11 @@ def conv(cls, node, quantized=False, **kwargs): # check if input needs a reshape if conv_in_shape != real_in_shape: + # if batch is present add it back + if batch is not None: + conv_in_shape = (batch,) + conv_in_shape + if np.prod(real_in_shape) != np.prod(conv_in_shape): + raise ValueError(f'shape inference issue {valid_name} filter indicates {conv_in_shape} but has an input of {real_in_shape}') r1_params = ReshapeParameters(f'{valid_name}_reshape_in', old_shape=Dim.unnamed(real_in_shape), shape=Dim.unnamed(conv_in_shape)) diff --git a/tools/nntool/importer/onnx/handlers/backend/conv_transpose.py
b/tools/nntool/importer/onnx/handlers/backend/conv_transpose.py index cddd73fb2..17bd0f764 100644 --- a/tools/nntool/importer/onnx/handlers/backend/conv_transpose.py +++ b/tools/nntool/importer/onnx/handlers/backend/conv_transpose.py @@ -120,7 +120,7 @@ def _common(cls, node, **kwargs): dims=Dim.unnamed( biases.shape)) - padding, dilations, strides, output_padding = cls.calc_shapes(node, spatial_size, Dim2D((h, w)), Dim2D((filt_h, filt_w))) + padding, dilations, strides, output_padding = cls.calc_shapes(node, spatial_size, Dim2D(h, w), Dim2D(filt_h, filt_w)) params = TransposeConv2DParameters(valid_name, filt=filt_dim, diff --git a/tools/nntool/importer/onnx/handlers/backend/div.py b/tools/nntool/importer/onnx/handlers/backend/div.py index 39c9bc963..c668dd043 100644 --- a/tools/nntool/importer/onnx/handlers/backend/div.py +++ b/tools/nntool/importer/onnx/handlers/backend/div.py @@ -38,3 +38,7 @@ def version_7(cls, node, **kwargs): @classmethod def version_13(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/expand.py b/tools/nntool/importer/onnx/handlers/backend/expand.py index dd83c73c4..fa7026a8f 100644 --- a/tools/nntool/importer/onnx/handlers/backend/expand.py +++ b/tools/nntool/importer/onnx/handlers/backend/expand.py @@ -17,10 +17,11 @@ from graph.types import ConstantInputParameters, ExpandParameters from graph.types.base import NNEdge from importer.common.constant_mixin import ConstantMixin +from importer.common.provisional_dim import ProvisionalDim from importer.onnx.common import logger from ..backend_handler import BackendHandler -from ..handler import onnx_op, constant_only +from ..handler import constant_only, onnx_op from .broadcast_mixin import BroadcastMixin @@ -38,7 +39,6 @@ def _common(cls, node, **kwargs): y = inputs[1] shape = cls.get_constant(y) - pshape = cls.broadcast_to(x, shape) if 
cls.is_constant(x): logger.info("reducing %s to a constant", valid_name) x_val = cls.get_constant(x) @@ -47,7 +47,7 @@ def _common(cls, node, **kwargs): params = ExpandParameters(valid_name, shape=shape) G.add_edge(NNEdge(x[0], params, from_idx=x[1])) - all_nodes[node.output[0]] = (params, 0, pshape, x[3]) + all_nodes[node.output[0]] = (params, 0, ProvisionalDim(shape), x[3]) return params @classmethod diff --git a/tools/nntool/importer/onnx/handlers/backend/gather.py b/tools/nntool/importer/onnx/handlers/backend/gather.py index 911396330..b86f44828 100644 --- a/tools/nntool/importer/onnx/handlers/backend/gather.py +++ b/tools/nntool/importer/onnx/handlers/backend/gather.py @@ -38,11 +38,16 @@ def _common(cls, node, **kwargs): x = inputs[0] x_shape = x[2].shape y = inputs[1] + y_shape = y[2].shape indices = cls.get_constant(y) axis = node.attrs.get('axis', 0) - pshape = ProvisionalDim( - x_shape[:axis:] + list(indices.shape) + x_shape[axis + 1:]) + if not y_shape: + pshape = ProvisionalDim( + x_shape[:axis:] + x_shape[axis + 1:]) + else: + pshape = ProvisionalDim( + x_shape[:axis:] + list(indices.shape) + x_shape[axis + 1:]) if cls.is_constant(x): x_val = cls.get_constant(x) logger.info( @@ -57,7 +62,10 @@ def _common(cls, node, **kwargs): out_shape = pshape.known_shape.copy() params = StridedSliceParameters( valid_name, act_slice=act_slice, out_shape=out_shape) - if params.post_slice_shape == tuple(x[2].known_shape): + if params.slice_shape == tuple(x[2].known_shape): + if np.ndim(indices) == 0 and pshape.shape[idx] is not None: + del out_shape[idx] + pshape = ProvisionalDim(out_shape) params = ReshapeParameters(valid_name, old_shape=tuple( x[2].known_shape), shape=out_shape) else: diff --git a/tools/nntool/importer/onnx/handlers/backend/gru.py b/tools/nntool/importer/onnx/handlers/backend/gru.py index 6e6bb8c2d..72625db7b 100644 --- a/tools/nntool/importer/onnx/handlers/backend/gru.py +++ b/tools/nntool/importer/onnx/handlers/backend/gru.py @@ -94,3 +94,7 @@ def 
version_3(cls, node, **kwargs): @classmethod def version_7(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/lstm.py b/tools/nntool/importer/onnx/handlers/backend/lstm.py index c043fd030..230aa0835 100644 --- a/tools/nntool/importer/onnx/handlers/backend/lstm.py +++ b/tools/nntool/importer/onnx/handlers/backend/lstm.py @@ -81,3 +81,7 @@ def version_1(cls, node, **kwargs): @classmethod def version_7(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/mul.py b/tools/nntool/importer/onnx/handlers/backend/mul.py index da171f978..6c8f7f1d4 100644 --- a/tools/nntool/importer/onnx/handlers/backend/mul.py +++ b/tools/nntool/importer/onnx/handlers/backend/mul.py @@ -36,3 +36,7 @@ def version_7(cls, node, **kwargs): @classmethod def version_13(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/nncf_fake_quantize.py b/tools/nntool/importer/onnx/handlers/backend/nncf_fake_quantize.py index ff395a6c3..9d322f005 100644 --- a/tools/nntool/importer/onnx/handlers/backend/nncf_fake_quantize.py +++ b/tools/nntool/importer/onnx/handlers/backend/nncf_fake_quantize.py @@ -18,6 +18,7 @@ import numpy as np from importer.common.constant_mixin import ConstantMixin from quantization.qtype import QType +from utils.node_id import NodeId from ..backend_handler import BackendHandler from ..handler import domain, onnx_op @@ -38,8 +39,7 @@ def _common(cls, node, **kwargs): if auto_broadcast != 'numpy': raise ValueError(f'{valid_name} - only numpy is supported for auto_broadcast') - qstats = kwargs.get('quant_stats', {}) - qopts = 
kwargs.get('quant_opts', {}) + qopts = kwargs.get('qopts', {}) x = inputs[0] # input_low = inputs[1] # input_high = inputs[2] @@ -54,6 +54,7 @@ def _common(cls, node, **kwargs): raise ValueError(f"{valid_name} - don't know how to handle more than {math.pow(2, 16)} levels") bits = int(math.log2(levels)) + qopts.setdefault(NodeId(x[0]), {'output_size': [None] * (x[1] + 1)})['output_size'][x[1]] = bits low_shape = output_low.shape high_shape = output_high.shape bc_dims_low = sum(1 for dim in high_shape if dim > 1) diff --git a/tools/nntool/importer/onnx/handlers/backend/range.py b/tools/nntool/importer/onnx/handlers/backend/range.py new file mode 100644 index 000000000..242544e0f --- /dev/null +++ b/tools/nntool/importer/onnx/handlers/backend/range.py @@ -0,0 +1,47 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +import numpy as np +from graph.types import ConstantInputParameters +from importer.common.constant_mixin import ConstantMixin +from importer.common.provisional_dim import ProvisionalDim + +from ..backend_handler import BackendHandler +from ..handler import constant_only, onnx_op + + +@onnx_op("Range") +@constant_only(True) +class Range(BackendHandler, ConstantMixin): + + @classmethod + def _common(cls, node, **kwargs): + all_nodes = kwargs['all_nodes'] + G = kwargs['G'] + valid_name = kwargs['valid_name'] + value = node.attrs.get('value', 0) + inputs = [all_nodes[inp] if inp else None for inp in node.input] + if len(inputs) != 3: + raise ValueError(f'Range {valid_name} does not have 3 inputs') + start, limit, delta = [cls.get_constant(x) for x in inputs] + value = np.arange(start, limit, delta, dtype=start.dtype) + params = ConstantInputParameters(valid_name, + value=value) + all_nodes[node.output[0]] = (params, 0, ProvisionalDim(value.shape), None) + return params + + @classmethod + def version_11(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/reducer_mixin.py b/tools/nntool/importer/onnx/handlers/backend/reducer_mixin.py index 1d126bda0..945fed447 100644 --- a/tools/nntool/importer/onnx/handlers/backend/reducer_mixin.py +++ b/tools/nntool/importer/onnx/handlers/backend/reducer_mixin.py @@ -13,6 +13,8 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
+from functools import reduce + from graph.dim import Dim from graph.types import (ConstantInputParameters, GlobalPoolingParameters, NNEdge, NoOPParameters) @@ -20,6 +22,15 @@ from importer.common.provisional_dim import ProvisionalDim from importer.onnx.common import logger +def axis_reduction(shape, axes): + def reduction(state, idx_dim): + idx, dim = idx_dim + if dim is None: + return state[0], state[1] + if idx in axes: + return state[0] + 1, state[1] + [state[0]] + return state[0] + 1, state[1] + return tuple(reduce(reduction, enumerate(shape), (0,[]))[1]) class ReducerMixin(ConstantMixin): @classmethod @@ -43,8 +54,7 @@ def _common(cls, node, copy_qtype=False, constant_operation=None, **kwargs): x_rank for axis in axes), "axis out of bounds" keep_dims = node.attrs.get('keepdims', 1) - stripped_axes = [axis for axis in axes if x_shape[axis] is not None] - + stripped_axes = axis_reduction(x_shape, axes) if not stripped_axes: params = NoOPParameters(valid_name) pout_shape = x_shape.copy() @@ -57,22 +67,16 @@ def _common(cls, node, copy_qtype=False, constant_operation=None, **kwargs): else: pout_shape = [dim for idx, dim in enumerate( x_shape) if idx not in axes] - # if all(dim is None for dim in pout_shape): - # pout_shape.append(1) - # subtract 1 from axis for all None's preceeding it and remove - # axes that are not defined - axes = [ax - sum([1 if dim is None else 0 for dim in x_shape[:ax:]]) - for ax in stripped_axes] if cls.is_constant(x) and constant_operation: - val = constant_operation(cls.get_constant(x), axis=tuple(axes), keepdims=keep_dims) + val = constant_operation(cls.get_constant(x), axis=stripped_axes, keepdims=keep_dims) if val.size < 10: logger.info("reducing %s to a constant %s", valid_name, val) else: logger.info("reducing %s to a constant", valid_name) params = ConstantInputParameters(valid_name, value=val, dims=Dim.unnamed(val.shape)) else: - params = GlobalPoolingParameters(valid_name, pool_type=reduce_type, axis=tuple(axes), + params = 
GlobalPoolingParameters(valid_name, pool_type=reduce_type, axis=stripped_axes, keep_dims=keep_dims) G.add_edge( diff --git a/tools/nntool/importer/onnx/handlers/backend/relu.py b/tools/nntool/importer/onnx/handlers/backend/relu.py index 432208acc..626460a54 100644 --- a/tools/nntool/importer/onnx/handlers/backend/relu.py +++ b/tools/nntool/importer/onnx/handlers/backend/relu.py @@ -42,3 +42,7 @@ def version_6(cls, node, **kwargs): @classmethod def version_13(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/reshape.py b/tools/nntool/importer/onnx/handlers/backend/reshape.py index d5f0dda54..ab930b296 100644 --- a/tools/nntool/importer/onnx/handlers/backend/reshape.py +++ b/tools/nntool/importer/onnx/handlers/backend/reshape.py @@ -111,3 +111,7 @@ def version_5(cls, node, **kwargs): @classmethod def version_13(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/resize.py b/tools/nntool/importer/onnx/handlers/backend/resize.py index f5545d46a..0b3d42eda 100644 --- a/tools/nntool/importer/onnx/handlers/backend/resize.py +++ b/tools/nntool/importer/onnx/handlers/backend/resize.py @@ -14,8 +14,10 @@ # along with this program. If not, see . 
import numpy as np +from pytest import param from graph.dim import Dim from graph.types import NNEdge, ReshapeParameters +from graph.types.constant_input import ConstantInputParameters from graph.types.resizers import (BilinearResizerParameters, NearestNeighborResizerParameters) from importer.common.constant_mixin import ConstantMixin @@ -51,6 +53,13 @@ def _common(cls, node, scales, sizes, nearest_mode='round_prefer_ceil', **kwargs else: sizes = [None if x_shape[idx] is None else dim for idx, dim in enumerate(sizes)] + + if np.prod([sz for sz in sizes if sz is not None]) == 0: + logger.warn(f'{valid_name} has null output shape') + params = ConstantInputParameters(valid_name, value=np.array([])) + all_nodes[node.output[0]] = (params, 0, ProvisionalDim([]), x[3]) + return params + if spatial_size == 1: sizes.insert(-1, 1) diff --git a/tools/nntool/importer/onnx/handlers/backend/rnn.py b/tools/nntool/importer/onnx/handlers/backend/rnn.py index ce22a6340..a871f2719 100644 --- a/tools/nntool/importer/onnx/handlers/backend/rnn.py +++ b/tools/nntool/importer/onnx/handlers/backend/rnn.py @@ -80,3 +80,7 @@ def version_1(cls, node, **kwargs): @classmethod def version_7(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/rnn_mixin.py b/tools/nntool/importer/onnx/handlers/backend/rnn_mixin.py index 6f8a17329..eac9090bc 100644 --- a/tools/nntool/importer/onnx/handlers/backend/rnn_mixin.py +++ b/tools/nntool/importer/onnx/handlers/backend/rnn_mixin.py @@ -121,6 +121,13 @@ def attach_rnn(G, x, rnn_params_class, extra_args, valid_name, tensors, t = tensors['forward' if i == 0 else 'backward'] for idx, name in enumerate(rnn_params.INPUT_NAMES): if name == 'input': + # x_shape = x[2].shape + # new_shape = [x_shape[0] if x_shape[0] is not None else 1, x_shape[-1]] + # reshape_param = ReshapeParameters(f"{valid_name}_reshape", 
old_shape=x_shape, shape=new_shape) + # G.add_edge( + # NNEdge(from_node=x[0], to_node=reshape_param, from_idx=x[1], to_idx=0)) + # G.add_edge( + # NNEdge(from_node=reshape_param, to_node=rnn_params, from_idx=0, to_idx=0)) G.add_edge( NNEdge(from_node=x[0], to_node=rnn_params, from_idx=x[1], to_idx=0)) continue diff --git a/tools/nntool/importer/onnx/handlers/backend/scatternd.py b/tools/nntool/importer/onnx/handlers/backend/scatternd.py new file mode 100644 index 000000000..9e3b6b3a0 --- /dev/null +++ b/tools/nntool/importer/onnx/handlers/backend/scatternd.py @@ -0,0 +1,83 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +import numpy as np +from graph.types import ConstantInputParameters, NNEdge +from graph.types.others import ScatterNdParameters +from importer.common.constant_mixin import ConstantMixin +from importer.common.provisional_dim import ProvisionalDim +from importer.onnx.common import logger + +from ..backend_handler import BackendHandler +from ..handler import onnx_op, partial_support, ps_description + +def scatter_nd_impl(data, indices, updates, reduction='none'): + # Check tensor shapes + assert indices.shape[-1] <= len(data.shape) + assert updates.shape == indices.shape[:-1] + data.shape[indices.shape[-1]:] + + # Compute output + output = np.copy(data) + for i in np.ndindex(indices.shape[:-1]): + if reduction == 'add': + output[indices[i]] += updates[i] + elif reduction == 'mul': + output[indices[i]] *= updates[i] + else: + output[indices[i]] = updates[i] + return output + +@onnx_op("ScatterND") +@partial_support(True) +@ps_description('ScatterND is only supported at input and is not supported by nntool or autotiler kernels') +class ScatterND(ConstantMixin, BackendHandler): + + @classmethod + def _common(cls, node, **kwargs): + all_nodes = kwargs['all_nodes'] + G = kwargs['G'] + valid_name = kwargs['valid_name'] + inputs = [all_nodes[inp] for inp in node.input] + x = inputs[0] + x_shape = x[2].shape + indices = cls.get_constant(inputs[1]) + updates = inputs[2] + reduction = node.attrs.get('reduction', None) + + pshape = ProvisionalDim(x_shape) + if cls.is_constant(x) and cls.is_constant(updates): + logger.info("reducing %s to a constant", valid_name) + x_val = cls.get_constant(x) + updates_val = cls.get_constant(updates) + params = ConstantInputParameters(valid_name, value=scatter_nd_impl(x_val, indices, updates_val, reduction=reduction)) + else: + logger.warning(f'{valid_name} ScatterND is not currently supported in the nntool or Autotiler kernels') + params = ScatterNdParameters(valid_name, indices=indices, updates=updates, reduction=reduction) + 
G.add_edge(NNEdge(from_node=x[0], to_node=params, from_idx=x[1], to_idx=0)) + G.add_edge(NNEdge(from_node=updates[0], to_node=params, from_idx=updates[1], to_idx=1)) + all_nodes[node.output[0]] = (params, 0, pshape, x[3]) + return params + + @classmethod + def version_11(cls, node, **kwargs): + return cls._common(node, **kwargs) + + @classmethod + def version_13(cls, node, **kwargs): + return cls._common(node, **kwargs) + + @classmethod + def version_16(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/sub.py b/tools/nntool/importer/onnx/handlers/backend/sub.py index 866db635c..a290997c3 100644 --- a/tools/nntool/importer/onnx/handlers/backend/sub.py +++ b/tools/nntool/importer/onnx/handlers/backend/sub.py @@ -35,3 +35,7 @@ def version_7(cls, node, **kwargs): @classmethod def version_13(cls, node, **kwargs): return cls._common(node, **kwargs) + + @classmethod + def version_14(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/thresholded_relu.py b/tools/nntool/importer/onnx/handlers/backend/thresholded_relu.py new file mode 100644 index 000000000..b23f6f5f5 --- /dev/null +++ b/tools/nntool/importer/onnx/handlers/backend/thresholded_relu.py @@ -0,0 +1,40 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +# TODO - This is not mappable onto our current kernels. To add if needed by a customer + +# import numpy as np +# from graph.types.activations import ReluActivationParameters +# from importer.onnx.handlers.backend.math_mixin import BasicMathMixin + +# from ..backend_handler import BackendHandler +# from ..handler import onnx_op + + +# @onnx_op("ThresholdedRelu") +# class ThresholdedRelu(BasicMathMixin, BackendHandler): + +# @classmethod +# def _common(cls, node, **kwargs): +# alpha = node.attrs.get('alpha', 1.0) +# return super(ThresholdedRelu, cls)._common(node, +# params_class=ReluActivationParameters, +# constant_operation=lambda x: np.clip(x, alpha, np.inf), +# params_args={'lower_limit': alpha}, +# **kwargs) + +# @classmethod +# def version_10(cls, node, **kwargs): +# return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/unsqueeze.py b/tools/nntool/importer/onnx/handlers/backend/unsqueeze.py index 3314062eb..9cbc10f71 100644 --- a/tools/nntool/importer/onnx/handlers/backend/unsqueeze.py +++ b/tools/nntool/importer/onnx/handlers/backend/unsqueeze.py @@ -36,7 +36,7 @@ def _common(cls, node, **kwargs): out_rank = len(x_shape) + len(kwargs['axes']) axes = cls._resolve_negative_ranks(kwargs['axes'], out_rank) - old_shape = x_shape.copy() + old_shape = list(x_shape) new_shape = [1 if new_idx in axes else old_shape.pop(0) for new_idx in range(out_rank)] diff --git a/tools/nntool/importer/onnx/handlers/backend/upsample.py b/tools/nntool/importer/onnx/handlers/backend/upsample.py new file mode 100644 index 000000000..048a620c4 --- /dev/null +++ b/tools/nntool/importer/onnx/handlers/backend/upsample.py @@ -0,0 +1,117 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import numpy as np +from graph.dim import Dim +from graph.types import NNEdge, ReshapeParameters +from graph.types.constant_input import ConstantInputParameters +from graph.types.resizers import (BilinearResizerParameters, + NearestNeighborResizerParameters) +from importer.common.constant_mixin import ConstantMixin +from importer.common.provisional_dim import ProvisionalDim +from importer.onnx.common import logger +from pytest import param + +from ..backend_handler import BackendHandler +from ..handler import onnx_op + + +@onnx_op("Upsample") +class Upsample(ConstantMixin, BackendHandler): + + @classmethod + def _common(cls, node, inputs, scales, **kwargs): + all_nodes = kwargs['all_nodes'] + G = kwargs['G'] + valid_name = kwargs['valid_name'] + x = inputs[0] + x_shape = x[2].shape + x_rank = len(x_shape) + + mode = node.attrs.get('mode', 'nearest') + + spatial_size = x_rank - 2 + in_c = x_shape[1] + in_w = x_shape[-1] + sizes = [int(shape * scale) if shape is not None else None + for shape, scale in zip(x_shape, scales)] + + if np.prod([sz for sz in sizes if sz is not None]) == 0: + logger.warn(f'{valid_name} has null output shape') + params = ConstantInputParameters(valid_name, value=np.array([])) + all_nodes[node.output[0]] = (params, 0, ProvisionalDim([]), x[3]) + return params + + if spatial_size == 1: + sizes.insert(-1, 1) + + if spatial_size != 2 and spatial_size != 1: + raise ValueError('resize only supports 4D tensor in NCHW mode or 3D tensor in NCF mode' + f' - input shape is {x_shape} sizes is {sizes}') + + if not all(x_dim == size_dim for x_dim, size_dim in zip(x_shape[:2:], 
sizes[:2:])): + raise ValueError('resize only supports 4D tensor in NCHW mode or 3D tensor in NCF mode' + f' - input shape is {x_shape} sizes is {sizes}') + + params_class = BilinearResizerParameters if mode == 'linear' else NearestNeighborResizerParameters + + params = params_class(valid_name, + new_shape=tuple(sizes[2::]), + align_corners=False, + halfpixel_centers=False, + in_dims_hint=[['c', 'h', 'w']], + out_dims_hint=[['c', 'h', 'w']]) + + if spatial_size == 1: + r1_params = ReshapeParameters(f'{valid_name}_reshape2d', + old_shape=Dim.unnamed([in_c, in_w]), + shape=Dim.unnamed([in_c, 1, in_w])) + r2_params = ReshapeParameters(f'{valid_name}_reshape1d', + old_shape=Dim.unnamed( + [in_c, 1, sizes[-1]]), + shape=Dim.unnamed([in_c, sizes[-1]])) + G.add_edge( + NNEdge(from_node=x[0], to_node=r1_params, from_idx=x[1], to_idx=0)) + G.add_edge(NNEdge(from_node=r1_params, + to_node=params, from_idx=0, to_idx=0)) + G.add_edge(NNEdge(from_node=params, + to_node=r2_params, from_idx=0, to_idx=0)) + pout_dims = ProvisionalDim(sizes[:-2:] + sizes[-1::]) + params = r2_params + else: + pout_dims = ProvisionalDim(sizes) + G.add_edge( + NNEdge(from_node=x[0], to_node=params, from_idx=x[1], to_idx=0)) + + all_nodes[node.output[0]] = (params, 0, pout_dims, x[3]) + return params + + @classmethod + def version_7(cls, node, **kwargs): + all_nodes = kwargs['all_nodes'] + inputs = [all_nodes[inp] if inp else None for inp in node.input] + scales = node.attrs['scales'] + return cls._common(node, inputs, scales, **kwargs) + + @classmethod + def version_9(cls, node, **kwargs): + all_nodes = kwargs['all_nodes'] + inputs = [all_nodes[inp] if inp else None for inp in node.input] + scales = cls.get_constant(inputs[1]) + return cls._common(node, inputs, scales, **kwargs) + + @classmethod + def version_10(cls, node, **kwargs): + return cls.version_9(node, **kwargs) diff --git a/tools/nntool/importer/onnx/onnx.py b/tools/nntool/importer/onnx/onnx.py index a89f4d71f..13032d210 100644 --- 
a/tools/nntool/importer/onnx/onnx.py +++ b/tools/nntool/importer/onnx/onnx.py @@ -70,7 +70,8 @@ def create_graph(self, filename, opts) -> NNGraph: opset_import = model.opset_import G = NNGraph(filename=filename, name=opts.get('name')) - G, qrecs = self._import_onnx_model(G, model.graph, opset_import, opts) + G, qrecs, qopts = self._import_onnx_model( + G, model.graph, opset_import, opts) G.add_dimensions(quiet=True) if qrecs: propagate_qrecs(G, qrecs) @@ -78,6 +79,7 @@ def create_graph(self, filename, opts) -> NNGraph: qset.update(qrecs) qset.scheme_priority = ['SQ8'] qset.schemes_present = {'SQ8'} + qset.options = qopts G.quantization = qset try: quantizer = NewQuantizer(G) @@ -88,9 +90,10 @@ def create_graph(self, filename, opts) -> NNGraph: clean_dangling_nodes(G) MatchDuplicateConstants().match(G) + G.add_dimensions(quiet=True) return G - def _update_qrecs(self, G, qrecs, all_nodes, ranges_dict): + def _update_qrecs(self, G, qrecs, all_nodes, ranges_dict, qopts): for node, idx, _, qtype in all_nodes.values(): if qtype is None and node.name not in ranges_dict.keys(): continue @@ -107,8 +110,11 @@ def _update_qrecs(self, G, qrecs, all_nodes, ranges_dict): if node.name in ranges_dict.keys(): out_min, out_max = ranges_dict[node.name]["range"] dtype = ranges_dict[node.name].get("dtype", np.int8) - bits = ranges_dict[node.name].get("n_bits", 8) + bits = ranges_dict[node.name].get("bits", 8) channel = ranges_dict[node.name].get("per_channel", None) + qopt = qopts.setdefault( + nid, {'output_size': [None] * len(G.indexed_out_edges(node))}) + qopt['output_size'][idx] = bits qtype = QType.from_min_max_sq( out_min, out_max, dtype=dtype, bits=bits, quantized_dimension=channel) qrec.out_qs[idx] = qtype @@ -127,14 +133,16 @@ def _import_onnx_model(self, G, graph, opset, opts): input_shapes=opts.get('input_shapes', {})) all_nodes.update(inputs) qrecs = {} + qopts = {} outputs = self._get_output_nodes( G, graph.output, substitutions=opts.get('substitutions', None)) shapes = 
{elem.name: elem.type for elem in graph.value_info} self._import_nodes( G, graph, self._handlers, all_nodes, outputs, - opts=opts, qrecs=qrecs, shapes=shapes) - self._update_qrecs(G, qrecs, all_nodes, opts.get('ranges_dict', {})) - return G, qrecs + opts=opts, qrecs=qrecs, shapes=shapes, qopts=qopts) + self._update_qrecs(G, qrecs, all_nodes, + opts.get('ranges_dict', {}), qopts) + return G, qrecs, qopts def import_subgraph(self, G, graph, opts, all_nodes=None): if all_nodes is None: @@ -153,7 +161,7 @@ def import_subgraph(self, G, graph, opts, all_nodes=None): self._import_nodes( G, graph, self._handlers, all_nodes, outputs, opts=opts, qrecs=qrecs) - self._update_qrecs(G, qrecs, all_nodes, {}) + self._update_qrecs(G, qrecs, all_nodes, {}, {}) return G, qrecs @staticmethod @@ -331,9 +339,14 @@ def _import_nodes(self, G, graph, handlers, all_nodes, outputs, **kwargs): continue handler = handlers[node.domain].get( node.op_type, None) if node.domain in handlers else None - if not handler or (handler.CONSTANT_ONLY and - not all(isinstance(all_nodes[inp_name][0], ConstantInputParameters) - for inp_name in node.input)): + if (handler and handler.CONSTANT_ONLY and + not all(isinstance(all_nodes[inp_name][0], ConstantInputParameters) + for inp_name in node.input)): + logger.warning( + f'{node.name} uses ONNX operator "{node.op_type}" domain ' + f'"{node.domain}" which is not currently supported in the Autotiler kernels. 
' + 'It may be eliminated by graph optimisations') + if not handler: handler = handlers['__extensions'].get(node.op_type, None) if not handler: logger.warning( @@ -360,7 +373,7 @@ def _import_nodes(self, G, graph, handlers, all_nodes, outputs, **kwargs): x = inputs[0] x_shape = x[2].shape name = hasattr(node, 'name') and getattr(node, 'name') - x=0 + x = 0 params = handler.handle(onode, all_nodes=all_nodes, vars_dict=vars_dict, G=G, valid_name=self._node_name(node), used_tensors=used_tensors, importer=self, **kwargs) diff --git a/tools/nntool/interpreter/commands/dump.py b/tools/nntool/interpreter/commands/dump.py index ec05f2acc..d6fa2498a 100644 --- a/tools/nntool/interpreter/commands/dump.py +++ b/tools/nntool/interpreter/commands/dump.py @@ -83,6 +83,8 @@ class DumpCommand(NNToolShellBase): action='store_true', help='dequantize result') parser_dump.add_argument('--quantize_and_dequantize', action='store_true', help='quantize and dequantize float results') + parser_dump.add_argument('--append_fusion_output', + action='store_true', help='quantize and dequantize float results') parser_dump_group = parser_dump.add_mutually_exclusive_group( required=False) parser_dump_group.add_argument('-q', '--quantize', action='store_true', @@ -155,7 +157,7 @@ def do_dump(self, args: argparse.Namespace): qrecs = None if qmode.is_none else self.G.quantization executer = GraphExecuter(self.G, qrecs=qrecs) outputs = executer.execute(data, step_idx_limit=step, - qmode=qmode) + qmode=qmode, append_fusion_output=args.append_fusion_output) if args.pickle or self._in_py or args.save: pickles.append(outputs) diff --git a/tools/nntool/interpreter/commands/imageformat.py b/tools/nntool/interpreter/commands/imageformat.py index 35f7755d9..f0468dc01 100644 --- a/tools/nntool/interpreter/commands/imageformat.py +++ b/tools/nntool/interpreter/commands/imageformat.py @@ -23,7 +23,7 @@ from graph.types import ImageFormatParameters, NNEdge, TransposeParameters - +from graph.manipulations.formatter 
import insert_formatter, remove_formatter class ImageFormatCommand(NNToolShellBase): def inputs_choices(self): if self.G is None: @@ -74,103 +74,3 @@ def do_imageformat(self, args: argparse.Namespace): f'format {args.image_formatter} and normalization {args.image_normalizer}') -def insert_formatter(G, input_node, formatter, normalizer): - format_node = ImageFormatParameters(input_node.name + "_formatter", - norm_func=normalizer.upper(), - format_change=formatter.upper()) - out_edges = G.out_edges(input_node.name) - - # dims updated to reflect formatter - if format_node.output_channels is not None and format_node.input_channels is not None: - out_dim = input_node.get_output_size(None)[0] - if formatter.upper() in ("BW8", "BW16"): - assert format_node.input_channels == 1 - in_dim = out_dim.clone() - format_node.out_dims_hint = input_node.out_dims_hint - format_node.in_dims_hint = input_node.out_dims_hint - input_node.dims = in_dim - for out_edge in out_edges: - G.remove_edge(out_edge) - else: - if not out_dim.is_named or out_dim.c != format_node.output_channels: - raise ValueError( - "current graph input is not named or does not match formatter output channels") - if formatter.upper() in ("RGB16", "BW16") and normalizer.upper() != "OUT_INT16": - raise ValueError( - "rgb16 and bw16 formatters must have out_int16 as normalization function") - in_dim = out_dim.clone() - in_dim.c = format_node.input_channels - in_dim.impose_order(("h", "w", "c")) - format_node.in_dims_hint = [["h", "w", "c"]] - input_node.dims = in_dim - if input_node.fixed_order: - new_out_edges = [] - for out_edge in out_edges: - if isinstance(out_edge.to_node, TransposeParameters): - trans_node = out_edge.to_node - transpose_edges = G.out_edges(trans_node.name) - new_out_edges.extend(transpose_edges) - G.remove(trans_node) - if G.quantization: - nid = NodeId(trans_node) - if nid in G.quantization: - del G.quantization[NodeId(trans_node)] - else: - new_out_edges.append(out_edge) - out_edges = 
new_out_edges - else: - input_node.fixed_order = True - for out_edge in out_edges: - G.remove_edge(out_edge) - format_node.out_dims_hint = [["c", "h", "w"]] * len(out_edges) - input_node.out_dims_hint = [["h", "w", "c"]] - G.node_options[NodeId(input_node)] = input_node.at_options - # qrec updated to reflect formatter - input_qrec = G.quantization and G.quantization.get(NodeId(input_node)) - if input_qrec and format_node.input_dtype and format_node.output_dtype: - formatter_qrec = G.quantization.get(NodeId(format_node)) - if not formatter_qrec: - if input_qrec.out_qs[0].dtype != format_node.output_dtype: - raise ValueError( - "current graph input output quantization does not match formatter output") - formatter_qrec = deepcopy(input_qrec) - formatter_qrec.out_qs[0] = deepcopy(formatter_qrec.out_qs[0]) - if formatter_qrec.ktype.startswith('scaled'): - formatter_in_q = QType( - scale=1, zero_point=0, dtype=format_node.input_dtype) - elif formatter_qrec.ktype.startswith('symmetric'): - formatter_in_q = QType(q=0, dtype=format_node.input_dtype) - else: - raise NotImplementedError("quantization has unknown type") - if len(formatter_qrec.in_qs) > 0: - formatter_qrec.in_qs[0] = formatter_in_q - input_qrec.in_qs[0] = formatter_in_q - else: - formatter_qrec.in_qs.append(formatter_in_q) - input_qrec.in_qs.append(formatter_in_q) - input_qrec.out_qs[0] = formatter_in_q - G.quantization[NodeId(format_node)] = formatter_qrec - - G.add_node(format_node) - G.add_edge(NNEdge(input_node, format_node)) - for out_edge in out_edges: - G.add_edge(NNEdge(format_node, out_edge.to_node, to_idx=out_edge.to_idx)) - - -def remove_formatter(G, fmt_node): - input_edges = G.in_edges(fmt_node.name) - assert len(input_edges) == 1, "formatter node should only have one input" - input_node = input_edges[0].from_node - fmt_edges = G.out_edges(fmt_node.name) - fmt_qrec = G.quantization and G.quantization.get(NodeId(fmt_node)) - G.remove(fmt_node) - - input_node.dims = fmt_node.out_dims[0] - 
input_node.out_dims_hint = fmt_node.out_dims_hint - for fmt_edge in fmt_edges: - G.add_edge(NNEdge(input_node, fmt_edge.to_node, to_idx=fmt_edge.to_idx)) - if fmt_qrec: - input_qrec = G.quantization[NodeId(input_node)] - input_qrec.out_qs = fmt_qrec.out_qs - input_qrec.in_qs = fmt_qrec.out_qs - G.quantization.remove_node(fmt_node) diff --git a/tools/nntool/interpreter/commands/qtune.py b/tools/nntool/interpreter/commands/qtune.py index e7b60a90f..f687fc392 100644 --- a/tools/nntool/interpreter/commands/qtune.py +++ b/tools/nntool/interpreter/commands/qtune.py @@ -110,7 +110,7 @@ def reduction(state, x): options = reduce(reduction, args.step, options) quantizer = NewQuantizer(self.G) - quantizer.options.update(options) + quantizer.update_options(options) quantizer.quantize() self.pfeedback('quantization options set') diff --git a/tools/nntool/interpreter/commands/remove.py b/tools/nntool/interpreter/commands/remove.py index 3283f59c8..17f51d84a 100644 --- a/tools/nntool/interpreter/commands/remove.py +++ b/tools/nntool/interpreter/commands/remove.py @@ -15,12 +15,15 @@ import argparse from functools import reduce +from itertools import chain, groupby from cmd2 import Cmd2ArgumentParser, with_argparser from interpreter.nntool_shell_base import NNToolShellBase from graph.types import ReshapeParameters, InputParameters, OutputParameters, ConstantInputParameters from graph.types.base import NNEdge +from quantization.new_qrec import QRec +from utils.node_id import NodeId class RemoveCommand(NNToolShellBase): @@ -49,27 +52,32 @@ def nodes_choices(self): def do_remove(self, args: argparse.Namespace): """Removes all the edges and nodes between two node. 
Will only work if nodes do not affect shape of tensor.""" self._check_graph() - if any(node not in self.G for node in args.nodes): - self.perror("node not found in graph") - return + for node in args.nodes: + if node not in self.G: + self.perror(f"node {node} not found in graph") + return node_from = self.G[args.nodes[0]] if len(args.nodes) == 1: if args.up: nodes_above = set(self.G.nodes_above(node_from)) if args.leave: remove_nodes = nodes_above - inputs_on = [] - dims = node_from.in_dims + # remove constant inputs on the node left as targets for removal for in_edge in self.G.indexed_in_edges(node_from): if isinstance(in_edge.from_node, ConstantInputParameters): nodes_above.remove(in_edge.from_node) - else: - inputs_on.append([in_edge]) else: - dims = node_from.out_dims remove_nodes = nodes_above | {node_from} - inputs_on = self.G.indexed_out_edges(node_from) - + # check for deleted nodes that have edges to left nodes. These need to be the new inputs. + # group them by source so common edges have one input + inputs_on = [ + list(edges) for _, edges in + groupby( + [edge for node in remove_nodes for edge in self.G.out_edges(node) + if edge.to_node not in remove_nodes], + key=lambda x: (x.from_node, x.from_idx))] + dims = [edges[0].to_node.in_dims[edges[0].to_idx] + for edges in inputs_on] input_names = sorted( [node.name for node in remove_nodes if isinstance(node, InputParameters)]) self.G.remove_all(remove_nodes) @@ -82,6 +90,13 @@ def do_remove(self, args: argparse.Namespace): self.G.add_edge(NNEdge(from_node=in_node, to_idx=edge.to_idx, to_node=edge.to_node)) + if self.G.quantization and edge_group: + edge = edge_group[0] + fnid = NodeId(edge.to_node) + if fnid in self.G.quantization: + qrec = self.G.quantization[fnid] + self.G.quantization[NodeId(in_node)] = QRec.copy_ktype( + qrec, out_qs=[qrec.in_qs[edge.to_idx]]) else: nodes_below = set(self.G.nodes_below(node_from)) if self.G.is_vertex_cut(nodes_below): @@ -107,6 +122,12 @@ def do_remove(self, args: 
argparse.Namespace): self.pfeedback(f'adding output {out_node.name}') self.G.add_edge(NNEdge(from_node=edge.from_node, from_idx=edge.from_idx, to_node=out_node)) + if self.G.quantization: + fnid = NodeId(edge.from_node) + if fnid in self.G.quantization: + qrec = self.G.quantization[fnid] + self.G.quantization[NodeId(out_node)] = QRec.copy_ktype( + qrec, in_qs=[qrec.out_qs[edge.from_idx]]) else: node_to = self.G[args.nodes[1]] nodes_between = self.G.nodes_between(node_from, node_to) @@ -121,7 +142,8 @@ def do_remove(self, args: argparse.Namespace): edges_from = set(self.G.out_edges(node_from)) edges_to = set(self.G.in_edges(node_to.name)) - between_edges = reduce(lambda s, x: s|set(self.G.edges(x)), nodes_between, set()) + between_edges = reduce(lambda s, x: s | set( + self.G.edges(x)), nodes_between, set()) edges_from = edges_from.intersection(between_edges) edges_to = edges_to.intersection(between_edges) if len(edges_from) != len(edges_to): diff --git a/tools/nntool/interpreter/commands/tflite.py b/tools/nntool/interpreter/commands/tflite.py index 71babab37..b8a729a11 100644 --- a/tools/nntool/interpreter/commands/tflite.py +++ b/tools/nntool/interpreter/commands/tflite.py @@ -15,14 +15,19 @@ from interpreter.nntool_shell_base import NNToolShellBase from importer.tflite2.common.handler_helper import get_backend_coverage, get_backend_partial_support_detail - +import texttable class HelpTFLiteCommand(NNToolShellBase): def help_tflite(self): ops_dict = get_backend_coverage()[0] bc_dict = get_backend_partial_support_detail() self.pfeedback("Supported operators and versions") + + table = texttable.Texttable() + table.set_cols_align(['l', 'l', 'l']) + table.set_max_width(120) + table.set_cols_width([30, 15, 60]) for op in ops_dict: - self.pfeedback("%s (%s)"%(op, ",".join(str(ver) for ver in ops_dict[op]))) - if op in bc_dict: - self.pfeedback(bc_dict[op]) + table.add_row([op, ",".join(str(ver) for ver in ops_dict[op]), bc_dict.get(op, "")]) + self.pfeedback("Supported 
operators and versions") + self.pfeedback(table.draw()+'\n') \ No newline at end of file diff --git a/tools/nntool/quantization/float/float_quantization_handler.py b/tools/nntool/quantization/float/float_quantization_handler.py index 0fe9824be..776054095 100644 --- a/tools/nntool/quantization/float/float_quantization_handler.py +++ b/tools/nntool/quantization/float/float_quantization_handler.py @@ -65,3 +65,10 @@ def _get_in_qs_from_stats(cls, params, stats, in_qs, **kwargs): for idx, dim in enumerate(params.in_dims)] return [QType(dtype=dtype) if dim is not None else None for idx, dim in enumerate(params.in_dims)] + + @classmethod + def get_min_max(cls, stats, idx=0, direction='out'): + if stats: + return (stats[f'range_{direction}'][idx]['min'], + stats[f'range_{direction}'][idx]['max']) + return None, None diff --git a/tools/nntool/quantization/float/quantizers/expression_fusion_float.py b/tools/nntool/quantization/float/quantizers/expression_fusion_float.py index ee45039e5..62fe5989e 100644 --- a/tools/nntool/quantization/float/quantizers/expression_fusion_float.py +++ b/tools/nntool/quantization/float/quantizers/expression_fusion_float.py @@ -30,8 +30,8 @@ # Fusion handler attribute not set since expressions are handled only by this handler @params_type(ExpressionFusionParameters) -@in_qs_constraint(MatchAll({'dtype': set([np.float32, np.float16, bfloat16])})) -@out_qs_constraint(MatchAll({'dtype': set([np.float32, np.float16, bfloat16])})) +@in_qs_constraint(MatchAll({'dtype': set([np.float32, np.float16, bfloat16, np.uint16, np.int16, np.uint8, np.int8])})) +@out_qs_constraint(MatchAll({'dtype': set([np.float32, np.float16, bfloat16, np.uint16, np.int16, np.uint8, np.int8])})) class ExpressionFusionFloat(FloatQuantizionHandler): @classmethod def _quantize(cls, params, in_qs, stats, **kwargs): diff --git a/tools/nntool/quantization/float/quantizers/input_float.py b/tools/nntool/quantization/float/quantizers/input_float.py new file mode 100644 index 
000000000..79c064ad9 --- /dev/null +++ b/tools/nntool/quantization/float/quantizers/input_float.py @@ -0,0 +1,54 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + + +from copy import deepcopy + +import numpy as np +from bfloat16 import bfloat16 +from graph.types import OutputParameters +from graph.types.input_output import InputParameters +from quantization.float.float_quantization_handler import \ + FloatQuantizionHandler +from quantization.new_qrec import QRec +from quantization.qtype import QType +from quantization.qtype_constraint import MatchAll +from quantization.quantizer_options import QTYPE_IND_OPTION +from quantization.unified_quantization_handler import (options, + out_qs_constraint, + params_type) + + +@params_type(InputParameters) +@out_qs_constraint(MatchAll({'dtype': set([np.float32, np.float16, bfloat16])})) +@options(QTYPE_IND_OPTION) +class FloatInput(FloatQuantizionHandler): + @classmethod + def _quantize(cls, params, in_qs, stats, **kwargs): + force_out_qs, dtype = cls.get_float_opts(**kwargs) + force_out_q = force_out_qs and force_out_qs[0] + opts = kwargs['opts'] + i_q_ind = opts.get('qtype_ind') + if force_out_q: + if force_out_q.dtype != dtype: + return None + i_q = deepcopy(force_out_q) + elif i_q_ind: + i_q = deepcopy(i_q_ind) + else: + min_val, max_val = cls.get_min_max(stats) + i_q = QType(dtype=dtype, 
min_val=min_val, max_val=max_val) + return QRec.float(out_qs=[i_q], + float_dtype=i_q.dtype) diff --git a/tools/nntool/quantization/float/quantizers/output_float.py b/tools/nntool/quantization/float/quantizers/output_float.py index 57e2daba0..2d506f890 100644 --- a/tools/nntool/quantization/float/quantizers/output_float.py +++ b/tools/nntool/quantization/float/quantizers/output_float.py @@ -14,6 +14,8 @@ # along with this program. If not, see . +from copy import deepcopy + import numpy as np from bfloat16 import bfloat16 from graph.types import OutputParameters @@ -22,7 +24,9 @@ from quantization.new_qrec import QRec from quantization.qtype import QType from quantization.qtype_constraint import MatchAll +from quantization.quantizer_options import QTYPE_IND_OPTION from quantization.unified_quantization_handler import (in_qs_constraint, + options, out_qs_constraint, params_type) @@ -30,12 +34,20 @@ @params_type(OutputParameters) @in_qs_constraint(MatchAll({'dtype': set([np.float32, np.float16, bfloat16])})) @out_qs_constraint(MatchAll({'dtype': set([np.float32, np.float16, bfloat16])})) +@options(QTYPE_IND_OPTION) class FloatOutput(FloatQuantizionHandler): @classmethod def _quantize(cls, params, in_qs, stats, **kwargs): force_out_qs, dtype = cls.get_float_opts(**kwargs) if force_out_qs and any(qtype.dtype != dtype for qtype in force_out_qs if qtype is not None): return None - return QRec.float(in_qs=[QType(dtype=dtype)], - out_qs=[QType(dtype=dtype)], - float_dtype=dtype) + opts = kwargs['opts'] + o_q_ind = opts.get('qtype_ind') + if o_q_ind: + o_q = deepcopy(o_q_ind) + else: + min_val, max_val = cls.get_min_max(stats, direction='in') + o_q = QType(dtype=dtype, min_val=min_val, max_val=max_val) + return QRec.float(in_qs=[o_q], + out_qs=[o_q], + float_dtype=o_q.dtype) diff --git a/tools/nntool/quantization/multiplicative/quantizers/activation_mult.py b/tools/nntool/quantization/multiplicative/quantizers/activation_mult.py index 56c24ed63..0467048c0 100644 --- 
a/tools/nntool/quantization/multiplicative/quantizers/activation_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/activation_mult.py @@ -31,7 +31,7 @@ from quantization.qtype import QType from quantization.unified_quantization_handler import (in_qs_constraint, out_qs_constraint,option_constraint, - params_type, options) + params_type, options, priority) from ..mult_quantization_handler import MultQuantizionHandler from quantization.quantizer_options import * @@ -104,6 +104,11 @@ def _quantize_sw(cls, params, in_qs, stats, in_dtype, out_dtype, out_asym, **kwa 8, dtype=in_dtype, forced=True) + elif in_dtype in [np.uint8, np.uint16]: + in_q = QType( + dtype=in_dtype, + scale=pow(2, -12), + zero_point=1<<(8 if in_dtype == np.uint8 else 16)) else: in_q = QType( dtype=in_dtype, @@ -133,7 +138,7 @@ def _quantize_sw(cls, params, in_qs, stats, in_dtype, out_dtype, out_asym, **kwa o_q = QType.from_min_max_sq(0, max_val, dtype=out_dtype, - asymmetric=(in_q.zero_point != 0)) + asymmetric=(in_q.zero_point != 0) or out_dtype in [np.uint8, np.uint16]) in_q = deepcopy(o_q) elif isinstance(params, TanHActivationParameters): o_q = QType.from_min_max_sq( @@ -225,6 +230,7 @@ def _quantize(cls, params, in_qs, stats, **kwargs): @in_qs_constraint({'dtype': {np.int8, np.int16, np.int32}}) @out_qs_constraint({'dtype': np.uint8}) @option_constraint(force_output_size={8, None}) +@priority(2) class ActivationMultSW_HSwish_I_U8(ActivationMultSWBase): @classmethod def _get_in_qs_from_stats(cls, params, stats, in_qs, **kwargs): diff --git a/tools/nntool/quantization/multiplicative/quantizers/add_sub_mult.py b/tools/nntool/quantization/multiplicative/quantizers/add_sub_mult.py index 68c39acd3..8f9d5832b 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/add_sub_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/add_sub_mult.py @@ -83,7 +83,7 @@ def _quantize_sw(cls, params, in_qs, stats, inout_dtype, asym=False, **kwargs): scale_in_mul_biases_q.scale 
= qrec.in_qs[scaled_idx].scale / \ qrec.in_qs[not_scaled_idx].scale - if qrec.in_qs[0].asymmetric: + if qrec.in_qs[0].zero_point or qrec.in_qs[1].zero_point or qrec.out_qs[0].zero_point: # (C - Zc)*Sc = (A - Za)*Sa + (B - Zb)*Sb = # C = Sa/Sc*(A + B*Sb/Sa - Za - Zb*Sb/Sa) + Zc = # = Sa/Sc*(A + B*Sb/Sa) + (Zc - Sa/Sc*(Za + Zb*Sb/Sa)) diff --git a/tools/nntool/quantization/multiplicative/quantizers/constant_input_mult.py b/tools/nntool/quantization/multiplicative/quantizers/constant_input_mult.py index 0ad4f0e97..467bea16b 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/constant_input_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/constant_input_mult.py @@ -17,15 +17,18 @@ from graph.types import ConstantInputParameters from quantization.new_qrec import QRec -from quantization.qtype import QType -from quantization.quantizer_options import QTYPE_IND_OPTION +from quantization.qtype import QType, DTYPES +from quantization.quantizer_options import QTYPE_IND_OPTION, OUTPUT_SIZE_OPTION from quantization.unified_quantization_handler import (needs_stats, options, params_type) from ..mult_quantization_handler import MultQuantizionHandler -@options(QTYPE_IND_OPTION) +@options( + QTYPE_IND_OPTION, + OUTPUT_SIZE_OPTION +) @params_type(ConstantInputParameters) @needs_stats(False) class ConstantInputMult(MultQuantizionHandler): @@ -42,8 +45,12 @@ def _quantize(cls, params, in_qs, stats, **kwargs): # derive quantization from statistics else: opts = kwargs.get('opts', {}) - o_q = opts.get('qtype_ind') - if not o_q: - o_q = QType.from_array_sq(params.value, dtype=out_dtype) + output_size = opts.get('output_size') + if output_size and output_size[0]: + cur_bits = DTYPES[out_dtype][0] + bits = min(output_size[0], cur_bits) + else: + bits = None + o_q = QType.from_array_sq(params.value, dtype=out_dtype, bits=bits) o_q.is_constant = True return QRec.scaled(in_qs=[o_q], out_qs=[o_q]) diff --git 
a/tools/nntool/quantization/multiplicative/quantizers/expression_fusion_mult.py b/tools/nntool/quantization/multiplicative/quantizers/expression_fusion_mult.py index cd4be879e..6e4b47eed 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/expression_fusion_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/expression_fusion_mult.py @@ -26,14 +26,14 @@ from quantization.unified_quantization_handler import (in_qs_constraint, out_qs_constraint, params_type) - +from bfloat16 import bfloat16 from ..mult_quantization_handler import MultQuantizionHandler LOG = logging.getLogger('nntool.' + __name__) @params_type(ExpressionFusionParameters) -@in_qs_constraint(MatchAll({'dtype': {np.int8, np.uint8, np.int16, np.uint16}})) -@out_qs_constraint(MatchAll({'dtype': {np.int8, np.uint8, np.int16, np.uint16}})) +@in_qs_constraint(MatchAll({'dtype': {np.int8, np.uint8, np.int16, np.uint16, np.float16, bfloat16}})) +@out_qs_constraint(MatchAll({'dtype': {np.int8, np.uint8, np.int16, np.uint16, np.float16, bfloat16}})) class ExpressionFusionMult(MultQuantizionHandler): @classmethod def _quantize(cls, params, in_qs, stats, **kwargs): diff --git a/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py b/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py index 57f1fb857..9b49b245c 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py @@ -311,22 +311,26 @@ def _quantize_ne16(cls, params, in_qs, stats, input_dtype, **kwargs): G = kwargs['G'] weights_node = cls.get_weights_node(G, fusion if fusion else params) min_val, max_val = None, None + wbits = (min(in_qs[1].bits, opts['weight_bits']) + if 'weight_bits' not in opts['set_on_node'] else opts['weight_bits']) weights_q = QType.from_array_sq(arr=weights_node.dqvalue, quantized_dimension=cls.get_quantized_dimension( params, opts), dtype=np.uint8, narrow_range=opts['narrow_weights'], - 
bit_pack=opts['weight_bits'], + bit_pack=wbits, no_compression=True, - bits=opts['weight_bits']) + bits=wbits) in_q = in_qs[0] - in_q = limit_input_precision( - params, input_bits, in_q, params.filter.sz, - opts['narrow_weights'], opts['weight_bits'], - opts.get('max_precision_limit', MAX_PRECISION_LIMIT_OPTION['default']), - out_ranges=stats.get('range_out'), - w_qs=[weights_q]) + if input_bits > 8: + in_q = limit_input_precision( + params, input_bits, in_q, params.filter.sz, + opts['narrow_weights'], wbits, + opts.get('max_precision_limit', + MAX_PRECISION_LIMIT_OPTION['default']), + out_ranges=stats.get('range_out'), + w_qs=[weights_q]) assert in_q.dtype == input_dtype diff --git a/tools/nntool/quantization/multiplicative/quantizers/global_pooling_mult.py b/tools/nntool/quantization/multiplicative/quantizers/global_pooling_mult.py index d32c2009b..38fb1492f 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/global_pooling_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/global_pooling_mult.py @@ -68,12 +68,16 @@ def _quantize(cls, params, in_qs, stats, **kwargs): params.name, o_q.min, o_q.max, "asymmetric" if o_q.asymmetric else "symmetric") elif isinstance(params, GlobalAveragePoolParameters) or isinstance(params, GlobalSumPoolParameters): cls.check_valid_ranges(params, stats, idx=0, dirs='in') + in_qs = cls.force_symmetric(in_qs) + if in_qs is None: + return None + in_q = in_qs[0] # scaling needs to be based on stats and zero point o_q = QType.from_min_max_sq(stats['range_out'][0]['min'], stats['range_out'][0]['max'], dtype=out_dtype, - asymmetric=(stats['range_out'][0]['min'] == 0 and in_q.zero_point == -128)) + asymmetric=False) else: o_q = deepcopy(in_q) diff --git a/tools/nntool/quantization/multiplicative/quantizers/input_mult.py b/tools/nntool/quantization/multiplicative/quantizers/input_mult.py index dddbc5893..9a35a16ce 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/input_mult.py +++ 
b/tools/nntool/quantization/multiplicative/quantizers/input_mult.py @@ -19,7 +19,7 @@ from graph.types import InputParameters from quantization.new_qrec import QRec from quantization.qtype import QType -from quantization.quantizer_options import ALLOW_ASYMMETRIC_OPTION +from quantization.quantizer_options import ALLOW_ASYMMETRIC_OPTION, QTYPE_IND_OPTION from quantization.unified_quantization_handler import (options, out_qs_constraint, params_type) @@ -28,7 +28,8 @@ @options( - ALLOW_ASYMMETRIC_OPTION + ALLOW_ASYMMETRIC_OPTION, + QTYPE_IND_OPTION ) @params_type(InputParameters) @out_qs_constraint({'dtype': set([np.int8, np.uint8, np.int16, np.uint16])}) @@ -40,8 +41,11 @@ def _quantize(cls, params, in_qs, stats, **kwargs): out_dtype = in_qs[0].dtype force_out_q = force_out_qs and force_out_qs[0] opts = kwargs['opts'] + o_q_ind = opts.get('qtype_ind') if force_out_q: o_q = deepcopy(force_out_q) + elif o_q_ind: + o_q = deepcopy(o_q_ind) else: cls.check_valid_ranges(params, stats, idx=0, dirs='out') o_q = QType.from_min_max_sq(stats['range_out'][0]['min'], diff --git a/tools/nntool/quantization/multiplicative/quantizers/matmult_mult.py b/tools/nntool/quantization/multiplicative/quantizers/matmult_mult.py index a09de7233..8b400b01d 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/matmult_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/matmult_mult.py @@ -22,6 +22,7 @@ from graph.types.activations import (HSwishActivationParameters, TanHActivationParameters) from graph.types.base import NNEdge +from graph.types.tensor_arithmetic import MatMulTransposedParameters from quantization.multiplicative.quantizers.filter_mult import \ check_filter_options from quantization.multiplicative.quantizers.rnn_mult_ne16 import ( @@ -150,7 +151,8 @@ def _quantize(cls, params, in_qs, stats, **kwargs): kwargs['graph_update']['requires_adjust'] = True in_q2 = QType.from_array_sq( arr=in2_node.dqvalue, - quantized_dimension=len(in2_node.dqvalue.shape) - 2, + 
quantized_dimension=(len(in2_node.dqvalue.shape) - + (2 if isinstance(params, MatMulTransposedParameters) else 1)), dtype=np.int8, narrow_range=True, bits=8) @@ -235,7 +237,7 @@ def _quantize_ne16(cls, params, in_qs, stats, input_dtype, **kwargs): in_q1 = limit_input_precision( params, input_bits, in_q1, w1, False, opts['weight_bits'], opts.get('max_precision_limit', - MAX_PRECISION_LIMIT_OPTION['default']), + MAX_PRECISION_LIMIT_OPTION['default']), out_ranges=stats.get('range_out'), w_qs=[in_q2]) diff --git a/tools/nntool/quantization/multiplicative/quantizers/output_mult.py b/tools/nntool/quantization/multiplicative/quantizers/output_mult.py index 47a9c2367..4b1a0f67d 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/output_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/output_mult.py @@ -18,7 +18,8 @@ import numpy as np from graph.types import OutputParameters from quantization.new_qrec import QRec -from quantization.unified_quantization_handler import (out_qs_constraint, +from quantization.quantizer_options import QTYPE_IND_OPTION +from quantization.unified_quantization_handler import (out_qs_constraint, options, params_type, needs_stats) from ..mult_quantization_handler import MultQuantizionHandler @@ -27,7 +28,14 @@ @params_type(OutputParameters) @out_qs_constraint({'dtype': set([np.int8, np.uint8, np.int16])}) @needs_stats(False) +@options(QTYPE_IND_OPTION) class OutputMult(MultQuantizionHandler): @classmethod def _quantize(cls, params, in_qs, stats, **kwargs): - return QRec.scaled(in_qs=deepcopy(in_qs), out_qs=deepcopy(in_qs)) + opts = kwargs['opts'] + in_q_ind = opts.get('qtype_ind') + if in_q_ind: + in_q = deepcopy(in_q_ind) + else: + in_q = deepcopy(in_qs[0]) + return QRec.scaled(in_qs=[in_q], out_qs=[in_q]) diff --git a/tools/nntool/quantization/multiplicative/quantizers/softmax_tanh_mult.py b/tools/nntool/quantization/multiplicative/quantizers/softmax_tanh_mult.py index b2fbff55b..16ee1aad0 100644 --- 
a/tools/nntool/quantization/multiplicative/quantizers/softmax_tanh_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/softmax_tanh_mult.py @@ -18,7 +18,7 @@ from graph.types.activations import HTanHActivationParameters from quantization.new_qrec import QRec from quantization.qtype import QType -from quantization.quantizer_options import SOFTMAX_OUT_8BITS_OPTION +from quantization.quantizer_options import SOFTMAX_OUT_8BITS_OPTION, OUTPUT_SIZE_OPTION from quantization.unified_quantization_handler import (in_qs_constraint, out_qs_constraint, params_type, options) @@ -39,7 +39,7 @@ def _quantize(cls, params, in_qs, stats, **kwargs): force_out_q = force_out_qs and force_out_qs[0] opts = kwargs['opts'] if force_out_q: - if force_out_q.forced_scale or force_out_q.forced_zero_point: + if force_out_q.forced_scale or (force_out_q.forced_zero_point and not np.all(in_qs[0].zero_point == 0)): return None if in_qs[0].dtype == np.int8: dtypes = [np.int8, np.int16] diff --git a/tools/nntool/quantization/new_qrec.py b/tools/nntool/quantization/new_qrec.py index 7438a5f69..4f0e6badb 100644 --- a/tools/nntool/quantization/new_qrec.py +++ b/tools/nntool/quantization/new_qrec.py @@ -158,7 +158,7 @@ def get_outputs(self, del params if ktype == "symmetric": if self._auto_dequantize_outputs: - return [self.out_qs[idx].dequantize(output_tensor) for idx, output_tensor in enumerate(output_tensors)] + return [self.out_qs[idx].dequantize(x) for idx, x in enumerate(output_tensors)] output_tensors = [self.out_qs[idx].clip(output_tensor) for idx, output_tensor in enumerate(output_tensors)] return output_tensors diff --git a/tools/nntool/quantization/qtype.py b/tools/nntool/quantization/qtype.py index 7d409ba36..45324622d 100644 --- a/tools/nntool/quantization/qtype.py +++ b/tools/nntool/quantization/qtype.py @@ -658,7 +658,9 @@ def calculate_scale(rmin, rmax, qmin, qmax, dtype, asymmetric=False, scale = np.maximum( divide_ignore(rpos_range, qpos_range), divide_ignore(rneg_range, 
qneg_range)) - return np.atleast_1d(scale), np.atleast_1d(zero_point) + scale = np.atleast_1d(scale) + scale[scale == 0] = 1 + return scale, np.atleast_1d(zero_point) elif asymmetric: if narrow_range: raise ValueError( @@ -705,7 +707,9 @@ def calculate_scale(rmin, rmax, qmin, qmax, dtype, asymmetric=False, nudged_zero_point = qmax else: nudged_zero_point = np.round(zero_point).astype(dtype) - return np.atleast_1d(scale), np.atleast_1d(nudged_zero_point) + scale = np.atleast_1d(scale) + scale[scale == 0] = 1 + return scale, np.atleast_1d(nudged_zero_point) else: scale = QType.calculate_symmetric_scales( qrange, rmin, rmax, narrow_range=narrow_range) @@ -718,7 +722,9 @@ def calculate_scale(rmin, rmax, qmin, qmax, dtype, asymmetric=False, else: zero_point = np.atleast_1d( np.ceil(qrange/2) + qmin).astype(dtype) - return np.atleast_1d(scale), zero_point + scale = np.atleast_1d(scale) + scale[scale == 0] = 1 + return scale, zero_point def recalculate_scale(self, min_val, max_val, narrow_range=None): if narrow_range is None: diff --git a/tools/nntool/quantization/quantizer/new_quantizer.py b/tools/nntool/quantization/quantizer/new_quantizer.py index f34a497c0..51a353c09 100644 --- a/tools/nntool/quantization/quantizer/new_quantizer.py +++ b/tools/nntool/quantization/quantizer/new_quantizer.py @@ -96,6 +96,14 @@ def options(self, val): def set_options(self, **kwargs): self._options.update(kwargs) + def update_options(self, new_options): + for k, v in new_options.items(): + old_v = self._options.get(k, None) + if isinstance(old_v, dict) and isinstance(v, dict): + old_v.update(v) + else: + self._options[k] = v + @property def schemes(self): return self._schemes @@ -369,6 +377,7 @@ def get_options(self, nid, handler=None): node_options = self._options.get(nid, {}) opts.update({k: v for k, v in node_options.items() if k in opts}) + opts['set_on_node'] = list(node_options.keys()) else: opts = {k: v for k, v in self._options.items() if not isinstance(k, NodeId)} diff --git 
a/tools/nntool/quantization/quantizer_options.py b/tools/nntool/quantization/quantizer_options.py index 48f114c18..59e56015f 100644 --- a/tools/nntool/quantization/quantizer_options.py +++ b/tools/nntool/quantization/quantizer_options.py @@ -89,6 +89,12 @@ 'default': 8 } +OUTPUT_SIZE_OPTION = { + 'name': 'output_size', + 'type': None, + 'default': None +} + FORCE_EXTERNAL_SIZE_OPTION = { 'name': 'force_external_size', 'type': int, diff --git a/tools/nntool/reports/draw_graph_reporter.py b/tools/nntool/reports/draw_graph_reporter.py index da7b40b5c..812299e88 100644 --- a/tools/nntool/reports/draw_graph_reporter.py +++ b/tools/nntool/reports/draw_graph_reporter.py @@ -380,7 +380,7 @@ def report_expression(self, dot: Digraph, G: NNGraph, 'labelloc': 't', 'labeljust': 'l'}, node_attr={'style': 'solid(dashed)'}) as sub: - for var, func in func_col.functions.items(): + for var, func in func_col: node_id, shape = self.report_symbol( sub, func, intermediates, anonymise=anonymise) var_name = self.get_next('Var') if anonymise else var.name diff --git a/tools/nntool/reports/quantization_reporter.py b/tools/nntool/reports/quantization_reporter.py index 8b8673fc6..09ce00a52 100644 --- a/tools/nntool/reports/quantization_reporter.py +++ b/tools/nntool/reports/quantization_reporter.py @@ -14,6 +14,7 @@ # along with this program. If not, see . 
from graph.types import ConstantInputParameters +from graph.types.activations import ActivationParameters from graph.types.base import FilterParameters from utils.node_id import NodeId from utils.tabular import Tabular, TabularColumn @@ -92,6 +93,8 @@ def report(self, G, stats, nodes=None): row.append(self.emit_qs([qrec.cache[key]])) else: row.append("") + elif "scale_mul_biases_q" in qrec.cache: + row += ["", "", self.emit_qs([qrec.cache["scale_mul_biases_q"]]), "", ""] else: row += ["", "", "", "", ""] else: diff --git a/tools/nntool/utils/node_id.py b/tools/nntool/utils/node_id.py index 2703db6b5..b44e6105e 100644 --- a/tools/nntool/utils/node_id.py +++ b/tools/nntool/utils/node_id.py @@ -25,6 +25,12 @@ def __init__(self, node, fnode=None): fnode_name = None if fnode is None else fnode if isinstance(fnode, str) else fnode.name self._id = [node.name, "" if fnode is None else fnode_name] + @property + def key(self): + if self._id[1]: + return self._id + return self._id[0] + @property def id(self): return self._id diff --git a/tools/nntool/utils/random_iter.py b/tools/nntool/utils/random_iter.py new file mode 100644 index 000000000..8007aeade --- /dev/null +++ b/tools/nntool/utils/random_iter.py @@ -0,0 +1,42 @@ +# Copyright (C) 2022 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +import numpy as np + + +class RandomIter(): + def __init__(self, count, shapes, ranges, gen=None) -> None: + self._vars = list(zip(shapes, ranges)) + self._gen = gen + self._count = count + self._cur = count + if self._gen is None: + self._gen = np.random.default_rng() + + def __iter__(self): + self._cur = self._count + return self + + def __next__(self): + if self._cur <= 0: + raise StopIteration() + self._cur -= 1 + return self.val() + + def val(self): + res = [] + for shape, (minv, maxv) in self._vars: + res.append((self._gen.random(shape) * (maxv - minv)) + minv) + return res diff --git a/utils/gaptest/gaptest b/utils/gaptest/gaptest index e6d506785..ce077a9fa 100755 --- a/utils/gaptest/gaptest +++ b/utils/gaptest/gaptest @@ -77,6 +77,7 @@ sub exc_cmd_make { my $os = shift; my $platform = shift; my $flags = shift; + my $compile_only = shift; my $tags = shift; my $pre = shift; my $post = shift; @@ -85,18 +86,24 @@ sub exc_cmd_make { my $exec_dir = shift; my $res = 0; my $make_path = $basedir."/".$exec_dir; - system ("make -C ".$make_path." PMSIS_OS=".$os." build_dir_ext=".$target_name." clean"); + system ("make -C ".$make_path." ".$flags." PMSIS_OS=".$os." build_dir_ext=".$target_name." clean"); if($pre == 1) { `make prepare`; } + my $targets = "all"; + if (not $compile_only) + { + $targets = "${targets} run"; + } + chdir $exec_dir; - say "make -C ".$make_path." ".$flags." PMSIS_OS=".$os." platform=".$platform." build_dir_ext=".$target_name." all run"; my ($seconds_before, $seconds_after); timeout $timeout => sub { $seconds_before = time(); - $res = system ("make -C ".$make_path." ".$flags." PMSIS_OS=".$os." platform=".$platform." build_dir_ext=".$target_name." all run"); + say "make -C ".$make_path." ".$flags." PMSIS_OS=".$os." platform=".$platform." build_dir_ext=".$target_name." $targets"; + $res = system ("make -C ".$make_path." ".$flags." PMSIS_OS=".$os." platform=".$platform." build_dir_ext=".$target_name." 
$targets"); $seconds_after = time(); $res = $res >>=8; say $target_name." : Result is: ".$res; @@ -122,6 +129,7 @@ sub exc_cmd_cmake { my $os = shift; my $platform = shift; my $flags = shift; + my $compile_only = shift; my $tags = shift; my $pre = shift; my $post = shift; @@ -150,15 +158,19 @@ sub exc_cmd_cmake { say "CMAKE_GENERATOR=Ninja cmake -S $make_path -B $make_path/build".-$target_name." -DCONFIG_GAP_SDK_HOME=$sdk_root_path -DCMAKE_MODULE_PATH=$sdk_root_path/utils/cmake -DCONFIG_CHIP=$chip -DCONFIG_CHIP_VERSION=$chip_version -DBOARD=$ENV{'BOARD_NAME'} $cmake_flags"; $res = system ("CMAKE_GENERATOR=Ninja cmake -S $make_path -B $make_path/build".-$target_name." -DCONFIG_GAP_SDK_HOME=$sdk_root_path -DCMAKE_MODULE_PATH=$sdk_root_path/utils/cmake -DCONFIG_CHIP=$chip -DCONFIG_CHIP_VERSION=$chip_version -DBOARD=$ENV{'BOARD_NAME'} $cmake_flags"); say "cmake --build $make_path/build"; - $res = system("cmake --build $make_path/build".-$target_name); - say "cmake --build $make_path/build --target run"; - $res = system("cmake --build $make_path/build".-$target_name." --target run"); - $seconds_after = time(); - $res = $res >>=8; - say $target_name." : Result is: ".$res; - if($post == 1) + + if (not $compile_only) { - $res = system("cmake --build build --target postrun"); + $res = system("cmake --build $make_path/build".-$target_name); + say "cmake --build $make_path/build --target run"; + $res = system("cmake --build $make_path/build".-$target_name." --target run"); + $seconds_after = time(); + $res = $res >>=8; + say $target_name." 
: Result is: ".$res; + if($post == 1) + { + $res = system("cmake --build build --target postrun"); + } } }; my $seconds = $seconds_after - $seconds_before; @@ -256,6 +268,7 @@ sub process_yml { my @arg = ($config_os, $config_platform, $test_variant->{flags}, + $test_variant->{compile_only}, '', 0, 0, diff --git a/utils/gaptest/lib/gaptest/Loader.pm b/utils/gaptest/lib/gaptest/Loader.pm index 12e0a835a..441816565 100644 --- a/utils/gaptest/lib/gaptest/Loader.pm +++ b/utils/gaptest/lib/gaptest/Loader.pm @@ -144,6 +144,15 @@ sub load_test_variant { $test_variant{flags} = ""; } + $test_variant{compile_only} = 0; + if (defined $section->{compile_only}) + { + if ($section->{compile_only} eq "true") + { + $test_variant{compile_only} = 1; + } + } + if(defined $section->{duration}) { $test_variant{duration} = $section->{duration}; diff --git a/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk-fpga.elf b/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk-fpga.elf deleted file mode 100755 index 05fe0de49..000000000 Binary files a/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk-fpga.elf and /dev/null differ diff --git a/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk-mram.elf b/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk-mram.elf deleted file mode 100755 index 78e5c9b04..000000000 Binary files a/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk-mram.elf and /dev/null differ diff --git a/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk.elf b/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk.elf deleted file mode 100755 index 63d1e132b..000000000 Binary files a/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk.elf and /dev/null differ diff --git a/utils/openocd_tools/gap_bins/gap_flasher-gap9_v2.elf b/utils/openocd_tools/gap_bins/gap_flasher-gap9_v2.elf deleted file mode 100755 index 29564a524..000000000 Binary files a/utils/openocd_tools/gap_bins/gap_flasher-gap9_v2.elf and /dev/null differ diff --git a/utils/openocd_tools/src/fuser/gap9-efuse-gen 
b/utils/openocd_tools/src/fuser/gap9-efuse-gen deleted file mode 100755 index ef1a4462a..000000000 --- a/utils/openocd_tools/src/fuser/gap9-efuse-gen +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env python3 - -# -# Copyright (C) 2019 GreenWaves Technologies -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import argparse -import runner.chips.gap9_v2_efuse as efuse - -parser = argparse.ArgumentParser(description='Generate gap9 efuse map') - -parser.add_argument("--usecase", dest="usecase", default=None, help="specify the usecase") -parser.add_argument("--output", dest="output", default=None, help="specify the output file path") -parser.add_argument("--name", dest="name", default=None, help="specify the structure name") - -args = parser.parse_args() - -efuse_map = efuse.Efuse_map() - -efuse_map.get_efuse('info_1').get_field('icache_enabled').set(1) - -# By default, only activate fast clock and fed other blocks like timer at 24Mhz/16 -fast_osc_freq_div = 24576062.0 / 16 -efuse_map.get_efuse('info_1').get_field('osc_ctrl_setup').set(1) -efuse_map.get_efuse('info_1').get_field('osc_ctrl').set(1) -efuse_map.get_efuse('info_1').get_field('fast_clk_div_pow2_setup').set(1) -efuse_map.get_efuse('fast_clk_div_pow2').set(4 | (1<<3)) -efuse_map.get_efuse('info_2').get_field('wake_osc_ctrl_setup').set(1) -efuse_map.get_efuse('info_2').get_field('wake_osc_ctrl').set(1) -efuse_map.get_efuse('info_2').get_field('wake_fast_clk_div_pow2_setup').set(1) 
-efuse_map.get_efuse('wake_fast_clk_div_pow2').set(4 | (1<<3)) - -# Lock FLL soc and periph -efuse_map.get_efuse('info_1').get_field('fll_global_setup').set(1) -efuse_map.get_efuse('info_1').get_field('fll_dco0_setup').set(1) -# FLL DRR (DCO min | DCO max) -efuse_map.get_efuse('fll_drr').set((0 << 0) | (0x1ff << 16)) -# Pre-lock FLL CCR1 (CLK0 DIV | CLK1 DIV) -efuse_map.get_efuse('fll_ccr1_pre_lock').set((0 << 0) | (0 << 8)) -# Post-lock FLL CCR1 (CLK0 DIV | CLK1 DIV) -efuse_map.get_efuse('fll_ccr1_post_lock').set((0 << 0) | (3 << 8)) -# FLL CCR2 (CLK0 SEL | CLK1 SEL | CLK2_SEL | CLK3_SEL | CKG0) -efuse_map.get_efuse('fll_ccr2').set((0x1 << 0) | (0x1 << 4) | (0x1 << 8) | (0x2 << 12) | (1 << 16)) -# DCO0 CR1 (DCO EN | CLOSE LOOP | LOOP GAIN | LOCK TOL | ITG | ASSERT CYCLES) -efuse_map.get_efuse('fll_f0cr1').set((1 << 0) | (1 << 1) | (4 << 4) | (10 << 8) | (24 << 16) | (6 << 26)) -# DCO0 CR2 (MFI | DCO CODE) -efuse_map.get_efuse('fll_f0cr2').set((166 << 0) | (0x1A << 16)) - -# FLL DRR (DCO min | DCO max) -efuse_map.get_efuse('wakeup_fll_drr').set((0 << 0) | (0x1ff << 16)) -# Pre-lock FLL CCR1 (CLK0 DIV | CLK1 DIV) -efuse_map.get_efuse('wakeup_fll_ccr1_pre_lock').set((0 << 0) | (0 << 8)) -# Post-lock FLL CCR1 (CLK0 DIV | CLK1 DIV) -efuse_map.get_efuse('wakeup_fll_ccr1_post_lock').set((0 << 0) | (1 << 8)) -# FLL CCR2 (CLK0 SEL | CLK1 SEL | CLK2_SEL | CLK3_SEL | CKG0) -efuse_map.get_efuse('wakeup_fll_ccr2').set((0x1 << 0) | (0x1 << 4) | (0x1 << 8) | (0x2 << 12) | (1 << 16)) -# DCO0 CR1 (DCO EN | CLOSE LOOP | LOOP GAIN | LOCK TOL | ITG | ASSERT CYCLES) -efuse_map.get_efuse('wakeup_fll_f0cr1').set((1 << 0) | (1 << 1) | (4 << 4) | (10 << 8) | (24 << 16) | (6 << 26)) -# DCO0 CR2 (MFI | DCO CODE) -efuse_map.get_efuse('wakeup_fll_f0cr2').set((166 << 0) | (0x1A << 16)) - - -if args.usecase == 'mram': - efuse_map.get_efuse('info_1').get_field('bootmode').set(3) - efuse_map.get_efuse('info_1').get_field('mram_reset_wait').set(1) - 
efuse_map.get_efuse('info_2').get_field('wake_mram_reset_wait').set(1) - efuse_map.get_efuse('mram_reset_wait_cycles').set(math.ceil(0.000003*fast_osc_freq_div)) - efuse_map.get_efuse('wake_mram_reset_wait_cycles').set(math.ceil(0.000003*fast_osc_freq_div)) - efuse_map.get_efuse('info_2').get_field('clkdiv_setup').set(1) - efuse_map.get_efuse('info_2').get_field('clkdiv').set(5) - efuse_map.get_efuse('info_3').get_field('flash_wait').set(1) - efuse_map.get_efuse('flash_wait').set(math.ceil(0.00002*fast_osc_freq_div)) - - - - -if args.output is not None: - with open(args.output, 'w') as output_file: - efuse_map.gen_c_struct(args.name, output_file) diff --git a/utils/openocd_tools/tcl/gap9reva.tcl b/utils/openocd_tools/tcl/gap9reva.tcl deleted file mode 100644 index 4a2da6f0a..000000000 --- a/utils/openocd_tools/tcl/gap9reva.tcl +++ /dev/null @@ -1,81 +0,0 @@ -adapter_khz 5000 -transport select jtag -# Channel 1 is taken by Xilinx JTAG -#reset_config srst_pulls_trst -reset_config trst_and_srst -#adapter_nsrst_assert_width 1000 -#adapter_nsrst_delay 1000 -#ftdi_tdo_sample_edge falling -set _CHIPNAME riscv -jtag newtap $_CHIPNAME cpu -irlen 5 -expected-id 0x00000001 -jtag newtap $_CHIPNAME unknown0 -irlen 4 -expected-id 0x10102001 -foreach t [jtag names] { - puts [format "TAP: %s\n" $t] -} -set _TARGETNAME $_CHIPNAME.cpu - -target create $_TARGETNAME riscv -chain-position $_TARGETNAME -coreid 0x3e0 -gdb_report_data_abort enable -gdb_report_register_access_error enable - -riscv set_reset_timeout_sec 120 -riscv set_command_timeout_sec 120 -# prefer to use sba for system bus access -riscv set_prefer_sba on -proc jtag_init {} { - puts "----------- jtag init" - # ensure chip reset done: this might not always be what we want - jtag_reset 0 0 - sleep 1 - jtag_reset 0 1 - sleep 10 - jtag_reset 0 0 - sleep 1 - # ensure jtag reset is done - pathmove RESET - pathmove IDLE - # "going to examine" - #riscv.cpu arp_examine - # "examination done" - puts "----------- jtag init done" -} 
-proc init_reset {mode} { - puts "----------- init reset" - # ensure chip reset done: this might not always be what we want - # ensure jtag reset is done - jtag_reset 0 0 - sleep 1 - jtag_reset 0 1 - sleep 10 - jtag_reset 0 0 - sleep 20 - pathmove RESET - pathmove IDLE - # "going to examine" - #riscv.cpu arp_examine - # "examination done" - #if { $mode == 0x1} { - # riscv.cpu arp_halt - #} -} -proc load_and_start_binary { elf_file pc_entry } { - puts "----------- load and start bin" - # first ensure we are rest and halt so that pc is accessible - riscv.cpu mww 0x1A100008 0x0fff1907 - riscv.cpu mww 0x1A100018 0x0fff1907 - riscv.cpu mww 0x1A100028 0x0fff1907 - riscv.cpu mww 0x1A100004 0xd0885f5e - riscv.cpu mww 0x1A100014 0xd0885f5e - riscv.cpu mww 0x1A100024 0xd0885f5e - #reset halt - load_image ${elf_file} 0x0 elf - reg pc ${pc_entry} - resume -} -# dump jtag chain -#scan_chain -#telnet_port 6666 -init -reset halt -riscv.cpu arm semihosting enable -echo "Ready for Remote Connections" diff --git a/utils/openocd_tools/tcl/gap9revb-bootmode.tcl b/utils/openocd_tools/tcl/gap9revb-bootmode.tcl deleted file mode 100644 index 4068cc609..000000000 --- a/utils/openocd_tools/tcl/gap9revb-bootmode.tcl +++ /dev/null @@ -1,134 +0,0 @@ -adapter_khz 500 - -adapter driver remote_bitbang -remote_bitbang_port 9999 -remote_bitbang_host localhost - -reset_config srst_only srst_nogate - -set _CHIPNAME gap9 - -jtag newtap $_CHIPNAME riscv -irlen 5 -expected-id 0x20020bcb -jtag newtap $_CHIPNAME pulp -irlen 4 -expected-id 0x20021bcb - -foreach t [jtag names] { - puts [format "TAP: %s\n" $t] -} - - -set _TAP_RISCV $_CHIPNAME.riscv -set _TAP_PULP $_CHIPNAME.pulp -set _CL0 $_CHIPNAME.cl0 -set _CL1 $_CHIPNAME.cl1 -set _CL2 $_CHIPNAME.cl2 -set _CL3 $_CHIPNAME.cl3 -set _CL4 $_CHIPNAME.cl4 -set _CL5 $_CHIPNAME.cl5 -set _CL6 $_CHIPNAME.cl6 -set _CL7 $_CHIPNAME.cl7 -set _CL8 $_CHIPNAME.cl8 -set _FC $_CHIPNAME.fc - -target create $_FC riscv -chain-position $_TAP_RISCV -coreid 0x9 -#target create 
$_CL0 riscv -chain-position $_TARGETNAME -coreid 0x0 -defer-examine -#target create $_CL1 riscv -chain-position $_TARGETNAME -coreid 0x1 -defer-examine -#target create $_CL2 riscv -chain-position $_TARGETNAME -coreid 0x2 -defer-examine -#target create $_CL3 riscv -chain-position $_TARGETNAME -coreid 0x3 -defer-examine -#target create $_CL4 riscv -chain-position $_TARGETNAME -coreid 0x4 -defer-examine -#target create $_CL5 riscv -chain-position $_TARGETNAME -coreid 0x5 -defer-examine -target create $_CL6 riscv -chain-position $_TAP_RISCV -coreid 0x6 -defer-examine -target create $_CL7 riscv -chain-position $_TAP_RISCV -coreid 0x7 -defer-examine -#target create $_CL8 riscv -chain-position $_TARGETNAME -coreid 0x8 -defer-examine -target smp $_CL6 $_CL7 -#target create $_CL8 riscv -chain-position $_TARGETNAME -coreid 0x8 -gdb-port 6666 -defer-examine -#target create $_CL8 riscv -chain-position $_TARGETNAME -coreid 0x8 -gdb-port 6666 - - -$_CL6 configure -rtos hwthread -$_CL7 configure -rtos hwthread - -proc cl6_attach_proc { } { - $::_CL6 arp_examine - $::_CL7 arp_examine - # since smp, this will halt all concerned code - $::_CL6 arp_halt - #$::_CL7 arp_halt - $::_CL6 arm semihosting enable - $::_CL7 arm semihosting enable -} -$_CL6 configure -event gdb-attach cl6_attach_proc - -gdb_report_data_abort enable -gdb_report_register_access_error enable - -riscv set_reset_timeout_sec 1440 -riscv set_command_timeout_sec 1440 - -# prefer to use sba for system bus access -riscv set_prefer_sba on - - -proc poll_confreg { value } { - irscan $::_TAP_PULP 0x6 - # size then value - set ret [eval drscan $::_TAP_PULP 0x8 $value] - puts "ret=$ret" - while { !$ret } { - irscan $::_TAP_PULP 0x6 - # size then value - set ret [eval drscan $::_TAP_PULP 0x8 $value] - puts "ret=$ret" - } -} - -proc jtag_init {} { - puts "jtag init" - targets $::_FC - jtag_reset 0 1 - sleep 1 - jtag_reset 0 0 - sleep 1 - # wait for jtag ready - poll_confreg 0x1 - echo "confreg polling done" - jtag 
arp_init-reset -} - -proc init_reset {mode} { - puts "hello" - targets $::_FC - jtag_reset 0 1 - sleep 1 - jtag_reset 0 0 - sleep 1 - # wait for jtag ready - poll_confreg 0x1 - echo "confreg polling done" - jtag arp_init-reset -} - -proc load_and_start_binary { elf_file pc_entry } { - targets $::_FC - # first ensure we are rest and halt so that pc is accessible - #$::_FC arp_reset assert 1 - reset halt - load_image ${elf_file} 0x0 elf - reg pc ${pc_entry} - resume -} - - -# dump jtag chain -#scan_chain - -targets $_FC -init - - -#targets -#ftdi_set_signal nSRST 1 -halt - -$::_FC arm semihosting enable - -echo "Ready for Remote Connections" diff --git a/utils/openocd_tools/tcl/gap9revb.tcl b/utils/openocd_tools/tcl/gap9revb.tcl deleted file mode 100644 index c3246be19..000000000 --- a/utils/openocd_tools/tcl/gap9revb.tcl +++ /dev/null @@ -1,167 +0,0 @@ -adapter_khz 5000 - -#interface jlink -#transport select jtag -# Channel 1 is taken by Xilinx JTAG -#reset_config srst_pulls_trst - -#adapter driver remote_bitbang -#remote_bitbang_port 9999 -#remote_bitbang_host localhost - -reset_config srst_only srst_nogate - -set _CHIPNAME gap9 - -jtag newtap $_CHIPNAME riscv -irlen 5 -expected-id 0x20020bcb -jtag newtap $_CHIPNAME pulp -irlen 4 -expected-id 0x20021bcb - -foreach t [jtag names] { - puts [format "TAP: %s\n" $t] -} - - -set _TAP_RISCV $_CHIPNAME.riscv -set _TAP_PULP $_CHIPNAME.pulp -set _CL0 $_CHIPNAME.cl0 -set _CL1 $_CHIPNAME.cl1 -set _CL2 $_CHIPNAME.cl2 -set _CL3 $_CHIPNAME.cl3 -set _CL4 $_CHIPNAME.cl4 -set _CL5 $_CHIPNAME.cl5 -set _CL6 $_CHIPNAME.cl6 -set _CL7 $_CHIPNAME.cl7 -set _CL8 $_CHIPNAME.cl8 -set _FC $_CHIPNAME.fc - -target create $_FC riscv -chain-position $_TAP_RISCV -coreid 0x9 -#target create $_CL0 riscv -chain-position $_TAP_RISCV -coreid 0x0 -defer-examine -#target create $_CL1 riscv -chain-position $_TAP_RISCV -coreid 0x1 -defer-examine -#target create $_CL2 riscv -chain-position $_TAP_RISCV -coreid 0x2 -defer-examine -#target create $_CL3 riscv 
-chain-position $_TAP_RISCV -coreid 0x3 -defer-examine -#target create $_CL4 riscv -chain-position $_TAP_RISCV -coreid 0x4 -defer-examine -#target create $_CL5 riscv -chain-position $_TAP_RISCV -coreid 0x5 -defer-examine -#target create $_CL6 riscv -chain-position $_TAP_RISCV -coreid 0x6 -defer-examine -#target create $_CL7 riscv -chain-position $_TAP_RISCV -coreid 0x7 -defer-examine -#target create $_CL8 riscv -chain-position $_TAP_RISCV -coreid 0x8 -defer-examine -#target smp $_FC $_CL0 $_CL1 $_CL2 $_CL3 $_CL4 $_CL5 $_CL6 $_CL7 $_CL8 -#target create $_CL8 riscv -chain-position $_TARGETNAME -coreid 0x8 -gdb-port 6666 -defer-examine -#target create $_CL8 riscv -chain-position $_TARGETNAME -coreid 0x8 -gdb-port 6666 - - -#$_CL0 configure -rtos hwthread -#$_CL1 configure -rtos hwthread -#$_CL2 configure -rtos hwthread -#$_CL3 configure -rtos hwthread -#$_CL4 configure -rtos hwthread -#$_CL5 configure -rtos hwthread -#$_CL6 configure -rtos hwthread -#$_CL7 configure -rtos hwthread -#$_CL8 configure -rtos hwthread -#$_FC configure -rtos hwthread - -#proc cl6_attach_proc { } { -# $::_CL6 arp_examine -# $::_CL7 arp_examine - # since smp, this will halt all concerned code -# $::_CL6 arp_halt - #$::_CL7 arp_halt -# $::_CL6 arm semihosting enable -# $::_CL7 arm semihosting enable -#} -#$_CL6 configure -event gdb-attach cl6_attach_proc - -gdb_report_data_abort enable -gdb_report_register_access_error enable - -riscv set_reset_timeout_sec 1440 -riscv set_command_timeout_sec 1440 - -# prefer to use sba for system bus access -riscv set_prefer_sba on - - -proc poll_confreg { value } { - irscan $::_TAP_PULP 0x6 - # size then value - set ret [eval drscan $::_TAP_PULP 0x8 $value] - puts "ret=$ret" - while { $ret != 0x3 } { - irscan $::_TAP_PULP 0x6 - # size then value - set ret [eval drscan $::_TAP_PULP 0x8 $value] - puts "ret=$ret" - } -} - -proc jtag_init {} { - puts "jtag init" - targets $::_FC - jtag_reset 0 1 - sleep 1 - jtag_reset 0 0 - sleep 1 - # wait for jtag ready - 
poll_confreg 0x1 - echo "confreg polling done" - #$::_CL0 arp_examine - #$::_CL1 arp_examine - #$::_CL2 arp_examine - #$::_CL3 arp_examine - #$::_CL4 arp_examine - #$::_CL5 arp_examine - #$::_CL6 arp_examine - #$::_CL7 arp_examine - #$::_CL8 arp_examine - $::_FC arp_examine - #$::_CL0 arp_halt - #$::_CL1 arp_halt - #$::_CL2 arp_halt - #$::_CL3 arp_halt - #$::_CL4 arp_halt - #$::_CL5 arp_halt - #$::_CL6 arp_halt - #$::_CL7 arp_halt - #$::_CL8 arp_halt - #$::_FC arp_halt - echo "examine done" - jtag arp_init -} - -proc init_reset {mode} { - puts "hello" - #targets $::_FC - jtag_reset 0 1 - sleep 1 - jtag_reset 0 0 - sleep 1 - # wait for jtag ready - poll_confreg 0x1 - echo "confreg polling done" - jtag arp_init -} - -proc load_and_start_binary { elf_file pc_entry } { - targets $::_FC - # first ensure we are rest and halt so that pc is accessible - #$::_FC arp_reset assert 1 - reset halt - load_image ${elf_file} 0x0 elf - reg pc ${pc_entry} - resume -} - - -# dump jtag chain -#scan_chain - -init - - -#targets -#ftdi_set_signal nSRST 1 -halt - -$::_FC arm semihosting enable - -echo "Ready for Remote Connections" diff --git a/utils/openocd_tools/tcl/gap9revb_gdb.tcl b/utils/openocd_tools/tcl/gap9revb_gdb.tcl deleted file mode 100644 index c21b8fc38..000000000 --- a/utils/openocd_tools/tcl/gap9revb_gdb.tcl +++ /dev/null @@ -1,245 +0,0 @@ -adapter_khz 500 - -#interface jlink -#transport select jtag -# Channel 1 is taken by Xilinx JTAG -#reset_config srst_pulls_trst - -#adapter driver remote_bitbang -#remote_bitbang_port 9999 -#remote_bitbang_host localhost - -reset_config srst_only srst_nogate - -set _CHIPNAME gap9 - -jtag newtap $_CHIPNAME riscv -irlen 5 -expected-id 0x20020bcb -jtag newtap $_CHIPNAME pulp -irlen 4 -expected-id 0x20021bcb - -foreach t [jtag names] { - puts [format "TAP: %s\n" $t] -} - - -set _TAP_RISCV $_CHIPNAME.riscv -set _TAP_PULP $_CHIPNAME.pulp -set _CL0 $_CHIPNAME.cl0 -set _CL1 $_CHIPNAME.cl1 -set _CL2 $_CHIPNAME.cl2 -set _CL3 $_CHIPNAME.cl3 -set 
_CL4 $_CHIPNAME.cl4 -set _CL5 $_CHIPNAME.cl5 -set _CL6 $_CHIPNAME.cl6 -set _CL7 $_CHIPNAME.cl7 -set _CL8 $_CHIPNAME.cl8 -set _FC $_CHIPNAME.fc - -target create $_FC riscv -chain-position $_TAP_RISCV -coreid 0x9 -target create $_CL0 riscv -chain-position $_TAP_RISCV -coreid 0x0 -defer-examine -target create $_CL1 riscv -chain-position $_TAP_RISCV -coreid 0x1 -defer-examine -target create $_CL2 riscv -chain-position $_TAP_RISCV -coreid 0x2 -defer-examine -target create $_CL3 riscv -chain-position $_TAP_RISCV -coreid 0x3 -defer-examine -target create $_CL4 riscv -chain-position $_TAP_RISCV -coreid 0x4 -defer-examine -target create $_CL5 riscv -chain-position $_TAP_RISCV -coreid 0x5 -defer-examine -target create $_CL6 riscv -chain-position $_TAP_RISCV -coreid 0x6 -defer-examine -target create $_CL7 riscv -chain-position $_TAP_RISCV -coreid 0x7 -defer-examine -target create $_CL8 riscv -chain-position $_TAP_RISCV -coreid 0x8 -defer-examine -#target create $_CL8 riscv -chain-position $_TARGETNAME -coreid 0x8 -gdb-port 6666 -defer-examine -#target create $_CL8 riscv -chain-position $_TARGETNAME -coreid 0x8 -gdb-port 6666 -target smp $_FC $_CL0 $_CL1 $_CL2 $_CL3 $_CL4 $_CL5 $_CL6 $_CL7 $_CL8 -#target smp $_FC $_CL8 - -$_CL0 configure -rtos hwthread -$_CL1 configure -rtos hwthread -$_CL2 configure -rtos hwthread -$_CL3 configure -rtos hwthread -$_CL4 configure -rtos hwthread -$_CL5 configure -rtos hwthread -$_CL6 configure -rtos hwthread -$_CL7 configure -rtos hwthread -$_CL8 configure -rtos hwthread -$_FC configure -rtos hwthread - -#proc cl6_attach_proc { } { -# $::_CL6 arp_examine -# $::_CL7 arp_examine - # since smp, this will halt all concerned code -# $::_CL6 arp_halt - #$::_CL7 arp_halt -# $::_CL6 arm semihosting enable -# $::_CL7 arm semihosting enable -#} -#$_CL6 configure -event gdb-attach cl6_attach_proc - -gdb_report_data_abort enable -gdb_report_register_access_error enable - -riscv set_reset_timeout_sec 36000 -riscv set_command_timeout_sec 36000 - -# prefer to 
use sba for system bus access -riscv set_prefer_sba on - - -proc poll_confreg { value } { - irscan $::_TAP_PULP 0x6 - # size then value - set ret [eval drscan $::_TAP_PULP 0x8 $value] - puts "ret=$ret" - while { $ret != 0x3 } { - irscan $::_TAP_PULP 0x6 - # size then value - set ret [eval drscan $::_TAP_PULP 0x8 $value] - puts "ret=$ret" - } -} - -proc cluster_reset { addr } { - # first reset the cluster - - poll off - $::_FC mww 0x10200008 0x0 - $::_FC mww 0x1a1040e4 0x200 - # SOC CTRL + 0x170 - $::_FC mww 0x1a104170 0x0 - sleep 1 - $::_FC mww 0x1a104170 0x1 - - # CLUSTER Ctrl: 0x10000000 + 0x00200000 - # addr: +0x40 - $::_FC mww 0x10200040 $addr 9 - # fetch en: +0x8 - $::_FC mww 0x10200008 0x3ff - # available: + 0xe4 - $::_FC mww 0x1a1040e4 0xffffffff - $::_CL0 arp_halt - $::_CL1 arp_halt - $::_CL2 arp_halt - $::_CL3 arp_halt - $::_CL4 arp_halt - $::_CL5 arp_halt - $::_CL6 arp_halt - $::_CL7 arp_halt - $::_CL8 arp_halt - $::_CL0 riscv set_ebreakm on - $::_CL1 riscv set_ebreakm on - $::_CL2 riscv set_ebreakm on - $::_CL3 riscv set_ebreakm on - $::_CL4 riscv set_ebreakm on - $::_CL5 riscv set_ebreakm on - $::_CL6 riscv set_ebreakm on - $::_CL7 riscv set_ebreakm on - $::_CL8 riscv set_ebreakm on - poll on -} - -proc jtag_init {} { - puts "jtag init" - targets $::_FC - jtag_reset 0 1 - sleep 1 - jtag_reset 0 0 - sleep 1 - # wait for jtag ready - poll_confreg 0xb - echo "confreg polling done" - - #$::_FC arm semihosting_fileio enable - #$::_FC arm semihosting_resexit enable - # APB SOC CTRL: 0x1A100000 + 0x00004000 - # cl isolate: + 0xC - #$::_FC mww 0x1a10400c 0 - # CLUSTER Ctrl: 0x10000000 + 0x00200000 - # addr +0x40 - #mww 0x10200040 0x1a00010c 9 - # fetch en: +0x8 - #$::_FC mww 0x10200008 0x3ff - # available: + 0xe4 - #$::_FC mww 0x1a1040e4 0xffffffff - $::_CL0 arp_examine - $::_CL1 arp_examine - $::_CL2 arp_examine - $::_CL3 arp_examine - $::_CL4 arp_examine - $::_CL5 arp_examine - $::_CL6 arp_examine - $::_CL7 arp_examine - $::_CL8 arp_examine - - # halt all - 
#$::_CL0 arp_halt - #$::_CL1 arp_halt - #$::_CL2 arp_halt - #$::_CL3 arp_halt - #$::_CL4 arp_halt - #$::_CL5 arp_halt - #$::_CL6 arp_halt - #$::_CL7 arp_halt - #$::_CL8 arp_halt - #set ebreakm - #$::_FC riscv set_ebreakm on - #$::_CL0 riscv set_ebreakm on - #$::_CL1 riscv set_ebreakm on - #$::_CL2 riscv set_ebreakm on - #$::_CL3 riscv set_ebreakm on - #$::_CL4 riscv set_ebreakm on - #$::_CL5 riscv set_ebreakm on - #$::_CL6 riscv set_ebreakm on - #$::_CL7 riscv set_ebreakm on - #$::_CL8 riscv set_ebreakm on - - $::_FC arp_examine - $::_FC arp_halt - $::_FC arm semihosting enable - #$::_CL0 arp_halt - #$::_CL1 arp_halt - #$::_CL2 arp_halt - #$::_CL3 arp_halt - #$::_CL4 arp_halt - #$::_CL5 arp_halt - #$::_CL6 arp_halt - #$::_CL7 arp_halt - #$::_CL8 arp_halt - echo "examine done" - jtag arp_init -} - -proc init_reset {mode} { - puts "hello" - #targets $::_FC - jtag_reset 0 1 - sleep 1 - jtag_reset 0 0 - sleep 1 - # wait for jtag ready - poll_confreg 0xb - echo "confreg polling done" - jtag arp_init -} - -proc load_and_start_binary { elf_file pc_entry } { - targets $::_FC - # first ensure we are rest and halt so that pc is accessible - #$::_FC arp_reset assert 1 - #reset halt - halt - load_image ${elf_file} 0x0 elf - reg pc ${pc_entry} - resume -} - - -# dump jtag chain -#scan_chain - -init - - -#targets $::_FC -#ftdi_set_signal nSRST 1 -halt - -#target smp $_FC $_CL8 -#$::_FC arm semihosting enable - -echo "Ready for Remote Connections"